#!/usr/bin/env python3
"""
Enrich Belgian ISIL institutions with Wikidata Q-numbers, VIAF IDs, and other identifiers.

This script:
1. Loads Belgian institutions from enriched YAML
2. Queries Wikidata SPARQL endpoint for Belgian ISIL codes (BE-*)
3. Adds Wikidata Q-numbers, VIAF IDs, founding dates, coordinates
4. Updates GHCIDs with Q-numbers for collision resolution
5. Exports enriched YAML with Wikidata data

Query strategy:
- Query by ISIL code (P791) for exact matches
- Batch queries for efficiency (100 codes per query)
- Add multilingual labels (English, Dutch, French)
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional, Dict
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
|
|
|
|
def query_wikidata_batch(isil_codes: list[str], sparql: "SPARQLWrapper") -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for a batch of Belgian ISIL codes.

    Args:
        isil_codes: List of ISIL codes (e.g., BE-OSE00, BE-A0001)
        sparql: Configured SPARQL wrapper — any object providing ``setQuery()``
            and ``query().convert()`` returning the standard SPARQL JSON layout.

    Returns:
        Dict mapping ISIL code → Wikidata data (``qid``, labels, ``identifiers``,
        plus optional ``founding_date`` and ``latitude``/``longitude``).
        Returns an empty dict when the endpoint query fails (best effort).
    """

    def _sparql_string(code: str) -> str:
        # Escape backslashes and quotes so a malformed code cannot break out
        # of the string literal in the VALUES clause (injection guard).
        return '"' + code.replace("\\", "\\\\").replace('"', '\\"') + '"'

    # Build VALUES clause from the (escaped) ISIL codes.
    isil_values = " ".join(_sparql_string(code) for code in isil_codes)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemLabelNl ?itemLabelFr ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      # Multilingual labels
      OPTIONAL {{ ?item rdfs:label ?itemLabelNl . FILTER(LANG(?itemLabelNl) = "nl") }}
      OPTIONAL {{ ?item rdfs:label ?itemLabelFr . FILTER(LANG(?itemLabelFr) = "fr") }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl,fr,de" . }}
    }}
    """

    sparql.setQuery(query)

    # Keep the try body minimal: only the network round-trip can legitimately
    # raise here; a parse problem below should surface, not be swallowed as
    # a "SPARQL Error".
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        print(f"\n❌ SPARQL Error: {e}")
        return {}

    bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

    # Parse results into dict keyed by ISIL code.
    results: dict[str, dict[str, Any]] = {}
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        isil = binding.get("isil", {}).get("value")

        # Skip rows without a proper entity URI (Q-number) or ISIL key.
        if not qid or not qid.startswith("Q") or not isil:
            continue

        result: dict[str, Any] = {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "name_nl": binding.get("itemLabelNl", {}).get("value"),
            "name_fr": binding.get("itemLabelFr", {}).get("value"),
            "description": binding.get("itemDescription", {}).get("value", ""),
            "identifiers": {}
        }

        if "viaf" in binding:
            result["identifiers"]["VIAF"] = binding["viaf"]["value"]

        if "website" in binding:
            result["identifiers"]["Website"] = binding["website"]["value"]

        if "inception" in binding:
            # xsd:dateTime like "1837-01-01T00:00:00Z" → keep the date part only.
            result["founding_date"] = binding["inception"]["value"].split("T")[0]

        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            # WKT literal "Point(lon lat)" — note longitude comes first.
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)

        results[isil] = result

    return results
|
|
|
|
|
|
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """
    Enrich an institution dict in place with Wikidata data.

    Adds (only when not already present): a Wikidata identifier entry, any
    extra identifiers (VIAF, Website), a founding date, coordinates on the
    first location, and Dutch/French labels as alternative names.

    Args:
        inst: Institution record (mutated in place).
        wd_data: Wikidata record as produced by ``query_wikidata_batch``.

    Returns:
        True if any new data was added.
    """
    enriched = False

    # Ensure identifiers list exists
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add VIAF, Website, etc.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }

            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value

            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date (never overwrite an existing one)
    if "founding_date" in wd_data and not inst.get("founded_date"):
        inst["founded_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates to the first location, only when it has none yet
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Add multilingual names to alternative_names
    if "alternative_names" not in inst or not inst["alternative_names"]:
        inst["alternative_names"] = []

    alt_names = inst["alternative_names"]
    if isinstance(alt_names, list):
        for label in (wd_data.get("name_nl"), wd_data.get("name_fr")):
            # Skip empty labels, duplicates, and labels equal to the primary name.
            if label and label not in alt_names and label != inst.get("name"):
                alt_names.append(label)
                enriched = True

    # Update provenance to record the enrichment step.
    if enriched:
        prov = inst.get("provenance")
        if prov is None:
            # Bug fix: previously a detached dict was created here and the
            # provenance update was silently lost when the key was missing.
            prov = {}
            inst["provenance"] = prov
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if "Wikidata enrichment" not in existing_method:
                # Avoid a dangling " + " prefix when there was no prior method.
                prov["extraction_method"] = (
                    f"{existing_method} + Wikidata enrichment" if existing_method
                    else "Wikidata enrichment"
                )

    return enriched
|
|
|
|
|
|
def _load_institutions(input_file: Path) -> list[dict[str, Any]]:
    """Parse the hand-formatted multi-record YAML file into institution dicts.

    The file begins with comment headers followed by a ``---`` marker; each
    record starts with an ``id: BE-...`` line at column 0.  Records that fail
    to parse as YAML are skipped (best effort).
    """
    import re

    content = input_file.read_text(encoding='utf-8')

    # Skip header comments and first ---
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), 0)
    yaml_content = '\n'.join(lines[start_idx + 1:])

    # Split into individual YAML documents by detecting 'id: BE-' at start of line
    records_text = [r.strip() for r in re.split(r'\n(?=id: BE-)', yaml_content) if r.strip()]

    institutions: list[dict[str, Any]] = []
    for record_text in records_text:
        try:
            inst = yaml.safe_load(record_text)
        except yaml.YAMLError:
            # Narrowed from bare Exception: only YAML parse errors are expected here.
            continue
        if inst:
            institutions.append(inst)
    return institutions


def main():
    """Main enrichment workflow: load, query Wikidata, enrich, export."""

    print("=" * 70)
    print("Belgian Institutions Wikidata Enrichment")
    print("=" * 70)

    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    output_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")

    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print(" Run scripts/enrich_belgian_locations.py first")
        return

    # Load Belgian institutions
    print(f"\n1. Loading institutions from {input_file}...")
    institutions = _load_institutions(input_file)
    print(f" ✓ Loaded {len(institutions)} institutions")

    # Extract ISIL codes. Guard with isinstance: a record whose 'id' is
    # present but None (or non-string) would otherwise raise AttributeError.
    isil_codes = [inst.get("id") for inst in institutions
                  if isinstance(inst.get("id"), str) and inst["id"].startswith("BE-")]
    print(f" ✓ Found {len(isil_codes)} Belgian ISIL codes")

    if not isil_codes:
        # Bug fix: without this guard, the percentage prints below divide by zero.
        print("\n❌ No Belgian ISIL codes found — nothing to enrich")
        return

    # Setup SPARQL endpoint
    print(f"\n2. Querying Wikidata SPARQL endpoint...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod("POST")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extractor/1.0 (https://github.com/kempersc/glam)")

    # Query in batches of 100 ISIL codes
    batch_size = 100
    total_batches = (len(isil_codes) + batch_size - 1) // batch_size

    all_wd_data: dict[str, dict[str, Any]] = {}

    for batch_num in range(total_batches):
        batch_codes = isil_codes[batch_num * batch_size:(batch_num + 1) * batch_size]

        print(f" Batch {batch_num + 1}/{total_batches}: Querying {len(batch_codes)} ISIL codes...")

        wd_data = query_wikidata_batch(batch_codes, sparql)
        all_wd_data.update(wd_data)

        print(f" ✓ Found {len(wd_data)} Wikidata matches")

        # Rate limiting: be polite to the public endpoint between batches.
        if batch_num < total_batches - 1:
            time.sleep(1)

    print(f"\n ✓ Total Wikidata matches: {len(all_wd_data)} / {len(isil_codes)} ({len(all_wd_data)/len(isil_codes)*100:.1f}%)")

    # Enrich institutions
    print(f"\n3. Enriching institutions with Wikidata data...")
    enriched_count = 0

    for inst in institutions:
        wd = all_wd_data.get(inst.get("id"))
        if wd is not None and enrich_institution(inst, wd):
            enriched_count += 1

    print(f" ✓ Enriched {enriched_count} institutions")

    # Show enrichment examples
    print(f"\n4. Sample enriched institutions:")
    enriched_samples = [inst for inst in institutions if any(
        i.get("identifier_scheme") == "Wikidata" for i in inst.get("identifiers", []) if isinstance(i, dict)
    )][:5]

    for inst in enriched_samples:
        wd_id = next((i["identifier_value"] for i in inst.get("identifiers", []) if isinstance(i, dict) and i.get("identifier_scheme") == "Wikidata"), None)
        print(f" {inst.get('id')}: {inst.get('name', '')[:40]:40} → {wd_id}")

    # Export enriched data (len(institutions) > 0 guaranteed by the guard above)
    print(f"\n5. Exporting enriched YAML to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Wikidata Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ + Wikidata SPARQL queries\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write(f"# Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)\n")
        f.write(f"# Enrichment date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("---\n\n")

        for idx, inst in enumerate(institutions, 1):
            yaml.dump(inst, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            f.write("\n")

            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics
    print("\n" + "=" * 70)
    print("Wikidata Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata Q-numbers added: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Wikidata coverage: {len(all_wd_data)} / {len(isil_codes)} ({len(all_wd_data)/len(isil_codes)*100:.1f}%)")

    # Count additional identifiers
    viaf_count = sum(1 for inst in institutions if any(
        i.get("identifier_scheme") == "VIAF" for i in inst.get("identifiers", []) if isinstance(i, dict)
    ))
    print(f"VIAF IDs added: {viaf_count}")

    coords_count = sum(1 for inst in institutions
                       if inst.get("locations") and len(inst["locations"]) > 0
                       and inst["locations"][0].get("latitude"))
    print(f"Coordinates added: {coords_count}")

    founding_count = sum(1 for inst in institutions if inst.get("founded_date"))
    print(f"Founding dates added: {founding_count}")

    print("\n✓ Wikidata enrichment complete!")


if __name__ == "__main__":
    main()
|