# glam/scripts/enrich_belgian_wikidata.py — 2025-11-19 23:25:22 +01:00 (341 lines, 13 KiB, Python)
#!/usr/bin/env python3
"""
Enrich Belgian ISIL institutions with Wikidata Q-numbers, VIAF IDs, and other identifiers.
This script:
1. Loads Belgian institutions from enriched YAML
2. Queries Wikidata SPARQL endpoint for Belgian ISIL codes (BE-*)
3. Adds Wikidata Q-numbers, VIAF IDs, founding dates, coordinates
4. Updates GHCIDs with Q-numbers for collision resolution
5. Exports enriched YAML with Wikidata data
Query strategy:
- Query by ISIL code (P791) for exact matches
- Batch queries for efficiency (100 codes per query)
- Add multilingual labels (English, Dutch, French)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict
from datetime import datetime, timezone
import time
import yaml
# Make the project's src/ package importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Look up a batch of Belgian ISIL codes (P791) on Wikidata.
    Args:
        isil_codes: ISIL codes to resolve (e.g. BE-OSE00, BE-A0001)
        sparql: Configured SPARQL wrapper pointed at the WDQS endpoint
    Returns:
        Dict mapping ISIL code → extracted Wikidata fields; empty on query error
    """
    # Inline every code into a VALUES clause so one request covers the batch.
    values_clause = " ".join(f'"{code}"' for code in isil_codes)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemLabelNl ?itemLabelFr ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
    VALUES ?isil {{ {values_clause} }}
    ?item wdt:P791 ?isil .
    OPTIONAL {{ ?item wdt:P214 ?viaf . }}
    OPTIONAL {{ ?item wdt:P625 ?coords . }}
    OPTIONAL {{ ?item wdt:P856 ?website . }}
    OPTIONAL {{ ?item wdt:P571 ?inception . }}
    # Multilingual labels
    OPTIONAL {{ ?item rdfs:label ?itemLabelNl . FILTER(LANG(?itemLabelNl) = "nl") }}
    OPTIONAL {{ ?item rdfs:label ?itemLabelFr . FILTER(LANG(?itemLabelFr) = "fr") }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl,fr,de" . }}
    }}
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        rows = raw.get("results", {}).get("bindings", []) if isinstance(raw, dict) else []

        def field(row: dict, key: str) -> Optional[str]:
            # One SPARQL-JSON binding cell → its plain string value (or None).
            return row.get(key, {}).get("value")

        by_isil: dict[str, dict[str, Any]] = {}
        for row in rows:
            uri = field(row, "item") or ""
            qid = uri.rsplit("/", 1)[-1] if uri else None
            code = field(row, "isil")
            # Require a real Q-number and an ISIL key to file the row under.
            if not (qid and qid.startswith("Q") and code):
                continue
            entry: dict[str, Any] = {
                "qid": qid,
                "name": field(row, "itemLabel") or "",
                "name_nl": field(row, "itemLabelNl"),
                "name_fr": field(row, "itemLabelFr"),
                "description": field(row, "itemDescription") or "",
                "identifiers": {},
            }
            if "viaf" in row:
                entry["identifiers"]["VIAF"] = row["viaf"]["value"]
            if "website" in row:
                entry["identifiers"]["Website"] = row["website"]["value"]
            if "inception" in row:
                # Keep only the date part of the xsd:dateTime literal.
                entry["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                point = row["coords"]["value"]
                # WKT literal looks like "Point(lon lat)" — note the order.
                if point.startswith("Point("):
                    lon_s, lat_s = point[6:-1].split()
                    entry["latitude"] = float(lat_s)
                    entry["longitude"] = float(lon_s)
            by_isil[code] = entry
        return by_isil
    except Exception as e:
        # Best-effort: report and let the caller carry on with other batches.
        print(f"\n❌ SPARQL Error: {e}")
        return {}
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """
    Merge Wikidata data for one institution into its record, in place.
    Adds Wikidata/VIAF/Website identifiers, a founding date, coordinates on
    the first location, and Dutch/French labels as alternative names — each
    only when the record does not already carry that piece of data.
    Args:
        inst: Mutable institution record (parsed YAML mapping)
        wd_data: One value from query_wikidata_batch() ('qid' is required)
    Returns True if any new data was added.
    """
    enriched = False
    # Normalise the identifiers container to a list we can append to
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add VIAF, Website, etc.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Add founding date (only if the record has none)
    if "founding_date" in wd_data and not inst.get("founded_date"):
        inst["founded_date"] = wd_data["founding_date"]
        enriched = True
    # Add coordinates to the first location if it lacks them
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and locations:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True
    # Add multilingual names to alternative_names (skip duplicates and the
    # primary name itself)
    if "alternative_names" not in inst or not inst["alternative_names"]:
        inst["alternative_names"] = []
    alt_names = inst["alternative_names"]
    if isinstance(alt_names, list):
        for label_key in ("name_nl", "name_fr"):
            label = wd_data.get(label_key)
            if label and label not in alt_names and label != inst.get("name"):
                alt_names.append(label)
                enriched = True
    # Update provenance.
    # BUG FIX: the original used inst.get("provenance", {}) and mutated the
    # default dict, which was never stored back — records without a
    # 'provenance' key silently lost this update. setdefault persists it.
    if enriched:
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if "Wikidata enrichment" not in existing_method:
                # Avoid a dangling " + " prefix when no prior method was set.
                prov["extraction_method"] = (
                    f"{existing_method} + Wikidata enrichment" if existing_method
                    else "Wikidata enrichment"
                )
    return enriched
def main():
    """Run the Wikidata enrichment workflow for Belgian ISIL institutions.

    Loads the location-enriched YAML, queries Wikidata in batches of 100
    ISIL codes, merges the results into each record, and writes a new YAML
    file. Adds guards so the summary maths never divides by zero when the
    input parses to no institutions or no BE-* codes.
    """
    print("=" * 70)
    print("Belgian Institutions Wikidata Enrichment")
    print("=" * 70)
    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    output_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")
    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print(" Run scripts/enrich_belgian_locations.py first")
        return
    # 1. Load Belgian institutions ---------------------------------------
    print(f"\n1. Loading institutions from {input_file}...")
    import re  # local import: only needed for the record-splitting regex
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Skip header comments up to and including the first '---' marker
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines) if line.strip() == '---'), 0)
    yaml_content = '\n'.join(lines[start_idx + 1:])
    # One mapping per institution; records are delimited by lines starting a
    # new 'id: BE-...' key rather than by '---' document separators
    records_text = re.split(r'\n(?=id: BE-)', yaml_content)
    records_text = [r.strip() for r in records_text if r.strip()]
    institutions = []
    for record_text in records_text:
        try:
            inst = yaml.safe_load(record_text)
            if inst:
                institutions.append(inst)
        except Exception:
            # Best effort: skip malformed records instead of aborting
            continue
    print(f" ✓ Loaded {len(institutions)} institutions")
    if not institutions:
        # FIX: the original later divided by len(institutions) and would
        # crash with ZeroDivisionError on an empty/unparsable input file.
        print("\n❌ No institutions parsed - nothing to enrich")
        return
    # Record ids double as ISIL codes; str(... or "") also guards against a
    # stored 'id: null' (dict.get's default only applies when the key is absent)
    isil_codes = [inst.get("id") for inst in institutions
                  if str(inst.get("id") or "").startswith("BE-")]
    print(f" ✓ Found {len(isil_codes)} Belgian ISIL codes")
    # 2. Query Wikidata in batches ---------------------------------------
    print(f"\n2. Querying Wikidata SPARQL endpoint...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod("POST")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extractor/1.0 (https://github.com/kempersc/glam)")
    batch_size = 100  # one VALUES clause of up to 100 codes per request
    total_batches = (len(isil_codes) + batch_size - 1) // batch_size
    all_wd_data = {}
    for batch_num in range(total_batches):
        start = batch_num * batch_size
        batch_codes = isil_codes[start:start + batch_size]
        print(f" Batch {batch_num + 1}/{total_batches}: Querying {len(batch_codes)} ISIL codes...")
        wd_data = query_wikidata_batch(batch_codes, sparql)
        all_wd_data.update(wd_data)
        print(f" ✓ Found {len(wd_data)} Wikidata matches")
        # Rate limiting: be polite to the public endpoint between batches
        if batch_num < total_batches - 1:
            time.sleep(1)
    # FIX: guard the coverage percentage against len(isil_codes) == 0
    coverage_pct = len(all_wd_data) / len(isil_codes) * 100 if isil_codes else 0.0
    print(f"\n ✓ Total Wikidata matches: {len(all_wd_data)} / {len(isil_codes)} ({coverage_pct:.1f}%)")
    # 3. Merge Wikidata data into the records ----------------------------
    print(f"\n3. Enriching institutions with Wikidata data...")
    enriched_count = 0
    for inst in institutions:
        isil_code = inst.get("id")
        if isil_code in all_wd_data:
            if enrich_institution(inst, all_wd_data[isil_code]):
                enriched_count += 1
    print(f" ✓ Enriched {enriched_count} institutions")
    # 4. Show a few enriched examples ------------------------------------
    print(f"\n4. Sample enriched institutions:")
    enriched_samples = [inst for inst in institutions if any(
        i.get("identifier_scheme") == "Wikidata"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    )][:5]
    for inst in enriched_samples:
        wd_id = next((i["identifier_value"] for i in inst.get("identifiers", [])
                      if isinstance(i, dict) and i.get("identifier_scheme") == "Wikidata"), None)
        print(f" {inst.get('id')}: {inst.get('name', '')[:40]:40}{wd_id}")
    # 5. Export enriched data --------------------------------------------
    print(f"\n5. Exporting enriched YAML to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Wikidata Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ + Wikidata SPARQL queries\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write(f"# Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)\n")
        f.write(f"# Enrichment date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("---\n\n")
        for idx, inst in enumerate(institutions, 1):
            yaml.dump(inst, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            f.write("\n")
            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")
    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")
    # Summary statistics --------------------------------------------------
    print("\n" + "=" * 70)
    print("Wikidata Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata Q-numbers added: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Wikidata coverage: {len(all_wd_data)} / {len(isil_codes)} ({coverage_pct:.1f}%)")
    # Count additional identifiers
    viaf_count = sum(1 for inst in institutions if any(
        i.get("identifier_scheme") == "VIAF"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    ))
    print(f"VIAF IDs added: {viaf_count}")
    coords_count = sum(1 for inst in institutions
                       if inst.get("locations") and len(inst["locations"]) > 0
                       and inst["locations"][0].get("latitude"))
    print(f"Coordinates added: {coords_count}")
    founding_count = sum(1 for inst in institutions if inst.get("founded_date"))
    print(f"Founding dates added: {founding_count}")
    print("\n✓ Wikidata enrichment complete!")
# Run the workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()