#!/usr/bin/env python3
"""
Enrich Belgian ISIL institutions with Wikidata Q-numbers, VIAF IDs, and other identifiers.

This script:
1. Loads Belgian institutions from enriched YAML
2. Queries Wikidata SPARQL endpoint for Belgian ISIL codes (BE-*)
3. Adds Wikidata Q-numbers, VIAF IDs, founding dates, coordinates
4. Updates GHCIDs with Q-numbers for collision resolution
5. Exports enriched YAML with Wikidata data

Query strategy:
- Query by ISIL code (P791) for exact matches
- Batch queries for efficiency (100 codes per query)
- Add multilingual labels (English, Dutch, French)
"""

import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON


def query_wikidata_batch(isil_codes: list[str],
                         sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for a batch of Belgian ISIL codes.

    Args:
        isil_codes: List of ISIL codes (e.g., BE-OSE00, BE-A0001)
        sparql: Configured SPARQL wrapper

    Returns:
        Dict mapping ISIL code → Wikidata data. Empty dict on query error
        or empty input.
    """
    if not isil_codes:
        return {}

    # Build VALUES clause
    isil_values = " ".join(f'"{code}"' for code in isil_codes)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemLabelNl ?itemLabelFr ?itemDescription ?isil ?viaf ?coords ?website ?inception WHERE {{ VALUES ?isil {{ {isil_values} }} ?item wdt:P791 ?isil . OPTIONAL {{ ?item wdt:P214 ?viaf . }} OPTIONAL {{ ?item wdt:P625 ?coords . }} OPTIONAL {{ ?item wdt:P856 ?website . }} OPTIONAL {{ ?item wdt:P571 ?inception . }} # Multilingual labels OPTIONAL {{ ?item rdfs:label ?itemLabelNl . FILTER(LANG(?itemLabelNl) = "nl") }} OPTIONAL {{ ?item rdfs:label ?itemLabelFr . FILTER(LANG(?itemLabelFr) = "fr") }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,nl,fr,de" . }} }}
    """
    sparql.setQuery(query)

    try:
        raw_results = sparql.query().convert()
    except Exception as e:  # network/endpoint boundary: report and degrade
        print(f"\n❌ SPARQL Error: {e}")
        return {}

    bindings = (raw_results.get("results", {}).get("bindings", [])
                if isinstance(raw_results, dict) else [])

    # Parse results into dict keyed by ISIL code
    results: dict[str, dict[str, Any]] = {}
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        isil = binding.get("isil", {}).get("value")
        if not qid or not qid.startswith("Q") or not isil:
            continue

        result: dict[str, Any] = {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "name_nl": binding.get("itemLabelNl", {}).get("value"),
            "name_fr": binding.get("itemLabelFr", {}).get("value"),
            "description": binding.get("itemDescription", {}).get("value", ""),
            "identifiers": {},
        }
        if "viaf" in binding:
            result["identifiers"]["VIAF"] = binding["viaf"]["value"]
        if "website" in binding:
            result["identifiers"]["Website"] = binding["website"]["value"]
        if "inception" in binding:
            # Keep only the date part of the xsd:dateTime literal
            result["founding_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            # WKT literal: "Point(lon lat)"
            if coords_str.startswith("Point("):
                try:
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
                except ValueError:
                    pass  # malformed coordinate literal — skip, keep the rest

        # BUG FIX: the label OPTIONALs can multiply bindings per item; merge
        # rows for the same ISIL instead of letting the last row overwrite
        # fields already parsed from an earlier one.
        existing = results.get(isil)
        if existing is None:
            results[isil] = result
        else:
            for key, value in result.items():
                if key == "identifiers":
                    for scheme, val in value.items():
                        existing["identifiers"].setdefault(scheme, val)
                elif existing.get(key) in (None, "") and value not in (None, ""):
                    existing[key] = value

    return results


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """
    Enrich an institution record in place with Wikidata data.

    Adds a Wikidata identifier, VIAF/Website identifiers, founding date,
    coordinates (first location only), multilingual alternative names,
    and a provenance note.

    Args:
        inst: Institution record (mutated in place).
        wd_data: One value from query_wikidata_batch()'s result dict.

    Returns:
        True if any new data was added.
    """
    enriched = False

    # Ensure identifiers list exists
    if not inst.get("identifiers"):
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "")
                        for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}",
        })
        enriched = True

    # Add VIAF, Website, etc.
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in existing_schemes:
            continue
        id_obj: dict[str, Any] = {
            "identifier_scheme": scheme,
            "identifier_value": value,
        }
        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value
        identifiers_list.append(id_obj)
        enriched = True

    # Add founding date (only when the record has none)
    if "founding_date" in wd_data and not inst.get("founded_date"):
        inst["founded_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates to the first location, if it lacks them
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and locations:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Add multilingual names to alternative_names (skip duplicates and the
    # primary name)
    if not inst.get("alternative_names"):
        inst["alternative_names"] = []
    alt_names = inst["alternative_names"]
    if isinstance(alt_names, list):
        for lang_key in ("name_nl", "name_fr"):
            name = wd_data.get(lang_key)
            if name and name not in alt_names and name != inst.get("name"):
                alt_names.append(name)
                enriched = True

    # Update provenance.
    # BUG FIX: the original used `inst.get("provenance", {})`, which returns a
    # throwaway dict when the key is missing, silently discarding the note.
    # setdefault() attaches the dict to the record so the update persists.
    if enriched:
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if "Wikidata enrichment" not in existing_method:
                # BUG FIX: avoid a dangling " + " prefix when the existing
                # method string is empty.
                prov["extraction_method"] = (
                    f"{existing_method} + Wikidata enrichment"
                    if existing_method else "Wikidata enrichment"
                )

    return enriched


def _load_institutions(input_file: Path) -> list[dict[str, Any]]:
    """Parse the concatenated-YAML input file into a list of record dicts.

    The file is a comment header, a '---' marker, then one YAML mapping per
    institution, each starting with a line 'id: BE-...'.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Skip header comments and the first '---' document marker
    lines = content.split('\n')
    start_idx = next((i for i, line in enumerate(lines)
                      if line.strip() == '---'), 0)
    yaml_content = '\n'.join(lines[start_idx + 1:])

    # Split into individual YAML documents by detecting 'id: BE-' at start of line
    records_text = [r.strip()
                    for r in re.split(r'\n(?=id: BE-)', yaml_content)
                    if r.strip()]

    institutions: list[dict[str, Any]] = []
    for record_text in records_text:
        try:
            inst = yaml.safe_load(record_text)
        except yaml.YAMLError:
            continue
        # BUG FIX: a stray preamble fragment can parse to a scalar/list;
        # only keep dicts so later inst.get(...) calls cannot crash.
        if isinstance(inst, dict):
            institutions.append(inst)
    return institutions


def _fetch_wikidata(isil_codes: list[str]) -> dict[str, dict[str, Any]]:
    """Query Wikidata in batches of 100 ISIL codes; return merged results."""
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod("POST")
    sparql.addCustomHttpHeader(
        "User-Agent",
        "GLAM-Data-Extractor/1.0 (https://github.com/kempersc/glam)")

    batch_size = 100
    total_batches = (len(isil_codes) + batch_size - 1) // batch_size
    all_wd_data: dict[str, dict[str, Any]] = {}

    for batch_num in range(total_batches):
        batch_codes = isil_codes[batch_num * batch_size:
                                 (batch_num + 1) * batch_size]
        print(f"   Batch {batch_num + 1}/{total_batches}: "
              f"Querying {len(batch_codes)} ISIL codes...")
        wd_data = query_wikidata_batch(batch_codes, sparql)
        all_wd_data.update(wd_data)
        print(f"   ✓ Found {len(wd_data)} Wikidata matches")

        # Rate limiting: be polite to the public WDQS endpoint
        if batch_num < total_batches - 1:
            time.sleep(1)

    return all_wd_data


def _export_yaml(output_file: Path, institutions: list[dict[str, Any]],
                 enriched_count: int) -> None:
    """Write the enriched records as concatenated YAML with a comment header."""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Wikidata Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ + Wikidata SPARQL queries\n")
        f.write(f"# Total institutions: {len(institutions)}\n")
        f.write(f"# Wikidata enriched: {enriched_count} "
                f"({enriched_count / len(institutions) * 100:.1f}%)\n")
        f.write(f"# Enrichment date: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("---\n\n")

        for idx, inst in enumerate(institutions, 1):
            yaml.dump(inst, f, default_flow_style=False,
                      allow_unicode=True, sort_keys=False)
            f.write("\n")
            if idx % 50 == 0:
                print(f"   ... exported {idx} institutions")


def main():
    """Main enrichment workflow: load, query, enrich, export, summarize."""
    print("=" * 70)
    print("Belgian Institutions Wikidata Enrichment")
    print("=" * 70)

    # Input/output files
    input_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    output_file = Path("data/instances/belgium_isil_institutions_wikidata.yaml")

    if not input_file.exists():
        print(f"\n❌ Input file not found: {input_file}")
        print("   Run scripts/enrich_belgian_locations.py first")
        return

    # Load Belgian institutions
    print(f"\n1. Loading institutions from {input_file}...")
    institutions = _load_institutions(input_file)
    print(f"   ✓ Loaded {len(institutions)} institutions")

    # BUG FIX: bail out early rather than dividing by zero in the
    # percentage reports below.
    if not institutions:
        print("\n❌ No institutions parsed — nothing to enrich")
        return

    # Extract ISIL codes (guard against non-string / missing ids)
    isil_codes = [inst["id"] for inst in institutions
                  if isinstance(inst.get("id"), str)
                  and inst["id"].startswith("BE-")]
    print(f"   ✓ Found {len(isil_codes)} Belgian ISIL codes")

    # Setup SPARQL endpoint and query in batches
    print("\n2. Querying Wikidata SPARQL endpoint...")
    all_wd_data = _fetch_wikidata(isil_codes)

    coverage_pct = (len(all_wd_data) / len(isil_codes) * 100) if isil_codes else 0.0
    print(f"\n   ✓ Total Wikidata matches: {len(all_wd_data)} / "
          f"{len(isil_codes)} ({coverage_pct:.1f}%)")

    # Enrich institutions
    print("\n3. Enriching institutions with Wikidata data...")
    enriched_count = 0
    for inst in institutions:
        isil_code = inst.get("id")
        if isil_code in all_wd_data:
            if enrich_institution(inst, all_wd_data[isil_code]):
                enriched_count += 1
    print(f"   ✓ Enriched {enriched_count} institutions")

    # Show enrichment examples
    print("\n4. Sample enriched institutions:")
    enriched_samples = [inst for inst in institutions if any(
        i.get("identifier_scheme") == "Wikidata"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    )][:5]
    for inst in enriched_samples:
        wd_id = next((i["identifier_value"]
                      for i in inst.get("identifiers", [])
                      if isinstance(i, dict)
                      and i.get("identifier_scheme") == "Wikidata"), None)
        print(f"   {inst.get('id')}: {inst.get('name', '')[:40]:40} → {wd_id}")

    # Export enriched data
    print(f"\n5. Exporting enriched YAML to {output_file}...")
    _export_yaml(output_file, institutions, enriched_count)

    file_size_kb = output_file.stat().st_size / 1024
    print(f"   ✓ Exported to: {output_file}")
    print(f"   ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics
    print("\n" + "=" * 70)
    print("Wikidata Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata Q-numbers added: {enriched_count} "
          f"({enriched_count / len(institutions) * 100:.1f}%)")
    print(f"Wikidata coverage: {len(all_wd_data)} / {len(isil_codes)} "
          f"({coverage_pct:.1f}%)")

    # Count additional identifiers
    viaf_count = sum(1 for inst in institutions if any(
        i.get("identifier_scheme") == "VIAF"
        for i in inst.get("identifiers", []) if isinstance(i, dict)
    ))
    print(f"VIAF IDs added: {viaf_count}")

    coords_count = sum(
        1 for inst in institutions
        if inst.get("locations") and len(inst["locations"]) > 0
        and inst["locations"][0].get("latitude"))
    print(f"Coordinates added: {coords_count}")

    founding_count = sum(1 for inst in institutions if inst.get("founded_date"))
    print(f"Founding dates added: {founding_count}")

    print("\n✓ Wikidata enrichment complete!")


if __name__ == "__main__":
    main()