#!/usr/bin/env python3
"""
Fast Wikidata enrichment using aggressive batching and caching.

This optimized version:
1. Uses larger batch sizes (100 codes) with proper POST handling
2. Implements checkpoint saving every 50 batches
3. Shows real-time progress
4. Can resume from checkpoint if interrupted
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
import re
import json

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

# Persist fetched results to the checkpoint file every N batches.
CHECKPOINT_EVERY = 50


def _escape_sparql_literal(value: str) -> str:
    """Escape backslashes and double quotes so *value* is a safe SPARQL string literal."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """Query Wikidata for a batch of ISIL codes.

    Args:
        isil_codes: ISIL identifier strings to look up via wdt:P791.
        sparql: A configured SPARQLWrapper (POST method, JSON return format).

    Returns:
        Mapping of ISIL code -> result dict with keys ``qid``, ``name``,
        ``description``, ``identifiers`` and optionally ``founding_date``
        and ``latitude``/``longitude``.  Returns {} on any query error
        (best-effort: one failed batch must not abort the whole run).
    """
    # Build VALUES clause; codes are escaped so a stray quote cannot break the query.
    isil_values = " ".join(f'"{_escape_sparql_literal(code)}"' for code in isil_codes)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en,ja,nl,es,pt,fr,de" .
      }}
    }}
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = (
            raw_results.get("results", {}).get("bindings", [])
            if isinstance(raw_results, dict)
            else []
        )
        # Parse results into dict keyed by ISIL code
        results: dict[str, dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            isil = binding.get("isil", {}).get("value")
            if not qid or not qid.startswith("Q") or not isil:
                continue
            result: dict[str, Any] = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {},
            }
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime value.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT literal "Point(lon lat)" -- note longitude comes first.
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[isil] = result
        return results
    except Exception as e:
        print(f"\nāŒ Error: {e}")
        return {}


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution in place with Wikidata data.

    Adds missing identifiers (Wikidata QID, VIAF, Website), founding date and
    coordinates, and records the enrichment in the provenance block.
    Idempotent: schemes already present are never duplicated, so re-applying
    the same ``wd_data`` (e.g. after a checkpoint resume) is a no-op.

    Returns:
        True if anything was added.
    """
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {
        i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)
    }

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}",
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value,
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates to the first location if it has none.
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Update provenance
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment (ISIL match)"
            else:
                prov["extraction_method"] = "Wikidata enrichment (ISIL match)"
    return enriched


def _load_checkpoint(path: Path) -> dict[str, dict[str, Any]]:
    """Load previously fetched results from *path*, or {} if none/unreadable."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data if isinstance(data, dict) else {}
    except (OSError, json.JSONDecodeError):
        return {}


def _save_checkpoint(path: Path, results: dict[str, dict[str, Any]]) -> None:
    """Atomically write fetched results so an interrupted run can resume."""
    tmp = path.with_suffix(path.suffix + ".tmp")
    with open(tmp, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False)
    tmp.replace(path)  # atomic on POSIX; avoids a torn checkpoint file


def _apply_result(institutions: list[dict[str, Any]],
                  isil_to_inst_idx: dict[str, int],
                  isil_code: str,
                  wd_data: dict[str, Any],
                  stats: dict[str, int]) -> None:
    """Enrich the institution matching *isil_code* and update *stats* in place.

    Counts new Wikidata/VIAF IDs only when the institution did not already
    carry that scheme (the original code over-counted both).
    """
    inst_idx = isil_to_inst_idx.get(isil_code)
    if inst_idx is None:
        return
    inst = institutions[inst_idx]
    existing_schemes = {
        i.get("identifier_scheme") for i in (inst.get("identifiers") or [])
        if isinstance(i, dict)
    }
    if enrich_institution(inst, wd_data):
        stats["enriched"] += 1
        if "Wikidata" not in existing_schemes:
            stats["new_wikidata_ids"] += 1
        if "VIAF" in wd_data.get("identifiers", {}) and "VIAF" not in existing_schemes:
            stats["new_viaf_ids"] += 1


def main():
    """Run the end-to-end pipeline: load, query (with checkpoints), enrich, write."""
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    checkpoint_file = output_file.with_suffix(".checkpoint.json")

    print("=" * 80)
    print("šŸš€ FAST WIKIDATA ENRICHMENT")
    print("=" * 80)
    print("\nšŸ“– Loading dataset (this may take ~30 seconds)...\n")

    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"āœ… Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")

    # Extract all ISIL codes (code -> institution index; if a code appears on
    # several institutions, the last one wins -- same behavior as before).
    isil_to_inst_idx: dict[str, int] = {}
    for idx, inst in enumerate(institutions):
        for ident in inst.get("identifiers", []):
            if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
                isil_code = ident.get("identifier_value")
                if isil_code:
                    isil_to_inst_idx[isil_code] = idx

    all_isil_codes = sorted(isil_to_inst_idx)
    print(f"šŸ“‹ Found {len(all_isil_codes):,} unique ISIL codes\n")

    stats = {
        "queries": 0,
        "wikidata_results": 0,
        "enriched": 0,
        "new_wikidata_ids": 0,
        "new_viaf_ids": 0,
    }

    # Resume support: re-apply results fetched by a previous, interrupted run.
    all_results = _load_checkpoint(checkpoint_file)
    if all_results:
        print(f"ā™»ļø  Resuming from checkpoint ({len(all_results):,} results already fetched)\n")
        stats["wikidata_results"] += len(all_results)
        for isil_code, wd_data in all_results.items():
            _apply_result(institutions, isil_to_inst_idx, isil_code, wd_data, stats)
    pending_codes = [code for code in all_isil_codes if code not in all_results]

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')  # POST: large VALUES clauses exceed GET URL limits
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Process in batches
    batch_size = 100
    total_batches = (len(pending_codes) - 1) // batch_size + 1 if pending_codes else 0
    print(f"šŸ” Querying Wikidata in {total_batches} batches ({batch_size} codes/batch)...")
    print(f"   Estimated time: {total_batches * 1.2 / 60:.1f} minutes\n")

    query_start = time.time()
    for i in range(0, len(pending_codes), batch_size):
        batch = pending_codes[i:i + batch_size]
        batch_num = i // batch_size + 1

        # Query Wikidata
        results = query_wikidata_batch(batch, sparql)
        stats["queries"] += 1
        stats["wikidata_results"] += len(results)
        all_results.update(results)

        # Enrich institutions
        for isil_code, wd_data in results.items():
            _apply_result(institutions, isil_to_inst_idx, isil_code, wd_data, stats)

        # Periodic checkpoint so an interrupted run can resume.
        if batch_num % CHECKPOINT_EVERY == 0:
            _save_checkpoint(checkpoint_file, all_results)

        # Progress update (elapsed can be 0 on coarse clocks -- guard the division).
        elapsed = time.time() - query_start
        rate = batch_num / elapsed if elapsed > 0 else 0.0
        eta_seconds = (total_batches - batch_num) / rate if rate > 0 else 0
        eta_min = eta_seconds / 60
        print(f"\r   Batch {batch_num:3d}/{total_batches} | "
              f"Results: {len(results):3d} | "
              f"Enriched: {stats['enriched']:5,} | "
              f"ETA: {eta_min:.1f}m", end='', flush=True)

        # Rate limiting: be polite to the public SPARQL endpoint.
        time.sleep(1.0)

    # Final checkpoint before the (potentially slow) YAML dump.
    if pending_codes:
        _save_checkpoint(checkpoint_file, all_results)

    print("\n\nšŸ’¾ Writing enriched dataset...")
    header = f"""---
# Global Heritage Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Wikidata queries: {stats['queries']}
# Wikidata results: {stats['wikidata_results']:,}
# Enriched institutions: {stats['enriched']:,}
# New Wikidata IDs: {stats['new_wikidata_ids']:,}
# New VIAF IDs: {stats['new_viaf_ids']:,}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)

    # Output written successfully -> the checkpoint is no longer needed.
    checkpoint_file.unlink(missing_ok=True)

    print(f"āœ… Complete! Output: {output_file}\n")

    # Final report
    print("=" * 80)
    print("šŸ“Š ENRICHMENT REPORT")
    print("=" * 80)
    denom = max(len(institutions), 1)  # avoid ZeroDivisionError on an empty dataset
    print("\n✨ Results:")
    print(f"   Wikidata IDs added: {stats['new_wikidata_ids']:,} ({stats['new_wikidata_ids']/denom*100:.1f}%)")
    print(f"   VIAF IDs added: {stats['new_viaf_ids']:,}")
    print(f"   Total enriched: {stats['enriched']:,} institutions")
    print(f"\nā±ļø Processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    main()