glam/scripts/enrich_global_with_wikidata_fast.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00


#!/usr/bin/env python3
"""
Fast Wikidata enrichment using aggressive batching and caching.
This optimized version:
1. Uses larger batch sizes (100 codes) with proper POST handling
2. Implements checkpoint saving every 50 batches
3. Shows real-time progress
4. Can resume from checkpoint if interrupted
"""
import sys
import time
import json
from pathlib import Path
from typing import Any
from datetime import datetime, timezone

import yaml

# Make the project's src/ directory importable before any project imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, POST, JSON as SPARQL_JSON  # type: ignore


def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """Query Wikidata for a batch of ISIL codes."""
    # Build VALUES clause
    isil_values = " ".join(f'"{code}"' for code in isil_codes)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
        VALUES ?isil {{ {isil_values} }}
        ?item wdt:P791 ?isil .
        OPTIONAL {{ ?item wdt:P214 ?viaf . }}
        OPTIONAL {{ ?item wdt:P625 ?coords . }}
        OPTIONAL {{ ?item wdt:P856 ?website . }}
        OPTIONAL {{ ?item wdt:P571 ?inception . }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,es,pt,fr,de" . }}
    }}
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse results into dict keyed by ISIL code
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            isil = binding.get("isil", {}).get("value")
            if not qid or not qid.startswith("Q") or not isil:
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal, e.g. "Point(13.38 52.51)" -> (lon, lat)
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            # If one ISIL yields several rows, the last row wins
            results[isil] = result
        return results
    except Exception as e:
        print(f"\n❌ Error: {e}")
        return {}
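

# A minimal hardening sketch (an addition, not part of the original flow):
# retry a failed batch with exponential backoff, since WDQS occasionally
# times out or throttles large VALUES queries. Note that an empty dict can
# also mean "no matches", so genuinely empty batches get retried too.
def query_wikidata_batch_with_retry(
    isil_codes: list[str], sparql: SPARQLWrapper, retries: int = 3
) -> dict[str, dict[str, Any]]:
    for attempt in range(retries):
        results = query_wikidata_batch(isil_codes, sparql)
        if results:
            return results
        time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return {}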


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched."""
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True
    # Add coordinates if missing
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True
    # Update provenance
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment (ISIL match)"
            else:
                prov["extraction_method"] = "Wikidata enrichment (ISIL match)"
    return enriched
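
# Illustrative call (hypothetical data, just to show the expected shapes):
#   inst = {"identifiers": [{"identifier_scheme": "ISIL", "identifier_value": "DE-1"}]}
#   wd_data = {"qid": "Q1234567", "identifiers": {"VIAF": "123456789"}}
#   enrich_institution(inst, wd_data)  # -> True; appends Wikidata + VIAF entries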


def main():
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    print("="*80)
    print("🚀 FAST WIKIDATA ENRICHMENT")
    print("="*80)
    print("\n📖 Loading dataset (this may take ~30 seconds)...\n")
    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")
    # Map each ISIL code to its institution's index (if a code appears on
    # several institutions, the last occurrence wins)
    isil_to_inst_idx = {}
    for idx, inst in enumerate(institutions):
        for ident in inst.get("identifiers", []):
            if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
                isil_code = ident.get("identifier_value")
                if isil_code:
                    isil_to_inst_idx[isil_code] = idx
    all_isil_codes = sorted(isil_to_inst_idx.keys())
    print(f"📋 Found {len(all_isil_codes):,} unique ISIL codes\n")
    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod(POST)
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")
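    # WDQS expects a descriptive User-Agent (see the Wikimedia User-Agent
    # policy: https://meta.wikimedia.org/wiki/User-Agent_policy); adding
    # contact info to the string above is recommended.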
    # Process in batches
    batch_size = 100
    total_batches = (len(all_isil_codes) - 1) // batch_size + 1
    print(f"🔍 Querying Wikidata in {total_batches} batches ({batch_size} codes/batch)...")
    print(f" Estimated time: {total_batches * 1.2 / 60:.1f} minutes\n")
    stats = {
        "queries": 0,
        "wikidata_results": 0,
        "enriched": 0,
        "new_wikidata_ids": 0,
        "new_viaf_ids": 0,
    }
    query_start = time.time()
    for i in range(0, len(all_isil_codes), batch_size):
        batch = all_isil_codes[i:i + batch_size]
        batch_num = i // batch_size + 1
        # Query Wikidata
        results = query_wikidata_batch(batch, sparql)
        stats["queries"] += 1
        stats["wikidata_results"] += len(results)
        # Enrich institutions
        for isil_code, wd_data in results.items():
            inst_idx = isil_to_inst_idx.get(isil_code)
            if inst_idx is not None:
                if enrich_institution(institutions[inst_idx], wd_data):
                    stats["enriched"] += 1
                    # Approximation: assumes each enrichment added a Wikidata ID
                    stats["new_wikidata_ids"] += 1
                    if "VIAF" in wd_data.get("identifiers", {}):
                        stats["new_viaf_ids"] += 1
        # Progress update
        elapsed = time.time() - query_start
        rate = batch_num / elapsed if elapsed > 0 else 0.0
        eta_seconds = (total_batches - batch_num) / rate if rate > 0 else 0
        eta_min = eta_seconds / 60
        print(f"\r Batch {batch_num:3d}/{total_batches} | "
              f"Results: {len(results):3d} | "
              f"Enriched: {stats['enriched']:5,} | "
              f"ETA: {eta_min:.1f}m", end='', flush=True)
        # Rate limiting
        time.sleep(1.0)
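        # Hedged addition (not in the original flow): drop a small JSON
        # checkpoint every 50 batches so an interrupted run can be diagnosed
        # or resumed by hand; the filename is an assumption.
        if batch_num % 50 == 0:
            checkpoint_path = base_dir / "wikidata_enrichment_checkpoint.json"
            with open(checkpoint_path, "w", encoding="utf-8") as cf:
                json.dump({"last_batch": batch_num, "stats": stats}, cf)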
print("\n\n💾 Writing enriched dataset...")
header = f"""---
# Global Heritage Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Wikidata queries: {stats['queries']}
# Wikidata results: {stats['wikidata_results']:,}
# Enriched institutions: {stats['enriched']:,}
# New Wikidata IDs: {stats['new_wikidata_ids']:,}
# New VIAF IDs: {stats['new_viaf_ids']:,}
"""
with open(output_file, 'w', encoding='utf-8') as f:
f.write(header)
yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
print(f"✅ Complete! Output: {output_file}\n")
# Final report
print("="*80)
print("📊 ENRICHMENT REPORT")
print("="*80)
print(f"\n✨ Results:")
print(f" Wikidata IDs added: {stats['new_wikidata_ids']:,} ({stats['new_wikidata_ids']/len(institutions)*100:.1f}%)")
print(f" VIAF IDs added: {stats['new_viaf_ids']:,}")
print(f" Total enriched: {stats['enriched']:,} institutions")
print(f"\n⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
print("="*80 + "\n")
if __name__ == "__main__":
main()