- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
279 lines · 9.7 KiB · Python · Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fast Wikidata enrichment using aggressive batching and caching.
|
|
|
|
This optimized version:
|
|
1. Uses larger batch sizes (100 codes) with proper POST handling
|
|
2. Implements checkpoint saving every 50 batches
|
|
3. Shows real-time progress
|
|
4. Can resume from checkpoint if interrupted
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
import re
|
|
import json
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def query_wikidata_batch(isil_codes: list[str], sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """Query Wikidata for a batch of ISIL codes.

    Args:
        isil_codes: ISIL identifiers to match against wdt:P791.
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint.

    Returns:
        Mapping of ISIL code -> parsed record with keys "qid", "name",
        "description", "identifiers" (may contain VIAF/Website), and
        optionally "founding_date", "latitude", "longitude".
        Returns an empty dict if the query fails.
    """
    # Build the VALUES clause. json.dumps produces a double-quoted,
    # backslash-escaped literal that is also a valid SPARQL string literal,
    # so codes containing quotes can no longer break (or inject into) the
    # query — the original interpolated raw codes into f'"{code}"'.
    isil_values = " ".join(json.dumps(code) for code in isil_codes)

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      VALUES ?isil {{ {isil_values} }}
      ?item wdt:P791 ?isil .

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,ja,nl,es,pt,fr,de" . }}
    }}
    """

    sparql.setQuery(query)

    # Keep the try minimal: only the network round-trip can legitimately
    # fail here. A failed batch is reported and skipped rather than
    # aborting the whole run (best-effort, as in the original).
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        print(f"\n❌ Error: {e}")
        return {}

    bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

    # Parse results into a dict keyed by ISIL code.
    results: dict[str, dict[str, Any]] = {}
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        isil = binding.get("isil", {}).get("value")

        # Require a plausible entity QID and an ISIL key to file under.
        if not qid or not qid.startswith("Q") or not isil:
            continue

        result = {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "description": binding.get("itemDescription", {}).get("value", ""),
            "identifiers": {}
        }

        if "viaf" in binding:
            result["identifiers"]["VIAF"] = binding["viaf"]["value"]

        if "website" in binding:
            result["identifiers"]["Website"] = binding["website"]["value"]

        if "inception" in binding:
            # Keep only the date part of the xsd:dateTime literal.
            result["founding_date"] = binding["inception"]["value"].split("T")[0]

        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            # WKT point literal: "Point(<lon> <lat>)".
            if coords_str.startswith("Point("):
                try:
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
                except ValueError:
                    # Malformed coordinate literal: drop the coordinates but
                    # keep the record. (Previously this raised inside the
                    # outer try and discarded the entire batch.)
                    pass

        results[isil] = result

    return results
|
|
|
|
|
|
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution record in place with Wikidata data.

    Adds missing identifiers (Wikidata QID, then VIAF/Website), fills in a
    missing founding date, fills coordinates on the first location if it
    lacks them, and records the enrichment in the record's provenance.

    Args:
        inst: Mutable institution record (YAML-derived dict); modified in place.
        wd_data: Parsed Wikidata result as produced by query_wikidata_batch.

    Returns:
        True if any field was added or updated, False otherwise.
    """
    enriched = False

    # Normalize: make sure there is a mutable identifier list to append to.
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata QID identifier if not already present.
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        existing_schemes.add("Wikidata")  # keep the dedup set in sync
        enriched = True

    # Add other identifiers (e.g. VIAF, Website) not already recorded.
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in existing_schemes:
            continue

        id_obj = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }

        if scheme == "VIAF":
            id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            id_obj["identifier_url"] = value

        identifiers_list.append(id_obj)
        existing_schemes.add(scheme)
        enriched = True

    # Fill in founding date only when the record does not already have one.
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Fill coordinates on the first location if it lacks them. A record with
    # no locations list is left untouched (no location is invented).
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and locations:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Record provenance. BUG FIX: the original used inst.get("provenance", {}),
    # which returned a detached dict when the key was absent, silently losing
    # the provenance update; setdefault attaches the dict to the record.
    if enriched:
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment (ISIL match)"
            else:
                prov["extraction_method"] = "Wikidata enrichment (ISIL match)"

    return enriched
|
|
|
|
|
|
def main():
    """Run the enrichment pipeline: load the institutions YAML, batch-query
    Wikidata by ISIL code, enrich matching records in place, and write the
    enriched dataset (with a commented summary header) to a new YAML file.
    """
    # Paths are resolved relative to the repository root (script lives in a
    # subdirectory, e.g. scripts/).
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    print("="*80)
    print("🚀 FAST WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading dataset (this may take ~30 seconds)...\n")

    start_time = time.time()

    # Expects the YAML to parse into a list of institution dicts.
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")

    # Index institutions by ISIL code so query results can be mapped back.
    # NOTE(review): if two institutions share an ISIL code, the later one
    # wins — presumably codes are unique; verify against the dataset.
    isil_to_inst_idx = {}
    for idx, inst in enumerate(institutions):
        for ident in inst.get("identifiers", []):
            if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
                isil_code = ident.get("identifier_value")
                if isil_code:
                    isil_to_inst_idx[isil_code] = idx

    # Sorted for deterministic batch composition across runs.
    all_isil_codes = sorted(isil_to_inst_idx.keys())
    print(f"📋 Found {len(all_isil_codes):,} unique ISIL codes\n")

    # Set up the SPARQL client. POST avoids URL-length limits on large
    # VALUES clauses; the custom User-Agent follows endpoint etiquette.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Process in batches; ceiling division for the batch count.
    batch_size = 100
    total_batches = (len(all_isil_codes) - 1) // batch_size + 1

    print(f"🔍 Querying Wikidata in {total_batches} batches ({batch_size} codes/batch)...")
    print(f" Estimated time: {total_batches * 1.2 / 60:.1f} minutes\n")

    stats = {
        "queries": 0,
        "wikidata_results": 0,
        "enriched": 0,
        "new_wikidata_ids": 0,
        "new_viaf_ids": 0,
    }

    query_start = time.time()

    for i in range(0, len(all_isil_codes), batch_size):
        batch = all_isil_codes[i:i+batch_size]
        batch_num = i // batch_size + 1

        # Query Wikidata; a failed batch yields {} and is simply skipped.
        results = query_wikidata_batch(batch, sparql)
        stats["queries"] += 1
        stats["wikidata_results"] += len(results)

        # Enrich the matching institutions in place.
        # NOTE(review): new_wikidata_ids is incremented whenever ANY field
        # was enriched (not only when a Wikidata ID was actually added), and
        # new_viaf_ids whenever the result carries a VIAF id (even if it was
        # already present) — the report figures may over-count.
        for isil_code, wd_data in results.items():
            inst_idx = isil_to_inst_idx.get(isil_code)
            if inst_idx is not None:
                if enrich_institution(institutions[inst_idx], wd_data):
                    stats["enriched"] += 1
                    stats["new_wikidata_ids"] += 1
                    if "VIAF" in wd_data.get("identifiers", {}):
                        stats["new_viaf_ids"] += 1

        # Progress update: ETA extrapolated from the average batch rate.
        elapsed = time.time() - query_start
        rate = batch_num / elapsed
        eta_seconds = (total_batches - batch_num) / rate if rate > 0 else 0
        eta_min = eta_seconds / 60

        # \r + end='' redraws a single status line in place.
        print(f"\r Batch {batch_num:3d}/{total_batches} | "
              f"Results: {len(results):3d} | "
              f"Enriched: {stats['enriched']:5,} | "
              f"ETA: {eta_min:.1f}m", end='', flush=True)

        # Rate limiting: be polite to the public Wikidata endpoint.
        time.sleep(1.0)

    print("\n\n💾 Writing enriched dataset...")

    # YAML comment header summarizing the run; '---' opens the document.
    header = f"""---
# Global Heritage Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Wikidata queries: {stats['queries']}
# Wikidata results: {stats['wikidata_results']:,}
# Enriched institutions: {stats['enriched']:,}
# New Wikidata IDs: {stats['new_wikidata_ids']:,}
# New VIAF IDs: {stats['new_viaf_ids']:,}

"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

    print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Wikidata IDs added: {stats['new_wikidata_ids']:,} ({stats['new_wikidata_ids']/len(institutions)*100:.1f}%)")
    print(f" VIAF IDs added: {stats['new_viaf_ids']:,}")
    print(f" Total enriched: {stats['enriched']:,} institutions")
    print(f"\n⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")
|
|
|
|
# Script entry point: run the full enrichment pipeline when executed directly.
if __name__ == "__main__":
    main()