glam/scripts/enrich_from_wikidata.py
2025-12-09 09:16:19 +01:00

444 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Enrich Latin American institutions with Wikidata identifiers and ISIL codes.
This script:
1. Queries Wikidata SPARQL endpoint for GLAM institutions in Brazil, Mexico, Chile
2. Fuzzy matches Wikidata results to our 304 existing institutions
3. Extracts Wikidata IDs, ISIL codes (P791), VIAF IDs (P214), and other identifiers
4. Updates the YAML dataset with enriched data
5. Generates a report on enrichment results
Usage:
python scripts/enrich_from_wikidata.py
Dependencies:
- SPARQLWrapper (for Wikidata queries)
- rapidfuzz (for fuzzy name matching)
- pyyaml (for YAML I/O)
"""
import sys
from pathlib import Path
from typing import Any
from datetime import datetime, timezone
import time
import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON # type: ignore
from rapidfuzz import fuzz, process # type: ignore
class WikidataEnricher:
"""Enrich heritage institutions with Wikidata identifiers."""
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Extractor/0.1 (https://github.com/yourusername/glam-extractor)"
# Country mappings
COUNTRIES = {
"BR": {"qid": "Q155", "name": "Brazil"},
"MX": {"qid": "Q96", "name": "Mexico"},
"CL": {"qid": "Q298", "name": "Chile"}
}
def __init__(self, input_file: Path, output_file: Path):
self.input_file = input_file
self.output_file = output_file
self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
self.sparql.setReturnFormat(JSON)
self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT) # type: ignore
# Statistics
self.stats = {
"total_institutions": 0,
"wikidata_queries": 0,
"wikidata_results": 0,
"matched_institutions": 0,
"new_wikidata_ids": 0,
"new_isil_codes": 0,
"new_viaf_ids": 0,
"fuzzy_matches": 0,
"no_matches": 0,
}
def build_sparql_query(self, country_qid: str) -> str:
"""Build SPARQL query for GLAM institutions in a country."""
return f"""
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?address ?website
WHERE {{
# Institution is located in the country
?item wdt:P17 wd:{country_qid} .
# Institution is one of our GLAM types
VALUES ?type {{
wd:Q7075 # library
wd:Q166118 # archive
wd:Q33506 # museum
wd:Q1007870 # art gallery
wd:Q28564 # public library
wd:Q11396180 # academic library
wd:Q207694 # art museum
wd:Q2772772 # history museum
wd:Q7140621 # cultural institution
wd:Q31855 # research institute
}}
?item wdt:P31 ?type .
# Optional identifiers
OPTIONAL {{ ?item wdt:P791 ?isil . }} # ISIL code
OPTIONAL {{ ?item wdt:P214 ?viaf . }} # VIAF ID
OPTIONAL {{ ?item wdt:P625 ?coords . }} # Coordinates
OPTIONAL {{ ?item wdt:P6375 ?address . }} # Address
OPTIONAL {{ ?item wdt:P856 ?website . }} # Official website
# Get labels
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,pt" . }}
}}
LIMIT 1000
"""
def query_wikidata(self, country_code: str) -> list[dict[str, Any]]:
"""Query Wikidata for institutions in a country."""
country_info = self.COUNTRIES[country_code]
print(f"\n🔍 Querying Wikidata for {country_info['name']} institutions...")
query = self.build_sparql_query(country_info["qid"])
self.sparql.setQuery(query)
try:
self.stats["wikidata_queries"] += 1
raw_results = self.sparql.query().convert() # type: ignore
bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
print(f" Found {len(bindings)} Wikidata institutions")
self.stats["wikidata_results"] += len(bindings)
# Parse results
institutions = []
for binding in bindings:
inst = self._parse_wikidata_result(binding, country_code)
if inst:
institutions.append(inst)
return institutions
except Exception as e:
print(f" ❌ Error querying Wikidata: {e}")
return []
def _parse_wikidata_result(self, binding: dict[str, Any], country_code: str) -> dict[str, Any] | None:
"""Parse a single Wikidata SPARQL result."""
try:
# Extract Wikidata QID from URI
item_uri = binding.get("item", {}).get("value", "")
qid = item_uri.split("/")[-1] if item_uri else None
if not qid or not qid.startswith("Q"):
return None
result: dict[str, Any] = {
"qid": qid,
"name": binding.get("itemLabel", {}).get("value", ""),
"description": binding.get("itemDescription", {}).get("value", ""),
"country": country_code,
"identifiers": {}
}
# Extract identifiers
if "isil" in binding:
result["identifiers"]["ISIL"] = binding["isil"]["value"]
if "viaf" in binding:
result["identifiers"]["VIAF"] = binding["viaf"]["value"]
if "website" in binding:
result["identifiers"]["Website"] = binding["website"]["value"]
# Extract location data
if "coords" in binding:
coords_str = binding["coords"]["value"]
# Parse "Point(lon lat)" format
if coords_str.startswith("Point("):
lon, lat = coords_str[6:-1].split()
result["latitude"] = float(lat)
result["longitude"] = float(lon)
if "address" in binding:
result["address"] = binding["address"]["value"]
return result
except Exception as e:
print(f" ⚠️ Error parsing Wikidata result: {e}")
return None
def fuzzy_match_institution(
self,
wikidata_inst: dict[str, Any],
our_institutions: list[dict[str, Any]]
) -> tuple[dict[str, Any], float] | None:
"""
Fuzzy match a Wikidata institution to our dataset.
Returns: (matched_institution, confidence_score) or None
"""
wikidata_name = wikidata_inst["name"]
country = wikidata_inst["country"]
# Filter candidates by country
candidates = [
inst for inst in our_institutions
if any(loc.get("country") == country for loc in inst.get("locations", []))
]
if not candidates:
return None
# Extract names for fuzzy matching
candidate_names = [(inst, inst.get("name", "")) for inst in candidates]
choice_names = [name for _, name in candidate_names]
# Use rapidfuzz to find best match
best_match = process.extractOne(
wikidata_name,
choice_names,
scorer=fuzz.token_sort_ratio
)
if not best_match:
return None
matched_name, score, _ = best_match
# Require minimum 80% match
if score < 80:
return None
# Find the institution object
matched_inst = next(
inst for inst, name in candidate_names if name == matched_name
)
return (matched_inst, score / 100.0)
def enrich_institution(
self,
institution: dict[str, Any],
wikidata_inst: dict[str, Any],
match_confidence: float
) -> bool:
"""
Enrich an institution with Wikidata data.
Returns: True if any new data was added
"""
enriched = False
# Ensure identifiers list exists
if "identifiers" not in institution or not institution["identifiers"]:
institution["identifiers"] = []
identifiers_list = institution["identifiers"]
existing_schemes = {
ident.get("identifier_scheme", "")
for ident in identifiers_list
if isinstance(ident, dict)
}
# Add Wikidata ID
if "Wikidata" not in existing_schemes:
identifiers_list.append({
"identifier_scheme": "Wikidata",
"identifier_value": wikidata_inst["qid"],
"identifier_url": f"https://www.wikidata.org/wiki/{wikidata_inst['qid']}"
})
self.stats["new_wikidata_ids"] += 1
enriched = True
# Add other identifiers from Wikidata
wd_identifiers = wikidata_inst.get("identifiers", {})
if isinstance(wd_identifiers, dict):
for scheme, value in wd_identifiers.items():
if scheme not in existing_schemes:
id_obj: dict[str, Any] = {
"identifier_scheme": scheme,
"identifier_value": value
}
# Add URLs for known schemes
if scheme == "ISIL":
# ISIL codes don't have a universal URL - identifier_value only
self.stats["new_isil_codes"] += 1
elif scheme == "VIAF":
id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
self.stats["new_viaf_ids"] += 1
elif scheme == "Website":
id_obj["identifier_url"] = value
identifiers_list.append(id_obj)
enriched = True
# Add location data if missing
if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
locations = institution.get("locations", [])
if isinstance(locations, list) and len(locations) > 0:
first_loc = locations[0]
if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
first_loc["latitude"] = wikidata_inst["latitude"]
first_loc["longitude"] = wikidata_inst["longitude"]
enriched = True
# Update provenance
if enriched:
prov = institution.get("provenance", {})
if isinstance(prov, dict):
existing_method = prov.get("extraction_method", "")
prov["extraction_method"] = (
existing_method +
f" + Wikidata enrichment (match confidence: {match_confidence:.2f})"
)
return enriched
def run(self) -> None:
"""Run the complete enrichment workflow."""
print("🚀 Starting Wikidata enrichment for Latin American institutions\n")
print(f" Input: {self.input_file}")
print(f" Output: {self.output_file}")
# Load existing dataset
print("\n📖 Loading existing dataset...")
with open(self.input_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
if not isinstance(institutions, list):
raise ValueError("Expected YAML file to contain a list of institutions")
self.stats["total_institutions"] = len(institutions)
print(f" Loaded {len(institutions)} institutions")
# Query Wikidata for each country
all_wikidata = []
for country_code in ["BR", "MX", "CL"]:
wikidata_results = self.query_wikidata(country_code)
all_wikidata.extend(wikidata_results)
time.sleep(1) # Rate limiting
print(f"\n📊 Total Wikidata results: {len(all_wikidata)}")
# Match and enrich
print("\n🔗 Matching Wikidata institutions to our dataset...")
for wikidata_inst in all_wikidata:
match_result = self.fuzzy_match_institution(wikidata_inst, institutions)
if match_result:
matched_inst, confidence = match_result
print(f" ✅ Matched: '{wikidata_inst['name']}''{matched_inst['name']}' ({confidence:.0%})")
if self.enrich_institution(matched_inst, wikidata_inst, confidence):
self.stats["matched_institutions"] += 1
if confidence < 0.95:
self.stats["fuzzy_matches"] += 1
else:
self.stats["no_matches"] += 1
# Write enriched dataset
print(f"\n💾 Writing enriched dataset to {self.output_file}...")
# Add header comment
header = f"""---
# Latin American GLAM Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Enrichment run: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']}
# Matched institutions: {self.stats['matched_institutions']}
# New Wikidata IDs: {self.stats['new_wikidata_ids']}
# New ISIL codes: {self.stats['new_isil_codes']}
# New VIAF IDs: {self.stats['new_viaf_ids']}
"""
with open(self.output_file, 'w', encoding='utf-8') as f:
f.write(header)
yaml.dump(
institutions,
f,
allow_unicode=True,
default_flow_style=False,
sort_keys=False,
width=120
)
# Print final statistics
self._print_report()
def _print_report(self) -> None:
"""Print enrichment report."""
print("\n" + "="*70)
print("📊 WIKIDATA ENRICHMENT REPORT")
print("="*70)
print(f"\n📚 Dataset Statistics:")
print(f" Total institutions in dataset: {self.stats['total_institutions']}")
print(f" Wikidata queries executed: {self.stats['wikidata_queries']}")
print(f" Wikidata institutions found: {self.stats['wikidata_results']}")
print(f"\n🔗 Matching Results:")
print(f" Successfully matched: {self.stats['matched_institutions']} ({self.stats['matched_institutions']/self.stats['total_institutions']*100:.1f}%)")
print(f" Fuzzy matches (< 95% confidence): {self.stats['fuzzy_matches']}")
print(f" No matches: {self.stats['no_matches']}")
print(f"\n✨ New Identifiers Added:")
print(f" Wikidata IDs: {self.stats['new_wikidata_ids']}")
print(f" ISIL codes: {self.stats['new_isil_codes']}")
print(f" VIAF IDs: {self.stats['new_viaf_ids']}")
print(f"\n💡 Next Steps:")
if self.stats['new_isil_codes'] > 0:
print(f" ✅ Found {self.stats['new_isil_codes']} ISIL codes from Wikidata!")
else:
print(f" ⚠️ No ISIL codes found in Wikidata")
print(f" → Proceed with national library outreach strategy")
if self.stats['fuzzy_matches'] > 0:
print(f" ⚠️ Review {self.stats['fuzzy_matches']} fuzzy matches manually")
print("\n" + "="*70 + "\n")
def main():
    """Entry point: resolve dataset paths, then run the enricher.

    Exits with status 1 on a missing input file, user interrupt, or any
    error raised during enrichment.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    src = instances_dir / "latin_american_institutions.yaml"
    dst = instances_dir / "latin_american_institutions_enriched.yaml"

    # Guard clause: bail out early if the dataset is not where we expect it.
    if not src.exists():
        print(f"❌ Error: Input file not found: {src}")
        sys.exit(1)

    try:
        WikidataEnricher(src, dst).run()
        print("✅ Enrichment complete!")
    except KeyboardInterrupt:
        print("\n\n⚠️ Enrichment interrupted by user")
        sys.exit(1)
    except Exception as err:
        print(f"\n❌ Error during enrichment: {err}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()