#!/usr/bin/env python3
"""
Enrich Latin American institutions with Wikidata identifiers and ISIL codes.

This script:
1. Queries the Wikidata SPARQL endpoint for GLAM institutions in Brazil, Mexico, and Chile
2. Fuzzy-matches Wikidata results to our 304 existing institutions
3. Extracts Wikidata IDs, ISIL codes (P791), VIAF IDs (P214), and other identifiers
4. Updates the YAML dataset with the enriched data
5. Generates a report on enrichment results

Usage:
    python scripts/enrich_from_wikidata.py

Dependencies:
    - SPARQLWrapper (for Wikidata queries)
    - rapidfuzz (for fuzzy name matching)
    - pyyaml (for YAML I/O)
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON # type: ignore
|
|
from rapidfuzz import fuzz, process # type: ignore
|
|
|
|
|
|
class WikidataEnricher:
    """Enrich heritage institutions with Wikidata identifiers."""

    WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
    USER_AGENT = "GLAM-Extractor/0.1 (https://github.com/yourusername/glam-extractor)"

    # Country mappings: ISO 3166-1 alpha-2 code -> Wikidata QID and display name.
    COUNTRIES = {
        "BR": {"qid": "Q155", "name": "Brazil"},
        "MX": {"qid": "Q96", "name": "Mexico"},
        "CL": {"qid": "Q298", "name": "Chile"}
    }

    def __init__(self, input_file: Path, output_file: Path):
        """Set up the SPARQL client and zero out run statistics.

        Args:
            input_file: YAML file holding the existing institution records.
            output_file: Destination for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file

        # One SPARQL client is configured up front and reused for every
        # per-country query issued by query_wikidata().
        self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
        self.sparql.setReturnFormat(JSON)
        self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT)  # type: ignore

        # Counters reported by _print_report at the end of a run.
        self.stats = dict.fromkeys(
            (
                "total_institutions",
                "wikidata_queries",
                "wikidata_results",
                "matched_institutions",
                "new_wikidata_ids",
                "new_isil_codes",
                "new_viaf_ids",
                "fuzzy_matches",
                "no_matches",
            ),
            0,
        )
|
|
|
|
    def build_sparql_query(self, country_qid: str) -> str:
        """Build a SPARQL query for GLAM institutions in one country.

        Args:
            country_qid: Wikidata QID of the country (e.g. "Q155" for Brazil).

        Returns:
            A SPARQL SELECT returning up to 1000 rows: the item, its label and
            description (en/es/pt fallback), and optional ISIL, VIAF,
            coordinate, address and website values.

        NOTE(review): the query matches direct P31 types only (no P279*
        subclass closure), and the unaggregated OPTIONALs can produce
        multiple rows per item — confirm both are intended before relying
        on the result counts.
        """
        return f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?address ?website
        WHERE {{
          # Institution is located in the country
          ?item wdt:P17 wd:{country_qid} .

          # Institution is one of our GLAM types
          VALUES ?type {{
            wd:Q7075      # library
            wd:Q166118    # archive
            wd:Q33506     # museum
            wd:Q1007870   # art gallery
            wd:Q28564     # public library
            wd:Q11396180  # academic library
            wd:Q207694    # art museum
            wd:Q2772772   # history museum
            wd:Q7140621   # cultural institution
            wd:Q31855     # research institute
          }}
          ?item wdt:P31 ?type .

          # Optional identifiers
          OPTIONAL {{ ?item wdt:P791 ?isil . }}      # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}      # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}    # Coordinates
          OPTIONAL {{ ?item wdt:P6375 ?address . }}  # Address
          OPTIONAL {{ ?item wdt:P856 ?website . }}   # Official website

          # Get labels
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,es,pt" . }}
        }}
        LIMIT 1000
        """
|
|
|
|
def query_wikidata(self, country_code: str) -> list[dict[str, Any]]:
|
|
"""Query Wikidata for institutions in a country."""
|
|
country_info = self.COUNTRIES[country_code]
|
|
print(f"\n🔍 Querying Wikidata for {country_info['name']} institutions...")
|
|
|
|
query = self.build_sparql_query(country_info["qid"])
|
|
self.sparql.setQuery(query)
|
|
|
|
try:
|
|
self.stats["wikidata_queries"] += 1
|
|
raw_results = self.sparql.query().convert() # type: ignore
|
|
bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
|
|
|
|
print(f" Found {len(bindings)} Wikidata institutions")
|
|
self.stats["wikidata_results"] += len(bindings)
|
|
|
|
# Parse results
|
|
institutions = []
|
|
for binding in bindings:
|
|
inst = self._parse_wikidata_result(binding, country_code)
|
|
if inst:
|
|
institutions.append(inst)
|
|
|
|
return institutions
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Error querying Wikidata: {e}")
|
|
return []
|
|
|
|
def _parse_wikidata_result(self, binding: dict[str, Any], country_code: str) -> dict[str, Any] | None:
|
|
"""Parse a single Wikidata SPARQL result."""
|
|
try:
|
|
# Extract Wikidata QID from URI
|
|
item_uri = binding.get("item", {}).get("value", "")
|
|
qid = item_uri.split("/")[-1] if item_uri else None
|
|
|
|
if not qid or not qid.startswith("Q"):
|
|
return None
|
|
|
|
result: dict[str, Any] = {
|
|
"qid": qid,
|
|
"name": binding.get("itemLabel", {}).get("value", ""),
|
|
"description": binding.get("itemDescription", {}).get("value", ""),
|
|
"country": country_code,
|
|
"identifiers": {}
|
|
}
|
|
|
|
# Extract identifiers
|
|
if "isil" in binding:
|
|
result["identifiers"]["ISIL"] = binding["isil"]["value"]
|
|
|
|
if "viaf" in binding:
|
|
result["identifiers"]["VIAF"] = binding["viaf"]["value"]
|
|
|
|
if "website" in binding:
|
|
result["identifiers"]["Website"] = binding["website"]["value"]
|
|
|
|
# Extract location data
|
|
if "coords" in binding:
|
|
coords_str = binding["coords"]["value"]
|
|
# Parse "Point(lon lat)" format
|
|
if coords_str.startswith("Point("):
|
|
lon, lat = coords_str[6:-1].split()
|
|
result["latitude"] = float(lat)
|
|
result["longitude"] = float(lon)
|
|
|
|
if "address" in binding:
|
|
result["address"] = binding["address"]["value"]
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
print(f" ⚠️ Error parsing Wikidata result: {e}")
|
|
return None
|
|
|
|
def fuzzy_match_institution(
|
|
self,
|
|
wikidata_inst: dict[str, Any],
|
|
our_institutions: list[dict[str, Any]]
|
|
) -> tuple[dict[str, Any], float] | None:
|
|
"""
|
|
Fuzzy match a Wikidata institution to our dataset.
|
|
|
|
Returns: (matched_institution, confidence_score) or None
|
|
"""
|
|
wikidata_name = wikidata_inst["name"]
|
|
country = wikidata_inst["country"]
|
|
|
|
# Filter candidates by country
|
|
candidates = [
|
|
inst for inst in our_institutions
|
|
if any(loc.get("country") == country for loc in inst.get("locations", []))
|
|
]
|
|
|
|
if not candidates:
|
|
return None
|
|
|
|
# Extract names for fuzzy matching
|
|
candidate_names = [(inst, inst.get("name", "")) for inst in candidates]
|
|
choice_names = [name for _, name in candidate_names]
|
|
|
|
# Use rapidfuzz to find best match
|
|
best_match = process.extractOne(
|
|
wikidata_name,
|
|
choice_names,
|
|
scorer=fuzz.token_sort_ratio
|
|
)
|
|
|
|
if not best_match:
|
|
return None
|
|
|
|
matched_name, score, _ = best_match
|
|
|
|
# Require minimum 80% match
|
|
if score < 80:
|
|
return None
|
|
|
|
# Find the institution object
|
|
matched_inst = next(
|
|
inst for inst, name in candidate_names if name == matched_name
|
|
)
|
|
|
|
return (matched_inst, score / 100.0)
|
|
|
|
def enrich_institution(
|
|
self,
|
|
institution: dict[str, Any],
|
|
wikidata_inst: dict[str, Any],
|
|
match_confidence: float
|
|
) -> bool:
|
|
"""
|
|
Enrich an institution with Wikidata data.
|
|
|
|
Returns: True if any new data was added
|
|
"""
|
|
enriched = False
|
|
|
|
# Ensure identifiers list exists
|
|
if "identifiers" not in institution or not institution["identifiers"]:
|
|
institution["identifiers"] = []
|
|
|
|
identifiers_list = institution["identifiers"]
|
|
|
|
existing_schemes = {
|
|
ident.get("identifier_scheme", "")
|
|
for ident in identifiers_list
|
|
if isinstance(ident, dict)
|
|
}
|
|
|
|
# Add Wikidata ID
|
|
if "Wikidata" not in existing_schemes:
|
|
identifiers_list.append({
|
|
"identifier_scheme": "Wikidata",
|
|
"identifier_value": wikidata_inst["qid"],
|
|
"identifier_url": f"https://www.wikidata.org/wiki/{wikidata_inst['qid']}"
|
|
})
|
|
self.stats["new_wikidata_ids"] += 1
|
|
enriched = True
|
|
|
|
# Add other identifiers from Wikidata
|
|
wd_identifiers = wikidata_inst.get("identifiers", {})
|
|
if isinstance(wd_identifiers, dict):
|
|
for scheme, value in wd_identifiers.items():
|
|
if scheme not in existing_schemes:
|
|
id_obj: dict[str, Any] = {
|
|
"identifier_scheme": scheme,
|
|
"identifier_value": value
|
|
}
|
|
|
|
# Add URLs for known schemes
|
|
if scheme == "ISIL":
|
|
# ISIL codes don't have a universal URL - identifier_value only
|
|
self.stats["new_isil_codes"] += 1
|
|
elif scheme == "VIAF":
|
|
id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
|
|
self.stats["new_viaf_ids"] += 1
|
|
elif scheme == "Website":
|
|
id_obj["identifier_url"] = value
|
|
|
|
identifiers_list.append(id_obj)
|
|
enriched = True
|
|
|
|
# Add location data if missing
|
|
if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
|
|
locations = institution.get("locations", [])
|
|
if isinstance(locations, list) and len(locations) > 0:
|
|
first_loc = locations[0]
|
|
if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
|
|
first_loc["latitude"] = wikidata_inst["latitude"]
|
|
first_loc["longitude"] = wikidata_inst["longitude"]
|
|
enriched = True
|
|
|
|
# Update provenance
|
|
if enriched:
|
|
prov = institution.get("provenance", {})
|
|
if isinstance(prov, dict):
|
|
existing_method = prov.get("extraction_method", "")
|
|
prov["extraction_method"] = (
|
|
existing_method +
|
|
f" + Wikidata enrichment (match confidence: {match_confidence:.2f})"
|
|
)
|
|
|
|
return enriched
|
|
|
|
    def run(self) -> None:
        """Run the complete enrichment workflow.

        Steps: load the YAML dataset, query Wikidata once per country,
        fuzzy-match each Wikidata result onto a dataset record, merge new
        identifiers/coordinates in place, write the enriched YAML (prefixed
        with a stats header), and print a summary report.

        Raises:
            ValueError: if the input YAML is not a list of institutions.
        """
        print("🚀 Starting Wikidata enrichment for Latin American institutions\n")
        print(f" Input: {self.input_file}")
        print(f" Output: {self.output_file}")

        # Load existing dataset
        print("\n📖 Loading existing dataset...")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)

        if not isinstance(institutions, list):
            raise ValueError("Expected YAML file to contain a list of institutions")

        self.stats["total_institutions"] = len(institutions)
        print(f" Loaded {len(institutions)} institutions")

        # Query Wikidata for each country
        all_wikidata = []
        for country_code in ["BR", "MX", "CL"]:
            wikidata_results = self.query_wikidata(country_code)
            all_wikidata.extend(wikidata_results)
            # Rate limiting: be polite to the public SPARQL endpoint.
            time.sleep(1)

        print(f"\n📊 Total Wikidata results: {len(all_wikidata)}")

        # Match and enrich. Records are mutated in place, so the same
        # `institutions` list is written back out below.
        print("\n🔗 Matching Wikidata institutions to our dataset...")

        for wikidata_inst in all_wikidata:
            match_result = self.fuzzy_match_institution(wikidata_inst, institutions)

            if match_result:
                matched_inst, confidence = match_result
                print(f" ✅ Matched: '{wikidata_inst['name']}' → '{matched_inst['name']}' ({confidence:.0%})")

                # NOTE(review): "matched_institutions" counts matches that
                # actually added data, while "fuzzy_matches" counts all
                # sub-95% matches regardless — confirm this asymmetry is
                # intended.
                if self.enrich_institution(matched_inst, wikidata_inst, confidence):
                    self.stats["matched_institutions"] += 1

                if confidence < 0.95:
                    self.stats["fuzzy_matches"] += 1
            else:
                self.stats["no_matches"] += 1

        # Write enriched dataset
        print(f"\n💾 Writing enriched dataset to {self.output_file}...")

        # Header is a YAML document marker plus comment lines, so the output
        # file remains valid YAML with the stats readable at the top.
        header = f"""---
# Latin American GLAM Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Enrichment run: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']}
# Matched institutions: {self.stats['matched_institutions']}
# New Wikidata IDs: {self.stats['new_wikidata_ids']}
# New ISIL codes: {self.stats['new_isil_codes']}
# New VIAF IDs: {self.stats['new_viaf_ids']}

"""

        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(
                institutions,
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=120
            )

        # Print final statistics
        self._print_report()
|
|
|
|
def _print_report(self) -> None:
|
|
"""Print enrichment report."""
|
|
print("\n" + "="*70)
|
|
print("📊 WIKIDATA ENRICHMENT REPORT")
|
|
print("="*70)
|
|
print(f"\n📚 Dataset Statistics:")
|
|
print(f" Total institutions in dataset: {self.stats['total_institutions']}")
|
|
print(f" Wikidata queries executed: {self.stats['wikidata_queries']}")
|
|
print(f" Wikidata institutions found: {self.stats['wikidata_results']}")
|
|
|
|
print(f"\n🔗 Matching Results:")
|
|
print(f" Successfully matched: {self.stats['matched_institutions']} ({self.stats['matched_institutions']/self.stats['total_institutions']*100:.1f}%)")
|
|
print(f" Fuzzy matches (< 95% confidence): {self.stats['fuzzy_matches']}")
|
|
print(f" No matches: {self.stats['no_matches']}")
|
|
|
|
print(f"\n✨ New Identifiers Added:")
|
|
print(f" Wikidata IDs: {self.stats['new_wikidata_ids']}")
|
|
print(f" ISIL codes: {self.stats['new_isil_codes']}")
|
|
print(f" VIAF IDs: {self.stats['new_viaf_ids']}")
|
|
|
|
print(f"\n💡 Next Steps:")
|
|
if self.stats['new_isil_codes'] > 0:
|
|
print(f" ✅ Found {self.stats['new_isil_codes']} ISIL codes from Wikidata!")
|
|
else:
|
|
print(f" ⚠️ No ISIL codes found in Wikidata")
|
|
print(f" → Proceed with national library outreach strategy")
|
|
|
|
if self.stats['fuzzy_matches'] > 0:
|
|
print(f" ⚠️ Review {self.stats['fuzzy_matches']} fuzzy matches manually")
|
|
|
|
print("\n" + "="*70 + "\n")
|
|
|
|
|
|
def main():
    """Main entry point."""
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / "data" / "instances"
    input_file = instances_dir / "latin_american_institutions.yaml"
    output_file = instances_dir / "latin_american_institutions_enriched.yaml"

    # Fail fast with a clear message when the dataset is missing.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        sys.exit(1)

    enricher = WikidataEnricher(input_file, output_file)

    try:
        enricher.run()
        print("✅ Enrichment complete!")
    except KeyboardInterrupt:
        print("\n\n⚠️ Enrichment interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error during enrichment: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|