#!/usr/bin/env python3
"""Enrich Latin American institutions with Wikidata identifiers and ISIL codes.

This script:
1. Queries the Wikidata SPARQL endpoint for GLAM institutions in Brazil,
   Mexico, and Chile
2. Fuzzy matches Wikidata results to our existing institutions
3. Extracts Wikidata IDs, ISIL codes (P791), VIAF IDs (P214), and other
   identifiers
4. Updates the YAML dataset with enriched data
5. Generates a report on enrichment results

Usage:
    python scripts/enrich_from_wikidata.py

Dependencies:
    - SPARQLWrapper (for Wikidata queries)
    - rapidfuzz (for fuzzy name matching)
    - pyyaml (for YAML I/O)
"""

import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import JSON, SPARQLWrapper  # type: ignore
from rapidfuzz import fuzz, process  # type: ignore


class WikidataEnricher:
    """Enrich heritage institutions with Wikidata identifiers.

    Loads a YAML list of institutions, queries Wikidata per country,
    fuzzy-matches the results against the dataset by name, merges any new
    identifiers / coordinates into the matched records, and writes an
    enriched YAML file plus a console report.
    """

    WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
    USER_AGENT = "GLAM-Extractor/0.1 (https://github.com/yourusername/glam-extractor)"

    # Country code -> Wikidata QID and display name.
    COUNTRIES = {
        "BR": {"qid": "Q155", "name": "Brazil"},
        "MX": {"qid": "Q96", "name": "Mexico"},
        "CL": {"qid": "Q298", "name": "Chile"},
    }

    # Minimum rapidfuzz score (0-100) required to accept a name match.
    MIN_MATCH_SCORE = 80

    def __init__(self, input_file: Path, output_file: Path) -> None:
        """Configure the SPARQL client and zero the run statistics.

        Args:
            input_file: YAML file containing a list of institution dicts.
            output_file: Destination for the enriched YAML dataset.
        """
        self.input_file = input_file
        self.output_file = output_file

        self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
        self.sparql.setReturnFormat(JSON)
        # Wikidata requires a descriptive User-Agent for programmatic access.
        self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT)  # type: ignore

        # Statistics accumulated over one run() and printed in the report.
        self.stats = {
            "total_institutions": 0,
            "wikidata_queries": 0,
            "wikidata_results": 0,
            "matched_institutions": 0,
            "new_wikidata_ids": 0,
            "new_isil_codes": 0,
            "new_viaf_ids": 0,
            "fuzzy_matches": 0,
            "no_matches": 0,
        }

    def build_sparql_query(self, country_qid: str) -> str:
        """Build a SPARQL query for GLAM institutions in one country.

        Args:
            country_qid: Wikidata QID of the country (e.g. "Q155").

        Returns:
            A SPARQL SELECT query string (capped at 1000 rows).
        """
        return f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?address ?website
        WHERE {{
          # Institution is located in the country
          ?item wdt:P17 wd:{country_qid} .

          # Institution is one of our GLAM types
          VALUES ?type {{
            wd:Q7075       # library
            wd:Q166118     # archive
            wd:Q33506      # museum
            wd:Q1007870    # art gallery
            wd:Q28564      # public library
            wd:Q11396180   # academic library
            wd:Q207694     # art museum
            wd:Q2772772    # history museum
            wd:Q7140621    # cultural institution
            wd:Q31855      # research institute
          }}
          ?item wdt:P31 ?type .

          # Optional identifiers
          OPTIONAL {{ ?item wdt:P791 ?isil . }}      # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}      # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}    # Coordinates
          OPTIONAL {{ ?item wdt:P6375 ?address . }}  # Address
          OPTIONAL {{ ?item wdt:P856 ?website . }}   # Official website

          # Get labels
          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en,es,pt" .
          }}
        }}
        LIMIT 1000
        """

    def query_wikidata(self, country_code: str) -> list[dict[str, Any]]:
        """Query Wikidata for institutions in a country.

        Args:
            country_code: One of the keys of COUNTRIES ("BR", "MX", "CL").

        Returns:
            Parsed institution dicts; empty list on query failure.
        """
        country_info = self.COUNTRIES[country_code]
        print(f"\nšŸ” Querying Wikidata for {country_info['name']} institutions...")

        query = self.build_sparql_query(country_info["qid"])
        self.sparql.setQuery(query)

        try:
            self.stats["wikidata_queries"] += 1
            raw_results = self.sparql.query().convert()  # type: ignore
            bindings = (
                raw_results.get("results", {}).get("bindings", [])
                if isinstance(raw_results, dict)
                else []
            )
            print(f"   Found {len(bindings)} Wikidata institutions")
            self.stats["wikidata_results"] += len(bindings)

            # Parse results, dropping rows that cannot be interpreted.
            institutions = []
            for binding in bindings:
                inst = self._parse_wikidata_result(binding, country_code)
                if inst:
                    institutions.append(inst)
            return institutions
        except Exception as e:
            # Best-effort: a failed country query should not abort the run.
            print(f"   āŒ Error querying Wikidata: {e}")
            return []

    def _parse_wikidata_result(
        self, binding: dict[str, Any], country_code: str
    ) -> dict[str, Any] | None:
        """Parse a single Wikidata SPARQL result binding.

        Args:
            binding: One row of the SPARQL JSON "bindings" array.
            country_code: Country code to tag the parsed record with.

        Returns:
            A normalized institution dict, or None if the row is unusable.
        """
        try:
            # Extract Wikidata QID from the entity URI
            # (e.g. "http://www.wikidata.org/entity/Q123" -> "Q123").
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                return None

            result: dict[str, Any] = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "country": country_code,
                "identifiers": {},
            }

            # Extract identifiers
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]

            # Extract location data
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # Parse WKT "Point(lon lat)" format (longitude first).
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            if "address" in binding:
                result["address"] = binding["address"]["value"]

            return result
        except Exception as e:
            print(f"   āš ļø  Error parsing Wikidata result: {e}")
            return None

    def fuzzy_match_institution(
        self,
        wikidata_inst: dict[str, Any],
        our_institutions: list[dict[str, Any]],
    ) -> tuple[dict[str, Any], float] | None:
        """Fuzzy match a Wikidata institution to our dataset.

        Candidates are pre-filtered to institutions with a location in the
        same country, then matched by name with token-sort ratio.

        Returns:
            (matched_institution, confidence in [0, 1]) or None if no
            candidate scores at least MIN_MATCH_SCORE.
        """
        wikidata_name = wikidata_inst["name"]
        country = wikidata_inst["country"]

        # Filter candidates by country
        candidates = [
            inst
            for inst in our_institutions
            if any(
                loc.get("country") == country for loc in inst.get("locations", [])
            )
        ]
        if not candidates:
            return None

        # Extract names for fuzzy matching
        candidate_names = [(inst, inst.get("name", "")) for inst in candidates]
        choice_names = [name for _, name in candidate_names]

        # token_sort_ratio is word-order insensitive, which suits
        # institution names ("Museo Nacional de X" vs "X, Museo Nacional de").
        best_match = process.extractOne(
            wikidata_name, choice_names, scorer=fuzz.token_sort_ratio
        )
        if not best_match:
            return None

        matched_name, score, _ = best_match
        if score < self.MIN_MATCH_SCORE:
            return None

        # Map the winning name back to its institution object.
        matched_inst = next(
            inst for inst, name in candidate_names if name == matched_name
        )
        return (matched_inst, score / 100.0)

    def enrich_institution(
        self,
        institution: dict[str, Any],
        wikidata_inst: dict[str, Any],
        match_confidence: float,
    ) -> bool:
        """Enrich an institution in place with Wikidata data.

        Adds identifiers not already present (by scheme), fills in missing
        coordinates on the first location, and appends an enrichment note
        to the provenance extraction_method.

        Returns:
            True if any new data was added.
        """
        enriched = False

        # Normalize identifiers to a list; a missing, empty, or malformed
        # value would otherwise break the .append calls below.
        if not isinstance(institution.get("identifiers"), list):
            institution["identifiers"] = []
        identifiers_list = institution["identifiers"]
        existing_schemes = {
            ident.get("identifier_scheme", "")
            for ident in identifiers_list
            if isinstance(ident, dict)
        }

        # Add Wikidata ID
        if "Wikidata" not in existing_schemes:
            identifiers_list.append({
                "identifier_scheme": "Wikidata",
                "identifier_value": wikidata_inst["qid"],
                "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_inst['qid']}",
            })
            self.stats["new_wikidata_ids"] += 1
            enriched = True

        # Add other identifiers from Wikidata
        wd_identifiers = wikidata_inst.get("identifiers", {})
        if isinstance(wd_identifiers, dict):
            for scheme, value in wd_identifiers.items():
                if scheme not in existing_schemes:
                    id_obj: dict[str, Any] = {
                        "identifier_scheme": scheme,
                        "identifier_value": value,
                    }
                    # Add URLs for known schemes
                    if scheme == "ISIL":
                        # ISIL codes don't have a universal URL - identifier_value only
                        self.stats["new_isil_codes"] += 1
                    elif scheme == "VIAF":
                        id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
                        self.stats["new_viaf_ids"] += 1
                    elif scheme == "Website":
                        id_obj["identifier_url"] = value
                    identifiers_list.append(id_obj)
                    enriched = True

        # Add location data if missing (never overwrite existing coords).
        if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
            locations = institution.get("locations", [])
            if isinstance(locations, list) and len(locations) > 0:
                first_loc = locations[0]
                if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                    first_loc["latitude"] = wikidata_inst["latitude"]
                    first_loc["longitude"] = wikidata_inst["longitude"]
                    enriched = True

        # Update provenance so the enrichment is auditable.
        if enriched:
            prov = institution.get("provenance", {})
            if isinstance(prov, dict):
                existing_method = prov.get("extraction_method", "")
                prov["extraction_method"] = (
                    existing_method
                    + f" + Wikidata enrichment (match confidence: {match_confidence:.2f})"
                )

        return enriched

    def run(self) -> None:
        """Run the complete enrichment workflow."""
        print("šŸš€ Starting Wikidata enrichment for Latin American institutions\n")
        print(f"   Input: {self.input_file}")
        print(f"   Output: {self.output_file}")

        # Load existing dataset
        print("\nšŸ“– Loading existing dataset...")
        with open(self.input_file, "r", encoding="utf-8") as f:
            institutions = yaml.safe_load(f)
        if not isinstance(institutions, list):
            raise ValueError("Expected YAML file to contain a list of institutions")
        self.stats["total_institutions"] = len(institutions)
        print(f"   Loaded {len(institutions)} institutions")

        # Query Wikidata for each country
        all_wikidata = []
        for country_code in ["BR", "MX", "CL"]:
            wikidata_results = self.query_wikidata(country_code)
            all_wikidata.extend(wikidata_results)
            time.sleep(1)  # Rate limiting: be polite to the public endpoint
        print(f"\nšŸ“Š Total Wikidata results: {len(all_wikidata)}")

        # Match and enrich
        print("\nšŸ”— Matching Wikidata institutions to our dataset...")
        for wikidata_inst in all_wikidata:
            match_result = self.fuzzy_match_institution(wikidata_inst, institutions)
            if match_result:
                matched_inst, confidence = match_result
                print(
                    f"   āœ… Matched: '{wikidata_inst['name']}' → "
                    f"'{matched_inst['name']}' ({confidence:.0%})"
                )
                # Count every accepted match, whether or not it yields new
                # data — otherwise already-enriched institutions vanish
                # from the "Successfully matched" report line.
                self.stats["matched_institutions"] += 1
                if confidence < 0.95:
                    self.stats["fuzzy_matches"] += 1
                self.enrich_institution(matched_inst, wikidata_inst, confidence)
            else:
                self.stats["no_matches"] += 1

        # Write enriched dataset
        print(f"\nšŸ’¾ Writing enriched dataset to {self.output_file}...")

        # Single timestamp so both header lines agree.
        now = datetime.now(timezone.utc)
        header = f"""---
# Latin American GLAM Institutions - Wikidata Enriched
# Generated: {now.isoformat()}
#
# Enrichment run: {now.strftime('%Y-%m-%d %H:%M:%S UTC')}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']}
# Matched institutions: {self.stats['matched_institutions']}
# New Wikidata IDs: {self.stats['new_wikidata_ids']}
# New ISIL codes: {self.stats['new_isil_codes']}
# New VIAF IDs: {self.stats['new_viaf_ids']}
"""

        with open(self.output_file, "w", encoding="utf-8") as f:
            f.write(header)
            yaml.dump(
                institutions,
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=120,
            )

        # Print final statistics
        self._print_report()

    def _print_report(self) -> None:
        """Print the enrichment report to stdout."""
        total = self.stats["total_institutions"]
        matched = self.stats["matched_institutions"]
        # Guard against an empty dataset (ZeroDivisionError otherwise).
        matched_pct = (matched / total * 100) if total else 0.0

        print("\n" + "=" * 70)
        print("šŸ“Š WIKIDATA ENRICHMENT REPORT")
        print("=" * 70)
        print("\nšŸ“š Dataset Statistics:")
        print(f"   Total institutions in dataset: {total}")
        print(f"   Wikidata queries executed: {self.stats['wikidata_queries']}")
        print(f"   Wikidata institutions found: {self.stats['wikidata_results']}")
        print("\nšŸ”— Matching Results:")
        print(f"   Successfully matched: {matched} ({matched_pct:.1f}%)")
        print(f"   Fuzzy matches (< 95% confidence): {self.stats['fuzzy_matches']}")
        print(f"   No matches: {self.stats['no_matches']}")
        print("\n✨ New Identifiers Added:")
        print(f"   Wikidata IDs: {self.stats['new_wikidata_ids']}")
        print(f"   ISIL codes: {self.stats['new_isil_codes']}")
        print(f"   VIAF IDs: {self.stats['new_viaf_ids']}")
        print("\nšŸ’” Next Steps:")
        if self.stats["new_isil_codes"] > 0:
            print(f"   āœ… Found {self.stats['new_isil_codes']} ISIL codes from Wikidata!")
        else:
            print("   āš ļø  No ISIL codes found in Wikidata")
            print("   → Proceed with national library outreach strategy")
        if self.stats["fuzzy_matches"] > 0:
            print(f"   āš ļø  Review {self.stats['fuzzy_matches']} fuzzy matches manually")
        print("\n" + "=" * 70 + "\n")


def main() -> None:
    """Main entry point: resolve paths, run the enricher, report errors."""
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "latin_american_institutions.yaml"
    output_file = (
        base_dir / "data" / "instances" / "latin_american_institutions_enriched.yaml"
    )

    if not input_file.exists():
        print(f"āŒ Error: Input file not found: {input_file}")
        sys.exit(1)

    enricher = WikidataEnricher(input_file, output_file)
    try:
        enricher.run()
        print("āœ… Enrichment complete!")
    except KeyboardInterrupt:
        print("\n\nāš ļø  Enrichment interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nāŒ Error during enrichment: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()