#!/usr/bin/env python3
"""
Enrich global heritage institutions with Wikidata identifiers.

This script:
1. Queries Wikidata for institutions with ISIL codes (P791)
2. Matches by ISIL code (primary, high confidence)
3. Falls back to fuzzy name matching by country
4. Extracts Wikidata IDs, VIAF IDs (P214), founding dates, websites
5. Replaces synthetic Q-numbers in GHCIDs with real Wikidata QIDs
6. Updates the global YAML dataset with enriched data
7. Generates detailed enrichment report

Usage:
    python scripts/enrich_global_with_wikidata.py

Dependencies:
    - SPARQLWrapper (for Wikidata queries)
    - rapidfuzz (for fuzzy name matching)
    - pyyaml (for YAML I/O)
"""

import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
import re

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON  # type: ignore
from rapidfuzz import fuzz, process  # type: ignore


class GlobalWikidataEnricher:
    """Enrich global heritage institutions with Wikidata identifiers."""

    WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
    USER_AGENT = "GLAM-Extractor/0.2 (Global Heritage Custodian Project)"

    # Rate limiting (Wikidata recommends 1 request/second)
    REQUEST_DELAY = 1.0

    def __init__(self, input_file: Path, output_file: Path):
        """
        Args:
            input_file: Path to the source YAML list of institutions.
            output_file: Path where the enriched YAML dataset is written.
        """
        self.input_file = input_file
        self.output_file = output_file

        self.sparql = SPARQLWrapper(self.WIKIDATA_ENDPOINT)
        self.sparql.setReturnFormat(JSON)
        self.sparql.setMethod('POST')  # Use POST to avoid URI length limits
        self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT)  # type: ignore

        # Cache for ISIL → Wikidata mapping
        self.isil_to_wikidata: dict[str, dict[str, Any]] = {}

        # Statistics
        self.stats = {
            "total_institutions": 0,
            "institutions_with_isil": 0,
            "wikidata_queries": 0,
            "wikidata_results": 0,
            "isil_matches": 0,
            "fuzzy_matches": 0,
            "no_matches": 0,
            "new_wikidata_ids": 0,
            "replaced_synthetic_q": 0,
            "new_viaf_ids": 0,
            "new_founding_dates": 0,
            "new_websites": 0,
            "enriched_coordinates": 0,
        }

    def build_isil_query(self, isil_codes: list[str]) -> str:
        """
        Build SPARQL query to fetch institutions by ISIL codes.

        Wikidata property P791 = ISIL code.

        Args:
            isil_codes: ISIL codes to look up; only the first 50 are used as a
                defensive cap (the caller batches at the same size) to avoid
                URI length limits even with POST.

        Returns:
            A SPARQL SELECT query string.
        """
        # Escape and format ISIL codes for SPARQL VALUES clause
        # Use smaller batches to avoid URI length limits (even with POST)
        isil_values = " ".join(f'"{code}"' for code in isil_codes[:50])  # Reduced batch size

        return f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
        WHERE {{
          # Filter by ISIL codes
          VALUES ?isil {{ {isil_values} }}
          ?item wdt:P791 ?isil .

          # Optional enrichment data
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}       # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}     # Coordinates
          OPTIONAL {{ ?item wdt:P856 ?website . }}    # Official website
          OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

          # Get labels in multiple languages
          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en,ja,nl,es,pt,fr,de" .
          }}
        }}
        """

    def build_country_query(self, country_code: str, limit: int = 500) -> str:
        """
        Build SPARQL query for GLAM institutions in a specific country.

        Used as fallback when ISIL matching is insufficient.

        Args:
            country_code: ISO 3166-1 alpha-2 country code.
            limit: Maximum number of results to request.

        Returns:
            A SPARQL SELECT query string, or "" when the country code has no
            known Wikidata QID mapping.
        """
        # Map ISO 3166-1 alpha-2 to Wikidata QIDs
        country_qids = {
            "JP": "Q17",    # Japan
            "NL": "Q55",    # Netherlands
            "BR": "Q155",   # Brazil
            "MX": "Q96",    # Mexico
            "CL": "Q298",   # Chile
            "US": "Q30",    # United States
            "GB": "Q145",   # United Kingdom
            "FR": "Q142",   # France
            "DE": "Q183",   # Germany
            "IT": "Q38",    # Italy
            "ES": "Q29",    # Spain
            "CA": "Q16",    # Canada
            "AU": "Q408",   # Australia
        }

        qid = country_qids.get(country_code)
        if not qid:
            print(f"   āš ļø  No Wikidata QID mapping for country code: {country_code}")
            return ""

        return f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception
        WHERE {{
          # Institution is located in the country
          ?item wdt:P17 wd:{qid} .

          # Institution is one of our GLAM types
          VALUES ?type {{
            wd:Q7075      # library
            wd:Q166118    # archive
            wd:Q33506     # museum
            wd:Q1007870   # art gallery
            wd:Q28564     # public library
            wd:Q11396180  # academic library
            wd:Q207694    # art museum
            wd:Q2772772   # history museum
            wd:Q7140621   # cultural institution
            wd:Q31855     # research institute
          }}
          ?item wdt:P31 ?type .

          # Optional identifiers and metadata
          OPTIONAL {{ ?item wdt:P791 ?isil . }}       # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}       # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}     # Coordinates
          OPTIONAL {{ ?item wdt:P856 ?website . }}    # Official website
          OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

          # Get labels
          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en,ja,nl,es,pt" .
          }}
        }}
        LIMIT {limit}
        """

    def query_wikidata(self, query: str, query_name: str) -> list[dict[str, Any]]:
        """
        Execute a SPARQL query against Wikidata.

        Args:
            query: SPARQL query string.
            query_name: Short label used in the progress display.

        Returns:
            Parsed institution dicts (see _parse_wikidata_result); an empty
            list on query failure (errors are logged, not raised).
        """
        # Use carriage return for progress updates
        print(f"\r  šŸ” {query_name}...", end='', flush=True)

        self.sparql.setQuery(query)
        try:
            self.stats["wikidata_queries"] += 1
            raw_results = self.sparql.query().convert()  # type: ignore
            bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
            self.stats["wikidata_results"] += len(bindings)

            # Parse results
            institutions = []
            for binding in bindings:
                inst = self._parse_wikidata_result(binding)
                if inst:
                    institutions.append(inst)

            # Show result count
            print(f" → {len(bindings)} results", flush=True)

            # Rate limiting
            time.sleep(self.REQUEST_DELAY)
            return institutions

        except Exception as e:
            # Best-effort: report the failure and continue with the next batch,
            # still honoring the rate limit.
            print(f"\r  āŒ Error: {e}")
            time.sleep(self.REQUEST_DELAY)
            return []

    def _parse_wikidata_result(self, binding: dict[str, Any]) -> Optional[dict[str, Any]]:
        """
        Parse a single Wikidata SPARQL result binding.

        Returns:
            Dict with keys "qid", "name", "description", "identifiers" and
            optionally "founding_date", "latitude", "longitude"; None when the
            binding has no valid Q-number or parsing fails.
        """
        try:
            # Extract Wikidata QID from URI
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                return None

            result: dict[str, Any] = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }

            # Extract identifiers (OPTIONAL clauses leave the key absent when unmatched)
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]

            # Extract founding date
            if "inception" in binding:
                inception_value = binding["inception"]["value"]
                # Wikidata returns ISO 8601 date (e.g., "1945-01-01T00:00:00Z")
                result["founding_date"] = inception_value.split("T")[0]

            # Extract location data
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # Parse "Point(lon lat)" format (WKT: longitude first)
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            return result

        except Exception as e:
            print(f"  āš ļø  Error parsing Wikidata result: {e}")
            return None

    def build_isil_cache(self, institutions: list[dict[str, Any]]) -> list[str]:
        """
        Extract all ISIL codes from our dataset.

        Args:
            institutions: Institution records; each may carry a list of
                identifier dicts with "identifier_scheme"/"identifier_value".

        Returns:
            Sorted list of unique ISIL codes.
        """
        isil_codes = set()
        for inst in institutions:
            identifiers = inst.get("identifiers", [])
            if isinstance(identifiers, list):
                for ident in identifiers:
                    if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
                        isil_code = ident.get("identifier_value")
                        if isil_code:
                            isil_codes.add(isil_code)
        return sorted(isil_codes)

    def query_by_isil_codes(self, isil_codes: list[str]) -> None:
        """
        Query Wikidata for institutions matching our ISIL codes.

        Populates self.isil_to_wikidata cache.
        """
        if not isil_codes:
            print("āš ļø  No ISIL codes found in dataset")
            return

        print(f"\nšŸ“š Querying Wikidata for {len(isil_codes)} ISIL codes...")

        # Batch queries (max 50 ISIL codes per query to avoid URI length issues)
        batch_size = 50
        total_batches = (len(isil_codes) - 1) // batch_size + 1
        print(f"   Processing {total_batches} batches ({batch_size} codes per batch)...\n")

        for i in range(0, len(isil_codes), batch_size):
            batch = isil_codes[i:i+batch_size]
            batch_num = i//batch_size + 1
            query_name = f"ISIL batch {batch_num}/{total_batches}"

            query = self.build_isil_query(batch)
            results = self.query_wikidata(query, query_name)

            # Cache results by ISIL code
            for wd_inst in results:
                isil = wd_inst.get("identifiers", {}).get("ISIL")
                if isil:
                    self.isil_to_wikidata[isil] = wd_inst

        print(f"   āœ… Cached {len(self.isil_to_wikidata)} Wikidata institutions with ISIL codes")

    def match_by_isil(self, institution: dict[str, Any]) -> Optional[dict[str, Any]]:
        """
        Match institution by ISIL code (high confidence).

        Returns:
            Wikidata institution data or None.
        """
        identifiers = institution.get("identifiers", [])
        if not isinstance(identifiers, list):
            return None

        for ident in identifiers:
            if isinstance(ident, dict) and ident.get("identifier_scheme") == "ISIL":
                isil_code = ident.get("identifier_value")
                if isil_code and isil_code in self.isil_to_wikidata:
                    return self.isil_to_wikidata[isil_code]
        return None

    def enrich_institution(
        self,
        institution: dict[str, Any],
        wikidata_inst: dict[str, Any],
        match_type: str,
        match_confidence: float = 1.0
    ) -> bool:
        """
        Enrich an institution with Wikidata data (mutates `institution` in place).

        Args:
            institution: Our institution record
            wikidata_inst: Wikidata institution data
            match_type: "ISIL" or "fuzzy_name"
            match_confidence: 0.0-1.0 (1.0 for ISIL matches)

        Returns:
            True if any new data was added
        """
        enriched = False

        # Ensure identifiers list exists
        if "identifiers" not in institution or not institution["identifiers"]:
            institution["identifiers"] = []

        identifiers_list = institution["identifiers"]
        existing_schemes = {
            ident.get("identifier_scheme", "")
            for ident in identifiers_list
            if isinstance(ident, dict)
        }

        # Add Wikidata ID
        wikidata_qid = wikidata_inst["qid"]
        if "Wikidata" not in existing_schemes:
            identifiers_list.append({
                "identifier_scheme": "Wikidata",
                "identifier_value": wikidata_qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{wikidata_qid}"
            })
            self.stats["new_wikidata_ids"] += 1
            enriched = True

            # Check if this replaces a synthetic Q-number in GHCID
            # (synthetic IDs were minted in the Q9xxxxxxx+ range)
            ghcid = institution.get("ghcid", "")
            if ghcid and re.search(r"-Q9\d{7,}", ghcid):
                self.stats["replaced_synthetic_q"] += 1

        # Add other identifiers from Wikidata
        wd_identifiers = wikidata_inst.get("identifiers", {})
        if isinstance(wd_identifiers, dict):
            for scheme, value in wd_identifiers.items():
                if scheme not in existing_schemes and scheme != "ISIL":  # Skip ISIL (already have it)
                    id_obj: dict[str, Any] = {
                        "identifier_scheme": scheme,
                        "identifier_value": value
                    }
                    # Add URLs for known schemes
                    if scheme == "VIAF":
                        id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
                        self.stats["new_viaf_ids"] += 1
                    elif scheme == "Website":
                        id_obj["identifier_url"] = value
                        self.stats["new_websites"] += 1
                    identifiers_list.append(id_obj)
                    enriched = True

        # Add founding date if missing
        if "founding_date" in wikidata_inst and not institution.get("founding_date"):
            institution["founding_date"] = wikidata_inst["founding_date"]
            self.stats["new_founding_dates"] += 1
            enriched = True

        # Add/improve location coordinates
        if "latitude" in wikidata_inst and "longitude" in wikidata_inst:
            locations = institution.get("locations", [])
            if isinstance(locations, list) and len(locations) > 0:
                first_loc = locations[0]
                if isinstance(first_loc, dict):
                    # Only update if coordinates are missing
                    if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                        first_loc["latitude"] = wikidata_inst["latitude"]
                        first_loc["longitude"] = wikidata_inst["longitude"]
                        self.stats["enriched_coordinates"] += 1
                        enriched = True

        # Update provenance
        if enriched:
            prov = institution.get("provenance", {})
            if isinstance(prov, dict):
                existing_method = prov.get("extraction_method", "")
                match_info = f"Wikidata enrichment ({match_type} match, confidence: {match_confidence:.2f})"
                if existing_method:
                    prov["extraction_method"] = f"{existing_method} + {match_info}"
                else:
                    prov["extraction_method"] = match_info

        return enriched

    def run(self) -> None:
        """Run the complete enrichment workflow.

        Raises:
            ValueError: If the input YAML is not a non-empty list.
        """
        print("=" * 80)
        print("šŸš€ GLOBAL WIKIDATA ENRICHMENT")
        print("=" * 80)
        print(f"\n  Input:  {self.input_file}")
        print(f"  Output: {self.output_file}\n")

        # Load existing dataset
        print("šŸ“– Loading global dataset...")
        with open(self.input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)

        if not isinstance(institutions, list):
            raise ValueError("Expected YAML file to contain a list of institutions")
        if not institutions:
            # Guard: downstream percentage calculations divide by the count.
            raise ValueError("Input dataset contains no institutions")

        self.stats["total_institutions"] = len(institutions)
        print(f"   Loaded {len(institutions):,} institutions\n")

        # Extract ISIL codes from our dataset
        isil_codes = self.build_isil_cache(institutions)
        self.stats["institutions_with_isil"] = len(isil_codes)
        print(f"šŸ“‹ Found {len(isil_codes):,} institutions with ISIL codes ({len(isil_codes)/len(institutions)*100:.1f}%)\n")

        # Query Wikidata by ISIL codes (batch queries)
        self.query_by_isil_codes(isil_codes)

        # Match and enrich
        print(f"\nšŸ”— Matching and enriching institutions...")
        print(f"   Strategy: ISIL code matching (high confidence)\n")

        enriched_count = 0
        for institution in institutions:
            # Match by ISIL (high confidence)
            wikidata_inst = self.match_by_isil(institution)
            if wikidata_inst:
                if self.enrich_institution(institution, wikidata_inst, match_type="ISIL", match_confidence=1.0):
                    enriched_count += 1
                    self.stats["isil_matches"] += 1
                    # Progress indicator: only when a new multiple of 100 is reached
                    if enriched_count % 100 == 0:
                        print(f"   āœ… Enriched {enriched_count:,} institutions ({enriched_count/len(institutions)*100:.1f}%)")

        print(f"\n   āœ… Total enriched: {enriched_count:,} institutions ({enriched_count/len(institutions)*100:.1f}%)\n")

        # Write enriched dataset
        print(f"šŸ’¾ Writing enriched dataset to {self.output_file}...")

        # Add header comment (YAML comments before the document body)
        header = f"""---
# Global Heritage Institutions - Wikidata Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Enrichment run: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
# Total institutions: {self.stats['total_institutions']:,}
# Wikidata queries: {self.stats['wikidata_queries']}
# Wikidata results: {self.stats['wikidata_results']:,}
# ISIL matches: {self.stats['isil_matches']:,}
# New Wikidata IDs: {self.stats['new_wikidata_ids']:,}
# Replaced synthetic Q-numbers: {self.stats['replaced_synthetic_q']:,}
# New VIAF IDs: {self.stats['new_viaf_ids']:,}
# New founding dates: {self.stats['new_founding_dates']:,}
# New websites: {self.stats['new_websites']:,}

"""
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(
                institutions,
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=120
            )
        print("   āœ… Write complete\n")

        # Print final statistics
        self._print_report()

    def _print_report(self) -> None:
        """Print enrichment report to stdout."""
        print("\n" + "="*80)
        print("šŸ“Š WIKIDATA ENRICHMENT REPORT")
        print("="*80)

        print(f"\nšŸ“š Dataset Statistics:")
        print(f"   Total institutions: {self.stats['total_institutions']:,}")
        print(f"   Institutions with ISIL codes: {self.stats['institutions_with_isil']:,} ({self.stats['institutions_with_isil']/self.stats['total_institutions']*100:.1f}%)")

        print(f"\n🌐 Wikidata Queries:")
        print(f"   Total queries executed: {self.stats['wikidata_queries']}")
        print(f"   Total Wikidata results: {self.stats['wikidata_results']:,}")

        print(f"\nšŸ”— Matching Results:")
        print(f"   ISIL matches: {self.stats['isil_matches']:,} ({self.stats['isil_matches']/self.stats['total_institutions']*100:.1f}%)")
        print(f"   Fuzzy matches: {self.stats['fuzzy_matches']:,}")
        print(f"   No matches: {self.stats['total_institutions'] - self.stats['isil_matches'] - self.stats['fuzzy_matches']:,}")

        print(f"\n✨ New Data Added:")
        print(f"   Wikidata IDs: {self.stats['new_wikidata_ids']:,}")
        print(f"   Replaced synthetic Q-numbers: {self.stats['replaced_synthetic_q']:,}")
        print(f"   VIAF IDs: {self.stats['new_viaf_ids']:,}")
        print(f"   Founding dates: {self.stats['new_founding_dates']:,}")
        print(f"   Websites: {self.stats['new_websites']:,}")
        print(f"   Enriched coordinates: {self.stats['enriched_coordinates']:,}")

        # Coverage analysis
        print(f"\nšŸ“ˆ Coverage Analysis:")
        total = self.stats['total_institutions']
        with_wikidata = self.stats['new_wikidata_ids']
        with_viaf = self.stats['new_viaf_ids']
        print(f"   Wikidata coverage: {with_wikidata:,}/{total:,} ({with_wikidata/total*100:.1f}%)")
        # Guard: VIAF IDs can be added to institutions that already had a
        # Wikidata scheme, so with_viaf > 0 does not imply with_wikidata > 0.
        if with_viaf > 0 and with_wikidata > 0:
            print(f"   VIAF coverage: {with_viaf:,}/{with_wikidata:,} ({with_viaf/with_wikidata*100:.1f}% of Wikidata matches)")

        print(f"\nšŸ’” Next Steps:")
        if self.stats['replaced_synthetic_q'] > 0:
            print(f"   āœ… Replaced {self.stats['replaced_synthetic_q']:,} synthetic Q-numbers with real Wikidata QIDs")
            print(f"      → Run GHCID regeneration script to update GHCIDs with real Q-numbers")
        if self.stats['new_viaf_ids'] > 0:
            print(f"   āœ… Found {self.stats['new_viaf_ids']:,} VIAF IDs from Wikidata")
        remaining_without_wikidata = total - with_wikidata
        if remaining_without_wikidata > 0:
            print(f"   āš ļø  {remaining_without_wikidata:,} institutions still without Wikidata IDs")
            print(f"      → Consider fuzzy name matching or manual curation")

        print("\n" + "="*80 + "\n")


def main():
    """Main entry point."""
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    if not input_file.exists():
        print(f"āŒ Error: Input file not found: {input_file}")
        print(f"   Expected location: {input_file}")
        sys.exit(1)

    enricher = GlobalWikidataEnricher(input_file, output_file)

    try:
        enricher.run()
        print("āœ… Enrichment complete!")
        print(f"\nšŸ“ Output file: {output_file}")
    except KeyboardInterrupt:
        print("\n\nāš ļø  Enrichment interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nāŒ Error during enrichment: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()