#!/usr/bin/env python3
"""
Manually enrich well-known institutions with verified Wikidata QIDs.

This script handles high-profile institutions that should be in Wikidata
but may not match via fuzzy name matching due to naming variations.
"""

import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:  # main() reports this with an actionable message
    yaml = None

# Verified Wikidata QIDs for well-known institutions
KNOWN_INSTITUTIONS = {
    # US Institutions
    "Internet Archive": "Q461",
    "HathiTrust": "Q5683317",
    "HathiTrust Digital Library": "Q5683317",
    "Library of Congress": "Q131454",
    "OCLC": "Q190927",
    "WorldCat": "Q76630151",

    # Belgian/EU Institutions
    "European Parliament": "Q8889",
    "European Commission": "Q8880",
    "Council of the European Union": "Q8896",

    # Brazilian Institutions
    "Museu Nacional": "Q1850416",  # National Museum Rio de Janeiro
    "MASP": "Q82941",  # São Paulo Museum of Art
    "Pinacoteca": "Q2095209",  # Pinacoteca do Estado de São Paulo
    "Pinacoteca do Estado": "Q2095209",
    "Pinacoteca de São Paulo": "Q2095209",

    # Add more as needed
}

# QIDs whose numeric part is at or above this bound are not counted as
# real Wikidata IDs by has_wikidata_id().
_REAL_QID_UPPER_BOUND = 100000000


def _is_real_qid(value) -> bool:
    """Return True if *value* is a 'Q<number>' string below the real-QID bound."""
    if not isinstance(value, str) or not value.startswith("Q"):
        return False
    try:
        number = int(value[1:])
    except ValueError:
        # "Q", "Qabc", ... are malformed, hence not a real ID.
        return False
    return number < _REAL_QID_UPPER_BOUND


def has_wikidata_id(institution: dict) -> bool:
    """Check if institution already has a real Wikidata ID.

    An identifier counts when its ``identifier_scheme`` is ``"Wikidata"`` and
    its ``identifier_value`` is ``"Q"`` followed by a number below
    100,000,000. Missing, null or malformed identifiers yield ``False``
    instead of raising.
    """
    return any(
        isinstance(id_obj, dict)
        and id_obj.get("identifier_scheme") == "Wikidata"
        and _is_real_qid(id_obj.get("identifier_value"))
        for id_obj in institution.get("identifiers") or []
    )


def add_wikidata_id(institution: dict, qid: str) -> bool:
    """Add Wikidata identifier to institution.

    Mutates *institution* in place: appends the identifier and records an
    entry in ``provenance.enrichment_history``. Keys that are missing or
    null are created as needed.

    Returns:
        True if the identifier was added, False if the institution already
        carries any identifier with scheme ``"Wikidata"``.
    """
    if institution.get("identifiers") is None:
        institution["identifiers"] = []
    identifiers = institution["identifiers"]

    # NOTE(review): this refuses to add when ANY Wikidata entry exists,
    # including the high-numbered ones has_wikidata_id() ignores, so such
    # institutions are never enriched. Confirm whether those should be
    # replaced instead.
    if any(
        isinstance(id_obj, dict)
        and id_obj.get("identifier_scheme") == "Wikidata"
        for id_obj in identifiers
    ):
        return False

    # Add new Wikidata ID
    identifiers.append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    })

    # Update provenance
    if institution.get("provenance") is None:
        institution["provenance"] = {}
    provenance = institution["provenance"]

    if provenance.get("enrichment_history") is None:
        provenance["enrichment_history"] = []

    provenance["enrichment_history"].append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": "Manual Wikidata QID assignment (verified)",
        "data_source": "Wikidata",
        "confidence_score": 1.0
    })

    return True


def _match_known_institution(name):
    """Return the first ``(known_name, qid)`` matching *name*, or None.

    Matching is a case-insensitive substring test in either direction.
    Blank or non-string names never match: an empty string is a substring
    of every known name and would otherwise match the first entry.
    """
    if not isinstance(name, str) or not name.strip():
        return None

    lowered = name.lower()
    # NOTE(review): substring matching in both directions is permissive
    # (e.g. a name of just "Library" matches "Library of Congress");
    # consider tightening if false positives show up.
    for known_name, qid in KNOWN_INSTITUTIONS.items():
        known_lowered = known_name.lower()
        if known_lowered in lowered or lowered in known_lowered:
            return known_name, qid
    return None


def main():
    """Enrich the dataset in place with QIDs from KNOWN_INSTITUTIONS.

    Loads the YAML dataset, adds a Wikidata identifier to every institution
    whose name matches a known institution and that has no real Wikidata ID
    yet, saves a copy of the original file as a backup, then overwrites the
    dataset. Exits with status 1 if PyYAML or the input file is missing, or
    if the file does not contain a list.
    """
    if yaml is None:
        print("❌ PyYAML is required to run this script (pip install pyyaml)")
        sys.exit(1)

    input_file = Path("data/instances/global/global_heritage_institutions_wikidata_enriched.yaml")

    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🔧 MANUAL WIKIDATA ENRICHMENT FOR KNOWN INSTITUTIONS")
    print("=" * 80)
    print(f"\n📖 Loading dataset from: {input_file.name}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # An empty or malformed file yields None or a non-list; fail clearly.
    if not isinstance(institutions, list):
        print(f"❌ Expected a list of institutions in: {input_file}")
        sys.exit(1)

    print(f"✅ Loaded {len(institutions):,} institutions\n")

    matched_institutions = []

    for institution in institutions:
        if not isinstance(institution, dict):
            continue

        name = institution.get("name", "")

        # Only the first matching known name can take effect: after it,
        # the institution either has an ID or keeps rejecting additions.
        match = _match_known_institution(name)
        if match is None:
            continue
        known_name, qid = match

        # Check if already has Wikidata
        if has_wikidata_id(institution):
            continue

        # Add Wikidata ID
        if add_wikidata_id(institution, qid):
            matched_institutions.append({
                "name": name,
                "qid": qid,
                "matched_pattern": known_name
            })
            print(f"✅ Enriched: {name}")
            print(f" → Wikidata: {qid} (matched '{known_name}')")
            print()

    enriched_count = len(matched_institutions)

    if enriched_count == 0:
        print("ℹ️ No institutions enriched (all already have Wikidata IDs)")
        return

    # Create backup: copy the ORIGINAL bytes from disk before overwriting.
    # Dumping the in-memory data here would just duplicate the new file.
    backup_file = input_file.with_suffix('.yaml.manual_enrichment_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    backup_file.write_bytes(input_file.read_bytes())

    # Write updated dataset
    print(f"💾 Writing updated dataset: {input_file.name}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("\n" + "=" * 80)
    print("✨ ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print("\nMatched institutions:")
    for match in matched_institutions:
        print(f" • {match['name']} → {match['qid']}")


if __name__ == "__main__":
    main()