#!/usr/bin/env python3
"""
Global Wikidata SPARQL Enrichment for Heritage Institutions

This script enriches heritage institutions worldwide by querying Wikidata's
SPARQL endpoint and performing fuzzy name matching to find real Q-numbers.

🚨 CRITICAL POLICY: REAL IDENTIFIERS ONLY 🚨
This script NEVER generates synthetic Q-numbers. If no Wikidata match is found,
institutions remain without Q-numbers and are flagged for manual enrichment.

Strategy:
1. Process institutions by country (configurable priority order)
2. Query Wikidata for museums/archives/libraries in each country using SPARQL
3. Fuzzy match institution names (threshold > 0.85)
4. Cross-reference ISIL/VIAF identifiers for high-confidence matches
5. Update GHCIDs ONLY when collision resolution requires Q-number
6. Track provenance with match confidence scores

Priority Countries (configurable):
- Netherlands (NL, Q55): Highest data quality, 1,351 institutions
- Chile (CL, Q298): Good name quality, 28.9% current coverage
- Belgium (BE, Q31): ~500 institutions
- Italy (IT, Q38): ~400 institutions
- Denmark (DK, Q35): ~300 institutions

Usage:
    python enrich_institutions_wikidata_sparql.py --countries NL CL BE --threshold 0.85 --dry-run
    python enrich_institutions_wikidata_sparql.py --all-countries --skip-existing
"""
import argparse
import re
import sys
import time
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Optional

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

# Q-numbers at or above this value are project-internal synthetic placeholders,
# never real Wikidata entities. They must never be written out as enrichment.
SYNTHETIC_QID_MIN = 90000000

# Country configurations (Wikidata QIDs)
# Prioritized by data quality, institution count, and expected match rate
COUNTRY_CONFIGS = {
    # Priority 1: High data quality, large datasets
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1},
    # Priority 2: Medium datasets, good coverage potential
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 2},
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2},
    # Priority 3: Latin America (already partially enriched)
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3},
    # Priority 4: Asian countries (language barriers)
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4},
    # Priority 5: African/Middle Eastern countries (fewer Wikidata entries)
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5},
    # Add more countries as needed
}


def normalize_name(name: str) -> str:
    """
    Normalize institution name for fuzzy matching.

    Removes common prefixes/suffixes in multiple languages to improve matching.
    """
    # Lowercase
    name = name.lower()

    # Remove common prefixes (multilingual)
    prefixes = [
        # Dutch
        r'^(het |de |museum |archief |bibliotheek |stichting |nationaal |provinciaal |gemeentelijk |regionaal )',
        # English
        r'^(the |museum |archive |library |foundation |national |provincial |municipal |regional )',
        # Spanish/Portuguese
        r'^(el |la |los |las |museo |archivo |biblioteca |fundación |fundação |nacional |provincial |municipal |regional )',
        # French
        r'^(le |la |les |musée |archives |bibliothèque |fondation |national |provincial |municipal |régional )',
        # German
        r'^(das |die |der |museum |archiv |bibliothek |stiftung |national |provinziell |kommunal |regional )',
        # Italian
        r'^(il |lo |la |museo |archivio |biblioteca |fondazione |nazionale |provinciale |comunale |regionale )',
    ]
    for prefix_pattern in prefixes:
        name = re.sub(prefix_pattern, '', name, flags=re.IGNORECASE)

    # Remove common suffixes (multilingual)
    suffixes = [
        r'\s+(museum|museu|museo|musée)$',
        r'\s+(archief|archive|archivo|arquivo|archives)$',
        r'\s+(bibliotheek|library|biblioteca|bibliothèque)$',
        r'\s+(stichting|foundation|fundación|fundação|fondation|fondazione)$',
        r'\s+(national|nacional|nationale|nationaal)$',
        r'\s+(regional|regional|régional)$',
        r'\s+(municipal|comunal|municipale)$',
    ]
    for suffix_pattern in suffixes:
        name = re.sub(suffix_pattern, '', name, flags=re.IGNORECASE)

    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)

    # Normalize whitespace
    name = ' '.join(name.split())

    return name.strip()


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names using SequenceMatcher (0-1)."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str],
    languages: str = "en"
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    🔧 OPTIMIZED VERSION: Queries each institution type separately to avoid
    expensive transitive subclass queries (wdt:P279*) that cause 504 timeouts.

    Args:
        sparql: Configured SPARQLWrapper instance
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        institution_types: List of Wikidata QIDs for institution types:
            Q33506 - museum
            Q7075 - library
            Q166118 - archive
            Q2668072 - art gallery
            Q5282129 - cultural center
        languages: Comma-separated language codes for labels (e.g., "nl,en")

    Returns:
        Dictionary mapping Wikidata QIDs to institution metadata
    """
    # Query each type separately to avoid timeout
    all_results = {}

    for inst_type_qid in institution_types:
        print(f"  - Querying {inst_type_qid}...", end="", flush=True)

        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?itemAltLabel
                        ?isil ?viaf ?coords ?website ?inception ?instType ?instTypeLabel
        WHERE {{
          # Direct instance-of match (no expensive transitive subclass)
          ?item wdt:P31/wdt:P279? wd:{inst_type_qid} .  # instance of (or subclass of) type
          ?item wdt:P17 wd:{country_qid} .              # country

          # Capture the specific type
          ?item wdt:P31 ?instType .

          # Optional identifiers and metadata
          OPTIONAL {{ ?item wdt:P791 ?isil . }}       # ISIL code
          OPTIONAL {{ ?item wdt:P214 ?viaf . }}       # VIAF ID
          OPTIONAL {{ ?item wdt:P625 ?coords . }}     # Coordinates
          OPTIONAL {{ ?item wdt:P856 ?website . }}    # Official website
          OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

          SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "{languages}" .
            ?item rdfs:label ?itemLabel .
            ?item schema:description ?itemDescription .
            ?item skos:altLabel ?itemAltLabel .
            ?instType rdfs:label ?instTypeLabel .
          }}
        }}
        LIMIT 1000
        """

        sparql.setQuery(query)

        try:
            raw_results = sparql.query().convert()
            bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

            # Merge results
            type_results = _parse_sparql_bindings(bindings)
            all_results.update(type_results)
            print(f" {len(type_results)} found")
        except Exception as e:
            # Best-effort: a failed type query should not abort the remaining types.
            print(f" ❌ Error: {e}")
            continue

    return all_results


def _parse_sparql_bindings(bindings: list[dict]) -> dict[str, dict[str, Any]]:
    """
    Helper function to parse SPARQL query bindings into institution metadata.

    Returns:
        Dictionary mapping Wikidata QIDs to institution metadata
    """
    results = {}

    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            continue

        # Check if it's a REAL Wikidata Q-number (not synthetic)
        try:
            qid_num = int(qid[1:])
            if qid_num >= SYNTHETIC_QID_MIN:
                # Synthetic Q-number range - SKIP
                continue
        except ValueError:
            continue

        # Initialize or update result
        if qid not in results:
            results[qid] = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("instTypeLabel", {}).get("value", ""),
                "alternative_names": [],
                "identifiers": {}
            }

        # Collect alternative names (multilingual labels)
        alt_label = binding.get("itemAltLabel", {}).get("value", "")
        if alt_label and alt_label not in results[qid]["alternative_names"]:
            results[qid]["alternative_names"].append(alt_label)

        # Add identifiers
        if "isil" in binding:
            results[qid]["identifiers"]["ISIL"] = binding["isil"]["value"]
        if "viaf" in binding:
            results[qid]["identifiers"]["VIAF"] = binding["viaf"]["value"]
        if "website" in binding:
            results[qid]["identifiers"]["Website"] = binding["website"]["value"]
        if "inception" in binding:
            # Keep only the date part of the xsd:dateTime literal
            results[qid]["founding_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding:
            # WKT literal, e.g. "Point(lon lat)" — note longitude comes first
            coords_str = binding["coords"]["value"]
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                results[qid]["latitude"] = float(lat)
                results[qid]["longitude"] = float(lon)

    return results


def has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Check if institution already has a REAL (non-synthetic) Wikidata ID."""
    # `identifiers` may be absent, None, or a list — `or []` covers all three.
    for id_obj in inst.get("identifiers") or []:
        if not isinstance(id_obj, dict):
            continue
        if id_obj.get("identifier_scheme") == "Wikidata":
            qid = id_obj.get("identifier_value", "")
            if qid.startswith("Q"):
                try:
                    qid_num = int(qid[1:])
                    if qid_num < SYNTHETIC_QID_MIN:
                        return True  # Real Wikidata ID
                except ValueError:
                    pass
    return False


def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """
    Check if institution types are compatible (avoid museum/archive/library mismatches).

    Uses both the institution's formal type and name keywords to validate compatibility.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()
    formal_type = inst_type.upper()

    # Define type keywords (multilingual)
    museum_keywords = ['museum', 'museo', 'museu', 'musée', 'muzeum', 'muzeu']
    archive_keywords = ['archief', 'archive', 'archivo', 'arquivo', 'archiv', 'arkiv']
    library_keywords = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque', 'bibliothek', 'bibliotek']
    gallery_keywords = ['gallery', 'galerie', 'galería', 'galleria', 'kunsthal', 'kunsthalle']

    # Check if institution name contains type keyword
    inst_is_museum = any(kw in inst_lower for kw in museum_keywords)
    inst_is_archive = any(kw in inst_lower for kw in archive_keywords)
    inst_is_library = any(kw in inst_lower for kw in library_keywords)
    inst_is_gallery = any(kw in inst_lower for kw in gallery_keywords)

    # Check if Wikidata type contains type keyword
    wd_is_museum = any(kw in wd_lower for kw in museum_keywords)
    wd_is_archive = any(kw in wd_lower for kw in archive_keywords)
    wd_is_library = any(kw in wd_lower for kw in library_keywords)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_keywords)

    # Check formal institution type
    formal_is_museum = formal_type in ('MUSEUM', 'GALLERY', 'BOTANICAL_ZOO')
    formal_is_archive = formal_type == 'ARCHIVE'
    formal_is_library = formal_type == 'LIBRARY'

    # If Wikidata type is empty, allow match (type will be determined by name/formal type)
    if not wd_type or not wd_lower.strip():
        return True

    # If both have explicit types, they must match
    if (inst_is_museum or formal_is_museum) and not wd_is_museum and not wd_is_gallery:
        return False
    if (inst_is_archive or formal_is_archive) and not wd_is_archive:
        return False
    if (inst_is_library or formal_is_library) and not wd_is_library:
        return False

    return True


def isil_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for ISIL code cross-reference (highest confidence match).

    Returns:
        1.0 if ISIL codes match exactly
        None if no ISIL match
    """
    inst_isil = None
    for id_obj in inst.get("identifiers") or []:
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "ISIL":
            inst_isil = id_obj.get("identifier_value", "").strip()
            break

    wd_isil = wd_data.get("identifiers", {}).get("ISIL", "").strip()

    if inst_isil and wd_isil and inst_isil == wd_isil:
        return 1.0  # Perfect match via ISIL
    return None


def viaf_cross_reference_match(inst: dict[str, Any], wd_data: dict[str, Any]) -> Optional[float]:
    """
    Check for VIAF ID cross-reference (high confidence match).

    Returns:
        0.98 if VIAF IDs match exactly
        None if no VIAF match
    """
    inst_viaf = None
    for id_obj in inst.get("identifiers") or []:
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "VIAF":
            inst_viaf = id_obj.get("identifier_value", "").strip()
            break

    wd_viaf = wd_data.get("identifiers", {}).get("VIAF", "").strip()

    if inst_viaf and wd_viaf and inst_viaf == wd_viaf:
        return 0.98  # Very high confidence via VIAF
    return None


def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any], str]]:
    """
    Fuzzy match institutions with Wikidata results using multiple strategies.

    Matching strategies (in priority order):
    1. ISIL code cross-reference (confidence: 1.0)
    2. VIAF ID cross-reference (confidence: 0.98)
    3. Fuzzy name matching (confidence: similarity score)

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data, match_method)
    """
    matches = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")

        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if has_real_wikidata_id(inst):
            continue

        # Find best match using multiple strategies
        best_score = 0.0
        best_qid = None
        best_data = None
        best_method = "fuzzy_name_match"

        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")

            if not wd_name:
                continue

            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue

            # Strategy 1: ISIL cross-reference (highest confidence)
            isil_score = isil_cross_reference_match(inst, wd_data)
            if isil_score:
                best_score = isil_score
                best_qid = qid
                best_data = wd_data
                best_method = "isil_cross_reference"
                break  # Perfect match, no need to continue

            # Strategy 2: VIAF cross-reference (very high confidence)
            viaf_score = viaf_cross_reference_match(inst, wd_data)
            if viaf_score and viaf_score > best_score:
                best_score = viaf_score
                best_qid = qid
                best_data = wd_data
                best_method = "viaf_cross_reference"

            # Strategy 3: Fuzzy name matching (check primary name and alternatives)
            names_to_check = [wd_name] + wd_data.get("alternative_names", [])
            for wd_name_variant in names_to_check:
                score = similarity_score(inst_name, wd_name_variant)
                if score > best_score:
                    best_score = score
                    best_qid = qid
                    best_data = wd_data
                    best_method = "fuzzy_name_match"

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data, best_method))

    return matches


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any],
                       match_method: str, confidence: float) -> bool:
    """
    Enrich an institution with Wikidata data.

    🚨 CRITICAL: This function ONLY adds REAL Wikidata Q-numbers.
    It NEVER generates synthetic Q-numbers.

    Returns:
        True if institution was enriched
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Check if Q-number is REAL (not synthetic)
    qid = wd_data["qid"]
    try:
        qid_num = int(qid[1:])
        if qid_num >= SYNTHETIC_QID_MIN:
            print(f"⚠️  WARNING: Attempted to add synthetic Q-number {qid} - REJECTED")
            return False
    except ValueError:
        print(f"⚠️  WARNING: Invalid Q-number format {qid} - REJECTED")
        return False

    # Add or replace Wikidata ID
    wikidata_idx = None
    for i, id_obj in enumerate(identifiers_list):
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "Wikidata":
            wikidata_idx = i
            break

    if wikidata_idx is not None:
        # Replace existing (possibly synthetic) Wikidata ID
        old_value = identifiers_list[wikidata_idx].get("identifier_value", "")
        if old_value != qid:
            identifiers_list[wikidata_idx] = {
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
            }
            enriched = True
    else:
        # Add new Wikidata ID
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
        })
        enriched = True

    # Add other identifiers from Wikidata
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            elif scheme == "ISIL":
                # Don't override existing ISIL, but add if missing
                pass
            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date if missing
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict):
                if first_loc.get("latitude") is None or first_loc.get("longitude") is None:
                    first_loc["latitude"] = wd_data["latitude"]
                    first_loc["longitude"] = wd_data["longitude"]
                    enriched = True

    # Update provenance
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            match_type_desc = {
                "isil_cross_reference": "Wikidata enrichment (ISIL cross-reference)",
                "viaf_cross_reference": "Wikidata enrichment (VIAF cross-reference)",
                "fuzzy_name_match": f"Wikidata enrichment (fuzzy name match, confidence: {confidence:.3f})"
            }.get(match_method, "Wikidata enrichment")

            if existing_method:
                prov["extraction_method"] = f"{existing_method} + {match_type_desc}"
            else:
                prov["extraction_method"] = match_type_desc

            # Update extraction date
            prov["enrichment_date"] = datetime.now(timezone.utc).isoformat()

    return enriched


def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper,
    threshold: float = 0.85,
    dry_run: bool = False
) -> tuple[int, int, dict[str, int]]:
    """
    Process a single country's institutions.

    Returns:
        (institutions_without_wikidata, enriched_count, match_methods_stats)
    """
    country_info = COUNTRY_CONFIGS.get(country_code)
    if not country_info:
        print(f"\n⚠️  Unknown country code: {country_code}")
        return 0, 0, {}

    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country.
    # `or [{}]` guards against both a missing key and an explicitly empty list
    # (a plain .get default would not cover `locations: []`).
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if (inst.get('locations') or [{}])[0].get('country') == country_code
    ]

    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    # Count those without real Wikidata
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not has_real_wikidata_id(institutions[idx])
    ]

    current_coverage = (len(country_institutions_idx) - len(without_wikidata)) / len(country_institutions_idx) * 100 if country_institutions_idx else 0

    print(f"✅ With Wikidata: {len(country_institutions_idx) - len(without_wikidata):,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")

    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0, {}

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print("   (This may take 30-90 seconds)\n")

    # Query for museums, libraries, archives, galleries
    institution_types = ["Q33506", "Q7075", "Q166118", "Q2668072"]
    languages = country_info.get('languages', 'en')

    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types, languages)

    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")

    if not wikidata_results:
        print("⚠️  No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0, {}

    # Fuzzy match
    print(f"🔗 Matching institutions (threshold: {threshold:.2f})...\n")

    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=threshold)

    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    # Track match methods
    match_methods_stats = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    # Show sample matches
    if matches:
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 10)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data, method) in enumerate(matches[:10]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Method: {method.upper()}, Confidence: {score:.3f}")
            print(f"   Local: {inst.get('name')} ({(inst.get('locations') or [{}])[0].get('city', 'Unknown')})")
            print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f"   Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f"   ISIL: {wd_data['identifiers']['ISIL']}")
            if "VIAF" in wd_data.get("identifiers", {}):
                print(f"   VIAF: {wd_data['identifiers']['VIAF']}")
        print(f"\n{'='*80}\n")

        if dry_run:
            print("🔍 DRY RUN: Would enrich the following institutions:\n")
            for local_idx, qid, score, wd_data, method in matches:
                inst = country_insts[local_idx]
                print(f"  - {inst.get('name')} → {qid} (method: {method}, confidence: {score:.3f})")
            # NOTE: there is no --no-dry-run flag; simply omit --dry-run to apply.
            print(f"\n✅ Dry run complete. Re-run without --dry-run to apply changes.\n")
            return len(without_wikidata), 0, {}

        # Apply all matches
        print("✅ Applying all matches...\n")

        enriched_count = 0
        for local_idx, qid, score, wd_data, method in matches:
            # Map the index within `country_insts` back to the global dataset index
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data, method, score):
                enriched_count += 1
                match_methods_stats[method] += 1

        new_coverage = (len(country_institutions_idx) - len(without_wikidata) + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0

        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
        print(f"\n📊 Match methods:")
        for method, count in match_methods_stats.items():
            if count > 0:
                print(f"   {method}: {count:,}")
        print()

        return len(without_wikidata), enriched_count, match_methods_stats
    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0, {}


def main():
    parser = argparse.ArgumentParser(
        description="Enrich heritage institutions with real Wikidata Q-numbers using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dutch and Chilean institutions (priority 1)
  %(prog)s --countries NL CL --threshold 0.85

  # All priority 1 and 2 countries
  %(prog)s --priority 1 2 --threshold 0.85

  # Dry run (preview matches without applying)
  %(prog)s --countries NL --dry-run

  # All countries (not recommended - use priority groups)
  %(prog)s --all-countries --threshold 0.85
"""
    )
    parser.add_argument(
        '--countries', nargs='+', metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )
    parser.add_argument(
        '--priority', nargs='+', type=int, metavar='N',
        help='Process countries by priority level (1-5)'
    )
    parser.add_argument(
        '--all-countries', action='store_true',
        help='Process all configured countries (use with caution)'
    )
    parser.add_argument(
        '--threshold', type=float, default=0.85,
        help='Fuzzy match threshold (0.0-1.0, default: 0.85)'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Preview matches without applying changes'
    )
    parser.add_argument(
        '--input', type=Path,
        help='Input YAML file (default: data/instances/global/global_heritage_institutions_wikidata_enriched.yaml)'
    )
    parser.add_argument(
        '--output', type=Path,
        help='Output YAML file (default: overwrites input or creates new file with _sparql_enriched suffix)'
    )

    args = parser.parse_args()

    # Determine countries to process
    countries_to_process = []
    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f"   Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # File paths
    base_dir = Path(__file__).parent.parent
    if args.input:
        input_file = args.input
    else:
        input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    if args.output:
        output_file = args.output
    elif args.dry_run:
        output_file = None  # No output for dry run
    else:
        output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA SPARQL ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading dataset: {input_file.name}\n")

    start_time = time.time()

    # Load dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Wikidata Enrichment)")

    # Process countries
    print(f"\n🌍 Processing {len(countries_to_process)} countries:")
    country_names = [COUNTRY_CONFIGS[c]['name'] for c in countries_to_process]
    print(f"   {', '.join(country_names)}\n")

    if args.dry_run:
        print("🔍 DRY RUN MODE: No changes will be saved\n")

    total_without_wikidata = 0
    total_enriched = 0
    total_match_methods = {
        "isil_cross_reference": 0,
        "viaf_cross_reference": 0,
        "fuzzy_name_match": 0
    }

    for i, country_code in enumerate(countries_to_process):
        without, enriched, methods = process_country(
            institutions,
            country_code,
            sparql,
            threshold=args.threshold,
            dry_run=args.dry_run
        )
        total_without_wikidata += without
        total_enriched += enriched
        for method, count in methods.items():
            total_match_methods[method] += count

        # Rate limiting - be nice to Wikidata
        if i < len(countries_to_process) - 1:
            print("⏸️  Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output (unless dry run)
    if not args.dry_run and total_enriched > 0 and output_file:
        print("="*80)
        print("💾 WRITING ENRICHED DATASET")
        print("="*80 + "\n")

        header = f"""---
# Global Heritage Institutions - SPARQL Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(countries_to_process)}
# New Wikidata matches: {total_enriched:,}
# Match threshold: {args.threshold:.2f}

"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

        print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Total institutions enriched: {total_enriched:,}")
    print(f"   Institutions still without Wikidata: {total_without_wikidata - total_enriched:,}")

    if total_enriched > 0:
        print(f"\n📊 Enrichment methods:")
        for method, count in total_match_methods.items():
            if count > 0:
                percentage = (count / total_enriched * 100) if total_enriched > 0 else 0
                print(f"   {method}: {count:,} ({percentage:.1f}%)")

    print(f"\n⏱️  Total processing time: {(time.time()-start_time)/60:.1f} minutes")

    if args.dry_run:
        # NOTE: there is no --no-dry-run flag; simply omit --dry-run to apply.
        print("\n🔍 This was a dry run. Re-run without --dry-run to apply changes.")

    print("="*80 + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())