#!/usr/bin/env python3
"""
Enrich Latin American institutions using fuzzy name matching in Wikidata.

This script addresses low coverage in Brazil (1%), Mexico (21%), and
Chile (29%) by querying Wikidata for heritage institutions using
name-based searches.

Strategy:
1. Find institutions without Wikidata IDs in target countries
2. Query Wikidata for museums/archives/libraries in each country
3. Fuzzy match names (normalized)
4. Apply high-confidence matches (>0.85)

Countries:
- Brazil (BR, Q155): 1% → 15-25% expected
- Mexico (MX, Q96): 21% → 35-45% expected
- Chile (CL, Q298): 29% → 40-50% expected
"""
import sys
from pathlib import Path
from typing import Any
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

# Country configurations: ISO 3166-1 alpha-2 code -> Wikidata QID + display info.
COUNTRIES = {
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱'}
}

# Q-numbers at or above this value are treated as synthetic placeholders
# produced by earlier pipeline stages, not real Wikidata entities.
_SYNTHETIC_QID_THRESHOLD = 100000000


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, strips common institutional prefixes/suffixes in Spanish,
    Portuguese, and English, removes punctuation, and collapses whitespace.
    """
    name = name.lower()
    # Remove common prefixes/suffixes (multilingual)
    # Spanish
    name = re.sub(r'^(fundación|museo|biblioteca|archivo|centro)\s+', '', name)
    name = re.sub(r'\s+(museo|biblioteca|archivo|nacional|regional|municipal)$', '', name)
    # Portuguese
    name = re.sub(r'^(fundação|museu|biblioteca|arquivo|centro)\s+', '', name)
    name = re.sub(r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$', '', name)
    # English
    name = re.sub(r'^(foundation|museum|library|archive|center|centre)\s+', '', name)
    name = re.sub(r'\s+(museum|library|archive|national|regional|municipal)$', '', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) using normalized forms."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def _has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if *inst* already carries a real (non-synthetic) Wikidata QID.

    Real QIDs are numeric values below _SYNTHETIC_QID_THRESHOLD. Malformed
    values (e.g. "Q" or "Qabc") are treated as not real rather than raising
    ValueError, which the previous inline check could do.
    """
    for id_obj in inst.get("identifiers", []):
        if not isinstance(id_obj, dict):
            continue
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        if not value.startswith("Q"):
            continue
        try:
            if int(value[1:]) < _SYNTHETIC_QID_THRESHOLD:
                return True
        except ValueError:
            continue  # non-numeric Q-value: ignore it
    return False


def _first_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location dict, or {} when absent.

    Guards against both a missing 'locations' key and an *empty* list
    (the old `inst.get('locations', [{}])[0]` idiom raised IndexError
    on an empty list).
    """
    locations = inst.get("locations")
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str]
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    country_qid: Wikidata QID for country (Q155=Brazil, Q96=Mexico, Q298=Chile)
    institution_types: List of Wikidata QIDs for institution types
        Q33506 - museum
        Q7075  - library
        Q166118 - archive

    Returns a dict keyed by QID with name/description/type, optional
    identifiers (ISIL, VIAF, Website), founding date, and coordinates.
    Returns {} on any query error (logged, not raised).
    """
    types_values = " ".join(f"wd:{qid}" for qid in institution_types)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {{
      VALUES ?type {{ {types_values} }}
      ?item wdt:P31 ?type .              # instance of museum/library/archive
      ?item wdt:P17 wd:{country_qid} .   # country
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,pt,en" . }}
    }}
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

        # Parse results into dict keyed by QID
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal, e.g. "Point(-46.63 -23.55)" = (lon lat).
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            results[qid] = result
        return results
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}


def institution_type_compatible(inst_name: str, wd_type: str) -> bool:
    """Check if institution types are compatible (avoid museum/archive mismatches).

    When the local name clearly says "museum"/"archive"/"library" (in any of
    the supported languages), the Wikidata type label must say so too.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()

    # Define type keywords (multilingual)
    museum_keywords = ['museum', 'museo', 'museu', 'musée']
    archive_keywords = ['archief', 'archive', 'archivo', 'arquivo']
    library_keywords = ['bibliotheek', 'library', 'biblioteca', 'bibliothèque']

    # Check if institution name contains type keyword
    inst_is_museum = any(kw in inst_lower for kw in museum_keywords)
    inst_is_archive = any(kw in inst_lower for kw in archive_keywords)
    inst_is_library = any(kw in inst_lower for kw in library_keywords)

    # Check if Wikidata type contains type keyword
    wd_is_museum = any(kw in wd_lower for kw in museum_keywords)
    wd_is_archive = any(kw in wd_lower for kw in archive_keywords)
    wd_is_library = any(kw in wd_lower for kw in library_keywords)

    # If both have explicit types, they must match
    if inst_is_museum and not wd_is_museum:
        return False
    if inst_is_archive and not wd_is_archive:
        return False
    if inst_is_library and not wd_is_library:
        return False
    return True


def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match institutions with Wikidata results.

    Institutions that already carry a real Wikidata QID are skipped; for
    the rest the single best type-compatible name match at or above
    *threshold* is kept.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if _has_real_wikidata_id(inst):
            continue

        # Find best match
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue
            # Check type compatibility
            if not institution_type_compatible(inst_name, wd_type):
                continue
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))
    return matches


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution in place with Wikidata data.

    Adds/replaces the Wikidata identifier, adds ISIL/VIAF/Website
    identifiers not already present, fills a missing founding date, fills
    missing coordinates on the first location, and records provenance.

    Returns True if anything was changed.
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID (or replace synthetic Q-number)
    wikidata_idx = None
    for i, id_obj in enumerate(identifiers_list):
        if isinstance(id_obj, dict) and id_obj.get("identifier_scheme") == "Wikidata":
            wikidata_idx = i
            break

    if wikidata_idx is not None:
        # Replace existing (possibly synthetic) Wikidata ID
        old_value = identifiers_list[wikidata_idx].get("identifier_value", "")
        if old_value != wd_data["qid"]:
            identifiers_list[wikidata_idx] = {
                "identifier_scheme": "Wikidata",
                "identifier_value": wd_data["qid"],
                "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
            }
            enriched = True
    else:
        # Add new Wikidata ID
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Update provenance
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata enrichment (fuzzy name match)"
            else:
                prov["extraction_method"] = "Wikidata enrichment (fuzzy name match)"
    return enriched


def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper
) -> tuple[int, int]:
    """
    Process a single country's institutions: query Wikidata, fuzzy match,
    and enrich the matched institutions in place.

    Returns:
        (institutions_without_wikidata, enriched_count)
    """
    country_info = COUNTRIES[country_code]
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if _first_location(inst).get('country') == country_code
    ]
    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    # Count those without real Wikidata
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not _has_real_wikidata_id(institutions[idx])
    ]
    current_coverage = (len(country_institutions_idx) - len(without_wikidata)) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {len(country_institutions_idx) - len(without_wikidata):,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")

    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} museums, libraries, and archives...")
    print("   (This may take 30-60 seconds)\n")
    institution_types = ["Q33506", "Q7075", "Q166118"]  # museum, library, archive
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types)
    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")

    if not wikidata_results:
        print("⚠️  No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    # Show sample matches
    if matches:
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 5)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Confidence: {score:.3f}")
            print(f"   Local:    {inst.get('name')} ({_first_location(inst).get('city', 'Unknown')})")
            print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f"   Type:     {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f"   ISIL:     {wd_data['identifiers']['ISIL']}")
        print(f"\n{'='*80}\n")

        # Apply all matches
        print("✅ Applying all matches...\n")
        enriched_count = 0
        for local_idx, qid, score, wd_data in matches:
            # local_idx indexes country_insts; map back to the global list.
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data):
                enriched_count += 1
        new_coverage = (len(country_institutions_idx) - len(without_wikidata) + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%\n")
        return len(without_wikidata), enriched_count
    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0


def main():
    """Load the dataset, enrich BR/MX/CL via fuzzy Wikidata matching, write output."""
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_latam_enriched.yaml"

    print("="*80)
    print("🌎 LATIN AMERICA INSTITUTIONS FUZZY MATCHING")
    print("="*80)
    print(f"\n📖 Loading dataset...\n")

    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Process each country
    total_without_wikidata = 0
    total_enriched = 0
    for country_code in ['BR', 'MX', 'CL']:
        without, enriched = process_country(institutions, country_code, sparql)
        total_without_wikidata += without
        total_enriched += enriched
        # Rate limiting - be nice to Wikidata
        if country_code != 'CL':  # Don't sleep after last country
            print("⏸️  Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output
    print("="*80)
    print("💾 WRITING ENRICHED DATASET")
    print("="*80 + "\n")
    header = f"""---
# Global Heritage Institutions - Latin America Fuzzy Match Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Latin America institutions processed: {sum(len([i for i in institutions if _first_location(i).get('country') == cc]) for cc in ['BR', 'MX', 'CL']):,}
# New Latin America matches: {total_enriched:,}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Total institutions enriched: {total_enriched:,}")
    print(f"   Latin America institutions without Wikidata: {total_without_wikidata - total_enriched:,}")
    print(f"\n⏱️  Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()