#!/usr/bin/env python3
"""
Enrich low-coverage countries using fuzzy name matching with Wikidata.

Target countries with <30% Wikidata coverage:
- Brazil (BR): 1.0%
- Belgium (BE): 0.0%
- Italy (IT): 0.0%
- Thailand (TH): <30%
- Norway (NO): <30%
- Vietnam (VN): <30%

Strategy:
1. Query Wikidata for museums/libraries/archives in target country
2. Fuzzy match institution names (normalized, threshold 0.85)
3. Verify type compatibility (don't match museum → archive)
4. Enrich with Wikidata IDs, VIAF, founding dates, websites
"""

import re
import sys
import time
from datetime import datetime, timezone
from difflib import SequenceMatcher
from functools import lru_cache
from pathlib import Path
from typing import Any, Optional

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore

# QIDs whose numeric part is at or above this value are treated as synthetic
# placeholders (not real Wikidata identifiers) by _has_real_wikidata_id().
_PLACEHOLDER_QID_THRESHOLD = 100_000_000

# Patterns are compiled once at import time: normalize_name() runs inside the
# O(institutions × candidates) matching loop, so per-call compilation/cache
# lookups would be wasted work.
_PREFIX_RE = re.compile(
    r'^(stichting|gemeentearchief|regionaal archief|museum|museu|museo|'
    r'biblioteca|bibliotheek|library|archive|archief|archivo)[\s\-]+'
)
_SUFFIX_RE = re.compile(
    r'[\s\-]+(archief|museum|museo|museu|bibliotheek|biblioteca|library|'
    r'archive|archivo)$'
)
# NOTE: "s.a." is special-cased because a trailing \b can never match after the
# final dot (both neighbors are non-word chars), so the previous
# \b(s\.a\.|...)\b pattern silently failed to strip it.
_ORG_FORM_RE = re.compile(r'\b(?:s\.a\.|(?:sa|nv|bv|vzw|asbl|inc|ltd|gmbh)\b)')
_PUNCT_RE = re.compile(r'[^\w\s]')


@lru_cache(maxsize=8192)
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, strips common GLAM-type prefixes/suffixes in several
    languages, removes organizational forms (nv, bv, vzw, ...), replaces
    punctuation with spaces and collapses whitespace.

    Cached because the same local name is re-normalized against every
    Wikidata candidate during matching.
    """
    name = name.lower()
    name = _PREFIX_RE.sub('', name)
    name = _SUFFIX_RE.sub('', name)
    name = _ORG_FORM_RE.sub('', name)
    name = _PUNCT_RE.sub(' ', name)
    return ' '.join(name.split())


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) on normalized forms."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def institution_type_compatible(inst_type: str, inst_name: str,
                                wd_name: str, wd_desc: str) -> bool:
    """
    Check if institution types are compatible.

    Prevents mismatches like museum → archive or library → museum.
    The check is one-directional: if the local record clearly declares a
    type keyword, the Wikidata candidate must mention the same type; a
    local record with no recognizable type keyword matches anything.
    """
    # Type keywords by language
    museum_kw = ['museum', 'museo', 'museu', 'muzeum']
    archive_kw = ['archief', 'archive', 'archivo', 'archivio']
    library_kw = ['bibliotheek', 'biblioteca', 'library', 'bibliothèque', 'bibliothek']

    inst_lower = (inst_name + ' ' + inst_type).lower()
    wd_lower = (wd_name + ' ' + wd_desc).lower()

    inst_is_museum = any(kw in inst_lower for kw in museum_kw)
    inst_is_archive = any(kw in inst_lower for kw in archive_kw)
    inst_is_library = any(kw in inst_lower for kw in library_kw)

    wd_is_museum = any(kw in wd_lower for kw in museum_kw)
    wd_is_archive = any(kw in wd_lower for kw in archive_kw)
    wd_is_library = any(kw in wd_lower for kw in library_kw)

    # If the local record has an explicit type, the candidate must share it
    if inst_is_museum and not wd_is_museum:
        return False
    if inst_is_archive and not wd_is_archive:
        return False
    if inst_is_library and not wd_is_library:
        return False
    return True


def _has_real_wikidata_id(inst: dict[str, Any]) -> bool:
    """Return True if the institution already carries a plausible real Wikidata QID.

    Synthetic placeholder QIDs (numeric part >= _PLACEHOLDER_QID_THRESHOLD)
    count as absent. Malformed values (e.g. "Q", "Qabc") are ignored rather
    than raising ValueError, which the previous inline int() parse did.
    """
    for id_obj in inst.get("identifiers", []):
        if not isinstance(id_obj, dict):
            continue
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        digits = value[1:]
        if value.startswith("Q") and digits.isdigit() and int(digits) < _PLACEHOLDER_QID_THRESHOLD:
            return True
    return False


def query_country_institutions(sparql: SPARQLWrapper, country_code: str) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions in a specific country.

    Args:
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint.
        country_code: ISO 3166-1 alpha-2 code (must be in the QID map below).

    Returns:
        dict keyed by QID; each value holds name, description, type,
        identifiers, and optional founding date / coordinates.
        Empty dict on unknown country code or query failure.
    """
    # Map ISO 3166-1 alpha-2 to Wikidata QIDs
    country_qids = {
        "BR": "Q155",   # Brazil
        "BE": "Q31",    # Belgium
        "IT": "Q38",    # Italy
        "NO": "Q20",    # Norway
        "TH": "Q869",   # Thailand
        "VN": "Q881",   # Vietnam
        "MX": "Q96",    # Mexico
        "CL": "Q298",   # Chile
    }

    country_qid = country_qids.get(country_code)
    if not country_qid:
        print(f" ⚠️ No Wikidata QID mapping for country code: {country_code}")
        return {}

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      # Institution is in target country
      ?item wdt:P17 wd:{country_qid} .

      # Institution is a GLAM type
      VALUES ?type {{
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
      }}
      ?item wdt:P31 ?type .

      # Optional enrichment data
      OPTIONAL {{ ?item wdt:P791 ?isil . }}       # ISIL code
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}       # VIAF ID
      OPTIONAL {{ ?item wdt:P625 ?coords . }}     # Coordinates
      OPTIONAL {{ ?item wdt:P856 ?website . }}    # Official website
      OPTIONAL {{ ?item wdt:P571 ?inception . }}  # Founding date

      # Get labels (adjust languages by region)
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en,pt,es,nl,fr,it,no,th,vi" .
      }}
    }}
    LIMIT 2000
    """

    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

        # Parse results into dict keyed by QID
        results: dict[str, dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            item_qid = item_uri.split("/")[-1] if item_uri else None
            if not item_qid or not item_qid.startswith("Q"):
                continue

            result: dict[str, Any] = {
                "qid": item_qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT literal is "Point(lon lat)" — note longitude comes first
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            results[item_qid] = result

        return results

    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}


def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match institutions with Wikidata results.

    Institutions that already carry a real Wikidata QID are skipped.
    Candidates failing the type-compatibility check are never considered.

    Returns:
        List of (institution_idx, qid, confidence_score, wd_data) for
        matches whose similarity is >= threshold.
    """
    matches = []

    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if _has_real_wikidata_id(inst):
            continue

        # Find best match across all candidates
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_desc = wd_data.get("description", "")
            if not wd_name:
                continue

            # Check type compatibility before spending time on similarity
            if not institution_type_compatible(inst_type, inst_name, wd_name, wd_desc):
                continue

            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))

    return matches


def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution in place with Wikidata data.

    Adds the Wikidata QID, any identifiers not already present (ISIL, VIAF,
    Website), a founding date if missing, and coordinates on the first
    location if missing. Updates provenance when anything changed.

    Returns True if enriched.
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Add founding date
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True

    # Add coordinates if missing on the first location
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True

    # Update provenance so downstream consumers can see the data source
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata fuzzy enrichment"
            else:
                prov["extraction_method"] = "Wikidata fuzzy enrichment"

    return enriched


def main():
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    # Enrichment is applied in place: output overwrites the input file.
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"

    # Target countries with low coverage
    target_countries = ["BR", "BE", "IT", "NO", "TH", "VN"]

    print("="*80)
    print("🌍 LOW-COVERAGE COUNTRIES FUZZY MATCHING")
    print("="*80)
    print(f"\n🎯 Target countries: {', '.join(target_countries)}\n")
    print(f"📖 Loading dataset...\n")

    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    total_enriched = 0

    # Process each country
    for country_code in target_countries:
        print("="*80)
        print(f"🌍 Processing {country_code}")
        print("="*80)

        # Filter institutions for this country
        country_institutions_idx = [
            idx for idx, inst in enumerate(institutions)
            if any(isinstance(loc, dict) and loc.get('country') == country_code
                   for loc in inst.get('locations', []))
        ]
        if not country_institutions_idx:
            print(f" ⚠️ No institutions found for {country_code}\n")
            continue
        print(f" Found {len(country_institutions_idx):,} institutions")

        # Count those without a (real) Wikidata identifier
        country_without_wikidata = [
            idx for idx in country_institutions_idx
            if not _has_real_wikidata_id(institutions[idx])
        ]
        current_coverage = (len(country_institutions_idx) - len(country_without_wikidata)) / len(country_institutions_idx) * 100
        print(f" Current Wikidata coverage: {current_coverage:.1f}%")
        print(f" Institutions needing enrichment: {len(country_without_wikidata):,}\n")

        # Query Wikidata
        print(f"🔍 Querying Wikidata for {country_code} institutions...")
        print(" (This may take 30-60 seconds)\n")
        wikidata_results = query_country_institutions(sparql, country_code)
        print(f"✅ Found {len(wikidata_results):,} {country_code} institutions in Wikidata\n")
        if not wikidata_results:
            print(f" ⚠️ No Wikidata results for {country_code}, skipping\n")
            continue

        # Fuzzy match
        print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
        country_insts = [institutions[idx] for idx in country_without_wikidata]
        matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
        print(f"✨ Found {len(matches):,} high-confidence matches\n")

        if matches:
            # Show sample matches
            print(f"📋 Sample matches (showing first 5):")
            for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
                inst = country_insts[local_idx]
                print(f"\n{i+1}. Confidence: {score:.3f}")
                print(f" Local: {inst.get('name')}")
                print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
                print(f" Type: {wd_data.get('type', 'Unknown')}")

            print(f"\n✅ Applying {len(matches)} matches for {country_code}...\n")
            country_enriched = 0
            for local_idx, qid, score, wd_data in matches:
                # local_idx indexes country_insts; map back to the global list
                global_idx = country_without_wikidata[local_idx]
                if enrich_institution(institutions[global_idx], wd_data):
                    country_enriched += 1
            print(f"✨ Enriched {country_enriched:,} institutions")

            # Calculate new coverage
            new_coverage = (len(country_institutions_idx) - len(country_without_wikidata) + country_enriched) / len(country_institutions_idx) * 100
            print(f" Coverage: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}%)\n")
            total_enriched += country_enriched
        else:
            print(f" ❌ No matches found for {country_code}\n")

        # Rate limiting between countries
        time.sleep(2.0)

    # Write output
    if total_enriched > 0:
        print("="*80)
        print("💾 Writing enriched dataset...")
        print("="*80 + "\n")

        with open(output_file, 'w', encoding='utf-8') as f:
            header = f"""---
# Global Heritage Institutions - Low-Coverage Countries Fuzzy Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(target_countries)}
# Total new matches: {total_enriched:,}
"""
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False,
                      sort_keys=False, width=120)
        print(f"✅ Complete! Output: {output_file}\n")

        # Final report
        print("="*80)
        print("📊 ENRICHMENT REPORT")
        print("="*80)
        print(f"\n✨ Total institutions enriched: {total_enriched:,}")
        print(f"⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
        print("="*80 + "\n")
    else:
        print("❌ No institutions enriched\n")


if __name__ == "__main__":
    main()