#!/usr/bin/env python3
"""
Phase 2 Enrichment: Mexico (MX)

Target: 192 institutions, 17.7% Wikidata coverage → 35%+ (67+ institutions)
Strategy: SPARQL batch query + fuzzy name matching (Spanish normalization)
Based on: Brazil Phase 2 methodology (achieved 32.5% coverage from 13.7%)

GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
"""

import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re

# Make the project's src/ importable before pulling in project dependencies.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching (Spanish + English).

    Lowercases, strips common GLAM-sector prefixes/suffixes in both
    languages, drops parenthesized abbreviations and punctuation, and
    collapses whitespace so that e.g. "Museo Nacional de X (MNX)" and
    "Nacional de X" compare closely.
    """
    name = name.lower()
    # Remove common prefixes/suffixes (Spanish + English)
    name = re.sub(r'^(fundación|museo|biblioteca|archivo|centro|memorial|parque|galería)\s+', '', name)
    name = re.sub(r'\s+(museo|biblioteca|archivo|nacional|estatal|municipal|federal|regional|memorial)$', '', name)
    name = re.sub(r'^(foundation|museum|library|archive|center|centre|memorial|park|gallery)\s+', '', name)
    name = re.sub(r'\s+(museum|library|archive|national|state|federal|regional|municipal|memorial)$', '', name)
    # Remove abbreviations in parentheses
    name = re.sub(r'\s*\([^)]*\)\s*', ' ', name)
    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)
    # Normalize whitespace
    name = ' '.join(name.split())
    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1) after normalization."""
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    return SequenceMatcher(None, norm1, norm2).ratio()


def _has_wikidata_qid(inst: Dict[str, Any]) -> bool:
    """Return True if the institution already carries a real (Q-prefixed) Wikidata ID.

    Used both for the coverage count and for skipping already-enriched
    institutions during matching, so the two can never disagree.
    """
    return any(
        id_obj.get("identifier_scheme") == "Wikidata"
        and id_obj.get("identifier_value", "").startswith("Q")
        for id_obj in inst.get("identifiers", []) or []
    )


def query_wikidata_mexican_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in Mexico.

    Institution types: museums, libraries, archives, galleries, universities with collections

    Returns a dict keyed by QID; each value holds name, description, type,
    optional identifiers (ISIL/VIAF/Website), founding date and coordinates.
    Returns {} on any query failure (error is printed, not raised).
    """
    query = """ SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel WHERE { VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 } ?item wdt:P31/wdt:P279* ?type . # instance of (or subclass of) institution type ?item wdt:P17 wd:Q96 . # country = Mexico (Q96) OPTIONAL { ?item wdt:P791 ?isil . } OPTIONAL { ?item wdt:P214 ?viaf . } OPTIONAL { ?item wdt:P625 ?coords . } OPTIONAL { ?item wdt:P856 ?website . } OPTIONAL { ?item wdt:P571 ?inception . } SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,pt" . } } LIMIT 5000 """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse results into dict keyed by QID. An item may appear in several
        # rows (one per type/identifier combination); later rows overwrite
        # earlier ones, keeping one record per QID.
        results = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal: "Point(lon lat)" — note longitude comes first.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[qid] = result
        return results
    except Exception as e:
        # Best-effort: a failed query aborts enrichment (caller checks for {}),
        # it must not crash the run before anything was written.
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}


def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    Keyword-based: the local record's name/category and the Wikidata type
    label are scanned for museum/archive/library/gallery keywords (es/pt/en/fr).
    MIXED, OFFICIAL_INSTITUTION and EDUCATION_PROVIDER match anything.
    """
    inst_lower = inst_name.lower()
    wd_lower = wd_type.lower()

    museum_kw = ['museo', 'museu', 'museum']
    archive_kw = ['archivo', 'arquivo', 'archive']
    library_kw = ['biblioteca', 'library', 'bibliothèque']
    gallery_kw = ['galería', 'galeria', 'gallery', 'galerie']

    inst_is_museum = any(kw in inst_lower for kw in museum_kw) or inst_type == "MUSEUM"
    inst_is_archive = any(kw in inst_lower for kw in archive_kw) or inst_type == "ARCHIVE"
    inst_is_library = any(kw in inst_lower for kw in library_kw) or inst_type == "LIBRARY"
    inst_is_gallery = any(kw in inst_lower for kw in gallery_kw) or inst_type == "GALLERY"

    wd_is_museum = any(kw in wd_lower for kw in museum_kw)
    wd_is_archive = any(kw in wd_lower for kw in archive_kw)
    wd_is_library = any(kw in wd_lower for kw in library_kw)
    wd_is_gallery = any(kw in wd_lower for kw in gallery_kw)

    # MIXED, OFFICIAL_INSTITUTION, EDUCATION_PROVIDER can match any type
    if inst_type in ["MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"]:
        return True

    # If both have explicit types, they must match
    if inst_is_museum and not wd_is_museum:
        return False
    if inst_is_archive and not wd_is_archive:
        return False
    if inst_is_library and not wd_is_library:
        return False
    if inst_is_gallery and not wd_is_gallery:
        return False

    return True


def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Mexican institutions with Wikidata results.

    Institutions that already have a Q-prefixed Wikidata identifier are
    skipped. Each remaining institution is paired with its single
    best-scoring, type-compatible Wikidata candidate, kept only when the
    score reaches `threshold`.

    Returns: List of (institution_idx, qid, confidence_score, wd_data)
    """
    # Pre-normalize every Wikidata label ONCE. The six-regex normalization
    # is loop-invariant per candidate; without this it would run for every
    # (institution × candidate) pair.
    wd_norms = {
        qid: normalize_name(wd_data.get("name", ""))
        for qid, wd_data in wikidata_results.items()
        if wd_data.get("name")
    }

    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        inst_type = inst.get("institution_type", "")
        if not inst_name:
            continue

        # Skip if already has real Wikidata ID
        if _has_wikidata_qid(inst):
            continue

        inst_norm = normalize_name(inst_name)

        # Find best match
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            wd_type = wd_data.get("type", "")
            if not wd_name:
                continue
            # Check type compatibility
            if not institution_type_compatible(inst_name, inst_type, wd_type):
                continue
            score = SequenceMatcher(None, inst_norm, wd_norms[qid]).ratio()
            if score > best_score:
                best_score = score
                best_qid = qid
                best_data = wd_data

        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))

    return matches


def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution in place with Wikidata data.

    Adds the Wikidata QID plus any ISIL/VIAF/Website identifiers not
    already present (matched by identifier_scheme), and records the
    enrichment in the provenance block. Returns True if anything was added.
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True

    # Update provenance with enrichment metadata
    if enriched:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Mexico Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Mexico Wikidata enrichment"
            # Add enrichment history
            if "enrichment_history" not in prov:
                prov["enrichment_history"] = []
            prov["enrichment_history"].append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Spanish normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })

    return enriched


def main():
    """Run the full Mexico enrichment pipeline: load → backup → query → match → write."""
    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_mexico_backup"

    print("="*80)
    print("🇲🇽 PHASE 2 MEXICO WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")

    start_time = time.time()

    # Load master dataset
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Single pass: collect master-dataset indices of Mexican institutions,
    # then derive the MX subset from those indices (keeps subset and index
    # lists aligned by construction).
    mexico_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]
    mexico_institutions = [all_institutions[i] for i in mexico_indices]

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(mexico_institutions):,} Mexican institutions\n")

    # Count Wikidata coverage using the SAME predicate the matcher uses
    # (Q-prefixed value), so coverage math and skip logic cannot diverge.
    with_wikidata = sum(1 for inst in mexico_institutions if _has_wikidata_qid(inst))
    without_wikidata = len(mexico_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(mexico_institutions) * 100) if mexico_institutions else 0

    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")

    if without_wikidata == 0:
        print("✨ All Mexican institutions already have Wikidata IDs!")
        return

    # Create backup before any mutation of the master dataset.
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Backup created\n")

    # Query Wikidata
    print("🔍 Querying Wikidata for Mexican heritage institutions...")
    print("   (This may take 30-60 seconds)\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Mexico Enrichment)")

    query_start = time.time()
    wikidata_results = query_wikidata_mexican_institutions(sparql)
    query_time = time.time() - query_start
    print(f"✅ Found {len(wikidata_results):,} Mexican institutions in Wikidata (query took {query_time:.1f}s)\n")

    if not wikidata_results:
        print("⚠️ No Wikidata results, aborting enrichment")
        return

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.70, Spanish normalization)...\n")
    matches = fuzzy_match_institutions(mexico_institutions, wikidata_results, threshold=0.70)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return

    # Show sample matches
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")
    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)
    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = mexico_institutions[idx]
        city = inst.get("locations", [{}])[0].get("city", inst.get("locations", [{}])[0].get("region", "Unknown"))
        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local: {inst.get('name')} ({city})")
        print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type: {wd_data.get('type', 'Unknown')}")
    print(f"\n{'='*80}\n")

    # Apply matches to master dataset
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0
    for local_idx, qid, score, wd_data in matches:
        # Translate MX-subset index back to its position in the full dataset.
        master_idx = mexico_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1

    new_coverage = (with_wikidata + enriched_count) / len(mexico_institutions) * 100
    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")

    # Write updated master dataset
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")
    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    write_time = time.time() - write_start
    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")

    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Mexican institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 35%+ coverage (67+ institutions)")
    if new_coverage >= 35:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 35%)")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()