#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 7 Wikidata Enrichment

Uses bulk SPARQL matches from query_wikidata_chilean_museums.py
32 museums with verified Q-numbers from Wikidata Query Service
Target: 52/90 institutions (57.8% coverage)

The script reads the batch 6 YAML file, attaches a Wikidata identifier to
every institution whose ``name`` equals an entry in BATCH_7_ENRICHMENTS
(and that has no Wikidata identifier yet), and writes the result to the
batch 7 YAML file.
"""

from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    # Deferred on purpose: the failure is reported by _require_yaml() with a
    # readable message the first time YAML I/O is actually needed.
    yaml = None

DEFAULT_INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch6_enriched.yaml")
DEFAULT_OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch7_enriched.yaml")

# Batch 7: 32 museums from SPARQL bulk query (all regions)
# Each entry: "name" is the institution name as stored in the input YAML
# (used for matching), "wikidata_name" is the label on the Wikidata side,
# "confidence" is "exact" or "partial", and "notes" is copied to provenance.
BATCH_7_ENRICHMENTS = [
    # ARICA Y PARINACOTA (1)
    {
        "name": "Museo Universidad de Tarapacá San Miguel de Azapa (MASMA)",
        "city": "Arica",
        "q_number": "Q9046776",
        "wikidata_name": "Museo Arqueológico y Antropológico de San Miguel de Azapa",
        "confidence": "partial",  # Name variation but city + type match
        "notes": "SPARQL match - partial name (includes full institutional title)",
    },
    # ANTOFAGASTA (2)
    {
        "name": "Museo de Historia Natural y Cultural del Desierto de Atacama (MUHNCAL)",
        "city": "Calama",
        "q_number": "Q86276638",
        "wikidata_name": "Museo de Historia Natural y Cultural del Desierto de Atacama",
        "confidence": "partial",
        # The local name carries an acronym that the Wikidata label lacks, so
        # this is a partial match (same situation as the MAPSE entry below).
        "notes": "SPARQL match - partial name (excludes acronym)",
    },
    {
        "name": "Museo Indígena Atacameño",
        "city": "Calama",
        "q_number": "Q86276595",
        "wikidata_name": "Museo Indígena Atacameño de Arqueología y Etnografía",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },
    # ATACAMA (1)
    {
        "name": "Museo Mineralógico Universidad de Atacama",
        "city": "Copiapó",
        "q_number": "Q28501803",
        "wikidata_name": "Museo Mineralógico de la Universidad de Atacama",
        "confidence": "partial",
        "notes": "SPARQL match - partial name match",
    },
    # VALPARAÍSO (5)
    {
        "name": "Museo de Historia Natural",
        "city": "Valparaíso",
        "q_number": "Q19950374",
        "wikidata_name": "Museo de Historia Natural de Valparaíso",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific), founded 1878",
    },
    {
        "name": "Casa Museo La Sebastiana",
        "city": "Valparaíso",
        "q_number": "Q86278008",
        "wikidata_name": "Casa Museo La Sebastiana",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (Pablo Neruda house)",
    },
    {
        "name": "Casa Museo Isla Negra",
        "city": "El Quisco",
        "q_number": "Q86277516",
        "wikidata_name": "Casa Museo Isla Negra",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (Pablo Neruda house)",
    },
    {
        "name": "Museo Antropológico Padre Sebastián Englert (MAPSE)",
        "city": "Isla de Pascua",
        "q_number": "Q5437650",
        "wikidata_name": "Museo Antropológico Padre Sebastián Englert",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (excludes acronym), founded 1973",
    },
    {
        "name": "Museo Arqueológico",
        "city": "Los Andes",
        "q_number": "Q86277234",
        "wikidata_name": "Museo Arqueológico de Los Andes",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific), founded 1905",
    },
    # REGIÓN METROPOLITANA (3)
    # NOTE(review): San Felipe and La Ligua appear to belong to the Valparaíso
    # region rather than the Metropolitana region - confirm the grouping.
    {
        "name": "Museo Histórico",
        "city": "San Felipe",
        "q_number": "Q86277658",
        "wikidata_name": "Museo Histórico de San Felipe",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },
    {
        "name": "Museo de La Ligua",
        "city": "La Ligua",
        "q_number": "Q6034082",
        "wikidata_name": "Museo de La Ligua",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1985",
    },
    {
        "name": "Museo de Talagante",
        "city": "Talagante",
        "q_number": "Q86280216",
        "wikidata_name": "Museo de Talagante",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    # O'HIGGINS (2)
    {
        "name": "Museo Histórico de Pichilemu",
        "city": "Pichilemu",
        "q_number": "Q112044338",
        "wikidata_name": "Museo Histórico de Pichilemu",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Lircunlauta",
        "city": "San Fernando",
        "q_number": "Q86280637",
        "wikidata_name": "Museo Lircunlauta",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    # MAULE (2)
    {
        "name": "Museo de Arte y Artesanía",
        "city": "Linares",
        "q_number": "Q6033923",
        "wikidata_name": "Museo Arte y Artesanía de Linares",
        "confidence": "partial",
        "notes": "SPARQL match - partial name match, founded 1962",
    },
    {
        "name": "Museo Histórico de Yerbas Buenas",
        "city": "Yerbas Buenas",
        "q_number": "Q20022173",
        "wikidata_name": "Museo Histórico de Yerbas Buenas",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    # ÑUBLE (3)
    {
        "name": "Museo Marta Colvin",
        "city": "Chillán",
        "q_number": "Q112044588",
        "wikidata_name": "Museo Marta Colvin",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Municipal de Ciencias Naturales",
        "city": "Chillán",
        "q_number": "Q112044585",
        "wikidata_name": "Museo Municipal de Ciencias Naturales y Arqueológico Profesor Pedro Ramírez Fuentes",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },
    {
        "name": "Itata Museo Antropológico",
        "city": "Quirihue",
        "q_number": "Q112044584",
        "wikidata_name": "Itata Museo Antropológico",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    # BIOBÍO (1)
    {
        "name": "Museo Mapuche de Cañete",
        "city": "Cañete",
        "q_number": "Q16609804",
        "wikidata_name": "Museo Mapuche de Cañete",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1977",
    },
    # LOS RÍOS (4)
    {
        "name": "Museo Histórico y Antropológico",
        "city": "Valdivia",
        "q_number": "Q6940480",
        "wikidata_name": "Museo Histórico y Antropológico de Valdivia Mauricio Van de Maele",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata), founded 1994",
    },
    {
        "name": "Museo de la Catedral",
        "city": "Valdivia",
        "q_number": "Q86283115",
        "wikidata_name": "Museo de la Catedral de Valdivia",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },
    {
        "name": "Museo de sitio Castillo de Niebla",
        "city": "Valdivia",
        "q_number": "Q20022172",
        "wikidata_name": "Museo de Sitio Castillo de Niebla",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match (case variation)",
    },
    {
        "name": "Museo Tringlo",
        "city": "Lago Ranco",
        "q_number": "Q86282868",
        "wikidata_name": "Museo Tringlo",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    # LOS LAGOS (3)
    {
        "name": "Museo Colonial Alemán de Frutillar",
        "city": "Frutillar",
        "q_number": "Q20010979",
        "wikidata_name": "Museo Colonial Alemán de Frutillar",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Antonio Felmer",
        "city": "Puerto Varas",
        "q_number": "Q20022171",
        "wikidata_name": "Museo Antonio Felmer",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo y Archivo Histórico Municipal",
        "city": "Osorno",
        "q_number": "Q16609772",
        "wikidata_name": "Museo y Archivo Histórico Municipal de Osorno",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (city-specific)",
    },
    # AYSÉN (3)
    # NOTE(review): Chaitén appears to belong to the Los Lagos region rather
    # than Aysén - confirm the grouping.
    {
        "name": "Museo de Sitio de Chaitén",
        "city": "Chaitén",
        "q_number": "Q112044386",
        "wikidata_name": "Museo de Sitio de Chaitén",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Municipal de Cochrane",
        "city": "Cochrane",
        "q_number": "Q86284188",
        "wikidata_name": "Museo Municipal de Cochrane",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match",
    },
    {
        "name": "Museo Pioneros del Baker",
        "city": "Cochrane",
        "q_number": "Q86284160",
        "wikidata_name": "Museo Rural Pioneros del Baker",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (includes 'Rural')",
    },
    # MAGALLANES (2)
    {
        "name": "Museo Salesiano",
        "city": "Punta Arenas",
        "q_number": "Q86284641",
        "wikidata_name": "Museo Salesiano Maggiorino Borgatello",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full title in Wikidata)",
    },
    {
        "name": "Museo Municipal Fernando Cordero Rusque",
        "city": "Porvenir",
        "q_number": "Q83551041",
        "wikidata_name": "Museo Municipal Fernando Cordero Rusque",
        "confidence": "exact",
        "notes": "SPARQL match - exact name match, founded 1980",
    },
]


def _require_yaml() -> None:
    """Abort with a readable message when PyYAML could not be imported."""
    if yaml is None:
        raise SystemExit("❌ PyYAML is required to run this script (pip install pyyaml)")


def load_institutions(yaml_path: Path) -> list:
    """Load institutions from YAML file.

    Args:
        yaml_path: Path of the YAML document to read (UTF-8).

    Returns:
        The parsed document when its top level is a list, otherwise an
        empty list.
    """
    _require_yaml()
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    return data if isinstance(data, list) else []


def wikidata_identifiers(institution: dict) -> list:
    """Return the identifier entries of *institution* whose scheme is 'Wikidata'.

    A missing or null ``identifiers`` key is treated as "no identifiers";
    entries that are not mappings are ignored.
    """
    return [
        entry
        for entry in (institution.get('identifiers') or [])
        if isinstance(entry, dict) and entry.get('identifier_scheme') == 'Wikidata'
    ]


def coverage_percent(matched: int, total: int) -> float:
    """Return ``matched / total`` as a percentage, or 0.0 when *total* is 0."""
    return (matched / total) * 100 if total else 0.0


def enrich_institution(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata identifier to institution.

    The institution is modified in place. When it already carries a Wikidata
    identifier, a warning is printed and nothing is changed. Otherwise the
    identifier is appended and the ``provenance`` mapping is updated with the
    batch number, method, confidence, a UTC timestamp and the enrichment note.

    Args:
        institution: Institution record (mutated in place).
        enrichment: Entry with at least ``q_number`` and ``confidence``;
            ``notes`` is optional.

    Returns:
        The same *institution* object.
    """
    # A YAML key that is present but empty loads as None; treat it as missing.
    if institution.get('identifiers') is None:
        institution['identifiers'] = []

    # Check if Wikidata already exists
    existing = wikidata_identifiers(institution)
    if existing:
        print(f"⚠️ {institution.get('name')} already has Wikidata: {existing[0].get('identifier_value')}")
        return institution

    # Add new Wikidata identifier
    q_number = enrichment['q_number']
    institution['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}",
    })

    # Update provenance (key order is kept because the dump uses sort_keys=False)
    if institution.get('provenance') is None:
        institution['provenance'] = {}
    provenance = institution['provenance']
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 7
    provenance['enrichment_method'] = 'SPARQL_BULK_QUERY'
    provenance['enrichment_confidence'] = enrichment['confidence']
    provenance['wikidata_verified'] = True

    if enrichment.get('notes'):
        existing_notes = provenance.get('notes')
        if existing_notes is None:
            existing_notes = []
        elif isinstance(existing_notes, str):
            # Earlier data stored a single free-text note; promote it to a list.
            existing_notes = [existing_notes]
        provenance['notes'] = existing_notes
        existing_notes.append(f"Batch 7: {enrichment['notes']}")

    print(f"✅ Enriched: {institution.get('name')} → {q_number}")
    return institution


def apply_enrichments(institutions: list, enrichments: list) -> dict:
    """Apply every enrichment to the institution with the same ``name``.

    Institutions are matched on exact ``name`` equality only. An enrichment
    is skipped when no institution or more than one institution carries that
    name, or when the matched institution already has a Wikidata identifier.

    Args:
        institutions: Institution records (mutated in place).
        enrichments: Entries shaped like those in BATCH_7_ENRICHMENTS.

    Returns:
        Counters with the keys ``enriched``, ``already_enriched``,
        ``not_found`` and ``ambiguous``; they sum to ``len(enrichments)``.
    """
    # Index once by name instead of rescanning the list for every enrichment.
    by_name = defaultdict(list)
    for inst in institutions:
        if isinstance(inst, dict):
            by_name[inst.get('name')].append(inst)

    stats = {'enriched': 0, 'already_enriched': 0, 'not_found': 0, 'ambiguous': 0}
    for enrichment in enrichments:
        matches = by_name.get(enrichment['name'], [])

        if not matches:
            print(f"❌ NOT FOUND: {enrichment['name']} ({enrichment['city']})")
            stats['not_found'] += 1
            continue

        if len(matches) > 1:
            print(f"⚠️ MULTIPLE MATCHES: {enrichment['name']}")
            stats['ambiguous'] += 1
            continue

        institution = matches[0]
        if wikidata_identifiers(institution):
            stats['already_enriched'] += 1
            continue

        enrich_institution(institution, enrichment)
        stats['enriched'] += 1

    return stats


def main(input_file: Path = DEFAULT_INPUT_FILE, output_file: Path = DEFAULT_OUTPUT_FILE) -> None:
    """Run the batch 7 enrichment and write the enriched YAML file.

    Args:
        input_file: YAML file produced by the previous batch.
        output_file: Destination YAML file for this batch.

    Raises:
        SystemExit: When PyYAML is missing or no institutions could be loaded
            (nothing is written in either case).
    """
    input_file = Path(input_file)
    output_file = Path(output_file)

    print("=" * 80)
    print("CHILEAN INSTITUTIONS - BATCH 7 WIKIDATA ENRICHMENT")
    print("=" * 80)
    print()
    print(f"📂 Input: {input_file}")
    print(f"📝 Output: {output_file}")
    print(f"🎯 Target: Add {len(BATCH_7_ENRICHMENTS)} Wikidata Q-numbers (SPARQL bulk)")
    print()

    # Load institutions
    institutions = load_institutions(input_file)
    print(f"📖 Loaded {len(institutions)} institutions")
    print()

    # An empty result means the input was empty or not a list; stop before
    # computing coverage or writing an empty file that the next batch would read.
    if not institutions:
        raise SystemExit(f"❌ No institutions loaded from {input_file}; nothing written")

    # Process each enrichment
    stats = apply_enrichments(institutions, BATCH_7_ENRICHMENTS)

    print()
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {stats['enriched']}")
    print(f"⚠️ Already enriched: {stats['already_enriched']}")
    print(f"❌ Not found: {stats['not_found']}")
    print(f"⚠️ Ambiguous (multiple matches): {stats['ambiguous']}")
    print()

    # Count total with Wikidata
    records = [inst for inst in institutions if isinstance(inst, dict)]
    total_with_wikidata = sum(1 for inst in records if wikidata_identifiers(inst))
    coverage = coverage_percent(total_with_wikidata, len(institutions))
    print(f"📊 Total institutions: {len(institutions)}")
    print(f"🔗 With Wikidata: {total_with_wikidata} ({coverage:.1f}%)")
    print()

    # Save enriched data
    print(f"💾 Saving to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )

    print("✅ Batch 7 enrichment complete!")
    print()

    # Museum breakdown
    museums = [inst for inst in records if inst.get('institution_type') == 'MUSEUM']
    museums_with_wd = [m for m in museums if wikidata_identifiers(m)]
    print(f"🏛️ Museums: {len(museums_with_wd)}/{len(museums)} with Wikidata")
    print()


if __name__ == "__main__":
    main()