#!/usr/bin/env python3 """ Chilean GLAM Institutions - Batch 3 Wikidata Enrichment Target: 5 more university departments - Universidad del Bío-Bío (Chillán) → Q2661431 - Universidad de Talca (Talca) → Q3244354 - Universidad de la Frontera (Temuco) → Q3244350 - Universidad de Magallanes (Punta Arenas) → Q3244396 - Universidad de Playa Ancha (Valparaíso) → Q3244389 Strategy: Direct Q-number mapping with exact matching (100% accuracy in Batch 2) Expected result: 6 → 11 institutions (12.2% coverage) """ import yaml from pathlib import Path from datetime import datetime, timezone import shutil # File paths INPUT_FILE = Path("data/instances/chile/chilean_institutions_batch2_enriched.yaml") OUTPUT_FILE = Path("data/instances/chile/chilean_institutions_batch3_enriched.yaml") BACKUP_SUFFIX = ".batch3_backup" # Batch 3 enrichment mappings (hardcoded Q-numbers) BATCH3_MAPPINGS = { "Universidad del Bío-Bío": { "q_number": "Q2661431", "city": "Chillán", "region": "Diguillín", "verification": "Chile's state university in Chillán, founded 1988" }, "Universidad de Talca": { "q_number": "Q3244354", "city": "Talca", "region": "Talca", "verification": "State university in Talca, founded 1981" }, "Universidad de la Frontera": { "q_number": "Q3244350", "city": "Temuco", "region": "Cautín", "verification": "State university in Temuco, founded 1981" }, "Universidad de Magallanes": { "q_number": "Q3244396", "city": "Punta Arenas", "region": "Magallanes", "verification": "State university in Punta Arenas, founded 1961" }, "Universidad de Playa Ancha": { "q_number": "Q3244389", "city": "Valparaíso", "region": "Valparaíso", "verification": "State university in Valparaíso, founded 1948" } } def exact_match(institution_name: str, target_name: str, city: str, target_city: str) -> bool: """ Exact matching strategy (zero false positives). Criteria: 1. Institution name contains target university name 2. City/region matches 3. Institution type is EDUCATION_PROVIDER """ name_lower = institution_name.lower() target_lower = target_name.lower() city_lower = city.lower() if city else "" target_city_lower = target_city.lower() # Check if target university name is in institution name name_match = target_lower in name_lower # Check if city matches (either in city or region field) city_match = target_city_lower in city_lower return name_match and city_match def enrich_institutions(): """Main enrichment function.""" print("=" * 80) print("CHILEAN GLAM INSTITUTIONS - BATCH 3 WIKIDATA ENRICHMENT") print("=" * 80) print() # Load institutions print(f"📖 Loading institutions from: {INPUT_FILE}") with open(INPUT_FILE, 'r', encoding='utf-8') as f: institutions = yaml.safe_load(f) total_institutions = len(institutions) print(f" Loaded {total_institutions} institutions") print() # Count current Wikidata coverage enriched_before = sum(1 for inst in institutions if inst.get('identifiers') and any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers'])) print(f"📊 Current Wikidata coverage: {enriched_before}/{total_institutions} ({enriched_before/total_institutions*100:.1f}%)") print() # Create backup backup_file = str(INPUT_FILE) + BACKUP_SUFFIX print(f"💾 Creating backup: {backup_file}") shutil.copy2(INPUT_FILE, backup_file) print() # Enrich institutions print("🔍 Starting Batch 3 enrichment...") print() enriched_count = 0 skipped_count = 0 for institution in institutions: name = institution.get('name', '') institution_type = institution.get('institution_type', '') # Only process EDUCATION_PROVIDER institutions if institution_type != 'EDUCATION_PROVIDER': continue # Get location info locations = institution.get('locations', []) if not locations: continue location = locations[0] city = location.get('city', '') region = location.get('region', '') city_or_region = city or region # Check if already enriched identifiers = institution.get('identifiers', []) has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers) if has_wikidata: continue # Try to match with Batch 3 mappings matched = False for target_name, mapping in BATCH3_MAPPINGS.items(): if exact_match(name, target_name, city_or_region, mapping['city']): q_number = mapping['q_number'] print(f"✅ MATCH: {name}") print(f" Location: {city_or_region}") print(f" Q-number: {q_number}") print(f" Verification: {mapping['verification']}") # Add Wikidata identifier wikidata_id = { 'identifier_scheme': 'Wikidata', 'identifier_value': q_number, 'identifier_url': f'https://www.wikidata.org/wiki/{q_number}' } if not identifiers: institution['identifiers'] = [] institution['identifiers'].append(wikidata_id) # Update extraction method in provenance if 'provenance' in institution: current_method = institution['provenance'].get('extraction_method', '') institution['provenance']['extraction_method'] = ( f"{current_method} + Wikidata enrichment " f"(Batch 3, parent: {target_name}, exact match)" ) enriched_count += 1 matched = True print() break if not matched and institution_type == 'EDUCATION_PROVIDER': skipped_count += 1 print("=" * 80) print(f"📊 Batch 3 Enrichment Summary") print("=" * 80) print(f"✅ Enriched: {enriched_count} institutions") print(f"⏭️ Skipped: {skipped_count} institutions (no match)") print() # Count final Wikidata coverage enriched_after = sum(1 for inst in institutions if inst.get('identifiers') and any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers'])) print(f"📈 New Wikidata coverage: {enriched_after}/{total_institutions} ({enriched_after/total_institutions*100:.1f}%)") print(f" Improvement: +{enriched_after - enriched_before} institutions") print() # Save enriched dataset print(f"💾 Saving enriched dataset to: {OUTPUT_FILE}") with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False) print() print("✅ Batch 3 enrichment complete!") print() print("📁 Files:") print(f" Input: {INPUT_FILE}") print(f" Output: {OUTPUT_FILE}") print(f" Backup: {backup_file}") print() # Next steps print("🎯 Next Steps:") if enriched_after < 20: remaining = 20 - enriched_after print(f" - Need {remaining} more institutions to reach 22.2% coverage goal (20 institutions)") print(f" - Consider Batch 4: Major Santiago museums or regional universities") else: print(f" - 🎉 GOAL ACHIEVED! 22.2% coverage reached ({enriched_after} institutions)") print() if __name__ == "__main__": enrich_institutions()