#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)

FINAL VERSION with strict matching to avoid false positives.

Strategy:
- ONLY enrich institutions we can match with 100% certainty
- Use exact name matching + institution type + location verification
- Direct Q-number mapping (no SPARQL queries)

BATCH 2 FINAL TARGETS (4 institutions verified in dataset). The locations
below are the exact city/region values the matcher expects; they mirror
BATCH_2_EXACT_MATCHES:

1. Universidad de Chile's Archivo Central Andrés Bello → Q219576
   Location: city "Provincia de Santiago", region "Santiago" ✓

2. Universidad de Concepción's SIBUDEC → Q1163431
   Location: city "Concepción", region "Concepción" ✓

3. Universidad Austral → Q1163558
   Location: city "Valdivia", region "Valdivia" ✓

4. Universidad Católica (Temuco) → Q2900814
   Location: city "Temuco", region "Maipo" ✓
   (This is Universidad Católica de Temuco, NOT PUC Santiago)
"""

from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

# Value of ``identifier_scheme`` that marks a Wikidata identifier record.
_WIKIDATA_SCHEME = 'Wikidata'

# EXACT institution matches (verified in dataset).
# Each target must match a record on name, institution type, city AND region.
BATCH_2_EXACT_MATCHES = [
    {
        'exact_name': "Universidad de Chile's Archivo Central Andrés Bello",
        'inst_type': 'ARCHIVE',
        'expected_city': 'Provincia de Santiago',
        'expected_region': 'Santiago',
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'notes': "Central archive of Universidad de Chile, Chile's oldest university (founded 1842)",
    },
    {
        'exact_name': "Universidad de Concepción's SIBUDEC",
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Concepción',
        'expected_region': 'Concepción',
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'notes': 'Library system (SIBUDEC) of Universidad de Concepción, third oldest university in Chile (founded 1919)',
    },
    {
        'exact_name': 'Universidad Austral',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Valdivia',
        'expected_region': 'Valdivia',
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'notes': 'Universidad Austral de Chile in Valdivia, southern Chile (founded 1954)',
    },
    {
        'exact_name': 'Universidad Católica',
        'inst_type': 'EDUCATION_PROVIDER',
        'expected_city': 'Temuco',
        'expected_region': 'Maipo',
        'parent_university': 'Universidad Católica de Temuco',
        'wikidata_q': 'Q2900814',
        'notes': 'Universidad Católica de Temuco (founded 1991, previously sede of PUC Valparaíso)',
    },
]


def _identifiers(inst: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the record's identifier list, treating a missing/null key as empty."""
    # YAML such as ``identifiers:`` (no value) loads as None, not [].
    return inst.get('identifiers') or []


def _existing_wikidata_id(inst: Dict[str, Any]) -> Optional[str]:
    """Return the value of the first Wikidata identifier, or None if absent."""
    return next(
        (
            id_obj.get('identifier_value')
            for id_obj in _identifiers(inst)
            if id_obj.get('identifier_scheme') == _WIKIDATA_SCHEME
        ),
        None,
    )


def _coverage_pct(count: int, total: int) -> float:
    """Return ``count / total`` as a percentage, or 0.0 for an empty dataset."""
    return count / total * 100 if total else 0.0


def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Check if institution already has Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == _WIKIDATA_SCHEME
        for id_obj in _identifiers(inst)
    )


def exact_match(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Verify institution exactly matches target (name + type + location).

    Args:
        inst: Institution record loaded from the dataset.
        target: One entry of ``BATCH_2_EXACT_MATCHES``.

    Returns:
        True only when the name, the institution type, and BOTH the city and
        region of the record's first location equal the target's values.
        Records without any location never match.
    """
    if inst.get('name') != target['exact_name']:
        return False

    if inst.get('institution_type') != target['inst_type']:
        return False

    locations = inst.get('locations') or []
    if not locations:
        return False

    # Only the first (primary) location is compared; a null entry is
    # treated as a location with no city/region and therefore cannot match.
    location = locations[0] or {}
    return (
        location.get('city', '') == target['expected_city']
        and location.get('region', '') == target['expected_region']
    )


def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    notes: str
) -> Dict[str, Any]:
    """Add Wikidata identifier with provenance tracking.

    Mutates ``inst`` in place: appends a Wikidata identifier record, extends
    (or creates) the description, and appends an enrichment note to
    ``provenance.extraction_method`` when a provenance mapping exists.

    Args:
        inst: Institution record to enrich.
        q_number: Wikidata Q-number, e.g. ``'Q219576'``.
        parent_university: Name of the parent university the Q-number refers to.
        notes: Free-text note appended to the description.

    Returns:
        The same ``inst`` object, for convenience.
    """
    wikidata_id = {
        'identifier_scheme': _WIKIDATA_SCHEME,
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    }

    # A key that is present but null must be replaced, not appended to.
    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Update description. An existing description that already mentions the
    # parent university is left untouched.
    description = inst.get('description')
    if description:
        if parent_university not in description:
            inst['description'] = f"{description} Part of {parent_university}. {notes}"
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"

    # Update provenance (only when the record carries a provenance mapping).
    provenance = inst.get('provenance')
    if isinstance(provenance, dict):
        # ``or ''`` keeps a null extraction_method from rendering as "None".
        old_method = provenance.get('extraction_method') or ''
        enrichment_note = f" + Wikidata enrichment (Batch 2, parent: {parent_university}, exact match)"
        provenance['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst


def main() -> None:
    """Run the Batch 2 enrichment over the Batch 1 output file.

    Loads the Batch 1 YAML, writes a backup dump next to it, enriches every
    target of ``BATCH_2_EXACT_MATCHES`` that matches exactly and has no
    Wikidata identifier yet, prints a summary, and writes the Batch 2 YAML
    only when at least one record was enriched.

    Raises:
        ValueError: If the input YAML's top-level value is not a list.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (CORRECTED)")
    print("Exact Matching Only - Zero False Positives")
    print("Session: November 9, 2025")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file.name}")
    with open(data_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        institutions = yaml.safe_load(f) or []

    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a YAML list of institutions in {data_file}, "
            f"got {type(institutions).__name__}"
        )

    total = len(institutions)
    print(f" Total institutions: {total}")

    # Check existing Wikidata coverage
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f" Current Wikidata coverage: {with_wikidata_before}/{total} ({_coverage_pct(with_wikidata_before, total):.1f}%)")
    print()

    # Create backup. This is a re-serialised dump of the loaded data, not a
    # byte-for-byte copy of the input file.
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Batch 2 Enrichment (Exact Matching):")
    print("-" * 80)

    # Process each target
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for i, target in enumerate(BATCH_2_EXACT_MATCHES, 1):
        print(f"\n[{i}/{len(BATCH_2_EXACT_MATCHES)}] 🎓 {target['exact_name']}")
        print(f" Parent: {target['parent_university']}")
        print(f" Wikidata: {target['wikidata_q']}")
        # Label the fields: targets 1 and 4 are ambiguous when printed as a bare pair.
        print(f" Expected location: city={target['expected_city']}, region={target['expected_region']}")

        # Find exact match (first record that satisfies every criterion)
        matched = next(
            (inst for inst in institutions if exact_match(inst, target)),
            None,
        )

        if matched is None:
            print(" ❌ NOT FOUND (exact match failed)")
            not_found_count += 1
            continue

        print(" ✅ EXACT MATCH CONFIRMED")

        # Check if already has Wikidata
        if has_wikidata(matched):
            existing_q = _existing_wikidata_id(matched)
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier
        print(f" ➕ Adding Wikidata: {target['wikidata_q']} ({target['parent_university']})")
        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            target['notes']
        )
        enriched_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))

    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {total}")
    print(f" Before Batch 2: {with_wikidata_before} ({_coverage_pct(with_wikidata_before, total):.1f}%)")
    print(f" After Batch 2: {with_wikidata_after} ({_coverage_pct(with_wikidata_after, total):.1f}%)")
    print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")

    if with_wikidata_after > 0:
        coverage_pct = _coverage_pct(with_wikidata_after, total)
        print(f" Progress toward 22% goal: {coverage_pct:.1f}% / 22.0%")

    # Save if any enrichments
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS - Batch 3 Options:")
        print()
        print("Option A: Major Museums (5 institutions):")
        print(" - Museo Nacional de Historia Natural (Santiago)")
        print(" - Museo de Arte Precolombino (Santiago)")
        print(" - Museo Histórico Nacional (Santiago)")
        print(" - Museo de Bellas Artes (Santiago)")
        print(" - Museo Regional de Ancud (Chiloé)")
        print()
        print("Option B: More University Departments (5 institutions):")
        print(" - Universidad del Bío-Bío's [department]")
        print(" - Universidad de Talca's Centro [department]")
        print(" - Universidad de la Frontera [department]")
        print(" - Universidad de Magallanes [department]")
        print(" - Universidad de Playa Ancha's [department]")
        print()
        print("Recommendation: Try Option B first (universities have better Wikidata coverage)")
    else:
        print()
        print("⚠️ No enrichments - all targets already enriched or not found")


if __name__ == '__main__':
    main()