#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 5 Wikidata Enrichment
Target: Universities and high-profile regional museums
Goal: 12/90 → 16/90 (13.3% → 17.8% coverage)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

# Batch 5 targets with verified Wikidata Q-numbers
BATCH_5_TARGETS = [
    # Universities (3 institutions - high Wikidata coverage priority)
    {
        "q_number": "Q3551323",
        "name_pattern": "Universidad Arturo Prat",
        "location": "Iquique",
        "institution_type": "EDUCATION_PROVIDER",
        "verification": "Universidad Arturo Prat, public university in Iquique, founded 1984"
    },
    {
        "q_number": "Q7895095",
        "name_pattern": "Universidad de Atacama",
        "location": "Copiapó",
        "institution_type": "EDUCATION_PROVIDER",
        "verification": "Universidad de Atacama, public university in Copiapó, founded 1981"
    },
    {
        "q_number": "Q634259",
        "name_pattern": "Universidad de Playa Ancha",
        "location": "Valparaíso",  # May appear as San Felipe in dataset
        "institution_type": "EDUCATION_PROVIDER",
        "verification": "Universidad de Playa Ancha, public university in Valparaíso, founded 1948",
        "notes": "May appear with 's possessive (Universidad de Playa Ancha's)"
    },
    # Museums (1 institution - verified match)
    {
        "q_number": "Q2885665",
        "name_pattern": "Museo Gabriela Mistral",
        "location": "Vicuña",
        "institution_type": "MUSEUM",
        "verification": "Museo-Biblioteca Gabriela Mistral, archaeological/public museum in Vicuña, founded 1957"
    },
    # Note: Museo Arqueológico de La Serena Q86276952 EXCLUDED (Wikidata shows art/private museum,
    # but institutional description suggests archaeological/public museum - data quality issue requires verification)
]


def load_institutions(file_path: Path) -> List[Dict]:
    """Load institutions from YAML file.

    Returns an empty list if the file parses to nothing (yaml.safe_load
    returns None for an empty document), so callers can safely call len().
    """
    print(f"📖 Loading institutions from: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []
    print(f"   Loaded {len(institutions)} institutions")
    return institutions


def count_wikidata_coverage(institutions: List[Dict]) -> tuple:
    """Count institutions with Wikidata identifiers.

    Returns:
        (number_with_wikidata, total_number_of_institutions)
    """
    with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    return with_wikidata, len(institutions)


def institution_has_wikidata(institution: Dict) -> bool:
    """Check if institution already has Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in institution.get('identifiers', [])
    )


def matches_target(institution: Dict, target: Dict) -> bool:
    """Check if institution matches target criteria.

    Matching rules:
    - institution_type must equal the target's type exactly;
    - the target name_pattern must appear in the institution name after a
      trailing possessive "'s" is removed;
    - location matching is flexible: universities are allowed to mismatch
      (multiple campuses), and unknown/missing cities are accepted.
    """
    name = institution.get('name', '')
    inst_type = institution.get('institution_type', '')
    locations = institution.get('locations', [])

    # Institution type must match
    if inst_type != target['institution_type']:
        return False

    # Name must contain the pattern (handle possessive 's).
    # NOTE: str.rstrip("'s") would strip the *character set* {', s} and
    # mangle names that legitimately end in "s"; removesuffix drops only
    # the exact "'s" suffix.
    name_normalized = name.removesuffix("'s")
    if target['name_pattern'] not in name_normalized:
        return False

    # Location match (flexible for universities with multiple campuses)
    if locations:
        city = locations[0].get('city', '')
        # Flexible location matching
        if city and city != 'Unknown':
            # Accept if city matches target OR if we can't determine location
            if target['location'] not in city and city not in target['location']:
                # Allow university matches even with location mismatch (campuses)
                if 'Universidad' not in name:
                    return False

    return True


def enrich_institution(institution: Dict, target: Dict) -> bool:
    """Add Wikidata identifier to institution.

    Mutates the institution dict in place: appends a Wikidata identifier,
    records an enrichment_history entry in provenance, and upgrades the
    data tier (Wikidata counts as TIER_3_CROWD_SOURCED).
    """
    q_number = target['q_number']

    # Create Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    # Ensure identifiers list exists
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Add Wikidata identifier
    institution['identifiers'].append(wikidata_id)

    # Update provenance
    if 'provenance' not in institution:
        institution['provenance'] = {}
    provenance = institution['provenance']

    # Record enrichment
    if 'enrichment_history' not in provenance:
        provenance['enrichment_history'] = []
    provenance['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Chilean Batch 5 - University + museum Wikidata verification',
        'enrichment_batch': 'batch_5',
        'q_number': q_number,
        'verification': target['verification']
    })

    # Update data tier if not already set
    if 'data_tier' not in provenance or provenance['data_tier'] == 'TIER_4_INFERRED':
        provenance['data_tier'] = 'TIER_3_CROWD_SOURCED'  # Wikidata is TIER_3

    return True


def main():
    """Main enrichment workflow."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 5 WIKIDATA ENRICHMENT")
    print("=" * 80)

    # Paths
    input_file = Path('data/instances/chile/chilean_institutions_batch4_enriched.yaml')
    output_file = Path('data/instances/chile/chilean_institutions_batch5_enriched.yaml')
    backup_file = Path(f'{input_file}.batch5_backup')

    # Load institutions
    institutions = load_institutions(input_file)

    # Count current coverage
    with_wikidata, total = count_wikidata_coverage(institutions)
    coverage_pct = (with_wikidata / total * 100) if total > 0 else 0
    print(f"📊 Current Wikidata coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")

    # Create backup
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Enrichment tracking
    enriched_count = 0
    skipped_count = 0

    print("🔍 Starting Batch 5 enrichment...")
    print()

    # Process each target
    for target in BATCH_5_TARGETS:
        matched = False
        for institution in institutions:
            # Skip if already has Wikidata
            if institution_has_wikidata(institution):
                continue

            # Check if matches target
            if matches_target(institution, target):
                print(f"✅ MATCH: {institution.get('name', 'Unknown')}")
                locations = institution.get('locations', [])
                if locations:
                    print(f"   Location: {locations[0].get('city', 'Unknown')}")
                print(f"   Q-number: {target['q_number']}")
                print(f"   Verification: {target['verification']}")

                # Enrich institution
                enrich_institution(institution, target)
                enriched_count += 1
                matched = True
                print()
                break

        if not matched:
            print(f"⏭️  SKIP: {target['name_pattern']} ({target['location']}) - No match found")
            print(f"   Q-number: {target['q_number']}")
            print(f"   Notes: {target.get('notes', 'Institution not in dataset or different naming')}")
            skipped_count += 1
            print()

    # Final coverage
    new_with_wikidata, _ = count_wikidata_coverage(institutions)
    new_coverage_pct = (new_with_wikidata / total * 100) if total > 0 else 0

    # Summary
    print("=" * 80)
    print("📊 Batch 5 Enrichment Summary")
    print("=" * 80)
    print(f"✅ Enriched: {enriched_count} institutions")
    print(f"⏭️  Skipped: {skipped_count} institutions (no match)")
    print(f"📈 New Wikidata coverage: {new_with_wikidata}/{total} ({new_coverage_pct:.1f}%)")
    print(f"   Improvement: +{enriched_count} institutions")
    print(f"   Progress to 20-institution goal: {new_with_wikidata}/20")

    # Save enriched dataset
    print(f"💾 Saving enriched dataset to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print()
    print("✅ Batch 5 enrichment complete!")
    print()
    print("📁 Files:")
    print(f"   Input: {input_file}")
    print(f"   Output: {output_file}")
    print(f"   Backup: {backup_file}")
    print()
    print("🎯 Next Steps:")
    if new_with_wikidata < 20:
        print(f"   - Need {20 - new_with_wikidata} more institutions to reach 22.2% coverage goal (20 institutions)")
        print("   - Consider Batch 6: 3-4 more regional museums")
        print("   - Candidates: Museo de Historia Natural y Cultural del Desierto de Atacama (Calama)")
        print("                 Museo del Limarí (Ovalle)")
        print("                 Museo Arqueológico de La Serena (verify Q86276952 data quality)")
    else:
        print("   - 🎉 GOAL REACHED: 20+ institutions with Wikidata!")
        print("   - Consider validating enriched dataset or starting Brazil continuation")


if __name__ == '__main__':
    main()