#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 4 Wikidata Enrichment

Target: National institutions and major regional museums
Goal: 10/90 → 14/90 (11.1% → 15.6% coverage)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

# Batch 4 targets with verified Wikidata Q-numbers.
# Each entry describes how to locate one institution in the dataset
# (name substring + type + city) and the identifier to attach.
BATCH_4_TARGETS = [
    {
        "q_number": "Q2901485",
        "name_pattern": "Biblioteca Nacional",
        "location": "Santiago",  # Note: Dataset may list as Iquique (digital platform)
        "institution_type": "LIBRARY",
        "verification": "National Library of Chile, founded 1813, Santiago",
        "notes": "May appear as 'Biblioteca Nacional Digital' in dataset"
    },
    {
        "q_number": "Q17166403",
        "name_pattern": "Museo Marítimo Nacional",
        "location": "Valparaíso",
        "institution_type": "MUSEUM",
        "verification": "National Maritime Museum, Valparaíso, founded on Artillery Hill"
    },
    {
        "q_number": "Q6970429",
        "name_pattern": "Archivo Nacional",
        "location": "Santiago",
        "institution_type": "ARCHIVE",
        "verification": "National Archives of Chile, founded 1927, Santiago"
    },
    # Note: Servicio Nacional del Patrimonio Cultural (Q-number not found in Wikidata)
]


def load_institutions(file_path: Path) -> List[Dict]:
    """Load institutions from YAML file.

    Returns an empty list (rather than crashing) when the file is empty,
    since ``yaml.safe_load`` yields ``None`` for empty documents.
    """
    print(f"📖 Loading institutions from: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f) or []
    print(f"   Loaded {len(institutions)} institutions")
    return institutions


def institution_has_wikidata(institution: Dict) -> bool:
    """Check if institution already has a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in institution.get('identifiers', [])
    )


def count_wikidata_coverage(institutions: List[Dict]) -> Tuple[int, int]:
    """Count institutions with Wikidata identifiers.

    Returns:
        (number of institutions with a Wikidata identifier, total count).
    """
    # Reuse the single predicate so the two checks can never diverge.
    with_wikidata = sum(
        1 for inst in institutions if institution_has_wikidata(inst)
    )
    return with_wikidata, len(institutions)


def matches_target(institution: Dict, target: Dict) -> bool:
    """Check if institution matches target criteria.

    A match requires an exact institution-type match, the target name
    pattern as a substring of the institution name, and — when location
    data is present — a city compatible with the target location.
    """
    name = institution.get('name', '')
    inst_type = institution.get('institution_type', '')
    locations = institution.get('locations', [])

    # Institution type must match
    if inst_type != target['institution_type']:
        return False

    # Name must contain the pattern
    if target['name_pattern'] not in name:
        return False

    # Location match (optional - some records may have incomplete location data)
    # NOTE: only the first location entry is consulted; records with the
    # target city in a secondary location will not match.
    if locations:
        city = locations[0].get('city', '')
        # Flexible location matching (target location or Unknown)
        if city and city != 'Unknown' and target['location'] not in city:
            return False

    return True


def enrich_institution(institution: Dict, target: Dict) -> bool:
    """Add a Wikidata identifier and provenance record to *institution*.

    Mutates the institution dict in place and always returns True.
    """
    q_number = target['q_number']

    # Create Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    # Ensure identifiers list exists, then attach the new identifier
    institution.setdefault('identifiers', []).append(wikidata_id)

    # Record the enrichment step in the provenance history
    provenance = institution.setdefault('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Chilean Batch 4 - Manual Wikidata verification',
        'enrichment_batch': 'batch_4',
        'q_number': q_number,
        'verification': target['verification']
    })

    # Update data tier if not already set (or only inferred so far)
    if 'data_tier' not in provenance or provenance['data_tier'] == 'TIER_4_INFERRED':
        provenance['data_tier'] = 'TIER_3_CROWD_SOURCED'  # Wikidata is TIER_3

    return True


def _apply_target(institutions: List[Dict], target: Dict) -> bool:
    """Enrich the first un-enriched institution matching *target*.

    Returns True when a match was found and enriched, False otherwise.
    """
    for institution in institutions:
        # Skip records that already carry a Wikidata identifier
        if institution_has_wikidata(institution):
            continue
        if not matches_target(institution, target):
            continue

        print(f"✅ MATCH: {institution.get('name', 'Unknown')}")
        locations = institution.get('locations', [])
        if locations:
            print(f"   Location: {locations[0].get('city', 'Unknown')}")
        print(f"   Q-number: {target['q_number']}")
        print(f"   Verification: {target['verification']}")

        enrich_institution(institution, target)
        print()
        return True
    return False


def main():
    """Main enrichment workflow: load, back up, enrich, report, save."""
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 4 WIKIDATA ENRICHMENT")
    print("=" * 80)

    # Paths
    input_file = Path('data/instances/chile/chilean_institutions_batch3_enriched.yaml')
    output_file = Path('data/instances/chile/chilean_institutions_batch4_enriched.yaml')
    backup_file = Path(f'{input_file}.batch4_backup')

    # Load institutions
    institutions = load_institutions(input_file)

    # Count current coverage
    with_wikidata, total = count_wikidata_coverage(institutions)
    coverage_pct = (with_wikidata / total * 100) if total > 0 else 0
    print(f"📊 Current Wikidata coverage: {with_wikidata}/{total} ({coverage_pct:.1f}%)")

    # Create backup before mutating anything
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    # Enrichment tracking
    enriched_count = 0
    skipped_count = 0

    print("🔍 Starting Batch 4 enrichment...")
    print()

    # Process each target
    for target in BATCH_4_TARGETS:
        if _apply_target(institutions, target):
            enriched_count += 1
        else:
            print(f"⏭️ SKIP: {target['name_pattern']} ({target['location']}) - No match found")
            print(f"   Q-number: {target['q_number']}")
            print(f"   Notes: {target.get('notes', 'Institution not in dataset or different naming')}")
            skipped_count += 1
            print()

    # Final coverage
    new_with_wikidata, _ = count_wikidata_coverage(institutions)
    new_coverage_pct = (new_with_wikidata / total * 100) if total > 0 else 0

    # Summary
    print("=" * 80)
    print("📊 Batch 4 Enrichment Summary")
    print("=" * 80)
    print(f"✅ Enriched: {enriched_count} institutions")
    print(f"⏭️ Skipped: {skipped_count} institutions (no match)")
    print(f"📈 New Wikidata coverage: {new_with_wikidata}/{total} ({new_coverage_pct:.1f}%)")
    print(f"   Improvement: +{enriched_count} institutions")

    # Save enriched dataset
    print(f"💾 Saving enriched dataset to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print()
    print("✅ Batch 4 enrichment complete!")
    print()
    print("📁 Files:")
    print(f"   Input: {input_file}")
    print(f"   Output: {output_file}")
    print(f"   Backup: {backup_file}")
    print()
    print("🎯 Next Steps:")
    if new_with_wikidata < 20:
        print(f"   - Need {20 - new_with_wikidata} more institutions to reach 22.2% coverage goal (20 institutions)")
        print("   - Consider Batch 5: More regional museums or verify Servicio Nacional del Patrimonio Cultural")
    else:
        print("   - 🎉 GOAL REACHED: 20+ institutions with Wikidata!")
        print("   - Consider moving to Brazil Batch 7 or updating documentation")


if __name__ == '__main__':
    main()