#!/usr/bin/env python3
"""
Chilean GLAM Wikidata Enrichment - Batch 14 (Manual WebFetch Results)

Apply manually verified Wikidata identifier found via WebFetch tool:
- Museo Rudolph Philippi (Valdivia) → Q6940547

Target: 62/90 institutions (68.9% coverage)
Previous: 61/90 (67.8%)

Manual verification performed Nov 9, 2025 via WebFetch after API rate limits.
See: scripts/batch14_manual_results.json
"""

from datetime import datetime, timezone
from pathlib import Path

# Batch 14: manual WebFetch verification (1 institution).
# Each entry: (name substring, exact city, Wikidata Q-number, verification note).
BATCH14_MATCHES = [
    ("Museo Rudolph Philippi", "Valdivia", "Q6940547",
     "Museo de la Exploración Rudolph Amandus Philippi, museum in Valdivia, "
     "founded 1914, reopened 2006"),
]


def load_institutions(filepath: Path) -> list[dict]:
    """Load YAML file as list of dictionaries."""
    # Deferred import: PyYAML is only required for file I/O, so the module
    # can be imported (e.g. for testing enrich_batch14) without it installed.
    import yaml

    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_institutions(institutions: list[dict], filepath: Path) -> None:
    """Save institutions to YAML file."""
    import yaml  # deferred: see load_institutions

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120)


def _has_wikidata(inst: dict) -> bool:
    """Return True if the institution already carries a Wikidata identifier."""
    return any(
        id_item.get('identifier_scheme') == 'Wikidata'
        for id_item in inst.get('identifiers', [])
    )


def _count_wikidata(institutions: list[dict]) -> int:
    """Count institutions that have at least one Wikidata identifier."""
    return sum(1 for inst in institutions if _has_wikidata(inst))


def enrich_batch14(institutions: list[dict]) -> tuple[list[dict], int]:
    """
    Apply Batch 14 Wikidata enrichments (manual WebFetch verification).

    Mutates matching institutions in place: appends a Wikidata identifier
    and rewrites provenance metadata. Institutions that already have a
    Wikidata identifier are skipped.

    Args:
        institutions: Institution records loaded from YAML.

    Returns:
        Tuple of (enriched_institutions, count_enriched).
    """
    enrichment_date = datetime.now(timezone.utc).isoformat()
    enriched_count = 0

    for inst in institutions:
        name = inst.get('name', '')
        # First location's city, or '' when the record has no locations.
        locations = inst.get('locations')
        city = locations[0].get('city', '') if locations else ''

        if _has_wikidata(inst):
            continue  # already enriched in a previous batch

        # Substring match on name (case-insensitive), exact match on city.
        for match_name, match_city, q_number, verification in BATCH14_MATCHES:
            if match_name.lower() in name.lower() and match_city.lower() == city.lower():
                inst.setdefault('identifiers', []).append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': q_number,
                    'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
                })

                provenance = inst.setdefault('provenance', {})
                provenance['data_tier'] = 'TIER_3_CROWD_SOURCED'
                provenance['last_updated'] = enrichment_date
                provenance['enrichment_batch'] = 14
                provenance['wikidata_match_confidence'] = 'HIGH'
                provenance['wikidata_match_reason'] = 'Manual verification via WebFetch after API rate limits'
                provenance['wikidata_name'] = verification

                enriched_count += 1
                print(f"✓ Enriched: {name} ({city}) → {q_number}")
                print(f"  Wikidata: {verification}")
                break

    return institutions, enriched_count


def main():
    """Main enrichment workflow: load, report coverage, enrich, save."""
    input_file = Path("data/instances/chile/chilean_institutions_batch13_enriched.yaml")
    output_file = Path("data/instances/chile/chilean_institutions_batch14_enriched.yaml")

    print("=" * 80)
    print("Chilean GLAM Wikidata Enrichment - Batch 14")
    print("Manual WebFetch Verification Results")
    print("=" * 80)
    print()

    print(f"Loading: {input_file}")
    institutions = load_institutions(input_file)
    total = len(institutions)
    print(f"  Total institutions: {total}")

    def pct(n: int) -> float:
        # Guard against an empty input file (avoids ZeroDivisionError).
        return n / total * 100 if total else 0.0

    current_with_wikidata = _count_wikidata(institutions)
    print(f"  Current Wikidata coverage: {current_with_wikidata}/{total} ({pct(current_with_wikidata):.1f}%)")
    print()

    print("Applying Batch 14 enrichments...")
    institutions, enriched_count = enrich_batch14(institutions)
    print()

    new_with_wikidata = _count_wikidata(institutions)
    delta = new_with_wikidata - current_with_wikidata
    print("=" * 80)
    print("Enrichment Summary")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"New Wikidata coverage: {new_with_wikidata}/{total} ({pct(new_with_wikidata):.1f}%)")
    print(f"Coverage change: +{delta} ({pct(delta):.1f}%)")
    print()

    print(f"Saving: {output_file}")
    save_institutions(institutions, output_file)
    print("✓ Done!")
    print()

    # Progress towards the 70% coverage milestone.
    target = 63  # 70% of 90 institutions
    remaining = target - new_with_wikidata
    print(f"Progress to 70% target: {new_with_wikidata}/{target}")
    if remaining > 0:
        print(f"  Still need: {remaining} more institution(s)")
    else:
        print("  🎉 Target reached!")


if __name__ == '__main__':
    main()