#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (University Departments)

REVISED STRATEGY based on dataset structure:
- Dataset contains university DEPARTMENTS/ARCHIVES, not universities themselves
- Example: "Universidad de Chile's Archivo Central" (archive department)
- Strategy: Enrich department records with PARENT UNIVERSITY's Wikidata Q-number
- This provides valuable linkage to authoritative university entities

BATCH 2 TARGET INSTITUTIONS (University departments/archives):
1. Universidad de Chile's Archivo Central Andrés Bello -> Q219576 (Universidad de Chile)
2. Universidad de Concepción's SIBUDEC -> Q1163431 (Universidad de Concepción)
3. Universidad Austral -> Q1163558 (Universidad Austral de Chile)
4. Universidad Católica -> Q1562315 (Pontificia Universidad Católica de Chile)

IMPROVEMENTS FROM BATCH 1:
1. Direct Q-number mapping (no SPARQL needed for major universities)
2. Fuzzy matching against department name patterns
3. Parent organization linkage (department -> university)
4. Fast execution (no slow Wikidata queries)
"""

import re
from pathlib import Path
from typing import Any, Dict, List

import yaml
from rapidfuzz import fuzz

# Direct Q-number mapping for major Chilean universities.
# These are stable, well-known Wikidata items, so no SPARQL lookup is needed.
UNIVERSITY_WIKIDATA_MAP = {
    'Universidad de Chile': 'Q219576',
    'Universidad de Concepción': 'Q1163431',
    'Universidad Austral de Chile': 'Q1163558',
    'Pontificia Universidad Católica de Chile': 'Q1562315',
    'Universidad de Santiago de Chile': 'Q2006105',
    'Universidad Católica del Norte': 'Q3244385',  # Already enriched in Batch 1
    'Universidad de Tarapacá': 'Q3138071'  # Already enriched in Batch 1
}

# Batch 2 targets: Department records in dataset.
# Each entry describes how to find the department record (name_pattern /
# name_variants / inst_type) and what parent-university Q-number to attach.
BATCH_2_TARGETS = [
    {
        'name_pattern': "Universidad de Chile's Archivo",
        'name_variants': [
            "Universidad de Chile's Archivo Central",
            "Archivo Central Andrés Bello",
            "Universidad de Chile Archivo"
        ],
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'inst_type': 'ARCHIVE',
        'notes': 'Central archive of Universidad de Chile (founded 1842)'
    },
    {
        'name_pattern': "Universidad de Concepción's SIBUDEC",
        'name_variants': [
            "SIBUDEC",
            "Sistema de Bibliotecas UdeC",
            "Universidad de Concepción SIBUDEC"
        ],
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Library system of Universidad de Concepción (founded 1919)'
    },
    {
        'name_pattern': 'Universidad Austral',
        'name_variants': [
            'Universidad Austral',
            'Universidad Austral de Chile',
            'UACh'
        ],
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Universidad Austral de Chile in Valdivia (founded 1954)'
    },
    {
        'name_pattern': 'Universidad Católica',
        'name_variants': [
            'Universidad Católica',
            'Pontificia Universidad Católica',
            'UC Chile',
            'PUC'
        ],
        'parent_university': 'Pontificia Universidad Católica de Chile',
        'wikidata_q': 'Q1562315',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Pontificia Universidad Católica de Chile in Santiago (founded 1888)'
    }
]


def normalize_name(name: str) -> str:
    """Normalize institution name for better matching.

    Strips English possessive markers ("'s"), trims surrounding
    whitespace, and collapses internal whitespace runs to single spaces.
    """
    # Remove possessive markers
    name = re.sub(r"'s\b", "", name)
    # Remove leading/trailing whitespace
    name = name.strip()
    # Normalize whitespace
    name = re.sub(r'\s+', ' ', name)
    return name


def fuzzy_match_name(
    inst_name: str,
    name_variants: List[str],
    threshold: float = 80.0,
) -> tuple[bool, float]:
    """Check if institution name matches any variant with fuzzy matching.

    Args:
        inst_name: Institution name as found in the dataset.
        name_variants: Candidate names/aliases to match against.
        threshold: Minimum best score (0-100) to count as a match.
            Defaults to 80, matching the original behavior.

    Returns:
        (matched, best_score) where best_score is the highest score seen
        across all variants and all rapidfuzz strategies.
    """
    inst_name_norm = normalize_name(inst_name).lower()
    best_score = 0
    for variant in name_variants:
        variant_norm = normalize_name(variant).lower()
        # Try multiple strategies; partial/token ratios tolerate extra
        # words and reordering (common in department vs. university names).
        scores = [
            fuzz.ratio(inst_name_norm, variant_norm),
            fuzz.partial_ratio(inst_name_norm, variant_norm),
            fuzz.token_set_ratio(inst_name_norm, variant_norm),
            fuzz.token_sort_ratio(inst_name_norm, variant_norm)
        ]
        best_score = max(best_score, max(scores))
    return best_score >= threshold, best_score


def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Check if institution already has a Wikidata identifier attached."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    confidence: float,
    notes: str
) -> Dict[str, Any]:
    """Add Wikidata identifier to institution with provenance tracking.

    Mutates ``inst`` in place (and also returns it): appends a Wikidata
    identifier record, notes the parent university in the description,
    and appends an enrichment note to provenance.extraction_method.

    Args:
        inst: Institution record (dict loaded from YAML).
        q_number: Wikidata Q-number of the parent university.
        parent_university: Human-readable parent university name.
        confidence: Match confidence in [0, 1] for the provenance note.
        notes: Fallback description used when the record has none.
    """
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Update description to note parent university
    if 'description' in inst and inst['description']:
        if parent_university not in inst['description']:
            inst['description'] = f"{inst['description']} Part of {parent_university}."
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"

    # Update provenance: record how (and how confidently) this link was made
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 university depts, "
            f"parent: {parent_university}, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst


def _pct(part: int, whole: int) -> float:
    """Return part/whole as a percentage; 0.0 when whole is 0 (avoids ZeroDivisionError)."""
    return part / whole * 100 if whole else 0.0


def main():
    """Run Batch 2 enrichment: load dataset, back it up, link department
    records to their parent universities' Wikidata Q-numbers, and save."""
    data_file = (
        Path(__file__).parent.parent / 'data' / 'instances' / 'chile'
        / 'chilean_institutions_batch1_enriched.yaml'
    )
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
    print("University Departments/Archives Focus")
    print("Session: November 9, 2025")
    print("Strategy: Enrich dept records with parent university Q-numbers")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"   Total institutions: {len(institutions)}")

    # Guard: an empty dataset would crash every percentage computation below
    if not institutions:
        print("⚠️  Input file contains no institutions - nothing to enrich")
        return

    # Check existing Wikidata coverage
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f"   Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} "
          f"({_pct(with_wikidata_before, len(institutions)):.1f}%)")
    print()

    # Create backup before mutating anything
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print()

    print("Batch 2 Enrichment Process:")
    print("-" * 80)

    # Process each target
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for i, target in enumerate(BATCH_2_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
        print(f"   Parent: {target['parent_university']}")
        print(f"   Wikidata: {target['wikidata_q']}")
        print(f"   Name variants: {', '.join(target['name_variants'][:3])}")

        # Find matching institution in dataset (first fuzzy match with the
        # expected institution_type wins)
        matched = None
        match_score = 0
        for inst in institutions:
            is_match, score = fuzzy_match_name(
                inst.get('name', ''), target['name_variants']
            )
            # Also check institution type to avoid cross-type false positives
            if is_match and inst.get('institution_type') == target['inst_type']:
                matched = inst
                match_score = score
                break

        if not matched:
            print("   ❌ NOT FOUND in dataset")
            print("      (No match for any variant above 80% similarity)")
            not_found_count += 1
            continue

        print(f"   ✓ Found: {matched.get('name')}")
        print(f"     Match score: {match_score:.1f}%")

        # Check if already has Wikidata
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value']
                 for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"   ⏭️  Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier (direct mapping, no query needed)
        print(f"   ✅ Adding Wikidata identifier: {target['wikidata_q']}")
        print(f"      Linking to parent: {target['parent_university']}")
        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            match_score / 100,  # convert 0-100 score to 0-1 confidence
            target['notes']
        )
        enriched_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️  Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f"   Total: {len(institutions)}")
    print(f"   Before Batch 2: {with_wikidata_before} "
          f"({_pct(with_wikidata_before, len(institutions)):.1f}%)")
    print(f"   After Batch 2: {with_wikidata_after} "
          f"({_pct(with_wikidata_after, len(institutions)):.1f}%)")
    print(f"   Improvement: +{with_wikidata_after - with_wikidata_before} institutions "
          f"({_pct(with_wikidata_after - with_wikidata_before, len(institutions)):.1f}%)")

    # Save if any enrichments
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                      default_flow_style=False)
        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Create Batch 3 targeting major museums (5 institutions):")
        print("   - Museo Nacional de Historia Natural (Santiago)")
        print("   - Museo de Arte Precolombino (Santiago)")
        print("   - Museo Histórico Nacional (Santiago)")
        print("   - Museo de Bellas Artes (Santiago)")
        print("   - Museo Regional de Ancud (Chiloé)")
        print("2. Continue until 20+ institutions enriched (22% coverage)")
    else:
        print()
        print("⚠️  No enrichments - all targets already enriched or not found")


if __name__ == '__main__':
    main()