#!/usr/bin/env python3
"""
Chilean Batch 13: Apply Validated Wikidata Enrichment

Adds 1 confirmed Q-number from manual search results.

Validated match:
- Archivo General de Asuntos Indígenas (CONADI) → Q21002896

Expected outcome: 61/90 (67.8%) coverage
"""

import math
import yaml
from datetime import datetime, timezone
from pathlib import Path

# Coverage goal for the overall enrichment effort (fraction of institutions
# that should carry a Wikidata identifier).
TARGET_COVERAGE = 0.70


def load_yaml(filepath: str):
    """Load and parse a YAML file.

    Args:
        filepath: Path to the YAML file to read.

    Returns:
        The deserialized YAML content (typically a list of institution dicts).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(data, filepath: str) -> None:
    """Serialize ``data`` to a YAML file.

    Uses UTF-8, preserves key insertion order (``sort_keys=False``) and keeps
    non-ASCII characters readable (``allow_unicode=True``).

    Args:
        data: The data structure to serialize.
        filepath: Destination path for the YAML output.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, width=120)


def _has_wikidata(institution: dict) -> bool:
    """Return True if *institution* already carries a Wikidata identifier.

    Tolerates a missing or ``None`` ``identifiers`` field.
    """
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in institution.get('identifiers') or []
    )


def main():
    """Apply validated Wikidata enrichment for Batch 13.

    Reads the Batch 11 enriched dataset, adds the single manually validated
    Q-number (unless the institution already has one), writes the Batch 13
    output file, and prints a coverage summary against the 70% target.
    """
    # Load the current enriched dataset (from Batch 11)
    input_file = 'data/instances/chile/chilean_institutions_batch11_enriched.yaml'
    output_file = 'data/instances/chile/chilean_institutions_batch13_enriched.yaml'

    print("=" * 80)
    print("Chilean Batch 13: Wikidata Enrichment")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    institutions = load_yaml(input_file)

    # Validated match from manual search
    validated_match = {
        'name': 'Archivo General de Asuntos Indígenas (CONADI)',
        'q_number': 'Q21002896',
        'wikidata_url': 'https://www.wikidata.org/wiki/Q21002896',
        'rationale': 'Exact name match from Wikidata SPARQL query'
    }

    print("Applying validated match:")
    print(f"  {validated_match['name']} → {validated_match['q_number']}")
    print()

    # Find and enrich the institution (matched by exact name)
    enriched_count = 0
    for institution in institutions:
        if institution.get('name') != validated_match['name']:
            continue

        # Don't overwrite an identifier that is already present.
        existing = next(
            (ident for ident in institution.get('identifiers') or []
             if ident.get('identifier_scheme') == 'Wikidata'),
            None,
        )
        if existing is not None:
            print(f"  ⚠ Already has Wikidata: {existing.get('identifier_value')}")
        else:
            # ``identifiers`` may be absent or explicitly None in the YAML.
            if not institution.get('identifiers'):
                institution['identifiers'] = []
            institution['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': validated_match['q_number'],
                'identifier_url': validated_match['wikidata_url']
            })

            # Record how/when this enrichment was made.
            if not institution.get('provenance'):
                institution['provenance'] = {}
            institution['provenance']['wikidata_enrichment'] = {
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Manual Wikidata SPARQL search with exact name matching',
                'enrichment_batch': 'batch13',
                'match_confidence': 'high',
                'match_rationale': validated_match['rationale']
            }
            enriched_count += 1
            print(f"  ✓ Added Wikidata Q-number: {validated_match['q_number']}")
        break

    # Save enriched dataset
    save_yaml(institutions, output_file)

    print()
    print("=" * 80)
    print("Enrichment Summary")
    print("=" * 80)

    # Count institutions with Wikidata identifiers.
    total_institutions = len(institutions)
    institutions_with_wikidata = sum(
        1 for inst in institutions if _has_wikidata(inst)
    )

    # Guard against an empty dataset (would otherwise raise ZeroDivisionError).
    coverage_pct = (
        institutions_with_wikidata / total_institutions * 100
        if total_institutions
        else 0.0
    )

    print(f"Total institutions: {total_institutions}")
    print(f"Institutions with Wikidata: {institutions_with_wikidata}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    print(f"Output saved to: {output_file}")
    print()

    # Report progress against the coverage target. The target count is
    # derived from the dataset size rather than hard-coded, so the report
    # stays correct if institutions are added or removed.
    target_count = math.ceil(TARGET_COVERAGE * total_institutions)
    print("=" * 80)
    print("Target Progress")
    print("=" * 80)
    print(f"Target coverage: 70% ({target_count}/{total_institutions})")
    print(f"Current coverage: {coverage_pct:.1f}% ({institutions_with_wikidata}/{total_institutions})")

    if coverage_pct >= TARGET_COVERAGE * 100:
        print("✓ TARGET REACHED!")
    else:
        remaining = target_count - institutions_with_wikidata
        print(f"⚠ Need {remaining} more matches to reach 70% target")
    print()


if __name__ == '__main__':
    main()