#!/usr/bin/env python3
"""
Merge Batch 13 Wikidata Enrichments - Corrected Version

Adds verified Q-numbers to 3 Brazilian institutions.
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Verified enrichments (IDs corrected), keyed by the institution's `id`
# rendered as a string.
ENRICHMENTS = {
    "3008281717687280329": {
        "name": "UNIR",
        "qid": "Q7894377",
        "label": "Federal University of Rondônia",
        "description": "Brazilian public university",
    },
    "709508309148680086": {
        "name": "Secult Tocantins",
        "qid": "Q108397863",
        "label": "Secretary of Culture of the State of Tocantins",
        "description": "state secretariat responsible for cultural related affairs in the state of Tocantins, Brazil",
    },
    "2519599505258789521": {
        "name": "Instituto Histórico e Geográfico de Alagoas",
        "qid": "Q10302531",
        "label": "Instituto Histórico e Geográfico de Alagoas",
        "description": "research institute and museum in Maceió, Brazil",
    },
}

# Brazilian coverage figures recorded before this batch; used only for the
# "improvement" lines of the final report.
PREVIOUS_WIKIDATA_COUNT = 69
PREVIOUS_TOTAL_COUNT = 121
PREVIOUS_COVERAGE_PCT = 57.0

SEPARATOR = "=" * 80


def _print_banner(title: str, leading_blank: bool = True) -> None:
    """Print *title* framed by two separator rules."""
    print(("\n" if leading_blank else "") + SEPARATOR)
    print(title)
    print(SEPARATOR)


def _load_institutions(dataset_path: Path) -> list:
    """Return the records stored in the YAML file at *dataset_path*.

    A file holding one document that is itself a list is unwrapped to that
    list; otherwise the list of YAML documents is returned as-is.
    """
    with open(dataset_path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    return institutions


def _has_wikidata(inst: dict) -> bool:
    """Return True if *inst* already carries a Wikidata identifier."""
    # `or []` covers a key that is present but null, which `.get(key, [])`
    # would hand back as None and make the iteration fail.
    identifiers = inst.get('identifiers') or []
    return any(
        i.get('identifier_scheme') == 'Wikidata'
        for i in identifiers
        if isinstance(i, dict)
    )


def _apply_enrichment(inst: dict, enrichment: dict) -> None:
    """Append the Wikidata identifier and a provenance entry to *inst* in place."""
    qid = enrichment['qid']

    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f"https://www.wikidata.org/wiki/{qid}",
    })

    # Null `provenance` / `enrichment_history` values are treated like
    # missing keys instead of raising on membership test or append.
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    if provenance.get('enrichment_history') is None:
        provenance['enrichment_history'] = []
    provenance['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata authenticated entity search (Batch 13)',
        'enrichment_source': 'batch13_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
    })


def _is_brazilian(inst: dict) -> bool:
    """Return True if any location of *inst* has country code 'BR'."""
    return any(
        loc.get('country') == 'BR'
        for loc in (inst.get('locations') or [])
        if isinstance(loc, dict)
    )


def _report_coverage(institutions: list) -> None:
    """Print Wikidata coverage statistics for the Brazilian institutions."""
    brazil_institutions = [
        inst for inst in institutions
        if isinstance(inst, dict) and _is_brazilian(inst)
    ]
    brazil_with_wikidata = [
        inst for inst in brazil_institutions if _has_wikidata(inst)
    ]

    _print_banner("WIKIDATA COVERAGE (BRAZIL)")
    print(f"Total Brazilian institutions: {len(brazil_institutions)}")
    print(f"With Wikidata Q-numbers: {len(brazil_with_wikidata)}")

    if not brazil_institutions:
        # Nothing to divide by; a percentage would be meaningless here.
        print("Coverage: n/a (no Brazilian institutions found)")
        return

    coverage = len(brazil_with_wikidata) / len(brazil_institutions) * 100
    gained = len(brazil_with_wikidata) - PREVIOUS_WIKIDATA_COUNT
    print(f"Coverage: {coverage:.1f}%")
    print(
        f"Previous coverage: {PREVIOUS_COVERAGE_PCT:.1f}% "
        f"({PREVIOUS_WIKIDATA_COUNT}/{PREVIOUS_TOTAL_COUNT})"
    )
    # The `+` format flag yields "+3" / "-1" rather than a literal "+-1".
    print(
        f"Improvement: {gained:+d} institutions "
        f"({coverage - PREVIOUS_COVERAGE_PCT:+.1f}%)"
    )


def main() -> None:
    """Merge the Batch 13 Wikidata enrichments into the dataset file.

    Loads the dataset, adds a Wikidata identifier plus a provenance record to
    every institution listed in ENRICHMENTS that does not have one yet, and,
    if anything changed, copies the dataset to a backup file before
    overwriting it and printing Brazilian coverage statistics.
    """
    dataset_path = Path("data/instances/all/globalglam-20251111.yaml")
    backup_path = Path("data/instances/all/globalglam-20251111.yaml.bak.batch13")

    _print_banner("BRAZIL BATCH 13 WIKIDATA ENRICHMENT MERGE", leading_blank=False)

    # Load dataset
    print(f"\nLoading dataset: {dataset_path}")
    institutions = _load_institutions(dataset_path)
    print(f"Loaded {len(institutions)} institutions")

    # Track changes
    enriched_count = 0
    skipped_count = 0
    errors = []
    matched_ids = set()

    _print_banner("PROCESSING ENRICHMENTS")

    for inst in institutions:
        if not isinstance(inst, dict):
            continue

        inst_id = str(inst.get('id', ''))
        enrichment = ENRICHMENTS.get(inst_id)
        if enrichment is None:
            continue
        matched_ids.add(inst_id)

        if _has_wikidata(inst):
            print(f"\n⚠ SKIP: {inst.get('name')} (ID: {inst_id})")
            print(" Already has Wikidata Q-number")
            skipped_count += 1
            continue

        _apply_enrichment(inst, enrichment)

        print(f"\n✓ ENRICHED: {inst.get('name')}")
        print(f" ID: {inst_id}")
        print(f" Q-number: {enrichment['qid']}")
        print(f" Label: {enrichment['label']}")
        enriched_count += 1

    # An enrichment whose ID matched nothing would otherwise go unnoticed.
    for inst_id, enrichment in ENRICHMENTS.items():
        if inst_id not in matched_ids:
            errors.append(
                f"Institution ID {inst_id} ({enrichment['name']}) not found in dataset"
            )

    # Summary
    _print_banner("MERGE SUMMARY")
    print(f"Institutions enriched: {enriched_count}")
    print(f"Institutions skipped: {skipped_count}")
    print(f"Errors: {len(errors)}")

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")

    if enriched_count == 0:
        print("\nNo changes made - no merge performed.")
        return

    # Create backup before the dataset is overwritten.
    print(f"\nCreating backup: {backup_path}")
    shutil.copy2(dataset_path, backup_path)

    # Write updated dataset
    # NOTE(review): the records are always written as ONE YAML document
    # holding a list, even if the input was a multi-document stream; confirm
    # that this normalisation is intended.
    print(f"Writing updated dataset: {dataset_path}")
    with open(dataset_path, 'w', encoding='utf-8') as f:
        yaml.dump_all(
            [institutions],
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=1000,
        )

    print("\n✓ Merge completed successfully!")

    _report_coverage(institutions)


if __name__ == "__main__":
    main()