#!/usr/bin/env python3
"""Verify and correct institution IDs for Batch 13.

Loads the main dataset, looks up every ID listed in ``batch13_targets``,
prints what was found for each one and finishes with a found/missing summary.
"""

# Main dataset that the Batch 13 IDs are checked against.
DATASET_PATH = "data/instances/all/globalglam-20251111.yaml"

# IDs from batch13_enriched.yaml that need verification
batch13_targets = {
    "3008281717687280329": {"name": "UNIR", "qid": "Q7894377", "status": "UNKNOWN"},
    "Q108397863": {"name": "Secretaria Cultura Tocantins", "qid": "Q108397863", "status": "UNKNOWN"},
    "2519599505258789521": {"name": "Instituto Histórico Alagoas", "qid": "Q10302531", "status": "UNKNOWN"},
}


def load_institutions(path: str = DATASET_PATH) -> list:
    """Return the institution records stored in the YAML file at *path*.

    Every YAML document in the file is read. When the file holds exactly one
    document and that document is a list, the list itself is returned, so the
    caller always iterates over individual records.
    """
    # Imported here so the verification helpers stay importable (and testable)
    # on machines that do not have PyYAML installed.
    import yaml

    with open(path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    return institutions


def _index_by_id(institutions: list) -> dict:
    """Map ``str(record['id'])`` to the first dict record carrying that ID."""
    index = {}
    for inst in institutions:
        if isinstance(inst, dict):
            # setdefault keeps the first occurrence, matching a linear scan
            # that stops at the first hit.
            index.setdefault(str(inst.get('id')), inst)
    return index


def _has_wikidata(inst: dict) -> bool:
    """Return True if *inst* lists an identifier whose scheme is 'Wikidata'."""
    identifiers = inst.get('identifiers')
    # The key may be present with a null (or otherwise non-list) value.
    if not isinstance(identifiers, list):
        return False
    return any(
        i.get('identifier_scheme') == 'Wikidata'
        for i in identifiers
        if isinstance(i, dict)
    )


def _print_first_location(inst: dict) -> None:
    """Print city and region of the first location, when one is available."""
    locations = inst.get('locations')
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        first = locations[0]
        print(f" Location: {first.get('city', 'N/A')}, {first.get('region', 'N/A')}")


def verify_targets(institutions: list, targets: dict) -> int:
    """Check each target ID against *institutions* and report the outcome.

    *targets* maps an ID string to an info dict holding at least ``name`` and
    ``qid``. Each info dict is updated in place: ``status`` becomes ``'FOUND'``
    or ``'NOT_FOUND'``, and found entries also receive ``actual_name`` and
    ``has_wikidata``. Records that are not dicts are ignored.

    Returns the number of targets that were found.
    """
    by_id = _index_by_id(institutions)
    found_count = 0

    for inst_id, info in targets.items():
        inst = by_id.get(inst_id)
        if inst is None:
            print(f"\n✗ ID NOT FOUND: {inst_id}")
            print(f" Expected name: {info['name']}")
            info['status'] = 'NOT_FOUND'
            continue

        print(f"\n✓ ID VERIFIED: {inst_id}")
        print(f" Name in dataset: {inst.get('name')}")
        print(f" Expected name: {info['name']}")
        print(f" Q-number to add: {info['qid']}")

        # Check if already has Wikidata
        has_wikidata = _has_wikidata(inst)
        print(f" Already has Wikidata: {has_wikidata}")

        _print_first_location(inst)

        info['status'] = 'FOUND'
        info['actual_name'] = inst.get('name')
        info['has_wikidata'] = has_wikidata
        found_count += 1

    return found_count


def main() -> None:
    """Load the dataset, verify the Batch 13 IDs and print a summary."""
    institutions = load_institutions()

    print("="*80)
    print("VERIFYING BATCH 13 INSTITUTION IDS")
    print("="*80)

    found_count = verify_targets(institutions, batch13_targets)

    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Found: {found_count}/{len(batch13_targets)}")
    print(f"Missing: {len(batch13_targets) - found_count}")


if __name__ == "__main__":
    main()