glam/verify_batch13_ids.py
2025-11-19 23:25:22 +01:00

64 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""Verify and correct institution IDs for Batch 13."""
import yaml
# Read every YAML document from the main dataset file into a list.
with open("data/instances/all/globalglam-20251111.yaml", 'r', encoding='utf-8') as dataset_file:
    institutions = list(yaml.safe_load_all(dataset_file))

# A file holding exactly one document that is itself a list is unwrapped,
# so `institutions` ends up being that inner list of records.
if len(institutions) == 1 and isinstance(institutions[0], list):
    (institutions,) = institutions
# IDs from batch13_enriched.yaml that need verification.
# Maps dataset ID -> expected name, Wikidata Q-number to add, and a lookup
# status that starts as "UNKNOWN" and is overwritten by the verification loop.
# NOTE(review): the second entry is keyed by a Q-number rather than a numeric
# ID like the other two -- confirm that is the intended dataset ID.
batch13_targets = {
    inst_id: {"name": name, "qid": qid, "status": "UNKNOWN"}
    for inst_id, name, qid in (
        ("3008281717687280329", "UNIR", "Q7894377"),
        ("Q108397863", "Secretaria Cultura Tocantins", "Q108397863"),
        ("2519599505258789521", "Instituto Histórico Alagoas", "Q10302531"),
    )
}
print("=" * 80)
print("VERIFYING BATCH 13 INSTITUTION IDS")
print("=" * 80)

# For each target ID, report whether a record with that ID exists in the
# dataset and annotate the target entry in place:
#   status       -> 'FOUND' or 'NOT_FOUND'
#   actual_name  -> the record's name (only when found)
#   has_wikidata -> whether the record already carries a Wikidata identifier
for inst_id, info in batch13_targets.items():
    # First dict record whose ID matches; record IDs are passed through
    # str() so they compare against the string keys of batch13_targets.
    match = next(
        (
            inst
            for inst in institutions
            if isinstance(inst, dict) and str(inst.get('id')) == inst_id
        ),
        None,
    )

    if match is None:
        print(f"\n✗ ID NOT FOUND: {inst_id}")
        print(f" Expected name: {info['name']}")
        info['status'] = 'NOT_FOUND'
        continue

    print(f"\n✓ ID VERIFIED: {inst_id}")
    print(f" Name in dataset: {match.get('name')}")
    print(f" Expected name: {info['name']}")
    print(f" Q-number to add: {info['qid']}")

    # Check if already has Wikidata.
    # dict.get's default only applies when the key is absent; a YAML key
    # present with a null value yields None, which must not be iterated.
    identifiers = match.get('identifiers')
    if not isinstance(identifiers, list):
        identifiers = []
    has_wikidata = any(
        isinstance(entry, dict) and entry.get('identifier_scheme') == 'Wikidata'
        for entry in identifiers
    )
    print(f" Already has Wikidata: {has_wikidata}")

    # Only the first location is reported, and only when it is a mapping
    # inside a non-empty list (guards against null or non-list values).
    locations = match.get('locations')
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        print(f" Location: {locations[0].get('city', 'N/A')}, {locations[0].get('region', 'N/A')}")

    info['status'] = 'FOUND'
    info['actual_name'] = match.get('name')
    info['has_wikidata'] = has_wikidata
# Final tally: how many target IDs ended up with status 'FOUND'.
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

total_targets = len(batch13_targets)
found_count = len(
    [entry for entry in batch13_targets.values() if entry['status'] == 'FOUND']
)
print(f"Found: {found_count}/{total_targets}")
print(f"Missing: {total_targets - found_count}")