64 lines
2.4 KiB
Python
64 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Verify and correct institution IDs for Batch 13."""
|
|
|
|
import yaml
|
|
|
|
# Main dataset that the Batch 13 IDs are checked against.
DATASET_PATH = "data/instances/all/globalglam-20251111.yaml"

# IDs from batch13_enriched.yaml that need verification.
# Each entry starts as UNKNOWN; verify_targets() overwrites 'status' and, for
# matches, adds 'actual_name' and 'has_wikidata'.
batch13_targets = {
    "3008281717687280329": {"name": "UNIR", "qid": "Q7894377", "status": "UNKNOWN"},
    "Q108397863": {"name": "Secretaria Cultura Tocantins", "qid": "Q108397863", "status": "UNKNOWN"},
    "2519599505258789521": {"name": "Instituto Histórico Alagoas", "qid": "Q10302531", "status": "UNKNOWN"},
}


def load_institutions(path=DATASET_PATH):
    """Load every YAML document from *path* and return them as a list.

    When the file holds exactly one document and that document is itself a
    list, the inner list is returned, so callers always iterate over the
    individual records.
    """
    with open(path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    return institutions


def _index_by_id(institutions):
    """Map str(record['id']) to the first dict record carrying that id."""
    index = {}
    for inst in institutions:
        if isinstance(inst, dict):
            # setdefault keeps the first occurrence, matching a linear
            # search that stops at the first hit.
            index.setdefault(str(inst.get('id')), inst)
    return index


def _has_wikidata(inst):
    """Return True if any identifier entry has scheme 'Wikidata'."""
    identifiers = inst.get('identifiers')
    # A key that is present but empty in YAML loads as None; treat that (and
    # any other non-sequence value) as "no identifiers" instead of crashing.
    if not isinstance(identifiers, (list, tuple)):
        return False
    return any(
        isinstance(entry, dict) and entry.get('identifier_scheme') == 'Wikidata'
        for entry in identifiers
    )


def _first_location(inst):
    """Return the first location dict of *inst*, or None if there is none."""
    locations = inst.get('locations')
    if isinstance(locations, (list, tuple)) and locations and isinstance(locations[0], dict):
        return locations[0]
    return None


def verify_targets(institutions, targets):
    """Look up each target ID in *institutions* and report the outcome.

    Args:
        institutions: iterable of dataset records; non-dict items are ignored.
        targets: dict mapping an ID string to an info dict with at least the
            keys 'name', 'qid' and 'status'. It is updated in place:
            'status' becomes 'FOUND' or 'NOT_FOUND', and matches also get
            'actual_name' and 'has_wikidata'.

    Returns:
        The number of targets whose status is 'FOUND'.
    """
    # Build the index once instead of rescanning the dataset per target.
    by_id = _index_by_id(institutions)

    for inst_id, info in targets.items():
        inst = by_id.get(inst_id)
        if inst is None:
            print(f"\n✗ ID NOT FOUND: {inst_id}")
            print(f" Expected name: {info['name']}")
            info['status'] = 'NOT_FOUND'
            continue

        print(f"\n✓ ID VERIFIED: {inst_id}")
        print(f" Name in dataset: {inst.get('name')}")
        print(f" Expected name: {info['name']}")
        print(f" Q-number to add: {info['qid']}")

        # Check if already has Wikidata
        has_wikidata = _has_wikidata(inst)
        print(f" Already has Wikidata: {has_wikidata}")

        location = _first_location(inst)
        if location is not None:
            print(f" Location: {location.get('city', 'N/A')}, {location.get('region', 'N/A')}")

        info['status'] = 'FOUND'
        info['actual_name'] = inst.get('name')
        info['has_wikidata'] = has_wikidata

    return sum(1 for v in targets.values() if v['status'] == 'FOUND')


def main():
    """Load the dataset, verify the Batch 13 targets and print a summary."""
    institutions = load_institutions()

    print("="*80)
    print("VERIFYING BATCH 13 INSTITUTION IDS")
    print("="*80)

    found_count = verify_targets(institutions, batch13_targets)

    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Found: {found_count}/{len(batch13_targets)}")
    print(f"Missing: {len(batch13_targets) - found_count}")


if __name__ == "__main__":
    main()
|