106 lines
4 KiB
Python
106 lines
4 KiB
Python
#!/usr/bin/env python3
|
||
"""Brazilian Institutions Wikidata Enrichment - Batch 5"""
|
||
import json
import shutil
from datetime import datetime, timezone

import yaml
|
||
# Curated Batch 5 mapping of institution names to Wikidata entities.
# Keys must match the 'name' field of records in the merged dataset exactly
# (enrich_batch5 does a direct dict lookup on it); each value carries the
# Wikidata Q-number to attach and the host city (used only for log output).
WIKIDATA_MAPPINGS = {
    "Teatro da Paz": {"qid": "Q3063375", "city": "Belém"},
    "Forte dos Reis Magos": {"qid": "Q3304114", "city": "Natal"},
    "Forte do Presépio": {"qid": "Q56694297", "city": "Belém"},
    "Museu da Borracha": {"qid": "Q10333651", "city": "Rio Branco"},
    "Museu do Piauí": {"qid": "Q10333916", "city": "Teresina"},
}
|
||
|
||
def enrich_batch5():
    """Enrich Batch 5 institutions with Wikidata Q-numbers.

    Loads the merged JSON dataset, attaches a Wikidata identifier to every
    institution named in WIKIDATA_MAPPINGS that does not already carry one,
    backs up the original on-disk JSON, rewrites both the JSON and YAML
    copies of the dataset, and prints a coverage summary.

    Side effects: reads/writes files under data/instances/global/ and
    prints progress to stdout. Returns None.
    """
    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 5")
    print("=" * 80)
    print()

    json_path = "data/instances/global/global_heritage_institutions_merged.json"
    print(f"📖 Loading dataset from JSON: {json_path.split('/')[-1]}")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"✅ Loaded {len(data):,} institutions")
    print()

    # Subset used only for coverage statistics: any institution with at
    # least one location whose country code is 'BR'.
    brazilian = [inst for inst in data if inst.get('locations') and
                 any(loc.get('country') == 'BR' for loc in inst['locations'])]

    enriched_before = sum(1 for inst in brazilian if
                          any(ident.get('identifier_scheme') == 'Wikidata'
                              for ident in inst.get('identifiers', [])))

    print(f"🔍 Found {len(brazilian)} Brazilian institutions")
    # Guard the percentage against an empty Brazilian subset (ZeroDivisionError).
    pct_before = enriched_before / len(brazilian) * 100 if brazilian else 0.0
    print(f"📊 Currently enriched: {enriched_before}/{len(brazilian)} ({pct_before:.1f}%)")
    print()

    enriched_count = 0

    for institution in data:
        name = institution.get('name', '')
        if name not in WIKIDATA_MAPPINGS:
            continue

        mapping = WIKIDATA_MAPPINGS[name]
        # Skip records that already carry a Wikidata identifier.
        has_wikidata = any(ident.get('identifier_scheme') == 'Wikidata'
                           for ident in institution.get('identifiers', []))
        if has_wikidata:
            continue

        wikidata_id = {
            'identifier_scheme': 'Wikidata',
            'identifier_value': mapping['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{mapping['qid']}"
        }
        # setdefault covers both the missing-key and empty-list cases in one step.
        institution.setdefault('identifiers', []).append(wikidata_id)

        city = mapping['city']
        print(f"✅ Enriched: {name} ({city})")
        print(f"   → Wikidata: {mapping['qid']}")
        print()
        enriched_count += 1

    if enriched_count == 0:
        print("ℹ️ No new institutions to enrich")
        return

    # BUG FIX: back up the on-disk file *before* overwriting it. The previous
    # version serialized the already-mutated in-memory `data`, so the "backup"
    # was byte-identical to the new file and useless for rollback. At this
    # point the file on disk is still the pre-enrichment original, so a plain
    # file copy preserves it.
    backup_path = json_path + ".batch5_backup"
    print(f"💾 Creating backup: {backup_path.split('/')[-1]}")
    shutil.copyfile(json_path, backup_path)

    print("💾 Writing updated JSON dataset...")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    yaml_path = "data/instances/global/global_heritage_institutions_merged.yaml"
    print("💾 Updating YAML file (this may take a minute)...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print("✅ YAML file updated")
    print()

    enriched_after = enriched_before + enriched_count

    print("=" * 80)
    print("✨ ENRICHMENT COMPLETE - BATCH 5")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    # Same empty-subset guard as above for the final coverage line.
    pct_after = enriched_after / len(brazilian) * 100 if brazilian else 0.0
    print(f"Brazilian coverage: {enriched_after}/{len(brazilian)} = {pct_after:.1f}%")
    print()
    print("Matched institutions:")
    for name, mapping in WIKIDATA_MAPPINGS.items():
        print(f"  • {name} ({mapping['city']}) → {mapping['qid']}")
||
if __name__ == "__main__":
|
||
enrich_batch5()
|