glam/archive/scripts/brazil/enrich_brazilian_batch5.py
2025-11-19 23:25:22 +01:00

106 lines
4 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Brazilian Institutions Wikidata Enrichment - Batch 5"""
import json
import shutil
from datetime import datetime, timezone

import yaml
# Manually curated mapping for this batch: institution name (exactly as it
# appears in the merged dataset) -> Wikidata Q-number plus the city, which
# is used only for human-readable log output.
WIKIDATA_MAPPINGS = {
    "Teatro da Paz": {"qid": "Q3063375", "city": "Belém"},
    "Forte dos Reis Magos": {"qid": "Q3304114", "city": "Natal"},
    "Forte do Presépio": {"qid": "Q56694297", "city": "Belém"},
    "Museu da Borracha": {"qid": "Q10333651", "city": "Rio Branco"},
    "Museu do Piauí": {"qid": "Q10333916", "city": "Teresina"},
}
def enrich_batch5():
    """Enrich Batch 5 Brazilian institutions with Wikidata Q-numbers.

    Loads the merged global heritage-institutions JSON dataset, appends a
    Wikidata identifier to every institution named in WIKIDATA_MAPPINGS
    that does not already carry one, then (only if anything changed)
    writes a backup of the ORIGINAL file, the updated JSON, and the
    mirrored YAML file. Progress is reported on stdout.

    Side effects: reads/overwrites the dataset files under
    data/instances/global/ and creates a ".batch5_backup" copy.
    """
    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 5")
    print("=" * 80)
    print()

    json_path = "data/instances/global/global_heritage_institutions_merged.json"
    print(f"📖 Loading dataset from JSON: {json_path.split('/')[-1]}")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"✅ Loaded {len(data):,} institutions")
    print()

    # Institutions with at least one location in Brazil.
    brazilian = [inst for inst in data if inst.get('locations') and
                 any(loc.get('country') == 'BR' for loc in inst['locations'])]
    # `ident`, not `id` — avoid shadowing the builtin.
    enriched_before = sum(
        1 for inst in brazilian
        if any(ident.get('identifier_scheme') == 'Wikidata'
               for ident in inst.get('identifiers', [])))
    total_br = len(brazilian)
    print(f"🔍 Found {total_br} Brazilian institutions")
    # Guard against ZeroDivisionError when no Brazilian institutions exist.
    pct_before = enriched_before / total_br * 100 if total_br else 0.0
    print(f"📊 Currently enriched: {enriched_before}/{total_br} ({pct_before:.1f}%)")
    print()

    enriched_count = 0
    for institution in data:
        name = institution.get('name', '')
        if name not in WIKIDATA_MAPPINGS:
            continue
        mapping = WIKIDATA_MAPPINGS[name]
        # setdefault covers both "missing key" and "existing list" in one step.
        identifiers = institution.setdefault('identifiers', [])
        if any(ident.get('identifier_scheme') == 'Wikidata' for ident in identifiers):
            continue  # already enriched in an earlier run/batch
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': mapping['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{mapping['qid']}",
        })
        print(f"✅ Enriched: {name} ({mapping['city']})")
        print(f" → Wikidata: {mapping['qid']}")
        print()
        enriched_count += 1

    if enriched_count == 0:
        print(" No new institutions to enrich")
        return

    # BUGFIX: the original dumped the already-mutated `data` as the "backup",
    # making it byte-identical to the new file. Copy the untouched on-disk
    # original instead, BEFORE overwriting it.
    backup_path = json_path + ".batch5_backup"
    print(f"💾 Creating backup: {backup_path.split('/')[-1]}")
    shutil.copyfile(json_path, backup_path)

    print("💾 Writing updated JSON dataset...")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    yaml_path = "data/instances/global/global_heritage_institutions_merged.yaml"
    print("💾 Updating YAML file (this may take a minute)...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print("✅ YAML file updated")
    print()

    enriched_after = enriched_before + enriched_count
    print("=" * 80)
    print("✨ ENRICHMENT COMPLETE - BATCH 5")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    pct_after = enriched_after / total_br * 100 if total_br else 0.0
    print(f"Brazilian coverage: {enriched_after}/{total_br} = {pct_after:.1f}%")
    print()
    print("Matched institutions:")
    for name, mapping in WIKIDATA_MAPPINGS.items():
        print(f"{name} ({mapping['city']}) → {mapping['qid']}")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    enrich_batch5()