glam/archive/scripts/brazil/enrich_brazilian_batch4.py
2025-11-19 23:25:22 +01:00

120 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Brazilian Institutions Wikidata Enrichment - Batch 4
Adds verified Wikidata Q-numbers for major Brazilian institutions
"""
import json
import shutil
from datetime import datetime, timezone

import yaml
# Wikidata mappings (verified via SPARQL queries)
WIKIDATA_MAPPINGS = {
"MARGS": {"qid": "Q7335252", "city": "Porto Alegre"},
"FUNDAJ": {"qid": "Q10286348", "city": "Recife"},
"MON": {"qid": "Q4991927", "city": "Curitiba"},
}
def enrich_batch4():
"""Enrich Batch 4 institutions with Wikidata Q-numbers."""
print("=" * 80)
print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 4")
print("=" * 80)
print()
# Load JSON dataset
json_path = "data/instances/global/global_heritage_institutions_merged.json"
print(f"📖 Loading dataset from JSON: {json_path.split('/')[-1]}")
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
print(f"✅ Loaded {len(data):,} institutions")
print()
# Find Brazilian institutions
brazilian = [inst for inst in data if inst.get('locations') and
any(loc.get('country') == 'BR' for loc in inst['locations'])]
enriched_before = sum(1 for inst in brazilian if
any(id.get('identifier_scheme') == 'Wikidata'
for id in inst.get('identifiers', [])))
print(f"🔍 Found {len(brazilian)} Brazilian institutions")
print(f"📊 Currently enriched: {enriched_before}/{len(brazilian)} ({enriched_before/len(brazilian)*100:.1f}%)")
print()
# Enrich institutions
enriched_count = 0
for institution in data:
name = institution.get('name', '')
if name in WIKIDATA_MAPPINGS:
mapping = WIKIDATA_MAPPINGS[name]
# Check if already has Wikidata ID
identifiers = institution.get('identifiers', [])
has_wikidata = any(id.get('identifier_scheme') == 'Wikidata' for id in identifiers)
if not has_wikidata:
# Add Wikidata identifier
wikidata_id = {
'identifier_scheme': 'Wikidata',
'identifier_value': mapping['qid'],
'identifier_url': f"https://www.wikidata.org/wiki/{mapping['qid']}"
}
if identifiers:
identifiers.append(wikidata_id)
else:
institution['identifiers'] = [wikidata_id]
city = mapping['city']
print(f"✅ Enriched: {name} ({city})")
print(f" → Wikidata: {mapping['qid']} (matched '{name}')")
print()
enriched_count += 1
if enriched_count == 0:
print(" No new institutions to enrich (all already have Wikidata IDs)")
return
# Create backup
backup_path = json_path + ".batch4_backup"
print(f"💾 Creating backup: {backup_path.split('/')[-1]}")
with open(backup_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# Write updated JSON
print(f"💾 Writing updated JSON dataset...")
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# Update YAML file
yaml_path = "data/instances/global/global_heritage_institutions_merged.yaml"
print(f"💾 Updating YAML file (this may take a minute)...")
with open(yaml_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
print(f"✅ YAML file updated")
print()
# Final statistics
enriched_after = enriched_before + enriched_count
print("=" * 80)
print("✨ ENRICHMENT COMPLETE - BATCH 4")
print("=" * 80)
print(f"Institutions enriched in this batch: {enriched_count}")
print()
print(f"Brazilian coverage: {enriched_after}/{len(brazilian)} = {enriched_after/len(brazilian)*100:.1f}%")
print()
print("Matched institutions:")
for name, mapping in WIKIDATA_MAPPINGS.items():
print(f"{name} ({mapping['city']}) → {mapping['qid']}")
if __name__ == "__main__":
enrich_batch4()