#!/usr/bin/env python3
"""
Brazilian Institutions Wikidata Enrichment - Batch 4

Adds verified Wikidata Q-numbers for major Brazilian institutions.
"""
import json
import shutil
from datetime import datetime, timezone

import yaml
||
# Wikidata mappings (verified via SPARQL queries).
# Keyed by the institution's exact `name` field in the merged dataset;
# each value holds the verified Wikidata Q-number and the host city
# (the city is used for progress reporting only).
WIKIDATA_MAPPINGS = {
    "MARGS": {"qid": "Q7335252", "city": "Porto Alegre"},
    "FUNDAJ": {"qid": "Q10286348", "city": "Recife"},
    "MON": {"qid": "Q4991927", "city": "Curitiba"},
}
||
def enrich_batch4():
    """Enrich Batch 4 institutions with Wikidata Q-numbers.

    Loads the merged global heritage-institutions JSON dataset, adds a
    Wikidata identifier to every institution whose name appears in
    WIKIDATA_MAPPINGS and that does not already carry one, then writes the
    updated dataset back to both the JSON and YAML files (after backing up
    the original JSON). Progress and summary statistics go to stdout.

    Returns:
        None. Exits early (no files written) when nothing needed enriching.
    """

    def _has_wikidata(inst):
        """Return True if *inst* already carries a Wikidata identifier."""
        # `ident` (not `id`) to avoid shadowing the builtin.
        return any(ident.get('identifier_scheme') == 'Wikidata'
                   for ident in inst.get('identifiers', []))

    def _pct(part, whole):
        """Percentage of *part* in *whole*; 0.0 when *whole* is zero."""
        # Guards the ZeroDivisionError the original hit when no Brazilian
        # institutions were present in the dataset.
        return part / whole * 100 if whole else 0.0

    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 4")
    print("=" * 80)
    print()

    # Load JSON dataset
    json_path = "data/instances/global/global_heritage_institutions_merged.json"
    print(f"📖 Loading dataset from JSON: {json_path.split('/')[-1]}")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"✅ Loaded {len(data):,} institutions")
    print()

    # Find Brazilian institutions (any location with country code 'BR')
    brazilian = [inst for inst in data if inst.get('locations') and
                 any(loc.get('country') == 'BR' for loc in inst['locations'])]

    enriched_before = sum(1 for inst in brazilian if _has_wikidata(inst))

    print(f"🔍 Found {len(brazilian)} Brazilian institutions")
    print(f"📊 Currently enriched: {enriched_before}/{len(brazilian)} ({_pct(enriched_before, len(brazilian)):.1f}%)")
    print()

    # Enrich institutions
    enriched_count = 0

    for institution in data:
        name = institution.get('name', '')

        if name not in WIKIDATA_MAPPINGS:
            continue
        mapping = WIKIDATA_MAPPINGS[name]

        # Skip institutions that already carry a Wikidata ID.
        if _has_wikidata(institution):
            continue

        # Add Wikidata identifier
        wikidata_id = {
            'identifier_scheme': 'Wikidata',
            'identifier_value': mapping['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{mapping['qid']}"
        }

        identifiers = institution.get('identifiers', [])
        if identifiers:
            identifiers.append(wikidata_id)
        else:
            institution['identifiers'] = [wikidata_id]

        city = mapping['city']
        print(f"✅ Enriched: {name} ({city})")
        print(f" → Wikidata: {mapping['qid']} (matched '{name}')")
        print()

        enriched_count += 1

    if enriched_count == 0:
        print("ℹ️ No new institutions to enrich (all already have Wikidata IDs)")
        return

    # Create backup of the ORIGINAL on-disk file before overwriting it.
    # (The previous version dumped the already-mutated `data`, so the
    # "backup" was identical to the new file and useless for recovery.)
    backup_path = json_path + ".batch4_backup"
    print(f"💾 Creating backup: {backup_path.split('/')[-1]}")
    shutil.copyfile(json_path, backup_path)

    # Write updated JSON
    print("💾 Writing updated JSON dataset...")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Update YAML file (mirrors the JSON dataset)
    yaml_path = "data/instances/global/global_heritage_institutions_merged.yaml"
    print("💾 Updating YAML file (this may take a minute)...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print("✅ YAML file updated")
    print()

    # Final statistics
    enriched_after = enriched_before + enriched_count

    print("=" * 80)
    print("✨ ENRICHMENT COMPLETE - BATCH 4")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    print(f"Brazilian coverage: {enriched_after}/{len(brazilian)} = {_pct(enriched_after, len(brazilian)):.1f}%")
    print()
    print("Matched institutions:")
    for name, mapping in WIKIDATA_MAPPINGS.items():
        print(f" • {name} ({mapping['city']}) → {mapping['qid']}")
||
if __name__ == "__main__":
    # Run the enrichment when executed as a script (no-op on import).
    enrich_batch4()