glam/archive/scripts/brazil/enrich_brazilian_batch3.py
2025-11-19 23:25:22 +01:00

156 lines
5.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 3
Adds 5 more major institutions with verified Wikidata QIDs.
"""
import json
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
# Verified Wikidata QIDs for Batch 3 Brazilian institutions.
# Keys are name fragments that are substring-matched case-insensitively
# against institution names; values are the Wikidata Q-numbers to assign.
BRAZILIAN_BATCH3_QID_MAP: dict[str, str] = {
    "MAM-BA": "Q10333768",  # Museu de Arte Moderna da Bahia, Salvador
    "Centro Dragão do Mar": "Q5305525",  # Dragão do Mar Center, Fortaleza
    "CCBB Brasília": "Q56693296",  # Centro Cultural Banco do Brasil, Brasília
    "Parque Memorial Quilombo": "Q10345196",  # Quilombo dos Palmares Memorial Park
    "Museu Zoroastro Artiaga": "Q10333459",  # Museu Zoroastro Artiaga, Goiânia
}
def has_wikidata_id(institution: dict) -> bool:
    """Return True if *institution* already carries a real Wikidata Q-number.

    A "real" identifier is one whose scheme is ``Wikidata`` and whose value is
    ``Q`` followed by a number below 100,000,000 (larger values are treated as
    placeholders rather than genuine Q-numbers).
    """
    for entry in institution.get("identifiers", []):
        if entry.get("identifier_scheme") != "Wikidata":
            continue
        raw = entry.get("identifier_value", "")
        if not raw.startswith("Q") or not raw[1:].isdigit():
            continue
        try:
            numeric = int(raw[1:])
        except ValueError:
            # isdigit() accepts non-ASCII digit characters that int() rejects;
            # treat those as not a valid Q-number and keep scanning.
            continue
        if numeric < 100000000:
            return True
    return False
def add_wikidata_id(institution: dict, qid: str, matched_name: str) -> bool:
    """Attach a Wikidata identifier and a provenance entry to *institution*.

    Returns False (leaving the record untouched) when a real Wikidata ID is
    already present; otherwise appends the identifier, logs the enrichment in
    the provenance history, and returns True.
    """
    institution.setdefault("identifiers", [])

    # Never stack a second Wikidata ID on top of an existing real one.
    if has_wikidata_id(institution):
        return False

    institution["identifiers"].append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    })

    # Record where this identifier came from and when it was added.
    history = institution.setdefault("provenance", {}).setdefault(
        "enrichment_history", []
    )
    history.append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": f"Manual Wikidata QID assignment - Batch 3 (matched: {matched_name})",
        "data_source": "Wikidata",
        "confidence_score": 1.0,
    })
    return True
def main() -> None:
    """Load the merged dataset, enrich Brazilian institutions, and write it back.

    Exits with status 1 when the input JSON is missing. After enrichment the
    JSON is backed up and rewritten in place, and the YAML mirror is
    regenerated if it exists.
    """
    input_file = Path("data/instances/global/global_heritage_institutions_merged.json")
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 3")
    print("=" * 80)
    print(f"\n📖 Loading dataset from JSON: {input_file.name}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f"✅ Loaded {len(institutions):,} institutions\n")

    # Restrict enrichment to institutions with at least one BR location.
    brazilian = [inst for inst in institutions if any(
        loc.get('country') == 'BR' for loc in inst.get('locations', [])
    )]
    print(f"🔍 Found {len(brazilian)} Brazilian institutions")

    # Guard: the coverage percentages below divide by len(brazilian); the
    # original crashed with ZeroDivisionError on a dataset with no BR entries.
    if not brazilian:
        print("⚠️ No Brazilian institutions found; nothing to enrich.")
        return

    # Count existing Wikidata IDs before this batch runs.
    existing_wd = sum(1 for inst in brazilian if has_wikidata_id(inst))
    print(f"📊 Currently enriched: {existing_wd}/{len(brazilian)} ({existing_wd/len(brazilian)*100:.1f}%)\n")

    enriched_count = 0
    matched_institutions = []
    for institution in brazilian:
        name = institution.get("name", "")
        # Case-insensitive substring match against the known Batch 3 names;
        # the first hit wins and the remaining patterns are skipped.
        for known_name, qid in BRAZILIAN_BATCH3_QID_MAP.items():
            if known_name.lower() in name.lower():
                if add_wikidata_id(institution, qid, known_name):
                    enriched_count += 1
                    # Filter above guarantees at least one location exists.
                    city = institution.get('locations', [{}])[0].get('city', 'Unknown')
                    matched_institutions.append({
                        "name": name,
                        "city": city,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name} ({city})")
                    print(f" → Wikidata: {qid} (matched '{known_name}')")
                    print()
                break

    if enriched_count == 0:
        print(" No new institutions enriched (all already have Wikidata IDs)")
        return

    # Back up the JSON before overwriting it in place.
    backup_file = input_file.with_suffix('.json.batch3_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    shutil.copy2(input_file, backup_file)

    print(f"💾 Writing updated JSON dataset...")
    with open(input_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    # Keep the YAML mirror in sync when present. yaml is a third-party
    # dependency only needed on this path, so it is imported lazily.
    yaml_file = Path("data/instances/global/global_heritage_institutions_merged.yaml")
    if yaml_file.exists():
        print(f"💾 Updating YAML file (this may take a minute)...")
        import yaml
        with open(yaml_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✅ YAML file updated")

    print("\n" + "=" * 80)
    print(f"✨ ENRICHMENT COMPLETE - BATCH 3")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")
    new_total = existing_wd + enriched_count
    print(f"\nBrazilian coverage: {new_total}/{len(brazilian)} = {(new_total / len(brazilian) * 100):.1f}%")
    print(f"\nMatched institutions:")
    for match in matched_institutions:
        print(f"{match['name']} ({match['city']}) → {match['qid']}")
# Run the enrichment only when executed as a script, not on import.
if __name__ == "__main__":
    main()