156 lines
5.8 KiB
Python
156 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Enrich Brazilian institutions - Batch 3
|
||
Adds 5 more major institutions with verified Wikidata QIDs.
|
||
"""
|
||
|
||
import json
|
||
import sys
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
# Verified Wikidata QIDs for Batch 3 Brazilian institutions.
# Keys are name fragments matched case-insensitively against institution
# names in main(); values are the corresponding Wikidata Q-identifiers.
BRAZILIAN_BATCH3_QID_MAP = {
    "MAM-BA": "Q10333768",                    # Museu de Arte Moderna da Bahia, Salvador
    "Centro Dragão do Mar": "Q5305525",       # Dragão do Mar Center, Fortaleza
    "CCBB Brasília": "Q56693296",             # Centro Cultural Banco do Brasil, Brasília
    "Parque Memorial Quilombo": "Q10345196",  # Quilombo dos Palmares Memorial Park
    "Museu Zoroastro Artiaga": "Q10333459",   # Museu Zoroastro Artiaga, Goiânia
}
||
def has_wikidata_id(institution: dict) -> bool:
    """Return True if *institution* already carries a real Wikidata ID.

    An identifier counts as "real" when its scheme is "Wikidata", its value
    is "Q" followed by decimal digits, and the numeric part is below
    100,000,000 (larger Q-numbers are treated as placeholder/synthetic IDs
    in this dataset).

    Args:
        institution: Institution record with an optional "identifiers" list
            of dicts carrying "identifier_scheme" and "identifier_value".

    Returns:
        True if a real Wikidata Q-identifier is present, else False.
    """
    for id_obj in institution.get("identifiers", []):
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        # str.isdecimal() accepts exactly the digit characters int() can
        # parse (unlike str.isdigit(), which also accepts superscripts that
        # make int() raise), so no try/except around int() is needed.
        if value.startswith("Q") and value[1:].isdecimal():
            if int(value[1:]) < 100000000:  # Real Q-numbers, not placeholders
                return True
    return False
||
def add_wikidata_id(institution: dict, qid: str, matched_name: str) -> bool:
    """Attach a Wikidata identifier and a provenance entry to *institution*.

    Args:
        institution: Mutable institution record (modified in place).
        qid: Wikidata Q-identifier to attach (e.g. "Q10333768").
        matched_name: The known-name pattern that matched; recorded in the
            provenance entry for auditability.

    Returns:
        True if the identifier was added; False if the record already has a
        real Wikidata ID, in which case the record is left untouched.
    """
    # Guard first so the no-op path has no side effects (the previous
    # version created an empty "identifiers" list even when it then
    # returned False without adding anything).
    if has_wikidata_id(institution):
        return False

    # Add the new Wikidata ID.
    institution.setdefault("identifiers", []).append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    })

    # Record how and when this enrichment happened.
    provenance = institution.setdefault("provenance", {})
    provenance.setdefault("enrichment_history", []).append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": f"Manual Wikidata QID assignment - Batch 3 (matched: {matched_name})",
        "data_source": "Wikidata",
        "confidence_score": 1.0
    })

    return True
||
def main():
    """Enrich Brazilian institutions in the merged dataset with Batch 3 QIDs.

    Loads the merged JSON dataset, matches Brazilian institutions against
    BRAZILIAN_BATCH3_QID_MAP by case-insensitive substring, adds Wikidata
    identifiers plus provenance, then writes the JSON back (after a backup)
    and mirrors the change to the YAML copy if present.
    """
    input_file = Path("data/instances/global/global_heritage_institutions_merged.json")

    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🇧🇷 BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT - BATCH 3")
    print("=" * 80)
    print(f"\n📖 Loading dataset from JSON: {input_file.name}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"✅ Loaded {len(institutions):,} institutions\n")

    # Filter to Brazilian institutions only
    brazilian = [inst for inst in institutions if any(
        loc.get('country') == 'BR' for loc in inst.get('locations', [])
    )]

    print(f"🔍 Found {len(brazilian)} Brazilian institutions")

    # Guard: the coverage percentages below divide by len(brazilian), which
    # would raise ZeroDivisionError on an empty list.
    if not brazilian:
        print("ℹ️ No Brazilian institutions found — nothing to enrich")
        return

    # Count existing Wikidata IDs
    existing_wd = sum(1 for inst in brazilian if has_wikidata_id(inst))
    print(f"📊 Currently enriched: {existing_wd}/{len(brazilian)} ({existing_wd/len(brazilian)*100:.1f}%)\n")

    enriched_count = 0
    matched_institutions = []

    for institution in brazilian:
        name = institution.get("name", "")

        # Check if this institution matches any known names; first match
        # wins (map insertion order), then move to the next institution.
        for known_name, qid in BRAZILIAN_BATCH3_QID_MAP.items():
            if known_name.lower() in name.lower():
                # Add Wikidata ID (no-op if a real one already exists)
                if add_wikidata_id(institution, qid, known_name):
                    enriched_count += 1
                    # The BR filter above guarantees a non-empty locations
                    # list, so indexing [0] is safe here.
                    city = institution.get('locations', [{}])[0].get('city', 'Unknown')
                    matched_institutions.append({
                        "name": name,
                        "city": city,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name} ({city})")
                    print(f"   → Wikidata: {qid} (matched '{known_name}')")
                    print()
                break

    if enriched_count == 0:
        print("ℹ️ No new institutions enriched (all already have Wikidata IDs)")
        return

    # Create backup of JSON before overwriting it in place
    backup_file = input_file.with_suffix('.json.batch3_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    import shutil
    shutil.copy2(input_file, backup_file)

    # Write updated JSON (fast!)
    print("💾 Writing updated JSON dataset...")
    with open(input_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)

    # Also update YAML file if it exists (yaml imported lazily since it is
    # a third-party dependency only needed on this optional path)
    yaml_file = Path("data/instances/global/global_heritage_institutions_merged.yaml")
    if yaml_file.exists():
        print("💾 Updating YAML file (this may take a minute)...")
        import yaml
        with open(yaml_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print("✅ YAML file updated")

    print("\n" + "=" * 80)
    print("✨ ENRICHMENT COMPLETE - BATCH 3")
    print("=" * 80)
    print(f"Institutions enriched in this batch: {enriched_count}")

    new_total = existing_wd + enriched_count
    print(f"\nBrazilian coverage: {new_total}/{len(brazilian)} = {(new_total / len(brazilian) * 100):.1f}%")
    print("\nMatched institutions:")
    for match in matched_institutions:
        print(f"  • {match['name']} ({match['city']}) → {match['qid']}")


if __name__ == "__main__":
    main()