glam/archive/scripts/brazil/enrich_brazilian_major_institutions.py
2025-11-19 23:25:22 +01:00

146 lines
5.5 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Enrich major Brazilian institutions with verified Wikidata QIDs.
Works on the merged dataset file.
"""
import yaml
import sys
from datetime import datetime, timezone
from pathlib import Path
# Verified Wikidata QIDs for major Brazilian institutions.
# Keys are name substrings matched case-insensitively against dataset
# institution names (see main()); values are the Wikidata Q-numbers to attach.
BRAZILIAN_QID_MAP = {
    "Museu Nacional": "Q1850416",  # National Museum, Rio de Janeiro
    "MASP": "Q82941",  # São Paulo Museum of Art
    "Pinacoteca": "Q2095209",  # Pinacoteca do Estado de São Paulo
    "Inhotim": "Q478245",  # Inhotim Institute, Brumadinho
    "Museu Goeldi": "Q3328425",  # Museu Paraense Emílio Goeldi, Belém
    "Teatro Amazonas": "Q1434444",  # Amazon Theatre, Manaus
    "Instituto Ricardo Brennand": "Q2216591",  # Ricardo Brennand Institute, Recife
    "IMS": "Q6041378",  # Instituto Moreira Salles (main org)
}
def has_wikidata_id(institution: dict) -> bool:
    """Return True if *institution* already carries a genuine Wikidata QID.

    A genuine QID has the form "Q<decimal digits>" with a numeric part below
    100,000,000 — larger Q-numbers are treated as synthetic placeholders
    produced by earlier pipeline stages, not real Wikidata entities.

    Args:
        institution: Record with an optional "identifiers" list of dicts
            carrying "identifier_scheme" and "identifier_value" keys.

    Returns:
        True if any identifier is a real Wikidata Q-number, else False.
    """
    for id_obj in institution.get("identifiers", []):
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        # isdecimal() accepts exactly the characters int() parses, unlike
        # isdigit() (which also accepts e.g. superscripts and would have
        # required the original try/except ValueError around int()).
        if value.startswith("Q") and value[1:].isdecimal():
            # Q-numbers >= 100,000,000 are synthetic, not real Wikidata IDs.
            if int(value[1:]) < 100_000_000:
                return True
    return False
def add_wikidata_id(institution: dict, qid: str, matched_name: str) -> bool:
    """Attach a Wikidata identifier to *institution* and record provenance.

    No-op when the record already holds a genuine Wikidata QID (as judged
    by has_wikidata_id).  Mutates *institution* in place.

    Args:
        institution: Institution record to enrich.
        qid: Wikidata Q-number to attach (e.g. "Q82941").
        matched_name: Name pattern that triggered the match; stored in the
            enrichment history for traceability.

    Returns:
        True when an identifier was added, False when skipped.
    """
    identifiers = institution.setdefault("identifiers", [])
    if has_wikidata_id(institution):
        # Already enriched with a real QID — leave the record untouched.
        return False
    identifiers.append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    })
    # Append an audit entry so later runs can trace where the QID came from.
    history = (
        institution.setdefault("provenance", {})
        .setdefault("enrichment_history", [])
    )
    history.append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": f"Manual Wikidata QID assignment (matched: {matched_name})",
        "data_source": "Wikidata",
        "confidence_score": 1.0,
    })
    return True
def main():
    """Enrich Brazilian institutions in the merged dataset with Wikidata QIDs.

    Loads the merged YAML dataset, matches Brazilian institutions against
    BRAZILIAN_QID_MAP by case-insensitive substring, attaches verified
    Wikidata identifiers via add_wikidata_id, backs up the original file,
    and writes the updated dataset back in place.
    """
    input_file = Path("data/instances/global/global_heritage_institutions_merged.yaml")
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)
    print("=" * 80)
    print("🇧🇷 MAJOR BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"\n📖 Loading dataset from: {input_file.name}")

    # BUGFIX: snapshot the raw file bytes BEFORE enrichment.  The loop below
    # mutates the loaded records in place, so dumping `institutions` at
    # backup time (as the original code did) would back up the already
    # enriched data instead of the pre-enrichment state.
    original_bytes = input_file.read_bytes()

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✅ Loaded {len(institutions):,} institutions\n")

    # Restrict to institutions with at least one Brazilian location.
    brazilian = [
        inst for inst in institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    print(f"🔍 Found {len(brazilian)} Brazilian institutions\n")

    enriched_count = 0
    matched_institutions = []
    for institution in brazilian:
        name = institution.get("name", "")
        # Case-insensitive substring match against known institution names.
        for known_name, qid in BRAZILIAN_QID_MAP.items():
            if known_name.lower() in name.lower():
                if add_wikidata_id(institution, qid, known_name):
                    enriched_count += 1
                    # The BR filter above guarantees at least one location.
                    city = institution.get('locations', [{}])[0].get('city', 'Unknown')
                    matched_institutions.append({
                        "name": name,
                        "city": city,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name} ({city})")
                    print(f" → Wikidata: {qid} (matched '{known_name}')")
                    print()
                break  # one QID per institution; stop at the first match

    if enriched_count == 0:
        print(" No institutions enriched (all already have Wikidata IDs)")
        return

    # Backup holds the ORIGINAL (pre-enrichment) file contents, byte-for-byte.
    backup_file = input_file.with_suffix('.yaml.before_brazilian_manual')
    print(f"\n💾 Creating backup: {backup_file.name}")
    backup_file.write_bytes(original_bytes)

    print(f"💾 Writing updated dataset: {input_file.name}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("\n" + "=" * 80)
    print(f"✨ ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    # NOTE(review): the total of 97 and the "+1" previously-enriched offset
    # are hard-coded — confirm they still match the dataset before trusting
    # this coverage figure.
    print(f"\nBrazilian coverage: {enriched_count + 1}/97 = {((enriched_count + 1) / 97 * 100):.1f}%")
    print(f"(Including 1 previously enriched institution)")
    print(f"\nMatched institutions:")
    for match in matched_institutions:
        print(f"{match['name']} ({match['city']}) → {match['qid']}")
if __name__ == "__main__":
main()