#!/usr/bin/env python3
"""
Enrich major Brazilian institutions with verified Wikidata QIDs.

Works on the merged dataset file.
"""

import yaml
import sys
from datetime import datetime, timezone
from pathlib import Path

# Verified Wikidata QIDs for major Brazilian institutions.
# Keys are name fragments matched case-insensitively as substrings of an
# institution's "name" field (see main()); values are Wikidata Q-numbers.
BRAZILIAN_QID_MAP = {
    "Museu Nacional": "Q1850416",  # National Museum, Rio de Janeiro
    "MASP": "Q82941",  # São Paulo Museum of Art
    "Pinacoteca": "Q2095209",  # Pinacoteca do Estado de São Paulo
    "Inhotim": "Q478245",  # Inhotim Institute, Brumadinho
    "Museu Goeldi": "Q3328425",  # Museu Paraense Emílio Goeldi, Belém
    "Teatro Amazonas": "Q1434444",  # Amazon Theatre, Manaus
    "Instituto Ricardo Brennand": "Q2216591",  # Ricardo Brennand Institute, Recife
    "IMS": "Q6041378",  # Instituto Moreira Salles (main org)
}
def has_wikidata_id(institution: dict) -> bool:
    """Return True if *institution* already has a real Wikidata ID.

    A "real" ID is an identifier of scheme "Wikidata" whose value is
    ``Q<digits>`` with a numeric part below 100,000,000 — larger values
    are treated as synthetic placeholders, not genuine Q-numbers.
    """
    for id_obj in institution.get("identifiers", []):
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        # isdecimal() (unlike isdigit(), which accepts e.g. '²') guarantees
        # int() cannot raise, so the old try/except ValueError is unneeded.
        if value.startswith("Q") and value[1:].isdecimal():
            if int(value[1:]) < 100000000:
                return True
    return False
def add_wikidata_id(institution: dict, qid: str, matched_name: str) -> bool:
    """Attach a Wikidata identifier to *institution* in place.

    Appends the identifier record and an enrichment-history entry under
    the institution's provenance. Returns True when the identifier was
    added, False when the institution already carries a real Wikidata QID.
    """
    identifiers = institution.setdefault("identifiers", [])

    # Already enriched with a genuine Q-number — nothing to do.
    if has_wikidata_id(institution):
        return False

    identifiers.append(
        {
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
        }
    )

    # Record when and how this identifier was assigned.
    history = institution.setdefault("provenance", {}).setdefault(
        "enrichment_history", []
    )
    history.append(
        {
            "enrichment_date": datetime.now(timezone.utc).isoformat(),
            "enrichment_method": f"Manual Wikidata QID assignment (matched: {matched_name})",
            "data_source": "Wikidata",
            "confidence_score": 1.0,
        }
    )

    return True
def main():
    """Enrich Brazilian institutions in the merged dataset with Wikidata QIDs.

    Loads the merged YAML dataset, matches Brazilian institutions against
    BRAZILIAN_QID_MAP by case-insensitive substring, backs up the original
    file, then writes the updated dataset in place.
    """
    import shutil  # local import: only needed for the backup copy

    input_file = Path("data/instances/global/global_heritage_institutions_merged.yaml")

    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🇧🇷 MAJOR BRAZILIAN INSTITUTIONS WIKIDATA ENRICHMENT")
    print("=" * 80)
    print(f"\n📖 Loading dataset from: {input_file.name}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✅ Loaded {len(institutions):,} institutions\n")

    # Filter to institutions with at least one location in Brazil.
    brazilian = [
        inst for inst in institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]

    print(f"🔍 Found {len(brazilian)} Brazilian institutions\n")

    enriched_count = 0
    matched_institutions = []

    for institution in brazilian:
        name = institution.get("name", "")

        # First known name that appears in the institution's name wins;
        # stop after one match per institution.
        for known_name, qid in BRAZILIAN_QID_MAP.items():
            if known_name.lower() in name.lower():
                if add_wikidata_id(institution, qid, known_name):
                    enriched_count += 1
                    city = institution.get('locations', [{}])[0].get('city', 'Unknown')
                    matched_institutions.append({
                        "name": name,
                        "city": city,
                        "qid": qid,
                        "matched_pattern": known_name,
                    })
                    print(f"✅ Enriched: {name} ({city})")
                    print(f"   → Wikidata: {qid} (matched '{known_name}')")
                    print()
                break

    if enriched_count == 0:
        print("ℹ️ No institutions enriched (all already have Wikidata IDs)")
        return

    # FIX: the previous version dumped the already-mutated in-memory data,
    # so the "backup" was identical to the new file and the original was
    # lost. Copy the untouched on-disk file before overwriting it.
    backup_file = input_file.with_suffix('.yaml.before_brazilian_manual')
    print(f"\n💾 Creating backup: {backup_file.name}")
    shutil.copy2(input_file, backup_file)

    # Write updated dataset in place.
    print(f"💾 Writing updated dataset: {input_file.name}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("\n" + "=" * 80)
    print("✨ ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")

    # Report measured coverage instead of the old hard-coded "+ 1" / "97",
    # which silently drifted out of date as the dataset changed.
    with_qid = sum(1 for inst in brazilian if has_wikidata_id(inst))
    total_br = len(brazilian)
    if total_br:
        print(f"\nBrazilian coverage: {with_qid}/{total_br} = {with_qid / total_br * 100:.1f}%")

    print("\nMatched institutions:")
    for match in matched_institutions:
        print(f"  • {match['name']} ({match['city']}) → {match['qid']}")


if __name__ == "__main__":
    main()
|