glam/archive/scripts/brazil/enrich_brazilian_batch6.py
2025-11-19 23:25:22 +01:00

168 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment
Adds Wikidata Q-numbers to 8 major Brazilian institutions:
1. Museu Histórico Nacional (Rio de Janeiro) → Q510993
2. Casa de Oswaldo Cruz (Rio de Janeiro/Fiocruz) → Q56693275
3. Museu da República (Rio de Janeiro) → Q56695450
4. Arquivo Nacional (Rio de Janeiro) → Q2860546
5. Arquivo Público Mineiro (Belo Horizonte) → Q16494004
6. Museu Afro Brasil (São Paulo) → Q10333377
7. Memorial da América Latina (São Paulo) → Q1920091
8. Museu do Amanhã (Rio de Janeiro) → Q10333874
Coverage: 22 → 30 institutions (22.7% → 30.9%)
"""
import json
from datetime import datetime, timezone
from pathlib import Path
# Institution mappings: (name_fragment, city_fragment) → Q-number.
# Fragments are matched case-insensitively as substrings against the
# institution's name and against one of its location cities
# (see matches_institution below).
BATCH_6_MAPPINGS = [
    # Rio de Janeiro institutions
    ("Museu Histórico Nacional", "Rio de Janeiro", "Q510993"),
    ("Casa de Oswaldo Cruz", "Rio de Janeiro", "Q56693275"),
    ("Museu da República", "Rio de Janeiro", "Q56695450"),
    ("Arquivo Nacional", "Rio de Janeiro", "Q2860546"),
    ("Museu do Amanhã", "Rio de Janeiro", "Q10333874"),
    # Minas Gerais
    ("Arquivo Público Mineiro", "Belo Horizonte", "Q16494004"),
    # São Paulo
    ("Museu Afro Brasil", "São Paulo", "Q10333377"),
    ("Memorial da América Latina", "São Paulo", "Q1920091"),
]
def matches_institution(inst, name_fragment, city_fragment):
    """Return True when *inst* matches both fragments.

    Matching is case-insensitive substring containment: *name_fragment*
    must occur in the institution's name, and *city_fragment* in the
    city of at least one entry of its ``locations`` list.
    """
    # Guard clause: no point scanning locations if the name doesn't match.
    if name_fragment.lower() not in inst.get('name', '').lower():
        return False
    wanted_city = city_fragment.lower()
    return any(
        wanted_city in location.get('city', '').lower()
        for location in inst.get('locations', [])
    )
def has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def add_wikidata_identifier(inst, q_number):
    """Attach a Wikidata identifier record to *inst* (mutated in place).

    Also appends a Batch-6 enrichment note to the provenance
    ``extraction_method`` when a provenance record exists.
    Returns the mutated institution dict for convenience.
    """
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })
    # Record how this identifier was obtained, keeping the prior method text.
    if 'provenance' in inst:
        previous_method = inst['provenance'].get('extraction_method', '')
        inst['provenance']['extraction_method'] = (
            f"{previous_method} + Wikidata enrichment (Batch 6 - manual verification)"
        )
    return inst
def main():
    """Run the Batch 6 Wikidata enrichment end to end.

    Loads the merged global institutions JSON, writes a backup copy,
    applies each (name, city, Q-number) mapping from BATCH_6_MAPPINGS —
    skipping institutions that already have a Wikidata identifier —
    then saves the updated file and reports Brazilian coverage.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'global' / 'global_heritage_institutions_merged.json'
    backup_file = data_file.with_suffix('.batch6_backup')

    print("Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment")
    print("=" * 70)
    print()

    # Load data
    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    # Create backup before mutating anything, so a bad run is recoverable.
    print(f"Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, indent=2, ensure_ascii=False)

    print()
    print("Enrichment Mappings:")
    print("-" * 70)

    # Process each mapping
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for name_fragment, city_fragment, q_number in BATCH_6_MAPPINGS:
        # Find the first institution matching both fragments.
        matched = next(
            (inst for inst in institutions
             if matches_institution(inst, name_fragment, city_fragment)),
            None,
        )
        if matched is None:
            print(f"❌ NOT FOUND: {name_fragment} ({city_fragment})")
            not_found_count += 1
            continue

        if has_wikidata(matched):
            # Already enriched (possibly by an earlier batch) — report which Q.
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"⏭️ SKIP: {matched['name']} - Already has {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier
        add_wikidata_identifier(matched, q_number)
        # BUGFIX: the original did matched.get('locations', [{}])[0], which
        # raises IndexError when 'locations' exists but is an empty list
        # (the .get default only applies when the key is absent).
        locations = matched.get('locations') or [{}]
        city = locations[0].get('city', 'Unknown')
        print(f"✅ ENRICHED: {matched['name']} ({city}) → {q_number}")
        enriched_count += 1

    print()
    print("Summary:")
    print("-" * 70)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Skipped (already has Wikidata): {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Save updated data only when something actually changed.
    if enriched_count > 0:
        print()
        print(f"Saving updated data to: {data_file}")
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(institutions, f, indent=2, ensure_ascii=False)

        # Calculate new coverage over Brazilian institutions.
        brazilian = [
            inst for inst in institutions
            if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
        ]
        with_wikidata = sum(1 for inst in brazilian if has_wikidata(inst))
        print()
        print("Brazilian Institution Coverage:")
        print(f" Total: {len(brazilian)}")
        if brazilian:
            print(f" With Wikidata: {with_wikidata} ({with_wikidata/len(brazilian)*100:.1f}%)")
        else:
            # BUGFIX: the original divided by len(brazilian) unconditionally,
            # raising ZeroDivisionError if no BR institutions are present.
            print(f" With Wikidata: {with_wikidata}")
        print(f" Without Wikidata: {len(brazilian) - with_wikidata}")
        print()
        print("✅ Batch 6 enrichment complete!")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()