#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment

Adds Wikidata Q-numbers to 8 major Brazilian institutions:

1. Museu Histórico Nacional (Rio de Janeiro) → Q510993
2. Casa de Oswaldo Cruz (Rio de Janeiro/Fiocruz) → Q56693275
3. Museu da República (Rio de Janeiro) → Q56695450
4. Arquivo Nacional (Rio de Janeiro) → Q2860546
5. Arquivo Público Mineiro (Belo Horizonte) → Q16494004
6. Museu Afro Brasil (São Paulo) → Q10333377
7. Memorial da América Latina (São Paulo) → Q1920091
8. Museu do Amanhã (Rio de Janeiro) → Q10333874

Coverage: 22 → 30 institutions (22.7% → 30.9%)
"""

import json
from datetime import datetime, timezone
from pathlib import Path

# Institution mappings: (name_fragment, city_fragment) → Q-number.
# Fragments are matched as case-insensitive substrings against each
# institution's 'name' and the 'city' of its 'locations' entries.
BATCH_6_MAPPINGS: list[tuple[str, str, str]] = [
    # Rio de Janeiro institutions
    ("Museu Histórico Nacional", "Rio de Janeiro", "Q510993"),
    ("Casa de Oswaldo Cruz", "Rio de Janeiro", "Q56693275"),
    ("Museu da República", "Rio de Janeiro", "Q56695450"),
    ("Arquivo Nacional", "Rio de Janeiro", "Q2860546"),
    ("Museu do Amanhã", "Rio de Janeiro", "Q10333874"),

    # Minas Gerais
    ("Arquivo Público Mineiro", "Belo Horizonte", "Q16494004"),

    # São Paulo
    ("Museu Afro Brasil", "São Paulo", "Q10333377"),
    ("Memorial da América Latina", "São Paulo", "Q1920091"),
]
def matches_institution(inst, name_fragment, city_fragment):
    """Return True if *inst* matches both the name and city fragments.

    Both checks are case-insensitive substring tests: *name_fragment*
    against the institution's 'name', and *city_fragment* against the
    'city' of any entry in its 'locations' list.
    """
    # Guard clause: no point scanning locations when the name misses.
    if name_fragment.lower() not in inst.get('name', '').lower():
        return False

    wanted_city = city_fragment.lower()
    return any(
        wanted_city in loc.get('city', '').lower()
        for loc in inst.get('locations', [])
    )
def has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    for record in inst.get('identifiers', []):
        if record.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def add_wikidata_identifier(inst, q_number):
    """Append a Wikidata identifier record to *inst* (mutates in place).

    Args:
        inst: Institution dict. An 'identifiers' list is created if absent.
        q_number: Wikidata Q-number string, e.g. 'Q510993'.

    Returns:
        The same (mutated) institution dict, for convenience at call sites.
    """
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    # setdefault handles institutions with no 'identifiers' key in one step
    # (replaces the manual key-check-then-create pattern).
    inst.setdefault('identifiers', []).append(wikidata_id)

    # Record the enrichment step in provenance, when the record tracks it.
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        inst['provenance']['extraction_method'] = (
            f"{old_method} + Wikidata enrichment (Batch 6 - manual verification)"
        )

    return inst
def main():
    """Apply Batch 6 Wikidata enrichment to the merged institutions file.

    Loads the merged JSON dataset, writes a backup, enriches every
    institution matched by BATCH_6_MAPPINGS that does not already have a
    Wikidata identifier, then saves the file and prints coverage stats.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'global' / 'global_heritage_institutions_merged.json'
    backup_file = data_file.with_suffix('.batch6_backup')

    print("Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment")
    print("=" * 70)
    print()

    # Load data
    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    # Create backup before mutating anything
    print(f"Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, indent=2, ensure_ascii=False)

    print()
    print("Enrichment Mappings:")
    print("-" * 70)

    # Process each mapping
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for name_fragment, city_fragment, q_number in BATCH_6_MAPPINGS:
        # Find the first matching institution
        matched = next(
            (inst for inst in institutions
             if matches_institution(inst, name_fragment, city_fragment)),
            None
        )

        if not matched:
            print(f"❌ NOT FOUND: {name_fragment} ({city_fragment})")
            not_found_count += 1
            continue

        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"⏭️ SKIP: {matched['name']} - Already has {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier
        add_wikidata_identifier(matched, q_number)
        # BUGFIX: `.get('locations', [{}])[0]` raised IndexError when the
        # key exists but the list is empty — `or` also covers that case.
        locations = matched.get('locations') or [{}]
        city = locations[0].get('city', 'Unknown')
        print(f"✅ ENRICHED: {matched['name']} ({city}) → {q_number}")
        enriched_count += 1

    print()
    print("Summary:")
    print("-" * 70)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Skipped (already has Wikidata): {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Save updated data only when something actually changed
    if enriched_count > 0:
        print()
        print(f"Saving updated data to: {data_file}")
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(institutions, f, indent=2, ensure_ascii=False)

        # Calculate new coverage across Brazilian institutions
        brazilian = [
            inst for inst in institutions
            if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
        ]
        with_wikidata = sum(1 for inst in brazilian if has_wikidata(inst))

        print()
        print("Brazilian Institution Coverage:")
        print(f" Total: {len(brazilian)}")
        # BUGFIX: guard against ZeroDivisionError when no BR institutions.
        if brazilian:
            print(f" With Wikidata: {with_wikidata} ({with_wikidata/len(brazilian)*100:.1f}%)")
        else:
            print(f" With Wikidata: {with_wikidata}")
        print(f" Without Wikidata: {len(brazilian) - with_wikidata}")

        print()
        print("✅ Batch 6 enrichment complete!")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()