glam/archive/scripts/brazil/enrich_brazil_batch9_v2.py
#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment (v2)
Handles both string (W3ID) and integer (numeric) ID formats.
Enriches only the 12 target institutions listed below; the full ~25MB master
dataset is still loaded once and written back in place.

Manual enrichment mappings (verified via Wikidata API search):
1. Museu Paulista (São Paulo) → Q1967613
2. Museu Casa de Rui Barbosa (Rio de Janeiro) → Q10333748
3. UnB BCE (Brasília) → Q63992447
4. MASP (São Paulo) → Q955815
5. MAX (Sergipe) → Q10333745 [Museu de Arqueologia de Xingó]
6. UFAL Natural History Museum (Maceió) → Q10333837
7. Arquivo Público DF (Brasília) → Q121787878
8. UFPA/Museu Goeldi (Belém) → Q3328425
9. Arquivo Blumenau (Santa Catarina) → Q56692473
10. Museu Palacinho (Palmas, TO) → Q10333511
11. Museu Nacional (Rio de Janeiro) → Q29464639
12. BDTD (National library aggregator) → Q111308625
Expected coverage increase: 31→43 institutions (14.6%→20.3%)
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Manual enrichment mappings - using both string and numeric formats
MANUAL_ENRICHMENT = {
    # W3ID-based institutions (string IDs)
    "https://w3id.org/heritage/custodian/br/sp-museu-paulista": "Q1967613",
    "https://w3id.org/heritage/custodian/br/museu-casa-de-rui-barbosa": "Q10333748",
    "https://w3id.org/heritage/custodian/br/sp-museu-de-arte-de-sao-paulo-masp": "Q955815",
    "https://w3id.org/heritage/custodian/br/rj-museu-nacional": "Q29464639",
    "https://w3id.org/heritage/custodian/br/to-museu-palacinho": "Q10333511",
    "https://w3id.org/heritage/custodian/br/biblioteca-digital-brasileira-de-teses-e-dissertacoes-bdtd": "Q111308625",
    # Numeric ID institutions (integer IDs in YAML)
    5705805630562475341: "Q63992447",     # UnB BCE
    4787009837825207539: "Q10333745",     # MAX
    7985639912449571292: "Q10333837",     # UFAL Natural History Museum
    17173642500678551557: "Q121787878",   # Arquivo Público DF
    2751224081959797921: "Q3328425",      # UFPA (Museu Goeldi)
    755574588323122895: "Q56692473",      # Arquivo Blumenau
    17833458964744491442: "Q10333511",    # Museu Palacinho (duplicate)
}
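

# --- Illustrative sketch (not called by this script) ------------------------
# The mappings above are documented as "verified via Wikidata API search".
# A minimal re-verification helper could look like the sketch below, assuming
# the public wbsearchentities endpoint; the function name and defaults are
# illustrative additions, not part of the original pipeline.
def search_wikidata_candidates(label, language='pt'):
    """Return candidate (Q-number, description) pairs for an institution label."""
    import json
    import urllib.parse
    import urllib.request

    params = urllib.parse.urlencode({
        'action': 'wbsearchentities',
        'search': label,
        'language': language,
        'type': 'item',
        'format': 'json',
    })
    with urllib.request.urlopen(f'https://www.wikidata.org/w/api.php?{params}') as resp:
        hits = json.load(resp).get('search', [])
    return [(hit['id'], hit.get('description', '')) for hit in hits]
# Example: search_wikidata_candidates('Museu Paulista') should list Q1967613
# among the candidates.
# -----------------------------------------------------------------------------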


def has_wikidata(inst):
    """Check if institution already has Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def add_wikidata_identifier(inst, q_number):
    """Add Wikidata identifier to institution."""
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Update provenance
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        inst['provenance']['extraction_method'] = (
            f"{old_method} + Wikidata enrichment (Batch 9 - citation priority)"
        ).strip(' +')
        enrichment_note = (
            f"Wikidata enrichment (Batch 9, {datetime.now(timezone.utc).strftime('%Y-%m-%d')}): "
            f"Added Q-number {q_number} based on citation frequency analysis and manual verification."
        )
        if inst['provenance'].get('notes'):
            inst['provenance']['notes'] = inst['provenance']['notes'].rstrip() + f"\n\n{enrichment_note}"
        else:
            inst['provenance']['notes'] = enrichment_note
    return inst
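
# For reference, an institution enriched by add_wikidata_identifier gains an
# identifiers entry of roughly this shape (field names taken from the code
# above; the Q-number shown is the Museu Paulista mapping as an example):
#
#   - identifier_scheme: Wikidata
#     identifier_value: Q1967613
#     identifier_url: https://www.wikidata.org/wiki/Q1967613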


def load_institutions_by_chunks(file_path, chunk_size=100):
    """
    Load the full institution list from YAML.

    Note: chunk_size is currently unused. yaml.safe_load_all streams YAML
    documents, not list items, so this single-document dump still has to be
    materialized in one safe_load call.
    """
    print(f"Loading institutions from: {file_path}")
    print("(This may take a minute for a 25MB file...)")
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            institutions = yaml.safe_load(f)
        except yaml.YAMLError as e:
            print(f"Error parsing YAML: {e}")
            return []
    print(f"Loaded {len(institutions):,} institutions")
    return institutions


def main():
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_file = Path(__file__).parent.parent / 'data' / 'instances' / 'brazil' / 'batch9_enriched.yaml'

    print("Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment (v2)")
    print("=" * 80)
    print("Target: Top 12 citation-priority institutions")
    print()
    # Load institutions
    institutions = load_institutions_by_chunks(data_file)
    if not institutions:
        print("❌ Failed to load institutions")
        return

    print()
    print("Searching for target institutions...")
    print("-" * 80)

    # Build lookup dictionary for fast access
    institutions_by_id = {}
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id is not None:
            institutions_by_id[inst_id] = inst

    # Process enrichment
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    enriched_institutions = []
    for inst_id, q_number in MANUAL_ENRICHMENT.items():
        if inst_id not in institutions_by_id:
            print(f"❌ NOT FOUND: {inst_id}")
            not_found_count += 1
            continue

        inst = institutions_by_id[inst_id]
        if has_wikidata(inst):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in inst.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"⏭️ SKIP: {inst.get('name', 'Unnamed')} - Already has {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier
        add_wikidata_identifier(inst, q_number)
        enriched_institutions.append(inst)
        city = inst.get('locations', [{}])[0].get('city', 'Unknown') if inst.get('locations') else 'Unknown'
        print(f"✅ ENRICHED: {inst.get('name', 'Unnamed')} ({city}) → {q_number}")
        enriched_count += 1

    print()
    print("Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Skipped (already has Wikidata): {skipped_count}")
    print(f"❌ Not found: {not_found_count}")
    print(f"📊 Success rate: {enriched_count}/{len(MANUAL_ENRICHMENT)} ({enriched_count/len(MANUAL_ENRICHMENT)*100:.1f}%)")

    # Save enriched institutions to separate file
    if enriched_count > 0:
        print()
        print(f"Saving enriched institutions to: {output_file}")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(enriched_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        # Save updated master dataset
        print(f"Saving updated master dataset to: {data_file}")
        backup_file = data_file.with_suffix('.yaml.batch9_backup')
        print(f"Creating backup: {backup_file}")
        shutil.copy2(data_file, backup_file)
        with open(data_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        # Calculate coverage
        brazilian = [inst for inst in institutions if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))]
        with_wikidata = sum(1 for inst in brazilian if has_wikidata(inst))

        print()
        print("Brazilian Institution Coverage:")
        print(f" Total Brazilian institutions: {len(brazilian):,}")
        print(f" With Wikidata Q-numbers: {with_wikidata:,} ({with_wikidata/len(brazilian)*100:.1f}%)")
        print(f" Without Wikidata: {len(brazilian) - with_wikidata:,}")

        print()
        print("✅ Batch 9 enrichment complete!")
        print()
        print("Files created:")
        print(f" - {output_file.name} (enriched institutions)")
        print(f" - {backup_file.name} (backup of original)")
        print()
        print("Next steps:")
        print(" 1. Review enriched institutions in batch9_enriched.yaml")
        print(" 2. Run validation checks")
        print(" 3. Commit changes to version control")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()