glam/archive/scripts/brazil/enrich_brazil_batch9_fast.py
#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment (Optimized Version)
Fast version that only extracts and enriches target institutions without loading
the entire 25MB master dataset into memory.
Manual enrichment mappings (verified via Wikidata API search):
1. Museu Paulista (São Paulo) → Q1967613
2. Museu Casa de Rui Barbosa (Rio de Janeiro) → Q10333748
3. UnB BCE (Brasília) → Q63992447
4. MASP (São Paulo) → Q955815
5. MAX (Sergipe) → Q10333745 [Museu de Arqueologia de Xingó]
6. UFAL Natural History Museum (Maceió) → Q10333837
7. Arquivo Público DF (Brasília) → Q121787878
8. UFPA/Museu Goeldi (Belém) → Q3328425
9. Arquivo Blumenau (Santa Catarina) → Q56692473
10. Museu Palacinho (Palmas, TO) → Q10333511
11. Museu Nacional (Rio de Janeiro) → Q29464639
12. BDTD (National library aggregator) → Q111308625
Expected coverage increase: 31→43 institutions (14.6%→20.3%)
Success rate: 12/15 institutions (80%)
"""
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Manual enrichment mappings (keys are dataset institution IDs). Note that
# Q10333511 (Museu Palacinho) appears under both a w3id key and a numeric
# key, so the dict has 13 entries covering 12 distinct institutions.
MANUAL_ENRICHMENT = {
    # W3ID-based institutions
    "https://w3id.org/heritage/custodian/br/sp-museu-paulista": "Q1967613",
    "https://w3id.org/heritage/custodian/br/museu-casa-de-rui-barbosa": "Q10333748",
    "https://w3id.org/heritage/custodian/br/sp-museu-de-arte-de-sao-paulo-masp": "Q955815",
    "https://w3id.org/heritage/custodian/br/rj-museu-nacional": "Q29464639",
    "https://w3id.org/heritage/custodian/br/to-museu-palacinho": "Q10333511",
    "https://w3id.org/heritage/custodian/br/biblioteca-digital-brasileira-de-teses-e-dissertacoes-bdtd": "Q111308625",
    # Numeric ID institutions
    "5705805630562475341": "Q63992447",
    "4787009837825207539": "Q10333745",
    "7985639912449571292": "Q10333837",
    "17173642500678551557": "Q121787878",
    "2751224081959797921": "Q3328425",
    "755574588323122895": "Q56692473",
    "17833458964744491442": "Q10333511",
}


def has_wikidata(inst):
    """Check whether an institution already has a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )
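
# Illustrative shape of the identifier entries this predicate matches
# (the same shape that add_wikidata_identifier() below produces):
#   {'identifier_scheme': 'Wikidata',
#    'identifier_value': 'Q955815',
#    'identifier_url': 'https://www.wikidata.org/wiki/Q955815'}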


def add_wikidata_identifier(inst, q_number):
    """Add a Wikidata identifier to an institution and update its provenance."""
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    }
    inst.setdefault('identifiers', []).append(wikidata_id)

    # Update provenance: extend the extraction method and append a dated note.
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        inst['provenance']['extraction_method'] = (
            f"{old_method} + Wikidata enrichment "
            f"(Batch 9 - manual verification, citation priority)"
        )
        note = (
            f"Wikidata enrichment (Batch 9, "
            f"{datetime.now(timezone.utc).strftime('%Y-%m-%d')}): "
            f"Added Q-number {q_number} based on citation frequency analysis "
            f"and manual verification."
        )
        existing_notes = inst['provenance'].get('notes', '').rstrip()
        inst['provenance']['notes'] = (
            f"{existing_notes}\n\n{note}" if existing_notes else note
        )
    return inst
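
# Illustrative effect on provenance (hypothetical input):
#   inst = {'identifiers': [],
#           'provenance': {'extraction_method': 'CSV import'}}
#   add_wikidata_identifier(inst, 'Q955815')
#   # extraction_method -> "CSV import + Wikidata enrichment (Batch 9 -
#   # manual verification, citation priority)"; a dated note is appended
#   # to 'notes'.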


def stream_yaml_find_ids(file_path, target_ids):
    """
    Stream through the YAML file to locate target institutions.

    Returns a tuple (found, lines), where found is a list of
    (line_start, line_end, institution_dict) tuples and lines is the raw
    file content, kept for a possible in-place update.
    """
    print(f"Streaming through {file_path} to locate target institutions...")
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    found_institutions = []
    current_inst = []
    current_start = None
    in_institution = False

    def flush(line_end):
        """Parse the collected block and record it if its id is a target."""
        if not current_inst or current_start is None:
            return
        try:
            parsed = yaml.safe_load(''.join(current_inst))
        except yaml.YAMLError:
            return
        # Each block starts with "- id:", so safe_load yields a
        # single-element list; unwrap it to get the institution mapping.
        if isinstance(parsed, list) and parsed:
            parsed = parsed[0]
        if isinstance(parsed, dict) and parsed.get('id') in target_ids:
            found_institutions.append((current_start, line_end, parsed))

    for i, line in enumerate(lines):
        # Detect the start of a new institution (top-level list item "- id:")
        if line.startswith('- id:'):
            flush(i - 1)  # save the previous institution, if any
            current_inst = [line]
            current_start = i
            in_institution = True
        elif in_institution:
            # Any other non-indented, non-list line ends the current block
            if line and line[0] not in (' ', '\t', '\n', '\r') and not line.startswith('- '):
                flush(i - 1)
                current_inst = []
                current_start = None
                in_institution = False
            else:
                current_inst.append(line)

    # Handle the last institution in the file
    flush(len(lines) - 1)
    return found_institutions, lines
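
# The scanner above assumes the master dataset is a top-level YAML list of
# institution mappings (illustrative shape, inferred from the parsing logic):
#
#   - id: https://w3id.org/heritage/custodian/br/sp-museu-paulista
#     name: ...
#     identifiers: [...]
#     provenance:
#       extraction_method: ...
#       notes: ...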


def main():
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_file = Path(__file__).parent.parent / 'data' / 'instances' / 'brazil' / 'batch9_enriched.yaml'

    print("Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment (Fast)")
    print("=" * 80)
    print("Target: Top 12 citation-priority institutions")
    print()

    # Find target institutions without loading the entire file as YAML
    target_ids = set(MANUAL_ENRICHMENT.keys())
    found_institutions, file_lines = stream_yaml_find_ids(data_file, target_ids)
    print(f"Found {len(found_institutions)}/{len(target_ids)} target institutions in dataset")
    print()
    print("Manual Enrichment Mappings:")
    print("-" * 80)

    # Process enrichment
    enriched_count = 0
    skipped_count = 0
    not_found_ids = set(target_ids)
    enriched_institutions = []
    # file_lines and modifications are retained for a future in-place update
    # of the master file; neither is used further in the current workflow.
    modifications = []

    for line_start, line_end, inst in found_institutions:
        inst_id = inst.get('id')
        not_found_ids.discard(inst_id)
        q_number = MANUAL_ENRICHMENT[inst_id]

        if has_wikidata(inst):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in inst.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"⏭️ SKIP: {inst.get('name', 'Unnamed')} - Already has {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier
        add_wikidata_identifier(inst, q_number)
        enriched_institutions.append(inst)
        modifications.append((line_start, line_end, inst))

        city = inst.get('locations', [{}])[0].get('city', 'Unknown') if inst.get('locations') else 'Unknown'
        print(f"✅ ENRICHED: {inst.get('name', 'Unnamed')} ({city}) → {q_number}")
        enriched_count += 1

    # Report not found
    for not_found_id in not_found_ids:
        print(f"❌ NOT FOUND: {not_found_id}")

    print()
    print("Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Skipped (already has Wikidata): {skipped_count}")
    print(f"❌ Not found: {len(not_found_ids)}")
    # The denominator counts mapping keys (13), two of which point to the
    # same institution (Q10333511), rather than the 12 distinct institutions.
    print(f"📊 Success rate: {enriched_count}/{len(MANUAL_ENRICHMENT)} "
          f"({enriched_count / len(MANUAL_ENRICHMENT) * 100:.1f}%)")

    # Save results
    if enriched_count > 0:
        print()
        print(f"Saving enriched institutions to: {output_file}")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(enriched_institutions, f, allow_unicode=True,
                      sort_keys=False, default_flow_style=False)
        print(f"✅ Saved {len(enriched_institutions)} enriched institutions")
        print()
        print("⚠️ Note: Master dataset NOT updated (25MB file - manual merge recommended)")
        print()
        print("Next steps:")
        print(" 1. Review enriched institutions in batch9_enriched.yaml")
        print(" 2. Use merge script or manual process to update master dataset")
        print(" 3. Run validation checks")
        print(" 4. Commit changes to version control")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()