glam/archive/scripts/brazil/enrich_brazil_batch9.py
2025-11-19 23:25:22 +01:00

197 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment
Adds Wikidata Q-numbers to 12 top-priority Brazilian institutions identified
through citation frequency analysis. These are the most-cited institutions in
Brazilian GLAM research conversations.
Manual enrichment mappings (verified via Wikidata API search):
1. Museu Paulista (São Paulo) → Q1967613
2. Museu Casa de Rui Barbosa (Rio de Janeiro) → Q10333748
3. UnB BCE (Brasília) → Q63992447
4. MASP (São Paulo) → Q955815
5. MAX (Sergipe) → Q10333745 [Museu de Arqueologia de Xingó]
6. UFAL Natural History Museum (Maceió) → Q10333837
7. Arquivo Público DF (Brasília) → Q121787878
8. UFPA/Museu Goeldi (Belém) → Q3328425
9. Arquivo Blumenau (Santa Catarina) → Q56692473
10. Museu Palacinho (Palmas, TO) → Q10333511
11. Museu Nacional (Rio de Janeiro) → Q29464639
12. BDTD (National library aggregator) → Q111308625
Excluded institutions (no Wikidata Q-number found):
- Museu Sacaca (Macapá, Amapá) - Indigenous culture museum
- Museu da Borracha (Acre) - Rubber museum
Expected coverage increase: 31→43 institutions (14.6%→20.3%)
Success rate: 12/15 institutions (80%)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Manual enrichment mappings, verified by hand against the Wikidata API.
# Format: institution_id → Q-number.
# Two ID schemes coexist in the master dataset, so keys come in two shapes:
# w3id URIs (original Brazil extraction) and opaque numeric IDs (global
# enrichment v2.1). 13 mappings cover 12 institutions — Museu Palacinho
# appears under both ID schemes and is intentionally mapped twice.
MANUAL_ENRICHMENT = {
    # W3ID-based institutions (from original Brazil extraction)
    "https://w3id.org/heritage/custodian/br/sp-museu-paulista": "Q1967613",
    "https://w3id.org/heritage/custodian/br/museu-casa-de-rui-barbosa": "Q10333748",
    "https://w3id.org/heritage/custodian/br/sp-museu-de-arte-de-sao-paulo-masp": "Q955815",
    "https://w3id.org/heritage/custodian/br/rj-museu-nacional": "Q29464639",
    "https://w3id.org/heritage/custodian/br/to-museu-palacinho": "Q10333511",
    "https://w3id.org/heritage/custodian/br/biblioteca-digital-brasileira-de-teses-e-dissertacoes-bdtd": "Q111308625",
    # Numeric ID institutions (from global enrichment v2.1)
    "5705805630562475341": "Q63992447",  # UnB BCE (Brasília)
    "4787009837825207539": "Q10333745",  # MAX (Museu de Arqueologia de Xingó)
    "7985639912449571292": "Q10333837",  # UFAL Natural History Museum
    "17173642500678551557": "Q121787878",  # Arquivo Público DF
    "2751224081959797921": "Q3328425",  # UFPA (Museu Paraense Emílio Goeldi)
    "755574588323122895": "Q56692473",  # Arquivo Blumenau
    "17833458964744491442": "Q10333511",  # Museu Palacinho (duplicate of TO entry above)
    # Note: Museu Sacaca (1628860276197113272) and Museu da Borracha
    # (https://w3id.org/heritage/custodian/br/ac-museu-da-borracha) excluded
    # due to no Wikidata Q-number found
}
def has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier.

    Scans the institution's ``identifiers`` list (treated as empty when
    absent) for an entry whose ``identifier_scheme`` is ``'Wikidata'``.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def add_wikidata_identifier(inst, q_number):
    """Attach a Wikidata identifier to *inst* and record provenance.

    Mutates *inst* in place and also returns it for convenience.

    Args:
        inst: Institution record (dict) from the master YAML dataset.
        q_number: Wikidata Q-number string, e.g. ``'Q955815'``.

    Returns:
        The same *inst* dict, with the new identifier appended and, when a
        ``provenance`` section exists, its ``extraction_method`` and
        ``notes`` fields updated.
    """
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    })
    # Update provenance only when the record already tracks it.
    if 'provenance' in inst:
        prov = inst['provenance']
        old_method = prov.get('extraction_method', '')
        prov['extraction_method'] = (
            f"{old_method} + Wikidata enrichment (Batch 9 - manual verification, citation priority)"
        )
        # Build the enrichment note once (the original duplicated this
        # literal in both branches, inviting drift on future edits).
        today = datetime.now(timezone.utc).strftime('%Y-%m-%d')
        note = (
            f"Wikidata enrichment (Batch 9, {today}): Added Q-number {q_number} "
            f"based on citation frequency analysis and manual verification."
        )
        existing = prov.get('notes', '').rstrip()
        # Append below any pre-existing notes, separated by a blank line.
        prov['notes'] = f"{existing}\n\n{note}" if existing else note
    return inst
def main():
    """Run the Batch 9 enrichment end-to-end.

    Loads the master YAML dataset, writes a backup, applies every mapping
    in MANUAL_ENRICHMENT (skipping records that already have a Wikidata
    identifier), then writes both the enriched subset and the updated
    master file, and reports Brazilian coverage statistics.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_file = Path(__file__).parent.parent / 'data' / 'instances' / 'brazil' / 'batch9_enriched.yaml'
    backup_file = data_file.with_suffix('.batch9_backup')
    print("Brazilian Heritage Institutions - Batch 9 Wikidata Enrichment")
    print("=" * 80)
    print("Target: Top 12 citation-priority institutions from batch9_candidates_analysis")
    print()
    # Load data
    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # Create backup before mutating anything.
    print(f"Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print()
    print("Manual Enrichment Mappings (12 institutions):")
    print("-" * 80)
    # Build an id -> institution index once, instead of re-scanning the
    # whole dataset for every mapping (was O(n * m)).
    by_id = {inst.get('id'): inst for inst in institutions}
    # Process enrichment
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    enriched_institutions = []
    for inst_id, q_number in MANUAL_ENRICHMENT.items():
        matched = by_id.get(inst_id)
        if matched is None:
            print(f"❌ NOT FOUND: {inst_id}")
            not_found_count += 1
            continue
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"⏭️ SKIP: {matched.get('name', 'Unnamed')} - Already has {existing_q}")
            skipped_count += 1
            continue
        # Add Wikidata identifier (mutates the record in the master list).
        add_wikidata_identifier(matched, q_number)
        enriched_institutions.append(matched)
        city = matched.get('locations', [{}])[0].get('city', 'Unknown') if matched.get('locations') else 'Unknown'
        print(f"✅ ENRICHED: {matched.get('name', 'Unnamed')} ({city}) → {q_number}")
        enriched_count += 1
    print()
    print("Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Skipped (already has Wikidata): {skipped_count}")
    print(f"❌ Not found: {not_found_count}")
    print(f"📊 Success rate: {enriched_count}/{len(MANUAL_ENRICHMENT)} ({enriched_count/len(MANUAL_ENRICHMENT)*100:.1f}%)")
    # Save updated data only when something actually changed.
    if enriched_count > 0:
        print()
        print(f"Saving enriched institutions to: {output_file}")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(enriched_institutions, f, allow_unicode=True, sort_keys=False)
        print(f"Saving updated master dataset to: {data_file}")
        with open(data_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
        # Calculate new coverage; guard against an empty Brazilian subset
        # so the percentage line cannot raise ZeroDivisionError.
        brazilian = [inst for inst in institutions if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))]
        with_wikidata = sum(1 for inst in brazilian if has_wikidata(inst))
        coverage_pct = with_wikidata / len(brazilian) * 100 if brazilian else 0.0
        print()
        print("Brazilian Institution Coverage:")
        print(f" Total: {len(brazilian)}")
        print(f" With Wikidata: {with_wikidata} ({coverage_pct:.1f}%)")
        print(f" Without Wikidata: {len(brazilian) - with_wikidata}")
        print()
        print("✅ Batch 9 enrichment complete!")
        print()
        print("Next steps:")
        print(" 1. Review enriched institutions in batch9_enriched.yaml")
        print(" 2. Run validation checks")
        print(" 3. Commit changes to version control")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()