glam/scripts/enrich_mexico_batch01.py
2025-12-09 09:16:19 +01:00

190 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Mexican Wikidata Enrichment - Batch 1
Add Wikidata identifiers to 6 national priority institutions.
Institutions:
1. Museo Nacional de Antropología (Q524249, VIAF: 139462066)
2. Museo Nacional de Arte (Q1138147, VIAF: 137951343)
3. Biblioteca Nacional de México (Q5495070, ISIL: MX-MXBN, VIAF: 147873206)
4. Cineteca Nacional (Q1092492)
5. Fototeca Nacional (Q66432183)
6. Instituto Nacional de Antropología e Historia (Q901361, VIAF: 139735572)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Define enrichment data.
# Maps an institution's exact YAML `name` to the external identifiers to
# attach. Keys per entry:
#   wikidata   - Wikidata Q-number (present for every entry)
#   viaf       - VIAF authority ID (optional)
#   isil       - ISIL library code (optional; only Biblioteca Nacional here)
#   confidence - match confidence score in [0, 1]
#   notes      - free-text description of how the match was verified
BATCH_1_ENRICHMENTS = {
    "Museo Nacional de Antropología": {
        "wikidata": "Q524249",
        "viaf": "139462066",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match"
    },
    "Museo Nacional de Arte (MUNAL)": {
        "wikidata": "Q1138147",
        "viaf": "137951343",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match"
    },
    "Biblioteca Nacional de México": {
        "wikidata": "Q5495070",
        "viaf": "147873206",
        "isil": "MX-MXBN",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query, VIAF, and ISIL registry match"
    },
    "Cineteca Nacional": {
        "wikidata": "Q1092492",
        "confidence": 0.95,
        "notes": "Verified via SPARQL query"
    },
    "Fototeca Nacional": {
        "wikidata": "Q66432183",
        "confidence": 0.95,
        "notes": "Verified via SPARQL query"
    },
    "Instituto Nacional de Antropología e Historia": {
        "wikidata": "Q901361",
        "viaf": "139735572",
        "confidence": 0.98,
        "notes": "Verified via SPARQL query and VIAF match"
    }
}
def add_enrichment_identifiers(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata, VIAF, and ISIL identifiers to an institution record.

    Mutates ``institution`` in place and also returns it for convenience.
    Re-running on an already-enriched record does NOT duplicate identifier
    entries: a scheme already present in ``institution['identifiers']`` is
    skipped (the original version appended duplicates on every run).

    Args:
        institution: Institution record (a YAML mapping). May lack an
            'identifiers' or 'provenance' key, or have them set to None.
        enrichment: Mapping with optional 'wikidata', 'viaf', 'isil' keys,
            plus required 'confidence' and 'notes'.

    Returns:
        The same ``institution`` dict, with identifiers appended and a new
        entry recorded under provenance['enrichment_history'].
    """
    # Ensure the identifiers list exists (key may be absent or None in YAML).
    if institution.get('identifiers') is None:
        institution['identifiers'] = []
    identifiers = institution['identifiers']

    # Schemes already attached — used to keep this function idempotent.
    existing_schemes = {entry.get('identifier_scheme') for entry in identifiers}

    # (enrichment key, scheme label, URL template or None).
    # ISIL codes have no universal resolver URL, hence the None template.
    schemes = (
        ('wikidata', 'Wikidata', 'https://www.wikidata.org/wiki/{}'),
        ('viaf', 'VIAF', 'https://viaf.org/viaf/{}'),
        ('isil', 'ISIL', None),
    )
    for key, scheme, url_template in schemes:
        if key not in enrichment or scheme in existing_schemes:
            continue
        value = enrichment[key]
        entry = {
            'identifier_scheme': scheme,
            'identifier_value': value,
        }
        if url_template is not None:
            entry['identifier_url'] = url_template.format(value)
        identifiers.append(entry)

    # Record provenance metadata for this enrichment pass.
    provenance = institution.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + VIAF cross-reference',
        # Record every identifier supplied by this batch (same "Scheme:value"
        # format as the original implementation).
        'identifiers_added': [
            f"{scheme}:{enrichment[key]}"
            for key, scheme, _ in schemes
            if key in enrichment
        ],
        'confidence_score': enrichment['confidence'],
        'notes': enrichment['notes'],
    })
    return institution
def main(yaml_path: Path = Path(
        '/Users/kempersc/apps/glam/data/instances/mexico/'
        'mexican_institutions_geocoded.yaml')) -> None:
    """Main enrichment workflow.

    Loads the institutions YAML, applies BATCH_1_ENRICHMENTS to records
    whose `name` matches exactly, rewrites the file in place, and prints
    Wikidata-coverage statistics.

    Args:
        yaml_path: Location of the geocoded institutions YAML file.
            Defaults to the original hard-coded path for compatibility.
    """
    print("Loading Mexican institutions YAML file...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalize to [].
        institutions = yaml.safe_load(f) or []
    print(f"Loaded {len(institutions)} institutions")

    # Track enrichment statistics.
    enriched_count = 0
    matched_names = set()

    # Enrich each institution whose name appears in the batch table.
    for institution in institutions:
        institution_name = institution.get('name', '')
        enrichment = BATCH_1_ENRICHMENTS.get(institution_name)
        if enrichment is None:
            continue
        print(f"\nEnriching: {institution_name}")
        print(f" - Wikidata: {enrichment.get('wikidata', 'N/A')}")
        print(f" - VIAF: {enrichment.get('viaf', 'N/A')}")
        print(f" - ISIL: {enrichment.get('isil', 'N/A')}")
        # Mutates the record in place (return value is the same object).
        add_enrichment_identifiers(institution, enrichment)
        enriched_count += 1
        matched_names.add(institution_name)

    # Report batch entries whose names never matched a YAML record
    # (single pass over the batch keys instead of re-scanning the file).
    not_found = [name for name in BATCH_1_ENRICHMENTS if name not in matched_names]
    if not_found:
        print("\n⚠️ Warning: Could not find these institutions in YAML:")
        for name in not_found:
            print(f" - {name}")

    # Save enriched YAML back to the same file.
    print(f"\n✅ Enriched {enriched_count} institutions")
    print(f"Writing to {yaml_path}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  width=1000)
    print("✅ Batch 1 enrichment complete!")

    # Generate coverage statistics. `or []` guards records whose
    # 'identifiers' key exists but is None (valid YAML for an empty field).
    total_wikidata = sum(
        1 for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in (inst.get('identifiers') or []))
    )
    coverage = (total_wikidata / len(institutions)) * 100 if institutions else 0.0
    print(f"\n📊 Current Wikidata Coverage:")
    print(f" {total_wikidata}/{len(institutions)} institutions ({coverage:.1f}%)")


if __name__ == '__main__':
    main()