glam/merge_batch14.py
2025-11-19 23:25:22 +01:00

156 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Merge Batch 14 Wikidata enrichments into main GlobalGLAM dataset.
Date: 2025-11-11
Institutions: 3 Brazilian heritage custodians
"""
import yaml
from datetime import datetime, timezone
import shutil
# File paths
# Main GlobalGLAM dataset: a YAML list of institution records (mutated in place).
MAIN_FILE = 'data/instances/all/globalglam-20251111.yaml'
# Batch 14 enrichment source file (loaded for validation; the mapping itself
# is hard-coded inside merge_enrichments()).
ENRICHMENT_FILE = 'data/instances/brazil/batch14_enriched.yaml'
# Appended to MAIN_FILE to name the pre-merge backup copy.
BACKUP_SUFFIX = '.bak.batch14'
def load_yaml(filepath):
    """Parse *filepath* as UTF-8 YAML and return the resulting object."""
    with open(filepath, encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)
    return parsed
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as UTF-8 YAML, preserving key order."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            default_flow_style=False,  # block style: one key per line
            allow_unicode=True,        # keep accented names readable
            sort_keys=False,           # preserve the dataset's key order
        )
def _existing_wikidata_qid(institution):
    """Return the institution's existing Wikidata QID, or None if absent."""
    for ident in institution.get('identifiers') or []:
        if ident.get('identifier_scheme') == 'Wikidata':
            return ident.get('identifier_value')
    return None


def _apply_enrichment(institution, enrichment, timestamp):
    """Attach the Wikidata identifier and a provenance history entry in place.

    Args:
        institution: one record from the main dataset (mutated).
        enrichment: dict with 'qid', 'label', 'description', 'confidence'.
        timestamp: ISO-8601 UTC timestamp recorded in the provenance entry.
    """
    # .get() guards both a missing key and an explicit None/empty value.
    if not institution.get('identifiers'):
        institution['identifiers'] = []
    institution['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['qid'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}"
    })
    if not institution.get('provenance'):
        institution['provenance'] = {}
    if not institution['provenance'].get('enrichment_history'):
        institution['provenance']['enrichment_history'] = []
    institution['provenance']['enrichment_history'].append({
        'enrichment_date': timestamp,
        'enrichment_method': 'Wikidata authenticated entity search (Batch 14)',
        'enrichment_source': 'batch14_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
        'confidence_score': enrichment['confidence']
    })
    institution['provenance']['last_updated'] = timestamp


def merge_enrichments():
    """Merge Batch 14 Wikidata enrichments into the main dataset.

    Backs up MAIN_FILE, adds a Wikidata identifier plus an enrichment-history
    entry to each matched institution (skipping any that already carry one),
    reports merge statistics, saves the dataset, and prints updated coverage
    for Brazilian institutions.
    """
    # Create a backup before mutating the dataset in place.
    print(f"Creating backup: {MAIN_FILE}{BACKUP_SUFFIX}")
    shutil.copy2(MAIN_FILE, f"{MAIN_FILE}{BACKUP_SUFFIX}")

    print("Loading main dataset...")
    main_data = load_yaml(MAIN_FILE)
    # NOTE(review): the enrichment file is loaded (so a missing file fails
    # fast) but its contents are never read — the mapping below is inlined.
    print("Loading enrichment data...")
    enrichment_data = load_yaml(ENRICHMENT_FILE)

    # Hard-coded Batch 14 enrichment mapping, keyed by institution id.
    enrichments = {
        'https://w3id.org/heritage/custodian/br/mg-ufmg-tainacan-lab': {
            'qid': 'Q132140',
            'label': 'Federal University of Minas Gerais',
            'description': 'public, federal university in Belo Horizonte, state of Minas Gerais, Brazil',
            'confidence': 0.90
        },
        'https://w3id.org/heritage/custodian/br/mg-mm-gerdau': {
            'qid': 'Q10333730',
            'label': 'MM Gerdau - Mines and Metal Museum',
            'description': 'museum in Belo Horizonte, Brazil',
            'confidence': 0.95
        },
        'https://w3id.org/heritage/custodian/br/pb-pedra-do-ing': {
            'qid': 'Q3076249',
            'label': 'Ingá Stone',
            'description': 'archaeological site in Ingá, Brazil',
            'confidence': 0.95
        }
    }

    # Track merge statistics.
    merged_count = 0
    found_ids = set()  # enrichment ids matched in main_data (merged or skipped)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Process each institution; guard clauses keep the happy path flat.
    for institution in main_data:
        inst_id = institution.get('id')
        if inst_id not in enrichments:
            continue
        found_ids.add(inst_id)
        enrichment = enrichments[inst_id]

        existing_qid = _existing_wikidata_qid(institution)
        if existing_qid is not None:
            print(f"⚠️ SKIPPING {institution['name']} - already has Wikidata: {existing_qid}")
            continue

        _apply_enrichment(institution, enrichment, timestamp)
        # Fix: the original log fused name and QID with no separator.
        print(f"✅ MERGED: {institution['name']} → {enrichment['qid']}")
        merged_count += 1

    # Ids requested in `enrichments` but absent from the dataset, in
    # mapping order (single pass instead of rescanning main_data per id).
    not_found = [i for i in enrichments if i not in found_ids]

    # Report results.
    print(f"\n{'='*60}")
    print(f"BATCH 14 MERGE COMPLETE")
    print(f"{'='*60}")
    print(f"Total enrichments: {len(enrichments)}")
    print(f"Successfully merged: {merged_count}")
    print(f"Not found in dataset: {len(not_found)}")
    if not_found:
        print(f"\n⚠️ Institutions not found:")
        for inst_id in not_found:
            print(f" - {inst_id}")

    # Save updated dataset.
    print(f"\nSaving updated dataset to {MAIN_FILE}...")
    save_yaml(main_data, MAIN_FILE)
    print(f"\n✅ Merge complete! Backup saved to {MAIN_FILE}{BACKUP_SUFFIX}")

    # Calculate new coverage for Brazilian institutions.
    print(f"\nCalculating new coverage...")
    br_institutions = [
        inst for inst in main_data
        if inst.get('locations')
        and any(loc.get('country') == 'BR' for loc in inst['locations'])
    ]
    with_wikidata = sum(
        1 for inst in br_institutions
        if inst.get('identifiers')
        and any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers'])
    )
    # Fix: guard against ZeroDivisionError when the dataset has no BR records.
    if br_institutions:
        pct = with_wikidata / len(br_institutions) * 100
        print(f"Brazilian institutions with Wikidata: {with_wikidata}/{len(br_institutions)} ({pct:.1f}%)")
    else:
        print("Brazilian institutions with Wikidata: 0/0 (no Brazilian institutions found)")
# Script entry point: run the merge only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    merge_enrichments()