#!/usr/bin/env python3
"""
Merge Batch 14 Wikidata enrichments into main GlobalGLAM dataset.

Date: 2025-11-11
Institutions: 3 Brazilian heritage custodians
"""
|
|
|
|
import yaml
from datetime import datetime, timezone
import shutil

# File paths
# Dataset that is updated in place by this script.
MAIN_FILE = 'data/instances/all/globalglam-20251111.yaml'
# Batch 14 enrichment file; loaded for validation, but the enrichment
# payload itself is inlined in merge_enrichments() below.
ENRICHMENT_FILE = 'data/instances/brazil/batch14_enriched.yaml'
# Appended to MAIN_FILE to name the pre-merge backup copy.
BACKUP_SUFFIX = '.bak.batch14'
|
|
|
|
def load_yaml(filepath):
    """Read and parse the YAML document at *filepath*.

    Uses ``yaml.safe_load``, so only plain YAML tags are accepted;
    returns the deserialized Python object (or None for an empty file).
    """
    with open(filepath, encoding='utf-8') as fh:
        return yaml.safe_load(fh)
|
|
|
|
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as YAML.

    Block style (no inline flow collections), unicode characters kept
    as-is, and key order preserved rather than sorted.
    """
    with open(filepath, 'w', encoding='utf-8') as fh:
        yaml.dump(
            data,
            fh,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
|
|
|
def _apply_enrichment(institution, enrichment, timestamp):
    """Attach a Wikidata identifier and a provenance entry to *institution*.

    Mutates *institution* in place. Returns True when the enrichment was
    applied, False when the institution already carries a Wikidata
    identifier (in which case nothing is modified).
    """
    # Skip institutions that already have a Wikidata identifier.
    for ident in institution.get('identifiers') or []:
        if ident.get('identifier_scheme') == 'Wikidata':
            print(f"⚠️ SKIPPING {institution['name']} - already has Wikidata: {ident.get('identifier_value')}")
            return False

    # Add the Wikidata identifier (creating the list if absent/empty).
    if not institution.get('identifiers'):
        institution['identifiers'] = []
    institution['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['qid'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}",
    })

    # Record the enrichment in provenance history.
    if not institution.get('provenance'):
        institution['provenance'] = {}
    provenance = institution['provenance']
    if not provenance.get('enrichment_history'):
        provenance['enrichment_history'] = []
    provenance['enrichment_history'].append({
        'enrichment_date': timestamp,
        'enrichment_method': 'Wikidata authenticated entity search (Batch 14)',
        'enrichment_source': 'batch14_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
        'confidence_score': enrichment['confidence'],
    })
    # provenance was ensured above, so last_updated is always stamped.
    provenance['last_updated'] = timestamp

    print(f"✅ MERGED: {institution['name']} → {enrichment['qid']}")
    return True


def merge_enrichments():
    """Merge Batch 14 Wikidata enrichments into the main dataset.

    Steps:
      1. Back up MAIN_FILE (``shutil.copy2`` preserves file metadata).
      2. Load the main dataset (and the enrichment file, as a sanity
         check that it exists and parses — the payload is inlined here).
      3. For each targeted institution, add a Wikidata identifier and an
         enrichment-history provenance entry, skipping institutions that
         already have one.
      4. Save the dataset in place and print merge + coverage statistics.
    """
    # Create backup before touching anything.
    print(f"Creating backup: {MAIN_FILE}{BACKUP_SUFFIX}")
    shutil.copy2(MAIN_FILE, f"{MAIN_FILE}{BACKUP_SUFFIX}")

    print("Loading main dataset...")
    # An empty YAML file deserializes to None; normalize to an empty
    # list so the loops below are safe instead of crashing.
    main_data = load_yaml(MAIN_FILE) or []

    print("Loading enrichment data...")
    # Loaded for validation only; values are not read from it here.
    enrichment_data = load_yaml(ENRICHMENT_FILE)

    # Enrichment mapping: institution id -> hand-verified Wikidata match.
    enrichments = {
        'https://w3id.org/heritage/custodian/br/mg-ufmg-tainacan-lab': {
            'qid': 'Q132140',
            'label': 'Federal University of Minas Gerais',
            'description': 'public, federal university in Belo Horizonte, state of Minas Gerais, Brazil',
            'confidence': 0.90
        },
        'https://w3id.org/heritage/custodian/br/mg-mm-gerdau': {
            'qid': 'Q10333730',
            'label': 'MM Gerdau - Mines and Metal Museum',
            'description': 'museum in Belo Horizonte, Brazil',
            'confidence': 0.95
        },
        'https://w3id.org/heritage/custodian/br/pb-pedra-do-ing': {
            'qid': 'Q3076249',
            'label': 'Ingá Stone',
            'description': 'archaeological site in Ingá, Brazil',
            'confidence': 0.95
        }
    }

    # Track merge statistics.
    merged_count = 0
    seen_ids = set()  # every institution id present in the dataset
    timestamp = datetime.now(timezone.utc).isoformat()

    # Process each institution in a single pass.
    for institution in main_data:
        inst_id = institution.get('id')
        seen_ids.add(inst_id)
        enrichment = enrichments.get(inst_id)
        if enrichment is not None and _apply_enrichment(institution, enrichment, timestamp):
            merged_count += 1

    # Enrichment targets whose id never appeared in the dataset
    # (set lookup replaces the previous O(n*m) nested scan).
    not_found = [inst_id for inst_id in enrichments if inst_id not in seen_ids]

    # Report results.
    print(f"\n{'='*60}")
    print("BATCH 14 MERGE COMPLETE")
    print(f"{'='*60}")
    print(f"Total enrichments: {len(enrichments)}")
    print(f"Successfully merged: {merged_count}")
    print(f"Not found in dataset: {len(not_found)}")

    if not_found:
        print("\n⚠️ Institutions not found:")
        for inst_id in not_found:
            print(f"  - {inst_id}")

    # Save updated dataset.
    print(f"\nSaving updated dataset to {MAIN_FILE}...")
    save_yaml(main_data, MAIN_FILE)

    print(f"\n✅ Merge complete! Backup saved to {MAIN_FILE}{BACKUP_SUFFIX}")

    # Calculate new coverage for Brazilian institutions.
    print("\nCalculating new coverage...")
    br_institutions = [
        inst for inst in main_data
        if any(loc.get('country') == 'BR' for loc in inst.get('locations') or [])
    ]
    with_wikidata = sum(
        1 for inst in br_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers') or [])
    )

    if br_institutions:
        pct = with_wikidata / len(br_institutions) * 100
        print(f"Brazilian institutions with Wikidata: {with_wikidata}/{len(br_institutions)} ({pct:.1f}%)")
    else:
        # Guard against ZeroDivisionError on a dataset with no BR entries.
        print("Brazilian institutions with Wikidata: 0/0 (n/a)")
|
|
|
|
# Script entry point: run the merge only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    merge_enrichments()
|