#!/usr/bin/env python3
"""
Merge Batch 14 Wikidata enrichments into main GlobalGLAM dataset.

Date: 2025-11-11
Institutions: 3 Brazilian heritage custodians
"""
import yaml
from datetime import datetime, timezone
import shutil

# File paths
MAIN_FILE = 'data/instances/all/globalglam-20251111.yaml'
ENRICHMENT_FILE = 'data/instances/brazil/batch14_enriched.yaml'
BACKUP_SUFFIX = '.bak.batch14'


def load_yaml(filepath):
    """Load a YAML file and return its parsed content.

    Returns None for an empty file (yaml.safe_load semantics).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as block-style YAML.

    Keys are kept in insertion order (sort_keys=False) and non-ASCII
    text is written verbatim (allow_unicode=True).
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def merge_enrichments():
    """Merge Wikidata enrichments into main dataset.

    Side effects:
      - copies MAIN_FILE to a ``.bak.batch14`` backup before touching it;
      - appends a Wikidata identifier and an enrichment-history entry to
        each matched institution (skipping any that already carry a
        Wikidata identifier);
      - rewrites MAIN_FILE in place and prints a merge report plus the
        resulting Brazilian Wikidata coverage.
    """
    # Create backup
    print(f"Creating backup: {MAIN_FILE}{BACKUP_SUFFIX}")
    shutil.copy2(MAIN_FILE, f"{MAIN_FILE}{BACKUP_SUFFIX}")

    # Load data.  Guard against an empty YAML file, for which
    # yaml.safe_load returns None rather than a list.
    print("Loading main dataset...")
    main_data = load_yaml(MAIN_FILE) or []
    print("Loading enrichment data...")
    # NOTE(review): enrichment_data is loaded but never consulted — the
    # mapping below is hardcoded.  The load is kept so a missing
    # ENRICHMENT_FILE still fails loudly, as it did before.
    enrichment_data = load_yaml(ENRICHMENT_FILE)

    # Enrichment mapping: institution id -> Wikidata match metadata.
    enrichments = {
        'https://w3id.org/heritage/custodian/br/mg-ufmg-tainacan-lab': {
            'qid': 'Q132140',
            'label': 'Federal University of Minas Gerais',
            'description': 'public, federal university in Belo Horizonte, state of Minas Gerais, Brazil',
            'confidence': 0.90
        },
        'https://w3id.org/heritage/custodian/br/mg-mm-gerdau': {
            'qid': 'Q10333730',
            'label': 'MM Gerdau - Mines and Metal Museum',
            'description': 'museum in Belo Horizonte, Brazil',
            'confidence': 0.95
        },
        'https://w3id.org/heritage/custodian/br/pb-pedra-do-ing': {
            'qid': 'Q3076249',
            'label': 'Ingá Stone',
            'description': 'archaeological site in Ingá, Brazil',
            'confidence': 0.95
        }
    }

    # Track merge statistics.  A single timestamp is reused so every
    # record merged in this run carries the same enrichment date.
    merged_count = 0
    timestamp = datetime.now(timezone.utc).isoformat()

    # Process each institution
    for institution in main_data:
        inst_id = institution.get('id')
        if inst_id in enrichments:
            enrichment = enrichments[inst_id]

            # Check if Wikidata identifier already exists — never
            # overwrite or duplicate a manually curated QID.
            has_wikidata = False
            if institution.get('identifiers'):
                for ident in institution['identifiers']:
                    if ident.get('identifier_scheme') == 'Wikidata':
                        has_wikidata = True
                        print(f"⚠️ SKIPPING {institution['name']} - already has Wikidata: {ident.get('identifier_value')}")
                        break

            if not has_wikidata:
                # Add Wikidata identifier
                if not institution.get('identifiers'):
                    institution['identifiers'] = []
                institution['identifiers'].append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': enrichment['qid'],
                    'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}"
                })

                # Add enrichment history so provenance records how and
                # when the identifier was attached.
                if not institution.get('provenance'):
                    institution['provenance'] = {}
                if not institution['provenance'].get('enrichment_history'):
                    institution['provenance']['enrichment_history'] = []
                institution['provenance']['enrichment_history'].append({
                    'enrichment_date': timestamp,
                    'enrichment_method': 'Wikidata authenticated entity search (Batch 14)',
                    'enrichment_source': 'batch14_enriched.yaml',
                    'fields_enriched': ['identifiers.Wikidata'],
                    'wikidata_label': enrichment['label'],
                    'wikidata_description': enrichment['description'],
                    'confidence_score': enrichment['confidence']
                })

                # Update last_updated timestamp
                if institution.get('provenance'):
                    institution['provenance']['last_updated'] = timestamp

                print(f"✅ MERGED: {institution['name']} → {enrichment['qid']}")
                merged_count += 1

    # Check for institutions not found.  A single pass over main_data
    # builds the id set (was: an O(n·m) re-scan per enrichment key);
    # iteration over `enrichments` preserves insertion order.
    present_ids = {institution.get('id') for institution in main_data}
    not_found = [inst_id for inst_id in enrichments if inst_id not in present_ids]

    # Report results
    print(f"\n{'='*60}")
    print(f"BATCH 14 MERGE COMPLETE")
    print(f"{'='*60}")
    print(f"Total enrichments: {len(enrichments)}")
    print(f"Successfully merged: {merged_count}")
    print(f"Not found in dataset: {len(not_found)}")

    if not_found:
        print(f"\n⚠️ Institutions not found:")
        for inst_id in not_found:
            print(f" - {inst_id}")

    # Save updated dataset
    print(f"\nSaving updated dataset to {MAIN_FILE}...")
    save_yaml(main_data, MAIN_FILE)
    print(f"\n✅ Merge complete! Backup saved to {MAIN_FILE}{BACKUP_SUFFIX}")

    # Calculate new coverage
    print(f"\nCalculating new coverage...")
    br_institutions = [inst for inst in main_data
                       if inst.get('locations')
                       and any(loc.get('country') == 'BR' for loc in inst['locations'])]
    with_wikidata = sum(1 for inst in br_institutions
                        if inst.get('identifiers')
                        and any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers']))
    # Guard against ZeroDivisionError when the dataset contains no
    # Brazilian institutions (was: unconditional division).
    total_br = len(br_institutions)
    coverage_pct = (with_wikidata / total_br * 100) if total_br else 0.0
    print(f"Brazilian institutions with Wikidata: {with_wikidata}/{total_br} ({coverage_pct:.1f}%)")


if __name__ == '__main__':
    merge_enrichments()