#!/usr/bin/env python3
"""
Merge Batch 16 enrichments into main GlobalGLAM dataset.

This script:
1. Backs up the current dataset
2. Loads Batch 16 enriched institutions
3. Updates 5 existing institution records with enrichment
4. Appends 1 new institution (Museu Casa de Rui Barbosa)
5. Saves merged dataset
"""
import shutil
from datetime import datetime
from pathlib import Path

import yaml

# Paths
BASE_DIR = Path(__file__).parent.parent
MAIN_DATASET = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111.yaml"
BATCH16_FILE = BASE_DIR / "data" / "instances" / "brazil" / "batch16_enriched.yaml"
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
# Timestamped so repeated runs never clobber an earlier backup.
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / (
    f"globalglam-20251111-pre-batch16-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"
)


def load_yaml(filepath):
    """Load YAML file and return the parsed data (safe loader)."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(data, filepath):
    """Save *data* to *filepath* as block-style YAML, preserving unicode and key order."""
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)


def find_institution_by_old_id(institutions, old_id_value):
    """Return the list index of the institution carrying OLD_ID *old_id_value*, or None."""
    for idx, inst in enumerate(institutions):
        for ident in inst.get('identifiers', []):
            if (ident.get('identifier_scheme') == 'OLD_ID'
                    and ident.get('identifier_value') == old_id_value):
                return idx
    return None


def _is_brazilian(inst):
    """True if any of the institution's locations has country code 'BR'."""
    return any(loc.get('country') == 'BR' for loc in inst.get('locations', []))


def _has_wikidata(inst):
    """True if the institution carries a Wikidata identifier."""
    return any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))


def _wikidata_id(inst):
    """Return the institution's Wikidata identifier value, or 'N/A' if absent."""
    return next((i.get('identifier_value') for i in inst.get('identifiers', [])
                 if i.get('identifier_scheme') == 'Wikidata'), 'N/A')


def _brazil_stats(institutions):
    """Return (total Brazilian institutions, Brazilian institutions with Wikidata)."""
    total = sum(1 for inst in institutions if _is_brazilian(inst))
    with_wikidata = sum(1 for inst in institutions
                        if _is_brazilian(inst) and _has_wikidata(inst))
    return total, with_wikidata


def _pct(numerator, denominator):
    """Percentage of numerator over denominator; 0.0 when the denominator is zero.

    Guards the report against ZeroDivisionError when there are no Brazilian
    institutions in the dataset.
    """
    return numerator / denominator * 100 if denominator else 0.0


def main():
    """Run the Batch 16 merge: back up, load, merge, report coverage, and save."""
    print("=" * 80)
    print("BRAZIL BATCH 16 MERGE")
    print("=" * 80)
    print()

    # 1. Back up the current dataset before touching anything.
    print(f"1. Creating backup: {BACKUP_FILE.name}")
    shutil.copy2(MAIN_DATASET, BACKUP_FILE)
    print(" ✓ Backup created")
    print()

    # 2. Load both datasets.
    print("2. Loading datasets...")
    main_data = load_yaml(MAIN_DATASET)
    batch16_data = load_yaml(BATCH16_FILE)
    print(f" ✓ Main dataset: {len(main_data)} institutions")
    print(f" ✓ Batch 16: {len(batch16_data)} institutions")
    print()

    # 3. Baseline Brazilian coverage numbers (for the before/after report).
    brazil_before, brazil_wikidata_before = _brazil_stats(main_data)
    print("3. Brazilian institutions before merge:")
    print(f" Total: {brazil_before}")
    print(f" With Wikidata: {brazil_wikidata_before}"
          f" ({_pct(brazil_wikidata_before, brazil_before):.1f}%)")
    print()

    # 4. Merge: an OLD_ID identifier marks an existing record to replace
    #    wholesale with the enriched version; no OLD_ID means a new record.
    print("4. Processing Batch 16 institutions:")
    updated_count = 0
    new_count = 0
    for inst in batch16_data:
        inst_name = inst.get('name')
        old_ids = [i.get('identifier_value') for i in inst.get('identifiers', [])
                   if i.get('identifier_scheme') == 'OLD_ID']
        if old_ids:
            # Existing institution - update it in place.
            old_id = old_ids[0]
            idx = find_institution_by_old_id(main_data, old_id)
            if idx is not None:
                main_data[idx] = inst
                updated_count += 1
                print(f" ✓ Updated: {inst_name} ({_wikidata_id(inst)})")
            else:
                print(f" ⚠ WARNING: Could not find institution with OLD_ID: {old_id}")
        else:
            # New institution - append it.
            main_data.append(inst)
            new_count += 1
            print(f" ✓ Added NEW: {inst_name} ({_wikidata_id(inst)})")
    print()
    print(f" Summary: {updated_count} updated, {new_count} new")
    print()

    # 5. Post-merge Brazilian coverage numbers.
    brazil_after, brazil_wikidata_after = _brazil_stats(main_data)
    print("5. Brazilian institutions after merge:")
    print(f" Total: {brazil_after} (was {brazil_before},"
          f" +{brazil_after - brazil_before})")
    print(f" With Wikidata: {brazil_wikidata_after} (was {brazil_wikidata_before},"
          f" +{brazil_wikidata_after - brazil_wikidata_before})")
    print(f" Coverage: {_pct(brazil_wikidata_after, brazil_after):.1f}%"
          f" (was {_pct(brazil_wikidata_before, brazil_before):.1f}%)")
    print()

    # 6. Write the merged dataset to a NEW file; the original input is left
    #    untouched (the backup above covers the pre-merge state as well).
    print(f"6. Saving merged dataset: {OUTPUT_FILE.name}")
    save_yaml(main_data, OUTPUT_FILE)
    print(f" ✓ Saved {len(main_data)} institutions")
    print()
    print("=" * 80)
    print("MERGE COMPLETE")
    print("=" * 80)
    print()
    print(f"Backup: {BACKUP_FILE}")
    print(f"Output: {OUTPUT_FILE}")
    print()
    print("Next steps:")
    print("1. Validate merged dataset")
    print("2. Generate Batch 16 report")
    print("3. Decide if Batch 17 is needed (70% goal = 88/126)")


if __name__ == '__main__':
    main()