#!/usr/bin/env python3
"""
Merge Batch 15: Add 4 Bonus Brazilian Institutions to GlobalGLAM Dataset

This script adds 4 NEW institutions discovered during Batch 14 Wikidata
searches that were not previously present in the GlobalGLAM dataset.

Date: 2025-11-11
Batch: 15 (Bonus Institutions)
"""

import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml


def load_yaml(filepath):
    """Load a YAML file and return the deserialized data.

    Args:
        filepath: Path to the YAML file.

    Returns:
        The parsed YAML content (for this dataset, a list of dicts).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as YAML.

    Unicode is written as-is (allow_unicode) and insertion order is
    preserved (sort_keys=False) so diffs against the source stay minimal.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False)


def create_backup(filepath):
    """Create a batch-15 backup copy of *filepath*.

    The backup uses a fixed ``.bak.batch15`` suffix (one backup per batch,
    overwritten on re-run), not a timestamp.

    Returns:
        str: Path of the backup file that was written.
    """
    backup_path = f"{filepath}.bak.batch15"
    # Byte-for-byte copy; shutil avoids a read-decode/encode-write round trip.
    shutil.copyfile(filepath, backup_path)
    print(f"✅ Backup created: {backup_path}")
    return backup_path


def add_bonus_institutions(main_data, bonus_data):
    """Append new bonus institutions to the main dataset (in place).

    Any institution whose ``id`` is already present — either in *main_data*
    or earlier in *bonus_data* itself — is skipped with a warning, so the
    merge can never introduce duplicates.

    Args:
        main_data: Main GlobalGLAM dataset (list of institution dicts).
        bonus_data: Bonus institutions to add (list of institution dicts).

    Returns:
        tuple: (updated_data, additions_count). ``updated_data`` is the
        same list object as *main_data* (mutated in place).
    """
    additions = 0

    # Seed the seen-set from the main dataset to avoid accidental duplicates.
    existing_ids = {inst['id'] for inst in main_data}

    for bonus_inst in bonus_data:
        bonus_id = bonus_inst['id']

        # Double-check this is truly a new institution.
        if bonus_id in existing_ids:
            print(f"⚠️ WARNING: Institution already exists: {bonus_inst['name']} ({bonus_id})")
            print(f" Skipping to avoid duplicate...")
            continue

        # Add new institution and remember its id so a duplicate id later
        # in bonus_data is also caught (the original only checked main_data).
        main_data.append(bonus_inst)
        existing_ids.add(bonus_id)
        additions += 1
        print(f"✅ Added: {bonus_inst['name']} (Wikidata: {get_wikidata_id(bonus_inst)})")

    return main_data, additions


def get_wikidata_id(institution):
    """Return the Wikidata Q-number from an institution's identifiers.

    Args:
        institution: Institution dict with an optional ``identifiers`` list
            of ``{'identifier_scheme': ..., 'identifier_value': ...}`` dicts.

    Returns:
        The first identifier value whose scheme is ``'Wikidata'``, or None.
    """
    for identifier in institution.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return identifier.get('identifier_value')
    return None


def _brazilian_stats(data):
    """Return (brazilian_institutions, count_with_wikidata) for *data*.

    An institution counts as Brazilian when any of its ``locations`` has
    ``country == 'BR'``; "with Wikidata" means any identifier whose scheme
    is ``'Wikidata'``.
    """
    br_institutions = [
        i for i in data
        if any(loc.get('country') == 'BR' for loc in i.get('locations', []))
    ]
    br_with_wikidata = sum(
        1 for i in br_institutions
        if any(x.get('identifier_scheme') == 'Wikidata'
               for x in i.get('identifiers', []))
    )
    return br_institutions, br_with_wikidata


def _pct(part, whole):
    """Percentage of *part* over *whole*; 0.0 when *whole* is zero."""
    return (part / whole * 100) if whole else 0.0


def main():
    """Main execution: load, back up, merge, report, and save the dataset."""
    print("=" * 80)
    print("BATCH 15 MERGE: Adding Bonus Brazilian Institutions")
    print("=" * 80)
    print()

    # File paths
    main_file = Path("data/instances/all/globalglam-20251111.yaml")
    bonus_file = Path("data/instances/brazil/batch15_bonus_institutions.yaml")

    # Verify files exist
    if not main_file.exists():
        print(f"❌ ERROR: Main dataset not found: {main_file}")
        sys.exit(1)

    if not bonus_file.exists():
        print(f"❌ ERROR: Bonus institutions file not found: {bonus_file}")
        sys.exit(1)

    print(f"📂 Main dataset: {main_file}")
    print(f"📂 Bonus institutions: {bonus_file}")
    print()

    # Load data
    print("Loading datasets...")
    main_data = load_yaml(main_file)
    bonus_data = load_yaml(bonus_file)
    print(f"✅ Main dataset loaded: {len(main_data)} institutions")
    print(f"✅ Bonus institutions loaded: {len(bonus_data)} institutions")
    print()

    # Pre-merge statistics. The total count must be captured NOW:
    # add_bonus_institutions mutates main_data in place, so after the merge
    # len(main_data) == len(updated_data) and the growth would read as 0.
    total_before = len(main_data)
    br_institutions, br_with_wikidata = _brazilian_stats(main_data)

    print("📊 BEFORE MERGE:")
    print(f" Total institutions: {total_before}")
    print(f" Brazilian institutions: {len(br_institutions)}")
    print(f" Brazilian with Wikidata: {br_with_wikidata} "
          f"({_pct(br_with_wikidata, len(br_institutions)):.1f}%)")
    print()

    # Create backup
    print("Creating backup...")
    create_backup(main_file)
    print()

    # Add bonus institutions
    print("Adding bonus institutions...")
    print("-" * 80)
    updated_data, additions = add_bonus_institutions(main_data, bonus_data)
    print("-" * 80)
    print()

    # Post-merge statistics
    br_institutions_after, br_with_wikidata_after = _brazilian_stats(updated_data)

    print("📊 AFTER MERGE:")
    print(f" Total institutions: {len(updated_data)}")
    print(f" Brazilian institutions: {len(br_institutions_after)}")
    print(f" Brazilian with Wikidata: {br_with_wikidata_after} "
          f"({_pct(br_with_wikidata_after, len(br_institutions_after)):.1f}%)")
    print()

    print("📈 CHANGES:")
    print(f" Institutions added: {additions}")
    print(f" Total growth: +{len(updated_data) - total_before} institutions")
    print(f" Brazilian growth: +{len(br_institutions_after) - len(br_institutions)} institutions")
    print(f" Coverage improvement: +{br_with_wikidata_after - br_with_wikidata} institutions with Wikidata")
    print(f" Coverage change: {_pct(br_with_wikidata, len(br_institutions)):.1f}% → "
          f"{_pct(br_with_wikidata_after, len(br_institutions_after)):.1f}%")
    print()

    # Save updated dataset
    print(f"💾 Saving updated dataset to {main_file}...")
    save_yaml(updated_data, main_file)
    print("✅ Save complete!")
    print()

    print("=" * 80)
    print("🎉 BATCH 15 MERGE COMPLETE!")
    print("=" * 80)
    print()
    print("Summary:")
    print(f" • {additions} bonus institutions added to dataset")
    # Report the actual count rather than the hard-coded "4" of the original.
    print(f" • All {additions} institutions include Wikidata Q-numbers")
    print(f" • Coverage: {br_with_wikidata_after}/{len(br_institutions_after)} "
          f"({_pct(br_with_wikidata_after, len(br_institutions_after)):.1f}%)")
    print()
    print("Next steps:")
    print(" 1. Verify merge with: python3 -c \"import yaml; print(len(yaml.safe_load(open('data/instances/all/globalglam-20251111.yaml'))))\"")
    print(" 2. Check Brazilian institutions count")
    print(" 3. Generate Batch 15 final report")
    print(" 4. Continue with Batch 16 enrichment (target: 70%+ coverage)")
    print()


if __name__ == "__main__":
    main()