glam/merge_batch15.py
2025-11-19 23:25:22 +01:00

172 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Merge Batch 15: Add 4 Bonus Brazilian Institutions to GlobalGLAM Dataset
This script adds 4 NEW institutions discovered during Batch 14 Wikidata searches
that were not previously present in the GlobalGLAM dataset.
Date: 2025-11-11
Batch: 15 (Bonus Institutions)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
import sys
def load_yaml(filepath):
    """Parse *filepath* as YAML and return the resulting Python object."""
    with open(filepath, encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as YAML.

    Unicode is written as-is (not escaped), block style is used, and
    key insertion order is preserved rather than sorted.
    """
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
def create_backup(filepath):
    """Copy *filepath* to a sibling ``.bak.batch15`` file and return the backup path."""
    backup_path = f"{filepath}.bak.batch15"
    # Read and rewrite as text so the backup round-trips with the same encoding.
    with open(filepath, 'r', encoding='utf-8') as src, \
            open(backup_path, 'w', encoding='utf-8') as dst:
        dst.write(src.read())
    print(f"✅ Backup created: {backup_path}")
    return backup_path
def add_bonus_institutions(main_data, bonus_data):
    """
    Append new bonus institutions to the main dataset.

    These are expected to be NEW institutions; any whose 'id' already
    exists (in main_data, or earlier in bonus_data itself) is skipped
    with a warning so no duplicate entries are created. main_data is
    mutated in place and also returned.

    Args:
        main_data: Main GlobalGLAM dataset (list of institution dicts)
        bonus_data: Bonus institutions to add (list of institution dicts)

    Returns:
        tuple: (updated_data, additions_count)
    """
    additions = 0
    # Known ids; kept up to date as we append so duplicates *within*
    # bonus_data are caught too (the original only checked main_data).
    existing_ids = {inst['id'] for inst in main_data}
    for bonus_inst in bonus_data:
        bonus_id = bonus_inst['id']
        # Double-check this is truly a new institution
        if bonus_id in existing_ids:
            print(f"⚠️ WARNING: Institution already exists: {bonus_inst['name']} ({bonus_id})")
            print(f" Skipping to avoid duplicate...")
            continue
        # Add new institution and remember its id
        main_data.append(bonus_inst)
        existing_ids.add(bonus_id)
        additions += 1
        print(f"✅ Added: {bonus_inst['name']} (Wikidata: {get_wikidata_id(bonus_inst)})")
    return main_data, additions
def get_wikidata_id(institution):
    """Return the institution's Wikidata Q-number, or None if it has none."""
    matches = (
        entry.get('identifier_value')
        for entry in institution.get('identifiers', [])
        if entry.get('identifier_scheme') == 'Wikidata'
    )
    # First Wikidata identifier wins; None when the scheme is absent.
    return next(matches, None)
def _brazilian_institutions(data):
    """Return the institutions with at least one location in Brazil ('BR')."""
    return [
        inst for inst in data
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]


def _wikidata_count(institutions):
    """Count institutions carrying at least one Wikidata identifier."""
    return sum(
        1 for inst in institutions
        if any(ident.get('identifier_scheme') == 'Wikidata'
               for ident in inst.get('identifiers', []))
    )


def _pct(part, whole):
    """part/whole as a percentage; 0.0 when whole is zero (avoids ZeroDivisionError)."""
    return part / whole * 100 if whole else 0.0


def main():
    """Main execution: merge the Batch 15 bonus institutions and report statistics."""
    print("=" * 80)
    print("BATCH 15 MERGE: Adding Bonus Brazilian Institutions")
    print("=" * 80)
    print()

    # File paths (relative to the repository root)
    main_file = Path("data/instances/all/globalglam-20251111.yaml")
    bonus_file = Path("data/instances/brazil/batch15_bonus_institutions.yaml")

    # Verify files exist before doing any work
    if not main_file.exists():
        print(f"❌ ERROR: Main dataset not found: {main_file}")
        sys.exit(1)
    if not bonus_file.exists():
        print(f"❌ ERROR: Bonus institutions file not found: {bonus_file}")
        sys.exit(1)
    print(f"📂 Main dataset: {main_file}")
    print(f"📂 Bonus institutions: {bonus_file}")
    print()

    # Load data
    print("Loading datasets...")
    main_data = load_yaml(main_file)
    bonus_data = load_yaml(bonus_file)
    print(f"✅ Main dataset loaded: {len(main_data)} institutions")
    print(f"✅ Bonus institutions loaded: {len(bonus_data)} institutions")
    print()

    # Snapshot pre-merge statistics. add_bonus_institutions() mutates
    # main_data in place and returns the SAME list, so totals must be
    # captured now — the original `len(updated_data) - len(main_data)`
    # comparison always printed "+0".
    total_before = len(main_data)
    br_institutions = _brazilian_institutions(main_data)
    br_before = len(br_institutions)
    br_with_wikidata = _wikidata_count(br_institutions)
    print("📊 BEFORE MERGE:")
    print(f" Total institutions: {total_before}")
    print(f" Brazilian institutions: {br_before}")
    print(f" Brazilian with Wikidata: {br_with_wikidata} ({_pct(br_with_wikidata, br_before):.1f}%)")
    print()

    # Create backup before touching the main dataset
    print("Creating backup...")
    create_backup(main_file)
    print()

    # Add bonus institutions
    print("Adding bonus institutions...")
    print("-" * 80)
    updated_data, additions = add_bonus_institutions(main_data, bonus_data)
    print("-" * 80)
    print()

    # Calculate new statistics
    br_institutions_after = _brazilian_institutions(updated_data)
    br_after = len(br_institutions_after)
    br_with_wikidata_after = _wikidata_count(br_institutions_after)
    print("📊 AFTER MERGE:")
    print(f" Total institutions: {len(updated_data)}")
    print(f" Brazilian institutions: {br_after}")
    print(f" Brazilian with Wikidata: {br_with_wikidata_after} ({_pct(br_with_wikidata_after, br_after):.1f}%)")
    print()
    print("📈 CHANGES:")
    print(f" Institutions added: {additions}")
    print(f" Total growth: +{len(updated_data) - total_before} institutions")
    print(f" Brazilian growth: +{br_after - br_before} institutions")
    print(f" Coverage improvement: +{br_with_wikidata_after - br_with_wikidata} institutions with Wikidata")
    print(f" Coverage change: {_pct(br_with_wikidata, br_before):.1f}% → {_pct(br_with_wikidata_after, br_after):.1f}%")
    print()

    # Save updated dataset
    print(f"💾 Saving updated dataset to {main_file}...")
    save_yaml(updated_data, main_file)
    print("✅ Save complete!")
    print()
    print("=" * 80)
    print("🎉 BATCH 15 MERGE COMPLETE!")
    print("=" * 80)
    print()
    print("Summary:")
    print(f"{additions} bonus institutions added to dataset")
    # Generalized from the hard-coded "All 4" so the summary stays accurate
    # when the bonus file changes size or duplicates are skipped.
    print(f" • All {additions} institutions include Wikidata Q-numbers")
    print(f" • Coverage: {br_with_wikidata_after}/{br_after} ({_pct(br_with_wikidata_after, br_after):.1f}%)")
    print()
    print("Next steps:")
    print(" 1. Verify merge with: python3 -c \"import yaml; print(len(yaml.safe_load(open('data/instances/all/globalglam-20251111.yaml'))))\"")
    print(" 2. Check Brazilian institutions count")
    print(" 3. Generate Batch 15 final report")
    print(" 4. Continue with Batch 16 enrichment (target: 70%+ coverage)")
    print()
if __name__ == "__main__":
main()