172 lines
6.4 KiB
Python
172 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge Batch 15: Add 4 Bonus Brazilian Institutions to GlobalGLAM Dataset
|
|
|
|
This script adds 4 NEW institutions discovered during Batch 14 Wikidata searches
|
|
that were not previously present in the GlobalGLAM dataset.
|
|
|
|
Date: 2025-11-11
|
|
Batch: 15 (Bonus Institutions)
|
|
"""
|
|
|
|
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml
|
|
|
|
def load_yaml(filepath):
    """Parse *filepath* as YAML and return the deserialized contents."""
    with open(filepath, encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
|
|
|
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as YAML, keeping key order and Unicode."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
        )
|
|
|
|
def create_backup(filepath):
    """Copy *filepath* to a ``.bak.batch15`` sibling and return the backup path.

    Args:
        filepath: Path (str or Path) of the file to back up.

    Returns:
        str: Path of the backup file that was written.
    """
    backup_path = f"{filepath}.bak.batch15"
    # shutil.copyfile streams the file in chunks as raw bytes, so the backup
    # is byte-exact and the whole file is never held in memory (the previous
    # text-mode read/write copy loaded everything and could translate newlines).
    shutil.copyfile(filepath, backup_path)
    print(f"✅ Backup created: {backup_path}")
    return backup_path
|
|
|
|
def add_bonus_institutions(main_data, bonus_data):
    """Append genuinely new institutions from *bonus_data* onto *main_data*.

    Each bonus entry is appended in place unless an institution with the same
    ``id`` already exists, in which case it is skipped with a warning.

    Args:
        main_data: Main GlobalGLAM dataset (list of institution dicts).
        bonus_data: Candidate institutions to add (list of institution dicts).

    Returns:
        tuple: (updated_data, additions_count)
    """
    added = 0

    # Snapshot the known IDs once so duplicate detection is O(1) per candidate.
    known_ids = {entry['id'] for entry in main_data}

    for candidate in bonus_data:
        candidate_id = candidate['id']

        # Skip anything that is already present in the dataset.
        if candidate_id in known_ids:
            print(f"⚠️ WARNING: Institution already exists: {candidate['name']} ({candidate_id})")
            print(f" Skipping to avoid duplicate...")
            continue

        main_data.append(candidate)
        added += 1
        print(f"✅ Added: {candidate['name']} (Wikidata: {get_wikidata_id(candidate)})")

    return main_data, added
|
|
|
|
def get_wikidata_id(institution):
    """Return the institution's Wikidata Q-number, or ``None`` if it has none."""
    return next(
        (
            entry.get('identifier_value')
            for entry in institution.get('identifiers', [])
            if entry.get('identifier_scheme') == 'Wikidata'
        ),
        None,
    )
|
|
|
|
def main():
    """Merge the Batch 15 bonus institutions into the main GlobalGLAM dataset.

    Loads both YAML files, backs up the main dataset, appends the new
    institutions, reports before/after statistics, and saves the result.
    Exits with status 1 if either input file is missing.
    """

    def pct(part, whole):
        # Guard against an empty denominator so the stats lines can never
        # crash the merge with ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    print("=" * 80)
    print("BATCH 15 MERGE: Adding Bonus Brazilian Institutions")
    print("=" * 80)
    print()

    # File paths
    main_file = Path("data/instances/all/globalglam-20251111.yaml")
    bonus_file = Path("data/instances/brazil/batch15_bonus_institutions.yaml")

    # Verify both inputs exist before doing any work.
    if not main_file.exists():
        print(f"❌ ERROR: Main dataset not found: {main_file}")
        sys.exit(1)

    if not bonus_file.exists():
        print(f"❌ ERROR: Bonus institutions file not found: {bonus_file}")
        sys.exit(1)

    print(f"📂 Main dataset: {main_file}")
    print(f"📂 Bonus institutions: {bonus_file}")
    print()

    # Load data
    print("Loading datasets...")
    main_data = load_yaml(main_file)
    bonus_data = load_yaml(bonus_file)

    print(f"✅ Main dataset loaded: {len(main_data)} institutions")
    print(f"✅ Bonus institutions loaded: {len(bonus_data)} institutions")
    print()

    # BUG FIX: snapshot pre-merge counts. add_bonus_institutions mutates
    # main_data in place and returns the same list, so after the merge
    # len(updated_data) - len(main_data) is always 0 — the original
    # "Total growth" line could never report the real growth.
    total_before = len(main_data)

    # Count current Brazilian institutions with Wikidata identifiers.
    br_institutions = [i for i in main_data if any(l.get('country') == 'BR' for l in i.get('locations', []))]
    br_with_wikidata = sum(1 for i in br_institutions if any(x.get('identifier_scheme') == 'Wikidata' for x in i.get('identifiers', [])))
    br_before = len(br_institutions)

    print("📊 BEFORE MERGE:")
    print(f" Total institutions: {total_before}")
    print(f" Brazilian institutions: {br_before}")
    print(f" Brazilian with Wikidata: {br_with_wikidata} ({pct(br_with_wikidata, br_before):.1f}%)")
    print()

    # Create backup
    print("Creating backup...")
    create_backup(main_file)
    print()

    # Add bonus institutions
    print("Adding bonus institutions...")
    print("-" * 80)
    updated_data, additions = add_bonus_institutions(main_data, bonus_data)
    print("-" * 80)
    print()

    # Recompute statistics on the merged data.
    br_institutions_after = [i for i in updated_data if any(l.get('country') == 'BR' for l in i.get('locations', []))]
    br_with_wikidata_after = sum(1 for i in br_institutions_after if any(x.get('identifier_scheme') == 'Wikidata' for x in i.get('identifiers', [])))
    br_after = len(br_institutions_after)

    print("📊 AFTER MERGE:")
    print(f" Total institutions: {len(updated_data)}")
    print(f" Brazilian institutions: {br_after}")
    print(f" Brazilian with Wikidata: {br_with_wikidata_after} ({pct(br_with_wikidata_after, br_after):.1f}%)")
    print()

    print("📈 CHANGES:")
    print(f" Institutions added: {additions}")
    # Compare against the pre-merge snapshot, not the (mutated) main_data.
    print(f" Total growth: +{len(updated_data) - total_before} institutions")
    print(f" Brazilian growth: +{br_after - br_before} institutions")
    print(f" Coverage improvement: +{br_with_wikidata_after - br_with_wikidata} institutions with Wikidata")
    print(f" Coverage change: {pct(br_with_wikidata, br_before):.1f}% → {pct(br_with_wikidata_after, br_after):.1f}%")
    print()

    # Save updated dataset
    print(f"💾 Saving updated dataset to {main_file}...")
    save_yaml(updated_data, main_file)
    print("✅ Save complete!")
    print()

    print("=" * 80)
    print("🎉 BATCH 15 MERGE COMPLETE!")
    print("=" * 80)
    print()
    print("Summary:")
    print(f" • {additions} bonus institutions added to dataset")
    print(f" • All 4 institutions include Wikidata Q-numbers")
    print(f" • Coverage: {br_with_wikidata_after}/{br_after} ({pct(br_with_wikidata_after, br_after):.1f}%)")
    print()
    print("Next steps:")
    print(" 1. Verify merge with: python3 -c \"import yaml; print(len(yaml.safe_load(open('data/instances/all/globalglam-20251111.yaml'))))\"")
    print(" 2. Check Brazilian institutions count")
    print(" 3. Generate Batch 15 final report")
    print(" 4. Continue with Batch 16 enrichment (target: 70%+ coverage)")
    print()
|
|
|
|
# Run the merge only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|