glam/scripts/merge_georgia_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

123 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Merge Georgia Wikidata enrichment into unified global dataset.
"""
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml
def merge_georgia_enrichment():
    """Merge Georgia enriched institutions into the unified dataset.

    Loads the enriched Georgia batch and the unified global dataset,
    copies Wikidata/VIAF identifiers and founding dates that are missing
    from the matching unified records, appends the enrichment history to
    each record's provenance, then rewrites the unified file (after
    saving a ``.yaml.backup`` copy) and prints a merge summary.

    Returns:
        int: Number of institutions that received new data.
    """
    # Load enriched Georgia data
    georgia_enriched_path = Path('data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml')
    with open(georgia_enriched_path, 'r', encoding='utf-8') as f:
        georgia_enriched = yaml.safe_load(f)

    # Load unified dataset
    unified_path = Path('data/instances/all/globalglam-20251111.yaml')
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f)

    print(f"Loaded {len(georgia_enriched)} enriched Georgia institutions")
    print(f"Loaded {len(unified)} institutions from unified dataset")

    # Index enriched records by institution id for O(1) lookup;
    # records without an id cannot be matched and are skipped.
    georgia_enriched_lookup = {
        inst['id']: inst for inst in georgia_enriched if inst.get('id')
    }
    print(f"\nCreated lookup for {len(georgia_enriched_lookup)} Georgia institutions")

    # Merge enrichment data into unified dataset
    merged_count = 0
    updated_records = []
    for inst in unified:
        inst_id = inst.get('id')
        # Skip records that are not enriched Georgia institutions.
        if not inst_id or inst_id not in georgia_enriched_lookup:
            continue
        enriched = georgia_enriched_lookup[inst_id]

        # Normalize a missing or null identifiers field to an empty list.
        if inst.get('identifiers') is None:
            inst['identifiers'] = []
        existing_schemes = {
            ident.get('identifier_scheme') for ident in inst['identifiers']
        }

        # Add Wikidata and VIAF from the enriched version if not present.
        added_identifiers = []
        for ident in enriched.get('identifiers', []):
            scheme = ident.get('identifier_scheme')
            if scheme in ('Wikidata', 'VIAF') and scheme not in existing_schemes:
                inst['identifiers'].append(ident)
                added_identifiers.append(scheme)
                existing_schemes.add(scheme)

        # Also add founding_date if present in the enriched version only.
        if enriched.get('founding_date') and not inst.get('founding_date'):
            inst['founding_date'] = enriched['founding_date']
            added_identifiers.append('founding_date')

        if not added_identifiers:
            continue

        # Update provenance: `or {}` also covers an explicit null field.
        provenance = inst.get('provenance') or {}
        inst['provenance'] = provenance

        # Carry over the enrichment history from the enriched record.
        enrichment_history = enriched.get('provenance', {}).get('enrichment_history', [])
        if enrichment_history:
            provenance.setdefault('enrichment_history', []).extend(enrichment_history)

        # Add merge timestamp (timezone-aware, ISO 8601).
        provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
        provenance['wikidata_verified'] = True

        merged_count += 1
        updated_records.append({
            'name': inst.get('name'),
            'id': inst_id,
            'added': added_identifiers,
        })
        print(f"✓ Merged {inst.get('name')}: added {', '.join(added_identifiers)}")

    # Save updated unified dataset (only when something changed).
    if merged_count > 0:
        # Keep a backup of the original file before overwriting it.
        backup_path = unified_path.with_suffix('.yaml.backup')
        shutil.copy(unified_path, backup_path)
        print(f"\n✓ Created backup: {backup_path}")

        with open(unified_path, 'w', encoding='utf-8') as f:
            yaml.dump(unified, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f"\n✓ Updated unified dataset: {unified_path}")
        print(f"✓ Merged {merged_count} Georgia institutions")

        # Print summary
        print("\n" + "="*60)
        print("MERGE SUMMARY")
        print("="*60)
        for record in updated_records:
            print(f" {record['name']} ({record['id']})")
            print(f" Added: {', '.join(record['added'])}")
    else:
        print("\n⚠ No institutions merged (already up to date)")

    return merged_count
# Script entry point: run the merge when executed directly.
if __name__ == '__main__':
    merge_georgia_enrichment()