glam/scripts/merge_egypt_steps.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

96 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Merge All Steps
Combines Steps 1-4 into a single comprehensive dataset.
"""
import sys
from pathlib import Path
from collections import Counter
import yaml
def main():
    """Merge the per-step Egyptian GLAM YAML files into one dataset.

    Reads the three step files from ``data/instances``, concatenates their
    institution records, prints summary statistics (type breakdown and
    metadata coverage), and writes the merged list — preceded by a commented
    provenance header — to ``data/instances/egypt_institutions.yaml``.

    Raises:
        FileNotFoundError: if any expected step file is missing.
        KeyError: if a record lacks 'institution_type' or 'name'.
    """
    print("="*60)
    print("Egyptian GLAM Extraction - Merging All Steps")
    print("="*60)
    data_dir = Path(__file__).parent.parent / "data" / "instances"

    # (filename, human-readable description) for each extraction step.
    step_files = [
        ("egypt_step1_2.yaml", "Steps 1+2: National libraries/archives + Museums"),
        ("egypt_step3.yaml", "Step 3: University Libraries"),
        ("egypt_step4.yaml", "Step 4: Galleries & Cultural Centers"),
    ]

    all_institutions = []
    for filename, description in step_files:
        filepath = data_dir / filename
        # BUG FIX: the original printed a literal "(unknown)" placeholder
        # instead of interpolating the step file's name.
        print(f"\nLoading {filename}...")
        print(f" ({description})")
        with open(filepath, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; treat as no records
            # so len()/extend() below cannot crash.
            data = yaml.safe_load(f) or []
        count = len(data)
        all_institutions.extend(data)
        print(f" ✓ Loaded {count} institutions")

    total = len(all_institutions)
    print("\n" + "="*60)
    print(f"TOTAL INSTITUTIONS: {total}")
    print("="*60)

    # Breakdown by institution type.
    type_counts = Counter(inst['institution_type'] for inst in all_institutions)
    print("\nBreakdown by Institution Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" - {inst_type:20s}: {count:2d}")

    # Metadata coverage; guard the denominator so an empty merge prints 0.0%
    # instead of raising ZeroDivisionError.
    with_identifiers = sum(1 for inst in all_institutions if inst.get('identifiers'))
    with_locations = sum(1 for inst in all_institutions if inst.get('locations'))
    denom = total or 1
    print("\nMetadata Coverage:")
    print(f" - With identifiers: {with_identifiers}/{total} ({100*with_identifiers/denom:.1f}%)")
    print(f" - With locations: {with_locations}/{total} ({100*with_locations/denom:.1f}%)")

    # Save the merged file with a commented provenance header, then the
    # YAML document itself after the '---' marker.
    output_path = data_dir / "egypt_institutions.yaml"
    print(f"\nSaving merged dataset to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Egyptian GLAM Institutions Dataset\n")
        f.write("# Extracted from conversation: 39e11630-a2af-407c-a365-d485eb8257b0\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_4_INFERRED (from conversation NLP extraction)\n")
        f.write("#\n")
        f.write("# Coverage:\n")
        for inst_type, count in sorted(type_counts.items()):
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n")
        yaml.dump(all_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    print(f"✓ Saved {total} institutions to egypt_institutions.yaml")

    # Short sample for eyeballing the result.
    print("\nSample Institutions (first 10):")
    for i, inst in enumerate(all_institutions[:10], 1):
        print(f" {i:2d}. {inst['name']:50s} ({inst['institution_type']})")
    if total > 10:
        print(f" ... and {total - 10} more")

    print("\n" + "="*60)
    print("Merge complete!")
    print("="*60)
    print(f"\nFinal dataset: data/instances/egypt_institutions.yaml")
    print(f"Total records: {total}")
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()