#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Merge All Steps

Combines Steps 1-4 into a single comprehensive dataset.
"""
import sys
from pathlib import Path
from collections import Counter

import yaml


def main():
    """Merge all step files into the final dataset.

    Loads the per-step YAML instance files from ``data/instances/``,
    concatenates their records, prints summary statistics (counts by
    institution type, identifier/location coverage), and writes the
    combined dataset to ``data/instances/egypt_institutions.yaml``
    with a commented provenance header.
    """
    print("="*60)
    print("Egyptian GLAM Extraction - Merging All Steps")
    print("="*60)

    data_dir = Path(__file__).parent.parent / "data" / "instances"

    # (filename, human-readable description) for each extraction step.
    step_files = [
        ("egypt_step1_2.yaml", "Steps 1+2: National libraries/archives + Museums"),
        ("egypt_step3.yaml", "Step 3: University Libraries"),
        ("egypt_step4.yaml", "Step 4: Galleries & Cultural Centers"),
    ]

    all_institutions = []
    for filename, description in step_files:
        filepath = data_dir / filename
        # BUG FIX: previously printed the literal text "(unknown)" —
        # the f-string had no placeholder. Interpolate the filename.
        print(f"\nLoading {filename}...")
        print(f" ({description})")

        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        count = len(data)
        all_institutions.extend(data)
        print(f" ✓ Loaded {count} institutions")

    total = len(all_institutions)
    print("\n" + "="*60)
    print(f"TOTAL INSTITUTIONS: {total}")
    print("="*60)

    # Count by institution type
    type_counts = Counter(inst['institution_type'] for inst in all_institutions)
    print("\nBreakdown by Institution Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" - {inst_type:20s}: {count:2d}")

    # Metadata coverage: how many records carry identifiers / locations.
    with_identifiers = sum(1 for inst in all_institutions if inst.get('identifiers'))
    with_locations = sum(1 for inst in all_institutions if inst.get('locations'))
    # Guard against ZeroDivisionError when every step file is empty.
    denom = total or 1
    print("\nMetadata Coverage:")
    print(f" - With identifiers: {with_identifiers}/{total} ({100*with_identifiers/denom:.1f}%)")
    print(f" - With locations: {with_locations}/{total} ({100*with_locations/denom:.1f}%)")

    # Save merged file
    output_path = data_dir / "egypt_institutions.yaml"
    print(f"\nSaving merged dataset to: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        # Add header comment (plain writes so YAML comments survive dump).
        f.write("# Egyptian GLAM Institutions Dataset\n")
        f.write("# Extracted from conversation: 39e11630-a2af-407c-a365-d485eb8257b0\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_4_INFERRED (from conversation NLP extraction)\n")
        f.write("#\n")
        f.write("# Coverage:\n")
        for inst_type, count in sorted(type_counts.items()):
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n")
        # allow_unicode keeps Arabic names readable; sort_keys=False
        # preserves the record field order from the step files.
        yaml.dump(all_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Saved {total} institutions to egypt_institutions.yaml")

    # Print sample institutions
    print("\nSample Institutions (first 10):")
    for i, inst in enumerate(all_institutions[:10], 1):
        print(f" {i:2d}. {inst['name']:50s} ({inst['institution_type']})")
    if total > 10:
        print(f" ... and {total - 10} more")

    print("\n" + "="*60)
    print("Merge complete!")
    print("="*60)
    print("\nFinal dataset: data/instances/egypt_institutions.yaml")
    print(f"Total records: {total}")


if __name__ == "__main__":
    main()