#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Merge All Steps

Combines Steps 1-4 into a single comprehensive dataset.
"""
import sys
from pathlib import Path
from collections import Counter

import yaml


def main():
    """Merge all step files into the final dataset.

    Loads the per-step YAML instance files from ``data/instances/``,
    concatenates their records, prints summary statistics (counts by
    institution type, identifier/location coverage), and writes the
    combined dataset to ``data/instances/egypt_institutions.yaml``
    with a commented provenance header.
    """
    print("="*60)
    print("Egyptian GLAM Extraction - Merging All Steps")
    print("="*60)

    data_dir = Path(__file__).parent.parent / "data" / "instances"

    # (filename, human-readable description) for each extraction step.
    step_files = [
        ("egypt_step1_2.yaml", "Steps 1+2: National libraries/archives + Museums"),
        ("egypt_step3.yaml", "Step 3: University Libraries"),
        ("egypt_step4.yaml", "Step 4: Galleries & Cultural Centers"),
    ]

    all_institutions = []
    for filename, description in step_files:
        filepath = data_dir / filename
        # BUG FIX: previously printed the literal text "(unknown)" —
        # the f-string had no placeholder. Interpolate the filename.
        print(f"\nLoading {filename}...")
        print(f" ({description})")

        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        count = len(data)
        all_institutions.extend(data)
        print(f" ✓ Loaded {count} institutions")

    total = len(all_institutions)
    print("\n" + "="*60)
    print(f"TOTAL INSTITUTIONS: {total}")
    print("="*60)

    # Count by institution type
    type_counts = Counter(inst['institution_type'] for inst in all_institutions)
    print("\nBreakdown by Institution Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" - {inst_type:20s}: {count:2d}")

    # Metadata coverage: how many records carry identifiers / locations.
    with_identifiers = sum(1 for inst in all_institutions if inst.get('identifiers'))
    with_locations = sum(1 for inst in all_institutions if inst.get('locations'))
    # Guard against ZeroDivisionError when every step file is empty.
    denom = total or 1
    print("\nMetadata Coverage:")
    print(f" - With identifiers: {with_identifiers}/{total} ({100*with_identifiers/denom:.1f}%)")
    print(f" - With locations: {with_locations}/{total} ({100*with_locations/denom:.1f}%)")

    # Save merged file
    output_path = data_dir / "egypt_institutions.yaml"
    print(f"\nSaving merged dataset to: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        # Add header comment (plain writes so YAML comments survive dump).
        f.write("# Egyptian GLAM Institutions Dataset\n")
        f.write("# Extracted from conversation: 39e11630-a2af-407c-a365-d485eb8257b0\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_4_INFERRED (from conversation NLP extraction)\n")
        f.write("#\n")
        f.write("# Coverage:\n")
        for inst_type, count in sorted(type_counts.items()):
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n")
        # allow_unicode keeps Arabic names readable; sort_keys=False
        # preserves the record field order from the step files.
        yaml.dump(all_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Saved {total} institutions to egypt_institutions.yaml")

    # Print sample institutions
    print("\nSample Institutions (first 10):")
    for i, inst in enumerate(all_institutions[:10], 1):
        print(f" {i:2d}. {inst['name']:50s} ({inst['institution_type']})")
    if total > 10:
        print(f" ... and {total - 10} more")

    print("\n" + "="*60)
    print("Merge complete!")
    print("="*60)
    print("\nFinal dataset: data/instances/egypt_institutions.yaml")
    print(f"Total records: {total}")


if __name__ == "__main__":
    main()