#!/usr/bin/env python3
"""Merge US enriched institutions back into unified global dataset.

Reads the unified global GLAM dataset and the manually enriched US
institutions file, replaces each matching record (keyed by GHCID) with its
enriched version, writes a date-stamped unified file, and reports Wikidata
identifier coverage for the US subset.
"""
import yaml
from datetime import datetime, timezone


def merge_us_enriched():
    """Merge enriched US records into the unified dataset and save it.

    Side effects:
        - Reads two YAML files from hard-coded paths under ``data/instances/``.
        - Writes a new date-stamped unified YAML file.
        - Prints a progress and verification report to stdout.
    """
    print("=" * 80)
    print("πŸ”€ Merging US enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset.
    # NOTE(review): input filename pins a specific date while the output below
    # uses today's date — confirm this is intentional (snapshot-in, rolling-out).
    print("\nπŸ“‚ Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" βœ… Loaded {len(all_institutions)} institutions")

    # Load enriched US data
    print("\nπŸ“‚ Loading US enriched data...")
    with open('data/instances/united_states/us_institutions_enriched_manual.yaml', 'r', encoding='utf-8') as f:
        us_enriched = yaml.safe_load(f)
    print(f" βœ… Loaded {len(us_enriched)} enriched US institutions")

    # Create lookup by GHCID for quick matching (O(1) per record below).
    us_by_ghcid = {inst['ghcid']: inst for inst in us_enriched}

    # Merge enriched data: replace each unified record whose GHCID has an
    # enriched counterpart. Records without a 'ghcid' key are left untouched.
    print("\nπŸ”„ Merging enriched data...")
    merged_count = 0
    for i, inst in enumerate(all_institutions):
        ghcid = inst.get('ghcid')
        if ghcid in us_by_ghcid:
            # Replace with enriched version
            all_institutions[i] = us_by_ghcid[ghcid]
            merged_count += 1
            print(f" βœ… Merged: {inst['name']}")
    print(f"\n πŸ“Š Total merged: {merged_count}")

    # Save unified dataset with timestamp (UTC date so runs are unambiguous).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\nπŸ’Ύ Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(" βœ… Saved")

    # Verify US coverage: a record counts as US if ANY of its locations is US.
    us_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'US' for loc in inst.get('locations', []))
    ]
    us_with_wikidata = sum(
        1 for inst in us_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("πŸ“Š VERIFICATION - US Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total US institutions: {len(us_institutions)}")
    print(f"With Wikidata identifiers: {us_with_wikidata}")
    # Guard against ZeroDivisionError when no US institutions are present.
    if us_institutions:
        print(f"Coverage: {us_with_wikidata/len(us_institutions)*100:.1f}%")
    else:
        print("Coverage: n/a (no US institutions found)")

    # Require a non-empty US subset so an empty dataset is not reported as
    # 100% coverage (0 == 0 would otherwise pass vacuously).
    if us_institutions and us_with_wikidata == len(us_institutions):
        print("\nβœ… SUCCESS: 100% Wikidata coverage verified in unified dataset!")
        print("πŸ‡ΊπŸ‡Έ Phase 1 United States: COMPLETE")

    print("\n")


if __name__ == '__main__':
    merge_us_enriched()