#!/usr/bin/env python3
"""Merge GB enriched institutions back into unified global dataset."""

import yaml
from datetime import datetime, timezone

# Stable-identifier keys that must survive the merge: the enriched GB records
# may lack them, so they are always copied over from the original record.
_GHCID_KEYS = ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric')


def merge_gb_enriched():
    """Fold manually enriched GB institution records into the unified dataset.

    Reads the unified global YAML snapshot and the enriched GB YAML file,
    replaces each matching record (matched by ``id``) with its enriched
    version while preserving GHCID identifier fields from the original,
    writes a new date-stamped unified snapshot, and prints a verification
    summary of GB Wikidata-identifier coverage.

    Side effects: reads two YAML files from hard-coded paths under
    ``data/instances/`` and writes ``data/instances/all/globalglam-<UTC date>.yaml``.
    """
    print("=" * 80)
    print("šŸ”€ Merging GB enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\nšŸ“‚ Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" āœ… Loaded {len(all_institutions)} institutions")

    # Load enriched GB data
    print("\nšŸ“‚ Loading GB enriched data...")
    with open('data/instances/great_britain/gb_institutions_enriched_manual.yaml', 'r', encoding='utf-8') as f:
        gb_enriched = yaml.safe_load(f)
    print(f" āœ… Loaded {len(gb_enriched)} enriched GB institutions")

    # Create lookup by ID URL for quick O(1) matching while scanning the
    # (much larger) unified list.
    gb_by_id = {inst['id']: inst for inst in gb_enriched}

    # Merge enriched data: replace each unified record that has an enriched
    # counterpart, carrying the GHCID fields forward from the original.
    print("\nšŸ”„ Merging enriched data...")
    merged_count = 0
    for i, inst in enumerate(all_institutions):
        inst_id = inst.get('id')
        if inst_id not in gb_by_id:
            continue
        enriched_inst = gb_by_id[inst_id].copy()
        # Preserve GHCID identifiers from the original if they exist.
        for key in _GHCID_KEYS:
            if key in inst:
                enriched_inst[key] = inst[key]
        # Replace with enriched version
        all_institutions[i] = enriched_inst
        merged_count += 1
        # .get() guards against unnamed records (previously a KeyError).
        print(f" āœ… Merged: {inst.get('name', inst_id)}")
    print(f"\n šŸ“Š Total merged: {merged_count}")

    # Save unified dataset with a UTC date stamp in the filename.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\nšŸ’¾ Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" āœ… Saved")

    # Verify GB coverage: count GB institutions and how many of them carry
    # at least one Wikidata identifier.
    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    gb_with_wikidata = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )
    print("\n" + "=" * 80)
    print("šŸ“Š VERIFICATION - GB Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total GB institutions: {len(gb_institutions)}")
    print(f"With Wikidata identifiers: {gb_with_wikidata}")
    # Guard the percentage against an empty GB list (previously a
    # ZeroDivisionError), and require a non-empty list for the success banner
    # so 0/0 is not reported as 100% coverage.
    if gb_institutions:
        print(f"Coverage: {gb_with_wikidata/len(gb_institutions)*100:.1f}%")
        if gb_with_wikidata == len(gb_institutions):
            print("\nāœ… SUCCESS: 100% Wikidata coverage verified in unified dataset!")
            print("šŸ‡¬šŸ‡§ Phase 1 Great Britain: COMPLETE")
    else:
        print("Coverage: n/a (no GB institutions found)")
    print("\n")


if __name__ == '__main__':
    merge_gb_enriched()