#!/usr/bin/env python3
"""
Merge IT enriched institutions back into unified global dataset.
"""
import yaml
from datetime import datetime, timezone

# Identifier keys minted on the unified dataset that must survive a merge:
# the enriched IT export may predate them, so the unified values win.
_GHCID_KEYS = ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric')


def merge_it_enriched(
    unified_path='data/instances/all/globalglam-20251111.yaml',
    enriched_path='data/instances/italy/it_institutions_enriched_manual.yaml',
):
    """Merge manually enriched IT institution records into the unified dataset.

    Records are matched by their ``id`` URL. A matching unified record is
    replaced wholesale by the enriched one, except that any GHCID keys
    already present on the unified record are preserved. The merged dataset
    is written to a new UTC-date-stamped YAML file, then basic Wikidata
    identifier coverage for IT institutions is printed as verification.

    Args:
        unified_path: YAML file holding the unified global dataset
            (a list of institution dicts).
        enriched_path: YAML file holding the enriched IT institutions.

    Returns:
        Path of the date-stamped file the merged dataset was written to.
    """
    print("=" * 80)
    print("šŸ”€ Merging IT enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset. safe_load returns None for an empty file,
    # so coerce to an empty list before taking len().
    print("\nšŸ“‚ Loading unified dataset...")
    with open(unified_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f) or []
    print(f" āœ… Loaded {len(all_institutions)} institutions")

    # Load enriched IT data
    print("\nšŸ“‚ Loading IT enriched data...")
    with open(enriched_path, 'r', encoding='utf-8') as f:
        it_enriched = yaml.safe_load(f) or []
    print(f" āœ… Loaded {len(it_enriched)} enriched IT institutions")

    # Index enriched records by their id URL for O(1) matching.
    it_by_id = {inst['id']: inst for inst in it_enriched}

    # Merge enriched data in place, position by position.
    print("\nšŸ”„ Merging enriched data...")
    merged_count = 0
    for i, inst in enumerate(all_institutions):
        inst_id = inst.get('id')
        if inst_id not in it_by_id:
            continue
        # Take the enriched record, but carry over GHCID keys already
        # minted on the unified record (the enriched export may lack them).
        enriched_inst = it_by_id[inst_id].copy()
        for key in _GHCID_KEYS:
            if key in inst:
                enriched_inst[key] = inst[key]
        all_institutions[i] = enriched_inst
        merged_count += 1
        print(f" āœ… Merged: {inst['name']}")

    print(f"\n šŸ“Š Total merged: {merged_count}")

    # Save unified dataset under today's UTC date stamp (never mutate the
    # input file unless it happens to share today's stamp).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\nšŸ’¾ Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(" āœ… Saved")

    # Verify IT coverage: count IT-located institutions and how many of
    # them carry a Wikidata identifier.
    it_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'IT' for loc in inst.get('locations', []))
    ]
    it_with_wikidata = sum(
        1 for inst in it_institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("šŸ“Š VERIFICATION - IT Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total IT institutions: {len(it_institutions)}")
    print(f"With Wikidata identifiers: {it_with_wikidata}")
    if it_institutions:
        print(f"Coverage: {it_with_wikidata/len(it_institutions)*100:.1f}%")
    else:
        # Guard against ZeroDivisionError when no IT institutions exist.
        print("Coverage: n/a (no IT institutions found)")

    # Require a non-empty set so an empty dataset does not report success.
    if it_institutions and it_with_wikidata == len(it_institutions):
        print("\nāœ… SUCCESS: 100% Wikidata coverage verified in unified dataset!")
        print("šŸ‡®šŸ‡¹ Phase 1 Italy: COMPLETE")

    print("\n")
    return output_path


if __name__ == '__main__':
    merge_it_enriched()