#!/usr/bin/env python3
"""
Merge Brazil batch 8 enriched institutions back into unified global dataset.

This script merges the 2 missing Wikidata enrichments from batch 8.
"""
import yaml
from datetime import datetime, timezone

# GHCID identity fields that must survive a merge: the enriched batch file
# may lack them, so they are carried over from the original record.
_GHCID_FIELDS = ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric')


def _has_wikidata(inst):
    """Return True if *inst* carries at least one Wikidata identifier."""
    return any(
        idf.get('identifier_scheme') == 'Wikidata'
        for idf in inst.get('identifiers', [])
    )


def merge_brazil_batch8():
    """Merge batch-8 Wikidata enrichments into the unified global dataset.

    Reads the unified dataset and the enriched Brazil batch 8 YAML file,
    replaces each institution that gained a Wikidata identifier in batch 8
    (preserving its original GHCID fields), writes a date-stamped copy of
    the unified dataset, and prints Brazil coverage statistics.
    """
    print("=" * 80)
    print("šŸ”€ Merging Brazil batch 8 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\nšŸ“‚ Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" āœ… Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 8 data
    print("\nšŸ“‚ Loading Brazil batch 8 enriched data...")
    with open('data/instances/brazil/brazilian_institutions_batch8_enriched.yaml', 'r', encoding='utf-8') as f:
        br_batch8 = yaml.safe_load(f)
    print(f" āœ… Loaded {len(br_batch8)} enriched Brazil institutions")

    # Create lookup by ID URL for quick matching
    br_by_id = {inst['id']: inst for inst in br_batch8}

    # Merge enriched data
    print("\nšŸ”„ Merging enriched data...")
    merged_count = 0
    updated_count = 0

    for i, inst in enumerate(all_institutions):
        enriched_inst = br_by_id.get(inst.get('id'))
        if enriched_inst is None:
            continue

        # Did batch 8 add a Wikidata identifier that the master lacks?
        has_wd_enriched = _has_wikidata(enriched_inst)
        has_wd_original = _has_wikidata(inst)

        if has_wd_enriched and not has_wd_original:
            # New enrichment: take the batch-8 record, but preserve the
            # GHCID fields from the original record when present.
            enriched_copy = enriched_inst.copy()
            for field in _GHCID_FIELDS:
                if field in inst:
                    enriched_copy[field] = inst[field]

            # Replace with enriched version
            all_institutions[i] = enriched_copy
            merged_count += 1

            # Extract Q-number for display
            q_num = next(
                (idf.get('identifier_value')
                 for idf in enriched_copy.get('identifiers', [])
                 if idf.get('identifier_scheme') == 'Wikidata'),
                'N/A'
            )
            print(f" āœ… Merged: {inst['name']} -> {q_num}")
        elif has_wd_enriched and has_wd_original:
            # Already has Wikidata, might have other updates
            updated_count += 1

    print(f"\n šŸ“Š New Wikidata enrichments merged: {merged_count}")
    print(f" šŸ“Š Already enriched (skipped): {updated_count}")

    # Save unified dataset with timestamp
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\nšŸ’¾ Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(" āœ… Saved")

    # Verify Brazil coverage
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    br_with_wikidata = sum(1 for inst in br_institutions if _has_wikidata(inst))

    print("\n" + "=" * 80)
    print("šŸ“Š VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {len(br_institutions)}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    # Guard against ZeroDivisionError when no Brazil records are present.
    if br_institutions:
        print(f"Coverage: {br_with_wikidata/len(br_institutions)*100:.1f}%")
    print(f"Progress: {br_with_wikidata}/{len(br_institutions)}")

    if merged_count > 0:
        # Previous coverage is derived rather than hard-coded so the summary
        # stays correct on re-runs (the original printed a fixed "7").
        previous = br_with_wikidata - merged_count
        print(f"\nāœ… SUCCESS: {merged_count} new Wikidata enrichments merged!")
        print(f"šŸ‡§šŸ‡· Brazil coverage improved: {previous} → {br_with_wikidata} institutions")
    else:
        print("\nāš ļø No new enrichments to merge (already up to date)")

    print("\n")


if __name__ == '__main__':
    merge_brazil_batch8()