#!/usr/bin/env python3
"""
Merge Georgia Wikidata enrichment into unified global dataset.
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Union

import yaml

# Default input/output locations (relative to the current working directory).
_DEFAULT_GEORGIA_ENRICHED_PATH = Path(
    'data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml'
)
_DEFAULT_UNIFIED_PATH = Path('data/instances/all/globalglam-20251111.yaml')

# Identifier schemes copied from an enriched record into the unified record.
_MERGED_SCHEMES = ('Wikidata', 'VIAF')


def _load_records(path):
    """Load a YAML file whose top level is a list of institution records."""
    with open(path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    if records is None:
        # An empty YAML document parses to None; treat it as "no records"
        # instead of failing later on len()/iteration.
        return []
    if not isinstance(records, list):
        raise TypeError(
            f"{path}: expected a top-level list of institutions, "
            f"got {type(records).__name__}"
        )
    return records


def _merge_record(inst, enriched):
    """Copy missing enrichment fields from *enriched* into *inst* in place.

    Returns the list of things that were added: identifier scheme names
    and/or the literal string 'founding_date'. An empty list means nothing
    was missing.
    """
    # Normalise a missing or null identifiers field to an empty list.
    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    identifiers = inst['identifiers']
    existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

    added = []
    # `or []` also covers an explicit `identifiers: null` in the enriched file.
    for ident in enriched.get('identifiers') or []:
        scheme = ident.get('identifier_scheme')
        if scheme in _MERGED_SCHEMES and scheme not in existing_schemes:
            identifiers.append(ident)
            added.append(scheme)
            # Only the first identifier of each scheme is taken.
            existing_schemes.add(scheme)

    # Fill in founding_date only when the unified record has none.
    if enriched.get('founding_date') and not inst.get('founding_date'):
        inst['founding_date'] = enriched['founding_date']
        added.append('founding_date')

    if not added:
        return added

    # Provenance is only touched when something was actually merged.
    provenance = inst.get('provenance')
    if not provenance:
        provenance = {}
        inst['provenance'] = provenance

    enriched_provenance = enriched.get('provenance') or {}
    enrichment_history = enriched_provenance.get('enrichment_history') or []
    if enrichment_history:
        if not provenance.get('enrichment_history'):
            provenance['enrichment_history'] = []
        provenance['enrichment_history'].extend(enrichment_history)

    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    # NOTE(review): this flag is set whenever anything was merged, including
    # when only VIAF or founding_date was added - confirm that is intended.
    provenance['wikidata_verified'] = True
    return added


def merge_georgia_enrichment(
    georgia_enriched_path: Union[str, Path] = _DEFAULT_GEORGIA_ENRICHED_PATH,
    unified_path: Union[str, Path] = _DEFAULT_UNIFIED_PATH,
) -> int:
    """Merge Georgia enriched institutions into unified dataset.

    Records are matched on their 'id' field. For each match, Wikidata/VIAF
    identifiers and founding_date are copied over when the unified record
    lacks them, and the record's provenance is updated. If at least one
    record changed, the unified file is copied to '<name>.backup' next to it
    and then rewritten in place.

    Args:
        georgia_enriched_path: YAML file with the enriched Georgia records.
        unified_path: YAML file with the unified dataset; overwritten when
            anything is merged.

    Returns:
        The number of unified records that received new data.

    Raises:
        TypeError: if either file's top level is not a list.
        OSError: if a file cannot be read, copied, or written.
    """
    georgia_enriched_path = Path(georgia_enriched_path)
    unified_path = Path(unified_path)

    georgia_enriched = _load_records(georgia_enriched_path)
    unified = _load_records(unified_path)

    print(f"Loaded {len(georgia_enriched)} enriched Georgia institutions")
    print(f"Loaded {len(unified)} institutions from unified dataset")

    # Index enriched records by ID; records without an ID cannot be matched.
    georgia_enriched_lookup = {
        inst['id']: inst for inst in georgia_enriched if inst.get('id')
    }

    print(f"\nCreated lookup for {len(georgia_enriched_lookup)} Georgia institutions")

    merged_count = 0
    updated_records = []

    for inst in unified:
        inst_id = inst.get('id')
        if not inst_id or inst_id not in georgia_enriched_lookup:
            continue

        added = _merge_record(inst, georgia_enriched_lookup[inst_id])
        if not added:
            continue

        merged_count += 1
        updated_records.append({
            'name': inst.get('name'),
            'id': inst_id,
            'added': added,
        })
        print(f"✓ Merged {inst.get('name')}: added {', '.join(added)}")

    if merged_count > 0:
        # Keep a copy of the previous file before overwriting it. Appending
        # to the file name works for any extension, not just '.yaml'.
        backup_path = unified_path.with_name(unified_path.name + '.backup')
        shutil.copy(unified_path, backup_path)
        print(f"\n✓ Created backup: {backup_path}")

        with open(unified_path, 'w', encoding='utf-8') as f:
            yaml.dump(unified, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f"\n✓ Updated unified dataset: {unified_path}")
        print(f"✓ Merged {merged_count} Georgia institutions")

        print("\n" + "="*60)
        print("MERGE SUMMARY")
        print("="*60)
        for record in updated_records:
            print(f" {record['name']} ({record['id']})")
            print(f" Added: {', '.join(record['added'])}")
    else:
        print("\n⚠ No institutions merged (already up to date)")

    return merged_count


if __name__ == '__main__':
    merge_georgia_enrichment()