#!/usr/bin/env python3
"""
Merge US Wikidata enrichment into unified global dataset.
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml


def merge_us_enrichment():
    """Merge US enriched institutions into unified dataset.

    Loads the manually enriched US institutions file and the unified global
    dataset, copies Wikidata/VIAF identifiers that unified US records are
    missing (matched by institution name), stamps provenance on each changed
    record, then backs up and rewrites the unified YAML file in place.

    Returns:
        int: number of institutions that received new identifiers.
    """
    # Load enriched US data
    us_enriched_path = Path('data/instances/united_states/us_institutions_enriched_manual.yaml')
    with open(us_enriched_path, 'r', encoding='utf-8') as f:
        us_enriched = yaml.safe_load(f)

    # Load unified dataset
    unified_path = Path('data/instances/all/globalglam-20251111.yaml')
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f)

    print(f"Loaded {len(us_enriched)} enriched US institutions")
    print(f"Loaded {len(unified)} institutions from unified dataset")

    # Create name -> record lookup for US institutions (first location
    # must be country 'US'). Duplicate names keep the last record seen.
    us_enriched_lookup = {}
    for inst in us_enriched:
        name = inst.get('name')
        locations = inst.get('locations', [])
        if locations and locations[0].get('country') == 'US':
            us_enriched_lookup[name] = inst

    print(f"\nCreated lookup for {len(us_enriched_lookup)} US institutions")

    # Merge enrichment data into unified dataset
    merged_count = 0
    updated_records = []

    for inst in unified:
        locations = inst.get('locations', [])
        if not locations or locations[0].get('country') != 'US':
            continue

        name = inst.get('name')
        if name not in us_enriched_lookup:
            continue

        # Found matching institution
        enriched = us_enriched_lookup[name]

        # setdefault (not get) so additions are kept even when the record
        # has no 'identifiers' key yet.
        existing_identifiers = inst.setdefault('identifiers', [])
        existing_schemes = {
            ident.get('identifier_scheme') for ident in existing_identifiers
        }

        # Add Wikidata and VIAF identifiers if not already present.
        added_identifiers = []
        for ident in enriched.get('identifiers', []):
            scheme = ident.get('identifier_scheme')
            if scheme in ('Wikidata', 'VIAF') and scheme not in existing_schemes:
                existing_identifiers.append(ident)
                added_identifiers.append(scheme)

        if added_identifiers:
            # setdefault so a freshly created provenance dict is attached to
            # the record instead of being mutated and silently discarded.
            provenance = inst.setdefault('provenance', {})
            old_method = provenance.get('extraction_method', '')
            if 'Manual Wikidata enrichment' not in old_method:
                # Carry over the Wikidata description from the enriched record.
                enriched_method = enriched.get('provenance', {}).get('extraction_method', '')
                if 'Manual Wikidata enrichment:' in enriched_method:
                    wikidata_part = enriched_method.split('Manual Wikidata enrichment:')[1].strip()
                    provenance['extraction_method'] = f"{old_method} + Manual Wikidata enrichment: {wikidata_part}"
            provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
            provenance['wikidata_verified'] = True

            merged_count += 1
            updated_records.append({
                'name': name,
                'added': added_identifiers
            })
            print(f"✓ Merged {name}: added {', '.join(added_identifiers)}")

    # Save updated unified dataset, keeping a backup of the original file.
    if merged_count > 0:
        backup_path = unified_path.with_suffix('.yaml.backup')
        shutil.copy(unified_path, backup_path)
        print(f"\n✓ Created backup: {backup_path}")

        with open(unified_path, 'w', encoding='utf-8') as f:
            yaml.dump(unified, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f"\n✓ Updated unified dataset: {unified_path}")
        print(f"✓ Merged {merged_count} US institutions")

        # Print summary
        print("\n" + "="*60)
        print("MERGE SUMMARY")
        print("="*60)
        for record in updated_records:
            print(f" {record['name']}")
            print(f" Added: {', '.join(record['added'])}")
    else:
        print("\n⚠ No institutions merged (already up to date)")

    return merged_count


if __name__ == '__main__':
    merge_us_enrichment()