#!/usr/bin/env python3
"""
Merge manual VIAF mappings from CSV back into Egypt institutions YAML file.

Usage:
    python scripts/merge_viaf_mappings.py

Reads:
- data/manual_enrichment/egypt_viaf_mappings.csv (manual VIAF lookups)
- data/instances/egypt_institutions_wikidata_viaf.yaml (current data)

Writes:
- data/instances/egypt_institutions_wikidata_viaf.yaml (updated with new VIAF IDs)

CSV columns:
- institution_id: Full institution URI
- name: Institution name (for reference)
- institution_type: Type (for reference)
- city: Location (for reference)
- viaf_id: VIAF ID (numeric, e.g., 123456789)
- viaf_url: Full VIAF URL (optional, will be generated if missing)
- notes: Additional notes (optional)
- lookup_status: PENDING, FOUND, NOT_FOUND, UNCERTAIN
"""

import csv
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Provenance note used when the CSV row carries no note of its own.
DEFAULT_NOTES = 'Manual lookup via VIAF website'


def _clean(value) -> str:
    """Return *value* stripped of surrounding whitespace, treating None as ''.

    csv.DictReader fills the missing trailing fields of a short row with None,
    so a plain ``row.get(key, '')`` is not enough to guarantee a string.
    """
    return (value or '').strip()


def load_viaf_mappings(csv_path: Path) -> dict:
    """Load VIAF mappings from CSV, returning dict keyed by institution_id.

    Only rows that have an institution_id, a non-empty viaf_id and
    lookup_status == FOUND (case-insensitive) are returned. When viaf_url is
    empty it is generated from the VIAF ID.

    Args:
        csv_path: Path of the CSV file with the manual lookups.

    Returns:
        Dict mapping institution_id to a dict with the keys
        'identifier_scheme', 'identifier_value', 'identifier_url' and 'notes'.
        If an institution_id occurs more than once, the last row wins.

    Raises:
        KeyError: If a row has no 'institution_id' column.
    """
    mappings = {}

    # utf-8-sig transparently drops a BOM (common in spreadsheet exports) that
    # would otherwise corrupt the first header name; newline='' is what the
    # csv module requires to handle quoted fields with embedded newlines.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            inst_id = _clean(row['institution_id'])
            viaf_id = _clean(row.get('viaf_id'))
            status = _clean(row.get('lookup_status')).upper() or 'PENDING'

            # Only process if VIAF ID found
            if not (inst_id and viaf_id and status == 'FOUND'):
                continue

            viaf_url = _clean(row.get('viaf_url')) or f"https://viaf.org/viaf/{viaf_id}"
            mappings[inst_id] = {
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': viaf_url,
                'notes': _clean(row.get('notes')),
            }

    return mappings


def _has_viaf_identifier(inst: dict) -> bool:
    """Return True if the institution record already lists a VIAF identifier."""
    return any(
        isinstance(identifier, dict) and identifier.get('identifier_scheme') == 'VIAF'
        for identifier in inst.get('identifiers') or []
    )


def _apply_viaf_mapping(inst: dict, viaf_mapping: dict, enrichment_date: str) -> None:
    """Append the VIAF identifier to *inst* and record enrichment provenance."""
    if not inst.get('identifiers'):
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': viaf_mapping['identifier_scheme'],
        'identifier_value': viaf_mapping['identifier_value'],
        'identifier_url': viaf_mapping['identifier_url'],
    })

    if not inst.get('provenance'):
        inst['provenance'] = {}
    inst['provenance']['viaf_enrichment'] = {
        'method': 'Manual VIAF web lookup',
        'enrichment_date': enrichment_date,
        'verified': True,
        # 'notes' is always present in a loaded mapping but may be empty, so
        # fall back with `or` rather than relying on a dict.get default.
        'notes': viaf_mapping.get('notes') or DEFAULT_NOTES,
    }


def merge_viaf_into_institutions(yaml_path: Path, viaf_mappings: dict) -> tuple[int, int]:
    """Merge VIAF mappings into institutions YAML file.

    Institutions that already carry a VIAF identifier are left untouched and
    counted as skipped. Mappings whose institution_id does not occur in the
    YAML file are reported with a warning. The file is rewritten only when at
    least one identifier was added.

    Args:
        yaml_path: Path of the institutions YAML file (a top-level list).
        viaf_mappings: Mappings as returned by load_viaf_mappings().

    Returns:
        (added_count, skipped_count) tuple

    Raises:
        ValueError: If the YAML document is not a list of institutions.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # An empty YAML document loads as None; treat it as "no institutions".
    if institutions is None:
        institutions = []
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a list of institutions in {yaml_path}, "
            f"got {type(institutions).__name__}"
        )

    added_count = 0
    skipped_count = 0
    matched_ids = set()
    enrichment_date = datetime.now(timezone.utc).isoformat()

    for inst in institutions:
        if not isinstance(inst, dict):
            continue

        inst_id = inst.get('id')
        if inst_id not in viaf_mappings:
            continue
        matched_ids.add(inst_id)

        # Fall back to the ID so a record without a name cannot abort the run
        # merely because of a log message.
        label = inst.get('name') or inst_id

        if _has_viaf_identifier(inst):
            print(f"⏭️ Skipping {label} - already has VIAF")
            skipped_count += 1
            continue

        viaf_mapping = viaf_mappings[inst_id]
        _apply_viaf_mapping(inst, viaf_mapping, enrichment_date)
        print(f"✅ Added VIAF {viaf_mapping['identifier_value']} to {label}")
        added_count += 1

    # Surface CSV rows that matched nothing (typically a typo in the ID).
    for inst_id in viaf_mappings:
        if inst_id not in matched_ids:
            print(f"⚠️ No institution with id {inst_id} found in {yaml_path}")

    if added_count:
        # Serialise before opening the file for writing: opening with 'w'
        # truncates it, so a failure during dumping would destroy the data.
        serialized = yaml.dump(
            institutions,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=100,
        )
        with open(yaml_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return added_count, skipped_count


def main() -> int:
    """Run the merge with the default project paths.

    Returns:
        Process exit status: 0 on success (including "nothing to merge"),
        1 when an input file is missing.
    """
    csv_path = Path('data/manual_enrichment/egypt_viaf_mappings.csv')
    yaml_path = Path('data/instances/egypt_institutions_wikidata_viaf.yaml')

    if not csv_path.exists():
        print(f"❌ CSV file not found: {csv_path}")
        return 1

    if not yaml_path.exists():
        print(f"❌ YAML file not found: {yaml_path}")
        return 1

    print(f"📂 Loading VIAF mappings from {csv_path}...")
    viaf_mappings = load_viaf_mappings(csv_path)
    print(f"✅ Loaded {len(viaf_mappings)} VIAF mappings")

    if not viaf_mappings:
        print("⚠️ No VIAF mappings with status=FOUND in CSV")
        print(" Please update the CSV with VIAF IDs and set lookup_status=FOUND")
        return 0

    print(f"\n📂 Merging into {yaml_path}...")
    added_count, skipped_count = merge_viaf_into_institutions(yaml_path, viaf_mappings)

    print(f"\n{'='*60}")
    print("✅ Merge complete!")
    print(f" Added: {added_count}")
    print(f" Skipped: {skipped_count}")
    print(f" Total mappings processed: {len(viaf_mappings)}")
    print(f"{'='*60}")
    return 0


if __name__ == '__main__':
    raise SystemExit(main())