# glam/scripts/merge_viaf_mappings.py

#!/usr/bin/env python3
"""
Merge manual VIAF mappings from CSV back into Egypt institutions YAML file.

Usage:
    python scripts/merge_viaf_mappings.py

Reads:
    - data/manual_enrichment/egypt_viaf_mappings.csv (manual VIAF lookups)
    - data/instances/egypt_institutions_wikidata_viaf.yaml (current data)

Writes:
    - data/instances/egypt_institutions_wikidata_viaf.yaml (updated with new VIAF IDs)

CSV columns:
    - institution_id: Full institution URI
    - name: Institution name (for reference)
    - institution_type: Type (for reference)
    - city: Location (for reference)
    - viaf_id: VIAF ID (numeric, e.g., 123456789)
    - viaf_url: Full VIAF URL (optional, will be generated if missing)
    - notes: Additional notes (optional)
    - lookup_status: PENDING, FOUND, NOT_FOUND, UNCERTAIN
      (only rows with lookup_status FOUND and a non-empty viaf_id are merged)
"""

import csv
import yaml
from datetime import datetime, timezone
from pathlib import Path


def load_viaf_mappings(csv_path: Path) -> dict[str, dict[str, str]]:
    """Load confirmed VIAF mappings from the manual-lookup CSV.

    Only rows whose ``lookup_status`` is ``FOUND`` (case-insensitive) and whose
    ``viaf_id`` is non-empty are returned; every other row is ignored. If the
    same ``institution_id`` appears more than once, the last matching row wins.

    Args:
        csv_path: Path to the CSV file. It must contain an ``institution_id``
            column; ``viaf_id``, ``viaf_url``, ``notes`` and ``lookup_status``
            are read when present.

    Returns:
        Dict keyed by institution_id. Each value is a dict with the keys
        ``identifier_scheme`` (always ``'VIAF'``), ``identifier_value``,
        ``identifier_url`` (taken from the CSV or generated from the VIAF ID)
        and ``notes`` (possibly an empty string).

    Raises:
        KeyError: If the CSV has data rows but no ``institution_id`` column.
    """

    def cell(row: dict, column: str, default: str = '') -> str:
        # DictReader fills the missing trailing cells of a short row with
        # None, so a plain ``row.get(column, '')`` can still hand back None.
        value = row.get(column)
        return default if value is None else value.strip()

    mappings: dict[str, dict[str, str]] = {}

    # 'utf-8-sig' transparently drops a leading BOM (common in spreadsheet
    # exports) that would otherwise corrupt the first header name;
    # newline='' is what the csv module requires for correct quoted-newline
    # handling.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Index (not .get) so a missing column still surfaces as KeyError.
            inst_id = (row['institution_id'] or '').strip()
            if not inst_id:
                # A mapping without an institution cannot be merged anywhere.
                continue

            viaf_id = cell(row, 'viaf_id')
            status = cell(row, 'lookup_status', 'PENDING').upper()

            # Only confirmed lookups that actually carry an ID are usable.
            if not viaf_id or status != 'FOUND':
                continue

            viaf_url = cell(row, 'viaf_url') or f"https://viaf.org/viaf/{viaf_id}"

            mappings[inst_id] = {
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': viaf_url,
                'notes': cell(row, 'notes'),
            }

    return mappings


def _has_viaf_identifier(inst: dict) -> bool:
    """Return True if the institution record already lists a VIAF identifier."""
    return any(
        isinstance(identifier, dict)
        and identifier.get('identifier_scheme') == 'VIAF'
        for identifier in inst.get('identifiers') or []
    )


def merge_viaf_into_institutions(yaml_path: Path, viaf_mappings: dict) -> tuple[int, int]:
    """Merge VIAF mappings into the institutions YAML file, in place.

    For every institution whose ``id`` is a key of ``viaf_mappings`` and which
    has no identifier with scheme ``VIAF`` yet, a VIAF identifier entry is
    appended to ``identifiers`` and a ``viaf_enrichment`` record (method,
    UTC timestamp, verified flag, notes) is stored under ``provenance``.
    Institutions that already carry a VIAF identifier are left untouched and
    counted as skipped.

    The file is rewritten only when at least one identifier was added, so a
    no-op run does not reformat the YAML.

    Args:
        yaml_path: Path to the institutions YAML file (a top-level list of
            institution mappings).
        viaf_mappings: Dict keyed by institution id; each value provides
            ``identifier_scheme``, ``identifier_value``, ``identifier_url``
            and optionally ``notes``.

    Returns:
        (added_count, skipped_count) tuple

    Raises:
        ValueError: If the YAML document is not a list at the top level.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions".
        institutions = yaml.safe_load(f) or []

    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a top-level list of institutions in {yaml_path}, "
            f"got {type(institutions).__name__}"
        )

    added_count = 0
    skipped_count = 0
    # One timestamp for the whole run so all records of a merge agree.
    enrichment_date = datetime.now(timezone.utc).isoformat()

    for inst in institutions:
        if not isinstance(inst, dict):
            # Null or scalar list entries cannot carry identifiers; they are
            # kept as-is in the output.
            continue

        inst_id = inst.get('id')
        viaf_mapping = viaf_mappings.get(inst_id)
        if viaf_mapping is None:
            continue

        # Fall back to the id so a record without a name cannot abort the run.
        label = inst.get('name') or inst_id

        if _has_viaf_identifier(inst):
            print(f"⏭️  Skipping {label} - already has VIAF")
            skipped_count += 1
            continue

        # Add VIAF identifier (also replaces a null/empty 'identifiers' value).
        if not inst.get('identifiers'):
            inst['identifiers'] = []

        inst['identifiers'].append({
            'identifier_scheme': viaf_mapping['identifier_scheme'],
            'identifier_value': viaf_mapping['identifier_value'],
            'identifier_url': viaf_mapping['identifier_url'],
        })

        # Add enrichment metadata to provenance
        if not inst.get('provenance'):
            inst['provenance'] = {}

        inst['provenance']['viaf_enrichment'] = {
            'method': 'Manual VIAF web lookup',
            'enrichment_date': enrichment_date,
            'verified': True,
            # 'notes' is usually present but empty, so use ``or`` rather than
            # a .get() default to make the fallback text actually apply.
            'notes': viaf_mapping.get('notes') or 'Manual lookup via VIAF website',
        }

        print(f"✅ Added VIAF {viaf_mapping['identifier_value']} to {label}")
        added_count += 1

    if added_count:
        # Serialise fully before opening the file for writing, so a dump
        # failure cannot leave a truncated YAML file behind.
        serialized = yaml.dump(institutions,
                               allow_unicode=True,
                               default_flow_style=False,
                               sort_keys=False,
                               width=100)
        with open(yaml_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return added_count, skipped_count


def main():
    """Run the VIAF merge against the project's default data files.

    Paths are relative to the current working directory. The function prints
    progress to stdout and returns early (without raising) when either input
    file is missing or when the CSV contains no usable mappings.
    """
    csv_path = Path('data/manual_enrichment/egypt_viaf_mappings.csv')
    yaml_path = Path('data/instances/egypt_institutions_wikidata_viaf.yaml')

    # Both inputs are required; report the first one that is missing and stop.
    for label, required in (('CSV', csv_path), ('YAML', yaml_path)):
        if not required.exists():
            print(f"❌ {label} file not found: {required}")
            return

    print(f"📂 Loading VIAF mappings from {csv_path}...")
    viaf_mappings = load_viaf_mappings(csv_path)
    total = len(viaf_mappings)
    print(f"✅ Loaded {total} VIAF mappings")

    if total == 0:
        # Nothing marked FOUND yet, so there is nothing to merge.
        print("⚠️  No VIAF mappings with status=FOUND in CSV")
        print("   Please update the CSV with VIAF IDs and set lookup_status=FOUND")
        return

    print(f"\n📂 Merging into {yaml_path}...")
    added_count, skipped_count = merge_viaf_into_institutions(yaml_path, viaf_mappings)

    # Summary block framed by a 60-character rule.
    rule = '=' * 60
    summary = (
        f"\n{rule}",
        f"✅ Merge complete!",
        f"   Added: {added_count}",
        f"   Skipped: {skipped_count}",
        f"   Total mappings processed: {total}",
        f"{rule}",
    )
    for line in summary:
        print(line)


# Run the merge only when this file is executed as a script, so importing the
# module (e.g. to reuse the helper functions) has no side effects.
if __name__ == '__main__':
    main()