# glam/scripts/enrich_egypt_viaf.py

#!/usr/bin/env python3
"""
VIAF Enrichment for Egyptian Heritage Institutions

Searches VIAF (Virtual International Authority File) for heritage institutions
without VIAF identifiers. VIAF is particularly strong for libraries, archives,
and museums.

VIAF API Documentation: https://www.oclc.org/developer/api/oclc-apis/viaf.en.html

Usage:
    python scripts/enrich_egypt_viaf.py
"""

import yaml
import requests
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote


def search_viaf(institution_name: str, institution_type: str) -> Optional[Tuple[str, str, float]]:
    """
    Search VIAF for an institution by name using AutoSuggest API.

    Every usable suggestion returned by VIAF is scored against
    ``institution_name`` with ``calculate_name_similarity`` and the
    highest-scoring one is returned, provided its score is above the
    acceptance threshold (0.5). Ties keep the suggestion that VIAF listed
    first. Network and JSON errors are reported on stdout and yield None.

    Args:
        institution_name: Name of the institution
        institution_type: Type (LIBRARY, ARCHIVE, MUSEUM, etc.). Accepted for
            interface compatibility; it is not used to filter results.

    Returns:
        Tuple of (viaf_id, viaf_label, confidence_score) or None if no match
    """
    # An empty query cannot identify anything; skip the network round-trip.
    if not institution_name or not institution_name.strip():
        return None

    # VIAF AutoSuggest API endpoint
    base_url = "https://viaf.org/viaf/AutoSuggest"
    params = {'query': institution_name}
    headers = {'Accept': 'application/json'}

    # Keep the try body limited to the calls that can actually fail on I/O
    # or decoding, so unrelated programming errors are not swallowed.
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"  Error querying VIAF: {e}")
        return None
    except ValueError as e:
        # Raised when the response body is not valid JSON.
        print(f"  Error parsing VIAF response: {e}")
        return None

    # Check if we got results (the payload may be missing 'result' or carry null)
    results = data.get('result') if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        return None

    best = None
    for result in results:
        if not isinstance(result, dict):
            continue

        # Both the VIAF ID and the term (preferred name) are required.
        viaf_id = result.get('viafid')
        viaf_label = result.get('term', '')
        if not viaf_id or not viaf_label or not isinstance(viaf_label, str):
            continue

        # Calculate simple confidence score based on name similarity
        confidence = calculate_name_similarity(institution_name, viaf_label)

        print(f"  Found: {viaf_label} (VIAF: {viaf_id}, confidence: {confidence:.3f})")

        # Strict '>' keeps the earlier suggestion when scores tie.
        if best is None or confidence > best[2]:
            best = (viaf_id, viaf_label, confidence)

    # Accept the best candidate only if its confidence is reasonable.
    if best is not None and best[2] > 0.5:
        return best

    return None


def calculate_name_similarity(name1: str, name2: str) -> float:
    """
    Calculate simple similarity score between two names.

    Names are compared case-insensitively after trimming and collapsing
    whitespace. The score is:

    * 0.0 if either name is empty (an empty name carries no evidence and
      would otherwise count as a substring of every other name),
    * 1.0 for an exact match,
    * 0.9 if one name is contained in the other,
    * otherwise the Jaccard overlap of the names' words after removing
      common stop words (0.0 if either side has no remaining words).

    Args:
        name1: First name
        name2: Second name

    Returns:
        Similarity score between 0.0 and 1.0
    """
    # split() + join trims the ends and collapses internal whitespace runs.
    normalized1 = " ".join((name1 or "").casefold().split())
    normalized2 = " ".join((name2 or "").casefold().split())

    # Guard first: '' is a substring of every string, so without this an
    # empty name would score 0.9 against anything.
    if not normalized1 or not normalized2:
        return 0.0

    # Exact match
    if normalized1 == normalized2:
        return 1.0

    # Substring match
    if normalized1 in normalized2 or normalized2 in normalized1:
        return 0.9

    # Word overlap, ignoring common stop words
    stop_words = {'the', 'of', 'in', 'and', 'a', 'an', 'for', 'to', 'university', 'library', 'museum'}
    words1 = set(normalized1.split()) - stop_words
    words2 = set(normalized2.split()) - stop_words

    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union is never empty.
    return len(words1 & words2) / len(words1 | words2)


def enrich_with_viaf(institutions: List[Dict], request_delay: float = 1.0) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with VIAF identifiers.

    Records that already carry a VIAF identifier are passed through
    untouched. For every other record VIAF is searched by name; on a match
    a VIAF identifier is appended and the match details are recorded under
    ``provenance['viaf_enrichment']``. Records are modified in place, and
    the returned list holds the same record objects in input order.

    Keys that are present but null (``identifiers``, ``provenance``,
    ``institution_type``, ``name``) are treated the same as missing keys.

    Args:
        institutions: List of institution records
        request_delay: Seconds to pause after each VIAF search, to rate-limit
            requests. No pause is made for records that need no search.

    Returns:
        Tuple of (enriched_institutions, statistics)
    """
    stats = {
        'total': len(institutions),
        'already_has_viaf': 0,
        'viaf_found': 0,
        'viaf_not_found': 0,
        'by_type': {}
    }

    enriched = []

    for inst in institutions:
        # 'or' also covers keys that exist with a null value in the YAML.
        name = inst.get('name') or ''
        inst_type = inst.get('institution_type') or 'UNKNOWN'
        identifiers = inst.get('identifiers') or []

        # Track by type
        type_stats = stats['by_type'].setdefault(inst_type, {
            'total': 0,
            'already_has_viaf': 0,
            'viaf_found': 0,
            'viaf_not_found': 0
        })
        type_stats['total'] += 1

        # Check if already has VIAF
        has_viaf = any(
            isinstance(i, dict) and i.get('identifier_scheme') == 'VIAF'
            for i in identifiers
        )

        if has_viaf:
            print(f"✓ {name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            type_stats['already_has_viaf'] += 1
            enriched.append(inst)
            continue

        # Search VIAF
        print(f"\n🔍 Searching VIAF for: {name} ({inst_type})")
        result = search_viaf(name, inst_type)

        if result:
            viaf_id, viaf_label, confidence = result

            # Add VIAF identifier
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}'
            })

            # Re-assign: 'identifiers' may be a fresh list if the key was
            # missing or null on the record.
            inst['identifiers'] = identifiers

            # Update provenance, creating the nested dicts when absent/null
            if inst.get('provenance') is None:
                inst['provenance'] = {}

            if inst['provenance'].get('viaf_enrichment') is None:
                inst['provenance']['viaf_enrichment'] = {}

            inst['provenance']['viaf_enrichment'].update({
                # The lookup goes through VIAF's AutoSuggest endpoint, so
                # record that rather than the SRU API.
                'method': 'VIAF AutoSuggest API search',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'viaf_label': viaf_label,
                'confidence_score': confidence,
                'verified': confidence > 0.8
            })

            print(f"✅ Added VIAF identifier: {viaf_id}")
            stats['viaf_found'] += 1
            type_stats['viaf_found'] += 1
        else:
            print("❌ No VIAF identifier found")
            stats['viaf_not_found'] += 1
            type_stats['viaf_not_found'] += 1

        enriched.append(inst)

        # Rate limiting - be respectful to VIAF API
        if request_delay > 0:
            time.sleep(request_delay)

    return enriched, stats


def main():
    """
    Main execution function.

    Loads institution records from the input YAML file, enriches them with
    VIAF identifiers, writes the result to the output YAML file and prints
    overall and per-type coverage statistics.

    Raises:
        ValueError: If the input YAML's top-level value is not a list.
    """
    input_file = 'data/instances/egypt_institutions_wikidata_corrected.yaml'
    output_file = 'data/instances/egypt_institutions_viaf_enriched.yaml'

    print("="*60)
    print("VIAF Enrichment for Egyptian Heritage Institutions")
    print("="*60)

    # Load institutions
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions".
        institutions = yaml.safe_load(f) or []

    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a list of institutions in {input_file}, "
            f"got {type(institutions).__name__}"
        )

    print(f"Loaded {len(institutions)} institutions")

    # Enrich with VIAF
    enriched, stats = enrich_with_viaf(institutions)

    # Save enriched data
    print(f"\n{'='*60}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    overall_total = stats['total']
    overall_covered = stats['viaf_found'] + stats['already_has_viaf']
    # Guard against division by zero when the input list is empty.
    overall_pct = 100 * overall_covered / overall_total if overall_total > 0 else 0

    print(f"\n{'='*60}")
    print("VIAF Enrichment Statistics")
    print(f"{'='*60}")
    print(f"Total institutions: {overall_total}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found: {stats['viaf_found']}")
    print(f"VIAF not found: {stats['viaf_not_found']}")
    print(f"\nNew VIAF coverage: {overall_covered}/{overall_total} "
          f"({overall_pct:.1f}%)")

    print(f"\n{'='*60}")
    print("Breakdown by Institution Type")
    print(f"{'='*60}")
    # Sort on the string form so non-string type keys cannot break ordering.
    for inst_type, type_stats in sorted(stats['by_type'].items(), key=lambda item: str(item[0])):
        total = type_stats['total']
        found = type_stats['viaf_found']
        already = type_stats['already_has_viaf']
        coverage = 100 * (found + already) / total if total > 0 else 0

        print(f"\n{inst_type} ({total} institutions):")
        print(f"  Already had VIAF: {already}")
        print(f"  VIAF found: {found}")
        print(f"  VIAF not found: {type_stats['viaf_not_found']}")
        print(f"  Coverage: {found + already}/{total} ({coverage:.1f}%)")

    print(f"\n{'='*60}")
    print("✅ VIAF enrichment complete!")
    print(f"{'='*60}")


# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()