# glam/scripts/match_custodians_to_yaml.py
# 2025-12-15 22:31:41 +01:00
#
# 189 lines
# 6.5 KiB
# Python
#!/usr/bin/env python3
"""
Match custodians from LinkedIn staff parsing against existing custodian YAML files.
This script identifies which of the 132 custodians to profile already have
YAML files in data/custodian/ and which are missing.
"""
import json
import os
import re
from pathlib import Path
from typing import Optional
import yaml
# Paths
# NOTE(review): hard-coded absolute path to the repo root — this only works on
# the original author's machine; consider deriving it from __file__ instead.
BASE_DIR = Path("/Users/kempersc/apps/glam")
# Directory holding one YAML file per custodian institution.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
# Input: custodians extracted from LinkedIn staff parsing (JSON).
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
# Output: matching report produced by main() (JSON).
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches.json"
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical lowercase form for matching.

    Lowercases the input, strips a fixed set of common Dutch/English
    stop-words (the, de, het, stichting, museum, archief, bibliotheek),
    drops punctuation, and collapses runs of whitespace.
    """
    lowered = name.lower()
    # Whole-word removal only: \b keeps e.g. "rijksmuseum" intact.
    without_stopwords = re.sub(
        r'\b(the|de|het|stichting|museum|archief|bibliotheek)\b', '', lowered
    )
    # Keep word characters and spaces; everything else is punctuation.
    alnum_only = re.sub(r'[^\w\s]', '', without_stopwords)
    return re.sub(r'\s+', ' ', alnum_only).strip()
def search_yaml_files(search_terms: list[str], yaml_files: dict[str, dict]) -> list[dict]:
    """Scan pre-loaded YAML documents for institution-name matches.

    Args:
        search_terms: Candidate names/keywords, matched case-insensitively
            as substrings of the document's name-like fields.
        yaml_files: Mapping of file path -> parsed YAML document.

    Returns:
        One match record per document containing at least one term; only
        the first matching term is recorded for each document.
    """

    def collect_searchable(doc: dict) -> str:
        # Gather every name-like field into a single lowercase haystack.
        pieces = []
        entry = doc.get('original_entry')
        if isinstance(entry, dict):
            pieces.append(str(entry.get('name', '')))
            pieces.append(str(entry.get('instelling', '')))
        cust = doc.get('custodian_name')
        if isinstance(cust, dict):
            pieces.append(str(cust.get('consensus_name', '')))
            pieces.append(str(cust.get('emic_name', '')))
        elif isinstance(cust, str):
            pieces.append(cust)
        if 'name' in doc:
            pieces.append(str(doc['name']))
        status = doc.get('legal_status')
        if isinstance(status, dict):
            pieces.append(str(status.get('legal_name', '')))
        return ' '.join(pieces).lower()

    hits = []
    for path, doc in yaml_files.items():
        if not doc:
            continue  # skip empty / unparsed documents
        haystack = collect_searchable(doc)
        # First term that appears in the haystack wins; remaining terms
        # are not checked for this document.
        hit = next((t for t in search_terms if t.lower() in haystack), None)
        if hit is None:
            continue
        ghcid_field = doc.get('ghcid')
        cust_field = doc.get('custodian_name')
        hits.append({
            'filepath': path,
            'filename': os.path.basename(path),
            'matched_term': hit,
            'ghcid': ghcid_field.get('ghcid_current') if isinstance(ghcid_field, dict) else None,
            'custodian_name': cust_field.get('consensus_name') if isinstance(cust_field, dict) else str(doc.get('custodian_name', ''))[:50]
        })
    return hits
def main() -> dict:
    """Match custodians-to-profile against existing custodian YAML files.

    Reads TO_PROFILE_FILE, matches each custodian to YAML files in
    CUSTODIAN_DIR by filename glob patterns (derived from the custodian's
    slug and name words), prints a summary, and writes the results to
    OUTPUT_FILE.

    Returns:
        The results dict (matched / unmatched lists plus totals).

    NOTE(review): normalize_name() and search_yaml_files() defined above are
    never called here — matching is done purely on filenames, not on YAML
    content as the module docstring might suggest.
    """
    print("Loading custodians to profile...")
    with open(TO_PROFILE_FILE) as f:
        to_profile_data = json.load(f)
    # assumes the JSON has a top-level 'custodians' list of dicts with at
    # least 'name' and 'slug' keys — TODO confirm against the producer script
    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match")
    # Get list of YAML files (top level of CUSTODIAN_DIR only; glob is
    # non-recursive, so files in subdirectories are not considered).
    print("\nScanning custodian YAML files...")
    yaml_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} YAML files")
    # Load a sample of YAML files for quick matching
    # For efficiency, we'll search by filename patterns first
    print("\nMatching custodians to existing YAML files...")
    results = {
        'matched': [],
        'unmatched': [],
        'total_to_profile': len(custodians),
        'total_yaml_files': len(yaml_files)
    }
    # Create search patterns for each custodian
    for custodian in custodians:
        name = custodian['name']
        slug = custodian['slug']
        # Generate search patterns
        patterns = []
        # Pattern from slug (e.g., "rijksmuseum" -> "*rijksmuseum*")
        # NOTE(review): stripping '-'/'_' from the slug means a hyphenated
        # filename like "van-gogh.yaml" will NOT match "*vangogh*" — verify
        # the filename convention in CUSTODIAN_DIR.
        slug_clean = slug.replace('-', '').replace('_', '')
        patterns.append(f"*{slug_clean}*")
        # Pattern from name words (words of 4+ characters only)
        name_words = [w.lower() for w in re.split(r'\W+', name) if len(w) > 3]
        for word in name_words[:3]:  # First 3 significant words
            patterns.append(f"*{word}*")
        # Search for matching files
        matched_files = []
        for pattern in patterns:
            try:
                matches = list(CUSTODIAN_DIR.glob(pattern + ".yaml"))
                matched_files.extend([str(m) for m in matches])
            except Exception:
                # Best-effort: malformed glob patterns (e.g. stray brackets
                # from the name) are silently skipped.
                pass
        # Deduplicate (set() loses ordering; file order is not significant here)
        matched_files = list(set(matched_files))
        if matched_files:
            results['matched'].append({
                'custodian': custodian,
                'yaml_files': matched_files[:5],  # Limit to 5 matches
                'match_count': len(matched_files)
            })
        else:
            results['unmatched'].append(custodian)
    # Summary
    print(f"\n{'='*60}")
    print("MATCHING RESULTS")
    print(f"{'='*60}")
    print(f"Total custodians to profile: {len(custodians)}")
    print(f"Matched to existing YAML: {len(results['matched'])}")
    print(f"No match found: {len(results['unmatched'])}")
    # Show matched custodians (first 30 only)
    print(f"\n{'='*60}")
    print("MATCHED CUSTODIANS (have existing YAML files)")
    print(f"{'='*60}")
    for i, match in enumerate(results['matched'][:30], 1):
        c = match['custodian']
        files = [os.path.basename(f) for f in match['yaml_files'][:2]]
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")
    if len(results['matched']) > 30:
        print(f" ... and {len(results['matched']) - 30} more")
    # Show unmatched custodians (first 30 only)
    print(f"\n{'='*60}")
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print(f"{'='*60}")
    # assumes each custodian dict carries 'heritage_count' and an optional
    # nested 'location.city' — TODO confirm against the producer script
    for i, c in enumerate(results['unmatched'][:30], 1):
        print(f"{i:2}. {c['name'][:50]:<50} | {c['heritage_count']:>3} staff | {c.get('location', {}).get('city', 'Unknown')}")
    if len(results['unmatched']) > 30:
        print(f" ... and {len(results['unmatched']) - 30} more")
    # Save results
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {OUTPUT_FILE}")
    return results
# Standard script entry point.
if __name__ == "__main__":
    main()