glam/scripts/match_custodians_to_yaml_v2.py

#!/usr/bin/env python3
"""
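Precise matching of LinkedIn custodians to existing YAML files.

Matching strategies, tried in order: a hand-maintained slug-to-file mapping,
then GHCID-style filename patterns derived from the custodian's name.
A grep-based content search helper (grep_yaml_files) is also defined, but
match_custodian does not currently call it.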
"""

import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

# Root of the glam checkout (absolute, machine-specific path).
BASE_DIR = Path("/Users/kempersc/apps/glam")

# Directory whose top-level *.yaml files are the custodian profiles to match against.
CUSTODIAN_DIR = BASE_DIR.joinpath("data", "custodian")

# Input JSON read by main() and the sibling JSON file main() writes its results to.
TO_PROFILE_FILE = CUSTODIAN_DIR.joinpath(
    "person", "affiliated", "parsed", "custodians_to_profile.json"
)
OUTPUT_FILE = TO_PROFILE_FILE.with_name("custodian_yaml_matches_v2.json")

# Manual mapping for known institutions.
# Keys are custodian slugs (compared against custodian['slug'] in
# match_custodian); values are YAML filenames resolved relative to
# CUSTODIAN_DIR. An entry is only honoured when that file exists on disk;
# otherwise match_custodian falls through to its filename-pattern heuristic.
# NOTE(review): the filenames look like GHCID-style identifiers
# (country-region-city-type-abbreviation) — confirm against the GHCID spec.
KNOWN_MAPPINGS: dict[str, str] = {
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-HUA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-M-HNI.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
}


def grep_yaml_files(search_term: str) -> list[str]:
    """Return the custodian YAML files whose content contains ``search_term``.

    Runs the external ``grep`` binary over every ``*.yaml`` file directly
    inside ``CUSTODIAN_DIR``. The match is case-insensitive and literal: the
    term is passed with ``-F -e`` so regex metacharacters and a leading ``-``
    in an institution name are searched for verbatim.

    Args:
        search_term: Text to look for. An empty term returns no files
            (it would otherwise trivially match every file).

    Returns:
        Paths of the matching files, in sorted path order. The search is
        best-effort: if ``grep`` cannot be run or times out for a batch, the
        problem is reported on stderr and that batch contributes no matches.
    """
    if not search_term:
        return []

    # Sorted so the result order is reproducible between runs.
    yaml_files = sorted(str(path) for path in CUSTODIAN_DIR.glob("*.yaml"))

    # Search in batches rather than truncating the file list: keeps each
    # command line well below the OS argument-length limit while still
    # covering every file. With no files the loop body never runs, so grep is
    # never started without file operands (it would then wait on stdin).
    batch_size = 1000
    matches: list[str] = []
    for start in range(0, len(yaml_files), batch_size):
        batch = yaml_files[start:start + batch_size]
        try:
            completed = subprocess.run(
                ["grep", "-l", "-i", "-F", "-e", search_term, "--", *batch],
                stdin=subprocess.DEVNULL,
                capture_output=True,
                text=True,
                timeout=30,
            )
        except (OSError, ValueError, subprocess.SubprocessError) as exc:
            print(f"grep failed for {search_term!r}: {exc}", file=sys.stderr)
            continue

        # grep exit status: 0 = match found, 1 = no match, >1 = error.
        if completed.returncode > 1:
            print(
                f"grep reported an error for {search_term!r}: "
                f"{completed.stderr.strip()}",
                file=sys.stderr,
            )
        # -l prints one matching filename per line; keep whatever was found
        # even if grep also hit an error on some other file in the batch.
        matches.extend(line for line in completed.stdout.splitlines() if line)
    return matches


def find_yaml_by_pattern(patterns: list[str]) -> list[str]:
    """Find YAML files in ``CUSTODIAN_DIR`` matching any of the glob patterns.

    Args:
        patterns: Glob patterns evaluated relative to ``CUSTODIAN_DIR``.

    Returns:
        De-duplicated file paths as strings, sorted so that callers which
        truncate or persist the list get the same result on every run.
        A pattern that cannot be globbed is reported on stderr and skipped.
    """
    found: set[str] = set()
    for pattern in patterns:
        try:
            # Materialise first so a failure mid-iteration contributes nothing
            # for this pattern (glob is lazy and may raise while iterating).
            hits = [str(path) for path in CUSTODIAN_DIR.glob(pattern)]
        except (ValueError, NotImplementedError, OSError) as exc:
            # ValueError / NotImplementedError: malformed or absolute pattern;
            # OSError: directory could not be read.
            print(f"Skipping glob pattern {pattern!r}: {exc}", file=sys.stderr)
            continue
        found.update(hits)
    return sorted(found)


def match_custodian(custodian: dict) -> dict:
    """Try to find the YAML file that corresponds to a custodian.

    Strategies, in order:

    1. ``KNOWN_MAPPINGS`` lookup by slug, accepted only if the mapped file
       exists (confidence 1.0).
    2. Filename patterns built from the first significant word of the
       custodian's name, or of its slug when the name yields no significant
       word. Exactly one hit is a match (confidence 0.8); two to five hits are
       returned as candidates (confidence 0.5); anything else is unmatched.

    Args:
        custodian: Record with at least ``'name'`` and ``'slug'``. An optional
            ``'location'`` dict with ``'country' == 'Netherlands'`` narrows the
            candidates to ``NL-`` files when any exist.

    Returns:
        Dict with keys ``custodian``, ``match_status`` (``'matched'``,
        ``'multiple_candidates'`` or ``'unmatched'``), ``yaml_file`` (a path
        string, a list of path strings for multiple candidates, or ``None``),
        ``match_method`` and ``confidence``.

    Raises:
        KeyError: If ``custodian`` has no ``'name'`` or ``'slug'`` key.
    """
    name = custodian['name']
    slug = custodian['slug']

    result = {
        'custodian': custodian,
        'match_status': 'unmatched',
        'yaml_file': None,
        'match_method': None,
        'confidence': 0
    }

    # Method 1: Check known mappings
    known_file = KNOWN_MAPPINGS.get(slug)
    if known_file is not None:
        yaml_path = CUSTODIAN_DIR / known_file
        if yaml_path.exists():
            result['match_status'] = 'matched'
            result['yaml_file'] = str(yaml_path)
            result['match_method'] = 'known_mapping'
            result['confidence'] = 1.0
            return result

    # Method 2: Filename patterns built from a significant word.
    # Generic words would match far too many files, so they are skipped.
    skip_words = {'the', 'van', 'het', 'de', 'voor', 'museum', 'archief', 'stichting', 'institute', 'library'}

    def significant_words(text: str) -> list[str]:
        return [w for w in re.split(r'\W+', text.lower()) if len(w) > 3 and w not in skip_words]

    # Fall back to the slug when the display name has no usable word
    # (e.g. a very short name); the same filter applies to both sources.
    search_words = significant_words(name) or significant_words(slug)
    if not search_words:
        return result

    primary_word = search_words[0]
    short_code = primary_word[:3].upper()

    # Look for files with this word in name
    patterns = [
        f"*{primary_word}*.yaml",
        f"*-{short_code}.yaml",  # Short code pattern
        f"*-{short_code}-*.yaml"
    ]

    # Sorted so the candidate list stored in the result is reproducible.
    matches = sorted(find_yaml_by_pattern(patterns))

    # Filter to Dutch files if the institution is Dutch. 'location' is not
    # always a dict in the input data, so guard before reading from it.
    location = custodian.get('location')
    if isinstance(location, dict) and location.get('country') == 'Netherlands':
        nl_matches = [m for m in matches if os.path.basename(m).startswith('NL-')]
        if nl_matches:
            matches = nl_matches

    if len(matches) == 1:
        result['match_status'] = 'matched'
        result['yaml_file'] = matches[0]
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.8
    elif 1 < len(matches) <= 5:
        result['match_status'] = 'multiple_candidates'
        result['yaml_file'] = matches
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.5

    return result


def main():
    """Match every custodian in the input JSON and report/save the outcome.

    Reads ``TO_PROFILE_FILE`` (expects a top-level ``'custodians'`` list),
    runs ``match_custodian`` on each entry, prints a summary plus per-bucket
    listings to stdout, and writes the full results as JSON to
    ``OUTPUT_FILE``.
    """
    print("Loading custodians to profile...")
    # Explicit encoding: institution names contain non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(TO_PROFILE_FILE, encoding="utf-8") as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    # Match each custodian
    results = {
        'matched': [],
        'multiple_candidates': [],
        'unmatched': [],
        'summary': {}
    }

    for custodian in custodians:
        match = match_custodian(custodian)
        status = match['match_status']
        # Any status other than the two positive ones counts as unmatched.
        bucket = status if status in ('matched', 'multiple_candidates') else 'unmatched'
        results[bucket].append(match)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched': len(results['matched']),
        'multiple_candidates': len(results['multiple_candidates']),
        'unmatched': len(results['unmatched'])
    }

    # Print results
    print("=" * 70)
    print("MATCHING RESULTS")
    print("=" * 70)
    print(f"Total custodians:        {results['summary']['total']}")
    print(f"Definitively matched:    {results['summary']['matched']}")
    print(f"Multiple candidates:     {results['summary']['multiple_candidates']}")
    print(f"Unmatched:               {results['summary']['unmatched']}")

    print("\n" + "=" * 70)
    print("MATCHED CUSTODIANS (confirmed YAML files exist)")
    print("=" * 70)
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        fname = os.path.basename(m['yaml_file']) if m['yaml_file'] else 'N/A'
        print(f"{i:2}. {c['name'][:45]:<45} -> {fname}")

    print("\n" + "=" * 70)
    print("MULTIPLE CANDIDATES (need manual verification)")
    print("=" * 70)
    shown_limit = 15
    for i, m in enumerate(results['multiple_candidates'][:shown_limit], 1):
        c = m['custodian']
        files = [os.path.basename(f) for f in m['yaml_file'][:3]] if isinstance(m['yaml_file'], list) else []
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")
    # Make the truncation visible instead of silently dropping entries.
    hidden = len(results['multiple_candidates']) - shown_limit
    if hidden > 0:
        print(f"... and {hidden} more (see {OUTPUT_FILE.name})")

    print("\n" + "=" * 70)
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print("=" * 70)
    for i, m in enumerate(results['unmatched'], 1):
        c = m['custodian']
        loc = c.get('location', {})
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        # A record without a staff count must not abort the run before the
        # results below have been written to disk.
        staff = c.get('heritage_count')
        staff_text = '  ?' if staff is None else format(staff, '>3')
        print(f"{i:2}. {c['name'][:45]:<45} | {staff_text} staff | {city}")

    # Save results
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to: {OUTPUT_FILE}")


# Run the matching workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()