#!/usr/bin/env python3
"""
Precise matching of LinkedIn custodians to existing YAML files.

Matching strategies used by ``match_custodian``:

1. a manual slug -> filename table (``KNOWN_MAPPINGS``);
2. GHCID-style filename patterns derived from the custodian name.

``grep_yaml_files`` additionally offers a content search over the YAML
files; it is available as a helper but is not called by ``main``.
"""

import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches_v2.json"

# Manual mapping for known institutions
KNOWN_MAPPINGS = {
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-HUA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-M-HNI.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
}

# Words too generic to identify an institution by filename.
_SKIP_WORDS = frozenset({
    'the', 'van', 'het', 'de', 'voor', 'museum', 'archief',
    'stichting', 'institute', 'library',
})

# Number of file operands passed to a single grep invocation; keeps the
# command line bounded while still covering every file via batching.
_GREP_BATCH_SIZE = 5000
_GREP_TIMEOUT_SECONDS = 30

# More filename candidates than this is treated as "no useful match".
_MAX_CANDIDATES = 5


def _warn(message: str) -> None:
    """Print a non-fatal warning to stderr."""
    print(f"Warning: {message}", file=sys.stderr)


def grep_yaml_files(search_term: str) -> list[str]:
    """Use grep to find YAML files containing search term.

    Searches the ``*.yaml`` files directly inside ``CUSTODIAN_DIR``
    case-insensitively (``grep -l -i``).  The term is interpreted by grep
    as a regular expression.

    Args:
        search_term: Pattern to look for.  An empty term returns ``[]``
            rather than matching every file.

    Returns:
        Paths (as strings) of the files that contain the term, or ``[]``
        when nothing matches or grep cannot be run.  Failures are reported
        on stderr instead of being raised.
    """
    if not search_term:
        return []

    yaml_files = sorted(str(path) for path in CUSTODIAN_DIR.glob("*.yaml"))
    # With no file operands grep would read from stdin and block until the
    # timeout expires, so bail out early.
    if not yaml_files:
        return []

    found: list[str] = []
    for start in range(0, len(yaml_files), _GREP_BATCH_SIZE):
        batch = yaml_files[start:start + _GREP_BATCH_SIZE]
        try:
            completed = subprocess.run(
                # "-e" keeps a term starting with "-" from being parsed
                # as a grep option.
                ["grep", "-l", "-i", "-e", search_term, *batch],
                capture_output=True,
                text=True,
                timeout=_GREP_TIMEOUT_SECONDS,
                check=False,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            _warn(f"grep failed for {search_term!r}: {exc}")
            continue
        # Exit status 0 = matches, 1 = no matches, >1 = error.
        if completed.returncode > 1:
            _warn(f"grep reported an error for {search_term!r}: "
                  f"{completed.stderr.strip()}")
        found.extend(line for line in completed.stdout.splitlines() if line)
    return found


def find_yaml_by_pattern(patterns: list[str]) -> list[str]:
    """Find YAML files matching filename patterns.

    Args:
        patterns: Glob patterns evaluated relative to ``CUSTODIAN_DIR``.

    Returns:
        De-duplicated matching paths as strings, sorted so that repeated
        runs give the same order.  A pattern that cannot be evaluated is
        skipped with a warning.
    """
    matches: set[str] = set()
    for pattern in patterns:
        try:
            matches.update(str(path) for path in CUSTODIAN_DIR.glob(pattern))
        except (OSError, ValueError, NotImplementedError) as exc:
            _warn(f"could not evaluate pattern {pattern!r}: {exc}")
    return sorted(matches)


def match_custodian(custodian: dict) -> dict:
    """Try to find matching YAML file for a custodian.

    Args:
        custodian: Mapping with at least ``'name'`` and ``'slug'`` keys.
            An optional ``'location'`` dict with a ``'country'`` key is
            used to prefer ``NL-`` files for Dutch institutions.

    Returns:
        A dict with keys ``custodian``, ``match_status`` (``'matched'``,
        ``'multiple_candidates'`` or ``'unmatched'``), ``yaml_file``
        (a path string, a list of path strings, or ``None``),
        ``match_method`` and ``confidence``.

    Raises:
        KeyError: If ``custodian`` lacks ``'name'`` or ``'slug'``.
    """
    name = custodian['name']
    slug = custodian['slug']

    result = {
        'custodian': custodian,
        'match_status': 'unmatched',
        'yaml_file': None,
        'match_method': None,
        'confidence': 0,
    }

    # Method 1: Check known mappings
    known_file = KNOWN_MAPPINGS.get(slug)
    if known_file is not None:
        yaml_path = CUSTODIAN_DIR / known_file
        if yaml_path.exists():
            result['match_status'] = 'matched'
            result['yaml_file'] = str(yaml_path)
            result['match_method'] = 'known_mapping'
            result['confidence'] = 1.0
            return result

    # Method 2: Search by the first distinctive word of the name in
    # filenames (short and generic words are skipped).
    name_words = [
        word for word in re.split(r'\W+', name.lower())
        if len(word) > 3 and word not in _SKIP_WORDS
    ]
    if not name_words:
        return result

    primary_word = name_words[0]
    short_code = primary_word[:3].upper()
    patterns = [
        f"*{primary_word}*.yaml",
        f"*-{short_code}.yaml",  # Short code pattern
        f"*-{short_code}-*.yaml",
    ]
    matches = find_yaml_by_pattern(patterns)

    # Filter to Dutch files if the institution is located in the
    # Netherlands; 'location' may be missing or not a dict.
    location = custodian.get('location')
    if isinstance(location, dict) and location.get('country') == 'Netherlands':
        nl_matches = [m for m in matches if os.path.basename(m).startswith('NL-')]
        if nl_matches:
            matches = nl_matches

    if len(matches) == 1:
        result['match_status'] = 'matched'
        result['yaml_file'] = matches[0]
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.8
    elif 1 < len(matches) <= _MAX_CANDIDATES:
        result['match_status'] = 'multiple_candidates'
        result['yaml_file'] = matches
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.5

    return result


def _print_header(title: str, leading_blank: bool = True) -> None:
    """Print a section title framed by two 70-character rules."""
    rule = "=" * 70
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _print_report(results: dict) -> None:
    """Print the summary and the per-category listings to stdout."""
    summary = results['summary']
    _print_header("MATCHING RESULTS", leading_blank=False)
    print(f"Total custodians: {summary['total']}")
    print(f"Definitively matched: {summary['matched']}")
    print(f"Multiple candidates: {summary['multiple_candidates']}")
    print(f"Unmatched: {summary['unmatched']}")

    _print_header("MATCHED CUSTODIANS (confirmed YAML files exist)")
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        fname = os.path.basename(m['yaml_file']) if m['yaml_file'] else 'N/A'
        print(f"{i:2}. {c['name'][:45]:<45} -> {fname}")

    _print_header("MULTIPLE CANDIDATES (need manual verification)")
    for i, m in enumerate(results['multiple_candidates'][:15], 1):
        c = m['custodian']
        candidates = m['yaml_file']
        files = (
            [os.path.basename(f) for f in candidates[:3]]
            if isinstance(candidates, list) else []
        )
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")

    _print_header("UNMATCHED CUSTODIANS (need new YAML profiles)")
    for i, m in enumerate(results['unmatched'], 1):
        c = m['custodian']
        loc = c.get('location', {})
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        # Tolerate a missing count so a single incomplete record cannot
        # abort the run before the results are written to disk.
        staff = str(c.get('heritage_count', '?'))
        print(f"{i:2}. {c['name'][:45]:<45} | {staff:>3} staff | {city}")


def main():
    """Match every custodian in TO_PROFILE_FILE and write OUTPUT_FILE.

    Reads the ``'custodians'`` list from the input JSON, classifies each
    entry with ``match_custodian``, prints a report and saves the grouped
    results as JSON.
    """
    print("Loading custodians to profile...")
    with open(TO_PROFILE_FILE, encoding="utf-8") as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    # Match each custodian
    results = {
        'matched': [],
        'multiple_candidates': [],
        'unmatched': [],
        'summary': {},
    }

    for custodian in custodians:
        match = match_custodian(custodian)
        status = match['match_status']
        if status == 'matched':
            results['matched'].append(match)
        elif status == 'multiple_candidates':
            results['multiple_candidates'].append(match)
        else:
            results['unmatched'].append(match)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched': len(results['matched']),
        'multiple_candidates': len(results['multiple_candidates']),
        'unmatched': len(results['unmatched']),
    }

    # Print results
    _print_report(results)

    # Save results
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)

    print(f"\nResults saved to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()