glam/scripts/match_custodians_to_yaml_v2.py
2025-12-15 22:31:41 +01:00

205 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Precise matching of LinkedIn custodians to existing YAML files.
Uses multiple matching strategies: direct name search in content, GHCID patterns.
"""
import json
import os
import re
import subprocess
from pathlib import Path
from collections import defaultdict
# Root of the GLAM checkout. The default is the original developer-specific
# absolute path; set the GLAM_BASE_DIR environment variable to run the script
# against a checkout that lives somewhere else.
BASE_DIR = Path(os.environ.get("GLAM_BASE_DIR") or "/Users/kempersc/apps/glam")
# Directory whose top-level *.yaml files are the match candidates.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
# Input: JSON document with a top-level "custodians" list (read by main()).
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
# Output: JSON file that main() writes the matching results to.
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches_v2.json"
# Hand-maintained overrides that are consulted before any heuristic matching.
# Key: custodian slug. Value: YAML filename located directly in CUSTODIAN_DIR.
KNOWN_MAPPINGS: dict[str, str] = {
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-HUA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-M-HNI.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
}
def grep_yaml_files(
    search_term: str,
    directory: "Path | None" = None,
    max_files: int = 5000,
) -> list[str]:
    """Return the YAML files whose content contains ``search_term``.

    The search is delegated to the external ``grep`` binary and is
    case-insensitive. The term is matched as a literal string (``-F``), so
    institution names containing ``.``, ``(`` or ``+`` are not misread as
    regular expressions, and it is passed via ``-e`` so a term starting with
    ``-`` is not taken for a command-line option.

    Args:
        search_term: Text to look for. An empty term yields no matches.
        directory: Directory whose top-level ``*.yaml`` files are searched.
            Defaults to ``CUSTODIAN_DIR``.
        max_files: Upper bound on the number of files handed to the single
            ``grep`` invocation. Files are taken in sorted order so the
            selection is repeatable between runs.

    Returns:
        Matching file paths as strings, in the order ``grep`` reports them.
        An empty list when nothing matches or when the search could not be
        run (no ``grep`` binary, timeout, grep error).
    """
    if not search_term:
        return []

    root = CUSTODIAN_DIR if directory is None else Path(directory)
    candidates = sorted(str(path) for path in root.glob("*.yaml"))[:max_files]
    if not candidates:
        # Without file operands grep would read stdin and block until timeout.
        return []

    command = ["grep", "-l", "-i", "-F", "-e", search_term, "--", *candidates]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            stdin=subprocess.DEVNULL,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # Deliberately best-effort: a missing grep binary, an undecodable
        # result or a timeout is reported to the caller as "no matches".
        return []

    if completed.returncode != 0:
        # Exit status 1 means "no match"; anything higher is a grep error.
        return []
    return [line for line in completed.stdout.splitlines() if line]
def find_yaml_by_pattern(patterns: list[str], directory: "Path | None" = None) -> list[str]:
    """Find YAML files whose filename matches any of the glob ``patterns``.

    Args:
        patterns: Glob patterns relative to the search directory. Empty
            patterns are ignored.
        directory: Directory to glob in. Defaults to ``CUSTODIAN_DIR``.

    Returns:
        De-duplicated matching paths as strings, sorted so that callers that
        slice or serialise the result get the same output on every run.
    """
    root = CUSTODIAN_DIR if directory is None else Path(directory)
    unique_paths: set[str] = set()
    for pattern in patterns:
        if not pattern:
            continue
        try:
            hits = [str(path) for path in root.glob(pattern)]
        except (ValueError, NotImplementedError, OSError):
            # Malformed or absolute pattern, or an unreadable directory:
            # skip this pattern and keep going with the others.
            continue
        unique_paths.update(hits)
    return sorted(unique_paths)
def match_custodian(custodian: dict) -> dict:
    """Try to find the existing YAML file that describes ``custodian``.

    Strategies, in order:

    1. ``KNOWN_MAPPINGS`` lookup by slug (confidence 1.0) if the mapped file
       exists in ``CUSTODIAN_DIR``.
    2. Filename globbing on a keyword taken from the custodian name, or from
       the slug when the name yields no usable keyword. Exactly one hit is a
       match (confidence 0.8); two to five hits are reported as
       ``multiple_candidates`` (confidence 0.5); more than five hits are
       treated as too ambiguous and leave the custodian unmatched.

    Args:
        custodian: Mapping with at least ``name`` and ``slug``. An optional
            ``location`` dict with ``country == 'Netherlands'`` narrows the
            candidates to files whose name starts with ``NL-``.

    Returns:
        Dict with keys ``custodian`` (the input), ``match_status``
        (``'matched'``, ``'multiple_candidates'`` or ``'unmatched'``),
        ``yaml_file`` (path string, list of path strings, or ``None``),
        ``match_method`` and ``confidence``.

    Raises:
        KeyError: If ``name`` or ``slug`` is missing from ``custodian``.
    """
    name = custodian['name']
    slug = custodian['slug']
    result = {
        'custodian': custodian,
        'match_status': 'unmatched',
        'yaml_file': None,
        'match_method': None,
        'confidence': 0
    }

    # Method 1: manually curated slug -> filename mapping.
    known_file = KNOWN_MAPPINGS.get(slug)
    if known_file is not None:
        yaml_path = CUSTODIAN_DIR / known_file
        if yaml_path.exists():
            result['match_status'] = 'matched'
            result['yaml_file'] = str(yaml_path)
            result['match_method'] = 'known_mapping'
            result['confidence'] = 1.0
            return result

    # Method 2: search YAML *filenames* for a distinctive keyword.
    # Generic words would match far too many files, so they are skipped.
    skip_words = {'the', 'van', 'het', 'de', 'voor', 'museum', 'archief', 'stichting', 'institute', 'library'}
    name_words = [w for w in re.split(r'\W+', name.lower()) if len(w) > 3 and w not in skip_words]
    if name_words:
        primary_word = name_words[0]
    else:
        # Fall back to the slug when the name has no usable keyword (e.g.
        # only short or generic words). Tokenising on \W+ also keeps glob
        # metacharacters out of the patterns built below.
        slug_words = [w for w in re.split(r'\W+', slug.lower()) if w and w not in skip_words]
        primary_word = slug_words[0] if slug_words else ''
    if not primary_word:
        return result

    short_code = primary_word[:3].upper()
    patterns = [
        f"*{primary_word}*.yaml",
        f"*-{short_code}.yaml",  # Short code at the end of the filename
        f"*-{short_code}-*.yaml",  # Short code followed by a name suffix
    ]
    # Sorted so the candidate list (and its [:5] slice) is repeatable.
    matches = sorted(find_yaml_by_pattern(patterns))

    # Prefer Dutch files when the custodian is located in the Netherlands.
    # 'location' is not guaranteed to be a dict, so check before reading it.
    location = custodian.get('location')
    if isinstance(location, dict) and location.get('country') == 'Netherlands':
        nl_matches = [m for m in matches if os.path.basename(m).startswith('NL-')]
        if nl_matches:
            matches = nl_matches

    if len(matches) == 1:
        result['match_status'] = 'matched'
        result['yaml_file'] = matches[0]
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.8
    elif 1 < len(matches) <= 5:
        result['match_status'] = 'multiple_candidates'
        result['yaml_file'] = matches[:5]
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.5
    return result
def main():
    """Match every custodian in TO_PROFILE_FILE and report the outcome.

    Reads the ``custodians`` list from TO_PROFILE_FILE, runs
    ``match_custodian`` on each entry, prints a console report grouped by
    match status, and writes the full results (including a ``summary``
    section with the counts) to OUTPUT_FILE as JSON.
    """
    print("Loading custodians to profile...")
    # Explicit UTF-8: institution names may contain non-ASCII characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(TO_PROFILE_FILE, encoding="utf-8") as f:
        to_profile_data = json.load(f)
    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    # Match each custodian and file the result under its status.
    results = {
        'matched': [],
        'multiple_candidates': [],
        'unmatched': [],
        'summary': {}
    }
    for custodian in custodians:
        match = match_custodian(custodian)
        status = match['match_status']
        bucket = status if status in ('matched', 'multiple_candidates') else 'unmatched'
        results[bucket].append(match)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched': len(results['matched']),
        'multiple_candidates': len(results['multiple_candidates']),
        'unmatched': len(results['unmatched'])
    }

    # Print results
    rule = "=" * 70
    print(rule)
    print("MATCHING RESULTS")
    print(rule)
    print(f"Total custodians: {results['summary']['total']}")
    print(f"Definitively matched: {results['summary']['matched']}")
    print(f"Multiple candidates: {results['summary']['multiple_candidates']}")
    print(f"Unmatched: {results['summary']['unmatched']}")

    print("\n" + rule)
    print("MATCHED CUSTODIANS (confirmed YAML files exist)")
    print(rule)
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        fname = os.path.basename(m['yaml_file']) if m['yaml_file'] else 'N/A'
        print(f"{i:2}. {c['name'][:45]:<45} -> {fname}")

    print("\n" + rule)
    print("MULTIPLE CANDIDATES (need manual verification)")
    print(rule)
    shown_limit = 15  # keep the console report short; the JSON has them all
    for i, m in enumerate(results['multiple_candidates'][:shown_limit], 1):
        c = m['custodian']
        files = [os.path.basename(f) for f in m['yaml_file'][:3]] if isinstance(m['yaml_file'], list) else []
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")
    hidden = len(results['multiple_candidates']) - shown_limit
    if hidden > 0:
        print(f"... and {hidden} more (see {OUTPUT_FILE})")

    print("\n" + rule)
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print(rule)
    for i, m in enumerate(results['unmatched'], 1):
        c = m['custodian']
        loc = c.get('location', {})
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        # A missing or null staff count must not abort the whole report;
        # str() keeps the right-aligned width-3 layout for integers.
        staff = c.get('heritage_count')
        staff_text = '?' if staff is None else str(staff)
        print(f"{i:2}. {c['name'][:45]:<45} | {staff_text:>3} staff | {city}")

    # Save results
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to: {OUTPUT_FILE}")
# Run the matcher only when this file is executed as a script, so importing
# the module (e.g. to reuse match_custodian) has no side effects.
if __name__ == "__main__":
    main()