#!/usr/bin/env python3
"""
Precise matching of LinkedIn custodians to existing YAML files.

Uses multiple matching strategies: a manual table of known slug-to-file
mappings, and GHCID-style filename patterns derived from the custodian name.
A grep-based search of YAML content is also available as a helper.
"""
|
|
import json
import os
import re
import subprocess
from collections import defaultdict
from pathlib import Path
|
|
# Project root. NOTE(review): machine-specific absolute path; the script only
# works as-is on a checkout located here.
BASE_DIR = Path("/Users/kempersc/apps/glam")

# Directory whose top-level *.yaml files are searched for custodian matches.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"

# Input and output JSON live side by side in the same "parsed" directory.
_PARSED_DIR = CUSTODIAN_DIR / "person" / "affiliated" / "parsed"

# Input: JSON document with a top-level "custodians" list (read by main()).
TO_PROFILE_FILE = _PARSED_DIR / "custodians_to_profile.json"

# Output: JSON match report written by main().
OUTPUT_FILE = _PARSED_DIR / "custodian_yaml_matches_v2.json"
|
|
# Manual mapping for known institutions.
# Keys are custodian slugs; values are YAML filenames that are resolved
# relative to CUSTODIAN_DIR when a custodian is matched.
KNOWN_MAPPINGS: dict[str, str] = {
    "rijksmuseum": "NL-NH-AMS-M-RM.yaml",
    "kb-nationale-bibliotheek": "NL-ZH-DHA-L-KB.yaml",
    "van-gogh-museum": "NL-NH-AMS-M-GM-van_gogh_museum.yaml",
    "nationaal-archief": "NL-ZH-DHA-A-NA.yaml",
    "eye-filmmuseum": "NL-NH-AMS-M-EFM-eye_filmmuseum.yaml",
    "stedelijk-museum-amsterdam": "NL-NH-AMS-M-SMA.yaml",
    "amsterdam-museum": "NL-NH-AMS-M-AM.yaml",
    "anne-frank-stichting": "NL-NH-AMS-M-AFH.yaml",
    "kroller-muller-museum": "NL-GE-OTT-M-KMM.yaml",
    "noord-hollands-archief": "NL-NH-HAA-A-NHA.yaml",
    "het-utrechts-archief": "NL-UT-UTR-A-HUA.yaml",
    "gelders-archief": "NL-GE-ARN-A-GA.yaml",
    "collectie-overijssel": "NL-OV-ZWO-A-CO-collectie_overijssel.yaml",
    "nieuwe-instituut": "NL-ZH-ROT-M-HNI.yaml",
    "allard-pierson": "NL-NH-AMS-M-AP.yaml",
}
|
|
|
|
def grep_yaml_files(search_term: str, directory: "Path | None" = None) -> list[str]:
    """Return the YAML files whose content matches ``search_term``.

    Runs ``grep -l -i`` over the ``*.yaml`` files directly inside the search
    directory, so matching is case-insensitive and ``search_term`` is
    interpreted by grep as a regular expression.

    Args:
        search_term: Pattern to look for. An empty term returns ``[]``
            instead of matching every file.
        directory: Directory to search; defaults to ``CUSTODIAN_DIR``.

    Returns:
        Paths (as strings) of the matching files, in sorted filename order.
        An empty list when nothing matches, when there are no YAML files, or
        when grep cannot be run / fails / times out (best-effort lookup).
    """
    if not search_term:
        return []

    root = CUSTODIAN_DIR if directory is None else Path(directory)

    # Sort before truncating so the 5000-file cap always keeps the same files.
    # NOTE(review): the cap presumably keeps the argv below OS limits - files
    # beyond it are silently not searched.
    yaml_files = sorted(root.glob("*.yaml"))[:5000]
    if not yaml_files:
        # Without file arguments grep would read stdin and block until timeout.
        return []

    command = [
        "grep", "-l", "-i",
        "-e", search_term,  # -e: a term starting with "-" is not an option
        "--",               # end of options; everything after is a file
        *(str(path) for path in yaml_files),
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # grep missing, undecodable output, embedded NUL, or timeout.
        return []

    # grep exits 0 only when at least one file matched (1 = none, 2 = error).
    if completed.returncode != 0:
        return []
    return [line for line in completed.stdout.split("\n") if line]
|
|
|
|
def find_yaml_by_pattern(patterns: list[str], directory: "Path | None" = None) -> list[str]:
    """Find files whose names match any of the given glob patterns.

    Args:
        patterns: Glob patterns (e.g. ``"*-RM.yaml"``) evaluated relative to
            the search directory.
        directory: Directory to search; defaults to ``CUSTODIAN_DIR``.

    Returns:
        De-duplicated matching paths as strings, sorted so that repeated runs
        produce the same order. Patterns that glob rejects are skipped.
    """
    root = CUSTODIAN_DIR if directory is None else Path(directory)

    found: set[str] = set()
    for pattern in patterns:
        try:
            # Materialise first so a failing pattern contributes nothing.
            hits = [str(path) for path in root.glob(pattern)]
        except (ValueError, NotImplementedError, OSError):
            # Best-effort: an empty/absolute pattern or an unreadable
            # directory must not abort the remaining patterns.
            continue
        found.update(hits)

    return sorted(found)
|
|
|
|
def match_custodian(custodian: dict) -> dict:
    """Try to find the YAML file that corresponds to a custodian.

    Two strategies are tried in order:

    1. ``known_mapping`` - the custodian's slug is looked up in
       ``KNOWN_MAPPINGS`` and accepted if that file exists (confidence 1.0).
    2. ``filename_pattern`` - the first significant word of the name is turned
       into filename globs. Exactly one hit is a match (confidence 0.8); two
       to five hits are reported as candidates (confidence 0.5); more than
       five hits are treated as too ambiguous and left unmatched.

    Args:
        custodian: Mapping with at least ``'name'`` and ``'slug'``; an
            optional ``'location'`` dict with a ``'country'`` key narrows
            candidates to ``NL-`` files for Dutch institutions.

    Returns:
        A dict with keys ``custodian``, ``match_status`` (``'matched'``,
        ``'multiple_candidates'`` or ``'unmatched'``), ``yaml_file`` (a path
        string, a list of path strings for multiple candidates, or ``None``),
        ``match_method`` and ``confidence``.

    Raises:
        KeyError: If ``custodian`` has no ``'name'`` or ``'slug'``.
    """
    name = custodian['name']
    slug = custodian['slug']

    result = {
        'custodian': custodian,
        'match_status': 'unmatched',
        'yaml_file': None,
        'match_method': None,
        'confidence': 0,
    }

    # Method 1: Check known mappings.
    known_file = KNOWN_MAPPINGS.get(slug)
    if known_file is not None:
        yaml_path = CUSTODIAN_DIR / known_file
        # A mapping to a file that does not exist falls through to method 2.
        if yaml_path.exists():
            result['match_status'] = 'matched'
            result['yaml_file'] = str(yaml_path)
            result['match_method'] = 'known_mapping'
            result['confidence'] = 1.0
            return result

    # Method 2: Search by key name words in YAML filenames.
    # Generic words would match far too many files, so they are skipped;
    # words of three characters or fewer are dropped by the length filter.
    skip_words = {'the', 'van', 'het', 'de', 'voor', 'museum', 'archief', 'stichting', 'institute', 'library'}
    name_words = [w for w in re.split(r'\W+', name.lower()) if len(w) > 3 and w not in skip_words]
    if not name_words:
        return result

    primary_word = name_words[0]
    short_code = primary_word[:3].upper()
    patterns = [
        f"*{primary_word}*.yaml",
        f"*-{short_code}.yaml",  # Short code pattern
        f"*-{short_code}-*.yaml",
    ]

    # Sorted so the candidate list is stable from run to run.
    matches = sorted(find_yaml_by_pattern(patterns))

    # Filter to Dutch files if the institution is Dutch. 'location' may be
    # missing, null or not a dict, so it is checked before use.
    location = custodian.get('location')
    country = location.get('country') if isinstance(location, dict) else None
    if country == 'Netherlands':
        nl_matches = [m for m in matches if os.path.basename(m).startswith('NL-')]
        # Only narrow when something survives; otherwise keep all candidates.
        if nl_matches:
            matches = nl_matches

    if len(matches) == 1:
        result['match_status'] = 'matched'
        result['yaml_file'] = matches[0]
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.8
    elif 1 < len(matches) <= 5:
        result['match_status'] = 'multiple_candidates'
        result['yaml_file'] = list(matches)
        result['match_method'] = 'filename_pattern'
        result['confidence'] = 0.5

    return result
|
|
|
|
def main():
    """Match every custodian in TO_PROFILE_FILE and write a JSON report.

    Loads the ``'custodians'`` list from ``TO_PROFILE_FILE``, runs
    ``match_custodian`` on each entry, prints a summary plus per-category
    listings to stdout, and saves the full results to ``OUTPUT_FILE``.
    """

    def banner(title: str, leading_newline: bool = True) -> None:
        # Section header: a rule, the title, and another rule.
        rule = "=" * 70
        print("\n" + rule if leading_newline else rule)
        print(title)
        print(rule)

    print("Loading custodians to profile...")
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(TO_PROFILE_FILE, encoding="utf-8") as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match\n")

    # Match each custodian and bucket the result by its status.
    results = {
        'matched': [],
        'multiple_candidates': [],
        'unmatched': [],
        'summary': {},
    }
    for custodian in custodians:
        match = match_custodian(custodian)
        status = match['match_status']
        # Any status other than the two positive ones counts as unmatched.
        bucket = status if status in ('matched', 'multiple_candidates') else 'unmatched'
        results[bucket].append(match)

    # Summary
    results['summary'] = {
        'total': len(custodians),
        'matched': len(results['matched']),
        'multiple_candidates': len(results['multiple_candidates']),
        'unmatched': len(results['unmatched']),
    }
    summary = results['summary']

    # Print results
    banner("MATCHING RESULTS", leading_newline=False)
    print(f"Total custodians: {summary['total']}")
    print(f"Definitively matched: {summary['matched']}")
    print(f"Multiple candidates: {summary['multiple_candidates']}")
    print(f"Unmatched: {summary['unmatched']}")

    banner("MATCHED CUSTODIANS (confirmed YAML files exist)")
    for i, m in enumerate(results['matched'], 1):
        c = m['custodian']
        fname = os.path.basename(m['yaml_file']) if m['yaml_file'] else 'N/A'
        print(f"{i:2}. {c['name'][:45]:<45} -> {fname}")

    banner("MULTIPLE CANDIDATES (need manual verification)")
    # Only the first 15 entries, and the first 3 files of each, are shown;
    # the saved JSON contains everything.
    for i, m in enumerate(results['multiple_candidates'][:15], 1):
        c = m['custodian']
        files = [os.path.basename(f) for f in m['yaml_file'][:3]] if isinstance(m['yaml_file'], list) else []
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")

    banner("UNMATCHED CUSTODIANS (need new YAML profiles)")
    for i, m in enumerate(results['unmatched'], 1):
        c = m['custodian']
        loc = c.get('location', {})
        city = loc.get('city', 'Unknown') if isinstance(loc, dict) else 'Unknown'
        # A missing count must not abort the run before results are saved.
        staff = str(c.get('heritage_count', '?'))
        print(f"{i:2}. {c['name'][:45]:<45} | {staff:>3} staff | {city}")

    # Save results. default=str stringifies any value json cannot encode.
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to: {OUTPUT_FILE}")
|
|
|
|
# Run the matcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()