# glam/scripts/match_custodians_to_yaml.py
# 2025-12-15 22:31:41 +01:00
#
# 189 lines
# 6.5 KiB
# Python
#!/usr/bin/env python3
"""
Match custodians from LinkedIn staff parsing against existing custodian YAML files.
This script identifies which of the 132 custodians to profile already have
YAML files in data/custodian/ and which are missing.
"""
import json
import os
import re
from pathlib import Path
from typing import Optional
import yaml
# Paths
# NOTE(review): hard-coded absolute path to the repo root — this only works on
# the original author's machine; consider deriving it from __file__ instead.
BASE_DIR = Path("/Users/kempersc/apps/glam")
# Directory holding one YAML file per custodian institution.
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
# Input: custodians extracted from LinkedIn staff parsing (JSON).
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
# Output: matching report produced by main() (JSON).
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches.json"
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical lowercase form for matching.

    Lowercases the input, strips a fixed set of common Dutch/English
    stop-words (the, de, het, stichting, museum, archief, bibliotheek),
    drops punctuation, and collapses runs of whitespace.
    """
    lowered = name.lower()
    # Whole-word removal only: \b keeps e.g. "rijksmuseum" intact.
    without_stopwords = re.sub(
        r'\b(the|de|het|stichting|museum|archief|bibliotheek)\b', '', lowered
    )
    # Keep word characters and spaces; everything else is punctuation.
    alnum_only = re.sub(r'[^\w\s]', '', without_stopwords)
    return re.sub(r'\s+', ' ', alnum_only).strip()
def search_yaml_files(search_terms: list[str], yaml_files: dict[str, dict]) -> list[dict]:
    """Scan pre-loaded YAML documents for institution-name matches.

    Args:
        search_terms: Candidate names/keywords, matched case-insensitively
            as substrings of the document's name-like fields.
        yaml_files: Mapping of file path -> parsed YAML document.

    Returns:
        One match record per document containing at least one term; only
        the first matching term is recorded for each document.
    """

    def collect_searchable(doc: dict) -> str:
        # Gather every name-like field into a single lowercase haystack.
        pieces = []
        entry = doc.get('original_entry')
        if isinstance(entry, dict):
            pieces.append(str(entry.get('name', '')))
            pieces.append(str(entry.get('instelling', '')))
        cust = doc.get('custodian_name')
        if isinstance(cust, dict):
            pieces.append(str(cust.get('consensus_name', '')))
            pieces.append(str(cust.get('emic_name', '')))
        elif isinstance(cust, str):
            pieces.append(cust)
        if 'name' in doc:
            pieces.append(str(doc['name']))
        status = doc.get('legal_status')
        if isinstance(status, dict):
            pieces.append(str(status.get('legal_name', '')))
        return ' '.join(pieces).lower()

    hits = []
    for path, doc in yaml_files.items():
        if not doc:
            continue  # skip empty / unparsed documents
        haystack = collect_searchable(doc)
        # First term that appears in the haystack wins; remaining terms
        # are not checked for this document.
        hit = next((t for t in search_terms if t.lower() in haystack), None)
        if hit is None:
            continue
        ghcid_field = doc.get('ghcid')
        cust_field = doc.get('custodian_name')
        hits.append({
            'filepath': path,
            'filename': os.path.basename(path),
            'matched_term': hit,
            'ghcid': ghcid_field.get('ghcid_current') if isinstance(ghcid_field, dict) else None,
            'custodian_name': cust_field.get('consensus_name') if isinstance(cust_field, dict) else str(doc.get('custodian_name', ''))[:50]
        })
    return hits
def main() -> dict:
    """Match custodians-to-profile against existing custodian YAML files.

    Reads TO_PROFILE_FILE, matches each custodian to YAML files in
    CUSTODIAN_DIR by filename glob patterns (derived from the custodian's
    slug and name words), prints a summary, and writes the results to
    OUTPUT_FILE.

    Returns:
        The results dict (matched / unmatched lists plus totals).

    NOTE(review): normalize_name() and search_yaml_files() defined above are
    never called here — matching is done purely on filenames, not on YAML
    content as the module docstring might suggest.
    """
    print("Loading custodians to profile...")
    with open(TO_PROFILE_FILE) as f:
        to_profile_data = json.load(f)
    # assumes the JSON has a top-level 'custodians' list of dicts with at
    # least 'name' and 'slug' keys — TODO confirm against the producer script
    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match")
    # Get list of YAML files (top level of CUSTODIAN_DIR only; glob is
    # non-recursive, so files in subdirectories are not considered).
    print("\nScanning custodian YAML files...")
    yaml_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} YAML files")
    # Load a sample of YAML files for quick matching
    # For efficiency, we'll search by filename patterns first
    print("\nMatching custodians to existing YAML files...")
    results = {
        'matched': [],
        'unmatched': [],
        'total_to_profile': len(custodians),
        'total_yaml_files': len(yaml_files)
    }
    # Create search patterns for each custodian
    for custodian in custodians:
        name = custodian['name']
        slug = custodian['slug']
        # Generate search patterns
        patterns = []
        # Pattern from slug (e.g., "rijksmuseum" -> "*rijksmuseum*")
        # NOTE(review): stripping '-'/'_' from the slug means a hyphenated
        # filename like "van-gogh.yaml" will NOT match "*vangogh*" — verify
        # the filename convention in CUSTODIAN_DIR.
        slug_clean = slug.replace('-', '').replace('_', '')
        patterns.append(f"*{slug_clean}*")
        # Pattern from name words (words of 4+ characters only)
        name_words = [w.lower() for w in re.split(r'\W+', name) if len(w) > 3]
        for word in name_words[:3]:  # First 3 significant words
            patterns.append(f"*{word}*")
        # Search for matching files
        matched_files = []
        for pattern in patterns:
            try:
                matches = list(CUSTODIAN_DIR.glob(pattern + ".yaml"))
                matched_files.extend([str(m) for m in matches])
            except Exception:
                # Best-effort: malformed glob patterns (e.g. stray brackets
                # from the name) are silently skipped.
                pass
        # Deduplicate (set() loses ordering; file order is not significant here)
        matched_files = list(set(matched_files))
        if matched_files:
            results['matched'].append({
                'custodian': custodian,
                'yaml_files': matched_files[:5],  # Limit to 5 matches
                'match_count': len(matched_files)
            })
        else:
            results['unmatched'].append(custodian)
    # Summary
    print(f"\n{'='*60}")
    print("MATCHING RESULTS")
    print(f"{'='*60}")
    print(f"Total custodians to profile: {len(custodians)}")
    print(f"Matched to existing YAML: {len(results['matched'])}")
    print(f"No match found: {len(results['unmatched'])}")
    # Show matched custodians (first 30 only)
    print(f"\n{'='*60}")
    print("MATCHED CUSTODIANS (have existing YAML files)")
    print(f"{'='*60}")
    for i, match in enumerate(results['matched'][:30], 1):
        c = match['custodian']
        files = [os.path.basename(f) for f in match['yaml_files'][:2]]
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")
    if len(results['matched']) > 30:
        print(f" ... and {len(results['matched']) - 30} more")
    # Show unmatched custodians (first 30 only)
    print(f"\n{'='*60}")
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print(f"{'='*60}")
    # assumes each custodian dict carries 'heritage_count' and an optional
    # nested 'location.city' — TODO confirm against the producer script
    for i, c in enumerate(results['unmatched'][:30], 1):
        print(f"{i:2}. {c['name'][:50]:<50} | {c['heritage_count']:>3} staff | {c.get('location', {}).get('city', 'Unknown')}")
    if len(results['unmatched']) > 30:
        print(f" ... and {len(results['unmatched']) - 30} more")
    # Save results
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {OUTPUT_FILE}")
    return results
# Standard script entry point.
if __name__ == "__main__":
    main()