189 lines
6.5 KiB
Python
189 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Match custodians from LinkedIn staff parsing against existing custodian YAML files.
|
|
|
|
This script identifies which of the 132 custodians to profile already have
|
|
YAML files in data/custodian/ and which are missing.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
# Filesystem layout. Everything hangs off BASE_DIR; the JSON input (custodians
# to profile) and the JSON output (match results) sit side by side in the same
# "parsed" directory below the custodian data directory.
BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR.joinpath("data", "custodian")
_PARSED_DIR = CUSTODIAN_DIR.joinpath("person", "affiliated", "parsed")
TO_PROFILE_FILE = _PARSED_DIR / "custodians_to_profile.json"
OUTPUT_FILE = _PARSED_DIR / "custodian_yaml_matches.json"
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a canonical, comparison-friendly form of an institution name.

    The name is lowercased, a fixed set of generic words ('the', 'de', 'het',
    'stichting', 'museum', 'archief', 'bibliotheek') is removed wherever they
    occur as whole words, punctuation is dropped, and runs of whitespace are
    collapsed to a single space with the ends trimmed.
    """
    # Applied in order: generic words go first while word boundaries are still
    # intact, then punctuation, then whitespace runs left behind by both.
    substitutions = (
        (r'\b(the|de|het|stichting|museum|archief|bibliotheek)\b', ''),
        (r'[^\w\s]', ''),
        (r'\s+', ' '),
    )
    normalized = name.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()
|
|
|
|
|
|
def _collect_searchable_names(data: dict) -> list[str]:
    """Collect the name-like field values of one parsed custodian record."""

    def as_text(value) -> str:
        # A null YAML value must not become the literal text "None", which
        # would make terms such as "one" or "non" match spuriously.
        return '' if value is None else str(value)

    parts = []

    # Name fields nested under the original entry.
    original_entry = data.get('original_entry')
    if isinstance(original_entry, dict):
        parts.append(as_text(original_entry.get('name')))
        parts.append(as_text(original_entry.get('instelling')))

    # Custodian name: either a mapping with several name variants or a string.
    custodian_name = data.get('custodian_name')
    if isinstance(custodian_name, dict):
        parts.append(as_text(custodian_name.get('consensus_name')))
        parts.append(as_text(custodian_name.get('emic_name')))
    elif isinstance(custodian_name, str):
        parts.append(custodian_name)

    # Top-level name field.
    if 'name' in data:
        parts.append(as_text(data['name']))

    # Legal name nested under the legal status.
    legal_status = data.get('legal_status')
    if isinstance(legal_status, dict):
        parts.append(as_text(legal_status.get('legal_name')))

    return parts


def search_yaml_files(search_terms: list[str], yaml_files: dict[str, dict]) -> list[dict]:
    """Search parsed YAML records for institution names containing a term.

    Matching is a case-insensitive substring test against the record's name
    fields (original entry name/instelling, custodian name variants, top-level
    name, legal name).

    Args:
        search_terms: Terms to look for, tried in order. Empty or
            whitespace-only terms are ignored because they would match every
            record.
        yaml_files: Mapping of file path to the parsed YAML document. Entries
            that are empty or not a mapping are skipped.

    Returns:
        One dict per matching file, in the iteration order of ``yaml_files``,
        with keys ``filepath``, ``filename``, ``matched_term`` (the first term
        that matched, as given), ``ghcid`` and ``custodian_name``.
    """
    # Lowercase each term once instead of once per file.
    terms = [(term, term.lower()) for term in search_terms if term.strip()]

    matches = []
    for filepath, data in yaml_files.items():
        # Skip empty documents and documents whose top level is not a mapping
        # (e.g. a YAML list), which cannot carry the fields read below.
        if not data or not isinstance(data, dict):
            continue

        searchable_text = ' '.join(_collect_searchable_names(data)).lower()

        for term, needle in terms:
            if needle not in searchable_text:
                continue

            ghcid = data.get('ghcid')
            custodian_name = data.get('custodian_name', '')
            matches.append({
                'filepath': filepath,
                'filename': os.path.basename(filepath),
                'matched_term': term,
                'ghcid': ghcid.get('ghcid_current') if isinstance(ghcid, dict) else None,
                # Mapping -> its consensus name; anything else -> first 50
                # characters of its string form.
                'custodian_name': (
                    custodian_name.get('consensus_name')
                    if isinstance(custodian_name, dict)
                    else str(custodian_name)[:50]
                ),
            })
            # Report each file at most once, for the first matching term.
            break

    return matches
|
|
|
|
|
|
def _filename_tokens(custodian: dict) -> list[str]:
    """Return the substrings to look for in YAML file names for one custodian."""
    # Slug with separators stripped, e.g. "van-gogh-museum" -> "vangoghmuseum".
    tokens = [custodian['slug'].replace('-', '').replace('_', '')]
    # First three words of the name that are longer than three characters.
    significant_words = [
        word.lower() for word in re.split(r'\W+', custodian['name']) if len(word) > 3
    ]
    tokens.extend(significant_words[:3])
    # An empty token is a substring of every file name; drop it so a blank
    # slug cannot match the whole directory.
    return [token for token in tokens if token]


def main() -> dict:
    """Match the custodians to profile against existing custodian YAML files.

    Reads the custodian list from ``TO_PROFILE_FILE``, lists the ``*.yaml``
    files directly inside ``CUSTODIAN_DIR``, and treats a custodian as matched
    when a file name (without the ``.yaml`` suffix) contains its
    separator-free slug or one of the first three name words longer than three
    characters. Prints a summary and writes the results to ``OUTPUT_FILE``.

    Returns:
        The results dict written to ``OUTPUT_FILE``, with keys ``matched``,
        ``unmatched``, ``total_to_profile`` and ``total_yaml_files``.

    Raises:
        KeyError: If the input JSON has no ``custodians`` key or a custodian
            lacks ``name`` or ``slug``.
    """
    print("Loading custodians to profile...")
    with open(TO_PROFILE_FILE, encoding='utf-8') as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match")

    # List the directory once, sorted so that the saved file lists are
    # reproducible between runs.
    print("\nScanning custodian YAML files...")
    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} YAML files")

    print("\nMatching custodians to existing YAML files...")

    results = {
        'matched': [],
        'unmatched': [],
        'total_to_profile': len(custodians),
        'total_yaml_files': len(yaml_files),
    }

    # Matching is done in memory against the listing above rather than by
    # re-globbing the directory for every token of every custodian.
    # NOTE(review): the comparison is case-sensitive and name words are
    # lowercased, so upper-case file names can only match via the slug --
    # confirm this is intended.
    indexed_files = [(path.stem, str(path)) for path in yaml_files]

    for custodian in custodians:
        tokens = _filename_tokens(custodian)
        matched_files = [
            path_text
            for stem, path_text in indexed_files
            if any(token in stem for token in tokens)
        ]

        if matched_files:
            results['matched'].append({
                'custodian': custodian,
                'yaml_files': matched_files[:5],  # Limit to 5 matches
                'match_count': len(matched_files),
            })
        else:
            results['unmatched'].append(custodian)

    # Summary
    print(f"\n{'='*60}")
    print("MATCHING RESULTS")
    print(f"{'='*60}")
    print(f"Total custodians to profile: {len(custodians)}")
    print(f"Matched to existing YAML: {len(results['matched'])}")
    print(f"No match found: {len(results['unmatched'])}")

    # Show matched custodians (first 30, two file names each)
    print(f"\n{'='*60}")
    print("MATCHED CUSTODIANS (have existing YAML files)")
    print(f"{'='*60}")
    for i, match in enumerate(results['matched'][:30], 1):
        c = match['custodian']
        files = [os.path.basename(f) for f in match['yaml_files'][:2]]
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")

    if len(results['matched']) > 30:
        print(f"    ... and {len(results['matched']) - 30} more")

    # Show unmatched custodians (first 30)
    print(f"\n{'='*60}")
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print(f"{'='*60}")
    for i, c in enumerate(results['unmatched'][:30], 1):
        # Tolerate a missing staff count and a missing/null location so the
        # report does not abort before the results are saved.
        staff = c.get('heritage_count', 0)
        city = (c.get('location') or {}).get('city', 'Unknown')
        print(f"{i:2}. {c['name'][:50]:<50} | {staff:>3} staff | {city}")

    if len(results['unmatched']) > 30:
        print(f"    ... and {len(results['unmatched']) - 30} more")

    # Save results
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {OUTPUT_FILE}")

    return results
|
|
|
|
|
|
# Run the matching pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|