#!/usr/bin/env python3
"""
Match custodians from LinkedIn staff parsing against existing custodian YAML files.

This script identifies which of the 132 custodians to profile already have
YAML files in data/custodian/ and which are missing.
"""
import json
import os
import re
from pathlib import Path

# Paths
# NOTE(review): hardcoded absolute path — consider deriving from __file__ or an
# environment variable so the script is portable across machines.
BASE_DIR = Path("/Users/kempersc/apps/glam")
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
TO_PROFILE_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodians_to_profile.json"
OUTPUT_FILE = CUSTODIAN_DIR / "person" / "affiliated" / "parsed" / "custodian_yaml_matches.json"

# Compiled once at module load: institutional stopwords, punctuation,
# and whitespace runs stripped during name normalization.
_STOPWORDS_RE = re.compile(r'\b(the|de|het|stichting|museum|archief|bibliotheek)\b')
_PUNCT_RE = re.compile(r'[^\w\s]')
_WS_RE = re.compile(r'\s+')


def normalize_name(name: str) -> str:
    """Normalize an institution name for matching.

    Lowercases the name, removes common Dutch/English institutional
    stopwords (articles and generic terms like "museum"/"archief"),
    strips punctuation, and collapses runs of whitespace.

    Args:
        name: raw institution name.

    Returns:
        The normalized, lowercase name with single spaces and no punctuation.
    """
    name = name.lower()
    name = _STOPWORDS_RE.sub('', name)
    name = _PUNCT_RE.sub('', name)
    return _WS_RE.sub(' ', name).strip()


def _searchable_text(data: dict) -> str:
    """Concatenate the name-bearing fields of one parsed YAML record, lowercased.

    Pulls names from `original_entry` (name / instelling), `custodian_name`
    (dict form: consensus_name / emic_name, or a plain string), the direct
    `name` field, and `legal_status.legal_name`, tolerating missing or
    differently-typed fields.
    """
    parts: list[str] = []

    oe = data.get('original_entry')
    if isinstance(oe, dict):
        parts.append(str(oe.get('name', '')))
        parts.append(str(oe.get('instelling', '')))

    cn = data.get('custodian_name')
    if isinstance(cn, dict):
        parts.append(str(cn.get('consensus_name', '')))
        parts.append(str(cn.get('emic_name', '')))
    elif isinstance(cn, str):
        parts.append(cn)

    if 'name' in data:
        parts.append(str(data['name']))

    ls = data.get('legal_status')
    if isinstance(ls, dict):
        parts.append(str(ls.get('legal_name', '')))

    # Combine and normalize
    return ' '.join(parts).lower()


def search_yaml_files(search_terms: list[str], yaml_files: dict[str, dict]) -> list[dict]:
    """Search pre-loaded YAML records for matching institution names.

    Args:
        search_terms: candidate names/substrings, matched case-insensitively.
        yaml_files: mapping of file path -> parsed YAML dict. Falsy values
            (e.g. empty/unparseable files) are skipped.

    Returns:
        One match record per file whose searchable text contains any term.
        Only the FIRST matching term per file is reported (search stops at
        the first hit).
    """
    matches = []
    for filepath, data in yaml_files.items():
        if not data:
            continue

        searchable_text = _searchable_text(data)

        # Check each search term; stop at the first one that matches this file.
        for term in search_terms:
            if term.lower() not in searchable_text:
                continue
            ghcid = data.get('ghcid')
            cn = data.get('custodian_name')
            matches.append({
                'filepath': filepath,
                'filename': os.path.basename(filepath),
                'matched_term': term,
                'ghcid': ghcid.get('ghcid_current') if isinstance(ghcid, dict) else None,
                # Truncate non-dict names to 50 chars to keep the report compact.
                'custodian_name': (cn.get('consensus_name') if isinstance(cn, dict)
                                   else str(data.get('custodian_name', ''))[:50]),
            })
            break
    return matches


def main():
    """Match custodians-to-profile against existing YAML files and save a JSON report.

    Reads TO_PROFILE_FILE, matches each custodian to YAML files in
    CUSTODIAN_DIR by filename glob patterns, prints a summary, and writes
    the matched/unmatched breakdown to OUTPUT_FILE.
    """
    print("Loading custodians to profile...")
    with open(TO_PROFILE_FILE, encoding="utf-8") as f:
        to_profile_data = json.load(f)

    custodians = to_profile_data['custodians']
    print(f"Found {len(custodians)} custodians to match")

    # Get list of YAML files
    print("\nScanning custodian YAML files...")
    yaml_files = list(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} YAML files")

    # For efficiency, match by filename patterns rather than loading every YAML.
    print("\nMatching custodians to existing YAML files...")
    results = {
        'matched': [],
        'unmatched': [],
        'total_to_profile': len(custodians),
        'total_yaml_files': len(yaml_files),
    }

    for custodian in custodians:
        name = custodian['name']
        slug = custodian['slug']

        # Build filename glob patterns: one from the slug, plus up to three
        # from the longer (>3 chars) words of the display name.
        # NOTE(review): stripping '-'/'_' from the slug assumes YAML filenames
        # are written without separators — verify against actual filenames.
        slug_clean = slug.replace('-', '').replace('_', '')
        patterns = [f"*{slug_clean}*"]
        significant_words = [w.lower() for w in re.split(r'\W+', name) if len(w) > 3]
        patterns.extend(f"*{word}*" for word in significant_words[:3])

        # Search for matching files; a set deduplicates across patterns.
        matched_set: set[str] = set()
        for pattern in patterns:
            try:
                matched_set.update(str(m) for m in CUSTODIAN_DIR.glob(pattern + ".yaml"))
            except (OSError, ValueError):
                # A malformed pattern or a filesystem error on one pattern
                # should not abort the whole run.
                continue

        # Sort for deterministic report output (set iteration order is arbitrary).
        matched_files = sorted(matched_set)

        if matched_files:
            results['matched'].append({
                'custodian': custodian,
                'yaml_files': matched_files[:5],  # Limit to 5 matches
                'match_count': len(matched_files),
            })
        else:
            results['unmatched'].append(custodian)

    # Summary
    print(f"\n{'='*60}")
    print("MATCHING RESULTS")
    print(f"{'='*60}")
    print(f"Total custodians to profile: {len(custodians)}")
    print(f"Matched to existing YAML: {len(results['matched'])}")
    print(f"No match found: {len(results['unmatched'])}")

    # Show matched custodians
    print(f"\n{'='*60}")
    print("MATCHED CUSTODIANS (have existing YAML files)")
    print(f"{'='*60}")
    for i, match in enumerate(results['matched'][:30], 1):
        c = match['custodian']
        files = [os.path.basename(f) for f in match['yaml_files'][:2]]
        print(f"{i:2}. {c['name'][:40]:<40} -> {', '.join(files)}")
    if len(results['matched']) > 30:
        print(f"    ... and {len(results['matched']) - 30} more")

    # Show unmatched custodians
    print(f"\n{'='*60}")
    print("UNMATCHED CUSTODIANS (need new YAML profiles)")
    print(f"{'='*60}")
    for i, c in enumerate(results['unmatched'][:30], 1):
        print(f"{i:2}. {c['name'][:50]:<50} | {c['heritage_count']:>3} staff | {c.get('location', {}).get('city', 'Unknown')}")
    if len(results['unmatched']) > 30:
        print(f"    ... and {len(results['unmatched']) - 30} more")

    # Save results
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {OUTPUT_FILE}")

    return results


if __name__ == "__main__":
    main()