#!/usr/bin/env python3
"""
Match unmatched LinkedIn custodians to existing NL-* files by name similarity.

Optimized version using pre-built name index for speed.

Pipeline (see ``main``):
    1. Index the names found in ``NL-*.yaml`` files that do not yet contain a
       ``linkedin_enrichment:`` section.
    2. Load the LinkedIn custodians listed in ``_unmatched.txt``.
    3. Match each LinkedIn name against the index (exact first, then fuzzy).
    4. Keep at most one match per NL file and write the result as JSON.
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone

try:
    import yaml
except ImportError:
    # Only the file loaders need PyYAML; keep the pure matching helpers
    # importable without it. The loaders and main() report its absence.
    yaml = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    fuzz = None
    RAPIDFUZZ_AVAILABLE = False
    print("Warning: rapidfuzz not available, using basic matching")


# Default project root; override on the command line with --base-dir.
DEFAULT_BASE_DIR = Path('/Users/kempersc/apps/glam')

# Prefixes/suffixes stripped by normalize_name(), applied once each, in order.
_REMOVE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'^stichting\s+',
        r'^vereniging\s+',
        r'^museum\s+',
        r'\s+museum$',
        r'^het\s+',
        r'^de\s+',
        r'\s+nederland$',
        r'\s+netherlands$',
    )
)
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _require_yaml():
    """Return the PyYAML module, raising ImportError if it is not installed."""
    if yaml is None:
        raise ImportError("PyYAML is required to read custodian YAML files")
    return yaml


def normalize_name(name: str) -> str:
    """Normalize institution name for comparison.

    Lowercases the name, strips the prefixes/suffixes in ``_REMOVE_PATTERNS``
    (each applied once, in order), removes punctuation and collapses runs of
    whitespace. A falsy ``name`` (``None`` or ``""``) yields ``""``.
    """
    if not name:
        return ""

    name = name.lower()
    for pattern in _REMOVE_PATTERNS:
        name = pattern.sub('', name)
    name = _PUNCTUATION_RE.sub('', name)
    return _WHITESPACE_RE.sub(' ', name).strip()


def _primary_name(data: dict):
    """Return ``data['name']`` or ``data['custodian_name']['emic_name']``.

    Returns ``None`` when neither is a non-empty string, including when
    ``custodian_name`` is present but null or not a mapping.
    """
    name = data.get('name')
    if not name:
        custodian = data.get('custodian_name')
        name = custodian.get('emic_name') if isinstance(custodian, dict) else None
    return name if isinstance(name, str) and name else None


def _alternative_names(data: dict) -> list:
    """Return the non-empty string entries of ``data['alternative_names']``."""
    alternatives = data.get('alternative_names')
    if not isinstance(alternatives, (list, tuple)):
        # Key missing, explicitly null, or not a sequence of names.
        return []
    return [alt for alt in alternatives if isinstance(alt, str) and alt]


def build_nl_name_index(custodian_dir: Path) -> tuple:
    """Build index of NL-* file names. Only includes non-enriched files.

    Args:
        custodian_dir: Directory globbed for ``NL-*.yaml`` files.

    Returns:
        ``(index, name_to_original)`` where ``index`` maps a normalized name
        to the path (as ``str``) of the file it came from and
        ``name_to_original`` maps the same key to the un-normalized name.
        A primary name always overwrites an existing entry; an alternative
        name is only added when its normalized form is not indexed yet.

    Raises:
        ImportError: If PyYAML is not installed.

    Files that cannot be read or parsed are skipped with a warning on stderr.
    """
    yaml_module = _require_yaml()
    index = {}  # normalized_name -> file_path
    name_to_original = {}  # normalized -> original name

    print(" Scanning NL-* files...")
    count = 0

    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            content = yaml_path.read_text(encoding='utf-8')
            # Skip already enriched (cheap substring test before parsing)
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml_module.safe_load(content)
        except (OSError, UnicodeDecodeError, yaml_module.YAMLError) as exc:
            print(f" Warning: skipping {yaml_path.name}: {type(exc).__name__}",
                  file=sys.stderr)
            continue

        if not data or not isinstance(data, dict):
            continue

        # Primary name
        name = _primary_name(data)
        if name:
            normalized = normalize_name(name)
            if normalized and len(normalized) > 2:
                index[normalized] = str(yaml_path)
                name_to_original[normalized] = name

        # Also index alternative names (never overriding an existing key)
        for alt in _alternative_names(data):
            normalized = normalize_name(alt)
            if normalized and len(normalized) > 2 and normalized not in index:
                index[normalized] = str(yaml_path)
                name_to_original[normalized] = alt

        count += 1
        if count % 200 == 0:
            print(f" Processed {count} files...")

    print(f" Indexed {len(index)} names from {count} NL-* files")
    return index, name_to_original


def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load unmatched LinkedIn custodians with their names.

    Args:
        unmatched_file: Text file with one LinkedIn slug per line; blank
            lines are ignored.
        linkedin_dir: Directory holding ``<slug>.yaml`` for each slug.

    Returns:
        A list of dicts with keys ``slug``, ``name``, ``normalized`` and
        ``file``, one per slug whose YAML file exists and has a non-empty
        string ``name``.

    Raises:
        ImportError: If PyYAML is not installed.
        OSError: If ``unmatched_file`` cannot be read.
    """
    yaml_module = _require_yaml()

    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    print(f" Loading {len(slugs)} unmatched LinkedIn slugs...")

    custodians = []
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml_module.safe_load(f)
        except FileNotFoundError:
            # A slug without a YAML file is expected and silently ignored.
            continue
        except (OSError, UnicodeDecodeError, yaml_module.YAMLError) as exc:
            print(f" Warning: skipping {yaml_path.name}: {type(exc).__name__}",
                  file=sys.stderr)
            continue

        name = data.get('name') if isinstance(data, dict) else None
        if isinstance(name, str) and name:
            custodians.append({
                'slug': slug,
                'name': name,
                'normalized': normalize_name(name),
                'file': str(yaml_path),
            })

    print(f" Loaded {len(custodians)} LinkedIn custodians with names")
    return custodians


def _best_fuzzy_match(linkedin_norm: str, nl_names: list) -> tuple:
    """Return ``(best_score, best_nl_name)`` for one normalized LinkedIn name.

    Uses ``rapidfuzz.fuzz.token_sort_ratio`` when rapidfuzz is importable,
    otherwise a substring-containment score of ``shorter / longer * 100``.
    On ties the first candidate in ``nl_names`` wins. Returns ``(0, None)``
    when no candidate scores above zero.
    """
    best_score = 0
    best_nl_name = None

    if RAPIDFUZZ_AVAILABLE:
        scorer = fuzz.token_sort_ratio
        for nl_name in nl_names:
            score = scorer(linkedin_norm, nl_name)
            if score > best_score:
                best_score = score
                best_nl_name = nl_name
    else:
        # Basic containment
        for nl_name in nl_names:
            if linkedin_norm in nl_name or nl_name in linkedin_norm:
                shorter = min(len(linkedin_norm), len(nl_name))
                longer = max(len(linkedin_norm), len(nl_name))
                score = int((shorter / longer) * 100) if longer > 0 else 0
                if score > best_score:
                    best_score = score
                    best_nl_name = nl_name

    return best_score, best_nl_name


def find_matches(linkedin_list: list, nl_index: dict, name_to_original: dict,
                 threshold: int = 85) -> list:
    """Find name matches using rapidfuzz (or basic containment without it).

    Args:
        linkedin_list: Entries as produced by ``load_linkedin_names``.
        nl_index: Normalized NL name -> file path.
        name_to_original: Normalized NL name -> original name.
        threshold: Minimum fuzzy score (0-100) for a match to be kept.

    Returns:
        Match dicts (``linkedin_slug``, ``linkedin_name``, ``linkedin_file``,
        ``nl_file``, ``nl_name``, ``score``, ``match_type``) sorted by
        descending score. Exact normalized-name hits get score ``100`` and
        ``match_type`` ``'exact'``; others are ``'fuzzy'``. Entries whose
        normalized name is shorter than three characters are ignored.
    """
    matches = []
    nl_names = list(nl_index.keys())
    total = len(linkedin_list)

    print(f" Matching {total} LinkedIn names against {len(nl_names)} NL names...")

    for position, linkedin in enumerate(linkedin_list, start=1):
        linkedin_norm = linkedin['normalized']

        if linkedin_norm and len(linkedin_norm) >= 3:
            if linkedin_norm in nl_index:
                # Quick exact match check first
                nl_key, score, match_type = linkedin_norm, 100, 'exact'
            else:
                score, nl_key = _best_fuzzy_match(linkedin_norm, nl_names)
                match_type = 'fuzzy'
                if score < threshold:
                    nl_key = None

            if nl_key is not None:
                matches.append({
                    'linkedin_slug': linkedin['slug'],
                    'linkedin_name': linkedin['name'],
                    'linkedin_file': linkedin['file'],
                    'nl_file': nl_index[nl_key],
                    'nl_name': name_to_original.get(nl_key, nl_key),
                    'score': score,
                    'match_type': match_type,
                })

        # Report progress for every entry, including skipped/exact ones.
        if position % 100 == 0:
            print(f" Processed {position}/{total}...")

    return sorted(matches, key=lambda x: -x['score'])


def format_match_line(match: dict) -> str:
    """Format one match for the "Top Matches" listing.

    The score is rendered with ``.0f`` because fuzzy scores are floats when
    rapidfuzz is used, while exact matches carry the int ``100``.
    """
    return (f" {match['score']:3.0f}% | '{match['linkedin_name'][:40]}'"
            f" → '{match['nl_name'][:40]}'")


def _dedupe_by_nl_file(matches: list) -> list:
    """Keep only the first match (in the given order) for each ``nl_file``."""
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    return unique_matches


def main():
    """Run the matching pipeline from the command line.

    Returns:
        ``0`` on success, ``1`` when PyYAML or the unmatched-slug file is
        missing.
    """
    parser = argparse.ArgumentParser(description='Match LinkedIn custodians by name similarity')
    parser.add_argument('--threshold', type=int, default=85,
                        help='Minimum similarity score (0-100)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json',
                        help='Output file for matches')
    parser.add_argument('--base-dir', type=Path, default=DEFAULT_BASE_DIR,
                        help='Project root that contains data/custodian')
    args = parser.parse_args()

    if yaml is None:
        print("Error: PyYAML is required (pip install pyyaml)", file=sys.stderr)
        return 1

    base_dir = args.base_dir
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    if not unmatched_file.is_file():
        print(f"Error: unmatched slug file not found: {unmatched_file}", file=sys.stderr)
        return 1

    print("=" * 60)
    print("LinkedIn Name Similarity Matching (Optimized)")
    print("=" * 60)

    # Build NL name index
    print("\n1. Building NL-* name index...")
    nl_index, name_to_original = build_nl_name_index(custodian_dir)

    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)

    # Find matches
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches(linkedin_list, nl_index, name_to_original, args.threshold)

    # Deduplicate - one NL file can only be matched once; matches are sorted
    # by descending score, so the best-scoring match per file is kept.
    unique_matches = _dedupe_by_nl_file(matches)

    print(f"\n Found {len(unique_matches)} unique matches")

    # Show top matches
    print("\n4. Top Matches:")
    for m in unique_matches[:15]:
        print(format_match_line(m))

    if len(unique_matches) > 15:
        print(f" ... and {len(unique_matches) - 15} more")

    # Save matches
    output_path = base_dir / args.output
    print(f"\n5. Saving matches to {output_path}...")

    output_data = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'threshold': args.threshold,
        'total_matches': len(unique_matches),
        'matches': unique_matches,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    # Summary
    exact_count = sum(1 for m in unique_matches if m['match_type'] == 'exact')
    fuzzy_count = sum(1 for m in unique_matches if m['match_type'] == 'fuzzy')

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f" - Exact matches: {exact_count}")
    print(f" - Fuzzy matches: {fuzzy_count}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")

    return 0


if __name__ == '__main__':
    sys.exit(main())