#!/usr/bin/env python3
"""
Fast name matching for LinkedIn to NL-* custodians.

Uses token-based indexing for O(1) candidate lookups instead of O(n^2)
pairwise comparison.
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict

# Common Dutch/English words that carry no identifying signal for matching.
# Built once at module level instead of on every tokenize() call.
STOPWORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
    'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
    'the', 'a', 'an', 'of', 'and', 'for', 'at',
    'nederland', 'netherlands', 'holland', 'dutch',
    'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
    'historisch', 'historical', 'history', 'historic',
})

# Words of 3+ ASCII letters; compiled once and reused in the hot loop.
# NOTE(review): accented letters are not matched, so a name like 'musée'
# is split/dropped by this pattern — confirm this is acceptable.
_TOKEN_RE = re.compile(r'\b[a-z]{3,}\b')


def tokenize(name: str) -> set:
    """Extract significant lowercase tokens from a name.

    Returns an empty set for empty/None input; stopwords are removed so
    only distinctive words remain for Jaccard comparison.
    """
    if not name:
        return set()
    return set(_TOKEN_RE.findall(name.lower())) - STOPWORDS


def build_token_index(custodian_dir: Path) -> tuple:
    """Build an inverted index over NL-*.yaml custodian files.

    Returns (token_index, all_entries):
      token_index: token -> list of (file, name, tokens) entries
      all_entries: flat list of the same (file, name, tokens) tuples

    Files already containing a 'linkedin_enrichment:' key are skipped,
    as are unreadable/unparsable files (reported to stderr, not fatal).
    """
    # yaml is only needed here; a function-scope import keeps the rest
    # of the module importable (and testable) without PyYAML installed.
    import yaml

    token_index = defaultdict(list)  # token -> [(file, name, tokens)]
    all_entries = []                 # list of (file, name, tokens)

    print(" Building token index from NL-* files...")
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            content = yaml_path.read_text(encoding='utf-8')
            # Skip files already enriched by a previous run
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml.safe_load(content)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Best-effort: report and continue rather than abort the run
            # (original code silently swallowed every exception here).
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)
            continue

        # YAML may legally parse to None, a list, or a scalar — only
        # mapping documents carry the fields we need.
        if not isinstance(data, dict):
            continue

        # Primary name: top-level 'name', else custodian_name.emic_name
        name = data.get('name')
        if not name:
            custodian_name = data.get('custodian_name')
            if isinstance(custodian_name, dict):
                name = custodian_name.get('emic_name', '')
        if not name:
            continue

        tokens = tokenize(name)
        if tokens:
            entry = (str(yaml_path), name, tokens)
            all_entries.append(entry)
            for token in tokens:
                token_index[token].append(entry)

    print(f" Indexed {len(all_entries)} NL-* files with {len(token_index)} unique tokens")
    return token_index, all_entries


def find_matches_fast(linkedin_list: list, token_index: dict, threshold: float = 0.5) -> list:
    """Find matches using token overlap (Jaccard similarity).

    For each LinkedIn custodian, candidate NL entries are those sharing
    at least one token (looked up via the inverted index); the single
    best candidate with Jaccard score >= threshold is kept. Returns
    match dicts sorted by descending score (0-100 integer, truncated).
    """
    matches = []
    print(f" Matching {len(linkedin_list)} LinkedIn custodians...")
    for linkedin in linkedin_list:
        linkedin_tokens = tokenize(linkedin['name'])
        if not linkedin_tokens:
            continue

        # Candidates: every NL entry sharing at least one token
        candidates = {}  # file -> (name, tokens)
        for token in linkedin_tokens:
            for nl_file, nl_name, nl_tokens in token_index.get(token, []):
                if nl_file not in candidates:
                    candidates[nl_file] = (nl_name, nl_tokens)
        if not candidates:
            continue

        # Score candidates by Jaccard similarity: |A ∩ B| / |A ∪ B|
        best_score = 0
        best_match = None
        for nl_file, (nl_name, nl_tokens) in candidates.items():
            union = len(linkedin_tokens | nl_tokens)
            score = len(linkedin_tokens & nl_tokens) / union if union > 0 else 0
            if score > best_score:
                best_score = score
                best_match = (nl_file, nl_name)

        if best_score >= threshold and best_match:
            matches.append({
                'linkedin_slug': linkedin['slug'],
                'linkedin_name': linkedin['name'],
                'linkedin_file': linkedin['file'],
                'nl_file': best_match[0],
                'nl_name': best_match[1],
                # int() truncates (0.666 -> 66) — kept for output stability
                'score': int(best_score * 100),
                'match_type': 'token_jaccard'
            })

    return sorted(matches, key=lambda x: -x['score'])


def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load unmatched LinkedIn custodians.

    unmatched_file lists one slug per line; each slug is resolved to
    <slug>.yaml inside linkedin_dir. Returns {'slug','name','file'}
    dicts; missing, unreadable, unparsable, or nameless files are
    skipped with a warning (original code used a bare 'except: pass').
    """
    import yaml

    custodians = []
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            data = yaml.safe_load(yaml_path.read_text(encoding='utf-8'))
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)
            continue
        if isinstance(data, dict) and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'file': str(yaml_path)
            })
    return custodians


def main():
    """CLI entry point: index NL-* files, match, report, save JSON."""
    import argparse
    parser = argparse.ArgumentParser(description='Fast LinkedIn name matching')
    parser.add_argument('--threshold', type=int, default=50,
                        help='Minimum Jaccard similarity (0-100, default 50)')
    parser.add_argument('--output', type=str,
                        default='data/custodian/linkedin/_name_matches.json')
    # Generalized: project root used to be hard-coded; same default path.
    parser.add_argument('--base-dir', type=str,
                        default='/Users/kempersc/apps/glam',
                        help='Project root containing data/custodian')
    args = parser.parse_args()

    base_dir = Path(args.base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    print("=" * 60)
    print("Fast LinkedIn Name Matching (Token-Based)")
    print("=" * 60)

    # Build token index
    print("\n1. Building token index...")
    token_index, _ = build_token_index(custodian_dir)

    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians")

    # Find matches (CLI threshold is a percentage; matcher wants 0..1)
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches_fast(linkedin_list, token_index, args.threshold / 100)

    # Deduplicate - one NL file can only be matched once; 'matches' is
    # sorted by descending score, so each NL file keeps its best match.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    print(f"\n Found {len(unique_matches)} unique matches")

    # Show matches grouped into score tiers (top 5 shown per tier)
    print("\n4. Matches by Score:")
    tiers = [(80, 101), (60, 80), (50, 60)]
    for low, high in tiers:
        tier_matches = [m for m in unique_matches if low <= m['score'] < high]
        if tier_matches:
            print(f"\n Score {low}-{high-1}%: {len(tier_matches)} matches")
            for m in tier_matches[:5]:
                print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' → '{m['nl_name'][:35]}'")
            if len(tier_matches) > 5:
                print(f" ... and {len(tier_matches) - 5} more")

    # Save matches (ensure the output directory exists first)
    output_path = base_dir / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n5. Saving {len(unique_matches)} matches to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': args.threshold,
            'total_matches': len(unique_matches),
            'matches': unique_matches
        }, f, indent=2, ensure_ascii=False)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
    return 0


if __name__ == '__main__':
    sys.exit(main())