#!/usr/bin/env python3
"""
Ultra-fast name matching using regex extraction (no YAML parsing).

Custodian names are pulled out of YAML files with regular expressions,
reduced to sets of significant tokens, and paired by Jaccard similarity of
those token sets.
"""

import json
import re
import sys
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict

# Default project root; ``main`` accepts an override.
DEFAULT_BASE_DIR = Path('/Users/kempersc/apps/glam')

# Minimum Jaccard similarity (in percent) for a pair to be reported.
MATCH_THRESHOLD_PERCENT = 50

# Words too common in organisation names to be useful for matching.
STOPWORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
    'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
    'the', 'a', 'an', 'of', 'and', 'for', 'at',
    'nederland', 'netherlands', 'holland', 'dutch',
    'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
})

# A scalar value that sits on the SAME line as its key: double-quoted,
# single-quoted ('' is an escaped apostrophe), or plain text, optionally
# followed by an inline "# comment".  Only spaces/tabs are skipped after the
# key so the match can never spill onto the following line.
_SCALAR_VALUE = (
    r'[ \t]*'
    r'(?:"(?P<dq>[^"\n]*)"'
    r"|'(?P<sq>(?:[^'\n]|'')*)'"
    r'|(?P<plain>[^\s#][^\n]*?))'
    r'(?:[ \t]+#[^\n]*)?[ \t]*$'
)

# Fields tried in priority order: top-level "name:", then "organisatie:"
# (used by most NL-* files), then a nested "emic_name:".
_NAME_FIELD_PATTERNS = tuple(
    re.compile(key + _SCALAR_VALUE, re.MULTILINE)
    for key in (r'^name:', r'^[ \t]*organisatie:', r'emic_name:')
)

# Plain scalars that carry no text: block-scalar indicators and nulls.
_YAML_NON_VALUE = re.compile(r'(?:[|>][-+0-9]*|~|null)', re.IGNORECASE)

# Runs of three or more letters (any script), delimited by word boundaries.
_TOKEN_PATTERN = re.compile(r'\b[^\W\d_]{3,}\b')


def _scalar_text(match) -> str:
    """Return the cleaned text of a ``_SCALAR_VALUE`` match ("" if none)."""
    double_quoted, single_quoted, plain = match.group('dq', 'sq', 'plain')
    if double_quoted is not None:
        return double_quoted.strip()
    if single_quoted is not None:
        return single_quoted.replace("''", "'").strip()
    if _YAML_NON_VALUE.fullmatch(plain):
        return ""
    if plain[0] in '"\'':
        # Unbalanced quoting: drop the stray quote characters.
        plain = plain.strip(plain[0])
    return plain.strip()


def extract_name_regex(content: str) -> str:
    """Extract name from YAML content using regex (much faster than parsing).

    Fields are tried in order: top-level ``name:``, ``organisatie:``, then
    ``emic_name:``.  The first non-empty value found is returned with
    surrounding quotes and any inline ``# comment`` removed; values may
    contain apostrophes (e.g. ``"Museum 't Hof"``).

    Args:
        content: Raw text of a YAML file.

    Returns:
        The extracted name, or "" when no field holds a usable value.
    """
    for pattern in _NAME_FIELD_PATTERNS:
        for match in pattern.finditer(content):
            value = _scalar_text(match)
            if value:
                return value
    return ""


def tokenize(name: str) -> set:
    """Extract significant tokens from a name.

    The name is case-folded and stripped of diacritics (so "Zürich" and
    "Zurich" yield the same token), split into words of at least three
    letters, and filtered against ``STOPWORDS``.

    Args:
        name: Organisation name; may be empty.

    Returns:
        Set of lowercase tokens (empty for an empty name).
    """
    if not name:
        return set()
    decomposed = unicodedata.normalize('NFKD', name)
    folded = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).casefold()
    return set(_TOKEN_PATTERN.findall(folded)) - STOPWORDS


def _read_text(path: Path) -> str:
    """Read *path* as UTF-8 regardless of the platform default encoding."""
    return path.read_text(encoding='utf-8', errors='replace')


def _build_token_index(custodian_dir: Path) -> tuple:
    """Index NL-* files by name token.

    Returns ``(token_index, nl_records, skip_count)`` where ``token_index``
    maps token -> [nl_file], ``nl_records`` maps nl_file -> (name, tokens)
    and ``skip_count`` is the number of already-enriched files skipped.
    """
    token_index = defaultdict(list)
    nl_records = {}
    skip_count = 0
    # Sorted so the result does not depend on directory listing order.
    for yaml_path in sorted(custodian_dir.glob("NL-*.yaml")):
        content = _read_text(yaml_path)
        # Skip already enriched
        if 'linkedin_enrichment:' in content:
            skip_count += 1
            continue
        name = extract_name_regex(content)
        if not name:
            continue
        nl_file = str(yaml_path)
        tokens = tokenize(name)
        nl_records[nl_file] = (name, tokens)
        for token in tokens:
            token_index[token].append(nl_file)
    return token_index, nl_records, skip_count


def _load_linkedin_custodians(linkedin_dir: Path, unmatched_file: Path) -> list:
    """Load name and tokens for every slug listed in *unmatched_file*.

    Slugs without a YAML file, or whose file yields no name, are omitted.
    Raises ``OSError`` if *unmatched_file* cannot be read.
    """
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    linkedin_list = []
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        name = extract_name_regex(_read_text(yaml_path))
        if name:
            linkedin_list.append({
                'slug': slug,
                'name': name,
                'file': str(yaml_path),
                'tokens': tokenize(name),
            })
    return linkedin_list


def _best_match(tokens: set, token_index: dict, nl_records: dict):
    """Return ``(nl_file, nl_name, percent)`` of the best candidate, or None.

    Candidates are NL files sharing at least one token; they are scored by
    Jaccard similarity.  Ties go to the lexicographically smallest path so
    the outcome is reproducible between runs.
    """
    candidate_files = {
        nl_file
        for token in tokens
        for nl_file in token_index.get(token, ())
    }
    best = None
    best_ratio = 0.0
    for nl_file in sorted(candidate_files):
        nl_name, nl_tokens = nl_records[nl_file]
        shared = len(tokens & nl_tokens)
        total = len(tokens | nl_tokens)
        ratio = shared / total if total else 0.0
        if ratio > best_ratio:
            best_ratio = ratio
            # Integer arithmetic avoids float truncation of the percentage.
            best = (nl_file, nl_name, 100 * shared // total)
    return best


def _find_matches(linkedin_list: list, token_index: dict, nl_records: dict) -> list:
    """Pair each LinkedIn custodian with its best NL file above threshold."""
    matches = []
    for linkedin in linkedin_list:
        if not linkedin['tokens']:
            continue
        best = _best_match(linkedin['tokens'], token_index, nl_records)
        if best is None or best[2] < MATCH_THRESHOLD_PERCENT:
            continue
        nl_file, nl_name, percent = best
        matches.append({
            'linkedin_slug': linkedin['slug'],
            'linkedin_name': linkedin['name'],
            'linkedin_file': linkedin['file'],
            'nl_file': nl_file,
            'nl_name': nl_name,
            'score': percent,
        })
    return matches


def _deduplicate(matches: list) -> list:
    """Keep only the highest-scoring match per NL file.

    A LinkedIn entry whose best NL file is claimed by a higher-scoring
    entry is dropped; no second-best candidate is tried.
    """
    seen_nl = set()
    unique_matches = []
    # sorted() is stable, so equal scores keep their input order.
    for match in sorted(matches, key=lambda m: -m['score']):
        if match['nl_file'] not in seen_nl:
            seen_nl.add(match['nl_file'])
            unique_matches.append(match)
    return unique_matches


def _print_score_tiers(unique_matches: list) -> None:
    """Print up to five sample matches for each score tier."""
    for low, high in [(80, 101), (60, 80), (50, 60)]:
        tier = [m for m in unique_matches if low <= m['score'] < high]
        if not tier:
            continue
        print(f"\n {low}-{high-1}%: {len(tier)} matches")
        for m in tier[:5]:
            print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' → '{m['nl_name'][:35]}'")
        if len(tier) > 5:
            print(f" ... and {len(tier) - 5} more")


def main(base_dir: Path = DEFAULT_BASE_DIR) -> int:
    """Match unmatched LinkedIn custodians to NL-* files by name.

    Reads ``data/custodian/NL-*.yaml`` and the slugs listed in
    ``data/custodian/linkedin/_unmatched.txt`` below *base_dir*, then writes
    the matches to ``data/custodian/linkedin/_name_matches.json``.

    Args:
        base_dir: Project root; defaults to ``DEFAULT_BASE_DIR``.

    Returns:
        0 on success, 1 if the unmatched-slug list cannot be read.
    """
    base_dir = Path(base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    print("=" * 60)
    print("Ultra-Fast Name Matching (Regex-Based)")
    print("=" * 60)

    # Build token index from NL-* files
    print("\n1. Building token index from NL-* files...")
    token_index, nl_records, skip_count = _build_token_index(custodian_dir)
    print(f" Indexed {len(nl_records)} NL-* files ({skip_count} already enriched)")
    print(f" Token vocabulary: {len(token_index)} unique tokens")

    # Load unmatched LinkedIn slugs
    print("\n2. Loading unmatched LinkedIn custodians...")
    try:
        linkedin_list = _load_linkedin_custodians(linkedin_dir, unmatched_file)
    except OSError as exc:
        print(f"ERROR: cannot read {unmatched_file}: {exc}", file=sys.stderr)
        return 1
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians with names")

    # Find matches
    print("\n3. Finding matches...")
    matches = _find_matches(linkedin_list, token_index, nl_records)

    # Deduplicate - one NL file can only be matched once (keep best score)
    unique_matches = _deduplicate(matches)
    print(f" Found {len(unique_matches)} unique matches")

    # Show matches by score tier
    print("\n4. Matches by Score Tier:")
    _print_score_tiers(unique_matches)

    # Save matches
    output_path = base_dir / 'data/custodian/linkedin/_name_matches.json'
    print(f"\n5. Saving to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': MATCH_THRESHOLD_PERCENT,
            'total_matches': len(unique_matches),
            'matches': unique_matches,
        }, f, indent=2, ensure_ascii=False)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")

    return 0


if __name__ == '__main__':
    sys.exit(main())