233 lines
7.8 KiB
Python
233 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Fast name matching for LinkedIn to NL-* custodians.
|
||
|
||
Uses token-based indexing for O(1) lookups instead of O(n²) pairwise comparison.
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
from collections import defaultdict
|
||
|
||
import yaml
|
||
|
||
|
||
def tokenize(name: str) -> set:
    """Return the set of significant lowercase tokens in *name*.

    A token is a run of three or more ASCII letters in the lowercased
    name; a fixed list of common Dutch/English stopwords is removed so
    that only discriminating words remain.
    """
    if not name:
        return set()

    # Words too generic to distinguish one institution from another.
    stopwords = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
        'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
        'the', 'a', 'an', 'of', 'and', 'for', 'at', 'in',
        'nederland', 'netherlands', 'holland', 'dutch',
        'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
        'historisch', 'historical', 'history', 'historic',
    }

    # Lowercase, pull out 3+ letter words, then drop the stopwords.
    words = re.findall(r'\b[a-z]{3,}\b', name.lower())
    return {word for word in words if word not in stopwords}
|
||
|
||
|
||
def build_token_index(custodian_dir: Path) -> tuple:
    """Build an inverted token index over NL-* custodian YAML files.

    Scans ``custodian_dir`` for files matching ``NL-*.yaml``, skipping any
    that already contain a ``linkedin_enrichment:`` section, extracts each
    file's primary name and indexes its tokens.

    Args:
        custodian_dir: Directory containing the NL-* YAML files.

    Returns:
        ``(token_index, all_entries)`` where ``token_index`` maps
        token -> list of ``(file, name, tokens)`` and ``all_entries`` is
        the flat list of every indexed ``(file, name, tokens)`` entry.
    """
    token_index = defaultdict(list)  # token -> [(file, name, tokens)]
    all_entries = []  # list of (file, name, tokens)

    print(" Building token index from NL-* files...")

    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Skip already enriched
            if 'linkedin_enrichment:' in content:
                continue

            data = yaml.safe_load(content)
            if not data:
                continue

            # Get primary name (flat 'name' key wins over nested emic name)
            name = data.get('name') or data.get('custodian_name', {}).get('emic_name', '')
            if not name:
                continue

            tokens = tokenize(name)
            if tokens:
                entry = (str(yaml_path), name, tokens)
                all_entries.append(entry)
                for token in tokens:
                    token_index[token].append(entry)

        except Exception as exc:
            # Best-effort: one malformed file must not abort indexing,
            # but surface the problem instead of silently swallowing it.
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)

    print(f" Indexed {len(all_entries)} NL-* files with {len(token_index)} unique tokens")
    return token_index, all_entries
|
||
|
||
|
||
def find_matches_fast(linkedin_list: list, token_index: dict, threshold: float = 0.5) -> list:
    """Match LinkedIn custodians to NL-* entries by token overlap.

    For every LinkedIn custodian, candidate NL-* entries sharing at least
    one token are pulled from the inverted index and scored with Jaccard
    similarity; the single best candidate at or above *threshold* is kept.

    Returns the match dicts sorted by descending score (0-100 integers).
    """
    matches = []

    print(f" Matching {len(linkedin_list)} LinkedIn custodians...")

    for entry in linkedin_list:
        li_tokens = tokenize(entry['name'])
        if not li_tokens:
            continue

        # Collect unique candidate files that share at least one token.
        candidates = {}  # file -> (name, tokens)
        for tok in li_tokens:
            for cand_file, cand_name, cand_tokens in token_index.get(tok, []):
                candidates.setdefault(cand_file, (cand_name, cand_tokens))

        if not candidates:
            continue

        # Keep the candidate with the highest Jaccard similarity
        # (|A ∩ B| / |A ∪ B|).
        top_score, top = 0, None
        for cand_file, (cand_name, cand_tokens) in candidates.items():
            union_size = len(li_tokens | cand_tokens)
            score = len(li_tokens & cand_tokens) / union_size if union_size > 0 else 0
            if score > top_score:
                top_score, top = score, (cand_file, cand_name)

        if top_score >= threshold and top:
            matches.append({
                'linkedin_slug': entry['slug'],
                'linkedin_name': entry['name'],
                'linkedin_file': entry['file'],
                'nl_file': top[0],
                'nl_name': top[1],
                'score': int(top_score * 100),
                'match_type': 'token_jaccard'
            })

    return sorted(matches, key=lambda m: -m['score'])
|
||
|
||
|
||
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load the unmatched LinkedIn custodians listed in *unmatched_file*.

    Each non-empty line of *unmatched_file* is a slug; the corresponding
    ``<slug>.yaml`` in *linkedin_dir* is read and its ``name`` extracted.
    Missing or unparseable files are skipped (with a warning).

    Returns:
        List of dicts with keys ``slug``, ``name`` and ``file``.
    """
    custodians = []

    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            # Best-effort: skip unreadable/malformed files, but say so
            # instead of the previous bare `except: pass`.
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)
            continue
        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'file': str(yaml_path)
            })

    return custodians
|
||
|
||
|
||
def main():
    """Run the token-based LinkedIn -> NL-* matching pipeline.

    Builds the token index, loads the unmatched LinkedIn custodians,
    scores matches, deduplicates them (each NL-* file may be matched at
    most once — the highest-scoring match wins), prints a tiered report
    and writes the matches as JSON.

    Returns:
        Process exit code (0 on success).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fast LinkedIn name matching')
    parser.add_argument('--threshold', type=int, default=50,
                        help='Minimum Jaccard similarity (0-100, default 50)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json')
    # Previously hard-coded; overridable so the script also runs outside
    # the original author's machine. Default preserves old behavior.
    parser.add_argument('--base-dir', type=str, default='/Users/kempersc/apps/glam',
                        help='Project base directory')
    args = parser.parse_args()

    base_dir = Path(args.base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    print("=" * 60)
    print("Fast LinkedIn Name Matching (Token-Based)")
    print("=" * 60)

    # Build token index
    print("\n1. Building token index...")
    token_index, _ = build_token_index(custodian_dir)

    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians")

    # Find matches (threshold is given on the CLI as a percentage)
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches_fast(linkedin_list, token_index, args.threshold / 100)

    # Deduplicate - one NL file can only be matched once. `matches` is
    # already sorted by descending score, so the best match is kept.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])

    print(f"\n Found {len(unique_matches)} unique matches")

    # Show matches by score
    print("\n4. Matches by Score:")
    tiers = [(80, 101), (60, 80), (50, 60)]
    for low, high in tiers:
        tier_matches = [m for m in unique_matches if low <= m['score'] < high]
        if tier_matches:
            print(f"\n Score {low}-{high-1}%: {len(tier_matches)} matches")
            for m in tier_matches[:5]:
                print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' → '{m['nl_name'][:35]}'")
            if len(tier_matches) > 5:
                print(f" ... and {len(tier_matches) - 5} more")

    # Save matches (create the output directory if it does not exist yet)
    output_path = base_dir / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n5. Saving {len(unique_matches)} matches to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': args.threshold,
            'total_matches': len(unique_matches),
            'matches': unique_matches
        }, f, indent=2)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")

    return 0
|
||
|
||
|
||
# Script entry point: the process exit code is whatever main() returns.
if __name__ == '__main__':
    sys.exit(main())
|