glam/scripts/match_linkedin_by_name_fast.py
2025-12-16 20:27:39 +01:00

233 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fast name matching for LinkedIn to NL-* custodians.
Uses token-based indexing for O(1) lookups instead of O(n²) pairwise comparison.
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
import yaml
def tokenize(name: str) -> set:
    """Return the set of significant lowercase tokens in *name*.

    Tokens are runs of three or more ASCII letters; common Dutch/English
    filler words and generic GLAM-sector terms are removed so that only
    discriminating words remain for matching.
    """
    if not name:
        return set()
    lowered = name.lower()
    # Words that carry no discriminating power when comparing institution
    # names: articles, prepositions, sector terms, and geography.
    stopwords = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
        'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
        'the', 'a', 'an', 'of', 'and', 'for', 'at', 'in',
        'nederland', 'netherlands', 'holland', 'dutch',
        'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
        'historisch', 'historical', 'history', 'historic',
    }
    words = re.findall(r'\b[a-z]{3,}\b', lowered)
    return {word for word in words if word not in stopwords}
def build_token_index(custodian_dir: Path) -> tuple:
    """Build an inverted token index over NL-* custodian YAML files.

    Args:
        custodian_dir: Directory containing ``NL-*.yaml`` files.

    Returns:
        ``(token_index, all_entries)`` where ``token_index`` maps
        token -> list of ``(file, name, tokens)`` tuples and
        ``all_entries`` is the flat list of those tuples.
    """
    token_index = defaultdict(list)  # token -> [(file, name, tokens)]
    all_entries = []  # list of (file, name, tokens)
    print(" Building token index from NL-* files...")
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            # Explicit UTF-8: institution names contain accented characters
            # and the platform default encoding is not guaranteed.
            with open(yaml_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Skip files already enriched by a previous run.
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml.safe_load(content)
            if not data:
                continue
            # Prefer top-level 'name', fall back to the nested emic name.
            name = data.get('name') or data.get('custodian_name', {}).get('emic_name', '')
            if not name:
                continue
            tokens = tokenize(name)
            if tokens:
                entry = (str(yaml_path), name, tokens)
                all_entries.append(entry)
                for token in tokens:
                    token_index[token].append(entry)
        except Exception as exc:
            # Best-effort: one malformed file must not abort indexing, but
            # report it instead of swallowing the error silently.
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)
    print(f" Indexed {len(all_entries)} NL-* files with {len(token_index)} unique tokens")
    return token_index, all_entries
def find_matches_fast(linkedin_list: list, token_index: dict, threshold: float = 0.5) -> list:
    """Match LinkedIn custodians to NL-* entries via token overlap.

    Candidates are gathered through the inverted index (any shared token)
    and scored with Jaccard similarity; for each LinkedIn entry only the
    single best-scoring candidate at or above *threshold* is kept.

    Returns a list of match dicts sorted by descending score (0-100).
    """
    matches = []
    print(f" Matching {len(linkedin_list)} LinkedIn custodians...")
    for entry in linkedin_list:
        entry_tokens = tokenize(entry['name'])
        if not entry_tokens:
            continue
        # Gather NL candidates sharing at least one token with this entry.
        candidates = {}  # nl_file -> (nl_name, nl_tokens)
        for tok in entry_tokens:
            for nl_file, nl_name, nl_tokens in token_index.get(tok, []):
                candidates.setdefault(nl_file, (nl_name, nl_tokens))
        if not candidates:
            continue
        # Score by Jaccard similarity: |A intersect B| / |A union B|.
        best_score, best_match = 0, None
        for nl_file, (nl_name, nl_tokens) in candidates.items():
            overlap = len(entry_tokens & nl_tokens)
            combined = len(entry_tokens | nl_tokens)
            score = overlap / combined if combined > 0 else 0
            if score > best_score:
                best_score, best_match = score, (nl_file, nl_name)
        if best_score >= threshold and best_match:
            matches.append({
                'linkedin_slug': entry['slug'],
                'linkedin_name': entry['name'],
                'linkedin_file': entry['file'],
                'nl_file': best_match[0],
                'nl_name': best_match[1],
                'score': int(best_score * 100),
                'match_type': 'token_jaccard'
            })
    return sorted(matches, key=lambda m: -m['score'])
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load unmatched LinkedIn custodians listed in *unmatched_file*.

    Each non-blank line of *unmatched_file* is a slug; the corresponding
    ``<slug>.yaml`` in *linkedin_dir* is read and kept if it parses and
    contains a ``name`` field.

    Returns:
        List of dicts with ``slug``, ``name`` and ``file`` keys.
    """
    custodians = []
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:
            # Best-effort: skip unreadable/malformed YAML. The previous
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            continue
        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'file': str(yaml_path)
            })
    return custodians
def main():
    """CLI entry point: build index, match, deduplicate, report, save.

    Returns:
        Process exit code (0 on success).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fast LinkedIn name matching')
    parser.add_argument('--threshold', type=int, default=50,
                        help='Minimum Jaccard similarity (0-100, default 50)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json')
    args = parser.parse_args()
    # NOTE(review): hard-coded base path ties this script to one machine —
    # consider an env var or a --base-dir flag.
    base_dir = Path('/Users/kempersc/apps/glam')
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'
    print("=" * 60)
    print("Fast LinkedIn Name Matching (Token-Based)")
    print("=" * 60)
    # 1. Build the inverted token index over NL-* custodian files.
    print("\n1. Building token index...")
    token_index, _ = build_token_index(custodian_dir)
    # 2. Load the LinkedIn custodians that previous passes did not match.
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians")
    # 3. Score by token Jaccard; CLI threshold is 0-100, matcher wants 0-1.
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches_fast(linkedin_list, token_index, args.threshold / 100)
    # Deduplicate: each NL file may be claimed at most once. `matches` is
    # sorted by descending score, so the best claim wins.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    print(f"\n Found {len(unique_matches)} unique matches")
    # 4. Report matches bucketed by score tier (top 5 per tier).
    print("\n4. Matches by Score:")
    tiers = [(80, 101), (60, 80), (50, 60)]
    for low, high in tiers:
        tier_matches = [m for m in unique_matches if low <= m['score'] < high]
        if tier_matches:
            print(f"\n Score {low}-{high-1}%: {len(tier_matches)} matches")
            for m in tier_matches[:5]:
                # Fixed: the separator between the two names had been lost
                # (stripped non-ASCII arrow), producing '...''...'.
                print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' -> '{m['nl_name'][:35]}'")
            if len(tier_matches) > 5:
                print(f" ... and {len(tier_matches) - 5} more")
    # 5. Persist the matches as JSON.
    output_path = base_dir / args.output
    # Make sure the destination directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n5. Saving {len(unique_matches)} matches to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': args.threshold,
            'total_matches': len(unique_matches),
            'matches': unique_matches
        }, f, indent=2, ensure_ascii=False)  # keep accented Dutch names readable
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
    return 0
# Script entry point: propagate main()'s return code as the exit status.
if __name__ == '__main__':
    sys.exit(main())