glam/scripts/match_linkedin_names_ultra.py
2025-12-16 20:27:39 +01:00

196 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Ultra-fast name matching using regex extraction (no YAML parsing).
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
# Ordered fallback patterns for pulling an organisation name out of raw YAML:
# "name:", then "organisatie:" (most NL-* files use this), then the nested
# "emic_name:" under custodian_name.  Compiled once at import time because
# this function runs for every YAML file scanned.
_NAME_PATTERNS = (
    re.compile(r'^name:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
    re.compile(r'^\s*organisatie:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
    re.compile(r'emic_name:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
)


def extract_name_regex(content: str) -> str:
    """Extract a name from YAML content using regex (much faster than parsing).

    Tries the ``name:``, ``organisatie:`` and ``emic_name:`` fields in that
    order and returns the first match, stripped of surrounding whitespace
    (surrounding single/double quotes are excluded by the patterns).

    Args:
        content: Raw YAML text of a custodian file.

    Returns:
        The extracted name, or ``""`` when no known field is present.
    """
    for pattern in _NAME_PATTERNS:
        match = pattern.search(content)
        if match:
            return match.group(1).strip()
    return ""
# Words too generic to identify an organisation (Dutch + English).  Hoisted to
# a module-level frozenset so it is not rebuilt on every call — tokenize() is
# invoked repeatedly inside the matching loops.
_STOPWORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
    'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
    'the', 'a', 'an', 'of', 'and', 'for', 'at',
    'nederland', 'netherlands', 'holland', 'dutch',
    'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
})

# Runs of 3+ ASCII letters; accented characters act as token boundaries.
_TOKEN_RE = re.compile(r'\b[a-z]{3,}\b')


def tokenize(name: str) -> set:
    """Extract significant tokens from a name.

    Lowercases *name*, splits it into runs of three or more ASCII letters and
    drops generic Dutch/English organisation stopwords.

    Args:
        name: Organisation name (may be empty).

    Returns:
        A set of significant lowercase tokens; empty for an empty or
        stopword-only name.
    """
    if not name:
        return set()
    return set(_TOKEN_RE.findall(name.lower())) - _STOPWORDS
def main():
base_dir = Path('/Users/kempersc/apps/glam')
linkedin_dir = base_dir / 'data/custodian/linkedin'
custodian_dir = base_dir / 'data/custodian'
unmatched_file = linkedin_dir / '_unmatched.txt'
print("=" * 60)
print("Ultra-Fast Name Matching (Regex-Based)")
print("=" * 60)
# Build token index from NL-* files
print("\n1. Building token index from NL-* files...")
token_index = defaultdict(list) # token -> [(file, name)]
nl_count = 0
skip_count = 0
for yaml_path in custodian_dir.glob("NL-*.yaml"):
content = yaml_path.read_text()
# Skip already enriched
if 'linkedin_enrichment:' in content:
skip_count += 1
continue
name = extract_name_regex(content)
if name:
tokens = tokenize(name)
for token in tokens:
token_index[token].append((str(yaml_path), name))
nl_count += 1
print(f" Indexed {nl_count} NL-* files ({skip_count} already enriched)")
print(f" Token vocabulary: {len(token_index)} unique tokens")
# Load unmatched LinkedIn slugs
print("\n2. Loading unmatched LinkedIn custodians...")
linkedin_list = []
with open(unmatched_file, 'r') as f:
slugs = [line.strip() for line in f if line.strip()]
for slug in slugs:
yaml_path = linkedin_dir / f"{slug}.yaml"
if yaml_path.exists():
content = yaml_path.read_text()
name = extract_name_regex(content)
if name:
linkedin_list.append({
'slug': slug,
'name': name,
'file': str(yaml_path),
'tokens': tokenize(name)
})
print(f" Loaded {len(linkedin_list)} LinkedIn custodians with names")
# Find matches
print("\n3. Finding matches...")
matches = []
for linkedin in linkedin_list:
if not linkedin['tokens']:
continue
# Find candidates sharing at least one token
candidates = {} # file -> (name, shared_count)
for token in linkedin['tokens']:
for (nl_file, nl_name) in token_index.get(token, []):
if nl_file not in candidates:
candidates[nl_file] = {'name': nl_name, 'shared': 0, 'tokens': tokenize(nl_name)}
candidates[nl_file]['shared'] += 1
# Score by Jaccard similarity
best_score = 0
best_match = None
for nl_file, info in candidates.items():
intersection = len(linkedin['tokens'] & info['tokens'])
union = len(linkedin['tokens'] | info['tokens'])
score = intersection / union if union > 0 else 0
if score > best_score:
best_score = score
best_match = (nl_file, info['name'])
if best_score >= 0.5 and best_match:
matches.append({
'linkedin_slug': linkedin['slug'],
'linkedin_name': linkedin['name'],
'linkedin_file': linkedin['file'],
'nl_file': best_match[0],
'nl_name': best_match[1],
'score': int(best_score * 100)
})
# Deduplicate - one NL file can only be matched once (keep best score)
matches.sort(key=lambda x: -x['score'])
seen_nl = set()
unique_matches = []
for match in matches:
if match['nl_file'] not in seen_nl:
unique_matches.append(match)
seen_nl.add(match['nl_file'])
print(f" Found {len(unique_matches)} unique matches")
# Show matches by score tier
print("\n4. Matches by Score Tier:")
for low, high in [(80, 101), (60, 80), (50, 60)]:
tier = [m for m in unique_matches if low <= m['score'] < high]
if tier:
print(f"\n {low}-{high-1}%: {len(tier)} matches")
for m in tier[:5]:
print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}''{m['nl_name'][:35]}'")
if len(tier) > 5:
print(f" ... and {len(tier) - 5} more")
# Save matches
output_path = base_dir / 'data/custodian/linkedin/_name_matches.json'
print(f"\n5. Saving to {output_path}...")
with open(output_path, 'w') as f:
json.dump({
'generated_at': datetime.now(timezone.utc).isoformat(),
'threshold': 50,
'total_matches': len(unique_matches),
'matches': unique_matches
}, f, indent=2)
# Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Unmatched LinkedIn: {len(linkedin_list)}")
print(f"Name matches found: {len(unique_matches)}")
print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
return 0
if __name__ == '__main__':
    # Script entry point: propagate main()'s return code to the shell.
    raise SystemExit(main())