#!/usr/bin/env python3
"""
Ultra-fast name matching using regex extraction (no YAML parsing).
"""

import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
def extract_name_regex(content: str) -> str:
    """Extract a display name from YAML content using regex (much faster than parsing).

    Tries three fields in priority order:
      1. top-level "name:"
      2. "organisatie:" at any indent (most NL-* files use this)
      3. nested "emic_name:" (under custodian_name)

    Returns the first stripped match, or "" when no field is found.
    """
    field_patterns = (
        r'^name:\s*["\']?([^"\'\n]+)["\']?\s*$',
        r'^\s*organisatie:\s*["\']?([^"\'\n]+)["\']?\s*$',
        r'emic_name:\s*["\']?([^"\'\n]+)["\']?\s*$',
    )
    for pattern in field_patterns:
        found = re.search(pattern, content, re.MULTILINE)
        if found:
            return found.group(1).strip()
    return ""
def tokenize(name: str) -> set:
    """Extract significant tokens from a name.

    Lowercases the name, pulls out runs of 3+ ASCII letters, and drops
    common Dutch/English stopwords and generic GLAM-sector words that
    carry no distinguishing power when matching organisation names.
    Returns an empty set for a falsy name.
    """
    if not name:
        return set()

    # Words too common to help discriminate between organisations.
    stopwords = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
        'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
        'the', 'a', 'an', 'of', 'and', 'for', 'at', 'in',
        'nederland', 'netherlands', 'holland', 'dutch',
        'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
    }

    words = re.findall(r'\b[a-z]{3,}\b', name.lower())
    return {word for word in words if word not in stopwords}
def _build_token_index(custodian_dir: Path):
    """Index non-enriched NL-*.yaml files by name token.

    Returns (token_index, indexed_count, skipped_count), where token_index
    maps token -> list of (file_path_str, name, token_set).  The token set
    is stored alongside each entry so candidates never need re-tokenizing
    during scoring (the original recomputed tokenize() per candidate).
    """
    token_index = defaultdict(list)
    indexed = 0
    skipped = 0
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        # Explicit encoding: names contain Dutch diacritics; don't depend
        # on the locale's default codec.
        content = yaml_path.read_text(encoding='utf-8')

        # Skip files that already carry LinkedIn enrichment data.
        if 'linkedin_enrichment:' in content:
            skipped += 1
            continue

        name = extract_name_regex(content)
        if not name:
            continue
        tokens = tokenize(name)
        for token in tokens:
            token_index[token].append((str(yaml_path), name, tokens))
        indexed += 1
    return token_index, indexed, skipped


def _load_linkedin_entries(linkedin_dir: Path, unmatched_file: Path) -> list:
    """Load name/token records for each slug listed in the unmatched file.

    Slugs whose YAML file is missing or yields no name are silently dropped.
    """
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    entries = []
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        name = extract_name_regex(yaml_path.read_text(encoding='utf-8'))
        if name:
            entries.append({
                'slug': slug,
                'name': name,
                'file': str(yaml_path),
                'tokens': tokenize(name),
            })
    return entries


def _best_match(tokens: set, token_index) -> tuple:
    """Return (best_jaccard_score, (nl_file, nl_name) or None).

    Candidates are NL files sharing at least one token with *tokens*;
    each candidate is scored by Jaccard similarity of the token sets.
    """
    candidates = {}
    for token in tokens:
        for nl_file, nl_name, nl_tokens in token_index.get(token, []):
            candidates.setdefault(nl_file, (nl_name, nl_tokens))

    best_score = 0.0
    best = None
    for nl_file, (nl_name, nl_tokens) in candidates.items():
        union = len(tokens | nl_tokens)
        score = len(tokens & nl_tokens) / union if union else 0.0
        if score > best_score:
            best_score = score
            best = (nl_file, nl_name)
    return best_score, best


def main():
    """Match unmatched LinkedIn custodians to NL-* custodian files by name.

    Pipeline:
      1. Build a token index over non-enriched NL-*.yaml files.
      2. Load names for every slug listed in _unmatched.txt.
      3. Score each LinkedIn entry against candidates by token Jaccard
         similarity; keep matches scoring >= 0.5.
      4. Deduplicate so each NL file is matched at most once (best score wins).
      5. Write matches to _name_matches.json and print a summary.

    Returns 0 (used as the process exit code).
    """
    # NOTE(review): hard-coded to one developer's machine — consider an
    # environment variable or CLI argument for the base directory.
    base_dir = Path('/Users/kempersc/apps/glam')
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    print("=" * 60)
    print("Ultra-Fast Name Matching (Regex-Based)")
    print("=" * 60)

    # 1. Token index over NL-* files.
    print("\n1. Building token index from NL-* files...")
    token_index, nl_count, skip_count = _build_token_index(custodian_dir)
    print(f" Indexed {nl_count} NL-* files ({skip_count} already enriched)")
    print(f" Token vocabulary: {len(token_index)} unique tokens")

    # 2. Unmatched LinkedIn slugs -> name/token records.
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = _load_linkedin_entries(linkedin_dir, unmatched_file)
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians with names")

    # 3. Score and keep confident matches (Jaccard >= 0.5 -> score >= 50).
    print("\n3. Finding matches...")
    matches = []
    for entry in linkedin_list:
        if not entry['tokens']:
            continue
        best_score, best = _best_match(entry['tokens'], token_index)
        if best_score >= 0.5 and best:
            matches.append({
                'linkedin_slug': entry['slug'],
                'linkedin_name': entry['name'],
                'linkedin_file': entry['file'],
                'nl_file': best[0],
                'nl_name': best[1],
                'score': int(best_score * 100),
            })

    # Deduplicate - one NL file can only be matched once (keep best score).
    matches.sort(key=lambda x: -x['score'])
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])

    print(f" Found {len(unique_matches)} unique matches")

    # 4. Report matches bucketed by score tier (top 5 per tier shown).
    print("\n4. Matches by Score Tier:")
    for low, high in [(80, 101), (60, 80), (50, 60)]:
        tier = [m for m in unique_matches if low <= m['score'] < high]
        if not tier:
            continue
        print(f"\n {low}-{high-1}%: {len(tier)} matches")
        for m in tier[:5]:
            print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' → '{m['nl_name'][:35]}'")
        if len(tier) > 5:
            print(f" ... and {len(tier) - 5} more")

    # 5. Persist matches for the enrichment step.
    output_path = base_dir / 'data/custodian/linkedin/_name_matches.json'
    print(f"\n5. Saving to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': 50,
            'total_matches': len(unique_matches),
            'matches': unique_matches
        }, f, indent=2)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")

    return 0
if __name__ == '__main__':
    # Run as a script: propagate main()'s return value as the exit status.
    sys.exit(main())