#!/usr/bin/env python3
|
|
"""
|
|
Match unmatched LinkedIn custodians to existing NL-* files by name similarity.
|
|
|
|
Optimized version using pre-built name index for speed.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
import yaml
|
|
|
|
try:
|
|
from rapidfuzz import fuzz
|
|
RAPIDFUZZ_AVAILABLE = True
|
|
except ImportError:
|
|
RAPIDFUZZ_AVAILABLE = False
|
|
print("Warning: rapidfuzz not available, using basic matching")
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a lowercased, de-punctuated form of *name* for comparison.

    Strips common Dutch/English institutional affixes ('stichting',
    'vereniging', 'museum', 'het', 'de', 'nederland'/'netherlands'),
    removes punctuation, and collapses runs of whitespace into single
    spaces. A falsy input yields ''.
    """
    if not name:
        return ""

    result = name.lower()

    # Affix patterns are applied once each, in this fixed order.
    affix_patterns = (
        r'^stichting\s+',
        r'^vereniging\s+',
        r'^museum\s+',
        r'\s+museum$',
        r'^het\s+',
        r'^de\s+',
        r'\s+nederland$',
        r'\s+netherlands$',
    )
    for affix in affix_patterns:
        result = re.sub(affix, '', result)

    # Strip punctuation, then squeeze and trim whitespace.
    result = re.sub(r'[^\w\s]', '', result)
    return re.sub(r'\s+', ' ', result).strip()
|
|
|
|
|
|
def build_nl_name_index(custodian_dir: Path) -> tuple:
    """Build an index of NL-* file names. Only includes non-enriched files.

    Scans ``custodian_dir`` for ``NL-*.yaml`` files, skips any that already
    contain a ``linkedin_enrichment:`` section, and indexes the primary name
    plus any alternative names (normalized, minimum 3 characters).

    Args:
        custodian_dir: Directory containing NL-*.yaml custodian files.

    Returns:
        Tuple ``(index, name_to_original)`` where ``index`` maps a
        normalized name to the file path (str) and ``name_to_original``
        maps the same normalized name back to the original name.
    """
    index = {}  # normalized_name -> file_path
    name_to_original = {}  # normalized -> original name

    print(" Scanning NL-* files...")
    count = 0

    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            content = yaml_path.read_text(encoding='utf-8')

            # Cheap substring check avoids a full YAML parse for files
            # enriched in a previous run.
            if 'linkedin_enrichment:' in content:
                continue

            data = yaml.safe_load(content)
            if not data:
                continue

            # Primary name: top-level 'name', falling back to the emic name.
            # `custodian_name` may exist with a null value, hence `or {}`
            # (a plain .get(..., {}) default would still return None).
            name = data.get('name') or (data.get('custodian_name') or {}).get('emic_name')
            if name:
                normalized = normalize_name(name)
                if normalized and len(normalized) > 2:
                    index[normalized] = str(yaml_path)
                    name_to_original[normalized] = name

            # Alternative names never overwrite an existing index entry.
            # `alternative_names` may also be null in the YAML.
            for alt in (data.get('alternative_names') or []):
                if alt:
                    normalized = normalize_name(alt)
                    if normalized and len(normalized) > 2 and normalized not in index:
                        index[normalized] = str(yaml_path)
                        name_to_original[normalized] = alt

            count += 1
            if count % 200 == 0:
                print(f" Processed {count} files...")

        except Exception as e:
            # Best-effort scan: report the problem file instead of
            # silently swallowing the error, then keep going.
            print(f" Warning: skipping {yaml_path}: {e}", file=sys.stderr)

    print(f" Indexed {len(index)} names from {count} NL-* files")
    return index, name_to_original
|
|
|
|
|
|
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load unmatched LinkedIn custodians with their names.

    Args:
        unmatched_file: Text file containing one LinkedIn slug per line.
        linkedin_dir: Directory containing ``<slug>.yaml`` LinkedIn files.

    Returns:
        A list of dicts with keys ``slug``, ``name``, ``normalized`` and
        ``file`` for every slug whose YAML file exists and has a
        non-empty ``name`` field.
    """
    custodians = []

    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]

    print(f" Loading {len(slugs)} unmatched LinkedIn slugs...")

    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as e:
            # The previous bare `except:` even caught KeyboardInterrupt;
            # report unparseable files and move on instead.
            print(f" Warning: skipping {yaml_path}: {e}", file=sys.stderr)
            continue

        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'normalized': normalize_name(data['name']),
                'file': str(yaml_path),
            })

    print(f" Loaded {len(custodians)} LinkedIn custodians with names")
    return custodians
|
|
|
|
|
|
def _make_match(linkedin: dict, nl_norm: str, nl_index: dict,
                name_to_original: dict, score: int, match_type: str) -> dict:
    """Assemble one match record linking a LinkedIn custodian to an NL file."""
    return {
        'linkedin_slug': linkedin['slug'],
        'linkedin_name': linkedin['name'],
        'linkedin_file': linkedin['file'],
        'nl_file': nl_index[nl_norm],
        'nl_name': name_to_original.get(nl_norm, nl_norm),
        'score': score,
        'match_type': match_type,
    }


def _best_fuzzy(query: str, candidates: list) -> tuple:
    """Return ``(best_score, best_name)`` for *query* over *candidates*.

    Uses rapidfuzz ``token_sort_ratio`` when available; otherwise falls
    back to a containment heuristic scored by length ratio.
    """
    best_score = 0
    best_name = None
    if RAPIDFUZZ_AVAILABLE:
        # Scorer hoisted out of the loop; the module-level import already
        # made rapidfuzz available (no per-call re-import needed).
        scorer = fuzz.token_sort_ratio
        for cand in candidates:
            score = scorer(query, cand)
            if score > best_score:
                best_score, best_name = score, cand
    else:
        for cand in candidates:
            if query in cand or cand in query:
                shorter = min(len(query), len(cand))
                longer = max(len(query), len(cand))
                score = int((shorter / longer) * 100) if longer > 0 else 0
                if score > best_score:
                    best_score, best_name = score, cand
    return best_score, best_name


def find_matches(linkedin_list: list, nl_index: dict, name_to_original: dict, threshold: int = 85) -> list:
    """Find name matches between LinkedIn custodians and indexed NL names.

    Args:
        linkedin_list: Dicts with keys 'slug', 'name', 'normalized', 'file'.
        nl_index: Normalized NL name -> NL file path.
        name_to_original: Normalized NL name -> original NL name.
        threshold: Minimum fuzzy score (0-100) to accept a match.

    Returns:
        Match dicts sorted by descending score, each with match_type
        'exact' (score 100) or 'fuzzy'. Entries whose normalized name is
        shorter than 3 characters are skipped.
    """
    matches = []
    nl_names = list(nl_index.keys())

    print(f" Matching {len(linkedin_list)} LinkedIn names against {len(nl_names)} NL names...")

    for i, linkedin in enumerate(linkedin_list):
        linkedin_norm = linkedin['normalized']
        if not linkedin_norm or len(linkedin_norm) < 3:
            continue

        # Exact hit on the normalized name: record and skip the O(n) scan.
        if linkedin_norm in nl_index:
            matches.append(_make_match(linkedin, linkedin_norm, nl_index,
                                       name_to_original, 100, 'exact'))
            continue

        best_score, best_nl_name = _best_fuzzy(linkedin_norm, nl_names)
        if best_score >= threshold and best_nl_name:
            matches.append(_make_match(linkedin, best_nl_name, nl_index,
                                       name_to_original, best_score, 'fuzzy'))

        if (i + 1) % 100 == 0:
            print(f" Processed {i + 1}/{len(linkedin_list)}...")

    return sorted(matches, key=lambda x: -x['score'])
|
|
|
|
|
|
def main():
    """CLI entry point: index NL names, load LinkedIn names, match, save.

    Returns:
        0 on success (suitable for ``sys.exit``).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Match LinkedIn custodians by name similarity')
    parser.add_argument('--threshold', type=int, default=85, help='Minimum similarity score (0-100)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json',
                        help='Output file for matches')
    # Previously hard-coded; the default preserves the old behavior.
    parser.add_argument('--base-dir', type=str, default='/Users/kempersc/apps/glam',
                        help='Project base directory')
    args = parser.parse_args()

    base_dir = Path(args.base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'

    print("=" * 60)
    print("LinkedIn Name Similarity Matching (Optimized)")
    print("=" * 60)

    # Build NL name index
    print("\n1. Building NL-* name index...")
    nl_index, name_to_original = build_nl_name_index(custodian_dir)

    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)

    # Find matches
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches(linkedin_list, nl_index, name_to_original, args.threshold)

    # Deduplicate - one NL file can only be matched once. Matches arrive
    # sorted best-first, so keeping the first occurrence of each NL file
    # keeps its highest-scoring match.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])

    print(f"\n Found {len(unique_matches)} unique matches")

    # Show top matches
    print("\n4. Top Matches:")
    for m in unique_matches[:15]:
        print(f" {m['score']:3d}% | '{m['linkedin_name'][:40]}' → '{m['nl_name'][:40]}'")

    if len(unique_matches) > 15:
        print(f" ... and {len(unique_matches) - 15} more")

    # Save matches
    output_path = base_dir / args.output
    print(f"\n5. Saving matches to {output_path}...")

    output_data = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'threshold': args.threshold,
        'total_matches': len(unique_matches),
        'matches': unique_matches,
    }

    # Create the target directory if needed so the dump cannot fail on a
    # missing path; keep non-ASCII (Dutch) names readable in the JSON.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    exact_count = sum(1 for m in unique_matches if m['match_type'] == 'exact')
    fuzzy_count = sum(1 for m in unique_matches if m['match_type'] == 'fuzzy')
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f" - Exact matches: {exact_count}")
    print(f" - Fuzzy matches: {fuzzy_count}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
|