glam/scripts/match_linkedin_by_name.py
2025-12-16 20:27:39 +01:00

274 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
Match unmatched LinkedIn custodians to existing NL-* files by name similarity.
Optimized version using pre-built name index for speed.
"""
import json
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("Warning: rapidfuzz not available, using basic matching")
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical key for comparison.

    Lowercases the name, strips common Dutch organisational affixes
    (stichting, vereniging, museum, het/de, nederland/netherlands),
    removes punctuation and collapses runs of whitespace.
    """
    if not name:
        return ""
    key = name.lower()
    # Strip boilerplate affixes. Each pattern is applied exactly once,
    # in this order (order matters for stacked prefixes).
    affix_patterns = (
        r'^stichting\s+',
        r'^vereniging\s+',
        r'^museum\s+',
        r'\s+museum$',
        r'^het\s+',
        r'^de\s+',
        r'\s+nederland$',
        r'\s+netherlands$',
    )
    for affix in affix_patterns:
        key = re.sub(affix, '', key)
    key = re.sub(r'[^\w\s]', '', key)        # drop punctuation
    return re.sub(r'\s+', ' ', key).strip()  # collapse whitespace
def build_nl_name_index(custodian_dir: Path) -> tuple:
    """Build a lookup of normalized names -> NL-* YAML file paths.

    Scans ``custodian_dir`` for ``NL-*.yaml`` files, skipping files that
    already contain a ``linkedin_enrichment:`` key.  Both the primary
    name and any ``alternative_names`` entries are indexed; alternative
    names never overwrite an existing index entry.

    Args:
        custodian_dir: Directory containing NL-*.yaml custodian files.

    Returns:
        Tuple ``(index, name_to_original)`` where ``index`` maps a
        normalized name to the file path (str) and ``name_to_original``
        maps the same key back to the un-normalized name.
    """
    index = {}             # normalized_name -> file_path
    name_to_original = {}  # normalized -> original name
    print(" Scanning NL-* files...")
    count = 0
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            content = yaml_path.read_text(encoding='utf-8')
            # Cheap substring check: skip files already enriched, without
            # paying for a full YAML parse.
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml.safe_load(content)
            if not data:
                continue
            # Primary name: top-level 'name', else the emic custodian name.
            name = data.get('name') or data.get('custodian_name', {}).get('emic_name')
            if name:
                normalized = normalize_name(name)
                # Keys shorter than 3 chars produce meaningless matches.
                if normalized and len(normalized) > 2:
                    index[normalized] = str(yaml_path)
                    name_to_original[normalized] = name
            # Also index alternative names (first writer wins).
            for alt in data.get('alternative_names', []):
                if alt:
                    normalized = normalize_name(alt)
                    if normalized and len(normalized) > 2 and normalized not in index:
                        index[normalized] = str(yaml_path)
                        name_to_original[normalized] = alt
            count += 1
            if count % 200 == 0:
                print(f" Processed {count} files...")
        except Exception as exc:
            # Best-effort scan: report and skip unparsable files rather
            # than aborting (was a silent `pass`, which hid corrupt files).
            print(f" Warning: skipping {yaml_path}: {exc}")
    print(f" Indexed {len(index)} names from {count} NL-* files")
    return index, name_to_original
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load the unmatched LinkedIn custodians together with their names.

    Args:
        unmatched_file: Text file with one LinkedIn slug per line.
        linkedin_dir: Directory holding ``<slug>.yaml`` profile files.

    Returns:
        List of dicts with keys 'slug', 'name', 'normalized' and 'file'
        for every slug whose YAML file exists and contains a name.
    """
    custodians = []
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]
    print(f" Loading {len(slugs)} unmatched LinkedIn slugs...")
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            # Best-effort: report and skip unreadable/unparsable profiles
            # (was a bare `except: pass`, which also swallowed Ctrl-C).
            print(f" Warning: skipping {slug}: {exc}")
            continue
        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'normalized': normalize_name(data['name']),
                'file': str(yaml_path),
            })
    print(f" Loaded {len(custodians)} LinkedIn custodians with names")
    return custodians
def find_matches(linkedin_list: list, nl_index: dict, name_to_original: dict, threshold: int = 85) -> list:
    """Match LinkedIn custodian names against the NL-* name index.

    An exact hit on the normalized name scores 100 ('exact'); otherwise
    the best fuzzy token-sort score is kept when it reaches ``threshold``
    ('fuzzy').  Without rapidfuzz a substring-containment length ratio is
    used as a fallback scorer.

    Args:
        linkedin_list: Dicts with 'slug', 'name', 'normalized', 'file'.
        nl_index: normalized NL name -> NL file path.
        name_to_original: normalized NL name -> original NL name.
        threshold: Minimum similarity score (0-100) for fuzzy matches.

    Returns:
        Match dicts sorted by descending score.
    """
    matches = []
    nl_names = list(nl_index.keys())
    print(f" Matching {len(linkedin_list)} LinkedIn names against {len(nl_names)} NL names...")
    for i, linkedin in enumerate(linkedin_list):
        linkedin_norm = linkedin['normalized']
        # Too-short keys produce meaningless matches; index skips them too.
        if not linkedin_norm or len(linkedin_norm) < 3:
            continue
        # Cheap exact lookup before any O(n) fuzzy scoring.
        if linkedin_norm in nl_index:
            matches.append({
                'linkedin_slug': linkedin['slug'],
                'linkedin_name': linkedin['name'],
                'linkedin_file': linkedin['file'],
                'nl_file': nl_index[linkedin_norm],
                'nl_name': name_to_original.get(linkedin_norm, linkedin_norm),
                'score': 100,
                'match_type': 'exact'
            })
            continue
        best_score = 0
        best_nl_name = None
        if RAPIDFUZZ_AVAILABLE:
            # Use the module-level `fuzz` import; re-importing rapidfuzz
            # inside the loop (as before) was redundant work per item.
            for nl_name in nl_names:
                score = fuzz.token_sort_ratio(linkedin_norm, nl_name)
                if score > best_score:
                    best_score = score
                    best_nl_name = nl_name
        else:
            # Fallback: substring containment scored by length ratio.
            for nl_name in nl_names:
                if linkedin_norm in nl_name or nl_name in linkedin_norm:
                    shorter = min(len(linkedin_norm), len(nl_name))
                    longer = max(len(linkedin_norm), len(nl_name))
                    score = int((shorter / longer) * 100) if longer > 0 else 0
                    if score > best_score:
                        best_score = score
                        best_nl_name = nl_name
        if best_score >= threshold and best_nl_name:
            matches.append({
                'linkedin_slug': linkedin['slug'],
                'linkedin_name': linkedin['name'],
                'linkedin_file': linkedin['file'],
                'nl_file': nl_index[best_nl_name],
                'nl_name': name_to_original.get(best_nl_name, best_nl_name),
                'score': best_score,
                'match_type': 'fuzzy'
            })
        if (i + 1) % 100 == 0:
            print(f" Processed {i + 1}/{len(linkedin_list)}...")
    return sorted(matches, key=lambda x: -x['score'])
def main():
    """CLI entry point: index NL-* files, match LinkedIn names, save JSON.

    Returns 0 on success (passed to sys.exit by the __main__ guard).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Match LinkedIn custodians by name similarity')
    parser.add_argument('--threshold', type=int, default=85, help='Minimum similarity score (0-100)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json',
                        help='Output file for matches')
    # The base dir was a hard-coded absolute path; keep it as the default
    # for backward compatibility but allow override via flag or $GLAM_DIR.
    parser.add_argument('--base-dir', type=str,
                        default=os.environ.get('GLAM_DIR', '/Users/kempersc/apps/glam'),
                        help='Project base directory (default: $GLAM_DIR or built-in path)')
    args = parser.parse_args()
    base_dir = Path(args.base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'
    print("=" * 60)
    print("LinkedIn Name Similarity Matching (Optimized)")
    print("=" * 60)
    # Build NL name index
    print("\n1. Building NL-* name index...")
    nl_index, name_to_original = build_nl_name_index(custodian_dir)
    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    # Find matches
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches(linkedin_list, nl_index, name_to_original, args.threshold)
    # Deduplicate: matches are sorted best-first, so keeping the first
    # occurrence of each NL file keeps its highest-scoring match.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    print(f"\n Found {len(unique_matches)} unique matches")
    # Show top matches
    print("\n4. Top Matches:")
    for m in unique_matches[:15]:
        # Separator restored: the two names were printed jammed together
        # (an arrow glyph was lost in a previous encoding mishap).
        print(f" {m['score']:3d}% | '{m['linkedin_name'][:40]}' -> '{m['nl_name'][:40]}'")
    if len(unique_matches) > 15:
        print(f" ... and {len(unique_matches) - 15} more")
    # Save matches
    output_path = base_dir / args.output
    print(f"\n5. Saving matches to {output_path}...")
    output_data = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'threshold': args.threshold,
        'total_matches': len(unique_matches),
        'matches': unique_matches
    }
    # Ensure the output directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f" - Exact matches: {len([m for m in unique_matches if m['match_type'] == 'exact'])}")
    print(f" - Fuzzy matches: {len([m for m in unique_matches if m['match_type'] == 'fuzzy'])}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
    return 0
if __name__ == '__main__':
    # Propagate main()'s integer return value to the shell as exit status.
    sys.exit(main())