glam/scripts/match_linkedin_by_name_fast.py
2025-12-16 20:27:39 +01:00

233 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fast name matching for LinkedIn to NL-* custodians.
Uses token-based indexing for O(1) lookups instead of O(n²) pairwise comparison.
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
import yaml
def tokenize(name: str) -> set:
    """Return the set of significant lowercase tokens in *name*.

    Tokens are runs of three or more ASCII letters; common Dutch/English
    filler words and generic GLAM-sector terms are removed so that only
    discriminating words remain for matching.
    """
    if not name:
        return set()
    lowered = name.lower()
    # Words that carry no discriminating power when comparing institution
    # names: articles, prepositions, sector terms, and geography.
    stopwords = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
        'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
        'the', 'a', 'an', 'of', 'and', 'for', 'at', 'in',
        'nederland', 'netherlands', 'holland', 'dutch',
        'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
        'historisch', 'historical', 'history', 'historic',
    }
    words = re.findall(r'\b[a-z]{3,}\b', lowered)
    return {word for word in words if word not in stopwords}
def build_token_index(custodian_dir: Path) -> tuple:
    """Build an inverted token index over NL-* custodian YAML files.

    Args:
        custodian_dir: Directory containing ``NL-*.yaml`` files.

    Returns:
        ``(token_index, all_entries)`` where ``token_index`` maps
        token -> list of ``(file, name, tokens)`` tuples and
        ``all_entries`` is the flat list of those tuples.
    """
    token_index = defaultdict(list)  # token -> [(file, name, tokens)]
    all_entries = []  # list of (file, name, tokens)
    print(" Building token index from NL-* files...")
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            # Explicit UTF-8: institution names contain accented characters
            # and the platform default encoding is not guaranteed.
            with open(yaml_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Skip files already enriched by a previous run.
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml.safe_load(content)
            if not data:
                continue
            # Prefer top-level 'name', fall back to the nested emic name.
            name = data.get('name') or data.get('custodian_name', {}).get('emic_name', '')
            if not name:
                continue
            tokens = tokenize(name)
            if tokens:
                entry = (str(yaml_path), name, tokens)
                all_entries.append(entry)
                for token in tokens:
                    token_index[token].append(entry)
        except Exception as exc:
            # Best-effort: one malformed file must not abort indexing, but
            # report it instead of swallowing the error silently.
            print(f" WARNING: skipping {yaml_path}: {exc}", file=sys.stderr)
    print(f" Indexed {len(all_entries)} NL-* files with {len(token_index)} unique tokens")
    return token_index, all_entries
def find_matches_fast(linkedin_list: list, token_index: dict, threshold: float = 0.5) -> list:
    """Match LinkedIn custodians to NL-* entries via token overlap.

    Candidates are gathered through the inverted index (any shared token)
    and scored with Jaccard similarity; for each LinkedIn entry only the
    single best-scoring candidate at or above *threshold* is kept.

    Returns a list of match dicts sorted by descending score (0-100).
    """
    matches = []
    print(f" Matching {len(linkedin_list)} LinkedIn custodians...")
    for entry in linkedin_list:
        entry_tokens = tokenize(entry['name'])
        if not entry_tokens:
            continue
        # Gather NL candidates sharing at least one token with this entry.
        candidates = {}  # nl_file -> (nl_name, nl_tokens)
        for tok in entry_tokens:
            for nl_file, nl_name, nl_tokens in token_index.get(tok, []):
                candidates.setdefault(nl_file, (nl_name, nl_tokens))
        if not candidates:
            continue
        # Score by Jaccard similarity: |A intersect B| / |A union B|.
        best_score, best_match = 0, None
        for nl_file, (nl_name, nl_tokens) in candidates.items():
            overlap = len(entry_tokens & nl_tokens)
            combined = len(entry_tokens | nl_tokens)
            score = overlap / combined if combined > 0 else 0
            if score > best_score:
                best_score, best_match = score, (nl_file, nl_name)
        if best_score >= threshold and best_match:
            matches.append({
                'linkedin_slug': entry['slug'],
                'linkedin_name': entry['name'],
                'linkedin_file': entry['file'],
                'nl_file': best_match[0],
                'nl_name': best_match[1],
                'score': int(best_score * 100),
                'match_type': 'token_jaccard'
            })
    return sorted(matches, key=lambda m: -m['score'])
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load unmatched LinkedIn custodians listed in *unmatched_file*.

    Each non-blank line of *unmatched_file* is a slug; the corresponding
    ``<slug>.yaml`` in *linkedin_dir* is read and kept if it parses and
    contains a ``name`` field.

    Returns:
        List of dicts with ``slug``, ``name`` and ``file`` keys.
    """
    custodians = []
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception:
            # Best-effort: skip unreadable/malformed YAML. The previous
            # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            continue
        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'file': str(yaml_path)
            })
    return custodians
def main():
    """CLI entry point: build index, match, deduplicate, report, save.

    Returns:
        Process exit code (0 on success).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fast LinkedIn name matching')
    parser.add_argument('--threshold', type=int, default=50,
                        help='Minimum Jaccard similarity (0-100, default 50)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json')
    args = parser.parse_args()
    # NOTE(review): hard-coded base path ties this script to one machine —
    # consider an env var or a --base-dir flag.
    base_dir = Path('/Users/kempersc/apps/glam')
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'
    print("=" * 60)
    print("Fast LinkedIn Name Matching (Token-Based)")
    print("=" * 60)
    # 1. Build the inverted token index over NL-* custodian files.
    print("\n1. Building token index...")
    token_index, _ = build_token_index(custodian_dir)
    # 2. Load the LinkedIn custodians that previous passes did not match.
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    print(f" Loaded {len(linkedin_list)} LinkedIn custodians")
    # 3. Score by token Jaccard; CLI threshold is 0-100, matcher wants 0-1.
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches_fast(linkedin_list, token_index, args.threshold / 100)
    # Deduplicate: each NL file may be claimed at most once. `matches` is
    # sorted by descending score, so the best claim wins.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    print(f"\n Found {len(unique_matches)} unique matches")
    # 4. Report matches bucketed by score tier (top 5 per tier).
    print("\n4. Matches by Score:")
    tiers = [(80, 101), (60, 80), (50, 60)]
    for low, high in tiers:
        tier_matches = [m for m in unique_matches if low <= m['score'] < high]
        if tier_matches:
            print(f"\n Score {low}-{high-1}%: {len(tier_matches)} matches")
            for m in tier_matches[:5]:
                # Fixed: the separator between the two names had been lost
                # (stripped non-ASCII arrow), producing '...''...'.
                print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}' -> '{m['nl_name'][:35]}'")
            if len(tier_matches) > 5:
                print(f" ... and {len(tier_matches) - 5} more")
    # 5. Persist the matches as JSON.
    output_path = base_dir / args.output
    # Make sure the destination directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n5. Saving {len(unique_matches)} matches to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'threshold': args.threshold,
            'total_matches': len(unique_matches),
            'matches': unique_matches
        }, f, indent=2, ensure_ascii=False)  # keep accented Dutch names readable
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
    return 0
# Script entry point: propagate main()'s return code as the exit status.
if __name__ == '__main__':
    sys.exit(main())