glam/scripts/match_linkedin_names_ultra.py
2025-12-16 20:27:39 +01:00

196 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Ultra-fast name matching using regex extraction (no YAML parsing).
"""
import json
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
# Ordered fallback patterns for pulling an organisation name out of raw YAML:
# "name:", then "organisatie:" (most NL-* files use this), then the nested
# "emic_name:" under custodian_name.  Compiled once at import time because
# this function runs for every YAML file scanned.
_NAME_PATTERNS = (
    re.compile(r'^name:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
    re.compile(r'^\s*organisatie:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
    re.compile(r'emic_name:\s*["\']?([^"\'\n]+)["\']?\s*$', re.MULTILINE),
)


def extract_name_regex(content: str) -> str:
    """Extract a name from YAML content using regex (much faster than parsing).

    Tries the ``name:``, ``organisatie:`` and ``emic_name:`` fields in that
    order and returns the first match, stripped of surrounding whitespace
    (surrounding single/double quotes are excluded by the patterns).

    Args:
        content: Raw YAML text of a custodian file.

    Returns:
        The extracted name, or ``""`` when no known field is present.
    """
    for pattern in _NAME_PATTERNS:
        match = pattern.search(content)
        if match:
            return match.group(1).strip()
    return ""
# Words too generic to identify an organisation (Dutch + English).  Hoisted to
# a module-level frozenset so it is not rebuilt on every call — tokenize() is
# invoked repeatedly inside the matching loops.
_STOPWORDS = frozenset({
    'de', 'het', 'een', 'van', 'voor', 'in', 'en', 'te', 'den', 'der',
    'stichting', 'vereniging', 'museum', 'archief', 'bibliotheek',
    'the', 'a', 'an', 'of', 'and', 'for', 'at',
    'nederland', 'netherlands', 'holland', 'dutch',
    'nationaal', 'national', 'rijks', 'gemeente', 'gemeentelijk',
})

# Runs of 3+ ASCII letters; accented characters act as token boundaries.
_TOKEN_RE = re.compile(r'\b[a-z]{3,}\b')


def tokenize(name: str) -> set:
    """Extract significant tokens from a name.

    Lowercases *name*, splits it into runs of three or more ASCII letters and
    drops generic Dutch/English organisation stopwords.

    Args:
        name: Organisation name (may be empty).

    Returns:
        A set of significant lowercase tokens; empty for an empty or
        stopword-only name.
    """
    if not name:
        return set()
    return set(_TOKEN_RE.findall(name.lower())) - _STOPWORDS
def main():
base_dir = Path('/Users/kempersc/apps/glam')
linkedin_dir = base_dir / 'data/custodian/linkedin'
custodian_dir = base_dir / 'data/custodian'
unmatched_file = linkedin_dir / '_unmatched.txt'
print("=" * 60)
print("Ultra-Fast Name Matching (Regex-Based)")
print("=" * 60)
# Build token index from NL-* files
print("\n1. Building token index from NL-* files...")
token_index = defaultdict(list) # token -> [(file, name)]
nl_count = 0
skip_count = 0
for yaml_path in custodian_dir.glob("NL-*.yaml"):
content = yaml_path.read_text()
# Skip already enriched
if 'linkedin_enrichment:' in content:
skip_count += 1
continue
name = extract_name_regex(content)
if name:
tokens = tokenize(name)
for token in tokens:
token_index[token].append((str(yaml_path), name))
nl_count += 1
print(f" Indexed {nl_count} NL-* files ({skip_count} already enriched)")
print(f" Token vocabulary: {len(token_index)} unique tokens")
# Load unmatched LinkedIn slugs
print("\n2. Loading unmatched LinkedIn custodians...")
linkedin_list = []
with open(unmatched_file, 'r') as f:
slugs = [line.strip() for line in f if line.strip()]
for slug in slugs:
yaml_path = linkedin_dir / f"{slug}.yaml"
if yaml_path.exists():
content = yaml_path.read_text()
name = extract_name_regex(content)
if name:
linkedin_list.append({
'slug': slug,
'name': name,
'file': str(yaml_path),
'tokens': tokenize(name)
})
print(f" Loaded {len(linkedin_list)} LinkedIn custodians with names")
# Find matches
print("\n3. Finding matches...")
matches = []
for linkedin in linkedin_list:
if not linkedin['tokens']:
continue
# Find candidates sharing at least one token
candidates = {} # file -> (name, shared_count)
for token in linkedin['tokens']:
for (nl_file, nl_name) in token_index.get(token, []):
if nl_file not in candidates:
candidates[nl_file] = {'name': nl_name, 'shared': 0, 'tokens': tokenize(nl_name)}
candidates[nl_file]['shared'] += 1
# Score by Jaccard similarity
best_score = 0
best_match = None
for nl_file, info in candidates.items():
intersection = len(linkedin['tokens'] & info['tokens'])
union = len(linkedin['tokens'] | info['tokens'])
score = intersection / union if union > 0 else 0
if score > best_score:
best_score = score
best_match = (nl_file, info['name'])
if best_score >= 0.5 and best_match:
matches.append({
'linkedin_slug': linkedin['slug'],
'linkedin_name': linkedin['name'],
'linkedin_file': linkedin['file'],
'nl_file': best_match[0],
'nl_name': best_match[1],
'score': int(best_score * 100)
})
# Deduplicate - one NL file can only be matched once (keep best score)
matches.sort(key=lambda x: -x['score'])
seen_nl = set()
unique_matches = []
for match in matches:
if match['nl_file'] not in seen_nl:
unique_matches.append(match)
seen_nl.add(match['nl_file'])
print(f" Found {len(unique_matches)} unique matches")
# Show matches by score tier
print("\n4. Matches by Score Tier:")
for low, high in [(80, 101), (60, 80), (50, 60)]:
tier = [m for m in unique_matches if low <= m['score'] < high]
if tier:
print(f"\n {low}-{high-1}%: {len(tier)} matches")
for m in tier[:5]:
print(f" {m['score']:3d}% | '{m['linkedin_name'][:35]}''{m['nl_name'][:35]}'")
if len(tier) > 5:
print(f" ... and {len(tier) - 5} more")
# Save matches
output_path = base_dir / 'data/custodian/linkedin/_name_matches.json'
print(f"\n5. Saving to {output_path}...")
with open(output_path, 'w') as f:
json.dump({
'generated_at': datetime.now(timezone.utc).isoformat(),
'threshold': 50,
'total_matches': len(unique_matches),
'matches': unique_matches
}, f, indent=2)
# Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Unmatched LinkedIn: {len(linkedin_list)}")
print(f"Name matches found: {len(unique_matches)}")
print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
return 0
if __name__ == '__main__':
    # Script entry point: propagate main()'s return code to the shell.
    raise SystemExit(main())