glam/scripts/match_linkedin_by_name.py
2025-12-16 20:27:39 +01:00

274 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
Match unmatched LinkedIn custodians to existing NL-* files by name similarity.
Optimized version using pre-built name index for speed.
"""
import json
import os
import re
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("Warning: rapidfuzz not available, using basic matching")
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical key for comparison.

    Lowercases the name, strips common Dutch organisational affixes
    (stichting, vereniging, museum, het/de, nederland/netherlands),
    removes punctuation and collapses runs of whitespace.
    """
    if not name:
        return ""
    key = name.lower()
    # Strip boilerplate affixes. Each pattern is applied exactly once,
    # in this order (order matters for stacked prefixes).
    affix_patterns = (
        r'^stichting\s+',
        r'^vereniging\s+',
        r'^museum\s+',
        r'\s+museum$',
        r'^het\s+',
        r'^de\s+',
        r'\s+nederland$',
        r'\s+netherlands$',
    )
    for affix in affix_patterns:
        key = re.sub(affix, '', key)
    key = re.sub(r'[^\w\s]', '', key)        # drop punctuation
    return re.sub(r'\s+', ' ', key).strip()  # collapse whitespace
def build_nl_name_index(custodian_dir: Path) -> tuple:
    """Build a lookup of normalized names -> NL-* YAML file paths.

    Scans ``custodian_dir`` for ``NL-*.yaml`` files, skipping files that
    already contain a ``linkedin_enrichment:`` key.  Both the primary
    name and any ``alternative_names`` entries are indexed; alternative
    names never overwrite an existing index entry.

    Args:
        custodian_dir: Directory containing NL-*.yaml custodian files.

    Returns:
        Tuple ``(index, name_to_original)`` where ``index`` maps a
        normalized name to the file path (str) and ``name_to_original``
        maps the same key back to the un-normalized name.
    """
    index = {}             # normalized_name -> file_path
    name_to_original = {}  # normalized -> original name
    print(" Scanning NL-* files...")
    count = 0
    for yaml_path in custodian_dir.glob("NL-*.yaml"):
        try:
            content = yaml_path.read_text(encoding='utf-8')
            # Cheap substring check: skip files already enriched, without
            # paying for a full YAML parse.
            if 'linkedin_enrichment:' in content:
                continue
            data = yaml.safe_load(content)
            if not data:
                continue
            # Primary name: top-level 'name', else the emic custodian name.
            name = data.get('name') or data.get('custodian_name', {}).get('emic_name')
            if name:
                normalized = normalize_name(name)
                # Keys shorter than 3 chars produce meaningless matches.
                if normalized and len(normalized) > 2:
                    index[normalized] = str(yaml_path)
                    name_to_original[normalized] = name
            # Also index alternative names (first writer wins).
            for alt in data.get('alternative_names', []):
                if alt:
                    normalized = normalize_name(alt)
                    if normalized and len(normalized) > 2 and normalized not in index:
                        index[normalized] = str(yaml_path)
                        name_to_original[normalized] = alt
            count += 1
            if count % 200 == 0:
                print(f" Processed {count} files...")
        except Exception as exc:
            # Best-effort scan: report and skip unparsable files rather
            # than aborting (was a silent `pass`, which hid corrupt files).
            print(f" Warning: skipping {yaml_path}: {exc}")
    print(f" Indexed {len(index)} names from {count} NL-* files")
    return index, name_to_original
def load_linkedin_names(unmatched_file: Path, linkedin_dir: Path) -> list:
    """Load the unmatched LinkedIn custodians together with their names.

    Args:
        unmatched_file: Text file with one LinkedIn slug per line.
        linkedin_dir: Directory holding ``<slug>.yaml`` profile files.

    Returns:
        List of dicts with keys 'slug', 'name', 'normalized' and 'file'
        for every slug whose YAML file exists and contains a name.
    """
    custodians = []
    with open(unmatched_file, 'r', encoding='utf-8') as f:
        slugs = [line.strip() for line in f if line.strip()]
    print(f" Loading {len(slugs)} unmatched LinkedIn slugs...")
    for slug in slugs:
        yaml_path = linkedin_dir / f"{slug}.yaml"
        if not yaml_path.exists():
            continue
        try:
            with open(yaml_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except Exception as exc:
            # Best-effort: report and skip unreadable/unparsable profiles
            # (was a bare `except: pass`, which also swallowed Ctrl-C).
            print(f" Warning: skipping {slug}: {exc}")
            continue
        if data and data.get('name'):
            custodians.append({
                'slug': slug,
                'name': data['name'],
                'normalized': normalize_name(data['name']),
                'file': str(yaml_path),
            })
    print(f" Loaded {len(custodians)} LinkedIn custodians with names")
    return custodians
def find_matches(linkedin_list: list, nl_index: dict, name_to_original: dict, threshold: int = 85) -> list:
    """Match LinkedIn custodian names against the NL-* name index.

    An exact hit on the normalized name scores 100 ('exact'); otherwise
    the best fuzzy token-sort score is kept when it reaches ``threshold``
    ('fuzzy').  Without rapidfuzz a substring-containment length ratio is
    used as a fallback scorer.

    Args:
        linkedin_list: Dicts with 'slug', 'name', 'normalized', 'file'.
        nl_index: normalized NL name -> NL file path.
        name_to_original: normalized NL name -> original NL name.
        threshold: Minimum similarity score (0-100) for fuzzy matches.

    Returns:
        Match dicts sorted by descending score.
    """
    matches = []
    nl_names = list(nl_index.keys())
    print(f" Matching {len(linkedin_list)} LinkedIn names against {len(nl_names)} NL names...")
    for i, linkedin in enumerate(linkedin_list):
        linkedin_norm = linkedin['normalized']
        # Too-short keys produce meaningless matches; index skips them too.
        if not linkedin_norm or len(linkedin_norm) < 3:
            continue
        # Cheap exact lookup before any O(n) fuzzy scoring.
        if linkedin_norm in nl_index:
            matches.append({
                'linkedin_slug': linkedin['slug'],
                'linkedin_name': linkedin['name'],
                'linkedin_file': linkedin['file'],
                'nl_file': nl_index[linkedin_norm],
                'nl_name': name_to_original.get(linkedin_norm, linkedin_norm),
                'score': 100,
                'match_type': 'exact'
            })
            continue
        best_score = 0
        best_nl_name = None
        if RAPIDFUZZ_AVAILABLE:
            # Use the module-level `fuzz` import; re-importing rapidfuzz
            # inside the loop (as before) was redundant work per item.
            for nl_name in nl_names:
                score = fuzz.token_sort_ratio(linkedin_norm, nl_name)
                if score > best_score:
                    best_score = score
                    best_nl_name = nl_name
        else:
            # Fallback: substring containment scored by length ratio.
            for nl_name in nl_names:
                if linkedin_norm in nl_name or nl_name in linkedin_norm:
                    shorter = min(len(linkedin_norm), len(nl_name))
                    longer = max(len(linkedin_norm), len(nl_name))
                    score = int((shorter / longer) * 100) if longer > 0 else 0
                    if score > best_score:
                        best_score = score
                        best_nl_name = nl_name
        if best_score >= threshold and best_nl_name:
            matches.append({
                'linkedin_slug': linkedin['slug'],
                'linkedin_name': linkedin['name'],
                'linkedin_file': linkedin['file'],
                'nl_file': nl_index[best_nl_name],
                'nl_name': name_to_original.get(best_nl_name, best_nl_name),
                'score': best_score,
                'match_type': 'fuzzy'
            })
        if (i + 1) % 100 == 0:
            print(f" Processed {i + 1}/{len(linkedin_list)}...")
    return sorted(matches, key=lambda x: -x['score'])
def main():
    """CLI entry point: index NL-* files, match LinkedIn names, save JSON.

    Returns 0 on success (passed to sys.exit by the __main__ guard).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Match LinkedIn custodians by name similarity')
    parser.add_argument('--threshold', type=int, default=85, help='Minimum similarity score (0-100)')
    parser.add_argument('--output', type=str, default='data/custodian/linkedin/_name_matches.json',
                        help='Output file for matches')
    # The base dir was a hard-coded absolute path; keep it as the default
    # for backward compatibility but allow override via flag or $GLAM_DIR.
    parser.add_argument('--base-dir', type=str,
                        default=os.environ.get('GLAM_DIR', '/Users/kempersc/apps/glam'),
                        help='Project base directory (default: $GLAM_DIR or built-in path)')
    args = parser.parse_args()
    base_dir = Path(args.base_dir)
    linkedin_dir = base_dir / 'data/custodian/linkedin'
    custodian_dir = base_dir / 'data/custodian'
    unmatched_file = linkedin_dir / '_unmatched.txt'
    print("=" * 60)
    print("LinkedIn Name Similarity Matching (Optimized)")
    print("=" * 60)
    # Build NL name index
    print("\n1. Building NL-* name index...")
    nl_index, name_to_original = build_nl_name_index(custodian_dir)
    # Load LinkedIn names
    print("\n2. Loading unmatched LinkedIn custodians...")
    linkedin_list = load_linkedin_names(unmatched_file, linkedin_dir)
    # Find matches
    print(f"\n3. Finding matches (threshold: {args.threshold}%)...")
    matches = find_matches(linkedin_list, nl_index, name_to_original, args.threshold)
    # Deduplicate: matches are sorted best-first, so keeping the first
    # occurrence of each NL file keeps its highest-scoring match.
    seen_nl = set()
    unique_matches = []
    for match in matches:
        if match['nl_file'] not in seen_nl:
            unique_matches.append(match)
            seen_nl.add(match['nl_file'])
    print(f"\n Found {len(unique_matches)} unique matches")
    # Show top matches
    print("\n4. Top Matches:")
    for m in unique_matches[:15]:
        # Separator restored: the two names were printed jammed together
        # (an arrow glyph was lost in a previous encoding mishap).
        print(f" {m['score']:3d}% | '{m['linkedin_name'][:40]}' -> '{m['nl_name'][:40]}'")
    if len(unique_matches) > 15:
        print(f" ... and {len(unique_matches) - 15} more")
    # Save matches
    output_path = base_dir / args.output
    print(f"\n5. Saving matches to {output_path}...")
    output_data = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'threshold': args.threshold,
        'total_matches': len(unique_matches),
        'matches': unique_matches
    }
    # Ensure the output directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Unmatched LinkedIn custodians: {len(linkedin_list)}")
    print(f"Name matches found: {len(unique_matches)}")
    print(f" - Exact matches: {len([m for m in unique_matches if m['match_type'] == 'exact'])}")
    print(f" - Fuzzy matches: {len([m for m in unique_matches if m['match_type'] == 'fuzzy'])}")
    print(f"Still unmatched: {len(linkedin_list) - len(unique_matches)}")
    return 0
if __name__ == '__main__':
    # Propagate main()'s integer return value to the shell as exit status.
    sys.exit(main())