glam/scripts/match_pending_to_linkedin.py
2026-01-09 20:35:19 +01:00

381 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Match PENDING files to LinkedIn About pages and extract locations.
This script:
1. Builds a fuzzy lookup of LinkedIn About page organization names
2. Matches PENDING file emic_names to About pages
3. Extracts location data from matched About pages
4. Resolves PENDING files with proper GHCIDs
Usage:
python scripts/match_pending_to_linkedin.py --dry-run
python scripts/match_pending_to_linkedin.py
"""
import re
import json
import yaml
import shutil
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict
# Paths
# Local custodian YAML records; files with "PENDING" in the name still
# need a proper GHCID and are the targets of this script.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Manually saved LinkedIn "About"/"People" HTML pages (external volume).
LINKEDIN_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# NOTE(review): LOCATIONS_FILE is not read or written anywhere in this
# script — presumably shared with a sibling script; verify before removing.
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
# City mappings
# Lowercase Dutch city/town name -> two-letter province code used as the
# second segment of a GHCID (see generate_ghcid). Keys must be lowercase
# because lookups are done on location['city'].lower(). Cities absent
# from this table fall back to 'XX' in generate_ghcid.
# Province codes: NH Noord-Holland, ZH Zuid-Holland, UT Utrecht,
# GE Gelderland, NB Noord-Brabant, LI Limburg, OV Overijssel,
# FR Friesland, DR Drenthe, GR Groningen, ZE Zeeland, FL Flevoland.
CITY_TO_PROVINCE = {
'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
'amstelveen': 'NH', 'heemstede': 'NH', 'bussum': 'NH', 'naarden': 'NH',
'muiden': 'NH', 'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH',
'den helder': 'NH', 'laren': 'NH', 'blaricum': 'NH', 'castricum': 'NH',
'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH',
'hoek van holland': 'ZH', 'maassluis': 'ZH', 'rijswijk': 'ZH',
'wassenaar': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'katwijk': 'ZH',
'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
'doorn': 'UT', 'driebergen': 'UT', 'bunnik': 'UT',
'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE', 'elburg': 'GE',
'winterswijk': 'GE', 'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE',
'borculo': 'GE', 'lochem': 'GE', 'epe': 'GE',
'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'boxtel': 'NB',
'oisterwijk': 'NB', 'vught': 'NB', 'nuenen': 'NB', 'best': 'NB',
'etten-leur': 'NB', 'oosterhout': 'NB',
'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'valkenburg': 'LI',
'thorn': 'LI', 'venray': 'LI',
'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'rijssen': 'OV',
'staphorst': 'OV', 'giethoorn': 'OV', 'steenwijk': 'OV',
'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
'harlingen': 'FR', 'dokkum': 'FR', 'franeker': 'FR', 'joure': 'FR',
'workum': 'FR', 'makkum': 'FR', 'hindeloopen': 'FR',
'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
'coevorden': 'DR', 'borger': 'DR', 'veenhuizen': 'DR',
'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR',
'appingedam': 'GR', 'delfzijl': 'GR',
'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
'zierikzee': 'ZE', 'veere': 'ZE',
'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'urk': 'FL',
}
# Lowercase city name -> preferred three-letter city code for the third
# GHCID segment. Cities not listed here get a code derived in
# generate_ghcid (first 3 letters, or initials for multi-word names).
CITY_TO_CODE = {
'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL', 'groningen': 'GRO',
'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ', 'apeldoorn': 'APE',
'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS', 'amersfoort': 'AME',
'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
'harderwijk': 'HAR', 'zutphen': 'ZUT', 'helmond': 'HEL', 'oss': 'OSS',
'schiedam': 'SCH', 'vlaardingen': 'VLA', 'hoek van holland': 'HVH',
'rijswijk': 'RIJ', 'wassenaar': 'WAS', 'sneek': 'SNE', 'dokkum': 'DOK',
'joure': 'JOU', 'meppel': 'MEP', 'coevorden': 'COE', 'lelystad': 'LEL',
}
def normalize_name(name: str) -> str:
    """Normalize an organization name for fuzzy matching.

    Applies NFKD decomposition, lowercases and strips, removes all
    punctuation (which also drops any combining marks NFKD separated
    out, if they are non-word characters), and collapses internal
    whitespace runs to single spaces.
    """
    decomposed = unicodedata.normalize('NFKD', name).lower().strip()
    without_punct = re.sub(r'[^\w\s]', '', decomposed)
    return ' '.join(without_punct.split())
def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a saved LinkedIn HTML filename.

    Recognized shapes: "(3) Org Name_ About _ LinkedIn..." (browser
    duplicate-download counter prefix) and the plain
    "Org Name_ About _ LinkedIn..." / "..._ People _ LinkedIn..." forms.
    The optional-prefix group is tried first, so a numeric prefix is
    stripped exactly as the original two-pattern cascade did. Returns
    the filename unchanged when nothing matches.
    """
    pattern = r'(?:\(\d+\)\s*)?(.+?)_\s*(About|People)\s*_\s*LinkedIn'
    matched = re.match(pattern, filename)
    return matched.group(1).strip() if matched else filename
def build_linkedin_lookup() -> Dict[str, Path]:
    """Build a lookup of normalized org name -> LinkedIn About page path.

    Scans LINKEDIN_DIR for '*About*LinkedIn.html' files. When two files
    normalize to the same name, the one encountered last wins (same as
    the original loop's overwrite behavior).
    """
    return {
        normalize_name(extract_org_name_from_filename(page.name)): page
        for page in LINKEDIN_DIR.glob('*About*LinkedIn.html')
    }
def extract_location_from_about_page(filepath: Path) -> Optional[Dict]:
    """Extract a location dict from a saved LinkedIn About page.

    Tries two HTML patterns in priority order and parses the first hit
    with parse_address():
      1. an org-locations-module section marked "Primary";
      2. a "Locations (N)" section (no Primary marker).

    Returns parse_address()'s result (may be None if the address text is
    unparseable), or None when the file is unreadable or neither pattern
    matches.
    """
    # Ordered: the "Primary" location is preferred over the generic list.
    patterns = (
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        # Best-effort: unreadable page means no location (the original
        # swallowed this via a broad `except Exception`).
        return None
    for pattern in patterns:
        match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if match:
            return parse_address(_clean_html_fragment(match.group(1)))
    return None


def _clean_html_fragment(fragment: str) -> str:
    """Strip HTML tags and collapse whitespace in an address fragment."""
    text = re.sub(r'<[^>]+>', '', fragment.strip())
    return re.sub(r'\s+', ' ', text).strip()
def parse_address(address_text: str) -> Optional[Dict]:
    """Parse a comma-separated LinkedIn address string.

    Typical input: "Museumstraat 1, Amsterdam, 1071 XX, NL".

    Returns {'raw': original text, 'country': 2-letter code or 'XX',
    'city': str}, or None when there are fewer than two parts or no
    digit-free part that could be a city.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None
    result = {'raw': address_text}
    # Country is the last part, normally a 2-letter code.
    country = parts[-1].upper().strip()
    if len(country) > 2:
        # Handle "Province CC" format: take the last 2-letter token.
        tokens = country.split()
        for t in reversed(tokens):
            if len(t) == 2 and t.isalpha():
                country = t
                break
    # Require alphabetic code, consistent with normalize_country()
    # (the original accepted any 2-char string, e.g. "34").
    result['country'] = country if len(country) == 2 and country.isalpha() else 'XX'
    # City: the last part before the country with no digits — skips
    # postal-code segments such as "1071 XX".
    for part in reversed(parts[:-1]):
        part = part.strip()
        if not re.search(r'\d', part):
            result['city'] = part
            break
    return result if 'city' in result else None
def normalize_country(country_raw: str) -> str:
    """Normalize a raw country string to a 2-letter uppercase code.

    Accepts a bare 2-letter code in any case, or a "Province CC" string
    from which the last 2-letter token is taken. Anything else (empty,
    full country names, numeric codes) yields 'XX'.
    """
    if not country_raw:
        return 'XX'
    code = country_raw.upper().strip()
    if len(code) == 2 and code.isalpha():
        return code
    # "Province CC" form: pick the last 2-letter alphabetic token.
    candidates = [t for t in code.split() if len(t) == 2 and t.isalpha()]
    return candidates[-1] if candidates else 'XX'
def get_institution_type(data: Dict) -> str:
    """Return the one-letter GHCID code for a record's institution_type.

    A missing field defaults to 'MUSEUM'; an unrecognized value falls
    back to the code 'M'.
    """
    codes = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    declared = data.get('institution_type', 'MUSEUM')
    return codes.get(declared, 'M')
def extract_abbreviation(name: str) -> str:
    """Build an uppercase abbreviation from a name's significant words.

    Dutch/English articles, prepositions, and generic words ('museum',
    'stichting') plus single characters are ignored. One significant
    word yields its first 4 letters; otherwise the initials of up to 5
    words are joined. When nothing significant remains, the first 3 raw
    words are used instead.
    """
    stop_words = frozenset({
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'museum', 'stichting',
    })
    cleaned = re.sub(r'[^\w\s]', ' ', name)
    significant = [
        w for w in cleaned.split()
        if len(w) > 1 and w.lower() not in stop_words
    ]
    if not significant:
        significant = cleaned.split()[:3]
    if len(significant) == 1:
        return significant[0][:4].upper()
    return ''.join(word[0] for word in significant[:5]).upper()
def generate_ghcid(name: str, location: Dict, inst_type: str) -> str:
    """Compose a GHCID: COUNTRY-PROVINCE-CITYCODE-TYPE-ABBREV.

    Province comes from CITY_TO_PROVINCE ('XX' when unknown). The city
    code comes from CITY_TO_CODE; unlisted cities get a derived code
    (first 3 letters for one-word names, otherwise the initials of up
    to 3 hyphen/space-separated words).
    """
    country_code = normalize_country(location.get('country', 'XX'))
    city = location.get('city', '').lower().strip()
    province_code = CITY_TO_PROVINCE.get(city, 'XX')
    city_code = CITY_TO_CODE.get(city)
    if not city_code:
        tokens = city.replace('-', ' ').split()
        if len(tokens) == 1:
            city_code = tokens[0][:3].upper()
        else:
            city_code = ''.join(t[0] for t in tokens[:3]).upper()
    segments = [country_code, province_code, city_code, inst_type,
                extract_abbreviation(name)]
    return '-'.join(segments)
def match_pending_to_linkedin(pending_files: List[Path], linkedin_lookup: Dict[str, Path]) -> Dict[Path, Tuple[Path, Dict]]:
    """Match PENDING custodian YAML files to LinkedIn About pages.

    For each file, tries an exact match on the normalized emic_name
    first, then a substring match in either direction (only for names
    longer than 5 characters, to limit false positives). A match is
    recorded only when a location can actually be extracted from the
    About page.

    Returns: pending_path -> (about_page_path, location_dict).
    """
    matches: Dict[Path, Tuple[Path, Dict]] = {}
    for pending_path in pending_files:
        # Best-effort per file: unreadable or malformed YAML is skipped.
        # Narrowed from the original bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        try:
            with open(pending_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            emic_name = data.get('custodian_name', {}).get('emic_name', '')
            if not emic_name:
                continue
            norm_emic = normalize_name(emic_name)
            # Exact match on normalized name.
            if norm_emic in linkedin_lookup:
                about_path = linkedin_lookup[norm_emic]
                location = extract_location_from_about_page(about_path)
                if location:
                    matches[pending_path] = (about_path, location)
                continue
            # Partial match: one name contains the other. Keep scanning
            # candidates until one yields a usable location.
            if len(norm_emic) > 5:  # invariant hoisted out of the loop
                for norm_linkedin, about_path in linkedin_lookup.items():
                    if len(norm_linkedin) > 5 and (
                            norm_emic in norm_linkedin or norm_linkedin in norm_emic):
                        location = extract_location_from_about_page(about_path)
                        if location:
                            matches[pending_path] = (about_path, location)
                            break
        except Exception:
            continue
    return matches
def process_match(pending_path: Path, about_path: Path, location: Dict, dry_run: bool = True) -> Optional[str]:
    """Resolve one matched PENDING file.

    Generates a GHCID from the extracted location, records resolution
    provenance in the YAML, rewrites the file, and renames it to
    <GHCID>.yaml in CUSTODIAN_DIR.

    Returns the new GHCID on success, the sentinel string 'dry_run' in
    preview mode, or None on error.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type(data)
        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
        # Collision handling: first disambiguate with a name slug; if
        # that STILL collides, append a numeric suffix (the original
        # stopped after the slug, so shutil.move could silently
        # overwrite an existing record).
        if new_path.exists() and new_path != pending_path:
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
            base_ghcid = new_ghcid
            counter = 2
            while new_path.exists() and new_path != pending_path:
                new_ghcid = f"{base_ghcid}-{counter}"
                new_path = CUSTODIAN_DIR / f"{new_ghcid}.yaml"
                counter += 1
        if dry_run:
            print(f"[WOULD RESOLVE] {emic_name}")
            print(f" LinkedIn: {about_path.name}")
            print(f" Location: {location.get('city')}, {location.get('country')}")
            print(f" -> {new_ghcid}.yaml")
            return 'dry_run'
        # Record the resolution in the YAML data.
        data['ghcid_current'] = new_ghcid
        loc = data.setdefault('location', {})
        loc['city'] = location.get('city')
        loc['country'] = location.get('country')
        resolution = data.setdefault('ghcid_resolution', {})
        resolution['method'] = 'linkedin_about_page_extraction'
        resolution['resolved_at'] = datetime.now(timezone.utc).isoformat()
        resolution['linkedin_source'] = about_path.name
        # Rewrite in place first, then rename to the resolved GHCID.
        with open(pending_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        shutil.move(pending_path, new_path)
        print(f"[RESOLVED] {emic_name} -> {new_ghcid}.yaml")
        return new_ghcid
    except Exception as e:
        # Best-effort batch processing: report and keep going.
        print(f"[ERROR] {pending_path.name}: {e}")
        return None
def main():
    """CLI entry point: match PENDING files to LinkedIn and resolve them.

    --dry-run previews without writing; --limit N processes only the
    first N PENDING files.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=0)
    args = parser.parse_args()

    print("Building LinkedIn About page lookup...")
    linkedin_lookup = build_linkedin_lookup()
    print(f"Found {len(linkedin_lookup)} About pages")

    print("\nFinding PENDING files...")
    pending_files = list(CUSTODIAN_DIR.glob("*PENDING*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")
    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to LinkedIn About pages...")
    matches = match_pending_to_linkedin(pending_files, linkedin_lookup)
    print(f"Found {len(matches)} matches")

    # Summarize matches per normalized country code.
    country_counts = defaultdict(int)
    for _pending, (_about, location) in matches.items():
        country_counts[normalize_country(location.get('country', 'XX'))] += 1
    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda item: -item[1]):
        print(f" {country}: {count}")

    print(f"\n{'DRY RUN - ' if args.dry_run else ''}Processing matches...")
    outcomes = [
        process_match(pending_path, about_path, location, dry_run=args.dry_run)
        for pending_path, (about_path, location) in matches.items()
    ]
    success = sum(1 for result in outcomes if result)
    failed = len(outcomes) - success
    print(f"\n{'Would resolve' if args.dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")
    print(f"Remaining unmatched: {len(pending_files) - len(matches)}")


if __name__ == '__main__':
    main()