# glam/scripts/apply_linkedin_locations.py
# Snapshot metadata: 2026-01-09 20:35:19 +01:00 — 450 lines, 18 KiB, Python.
#!/usr/bin/env python3
"""
Apply extracted LinkedIn locations to PENDING files.
This script:
1. Loads locations extracted from LinkedIn About pages
2. Matches PENDING files to extracted locations by organization name
3. Generates proper GHCIDs with location data
4. Renames files with correct GHCIDs
Usage:
python scripts/apply_linkedin_locations.py --dry-run
python scripts/apply_linkedin_locations.py --apply
"""
import re
import json
import yaml
import unicodedata
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple, List
from collections import defaultdict
# Paths
# Root directory holding the per-custodian YAML files (PENDING and resolved).
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# JSON file with organization locations extracted from LinkedIn About pages.
LOCATIONS_FILE = Path("/Users/kempersc/apps/glam/data/linkedin_locations.json")
# Archive destination for resolved PENDING files.
# NOTE(review): not referenced in the code visible here — confirm it is used elsewhere.
ARCHIVE_DIR = CUSTODIAN_DIR / "archive" / "pending_resolved_20250109"
# Extended city to province mapping
# Maps lowercase Dutch city names to the 2-letter province code used in GHCIDs.
# Lookups are performed on a lowercased, stripped city name (see generate_ghcid);
# unknown cities fall back to 'XX'.
CITY_TO_PROVINCE = {
    # Noord-Holland
    'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
    'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH', 'purmerend': 'NH',
    'amstelveen': 'NH', 'heemstede': 'NH', 'beverwijk': 'NH', 'velsen': 'NH',
    'castricum': 'NH', 'huizen': 'NH', 'bussum': 'NH', 'naarden': 'NH',
    'weesp': 'NH', 'edam': 'NH', 'volendam': 'NH', 'texel': 'NH',
    'den helder': 'NH', 'schagen': 'NH', 'heerhugowaard': 'NH',
    # Zuid-Holland
    'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', "'s-gravenhage": 'ZH',
    'leiden': 'ZH', 'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH',
    'schiedam': 'ZH', 'zoetermeer': 'ZH', 'vlaardingen': 'ZH', 'spijkenisse': 'ZH',
    'alphen aan den rijn': 'ZH', 'katwijk': 'ZH', 'leidschendam': 'ZH',
    'voorburg': 'ZH', 'rijswijk': 'ZH', 'wassenaar': 'ZH', 'oegstgeest': 'ZH',
    'voorschoten': 'ZH', 'lisse': 'ZH', 'noordwijk': 'ZH', 'sassenheim': 'ZH',
    'gorinchem': 'ZH', 'schoonhoven': 'ZH', 'woerden': 'ZH',
    # Utrecht
    'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT', 'nieuwegein': 'UT',
    'veenendaal': 'UT', 'houten': 'UT', 'soest': 'UT', 'baarn': 'UT',
    'bunnik': 'UT', 'maarssen': 'UT', 'de bilt': 'UT', 'bilthoven': 'UT',
    'doorn': 'UT', 'driebergen': 'UT', 'wijk bij duurstede': 'UT',
    # Gelderland
    'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
    'wageningen': 'GE', 'doetinchem': 'GE', 'zutphen': 'GE', 'harderwijk': 'GE',
    'zevenaar': 'GE', 'tiel': 'GE', 'culemborg': 'GE', 'barneveld': 'GE',
    'elburg': 'GE', 'nunspeet': 'GE', 'putten': 'GE', 'ermelo': 'GE',
    'epe': 'GE', 'hattem': 'GE', 'zaltbommel': 'GE', 'winterswijk': 'GE',
    'oosterbeek': 'GE', 'velp': 'GE', 'rhenen': 'GE', 'buren': 'GE',
    # Noord-Brabant
    'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB', "'s-hertogenbosch": 'NB',
    'den bosch': 'NB', 'helmond': 'NB', 'oss': 'NB', 'roosendaal': 'NB',
    'bergen op zoom': 'NB', 'waalwijk': 'NB', 'uden': 'NB', 'veghel': 'NB',
    'boxtel': 'NB', 'oisterwijk': 'NB', 'vught': 'NB', 'cuijk': 'NB',
    'deurne': 'NB', 'geldrop': 'NB', 'mierlo': 'NB', 'nuenen': 'NB',
    'valkenswaard': 'NB', 'heeze': 'NB', 'best': 'NB', 'son': 'NB',
    'etten-leur': 'NB', 'oosterhout': 'NB', 'dongen': 'NB', 'gilze': 'NB',
    # Limburg
    'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
    'sittard': 'LI', 'geleen': 'LI', 'weert': 'LI', 'kerkrade': 'LI',
    'valkenburg': 'LI', 'vaals': 'LI', 'meerssen': 'LI', 'brunssum': 'LI',
    'landgraaf': 'LI', 'stein': 'LI', 'beek': 'LI', 'eijsden': 'LI',
    'gulpen': 'LI', 'margraten': 'LI', 'simpelveld': 'LI',
    # Overijssel
    'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV', 'hengelo': 'OV',
    'almelo': 'OV', 'kampen': 'OV', 'oldenzaal': 'OV', 'hardenberg': 'OV',
    'rijssen': 'OV', 'holten': 'OV', 'raalte': 'OV', 'ommen': 'OV',
    'dalfsen': 'OV', 'staphorst': 'OV', 'giethoorn': 'OV', 'hasselt': 'OV',
    'steenwijk': 'OV', 'vollenhove': 'OV', 'blokzijl': 'OV',
    # Friesland
    'leeuwarden': 'FR', 'drachten': 'FR', 'sneek': 'FR', 'heerenveen': 'FR',
    'harlingen': 'FR', 'dokkum': 'FR', 'bolsward': 'FR', 'franeker': 'FR',
    'joure': 'FR', 'lemmer': 'FR', 'workum': 'FR', 'makkum': 'FR',
    'hindeloopen': 'FR', 'stavoren': 'FR', 'sloten': 'FR',
    # Drenthe
    'assen': 'DR', 'emmen': 'DR', 'hoogeveen': 'DR', 'meppel': 'DR',
    'coevorden': 'DR', 'borger': 'DR', 'dwingeloo': 'DR', 'westerbork': 'DR',
    # Groningen
    'groningen': 'GR', 'veendam': 'GR', 'winschoten': 'GR', 'hoogezand': 'GR',
    'stadskanaal': 'GR', 'delfzijl': 'GR', 'appingedam': 'GR', 'ter apel': 'GR',
    'leek': 'GR', 'marum': 'GR', 'zuidhorn': 'GR', 'uithuizen': 'GR',
    # Zeeland
    'middelburg': 'ZE', 'vlissingen': 'ZE', 'goes': 'ZE', 'terneuzen': 'ZE',
    'hulst': 'ZE', 'zierikzee': 'ZE', 'veere': 'ZE', 'domburg': 'ZE',
    'sluis': 'ZE', 'yerseke': 'ZE',
    # Flevoland
    'almere': 'FL', 'lelystad': 'FL', 'dronten': 'FL', 'emmeloord': 'FL',
    'urk': 'FL', 'zeewolde': 'FL', 'biddinghuizen': 'FL',
}
# City to 3-letter code (extended)
# Maps lowercase city names to the fixed 3-letter city code used in GHCIDs.
# Cities absent from this table get a code derived from the city name itself
# (see generate_ghcid).
CITY_TO_CODE = {
    'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA', 'the hague': 'DHA',
    "'s-gravenhage": 'DHA', 'utrecht': 'UTR', 'eindhoven': 'EIN', 'tilburg': 'TIL',
    'groningen': 'GRO', 'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ',
    'apeldoorn': 'APE', 'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS',
    'amersfoort': 'AME', 'zaandam': 'ZAA', "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO',
    'zwolle': 'ZWO', 'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
    'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK', 'leeuwarden': 'LEE',
    'hilversum': 'HIL', 'assen': 'ASS', 'middelburg': 'MID', 'hoorn': 'HOO',
    'enkhuizen': 'ENK', 'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN',
    'heerlen': 'HEE', 'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
    'veenendaal': 'VEE', 'harderwijk': 'HAR', 'doetinchem': 'DOE', 'zutphen': 'ZUT',
    'helmond': 'HEL', 'oss': 'OSS', 'roosendaal': 'ROS', 'bergen op zoom': 'BOZ',
    'hengelo': 'HEN', 'almelo': 'ALO', 'kampen': 'KAM', 'sneek': 'SNE',
    'heerenveen': 'HEV', 'drachten': 'DRA', 'emmen': 'EMM', 'hoogeveen': 'HOV',
    'meppel': 'MEP', 'lelystad': 'LEL', 'vlissingen': 'VLI', 'goes': 'GOE',
    'terneuzen': 'TER', 'zoetermeer': 'ZOE', 'spijkenisse': 'SPI', 'purmerend': 'PUR',
}
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of *name* for fuzzy matching.

    Applies NFKD Unicode normalization, lowercases and trims, strips
    punctuation (everything outside word characters and whitespace),
    and collapses whitespace runs to single spaces.
    """
    folded = unicodedata.normalize('NFKD', name).lower().strip()
    stripped = re.sub(r'[^\w\s]', '', folded)
    return ' '.join(stripped.split())
def extract_emic_name_from_pending(filepath: Path) -> Optional[str]:
    """Extract the ``custodian_name.emic_name`` value from a PENDING YAML file.

    Args:
        filepath: Path of the PENDING custodian YAML file.

    Returns:
        The emic name string, or None when the file cannot be read or
        parsed, or when the expected keys are missing or mis-shaped.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # without masking process-control exceptions.
        return None
    # safe_load may return None (empty file) or a non-mapping scalar/list.
    if not isinstance(data, dict):
        return None
    custodian = data.get('custodian_name')
    if isinstance(custodian, dict):
        return custodian.get('emic_name')
    return None
def extract_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from an organization name.

    Filler words (Dutch/English articles, prepositions, and generic
    institution terms) are ignored; a single remaining word yields its
    first four letters, multiple words yield their initials (at most
    five). Falls back to 'XXX' when nothing usable remains.
    """
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des',
        'en', 'of', 'the', 'a', 'an', 'and', 'or', 'for', 'to', 'at', 'by',
        'museum', 'archief', 'bibliotheek', 'stichting', 'vereniging', 'centrum',
    }
    # Replace punctuation with spaces so hyphenated names split into words.
    cleaned = re.sub(r'[^\w\s]', ' ', name)
    significant = [
        word for word in cleaned.split()
        if word.lower() not in skip_words and len(word) > 1
    ]
    if not significant:
        # Everything was filtered out: fall back to the first raw words.
        significant = cleaned.split()[:3]
    if len(significant) == 1:
        abbrev = significant[0][:4].upper()
    else:
        abbrev = ''.join(word[0] for word in significant[:5]).upper()
    return abbrev or 'XXX'
def normalize_country_code(country_raw: str) -> str:
    """Extract a 2-letter ISO-style country code from a raw country field.

    Handles values that are already bare codes ('NL'), region strings that
    end with or contain a code ('ZUID-HOLLAND NL', 'JAWA TENGAH 57141 ID'),
    and full country names ('Netherlands').

    Args:
        country_raw: Raw country text as extracted from a LinkedIn page.

    Returns:
        Uppercase 2-letter code, or 'XX' when none can be determined.
    """
    if not country_raw:
        return 'XX'
    country_upper = country_raw.upper().strip()
    # Already a valid bare 2-letter code.
    if len(country_upper) == 2 and country_upper.isalpha():
        return country_upper
    # Patterns like "ZUID-HOLLAND NL" or "JAWA TENGAH 57141 ID": scan the
    # words from the end for the first 2-letter alphabetic token.
    for part in reversed(country_upper.split()):
        if len(part) == 2 and part.isalpha():
            return part
    # Fallback: full country names (and common variants) to codes.
    country_mappings = {
        'NETHERLANDS': 'NL', 'NEDERLAND': 'NL',
        'FRANCE': 'FR', 'GERMANY': 'DE', 'DEUTSCHLAND': 'DE',
        'BELGIUM': 'BE', 'BELGIË': 'BE', 'BELGIQUE': 'BE',
        'UNITED KINGDOM': 'GB', 'UK': 'GB', 'ENGLAND': 'GB', 'SCOTLAND': 'GB',
        'ITALY': 'IT', 'ITALIA': 'IT',
        'SPAIN': 'ES', 'ESPAÑA': 'ES',
        'PORTUGAL': 'PT',
        'AUSTRIA': 'AT', 'ÖSTERREICH': 'AT',
        'SWITZERLAND': 'CH', 'SCHWEIZ': 'CH', 'SUISSE': 'CH',
        'DENMARK': 'DK', 'DANMARK': 'DK',
        'SWEDEN': 'SE', 'SVERIGE': 'SE',
        'NORWAY': 'NO', 'NORGE': 'NO',
        'FINLAND': 'FI', 'SUOMI': 'FI',
        'INDONESIA': 'ID',
        'UNITED STATES': 'US', 'USA': 'US',
        'CANADA': 'CA',
        'AUSTRALIA': 'AU',
    }
    for name, code in country_mappings.items():
        # Whole-word match only: the previous plain substring test turned
        # e.g. "UKRAINE" into 'GB' via the 'UK' entry.
        if re.search(r'\b' + re.escape(name) + r'\b', country_upper):
            return code
    return 'XX'
def generate_ghcid(name: str, location: Dict, institution_type: str = 'M') -> str:
    """Build a GHCID string of the form ``CC-PP-CCC-T-ABBREV``.

    Combines the 2-letter country code, province code, 3-letter city code,
    single-letter institution type, and an abbreviation derived from the
    organization name. Province and city codes missing from *location*
    fall back to the module lookup tables, keyed on the lowercased city.
    """
    country = normalize_country_code(location.get('country', 'XX'))
    city_key = location.get('city', '').lower().strip()

    # Province: explicit value wins, otherwise table lookup, otherwise 'XX'.
    province = location.get('province_code') or CITY_TO_PROVINCE.get(city_key, 'XX')

    # City code: explicit value wins unless absent or the 'XXX' placeholder.
    city_code = location.get('city_code')
    if not city_code or city_code == 'XXX':
        city_code = CITY_TO_CODE.get(city_key)
    if not city_code:
        # No table entry either: derive a code from the city name itself.
        tokens = city_key.replace('-', ' ').split()
        if len(tokens) == 1:
            city_code = tokens[0][:3].upper()
        else:
            city_code = ''.join(t[0] for t in tokens[:3]).upper()

    abbrev = extract_abbreviation(name)
    return f"{country}-{province}-{city_code}-{institution_type}-{abbrev}"
def match_pending_to_locations(pending_files: List[Path], locations: Dict) -> Dict[Path, Tuple[str, Dict]]:
    """Map each PENDING file to its extracted-location record.

    Matching happens on normalized organization names: an exact match is
    preferred, with a fallback containment check in either direction.
    Files whose emic_name cannot be read are skipped.

    Returns:
        Dict of PENDING file path -> (original organization name, record).
    """
    # Index the location records by normalized organization name.
    by_norm_name = {
        normalize_name(org): (org, record)
        for org, record in locations.items()
    }
    matches: Dict[Path, Tuple[str, Dict]] = {}
    for path in pending_files:
        emic_name = extract_emic_name_from_pending(path)
        if not emic_name:
            continue
        key = normalize_name(emic_name)
        hit = by_norm_name.get(key)
        if hit is None:
            # Fuzzy fallback: accept the first entry where one normalized
            # name contains the other.
            for norm_loc, candidate in by_norm_name.items():
                if key in norm_loc or norm_loc in key:
                    hit = candidate
                    break
        if hit is not None:
            matches[path] = hit
    return matches
def get_institution_type_from_file(filepath: Path) -> str:
    """Map a PENDING file's ``institution_type`` field to its 1-letter code.

    Args:
        filepath: Path of the PENDING custodian YAML file.

    Returns:
        Single-letter type code; defaults to 'M' (museum) when the file is
        unreadable, malformed, or carries an unknown institution type.
    """
    type_map = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'RESEARCH_CENTER': 'R', 'EDUCATION_PROVIDER': 'E', 'OFFICIAL_INSTITUTION': 'O',
        'COLLECTING_SOCIETY': 'S', 'HOLY_SITES': 'H', 'DIGITAL_PLATFORM': 'D',
        'BOTANICAL_ZOO': 'B', 'CORPORATION': 'C', 'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I', 'MIXED': 'X', 'PERSONAL_COLLECTION': 'P',
        'NGO': 'N', 'TASTE_SMELL': 'T', 'UNKNOWN': 'U',
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort default
        # without swallowing SystemExit/KeyboardInterrupt.
        return 'M'
    # safe_load may yield None or a non-mapping value.
    if not isinstance(data, dict):
        return 'M'
    return type_map.get(data.get('institution_type', 'MUSEUM'), 'M')
def apply_location_to_pending(pending_path: Path, location_data: Dict, dry_run: bool = True) -> Optional[Path]:
    """Apply location data to a PENDING file and rename.

    Generates a proper GHCID from the file's emic_name plus the extracted
    location, writes the location and resolution provenance back into the
    YAML in place, then renames the file to ``<GHCID>.yaml``.

    Args:
        pending_path: PENDING YAML file to update.
        location_data: Extracted record; must contain a 'location' dict and
            may carry a 'source_file' provenance entry.
        dry_run: When True, only print what would happen; no file changes.

    Returns:
        The new file path after renaming, or None on dry-run or error.
    """
    try:
        with open(pending_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        emic_name = data.get('custodian_name', {}).get('emic_name', '')
        inst_type = get_institution_type_from_file(pending_path)
        location = location_data['location']
        # Generate new GHCID
        new_ghcid = generate_ghcid(emic_name, location, inst_type)
        # Check for collision with an existing file under the target name
        new_filename = f"{new_ghcid}.yaml"
        new_path = CUSTODIAN_DIR / new_filename
        if new_path.exists() and new_path != pending_path:
            # Collision - disambiguate by appending a slug of the name
            # (non-word runs collapsed to '-', capped at 30 chars).
            name_slug = re.sub(r'[^\w]+', '-', emic_name.lower()).strip('-')[:30]
            new_ghcid = f"{new_ghcid}-{name_slug}"
            new_filename = f"{new_ghcid}.yaml"
            new_path = CUSTODIAN_DIR / new_filename
        # Update data
        data['ghcid_current'] = new_ghcid
        # Add location data if not present
        if 'location' not in data:
            data['location'] = {}
        data['location']['city'] = location.get('city')
        data['location']['country'] = location.get('country')
        if location.get('postal_code'):
            data['location']['postal_code'] = location.get('postal_code')
        if location.get('street'):
            data['location']['street'] = location.get('street')
        # Add resolution provenance (how and when this GHCID was resolved)
        if 'ghcid_resolution' not in data:
            data['ghcid_resolution'] = {}
        data['ghcid_resolution']['method'] = 'linkedin_about_page_extraction'
        # Timezone-aware UTC timestamp in ISO-8601 form.
        data['ghcid_resolution']['resolved_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution']['source_file'] = location_data.get('source_file')
        if dry_run:
            print(f" Would rename: {pending_path.name}")
            print(f" to: {new_filename}")
            return None
        else:
            # Write updated data back in place first, then rename, so the
            # file never exists under the new name without the new content.
            with open(pending_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            # Rename file
            shutil.move(pending_path, new_path)
            print(f" Renamed: {pending_path.name} -> {new_filename}")
            return new_path
    except Exception as e:
        print(f" Error processing {pending_path.name}: {e}")
        return None
def main():
    """CLI entry point: match PENDING files to LinkedIn locations and apply."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    parser.add_argument('--apply', action='store_true', help='Actually apply changes')
    parser.add_argument('--limit', type=int, default=0, help='Limit number of files to process')
    args = parser.parse_args()

    # Require an explicit mode so nothing happens by accident.
    if not args.dry_run and not args.apply:
        print("Please specify --dry-run or --apply")
        return
    dry_run = not args.apply

    # Load the extracted LinkedIn locations.
    print(f"Loading locations from {LOCATIONS_FILE}...")
    with open(LOCATIONS_FILE, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)
    locations = payload.get('organizations', {})
    print(f"Loaded {len(locations)} organization locations")

    # Collect the PENDING custodian files still awaiting a proper GHCID.
    pending_files = list(CUSTODIAN_DIR.glob("*-XX-XXX-PENDING-*.yaml"))
    print(f"Found {len(pending_files)} PENDING files")
    if args.limit:
        pending_files = pending_files[:args.limit]

    print("\nMatching PENDING files to extracted locations...")
    matches = match_pending_to_locations(pending_files, locations)
    print(f"Found {len(matches)} matches")

    # Summarize the matches per country, most frequent first.
    country_counts = defaultdict(int)
    for _path, (_org, record) in matches.items():
        country_counts[record['location'].get('country', 'XX')] += 1
    print("\nMatches by country:")
    for country, count in sorted(country_counts.items(), key=lambda item: -item[1]):
        print(f" {country}: {count}")

    # Apply (or preview) the resolution for every match.
    print(f"\n{'DRY RUN - ' if dry_run else ''}Applying locations...")
    success = 0
    failed = 0
    for path, (_org, record) in matches.items():
        result = apply_location_to_pending(path, record, dry_run=dry_run)
        if dry_run or result:
            success += 1
        else:
            failed += 1
    print(f"\n{'Would resolve' if dry_run else 'Resolved'}: {success}")
    if failed:
        print(f"Failed: {failed}")

    # Files with no matching location stay PENDING.
    remaining = len(pending_files) - len(matches)
    print(f"Remaining PENDING (no location match): {remaining}")


if __name__ == '__main__':
    main()