# glam/scripts/enrich_xxx_from_linkedin_html.py
# (listing metadata: 2025-12-17 10:11:56 +01:00 — 486 lines, 15 KiB, Python)
#!/usr/bin/env python3
"""
Enrich NL-XX-XXX custodian files with location data from LinkedIn HTML files.
This script:
1. Parses LinkedIn HTML files to extract company names and headquarters locations
2. Matches custodian files by emic_name
3. Updates custodian files with correct country, region (province), and city codes
4. Regenerates GHCID based on new location data
"""
import os
import re
import json
import yaml
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
from collections import defaultdict
import unicodedata
# Dutch province code mapping.
# Keys are lower-cased province names — Dutch, Frisian ('fryslân') and English
# spellings, with and without hyphens — mapped to two-letter province codes
# (these match the ISO 3166-2:NL subdivision codes).
DUTCH_PROVINCE_CODES = {
    'drenthe': 'DR',
    'flevoland': 'FL',
    'friesland': 'FR',
    'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB',
    'noord brabant': 'NB',
    'north brabant': 'NB',
    'noord-holland': 'NH',
    'noord holland': 'NH',
    'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH',
    'zuid holland': 'ZH',
    'south holland': 'ZH',
}
# City -> province mapping for common Dutch cities.
# Fallback used by get_province_code() when the LinkedIn 'geographicArea'
# field is missing or not recognized as a province name. Keys are
# lower-cased city names, including common spelling variants
# ("den haag" / "the hague" / "'s-gravenhage").
DUTCH_CITY_TO_PROVINCE = {
    'amsterdam': 'NH',
    'rotterdam': 'ZH',
    'den haag': 'ZH',
    'the hague': 'ZH',
    "'s-gravenhage": 'ZH',
    's-gravenhage': 'ZH',
    'utrecht': 'UT',
    'eindhoven': 'NB',
    'tilburg': 'NB',
    'groningen': 'GR',
    'almere': 'FL',
    'breda': 'NB',
    'nijmegen': 'GE',
    'arnhem': 'GE',
    'haarlem': 'NH',
    'enschede': 'OV',
    'maastricht': 'LI',
    'leiden': 'ZH',
    'dordrecht': 'ZH',
    'apeldoorn': 'GE',
    'zwolle': 'OV',
    'amersfoort': 'UT',
    'delft': 'ZH',
    'alkmaar': 'NH',
    'zaandam': 'NH',
    'leeuwarden': 'FR',
    'hilversum': 'NH',
    'deventer': 'OV',
    'middelburg': 'ZE',
    'assen': 'DR',
    'wageningen': 'GE',
    'lelystad': 'FL',
    'venlo': 'LI',
    'heerlen': 'LI',
    'sittard': 'LI',
    'oss': 'NB',
    "'s-hertogenbosch": 'NB',
    's-hertogenbosch': 'NB',
    'den bosch': 'NB',
    'gouda': 'ZH',
    'schiedam': 'ZH',
    'zoetermeer': 'ZH',
    'alphen aan den rijn': 'ZH',
    'emmen': 'DR',
    'kampen': 'OV',
    'harderwijk': 'GE',
    'hoorn': 'NH',
    'purmerend': 'NH',
    'vlaardingen': 'ZH',
    'beverwijk': 'NH',
    'hoofddorp': 'NH',
    'amstelveen': 'NH',
    'diemen': 'NH',
    'nieuwegein': 'UT',
    'zeist': 'UT',
    'veenendaal': 'UT',
    'helmond': 'NB',
    'roosendaal': 'NB',
    'bergen op zoom': 'NB',
    'waalwijk': 'NB',
    'vlissingen': 'ZE',
    'goes': 'ZE',
    'terneuzen': 'ZE',
    'roermond': 'LI',
    'weert': 'LI',
    'kerkrade': 'LI',
    'geleen': 'LI',
    'doetinchem': 'GE',
    'tiel': 'GE',
    'ede': 'GE',
    'barneveld': 'GE',
    'winterswijk': 'GE',
    'almelo': 'OV',
    'hengelo': 'OV',
    'oldenzaal': 'OV',
    'steenwijk': 'OV',
    'meppel': 'DR',
    'hoogeveen': 'DR',
    'coevorden': 'DR',
    'drachten': 'FR',
    'sneek': 'FR',
    'heerenveen': 'FR',
    'harlingen': 'FR',
    'franeker': 'FR',
}
def normalize_name(name: str) -> str:
    """Normalize *name* for fuzzy matching.

    Lower-cases the string, folds diacritics to their ASCII base characters,
    replaces punctuation with spaces, and collapses runs of whitespace.
    Returns "" for empty/None input.
    """
    if not name:
        return ""
    # NFD decomposition splits accented characters into base char + combining
    # mark; dropping category 'Mn' (nonspacing mark) strips the diacritics.
    decomposed = unicodedata.normalize('NFD', name.lower())
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Turn punctuation into spaces, then squeeze whitespace to single spaces.
    cleaned = re.sub(r'[^\w\s]', ' ', stripped)
    return re.sub(r'\s+', ' ', cleaned).strip()
def generate_city_code(city: str) -> str:
    """Generate a 3-letter city code from a city name.

    Rules:
      * empty/missing name    -> "XXX" placeholder
      * single word           -> first three letters, upper-cased
      * leading Dutch article -> article initial + two letters of the next word
                                 (e.g. "Den Haag" -> "DHA")
      * multi-word            -> initials of the non-article words, max three
                                 (e.g. "Bergen op Zoom" -> "BOZ")

    Falls back to the first three letters of the full name if the initials
    rule somehow yields nothing, so a non-empty input never returns "".
    (The original version had an unreachable trailing return and a redundant
    `len(words) >= 2` check inside the article branch; both removed.)
    """
    if not city:
        return "XXX"
    city = city.strip()
    words = city.split()
    # Articles that commonly lead Dutch place names ('s-Gravenhage, Den Haag).
    dutch_articles = {'de', 'het', 'den', "'s", 's'}
    if len(words) == 1:
        # Single word: first 3 letters.
        return city[:3].upper()
    if words[0].lower() in dutch_articles:
        # Article + word: article initial + 2 letters from the main word.
        # len(words) >= 2 is guaranteed here by the single-word check above.
        return (words[0][0] + words[1][:2]).upper()
    # Multi-word: initials of the significant (non-article) words.
    initials = ''.join(w[0] for w in words if w.lower() not in dutch_articles)
    if initials:
        return initials[:3].upper()
    # Defensive fallback: never return an empty code for a non-empty name.
    return city[:3].upper()
def extract_linkedin_locations(html_dir: Path) -> Dict[str, Dict[str, Any]]:
    """
    Extract company names and headquarters locations from LinkedIn HTML files.

    The company name is parsed from the saved page's *filename*; the
    headquarters address is scraped from the JSON blob LinkedIn embeds in
    the page source. Files matching neither filename pattern, or whose
    address fragment cannot be repaired into valid JSON, are skipped.

    Returns a dict mapping normalized company name -> location info with
    keys: original_name, country, city, region, source_file. If two files
    normalize to the same name, the later one overwrites the earlier.
    """
    locations = {}
    html_files = list(html_dir.glob("*.html"))
    print(f"Found {len(html_files)} HTML files to process")
    for html_file in html_files:
        try:
            # errors='ignore': saved pages may contain stray/invalid bytes.
            content = html_file.read_text(encoding='utf-8', errors='ignore')
            # Extract company name from filename.
            # Expected format: "(N) Company Name_ People _ LinkedIn.html"
            filename = html_file.stem
            match = re.match(r'\(\d+\)\s*(.+?)_\s*People\s*_\s*LinkedIn', filename)
            if match:
                company_name = match.group(1).strip()
            else:
                # Retry without the "(N) " tab-counter prefix.
                match = re.match(r'(.+?)_\s*People\s*_\s*LinkedIn', filename)
                if match:
                    company_name = match.group(1).strip()
                else:
                    # Filename follows neither pattern: skip this file.
                    continue
            # Extract headquarter location from JSON embedded in the HTML.
            # Pattern: "headquarter":{"streetAddressOptOut":...,"address":{"country":"NL",...,"city":"Amsterdam",...}}
            hq_pattern = r'"headquarter":\s*\{[^}]*"address":\s*\{([^}]+)\}'
            hq_matches = re.findall(hq_pattern, content)
            if hq_matches:
                # Get the first headquarter (usually the main one).
                address_json = '{' + hq_matches[0] + '}'
                try:
                    # Repair the captured fragment into parseable JSON:
                    # drop LinkedIn-internal "$..." keys (array-valued, then
                    # string-valued), then fix the commas left behind.
                    # NOTE: the order of these substitutions is significant.
                    address_json = re.sub(r'"\$[^"]*":\s*\[[^\]]*\]', '', address_json)
                    address_json = re.sub(r'"\$[^"]*":\s*"[^"]*"', '', address_json)
                    address_json = re.sub(r',\s*,', ',', address_json)
                    address_json = re.sub(r',\s*}', '}', address_json)
                    address_data = json.loads(address_json)
                    country = address_data.get('country', '')
                    city = address_data.get('city', '')
                    # 'geographicArea' holds the province/state name.
                    region = address_data.get('geographicArea', '')
                    if country or city:
                        # Key by normalized name so later lookups tolerate
                        # case/diacritic/punctuation differences.
                        normalized = normalize_name(company_name)
                        locations[normalized] = {
                            'original_name': company_name,
                            'country': country,
                            'city': city,
                            'region': region,
                            'source_file': str(html_file.name),
                        }
                except json.JSONDecodeError:
                    # Fragment still not valid JSON after cleanup: skip it.
                    pass
        except Exception as e:
            # Best-effort batch scrape: report and continue with next file.
            print(f"Error processing {html_file.name}: {e}")
            continue
    return locations
def get_province_code(city: str, region: str, country: str) -> str:
    """Resolve a Dutch province code, or 'XX' when it cannot be determined.

    The region (province) name takes precedence; the city lookup table is
    the fallback. Any country other than 'NL' immediately yields 'XX'.
    """
    if country != 'NL':
        return 'XX'
    # Province name first (most authoritative).
    province = DUTCH_PROVINCE_CODES.get((region or '').strip().lower())
    if province:
        return province
    # Fall back to the city -> province table; 'XX' when unknown.
    return DUTCH_CITY_TO_PROVINCE.get((city or '').strip().lower(), 'XX')
def generate_abbreviation(name: str) -> str:
    """Generate an initials-based abbreviation (max 10 chars) from an institution name.

    Articles, prepositions, generic institution words ("stichting", "museum",
    ...) and pure numbers are skipped; diacritics are folded to ASCII.
    Falls back to the first three characters of the name (upper-cased) when
    no usable initials remain; returns "XXX" for empty input.

    Bug fixed vs. the original: the initial was taken from the *raw* word
    while the skip-word test used the punctuation-stripped word, so a word
    like "(Royal" lost its 'R' (the '(' failed the isalpha test). The
    stripped form is now used for both. A duplicate 'of' in the skip set
    was also removed.
    """
    if not name:
        return "XXX"
    # Dutch + English stop words, plus generic GLAM-institution words.
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
        'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en',
        'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'and', 'or',
        'stichting', 'vereniging', 'foundation', 'museum', 'archief', 'bibliotheek',
    }
    initials = []
    for word in name.split():
        # Strip surrounding punctuation once and reuse for both the
        # skip-word test and the initial letter.
        core = word.strip('.,;:!?()')
        if not core or core.lower() in skip_words or core.isdigit():
            continue
        # Fold diacritics on the first letter to plain ASCII.
        first = unicodedata.normalize('NFD', core[0])
        first = ''.join(c for c in first if unicodedata.category(c) != 'Mn')
        if first.isalpha():
            initials.append(first.upper())
    if not initials:
        # All words were stop words / numbers: fall back to a prefix.
        return name[:3].upper()
    return ''.join(initials[:10])  # Max 10 chars
def update_custodian_file(
    file_path: Path,
    country: str,
    region_code: str,
    city: str,
    city_code: str,
    source_file: str
) -> Tuple[str, str]:
    """Rewrite one custodian YAML file with resolved location data.

    Rebuilds the GHCID as COUNTRY-REGION-CITY-TYPE-ABBREVIATION, carrying
    over any name suffix (parts beyond the fifth) present in the old GHCID,
    updates the 'location' section, and records how the location was
    resolved under ghcid.location_resolution.

    Returns (old_ghcid, new_ghcid).
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = yaml.safe_load(fh)

    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')

    # Institution type: first listed entry, defaulting to 'M'.
    inst_types = data.get('institution_type', ['M'])
    inst_type = inst_types[0] if inst_types else 'M'

    # Abbreviation derives from the institution's own (emic) name.
    abbreviation = generate_abbreviation(
        data.get('custodian_name', {}).get('emic_name', ''))

    new_ghcid = f"{country}-{region_code}-{city_code}-{inst_type}-{abbreviation}"

    # Carry over any trailing name suffix from the previous GHCID.
    suffix_parts = old_ghcid.split('-')[5:]
    if suffix_parts:
        new_ghcid = f"{new_ghcid}-{'-'.join(suffix_parts)}"

    # Update location section.
    location = data.setdefault('location', {})
    location['city'] = city
    location['region'] = region_code
    location['country'] = country

    # Update GHCID and record provenance of the resolution.
    ghcid_block = data.setdefault('ghcid', {})
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['location_resolution'] = {
        'method': 'LINKEDIN_HTML_EXTRACTION',
        'source_file': source_file,
        'city_code': city_code,
        'region_code': region_code,
        'country_code': country,
    }

    # Write back, preserving key order and non-ASCII characters.
    with open(file_path, 'w', encoding='utf-8') as fh:
        yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return old_ghcid, new_ghcid
def _report_results(total, matched, updates, not_matched, non_nl_files):
    """Print a summary of the enrichment run to stdout (truncated lists)."""
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"Total XXX files: {total}")
    print(f"Matched and updated: {matched}")
    print(f"Not matched: {len(not_matched)}")
    print(f"Non-NL institutions found: {len(non_nl_files)}")
    if updates:
        print(f"\n--- UPDATES ({len(updates)}) ---")
        for u in updates[:20]:
            print(f"  {u['emic_name'][:40]:<40} | {u['city']:<15} | {u['region']} | {u['new_ghcid']}")
        if len(updates) > 20:
            print(f"  ... and {len(updates) - 20} more")
    if non_nl_files:
        print(f"\n--- NON-NL INSTITUTIONS ({len(non_nl_files)}) ---")
        for nf in non_nl_files[:10]:
            print(f"  {nf['emic_name'][:40]:<40} | {nf['country']} | {nf['city']}")
        if len(non_nl_files) > 10:
            print(f"  ... and {len(non_nl_files) - 10} more")
    if not_matched:
        print(f"\n--- NOT MATCHED ({len(not_matched)}) ---")
        for nm in not_matched[:20]:
            print(f"  {nm[0]:<50} | {nm[1][:40]}")
        if len(not_matched) > 20:
            print(f"  ... and {len(not_matched) - 20} more")


def main(
    custodian_dir: Optional[Path] = None,
    linkedin_dir: Optional[Path] = None,
):
    """Enrich NL-XX-XXX custodian files with LinkedIn-derived locations.

    Workflow:
      1. Parse LinkedIn HTML exports for company headquarters locations.
      2. Match custodian YAML files (NL-XX-XXX-*.yaml) by normalized emic_name,
         first exactly, then by substring in either direction.
      3. Rewrite matched NL files with country/region/city and a regenerated
         GHCID; collect non-NL matches separately for later handling.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
            Defaults to the original hard-coded project data directory
            (kept for backward compatibility).
        linkedin_dir: Directory containing the saved LinkedIn HTML pages.
            Defaults to <custodian_dir>/person/affiliated/manual.

    Returns:
        Tuple (updates, not_matched, non_nl_files) for caller inspection.
    """
    if custodian_dir is None:
        custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    if linkedin_dir is None:
        linkedin_dir = custodian_dir / 'person' / 'affiliated' / 'manual'

    # Step 1: Extract locations from LinkedIn HTML files.
    print("Extracting locations from LinkedIn HTML files...")
    linkedin_locations = extract_linkedin_locations(linkedin_dir)
    print(f"Extracted locations for {len(linkedin_locations)} companies")

    # Step 2: Find custodian files still carrying the XXX placeholder GHCID.
    xxx_files = list(custodian_dir.glob('NL-XX-XXX-*.yaml'))
    print(f"\nFound {len(xxx_files)} NL-XX-XXX files to process")

    # Step 3: Match and update.
    matched = 0
    not_matched = []   # (filename, reason-or-emic-name) pairs
    updates = []       # records of successfully rewritten files
    non_nl_files = []  # matched but headquartered outside NL
    for file_path in xxx_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            emic_name = data.get('custodian_name', {}).get('emic_name', '')
            if not emic_name:
                not_matched.append((file_path.name, "No emic_name"))
                continue
            normalized = normalize_name(emic_name)
            # Exact match on the normalized name first...
            location = linkedin_locations.get(normalized)
            if not location:
                # ...then fall back to substring matching in either direction.
                for key, loc in linkedin_locations.items():
                    if normalized in key or key in normalized:
                        location = loc
                        break
            if location:
                country = location['country']
                city = location['city']
                region = location.get('region', '')
                source_file = location['source_file']
                if country != 'NL':
                    # Non-Dutch institution - mark for later handling.
                    non_nl_files.append({
                        'file': file_path.name,
                        'emic_name': emic_name,
                        'country': country,
                        'city': city,
                    })
                    continue
                # Resolve province and city codes for the GHCID.
                region_code = get_province_code(city, region, country)
                city_code = generate_city_code(city)
                if region_code == 'XX' and city_code == 'XXX':
                    # Neither province nor city could be resolved: skip.
                    not_matched.append((file_path.name, f"No province/city for {city}"))
                    continue
                # Update the file on disk.
                old_ghcid, new_ghcid = update_custodian_file(
                    file_path, country, region_code, city, city_code, source_file
                )
                updates.append({
                    'file': file_path.name,
                    'emic_name': emic_name,
                    'old_ghcid': old_ghcid,
                    'new_ghcid': new_ghcid,
                    'city': city,
                    'region': region_code,
                })
                matched += 1
            else:
                not_matched.append((file_path.name, emic_name))
        except Exception as e:
            # Best-effort batch: report and keep going with the next file.
            print(f"Error processing {file_path.name}: {e}")

    _report_results(len(xxx_files), matched, updates, not_matched, non_nl_files)
    return updates, not_matched, non_nl_files
# Run the enrichment pipeline when executed as a script.
if __name__ == '__main__':
    main()