glam/scripts/enrich_xxx_via_web_search.py
2025-12-17 11:58:40 +01:00

297 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Enrich NL-XX-XXX files with correct location data via web search.
The LinkedIn HTML extraction method was flawed - it extracted location data from
wrong companies in the HTML. This script uses web search to find correct locations.
Strategy:
1. Read custodian name and website from YAML file
2. Search web for "[name] Netherlands location address city"
3. Parse results to extract city/region
4. Update YAML file with correct location
5. Regenerate GHCID based on new location
"""
import os
import re
import yaml
import json
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple
# Directory containing custodian YAML files to be enriched.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GeoNames database for settlement lookup.
# NOTE(review): not referenced anywhere in this script — presumably reserved
# for a future settlement-validation step; confirm before removing.
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")

# Dutch province mapping: lowercase province name (including English and
# Frisian spelling variants) -> 2-letter province code used in GHCIDs.
# Keys are matched by substring in get_region_code(), so the bare 'brabant'
# entry also catches otherwise-unlisted compound spellings.
PROVINCE_MAP = {
    'drenthe': 'DR',
    'friesland': 'FR', 'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB', 'north brabant': 'NB', 'nordbrabant': 'NB', 'brabant': 'NB',
    'noord-holland': 'NH', 'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH', 'south holland': 'ZH',
    'flevoland': 'FL',
}
# Dutch city to 3-letter code mapping (common cities).
# Keys are lowercase city names (with English/official spelling variants for
# Den Haag).  Cities not listed here get a code generated from their first
# three letters in get_city_code().
CITY_CODES = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA', 'the hague': 'DHA', "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'groningen': 'GRO',
    'tilburg': 'TIL',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'haarlemmermeer': 'HMM',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'zoetermeer': 'ZOE',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'venlo': 'VEN',
    'leeuwarden': 'LEE',
    'heerlen': 'HEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'schiedam': 'SCH',
    'weert': 'WEE',
    'duivendrecht': 'DUI',
    'noordwijk': 'NOO',
}
def get_city_code(city: str) -> str:
    """Return the 3-letter GHCID code for *city*.

    Known cities come from the CITY_CODES table.  Anything else falls back to
    the first three alphabetic (a-z) characters of the lowercased name,
    upper-cased, right-padded with 'X' when fewer than three letters remain.
    """
    key = city.lower().strip()
    known = CITY_CODES.get(key)
    if known is not None:
        return known
    # Derive a fallback code from the alphabetic characters only.
    letters = re.sub(r'[^a-z]', '', key)
    if len(letters) >= 3:
        return letters[:3].upper()
    return letters.upper().ljust(3, 'X')
def get_region_code(region: str) -> Optional[str]:
    """Map a free-text province/region name to its 2-letter code.

    Returns the code of the first PROVINCE_MAP key that occurs as a
    substring of the lowercased input, or None when nothing matches.
    """
    needle = region.lower().strip()
    return next(
        (code for key, code in PROVINCE_MAP.items() if key in needle),
        None,
    )
def extract_location_from_search_results(results: list) -> Optional[dict]:
    """Extract city and region from Exa search results.

    Scans each result's text+title against a series of Dutch-location
    patterns and returns the first hit.

    Args:
        results: list of dicts with optional 'text' and 'title' string keys.

    Returns:
        {'city': str, 'region_code': str | None, 'source_text': str}
        for the first match, or None when no result matches any pattern.
    """
    # Patterns to match Dutch locations
    patterns = [
        # "City, Netherlands (Province)"
        r'(\w+(?:\s+\w+)?)\s*,\s*Netherlands\s*\((\w+(?:\s+\w+)?)\)',
        # "in City, Province"
        r'in\s+(\w+(?:\s+\w+)?)\s*,\s*(Noord-Holland|Zuid-Holland|Noord-Brabant|Gelderland|Limburg|Overijssel|Friesland|Drenthe|Groningen|Utrecht|Zeeland|Flevoland)',
        # "legal seat in City"
        r'legal\s+seat\s+in\s+(\w+)',
        # "Address: ... City"
        r'Address[:\s]+[^,]+,\s*(\d{4}\s*[A-Z]{2})\s+(\w+)',
        # Dutch postal code pattern
        r'(\d{4}\s*[A-Z]{2})\s+(\w+(?:\s+\w+)?)\s*,?\s*(?:Netherlands|NL)',
    ]
    # BUGFIX: this check must be case-insensitive, like the searches below.
    # Previously a lowercase postal code ("1015 cj") failed the check and
    # was returned as the city name.
    postal_code_re = re.compile(r'\d{4}\s*[A-Z]{2}', re.IGNORECASE)
    for result in results:
        text = result.get('text', '') + ' ' + result.get('title', '')
        # Try each pattern in priority order; first match wins.
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            groups = match.groups()
            if len(groups) >= 2:
                if postal_code_re.match(groups[0]):
                    # First group is a postal code: city follows, no region.
                    city = groups[1]
                    region = None
                else:
                    city = groups[0]
                    region = groups[1]
            else:
                # Single-group pattern ("legal seat in City").
                city = groups[0]
                region = None
            region_code = get_region_code(region) if region else None
            return {
                'city': city.strip(),
                'region_code': region_code,
                'source_text': text[:200]
            }
    return None
def search_institution_location(name: str, website: Optional[str] = None) -> Optional[dict]:
    """Search web for institution location using Exa.

    Currently a placeholder: it only assembles the search query.  The Exa
    MCP client cannot be imported here, so the actual lookup is driven by
    the MCP tool in the main flow and this function always returns None.
    """
    query = f'"{name}" Netherlands location address city'
    # LinkedIn shortlink domains (lnkd.in) are useless for site: queries.
    if website and 'lnkd.in' not in website:
        # Reduce the URL to its bare domain (drop scheme, www., and path).
        bare = re.sub(r'https?://(www\.)?', '', website)
        domain = bare.split('/')[0]
        query = f'site:{domain} OR "{name}" Netherlands address city location'
    # Use Exa via subprocess (since we can't import the MCP client directly)
    # For now, return None - we'll use the MCP tool directly in the main flow
    return None
def find_xxx_files_needing_enrichment():
    """Find NL-XX-XXX files that need location enrichment.

    Scans CUSTODIAN_DIR for NL-XX-XXX-*.yaml files and returns a list of
    dicts with keys 'file', 'name', 'website', 'slug' and 'content'
    (the parsed YAML).  Empty files are skipped; unreadable ones are
    reported to stdout and skipped.
    """
    found = []
    for path in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                data = yaml.safe_load(handle)
            if not data:
                continue
            # Name and LinkedIn metadata may be absent; default gracefully.
            enrichment = data.get('linkedin_enrichment', {})
            found.append({
                'file': path,
                'name': data.get('custodian_name', {}).get('emic_name', ''),
                'website': enrichment.get('website'),
                'slug': enrichment.get('linkedin_slug', ''),
                'content': data,
            })
        except Exception as exc:
            # Best-effort scan: report and keep going.
            print(f"Error reading {path}: {exc}")
    return found
def update_file_with_location(file_info: dict, city: str, region_code: str, source: str):
    """Update a YAML file with correct location data.

    Overwrites the file's `location` block with the resolved city/region and,
    when the filename matches the NL-XX-XXX-{TYPE}-{ABBREV} pattern, rewrites
    the GHCID, records how the location was resolved, appends a provenance
    note, and writes the YAML back to disk.

    Args:
        file_info: dict from find_xxx_files_needing_enrichment() with keys
            'file' (Path), 'content' (parsed YAML dict) and 'name'.
        city: resolved city name.
        region_code: 2-letter Dutch province code.
        source: description of where the location was found (stored in
            ghcid.location_resolution.source).

    Returns:
        The new GHCID string, or None when the filename did not match the
        expected pattern.  The file is written back in either case.
    """
    f = file_info['file']
    content = file_info['content']
    name = file_info['name']  # NOTE(review): read but never used below — confirm
    # Get city code
    city_code = get_city_code(city)
    # Overwrite the location block wholesale with the resolved values.
    content['location'] = {
        'city': city,
        'region': region_code,
        'country': 'NL'
    }
    # Generate new GHCID
    # Extract type and abbreviation from filename
    filename = f.stem
    # Pattern: NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', filename)
    if match:
        inst_type = match.group(1)
        abbrev_suffix = match.group(2)
        new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev_suffix}"
        # Update GHCID
        if 'ghcid' not in content:
            content['ghcid'] = {}
        old_ghcid = content['ghcid'].get('ghcid_current', filename)
        content['ghcid']['ghcid_current'] = new_ghcid
        # NOTE(review): ghcid_original is overwritten on every run, so a second
        # enrichment pass would record the previous enriched id as "original" —
        # confirm whether re-runs are expected.
        content['ghcid']['ghcid_original'] = old_ghcid
        # Update history
        # NOTE(review): the history list is replaced, not appended to — any
        # earlier entries are discarded; confirm this is intended.
        content['ghcid']['ghcid_history'] = [{
            'ghcid': new_ghcid,
            'ghcid_numeric': content['ghcid'].get('ghcid_numeric'),
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'valid_to': None,
            'reason': f'Location enriched via web search: {city}, {region_code}'
        }]
        # Add location resolution
        content['ghcid']['location_resolution'] = {
            'method': 'WEB_SEARCH',
            'city': city,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'NL',
            'source': source,
            'resolution_date': datetime.now(timezone.utc).isoformat()
        }
    # Add provenance note
    # NOTE(review): indentation reconstructed — the provenance note and the
    # write-back are assumed to run even when the filename pattern does not
    # match (the conditional return below implies the write is unconditional).
    if 'provenance' not in content:
        content['provenance'] = {}
    if 'notes' not in content['provenance']:
        content['provenance']['notes'] = []
    content['provenance']['notes'].append(
        f"Location enriched via web search on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}: {city}, {region_code}"
    )
    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return new_ghcid if match else None
def main():
    """List the NL-XX-XXX custodian files that still need enrichment.

    Prints a summary split by website availability plus a sample of the
    first 20 files that have a usable (non-LinkedIn-shortlink) website.
    """
    print("Finding NL-XX-XXX files needing location enrichment...\n")
    files = find_xxx_files_needing_enrichment()
    print(f"Found {len(files)} files\n")

    def has_valid_website(info):
        # A lnkd.in shortlink is not a usable institution website.
        site = info['website']
        return bool(site) and 'lnkd.in' not in str(site)

    with_website = [info for info in files if has_valid_website(info)]
    without_website = [info for info in files if not has_valid_website(info)]
    print(f"Files with valid website: {len(with_website)}")
    print(f"Files without valid website: {len(without_website)}")
    print("\n--- Sample files with websites (first 20) ---")
    for info in with_website[:20]:
        print(f" {info['name']}")
        print(f" Website: {info['website']}")
        print(f" File: {info['file'].name}")
        print()


if __name__ == "__main__":
    main()