# glam/scripts/exa_search_kien_locations.py

#!/usr/bin/env python3
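"""
List KIEN entries that still lack location data, as input for EXA searches.

The EXA search helpers in this module are placeholders that return None;
running the script only reports the entries that have neither a location
nor a current GHCID.
"""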

import json
import os
import re
import sqlite3
import subprocess
import sys
import time
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

# Filesystem locations: the directory of enriched entry YAML files and the
# SQLite database holding the GeoNames ``cities`` table.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# GeoNames ``admin1_code`` -> two-letter province code reported as
# ``region_code``. Codes missing from this table are reported as 'XX' by
# lookup_city_in_geonames().
ADMIN1_TO_PROVINCE = {
    '01': 'DR',
    '02': 'FR',
    '03': 'GE',
    '04': 'GR',
    '05': 'LI',
    '06': 'NB',
    '07': 'NH',
    '09': 'UT',
    '10': 'ZE',
    '11': 'ZH',
    '15': 'OV',
    '16': 'FL',
}

# Cache of lower-cased Dutch place names; stays None until
# load_dutch_cities() fills it on first use.
DUTCH_CITIES = None

def load_dutch_cities():
    """Return the set of lower-cased Dutch place names from GeoNames.

    The result is cached in the module-level ``DUTCH_CITIES`` so the database
    is read at most once per process. Both the ``name`` and the ``ascii_name``
    column of every populated place (``PPL*`` feature codes) with country
    code ``NL`` are included.

    Returns:
        A set of lower-cased names. The set is empty when the GeoNames
        database file does not exist.

    Raises:
        sqlite3.Error: if the database cannot be queried. The cache is left
            unset in that case so a later call can retry.
    """
    global DUTCH_CITIES
    if DUTCH_CITIES is not None:
        return DUTCH_CITIES

    if not GEONAMES_DB.exists():
        DUTCH_CITIES = set()
        return DUTCH_CITIES

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        rows = conn.execute("""
            SELECT DISTINCT name, ascii_name
            FROM cities
            WHERE country_code = 'NL'
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        """).fetchall()
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

    # Lower-case in Python rather than with SQLite's LOWER(): the built-in SQL
    # function only folds ASCII letters, so names with accented capitals would
    # never equal the ``str.lower()`` output callers compare against.
    # NULL or empty columns are skipped.
    DUTCH_CITIES = {
        value.lower()
        for row in rows
        for value in row
        if value
    }
    return DUTCH_CITIES


def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames database.

    The match is case-insensitive against both the ``name`` and the
    ``ascii_name`` column and is restricted to populated places (``PPL*``
    feature codes). When several places share the name, the one with the
    largest population is returned.

    Args:
        city_name: Place name to search for; surrounding whitespace is ignored.
        country_code: Value the ``country_code`` column must equal.

    Returns:
        A dict with the keys ``geonames_id``, ``name``, ``ascii_name``,
        ``admin1_code``, ``latitude``, ``longitude``, ``population``,
        ``feature_code`` and ``region_code`` (``'XX'`` when the admin1 code
        is not in ``ADMIN1_TO_PROVINCE``), or ``None`` when the name is empty,
        the database file is missing, or nothing matches.

    Raises:
        sqlite3.Error: if the database cannot be queried.
    """
    needle = city_name.strip().lower() if city_name else ''
    if not needle or not GEONAMES_DB.exists():
        return None

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        # SQLite's built-in LOWER() only folds ASCII letters, so register a
        # Unicode-aware replacement; otherwise 'ée' would not match 'Ée'.
        conn.create_function(
            'unicode_lower',
            1,
            lambda value: value.lower() if isinstance(value, str) else value,
        )
        row = conn.execute("""
            SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
                   population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (unicode_lower(name) = ? OR unicode_lower(ascii_name) = ?)
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, needle, needle)).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    if row is None:
        return None

    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population', 'feature_code',
    )
    result = dict(zip(columns, row))
    result['region_code'] = ADMIN1_TO_PROVINCE.get(result['admin1_code'], 'XX')
    return result


def extract_city_from_text(text: str) -> Optional[str]:
    """Extract the first Dutch city name mentioned in *text*.

    Candidate names come from ``load_dutch_cities()``. Single-word names are
    matched against the ASCII word tokens of the text; multi-word names are
    matched as whole phrases. The candidate that starts earliest in the text
    wins, and when two candidates start at the same position the longer one
    is preferred (so "Bergen op Zoom" beats "Bergen").

    Args:
        text: Free text to scan; ``None`` or an empty string yields ``None``.

    Returns:
        The matched name in title case, or ``None`` when no known city name
        occurs in the text.
    """
    if not text:
        return None

    cities = load_dutch_cities()
    text_lower = text.lower()

    # Each candidate is (start offset, -length, display name) so that min()
    # picks the earliest match and, on a tie, the longest one.
    candidates = []

    for token in re.finditer(r'\b([a-zA-Z\-\']+)\b', text_lower):
        word = token.group(1)
        if word in cities:
            candidates.append((token.start(), -len(word), word.title()))
            # Tokens are visited left to right, so no later single word can
            # start earlier than this one.
            break

    for city in cities:
        # Cheap substring test first; only then pay for the regex search.
        if ' ' not in city or city not in text_lower:
            continue
        # Require word boundaries so "den ham" does not match "golden hammer".
        phrase = re.search(r'(?<!\w)' + re.escape(city) + r'(?!\w)', text_lower)
        if phrase:
            candidates.append((phrase.start(), -len(city), city.title()))

    if not candidates:
        return None
    return min(candidates)[2]


def exa_search(query: str) -> Optional[str]:
    """
    Placeholder for an EXA search via MCP; currently always returns None.

    The intended implementation would call the EXA MCP tool (via opencode)
    with *query* and return its results. Until then, organization websites
    are searched with webfetch instead, and *query* is ignored here.
    """
    # No search is performed yet: callers must treat None as "no results".
    return None


def search_organization_location(org_name: str, website: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Placeholder location search for an organization; always returns None.

    The function prepares the search phrases it is meant to run (the quoted
    organization name combined with "adres", "vestiging" and "contact"
    keywords) but does not execute them yet; the actual searching is done
    through manual EXA queries. *website* is accepted but not used.
    """
    # Method 1: organization name plus an address / branch / contact keyword.
    address_query = f'"{org_name}" adres Nederland'
    branch_query = f'"{org_name}" vestiging'
    contact_query = f'"{org_name}" contact locatie'
    planned_queries = [address_query, branch_query, contact_query]

    # Nothing is executed with planned_queries yet, so there is no result.
    return None


def _mapping(value):
    """Return *value* when it is a dict, otherwise an empty dict.

    A YAML key that is present but empty loads as ``None``, which would break
    a chained ``.get()`` call; this keeps such lookups safe.
    """
    return value if isinstance(value, dict) else {}


def main():
    """Report KIEN entries that have neither a location nor a current GHCID.

    Scans the ``17*.yaml`` and ``18*.yaml`` files in ``ENTRIES_DIR``, keeps the
    entries whose ``provenance.sources`` contains ``kien`` and that have an
    empty ``locations`` value and no ``ghcid.ghcid_current``, then prints the
    number of such entries followed by one ``entry_index: organisation`` line
    per entry.
    """
    remaining = []

    for pattern in ['17*.yaml', '18*.yaml']:
        # glob() yields paths in arbitrary order; sort for a stable report.
        for entry_path in sorted(ENTRIES_DIR.glob(pattern)):
            # Read as UTF-8 explicitly instead of relying on the platform's
            # locale encoding.
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            # Skip empty documents and documents that are not a mapping.
            if not isinstance(entry, dict):
                continue

            sources = _mapping(entry.get('provenance')).get('sources') or ()
            if 'kien' not in sources:
                continue

            has_location = bool(entry.get('locations'))
            has_ghcid = bool(_mapping(entry.get('ghcid')).get('ghcid_current'))
            if has_location or has_ghcid:
                continue

            original = _mapping(entry.get('original_entry'))
            contact = _mapping(entry.get('contact'))
            remaining.append({
                'path': entry_path,
                'entry_index': entry.get('entry_index'),
                'org_name': original.get('organisatie', 'Unknown'),
                # Prefer the contact website; fall back to the original entry.
                'website': contact.get('website') or original.get('webadres_organisatie'),
            })

    print(f"Found {len(remaining)} entries without locations")
    for r in remaining:
        print(f"  {r['entry_index']}: {r['org_name']}")


# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()