glam/scripts/exa_search_kien_locations.py
2025-12-05 15:30:23 +01:00

183 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
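Find KIEN entries that have no location data, as candidates for EXA location search.

The EXA search helpers in this file are placeholders that always return None;
running the script only lists the KIEN entries that lack both a location and a
current GHCID.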
"""
import json
import os
import re
import sqlite3
import subprocess
import sys
import time
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
# Input locations: the directory of per-entry YAML files and the GeoNames
# SQLite database used for city lookups.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# Translates the admin1_code column of the GeoNames database into a
# two-letter province code.
ADMIN1_TO_PROVINCE = {
    '01': 'DR',
    '02': 'FR',
    '03': 'GE',
    '04': 'GR',
    '05': 'LI',
    '06': 'NB',
    '07': 'NH',
    '09': 'UT',
    '10': 'ZE',
    '11': 'ZH',
    '15': 'OV',
    '16': 'FL',
}

# Cache of lower-cased Dutch city names. Stays None until
# load_dutch_cities() fills it on first use.
DUTCH_CITIES = None
def load_dutch_cities():
    """Return the set of lower-cased Dutch place names from the GeoNames database.

    Both the ``name`` and ``ascii_name`` columns of the ``cities`` table
    contribute entries; NULL or empty values are skipped. After the first
    successful call the result is cached in the module-level ``DUTCH_CITIES``
    and returned directly on later calls.

    Returns:
        A set of lower-cased names. The set is empty (and cached as such)
        when the GeoNames database file does not exist.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried. Nothing
            is cached in that case, so a later call retries.
    """
    global DUTCH_CITIES
    if DUTCH_CITIES is not None:
        return DUTCH_CITIES

    if not GEONAMES_DB.exists():
        DUTCH_CITIES = set()
        return DUTCH_CITIES

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        rows = conn.execute("""
            SELECT DISTINCT LOWER(name), LOWER(ascii_name)
            FROM cities
            WHERE country_code = 'NL'
            AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        """).fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # LOWER(NULL) yields NULL, so either column may be None; keep only
    # real, non-empty names.
    DUTCH_CITIES = {name for row in rows for name in row if name}
    return DUTCH_CITIES
def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a city by name in the GeoNames database.

    The match is case-insensitive against either the ``name`` or the
    ``ascii_name`` column and is limited to populated-place feature codes.
    When several rows match, the one with the largest population is used.

    Args:
        city_name: City name to search for.
        country_code: Country code the row must have (defaults to ``'NL'``).

    Returns:
        A dict with the keys ``geonames_id``, ``name``, ``ascii_name``,
        ``admin1_code``, ``latitude``, ``longitude``, ``population``,
        ``feature_code`` and ``region_code`` (the province code derived from
        ``admin1_code`` via ``ADMIN1_TO_PROVINCE``, or ``'XX'`` when the code
        is not in that mapping). Returns None when ``city_name`` is empty,
        the database file does not exist, or no row matches.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # An empty name can never match a row, so skip the database round trip.
    if not city_name or not GEONAMES_DB.exists():
        return None

    conn = sqlite3.connect(GEONAMES_DB)
    try:
        row = conn.execute("""
            SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
            population, feature_code
            FROM cities
            WHERE country_code = ?
            AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
            AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, city_name, city_name)).fetchone()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    if row is None:
        return None

    # Column order mirrors the SELECT list above.
    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population', 'feature_code',
    )
    result = dict(zip(columns, row))
    result['region_code'] = ADMIN1_TO_PROVINCE.get(result['admin1_code'], 'XX')
    return result
def extract_city_from_text(text: str) -> Optional[str]:
    """Return the first Dutch city name found in ``text``, title-cased.

    Two passes are made over the text:

    1. Each single word (ASCII letters, hyphens and apostrophes) is checked,
       in text order, against the known city names.
    2. If no single word matched, multi-word city names are searched as
       whole-word phrases. The phrase that starts earliest in the text wins;
       if two start at the same position, the longer one wins.

    Args:
        text: Free text to scan. Empty or None input yields None.

    Returns:
        The matched city name converted with ``str.title()``, or None when
        nothing matches.
    """
    if not text:
        return None

    cities = load_dutch_cities()
    text_lower = text.lower()

    # Pass 1: single-word names, in the order they appear in the text.
    for word in re.findall(r'\b([a-zA-Z\-\']+)\b', text):
        if word.lower() in cities:
            return word.title()

    # Pass 2: multi-word names. A plain substring test would match inside
    # other words, and set iteration order is arbitrary, so require word
    # boundaries and rank candidates by (start offset, longest first).
    best = None
    for city in cities:
        if ' ' not in city or city not in text_lower:
            continue
        match = re.search(r'(?<!\w)' + re.escape(city) + r'(?!\w)', text_lower)
        if match is None:
            continue
        candidate = (match.start(), -len(city), city)
        if best is None or candidate < best:
            best = candidate

    return best[2].title() if best is not None else None
def exa_search(query: str) -> Optional[str]:
    """Placeholder for an EXA search issued through the exa MCP server.

    No search is carried out yet: ``query`` is ignored and the function
    always returns None.
    """
    # Stub. The interim plan is to search organization websites with
    # webfetch; a real implementation would call the EXA MCP tool here.
    return None
def search_organization_location(org_name: str, website: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Placeholder for looking up where an organization is located.

    The query strings that a search would use are assembled from
    ``org_name``, but nothing is executed with them and ``website`` is not
    used. The function always returns None.
    """
    # Planned approach: query for the organization name combined with
    # "adres", "vestiging" or "contact".
    planned_queries = [
        f'"{org_name}" adres Nederland',
        f'"{org_name}" vestiging',
        f'"{org_name}" contact locatie',
    ]
    # Not wired up yet; the actual search is done through manual EXA queries.
    return None
def _as_mapping(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def main():
    """Report KIEN entries that have neither a location nor a current GHCID.

    Scans the ``17*.yaml`` and ``18*.yaml`` files in ``ENTRIES_DIR`` (in
    sorted order within each pattern), keeps the entries whose
    ``provenance.sources`` contains ``kien`` and that lack both ``locations``
    and ``ghcid.ghcid_current``, and prints a count followed by one line per
    entry with its index and organization name.

    Files whose top-level YAML value is not a mapping are skipped, and
    sections that are present but null or not a mapping are treated as empty.
    """
    remaining = []
    for pattern in ['17*.yaml', '18*.yaml']:
        # Sort so the report order does not depend on the filesystem.
        for entry_path in sorted(ENTRIES_DIR.glob(pattern)):
            # Read as UTF-8 explicitly instead of the platform default.
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)
            if not isinstance(entry, dict):
                continue

            # A key may be present with a null value, so normalise every
            # nested section before calling .get() on it.
            sources = _as_mapping(entry.get('provenance')).get('sources') or {}
            if 'kien' not in sources:
                continue

            has_location = bool(entry.get('locations'))
            has_ghcid = bool(_as_mapping(entry.get('ghcid')).get('ghcid_current'))
            if has_location or has_ghcid:
                continue

            original = _as_mapping(entry.get('original_entry'))
            contact = _as_mapping(entry.get('contact'))
            remaining.append({
                'path': entry_path,
                'entry_index': entry.get('entry_index'),
                'org_name': original.get('organisatie', 'Unknown'),
                'website': contact.get('website') or original.get('webadres_organisatie'),
            })

    print(f"Found {len(remaining)} entries without locations")
    for r in remaining:
        print(f" {r['entry_index']}: {r['org_name']}")
# Run the report only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()