#!/usr/bin/env python3
"""
Find KIEN entries that still have no location data.

Running the script lists every KIEN-sourced entry file that has neither a
``locations`` value nor a current GHCID.  The module also offers helpers for
matching Dutch place names against a local GeoNames SQLite database.

The EXA-based search functions are placeholders: they currently always
return ``None``.
"""

import json
import os
import re
import sqlite3
import subprocess
import sys
import time
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Union

# Paths
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# Province mapping: GeoNames admin1_code -> province abbreviation
ADMIN1_TO_PROVINCE = {
    '01': 'DR',
    '02': 'FR',
    '03': 'GE',
    '04': 'GR',
    '05': 'LI',
    '06': 'NB',
    '07': 'NH',
    '09': 'UT',
    '10': 'ZE',
    '11': 'ZH',
    '15': 'OV',
    '16': 'FL',
}

# SQL fragment shared by every query so both helpers filter on the same
# GeoNames feature codes.
_PLACE_FILTER = (
    "feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', "
    "'PPLC', 'PPLS', 'PPLG')"
)

# Column order of the SELECT in lookup_city_in_geonames().
_CITY_COLUMNS = (
    'geonames_id',
    'name',
    'ascii_name',
    'admin1_code',
    'latitude',
    'longitude',
    'population',
    'feature_code',
)

# A token is one or more Unicode letters, optionally joined by "-" or "'".
# Unicode-aware so that names containing diacritics are not split in half.
_WORD_RE = re.compile(r"\b[^\W\d_]+(?:[-'][^\W\d_]+)*\b")

# Dutch cities to look for (lazily filled cache, see load_dutch_cities)
DUTCH_CITIES = None


def _resolve_db_path(db_path: Union[str, Path, None]) -> Path:
    """Return *db_path* as a ``Path``, falling back to ``GEONAMES_DB``."""
    return GEONAMES_DB if db_path is None else Path(db_path)


def load_dutch_cities(db_path: Union[str, Path, None] = None) -> Set[str]:
    """Load lower-cased Dutch city names from the GeoNames database.

    Args:
        db_path: Optional database file to read instead of ``GEONAMES_DB``.
            When given, the module-level cache is neither consulted nor
            updated.

    Returns:
        A set holding the lower-cased ``name`` and ``ascii_name`` of every
        matching row with ``country_code = 'NL'``.  The set is empty when
        the database file does not exist.
    """
    global DUTCH_CITIES

    use_cache = db_path is None
    if use_cache and DUTCH_CITIES is not None:
        return DUTCH_CITIES

    names: Set[str] = set()
    path = _resolve_db_path(db_path)
    if path.exists():
        # closing() guarantees the handle is released even if the query
        # raises; a bare sqlite3 connection context manager would not close.
        with closing(sqlite3.connect(str(path))) as conn:
            rows = conn.execute(f"""
                SELECT DISTINCT LOWER(name), LOWER(ascii_name)
                FROM cities
                WHERE country_code = 'NL'
                  AND {_PLACE_FILTER}
            """).fetchall()
        for name, ascii_name in rows:
            # Skip NULL/empty values so the set only ever contains strings.
            if name:
                names.add(name)
            if ascii_name:
                names.add(ascii_name)

    if use_cache:
        DUTCH_CITIES = names
    return names


def lookup_city_in_geonames(
    city_name: str,
    country_code: str = 'NL',
    db_path: Union[str, Path, None] = None,
) -> Optional[Dict[str, Any]]:
    """Look up a city in the GeoNames database.

    The comparison is case-insensitive against both ``name`` and
    ``ascii_name``.  When several rows match, the one with the largest
    population is returned.

    Args:
        city_name: Name to search for.
        country_code: Value the row's ``country_code`` must equal.
        db_path: Optional database file to read instead of ``GEONAMES_DB``.

    Returns:
        A dict with the selected columns plus ``region_code`` (taken from
        ``ADMIN1_TO_PROVINCE``, ``'XX'`` when the admin1 code is not mapped),
        or ``None`` when the name is empty, the database file is missing, or
        nothing matches.
    """
    if not city_name:
        return None

    path = _resolve_db_path(db_path)
    if not path.exists():
        return None

    with closing(sqlite3.connect(str(path))) as conn:
        row = conn.execute(f"""
            SELECT geonames_id, name, ascii_name, admin1_code,
                   latitude, longitude, population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
              AND {_PLACE_FILTER}
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, city_name, city_name)).fetchone()

    if row is None:
        return None

    record: Dict[str, Any] = dict(zip(_CITY_COLUMNS, row))
    record['region_code'] = ADMIN1_TO_PROVINCE.get(record['admin1_code'], 'XX')
    return record


def extract_city_from_text(
    text: str,
    cities: Optional[Set[str]] = None,
) -> Optional[str]:
    """Extract the first Dutch city name mentioned in *text*.

    The earliest match in the text wins; when a single-word name and a
    multi-word name start at the same position, the longer one is chosen
    (so "Bergen op Zoom" is preferred over "Bergen").  Multi-word names
    must sit on word boundaries.

    Any token that equals a known name matches, including place names that
    are also ordinary words, so treat the result as a hint.

    Args:
        text: Free text to scan.
        cities: Lower-cased names to match against.  Defaults to
            ``load_dutch_cities()``.

    Returns:
        The matched name passed through ``str.title()``, or ``None``.
    """
    if not text:
        return None

    known = load_dutch_cities() if cities is None else cities
    if not known:
        return None

    lowered = text.lower()

    # Candidates are (start, -length, name) so that min() means
    # "earliest, then longest, then alphabetical" and is deterministic.
    best = None

    for match in _WORD_RE.finditer(lowered):
        token = match.group()
        if token in known:
            best = (match.start(), -len(token), token)
            break  # later single-word hits can never start earlier

    for phrase in known:
        if not phrase or ' ' not in phrase:
            continue
        if phrase not in lowered:  # cheap pre-filter before the regex
            continue
        found = re.search(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', lowered)
        if found is None:
            continue
        candidate = (found.start(), -len(phrase), phrase)
        if best is None or candidate < best:
            best = candidate

    return best[2].title() if best is not None else None


def exa_search(query: str) -> Optional[str]:
    """Placeholder for an EXA search via MCP.

    Not implemented yet: the function ignores *query* and always returns
    ``None``.
    """
    # For now, we'll use webfetch to search organization websites
    # In a real implementation, this would call the EXA MCP tool
    return None


def search_organization_location(
    org_name: str,
    website: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Placeholder for searching an organization's location.

    Not implemented yet: the arguments are ignored and ``None`` is always
    returned.
    """
    # Planned query templates (organization + "adres"/"vestiging"/"contact"):
    #   "<org_name>" adres Nederland
    #   "<org_name>" vestiging
    #   "<org_name>" contact locatie
    # For now, return None - the actual search is done via manual EXA queries
    return None


def _missing_location_record(entry: Any, entry_path: Path) -> Optional[Dict[str, Any]]:
    """Return a summary dict if *entry* is a KIEN entry lacking a location.

    Returns ``None`` for anything that is not a mapping, is not KIEN-sourced,
    or already has ``locations`` or a ``ghcid.ghcid_current`` value.
    """
    if not isinstance(entry, dict):
        return None

    # "or {}" also covers keys that are present but null in the YAML.
    provenance = entry.get('provenance') or {}
    sources = provenance.get('sources') or {}
    if 'kien' not in sources:
        return None

    if entry.get('locations'):
        return None
    if (entry.get('ghcid') or {}).get('ghcid_current'):
        return None

    original = entry.get('original_entry') or {}
    contact = entry.get('contact') or {}
    return {
        'path': entry_path,
        'entry_index': entry.get('entry_index'),
        'org_name': original.get('organisatie', 'Unknown'),
        'website': contact.get('website') or original.get('webadres_organisatie'),
    }


def main(entries_dir: Union[str, Path, None] = None):
    """Find remaining entries without locations and print them.

    Args:
        entries_dir: Optional directory to scan instead of ``ENTRIES_DIR``.
    """
    # Imported here because only this function needs PyYAML; the helpers
    # above stay importable with the standard library alone.
    import yaml

    root = ENTRIES_DIR if entries_dir is None else Path(entries_dir)

    remaining = []
    for pattern in ('17*.yaml', '18*.yaml'):
        # sorted() keeps the report order stable between runs.
        for entry_path in sorted(root.glob(pattern)):
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            record = _missing_location_record(entry, entry_path)
            if record is not None:
                remaining.append(record)

    print(f"Found {len(remaining)} entries without locations")
    for r in remaining:
        print(f" {r['entry_index']}: {r['org_name']}")


if __name__ == '__main__':
    main()