#!/usr/bin/env python3
|
|
"""
|
|
Use EXA search to find locations for KIEN entries without location data.
|
|
"""
|
|
|
|
import json
import os
import re
import sqlite3
import subprocess
import sys
import time
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List

import yaml
|
|
|
|
# Paths
# Per-entry YAML files produced by the NDE enrichment pipeline.
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
# Local SQLite copy of the GeoNames gazetteer (queried via its `cities` table).
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# Province mapping
# GeoNames NL admin1 code -> two-letter Dutch province code.
# NOTE(review): codes '08' and '12'-'14' are absent — confirm they do not
# occur in the database; unmapped codes fall back to 'XX' in lookups.
ADMIN1_TO_PROVINCE = {
    '01': 'DR', '02': 'FR', '03': 'GE', '04': 'GR',
    '05': 'LI', '06': 'NB', '07': 'NH', '09': 'UT',
    '10': 'ZE', '11': 'ZH', '15': 'OV', '16': 'FL',
}

# Dutch cities to look for
# Lazily populated cache of lowercase city names; see load_dutch_cities().
DUTCH_CITIES = None
|
|
|
def load_dutch_cities():
    """Load and cache the set of lowercase Dutch city names from GeoNames.

    Returns:
        set[str]: lowercase ``name`` and ``ascii_name`` values for populated
        places (PPL* feature codes) with country code 'NL'. Returns an empty
        set when the GeoNames database file does not exist.
    """
    global DUTCH_CITIES
    # Memoized: the database is only queried on the first call.
    if DUTCH_CITIES is not None:
        return DUTCH_CITIES

    if not GEONAMES_DB.exists():
        DUTCH_CITIES = set()
        return DUTCH_CITIES

    # closing() guarantees the connection is released even if the query
    # raises (the original leaked the handle on an exception).
    with closing(sqlite3.connect(GEONAMES_DB)) as conn:
        cursor = conn.cursor()
        cursor.execute("""
        SELECT DISTINCT LOWER(name), LOWER(ascii_name)
        FROM cities
        WHERE country_code = 'NL'
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        """)

        # Build locally and assign once, so a failure mid-loop cannot
        # publish a half-filled global cache.
        cities = set()
        for name, ascii_name in cursor.fetchall():
            cities.add(name)
            if ascii_name:
                cities.add(ascii_name)

    DUTCH_CITIES = cities
    return DUTCH_CITIES
|
|
|
|
|
|
def lookup_city_in_geonames(city_name: str, country_code: str = 'NL') -> Optional[Dict[str, Any]]:
    """Look up a populated place in the GeoNames database.

    Args:
        city_name: City name, matched case-insensitively against both
            ``name`` and ``ascii_name``.
        country_code: ISO country code restricting the search (default 'NL').

    Returns:
        Dict with geonames_id, name, ascii_name, admin1_code, latitude,
        longitude, population, feature_code plus the derived two-letter
        ``region_code`` ('XX' when the admin1 code is unmapped), for the
        most populous match; None when the database is missing or nothing
        matches.
    """
    if not GEONAMES_DB.exists():
        return None

    # closing() guarantees the connection is released even if the query
    # raises (the original leaked the handle on an exception).
    with closing(sqlite3.connect(GEONAMES_DB)) as conn:
        cursor = conn.cursor()
        cursor.execute("""
        SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
               population, feature_code
        FROM cities
        WHERE country_code = ?
        AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
        AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
        """, (country_code, city_name, city_name))
        row = cursor.fetchone()

    if row is None:
        return None

    # Map column names onto the row instead of hand-indexing each field.
    keys = ('geonames_id', 'name', 'ascii_name', 'admin1_code',
            'latitude', 'longitude', 'population', 'feature_code')
    result = dict(zip(keys, row))
    result['region_code'] = ADMIN1_TO_PROVINCE.get(result['admin1_code'], 'XX')
    return result
|
|
|
|
|
|
def extract_city_from_text(text: str) -> Optional[str]:
    """Extract the first known Dutch city name found in *text*.

    Single words are checked (in order of appearance) against the GeoNames
    city set; multi-word city names are then searched with word boundaries.
    The original used a plain substring test for multi-word names, which
    could match inside unrelated words (e.g. 'den haag' inside
    'gulden haagbeuk'); this version requires word boundaries.

    Returns:
        The matched name title-cased, or None when no city is found.
    """
    cities = load_dutch_cities()

    # Normalize text
    text_lower = text.lower()

    # Single-word candidates, in order of appearance.
    for word in re.findall(r'\b([a-zA-Z\-\']+)\b', text):
        if word.lower() in cities:
            return word.title()

    # Multi-word city names: require word boundaries around the whole name.
    for city in cities:
        if ' ' in city and re.search(r'\b' + re.escape(city) + r'\b', text_lower):
            return city.title()

    return None
|
|
|
|
|
|
def exa_search(query: str) -> Optional[str]:
    """
    Run EXA search via MCP and return results.

    Uses the exa MCP server directly via opencode.

    NOTE(review): currently a stub — *query* is ignored and the function
    always returns None; the real EXA MCP call is not implemented yet.
    """
    # For now, we'll use webfetch to search organization websites
    # In a real implementation, this would call the EXA MCP tool
    return None
|
|
|
|
|
|
def search_organization_location(org_name: str, website: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Search for an organization's location using various methods.

    Args:
        org_name: Organization name to search for.
        website: Optional organization website (currently unused).

    Returns:
        None — this is a stub; actual searches are done via manual EXA
        queries for now.
    """
    # Method 1: Search for organization + "adres" or "contact" or "vestiging".
    # The original built these query strings but never used them (dead local);
    # they are kept here as documentation of the intended EXA queries:
    #   f'"{org_name}" adres Nederland'
    #   f'"{org_name}" vestiging'
    #   f'"{org_name}" contact locatie'
    return None
|
|
|
|
|
|
def main():
    """Report KIEN-sourced entries (17*/18* files) that lack location data.

    Scans the enriched-entry YAML files, keeps entries whose provenance
    sources include 'kien', and prints those that have neither a
    ``locations`` value nor a current GHCID, along with their name and
    website for follow-up searching.
    """
    remaining = []

    for pattern in ['17*.yaml', '18*.yaml']:
        for entry_path in ENTRIES_DIR.glob(pattern):
            # Explicit encoding: entry files are UTF-8 YAML.
            with open(entry_path, 'r', encoding='utf-8') as f:
                entry = yaml.safe_load(f)

            if not entry:
                continue

            # Only KIEN-sourced entries are of interest.
            sources = entry.get('provenance', {}).get('sources', {})
            if 'kien' not in sources:
                continue

            has_location = bool(entry.get('locations'))
            has_ghcid = bool(entry.get('ghcid', {}).get('ghcid_current'))
            if has_location or has_ghcid:
                continue

            original = entry.get('original_entry', {})
            remaining.append({
                'path': entry_path,
                'entry_index': entry.get('entry_index'),
                'org_name': original.get('organisatie', 'Unknown'),
                # Prefer enriched contact data, fall back to the raw KIEN field.
                'website': entry.get('contact', {}).get('website') or original.get('webadres_organisatie'),
            })

    print(f"Found {len(remaining)} entries without locations")
    for r in remaining:
        print(f" {r['entry_index']}: {r['org_name']}")


if __name__ == '__main__':
    main()
|