# glam/scripts/extract_kien_locations_from_names.py

#!/usr/bin/env python3
"""
Extract location data from KIEN organization names.

Many KIEN organizations have place names embedded in their names, e.g.:
- "Harddraverijvereniging Venhuizen" → Venhuizen
- "Stichting Kortebaandraverij Hoofddorp" → Hoofddorp
- "Vereniging Gondelvaart Giethoorn" → Giethoorn

This script extracts these locations and geocodes them using GeoNames.
"""

import os
import re
import sqlite3
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple

# Dutch place name patterns - places that commonly appear in org names.
# Keys are searched for as case-insensitive substrings of the organization
# name by extract_place_from_name().
# Format: 'pattern': (city_name, province_code, lat, lon, is_regional)
#   city_name     - place the pattern resolves to (may differ from the pattern,
#                   e.g. an adjective or neighborhood resolving to its city)
#   province_code - two-letter province code, see PROVINCE_CODES below
#   lat, lon      - fallback coordinates in decimal degrees, used when the
#                   GeoNames lookup yields no usable result
#   is_regional   - True means it's a province/region reference (or an
#                   adjective/tradition tied to an area), not a specific city
DUTCH_PLACES = {
    # Specific cities/towns
    'Venhuizen': ('Venhuizen', 'NH', 52.6333, 5.2167, False),
    'Helmond': ('Helmond', 'NB', 51.4833, 5.6500, False),
    'Ravenstein': ('Ravenstein', 'NB', 51.7833, 5.6500, False),
    'Banholt': ('Banholt', 'LI', 50.7833, 5.8833, False),
    'Noorbeek': ('Noorbeek', 'LI', 50.7667, 5.8000, False),
    'Haarzuilens': ('Haarzuilens', 'UT', 52.1167, 4.9833, False),
    'Terschelling': ('Terschelling', 'FR', 53.4000, 5.3500, False),
    'Denekamp': ('Denekamp', 'OV', 52.3833, 7.0000, False),
    'Doesburg': ('Doesburg', 'GE', 52.0167, 6.1333, False),
    'Kerkrade': ('Kerkrade', 'LI', 50.8667, 6.0667, False),
    'Oosterhout': ('Oosterhout', 'NB', 51.6500, 4.8667, False),
    'Margraten': ('Margraten', 'LI', 50.8167, 5.8167, False),
    'Ameland': ('Ameland', 'FR', 53.4500, 5.7500, False),
    'Didam': ('Didam', 'GE', 51.9333, 6.1333, False),
    'Voorschoten': ('Voorschoten', 'ZH', 52.1333, 4.4500, False),
    'Alphen': ('Alphen aan den Rijn', 'ZH', 52.1333, 4.6667, False),
    'Houten': ('Houten', 'UT', 52.0333, 5.1667, False),
    'Drogeham': ('Drogeham', 'FR', 53.1167, 6.0667, False),
    'Goor': ('Goor', 'OV', 52.2333, 6.5833, False),
    'Naarden': ('Naarden', 'NH', 52.2833, 5.1500, False),
    'Warmond': ('Warmond', 'ZH', 52.2000, 4.5000, False),
    'Nootdorp': ('Nootdorp', 'ZH', 52.0500, 4.3833, False),
    'IJmuiden': ('IJmuiden', 'NH', 52.4667, 4.6167, False),
    'Hoofddorp': ('Hoofddorp', 'NH', 52.3000, 4.6833, False),
    'Sittard': ('Sittard', 'LI', 51.0000, 5.8667, False),
    'Brielle': ('Brielle', 'ZH', 51.9000, 4.1667, False),
    'Espelo': ('Espelo', 'OV', 52.3833, 6.3667, False),
    'Alblasserdam': ('Alblasserdam', 'ZH', 51.8667, 4.6667, False),
    'Sinoutskerke': ('Sinoutskerke', 'ZE', 51.5000, 3.7500, False),
    'Cothen': ('Cothen', 'UT', 52.0000, 5.3000, False),
    'Giethoorn': ('Giethoorn', 'OV', 52.7333, 6.0833, False),
    'Scheveningen': ('Den Haag', 'ZH', 52.1000, 4.2667, False),  # Scheveningen → Den Haag
    'Woerden': ('Woerden', 'UT', 52.0833, 4.8833, False),
    'Workum': ('Workum', 'FR', 52.9833, 5.4500, False),
    'Rotterdam': ('Rotterdam', 'ZH', 51.9167, 4.5000, False),
    'Amsterdam': ('Amsterdam', 'NH', 52.3667, 4.9000, False),
    'Rijssen': ('Rijssen', 'OV', 52.3000, 6.5167, False),
    'Vollenhoofse': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Vollenhove': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Groningen': ('Groningen', 'GR', 53.2167, 6.5667, False),
    'Alkmaar': ('Alkmaar', 'NH', 52.6333, 4.7500, False),

    # Regional/provincial references (is_regional=True) - these organizations operate across a region
    'Grunneger': ('Groningen', 'GR', 53.2167, 6.5667, True),  # Groningen dialect
    'Drentse': ('Assen', 'DR', 52.9925, 6.5625, True),  # Drenthe province → capital
    'Drenthe': ('Assen', 'DR', 52.9925, 6.5625, True),
    'Limburgse': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg → capital
    'Limburg': ('Maastricht', 'LI', 50.8514, 5.6910, True),
    'Brabantse': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),  # Noord-Brabant → capital
    'Noord-Brabant': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),
    'Alkmaars': ('Alkmaar', 'NH', 52.6333, 4.7500, True),  # City adjective
    'Hogeland': ('Uithuizen', 'GR', 53.4000, 6.6667, True),  # Het Hogeland municipality
    'Goors': ('Goor', 'OV', 52.2333, 6.5833, True),  # Goor adjective
    'Rotterdamse': ('Rotterdam', 'ZH', 51.9167, 4.5000, True),  # Rotterdam adjective

    # City neighborhoods - mapped to their parent city
    'Floradorp': ('Amsterdam', 'NH', 52.4000, 4.9333, False),  # Amsterdam neighborhood
    'Kralingen': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Rotterdam neighborhood
    'Kralingse': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Kralingen adjective

    # Additional places from KIEN analysis
    'Hellemonds': ('Helmond', 'NB', 51.4833, 5.6500, True),  # Helmond dialect adjective
    'Grolse': ('Groenlo', 'GE', 52.0417, 6.6167, True),  # Groenlo adjective
    'Groenlo': ('Groenlo', 'GE', 52.0417, 6.6167, False),
    'Grou': ('Grou', 'FR', 53.0917, 5.8333, False),  # Frisian village
    'De Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Airborne': ('Oosterbeek', 'GE', 51.9833, 5.8500, True),  # Airborne = Arnhem/Oosterbeek area
    'Oosterbeek': ('Oosterbeek', 'GE', 51.9833, 5.8500, False),
    'Renkum': ('Renkum', 'GE', 51.9667, 5.7500, False),
    'Schinderhannes': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg folklore figure
    'Lanenkaatsen': ('Sint Nicolaasga', 'FR', 52.9000, 5.5333, True),  # Frisian sport

    # Frisian sports/events - mapped to a representative place
    'Skûtsjesilen': ('Sneek', 'FR', 53.0333, 5.6583, True),  # Frisian sailing race
    'Fierljep': ('Winsum', 'FR', 53.2833, 5.5500, True),  # Frisian sport → origin location

    # More Netherlands cities
    'Arnhem': ('Arnhem', 'GE', 51.9833, 5.9167, False),
    'Utrecht': ('Utrecht', 'UT', 52.0908, 5.1222, False),
    'Den Haag': ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    "'s-Gravenhage": ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    'Eindhoven': ('Eindhoven', 'NB', 51.4416, 5.4697, False),
    'Maastricht': ('Maastricht', 'LI', 50.8514, 5.6910, False),
    'Nijmegen': ('Nijmegen', 'GE', 51.8425, 5.8528, False),
    'Leiden': ('Leiden', 'ZH', 52.1601, 4.4970, False),
    'Haarlem': ('Haarlem', 'NH', 52.3874, 4.6462, False),
    'Delft': ('Delft', 'ZH', 52.0116, 4.3571, False),

    # Dam reference (Amsterdam) - multi-word phrase pointing at Dam square
    'op de Dam': ('Amsterdam', 'NH', 52.3730, 4.8932, False),
}

# Province code to full name mapping (the codes used as province_code in
# DUTCH_PLACES and returned by get_region_code())
PROVINCE_CODES = {
    'DR': 'Drenthe',
    'FL': 'Flevoland',
    'FR': 'Friesland',
    'GE': 'Gelderland',
    'GR': 'Groningen',
    'LI': 'Limburg',
    'NB': 'Noord-Brabant',
    'NH': 'Noord-Holland',
    'OV': 'Overijssel',
    'UT': 'Utrecht',
    'ZE': 'Zeeland',
    'ZH': 'Zuid-Holland',
}

# GeoNames database path: a SQLite file opened by lookup_geonames().
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')


def extract_place_from_name(
    org_name: str,
    places: Optional[Dict[str, Tuple[str, str, float, float, bool]]] = None,
) -> Optional[Tuple[str, str, float, float, bool]]:
    """
    Extract a place name from an organization name.

    Every key of the lookup table is searched for as a case-insensitive
    substring of ``org_name``. When several keys match, the longest key wins,
    so a more specific form ('Goors', 'Alkmaars', 'Rotterdamse') is not
    shadowed by a shorter key it contains ('Goor', 'Alkmaar', 'Rotterdam').
    Keys of equal length keep the table's insertion order.

    Note that matching is plain substring matching without word boundaries,
    so a short key can also match inside an unrelated longer word.

    Args:
        org_name: Organization name to scan; empty/None yields None.
        places: Optional lookup table mapping a pattern to
            (city_name, province_code, lat, lon, is_regional).
            Defaults to the module-level DUTCH_PLACES.

    Returns:
        (city_name, province_code, lat, lon, is_regional) or None.
    """
    if not org_name:
        return None
    if places is None:
        places = DUTCH_PLACES

    # Lower-case the name once instead of once per table entry.
    haystack = org_name.lower()
    matches = [place for place in places if place.lower() in haystack]
    if not matches:
        return None

    # max() returns the first maximal element, so ties fall back to table order.
    best_match = max(matches, key=len)
    return tuple(places[best_match])


def lookup_geonames(
    place_name: str,
    country_code: str = 'NL',
    db_path: Optional[Path] = None,
) -> Optional[Dict[str, Any]]:
    """
    Look up a place in the GeoNames database.

    Runs a case-insensitive exact match of ``place_name`` against the
    ``name`` and ``ascii_name`` columns of the ``cities`` table, restricted
    to populated-place feature codes, and returns the most populous hit.

    Args:
        place_name: Place name to look up.
        country_code: Country code the row must carry (default 'NL').
        db_path: SQLite database to query; defaults to the module-level
            GEONAMES_DB.

    Returns:
        A dict with keys 'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population' and 'feature_code', or None
        when the database file does not exist or nothing matches.

    Raises:
        sqlite3.Error: If the database cannot be queried (e.g. the
            ``cities`` table is missing).
    """
    db_file = Path(GEONAMES_DB if db_path is None else db_path)
    if not db_file.exists():
        return None

    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population', 'feature_code',
    )

    conn = sqlite3.connect(str(db_file))
    try:
        # Exact (case-insensitive) name match; highest population wins.
        row = conn.execute("""
            SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
                   population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, place_name, place_name)).fetchone()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

    if row is None:
        return None
    return dict(zip(columns, row))


def get_region_code(admin1_code: str) -> str:
    """
    Translate a GeoNames admin1 code into a two-letter Dutch province code.

    Codes that are not in the table map to the placeholder 'XX'.
    """
    code_pairs = (
        ('01', 'DR'),
        ('02', 'FR'),
        ('03', 'GE'),
        ('04', 'GR'),
        ('05', 'LI'),
        ('06', 'NB'),
        ('07', 'NH'),
        ('09', 'UT'),
        ('10', 'ZE'),
        ('11', 'ZH'),
        ('15', 'OV'),
        ('16', 'FL'),
    )
    province_by_admin1 = dict(code_pairs)
    try:
        return province_by_admin1[admin1_code]
    except KeyError:
        # Unknown (or missing) admin1 code.
        return 'XX'


def process_entry(entry_path: Path, dry_run: bool = True) -> Optional[Dict[str, Any]]:
    """
    Process a single KIEN entry file.

    Reads the YAML entry, tries to derive a location from the organization
    name (``original_entry.organisatie``) and, unless ``dry_run`` is set,
    writes the location, its resolution metadata and a provenance note back
    to the same file.

    Args:
        entry_path: Path to the entry's YAML file.
        dry_run: When True (the default) the file is never modified.

    Returns:
        A dict with keys 'file', 'org_name', 'location' and 'resolution' if
        a location was extracted; None if the file does not hold a mapping,
        already has locations, has no organization name, or no known place
        matches the name.
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load returns None for an empty document; only mappings are entries.
    if not isinstance(entry, dict):
        return None

    # Skip if already has locations
    if entry.get('locations'):
        return None

    # `original_entry: null` yields None instead of a missing key.
    org_name = (entry.get('original_entry') or {}).get('organisatie', '')
    if not org_name or not isinstance(org_name, str):
        return None

    # Try to extract place from name
    place_info = extract_place_from_name(org_name)
    if not place_info:
        return None

    city_name, province_code, lat, lon, is_regional = place_info
    timestamp = datetime.now(timezone.utc).isoformat()

    # Try to look up in GeoNames for better accuracy
    geonames_info = lookup_geonames(city_name)
    if geonames_info:
        geonames_region = get_region_code(geonames_info['admin1_code'])
        # The lookup returns the most populous place with that name, which can
        # be a homonym in another province (e.g. Winsum exists in both FR and
        # GR). If GeoNames names a known province that contradicts the curated
        # table, distrust the hit and fall back to the hardcoded data.
        if geonames_region != 'XX' and geonames_region != province_code:
            geonames_info = None

    if geonames_info:
        # Use GeoNames data
        location = {
            'city': geonames_info['name'],
            'country': 'NL',
            'latitude': geonames_info['latitude'],
            'longitude': geonames_info['longitude'],
        }
        resolution = {
            'method': 'NAME_EXTRACTION_GEONAMES',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'geonames_id': geonames_info['geonames_id'],
            'geonames_name': geonames_info['name'],
            'feature_code': geonames_info['feature_code'],
            'population': geonames_info['population'],
            'admin1_code': geonames_info['admin1_code'],
            'region_code': get_region_code(geonames_info['admin1_code']),
            'extraction_timestamp': timestamp,
        }
    else:
        # Use hardcoded data
        location = {
            'city': city_name,
            'country': 'NL',
            'latitude': lat,
            'longitude': lon,
        }
        resolution = {
            'method': 'NAME_EXTRACTION_HARDCODED',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'region_code': province_code,
            'extraction_timestamp': timestamp,
        }

    if not dry_run:
        # Update the entry
        entry['locations'] = [location]
        entry['location_resolution'] = resolution

        # Add provenance note; tolerate `provenance: null` / `notes: null`.
        provenance = entry.get('provenance')
        if provenance is None:
            provenance = {}
            entry['provenance'] = provenance
        if provenance.get('notes') is None:
            provenance['notes'] = []
        provenance['notes'].append(
            f"Location extracted from organization name '{org_name}' - matched place '{city_name}' ({resolution['method']})"
        )

        # sort_keys=False keeps the entry's existing key order on disk.
        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return {
        'file': entry_path.name,
        'org_name': org_name,
        'location': location,
        'resolution': resolution,
    }


def main():
    """
    Command-line entry point.

    Scans the KIEN entry files (names starting with 17 or 18) in the entries
    directory, extracts a location for every entry that has none yet and
    prints a summary. With ``--dry-run`` nothing is written to disk;
    ``--limit`` caps the number of files considered.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Extract locations from KIEN organization names')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    args = parser.parse_args()

    entries_dir = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')

    # Find KIEN entries (17xx and 18xx range)
    kien_files = sorted([*entries_dir.glob('17*.yaml'), *entries_dir.glob('18*.yaml')])

    if args.limit:
        kien_files = kien_files[:args.limit]

    extracted = []
    skipped_has_location = 0
    skipped_no_match = 0

    for entry_path in kien_files:
        # Peek at the entry so "already has locations" can be counted
        # separately from "no match" (process_entry returns None for both).
        with open(entry_path, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        # An empty or non-mapping YAML document cannot yield a location;
        # count it as unmatched instead of crashing on the membership test.
        if not isinstance(entry, dict):
            skipped_no_match += 1
            continue

        if entry.get('locations'):
            skipped_has_location += 1
            continue

        result = process_entry(entry_path, dry_run=args.dry_run)

        if result:
            extracted.append(result)
            print(f"✓ {result['file']}: {result['org_name']} → {result['location']['city']}")
        else:
            skipped_no_match += 1

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f"  - Entries with locations extracted: {len(extracted)}")
    print(f"  - Entries already had locations: {skipped_has_location}")
    print(f"  - Entries with no place match: {skipped_no_match}")

    if extracted and args.dry_run:
        print("\nExtracted locations:")
        for e in extracted:
            print(f"  {e['org_name']} → {e['location']['city']} ({e['resolution']['method']})")

if __name__ == '__main__':
    main()