#!/usr/bin/env python3
"""
Extract location data from KIEN organization names.

Many KIEN organizations have place names embedded in their names, e.g.:
- "Harddraverijvereniging Venhuizen" → Venhuizen
- "Stichting Kortebaandraverij Hoofddorp" → Hoofddorp
- "Vereniging Gondelvaart Giethoorn" → Giethoorn

This script extracts these locations and geocodes them using GeoNames.
When the GeoNames database is unavailable or has no match, the coordinates
hard-coded in ``DUTCH_PLACES`` are used instead.
"""

import os
import re
import sqlite3
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import yaml

# Dutch place name patterns - places that commonly appear in org names
# Format: 'pattern': (city_name, province_code, lat, lon, is_regional)
# is_regional=True means it's a province/region reference, not a specific city
DUTCH_PLACES = {
    # Specific cities/towns
    'Venhuizen': ('Venhuizen', 'NH', 52.6333, 5.2167, False),
    'Helmond': ('Helmond', 'NB', 51.4833, 5.6500, False),
    'Ravenstein': ('Ravenstein', 'NB', 51.7833, 5.6500, False),
    'Banholt': ('Banholt', 'LI', 50.7833, 5.8833, False),
    'Noorbeek': ('Noorbeek', 'LI', 50.7667, 5.8000, False),
    'Haarzuilens': ('Haarzuilens', 'UT', 52.1167, 4.9833, False),
    'Terschelling': ('Terschelling', 'FR', 53.4000, 5.3500, False),
    'Denekamp': ('Denekamp', 'OV', 52.3833, 7.0000, False),
    'Doesburg': ('Doesburg', 'GE', 52.0167, 6.1333, False),
    'Kerkrade': ('Kerkrade', 'LI', 50.8667, 6.0667, False),
    'Oosterhout': ('Oosterhout', 'NB', 51.6500, 4.8667, False),
    'Margraten': ('Margraten', 'LI', 50.8167, 5.8167, False),
    'Ameland': ('Ameland', 'FR', 53.4500, 5.7500, False),
    'Didam': ('Didam', 'GE', 51.9333, 6.1333, False),
    'Voorschoten': ('Voorschoten', 'ZH', 52.1333, 4.4500, False),
    'Alphen': ('Alphen aan den Rijn', 'ZH', 52.1333, 4.6667, False),
    'Houten': ('Houten', 'UT', 52.0333, 5.1667, False),
    'Drogeham': ('Drogeham', 'FR', 53.1167, 6.0667, False),
    'Goor': ('Goor', 'OV', 52.2333, 6.5833, False),
    'Naarden': ('Naarden', 'NH', 52.2833, 5.1500, False),
    'Warmond': ('Warmond', 'ZH', 52.2000, 4.5000, False),
    'Nootdorp': ('Nootdorp', 'ZH', 52.0500, 4.3833, False),
    'IJmuiden': ('IJmuiden', 'NH', 52.4667, 4.6167, False),
    'Hoofddorp': ('Hoofddorp', 'NH', 52.3000, 4.6833, False),
    'Sittard': ('Sittard', 'LI', 51.0000, 5.8667, False),
    'Brielle': ('Brielle', 'ZH', 51.9000, 4.1667, False),
    'Espelo': ('Espelo', 'OV', 52.3833, 6.3667, False),
    'Alblasserdam': ('Alblasserdam', 'ZH', 51.8667, 4.6667, False),
    'Sinoutskerke': ('Sinoutskerke', 'ZE', 51.5000, 3.7500, False),
    'Cothen': ('Cothen', 'UT', 52.0000, 5.3000, False),
    'Giethoorn': ('Giethoorn', 'OV', 52.7333, 6.0833, False),
    'Scheveningen': ('Den Haag', 'ZH', 52.1000, 4.2667, False),  # Scheveningen → Den Haag
    'Woerden': ('Woerden', 'UT', 52.0833, 4.8833, False),
    'Workum': ('Workum', 'FR', 52.9833, 5.4500, False),
    'Rotterdam': ('Rotterdam', 'ZH', 51.9167, 4.5000, False),
    'Amsterdam': ('Amsterdam', 'NH', 52.3667, 4.9000, False),
    'Rijssen': ('Rijssen', 'OV', 52.3000, 6.5167, False),
    'Vollenhoofse': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Vollenhove': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Groningen': ('Groningen', 'GR', 53.2167, 6.5667, False),
    'Alkmaar': ('Alkmaar', 'NH', 52.6333, 4.7500, False),

    # Regional/provincial references (is_regional=True) - these organizations operate across a region
    'Grunneger': ('Groningen', 'GR', 53.2167, 6.5667, True),  # Groningen dialect
    'Drentse': ('Assen', 'DR', 52.9925, 6.5625, True),  # Drenthe province → capital
    'Drenthe': ('Assen', 'DR', 52.9925, 6.5625, True),
    'Limburgse': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg → capital
    'Limburg': ('Maastricht', 'LI', 50.8514, 5.6910, True),
    'Brabantse': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),  # Noord-Brabant → capital
    'Noord-Brabant': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),
    'Alkmaars': ('Alkmaar', 'NH', 52.6333, 4.7500, True),  # City adjective
    'Hogeland': ('Uithuizen', 'GR', 53.4000, 6.6667, True),  # Het Hogeland municipality
    'Goors': ('Goor', 'OV', 52.2333, 6.5833, True),  # Goor adjective
    'Rotterdamse': ('Rotterdam', 'ZH', 51.9167, 4.5000, True),  # Rotterdam adjective

    # City neighborhoods - map to the parent city
    'Floradorp': ('Amsterdam', 'NH', 52.4000, 4.9333, False),  # Amsterdam neighborhood
    'Kralingen': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Rotterdam neighborhood
    'Kralingse': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Kralingen adjective

    # Additional places from KIEN analysis
    'Hellemonds': ('Helmond', 'NB', 51.4833, 5.6500, True),  # Helmond dialect adjective
    'Grolse': ('Groenlo', 'GE', 52.0417, 6.6167, True),  # Groenlo adjective
    'Groenlo': ('Groenlo', 'GE', 52.0417, 6.6167, False),
    'Grou': ('Grou', 'FR', 53.0917, 5.8333, False),  # Frisian village
    'De Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Airborne': ('Oosterbeek', 'GE', 51.9833, 5.8500, True),  # Airborne = Arnhem/Oosterbeek area
    'Oosterbeek': ('Oosterbeek', 'GE', 51.9833, 5.8500, False),
    'Renkum': ('Renkum', 'GE', 51.9667, 5.7500, False),
    'Schinderhannes': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg folklore figure
    'Lanenkaatsen': ('Sint Nicolaasga', 'FR', 52.9000, 5.5333, True),  # Frisian sport

    # Frisian places
    'Skûtsjesilen': ('Sneek', 'FR', 53.0333, 5.6583, True),  # Frisian sailing race
    'Fierljep': ('Winsum', 'FR', 53.2833, 5.5500, True),  # Frisian sport → origin location

    # More Netherlands cities
    'Arnhem': ('Arnhem', 'GE', 51.9833, 5.9167, False),
    'Utrecht': ('Utrecht', 'UT', 52.0908, 5.1222, False),
    'Den Haag': ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    "'s-Gravenhage": ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    'Eindhoven': ('Eindhoven', 'NB', 51.4416, 5.4697, False),
    'Maastricht': ('Maastricht', 'LI', 50.8514, 5.6910, False),
    'Nijmegen': ('Nijmegen', 'GE', 51.8425, 5.8528, False),
    'Leiden': ('Leiden', 'ZH', 52.1601, 4.4970, False),
    'Haarlem': ('Haarlem', 'NH', 52.3874, 4.6462, False),
    'Delft': ('Delft', 'ZH', 52.0116, 4.3571, False),

    # Dam reference (Amsterdam)
    'op de Dam': ('Amsterdam', 'NH', 52.3730, 4.8932, False),
}

# Province code to full name mapping
PROVINCE_CODES = {
    'DR': 'Drenthe',
    'FL': 'Flevoland',
    'FR': 'Friesland',
    'GE': 'Gelderland',
    'GR': 'Groningen',
    'LI': 'Limburg',
    'NB': 'Noord-Brabant',
    'NH': 'Noord-Holland',
    'OV': 'Overijssel',
    'UT': 'Utrecht',
    'ZE': 'Zeeland',
    'ZH': 'Zuid-Holland',
}

# GeoNames database path
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')

# Default directory holding the enriched entry YAML files (override with --entries-dir)
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')


def extract_place_from_name(org_name: str) -> Optional[Tuple[str, str, float, float, bool]]:
    """
    Extract a place name from an organization name.

    Every key of ``DUTCH_PLACES`` is searched for as a case-insensitive
    substring of ``org_name``. Longer keys are tried first so that a more
    specific pattern (e.g. 'Rotterdamse', 'Alkmaars', 'Goors') is not shadowed
    by a shorter key it contains ('Rotterdam', 'Alkmaar', 'Goor'). Keys of
    equal length keep their ``DUTCH_PLACES`` order.

    Returns (city_name, province_code, lat, lon, is_regional) or None.
    """
    if not org_name:
        return None

    haystack = org_name.lower()

    # sorted() is stable even with reverse=True, so ties keep dict order.
    candidates = sorted(DUTCH_PLACES.items(), key=lambda item: len(item[0]), reverse=True)
    for place, place_info in candidates:
        if place.lower() in haystack:
            return place_info

    return None


def lookup_geonames(
    place_name: str,
    country_code: str = 'NL',
    region_code: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """
    Look up a place in the GeoNames database.

    Args:
        place_name: Name matched case-insensitively against the ``name`` and
            ``ascii_name`` columns of the ``cities`` table.
        country_code: Value required in the ``country_code`` column.
        region_code: Optional province code as produced by
            ``get_region_code``. When given, the most populous candidate in
            that province is preferred; if no candidate lies in that province
            the most populous candidate overall is returned.

    Returns:
        A dict with the keys geonames_id, name, ascii_name, admin1_code,
        latitude, longitude, population and feature_code, or None when the
        database file does not exist or nothing matches.
    """
    if not GEONAMES_DB.exists():
        return None

    # closing() guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(GEONAMES_DB)) as conn:
        rows = conn.execute("""
            SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
                   population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
        """, (country_code, place_name, place_name)).fetchall()

    if not rows:
        return None

    # Default: most populous match.
    row = rows[0]
    if region_code is not None:
        # Several places can share a name; prefer the one in the expected province.
        row = next((r for r in rows if get_region_code(r[3]) == region_code), row)

    return {
        'geonames_id': row[0],
        'name': row[1],
        'ascii_name': row[2],
        'admin1_code': row[3],
        'latitude': row[4],
        'longitude': row[5],
        'population': row[6],
        'feature_code': row[7],
    }


def get_region_code(admin1_code: str) -> str:
    """
    Convert GeoNames admin1 code to Dutch province code.

    Returns 'XX' when the admin1 code is not in the mapping.
    """
    admin1_to_province = {
        '01': 'DR',
        '02': 'FR',
        '03': 'GE',
        '04': 'GR',
        '05': 'LI',
        '06': 'NB',
        '07': 'NH',
        '09': 'UT',
        '10': 'ZE',
        '11': 'ZH',
        '15': 'OV',
        '16': 'FL',
    }
    return admin1_to_province.get(admin1_code, 'XX')


def _load_entry(entry_path: Path) -> Dict[str, Any]:
    """Load a YAML entry file; empty or non-mapping documents yield ``{}``."""
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    return entry if isinstance(entry, dict) else {}


def process_entry(entry_path: Path, dry_run: bool = True) -> Optional[Dict[str, Any]]:
    """
    Process a single KIEN entry file.

    The organization name is read from ``original_entry.organisatie``. If a
    known place is found in it, a location and a resolution record are built
    from GeoNames when available, otherwise from the hard-coded data. Unless
    ``dry_run`` is true, the entry file is rewritten with ``locations``,
    ``location_resolution`` and an extra provenance note.

    Args:
        entry_path: Path of the YAML entry file.
        dry_run: When true (the default) the file is not modified.

    Returns:
        A dict with the keys file, org_name, location and resolution if a
        location was extracted; None if the entry already has locations, has
        no organization name, or no known place matches.
    """
    entry = _load_entry(entry_path)

    # Skip if already has locations
    if entry.get('locations'):
        return None

    # 'original_entry' may be absent or null in the YAML.
    original_entry = entry.get('original_entry') or {}
    org_name = original_entry.get('organisatie', '') if isinstance(original_entry, dict) else ''
    if not org_name or not isinstance(org_name, str):
        return None

    # Try to extract place from name
    place_info = extract_place_from_name(org_name)
    if place_info is None:
        return None

    city_name, province_code, lat, lon, is_regional = place_info
    timestamp = datetime.now(timezone.utc).isoformat()

    # Try to look up in GeoNames for better accuracy; the expected province
    # disambiguates place names that occur more than once in the country.
    geonames_info = lookup_geonames(city_name, region_code=province_code)

    if geonames_info:
        # Use GeoNames data
        location = {
            'city': geonames_info['name'],
            'country': 'NL',
            'latitude': geonames_info['latitude'],
            'longitude': geonames_info['longitude'],
        }
        resolution = {
            'method': 'NAME_EXTRACTION_GEONAMES',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'geonames_id': geonames_info['geonames_id'],
            'geonames_name': geonames_info['name'],
            'feature_code': geonames_info['feature_code'],
            'population': geonames_info['population'],
            'admin1_code': geonames_info['admin1_code'],
            'region_code': get_region_code(geonames_info['admin1_code']),
            'extraction_timestamp': timestamp,
        }
    else:
        # Use hardcoded data
        location = {
            'city': city_name,
            'country': 'NL',
            'latitude': lat,
            'longitude': lon,
        }
        resolution = {
            'method': 'NAME_EXTRACTION_HARDCODED',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'region_code': province_code,
            'extraction_timestamp': timestamp,
        }

    if not dry_run:
        # Update the entry
        entry['locations'] = [location]
        entry['location_resolution'] = resolution

        # Add provenance note; tolerate a missing or null 'provenance'/'notes'.
        provenance = entry.get('provenance')
        if provenance is None:
            provenance = entry['provenance'] = {}
        if provenance.get('notes') is None:
            provenance['notes'] = []
        provenance['notes'].append(
            f"Location extracted from organization name '{org_name}' - matched place '{city_name}' ({resolution['method']})"
        )

        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return {
        'file': entry_path.name,
        'org_name': org_name,
        'location': location,
        'resolution': resolution,
    }


def main():
    """
    Command-line entry point.

    Scans the entries directory for KIEN entry files (names starting with 17
    or 18), extracts a location for each entry that has none yet, and prints
    a summary. With --dry-run no file is modified.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Extract locations from KIEN organization names')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser.add_argument('--entries-dir', type=Path, default=ENTRIES_DIR,
                        help='Directory containing the entry YAML files (default: %(default)s)')
    args = parser.parse_args()

    entries_dir = args.entries_dir

    # Find KIEN entries (17xx and 18xx range)
    kien_files = sorted([*entries_dir.glob('17*.yaml'), *entries_dir.glob('18*.yaml')])

    if args.limit:
        kien_files = kien_files[:args.limit]

    extracted = []
    skipped_has_location = 0
    skipped_no_match = 0

    for entry_path in kien_files:
        # Check if already has locations. process_entry repeats this check but
        # returns None for both skip reasons, so it is done here to count them
        # separately.
        if _load_entry(entry_path).get('locations'):
            skipped_has_location += 1
            continue

        result = process_entry(entry_path, dry_run=args.dry_run)
        if result:
            extracted.append(result)
            print(f"✓ {result['file']}: {result['org_name']} → {result['location']['city']}")
        else:
            skipped_no_match += 1

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f"  - Entries with locations extracted: {len(extracted)}")
    print(f"  - Entries already had locations: {skipped_has_location}")
    print(f"  - Entries with no place match: {skipped_no_match}")

    if extracted and args.dry_run:
        print("\nExtracted locations:")
        for e in extracted:
            print(f"  {e['org_name']} → {e['location']['city']} ({e['resolution']['method']})")


if __name__ == '__main__':
    main()