glam/scripts/extract_kien_locations_from_names.py
2025-12-05 15:30:23 +01:00

340 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Extract location data from KIEN organization names.
Many KIEN organizations have place names embedded in their names, e.g.:
- "Harddraverijvereniging Venhuizen" → Venhuizen
- "Stichting Kortebaandraverij Hoofddorp" → Hoofddorp
- "Vereniging Gondelvaart Giethoorn" → Giethoorn
This script extracts these locations and geocodes them using GeoNames.
"""
import os
import re
import sqlite3
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple
# Dutch place-name patterns that commonly appear in KIEN organization names.
# Each key is searched for as a case-insensitive substring of the organization
# name (see extract_place_from_name), so a short key also matches inside a
# longer word.
# Format: 'pattern': (city_name, province_code, lat, lon, is_regional)
#   city_name     - place the pattern resolves to (may differ from the key,
#                   e.g. 'Scheveningen' resolves to 'Den Haag')
#   province_code - two-letter province code (same codes as PROVINCE_CODES keys)
#   lat, lon      - coordinates in decimal degrees, used as a fallback when the
#                   GeoNames lookup returns nothing
#   is_regional   - True means it's a province/region reference, not a specific city
# NOTE(review): is_regional is also True for several adjective, dialect and
# tradition keys below, but not for all of them (e.g. 'Kralingse',
# 'Vollenhoofse') - confirm the intended rule.
DUTCH_PLACES = {
# Specific cities/towns
'Venhuizen': ('Venhuizen', 'NH', 52.6333, 5.2167, False),
'Helmond': ('Helmond', 'NB', 51.4833, 5.6500, False),
'Ravenstein': ('Ravenstein', 'NB', 51.7833, 5.6500, False),
'Banholt': ('Banholt', 'LI', 50.7833, 5.8833, False),
'Noorbeek': ('Noorbeek', 'LI', 50.7667, 5.8000, False),
'Haarzuilens': ('Haarzuilens', 'UT', 52.1167, 4.9833, False),
'Terschelling': ('Terschelling', 'FR', 53.4000, 5.3500, False),
'Denekamp': ('Denekamp', 'OV', 52.3833, 7.0000, False),
'Doesburg': ('Doesburg', 'GE', 52.0167, 6.1333, False),
'Kerkrade': ('Kerkrade', 'LI', 50.8667, 6.0667, False),
'Oosterhout': ('Oosterhout', 'NB', 51.6500, 4.8667, False),
'Margraten': ('Margraten', 'LI', 50.8167, 5.8167, False),
'Ameland': ('Ameland', 'FR', 53.4500, 5.7500, False),
'Didam': ('Didam', 'GE', 51.9333, 6.1333, False),
'Voorschoten': ('Voorschoten', 'ZH', 52.1333, 4.4500, False),
'Alphen': ('Alphen aan den Rijn', 'ZH', 52.1333, 4.6667, False),
'Houten': ('Houten', 'UT', 52.0333, 5.1667, False),
'Drogeham': ('Drogeham', 'FR', 53.1167, 6.0667, False),
'Goor': ('Goor', 'OV', 52.2333, 6.5833, False),
'Naarden': ('Naarden', 'NH', 52.2833, 5.1500, False),
'Warmond': ('Warmond', 'ZH', 52.2000, 4.5000, False),
'Nootdorp': ('Nootdorp', 'ZH', 52.0500, 4.3833, False),
'IJmuiden': ('IJmuiden', 'NH', 52.4667, 4.6167, False),
'Hoofddorp': ('Hoofddorp', 'NH', 52.3000, 4.6833, False),
'Sittard': ('Sittard', 'LI', 51.0000, 5.8667, False),
'Brielle': ('Brielle', 'ZH', 51.9000, 4.1667, False),
'Espelo': ('Espelo', 'OV', 52.3833, 6.3667, False),
'Alblasserdam': ('Alblasserdam', 'ZH', 51.8667, 4.6667, False),
'Sinoutskerke': ('Sinoutskerke', 'ZE', 51.5000, 3.7500, False),
'Cothen': ('Cothen', 'UT', 52.0000, 5.3000, False),
'Giethoorn': ('Giethoorn', 'OV', 52.7333, 6.0833, False),
'Scheveningen': ('Den Haag', 'ZH', 52.1000, 4.2667, False), # Scheveningen → Den Haag
'Woerden': ('Woerden', 'UT', 52.0833, 4.8833, False),
'Workum': ('Workum', 'FR', 52.9833, 5.4500, False),
'Rotterdam': ('Rotterdam', 'ZH', 51.9167, 4.5000, False),
'Amsterdam': ('Amsterdam', 'NH', 52.3667, 4.9000, False),
'Rijssen': ('Rijssen', 'OV', 52.3000, 6.5167, False),
'Vollenhoofse': ('Vollenhove', 'OV', 52.6833, 5.9500, False), # Vollenhove adjective
'Vollenhove': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
'Groningen': ('Groningen', 'GR', 53.2167, 6.5667, False),
'Alkmaar': ('Alkmaar', 'NH', 52.6333, 4.7500, False),
# Regional/provincial references (is_regional=True) - these organizations operate across a region
'Grunneger': ('Groningen', 'GR', 53.2167, 6.5667, True), # Groningen dialect
'Drentse': ('Assen', 'DR', 52.9925, 6.5625, True), # Drenthe province → capital
'Drenthe': ('Assen', 'DR', 52.9925, 6.5625, True),
'Limburgse': ('Maastricht', 'LI', 50.8514, 5.6910, True), # Limburg → capital
'Limburg': ('Maastricht', 'LI', 50.8514, 5.6910, True),
'Brabantse': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True), # Noord-Brabant → capital
'Noord-Brabant': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),
'Alkmaars': ('Alkmaar', 'NH', 52.6333, 4.7500, True), # City adjective
'Hogeland': ('Uithuizen', 'GR', 53.4000, 6.6667, True), # Het Hogeland municipality
'Goors': ('Goor', 'OV', 52.2333, 6.5833, True), # Goor adjective
'Rotterdamse': ('Rotterdam', 'ZH', 51.9167, 4.5000, True), # Rotterdam adjective
# City neighborhoods - mapped to their parent city (Amsterdam, Rotterdam)
'Floradorp': ('Amsterdam', 'NH', 52.4000, 4.9333, False), # Amsterdam neighborhood
'Kralingen': ('Rotterdam', 'ZH', 51.9333, 4.5167, False), # Rotterdam neighborhood
'Kralingse': ('Rotterdam', 'ZH', 51.9333, 4.5167, False), # Kralingen adjective
# Additional places from KIEN analysis
'Hellemonds': ('Helmond', 'NB', 51.4833, 5.6500, True), # Helmond dialect adjective
'Grolse': ('Groenlo', 'GE', 52.0417, 6.6167, True), # Groenlo adjective
'Groenlo': ('Groenlo', 'GE', 52.0417, 6.6167, False),
'Grou': ('Grou', 'FR', 53.0917, 5.8333, False), # Frisian village
'De Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
'Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False), # Same place without the article
'Airborne': ('Oosterbeek', 'GE', 51.9833, 5.8500, True), # Airborne = Arnhem/Oosterbeek area
'Oosterbeek': ('Oosterbeek', 'GE', 51.9833, 5.8500, False),
'Renkum': ('Renkum', 'GE', 51.9667, 5.7500, False),
'Schinderhannes': ('Maastricht', 'LI', 50.8514, 5.6910, True), # Limburg folklore figure
'Lanenkaatsen': ('Sint Nicolaasga', 'FR', 52.9000, 5.5333, True), # Frisian sport
# Frisian traditions - mapped to a place associated with them
'Skûtsjesilen': ('Sneek', 'FR', 53.0333, 5.6583, True), # Frisian sailing race
'Fierljep': ('Winsum', 'FR', 53.2833, 5.5500, True), # Frisian sport → origin location
# More Netherlands cities
'Arnhem': ('Arnhem', 'GE', 51.9833, 5.9167, False),
'Utrecht': ('Utrecht', 'UT', 52.0908, 5.1222, False),
'Den Haag': ('Den Haag', 'ZH', 52.0705, 4.3007, False),
"'s-Gravenhage": ('Den Haag', 'ZH', 52.0705, 4.3007, False), # Formal name of Den Haag
'Eindhoven': ('Eindhoven', 'NB', 51.4416, 5.4697, False),
'Maastricht': ('Maastricht', 'LI', 50.8514, 5.6910, False),
'Nijmegen': ('Nijmegen', 'GE', 51.8425, 5.8528, False),
'Leiden': ('Leiden', 'ZH', 52.1601, 4.4970, False),
'Haarlem': ('Haarlem', 'NH', 52.3874, 4.6462, False),
'Delft': ('Delft', 'ZH', 52.0116, 4.3571, False),
# Dam reference (Amsterdam)
'op de Dam': ('Amsterdam', 'NH', 52.3730, 4.8932, False),
}
# Province code to full name mapping (two-letter code → Dutch province name).
# The codes are the same ones used in the province_code field of DUTCH_PLACES.
# NOTE(review): no function visible in this file reads PROVINCE_CODES - confirm
# whether it is still needed or used elsewhere.
PROVINCE_CODES = {
'DR': 'Drenthe',
'FL': 'Flevoland',
'FR': 'Friesland',
'GE': 'Gelderland',
'GR': 'Groningen',
'LI': 'Limburg',
'NB': 'Noord-Brabant',
'NH': 'Noord-Holland',
'OV': 'Overijssel',
'UT': 'Utrecht',
'ZE': 'Zeeland',
'ZH': 'Zuid-Holland',
}
# GeoNames database path: the SQLite file that lookup_geonames queries by default.
# NOTE(review): this is an absolute path inside one user's home directory. When
# the file does not exist lookup_geonames returns None, so on any other machine
# every entry silently falls back to the hardcoded DUTCH_PLACES coordinates.
GEONAMES_DB = Path('/Users/kempersc/apps/glam/data/reference/geonames.db')
def extract_place_from_name(
    org_name: str,
    places: Optional[Dict[str, Tuple[str, str, float, float, bool]]] = None,
) -> Optional[Tuple[str, str, float, float, bool]]:
    """
    Extract a place name from an organization name.

    Every key of the place table is searched for as a case-insensitive
    substring of ``org_name``. When several keys match, the longest key wins,
    so a more specific entry (the adjective 'Alkmaars', the full name
    'De Kwakel') is not shadowed by a shorter key it contains ('Alkmaar',
    'Kwakel'). Keys of equal length are resolved in table order.

    Args:
        org_name: Organization name to scan. An empty name yields None.
        places: Optional place table mapping a search key to
            (city_name, province_code, lat, lon, is_regional). Defaults to
            the module-level DUTCH_PLACES table.

    Returns:
        (city_name, province_code, lat, lon, is_regional) or None.
    """
    if not org_name:
        return None
    if places is None:
        places = DUTCH_PLACES

    # Lowercase the name once instead of once per table entry.
    haystack = org_name.lower()
    matching_keys = [key for key in places if key.lower() in haystack]
    if not matching_keys:
        return None

    # max() returns the first of several equally long keys, which keeps
    # table order as the tie-breaker.
    best_key = max(matching_keys, key=len)
    return places[best_key]
def lookup_geonames(
    place_name: str,
    country_code: str = 'NL',
    db_path: Optional[Path] = None,
) -> Optional[Dict[str, Any]]:
    """
    Look up a place in the GeoNames database.

    Runs a single exact, case-insensitive match of ``place_name`` against the
    ``name`` and ``ascii_name`` columns of the ``cities`` table, restricted to
    ``country_code`` and to populated-place feature codes. When several rows
    match, the one with the largest population is returned.

    Args:
        place_name: Place name to search for.
        country_code: Country code to filter on.
        db_path: Optional path of the SQLite database. Defaults to the
            module-level GEONAMES_DB.

    Returns:
        A dict with the keys geonames_id, name, ascii_name, admin1_code,
        latitude, longitude, population and feature_code, or None when the
        database file does not exist or no row matches.
    """
    if db_path is None:
        db_path = GEONAMES_DB
    db_path = Path(db_path)
    # Check first: sqlite3.connect() would otherwise create an empty file.
    if not db_path.exists():
        return None

    query = """
        SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
               population, feature_code
        FROM cities
        WHERE country_code = ?
          AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
    """
    conn = sqlite3.connect(str(db_path))
    try:
        row = conn.execute(query, (country_code, place_name, place_name)).fetchone()
    finally:
        # Release the connection even when the query raises
        # (e.g. missing table, corrupt file).
        conn.close()

    if row is None:
        return None

    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population', 'feature_code',
    )
    return dict(zip(columns, row))
def get_region_code(admin1_code: str) -> str:
    """
    Translate a GeoNames admin1 code into a two-letter Dutch province code.

    Returns 'XX' for any code that is not in the table.
    """
    province_by_admin1 = dict((
        ('01', 'DR'),
        ('02', 'FR'),
        ('03', 'GE'),
        ('04', 'GR'),
        ('05', 'LI'),
        ('06', 'NB'),
        ('07', 'NH'),
        ('09', 'UT'),
        ('10', 'ZE'),
        ('11', 'ZH'),
        ('15', 'OV'),
        ('16', 'FL'),
    ))
    try:
        return province_by_admin1[admin1_code]
    except KeyError:
        # Unknown or missing admin1 code.
        return 'XX'
def process_entry(entry_path: Path, dry_run: bool = True) -> Optional[Dict[str, Any]]:
    """
    Process a single KIEN entry file.

    Reads the YAML entry, takes the organization name from
    ``original_entry.organisatie``, tries to match a known place in it and,
    on a match, resolves coordinates through GeoNames (falling back to the
    hardcoded table coordinates). Unless ``dry_run`` is set, the entry file
    is rewritten with ``locations``, ``location_resolution`` and an extra
    provenance note.

    Args:
        entry_path: Path of the YAML entry file.
        dry_run: When True (the default) the file is only read, never written.

    Returns:
        A dict with the keys file, org_name, location and resolution when a
        location was extracted; None when the file is empty or not a mapping,
        already has locations, has no organization name, or no place matched.
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # safe_load returns None for an empty document; anything that is not a
    # mapping cannot be enriched.
    if not isinstance(entry, dict):
        return None

    # Skip if already has locations
    if entry.get('locations'):
        return None

    original_entry = entry.get('original_entry')
    if not isinstance(original_entry, dict):
        return None
    org_name = original_entry.get('organisatie', '')
    if not org_name or not isinstance(org_name, str):
        return None

    # Try to extract place from name
    place_info = extract_place_from_name(org_name)
    if not place_info:
        return None
    city_name, province_code, lat, lon, is_regional = place_info

    # Try to look up in GeoNames for better accuracy
    geonames_info = lookup_geonames(city_name)
    timestamp = datetime.now(timezone.utc).isoformat()

    if geonames_info:
        # Use GeoNames data
        location = {
            'city': geonames_info['name'],
            'country': 'NL',
            'latitude': geonames_info['latitude'],
            'longitude': geonames_info['longitude'],
        }
        resolution = {
            'method': 'NAME_EXTRACTION_GEONAMES',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'geonames_id': geonames_info['geonames_id'],
            'geonames_name': geonames_info['name'],
            'feature_code': geonames_info['feature_code'],
            'population': geonames_info['population'],
            'admin1_code': geonames_info['admin1_code'],
            'region_code': get_region_code(geonames_info['admin1_code']),
            'extraction_timestamp': timestamp,
        }
    else:
        # Use hardcoded data
        location = {
            'city': city_name,
            'country': 'NL',
            'latitude': lat,
            'longitude': lon,
        }
        resolution = {
            'method': 'NAME_EXTRACTION_HARDCODED',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'region_code': province_code,
            'extraction_timestamp': timestamp,
        }

    if not dry_run:
        # Update the entry
        entry['locations'] = [location]
        entry['location_resolution'] = resolution

        # Add provenance note; a missing or null provenance / notes value is
        # treated as empty instead of crashing.
        provenance = entry.get('provenance')
        if provenance is None:
            provenance = entry['provenance'] = {}
        notes = provenance.get('notes')
        if notes is None:
            notes = provenance['notes'] = []
        notes.append(
            f"Location extracted from organization name '{org_name}' - matched place '{city_name}' ({resolution['method']})"
        )

        # The whole document is re-serialized from the loaded data.
        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return {
        'file': entry_path.name,
        'org_name': org_name,
        'location': location,
        'resolution': resolution,
    }
def main():
    """
    Command-line entry point.

    Scans the entries directory for KIEN entry files (names starting with 17
    or 18), extracts a location for every entry that has none yet, and prints
    a per-file line plus a summary. Files are modified unless --dry-run is
    given.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Extract locations from KIEN organization names')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser.add_argument(
        '--entries-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/nde/enriched/entries'),
        help='Directory containing the enriched entry YAML files',
    )
    args = parser.parse_args()

    entries_dir = args.entries_dir

    # Find KIEN entries (17xx and 18xx range)
    kien_files = sorted(list(entries_dir.glob('17*.yaml')) + list(entries_dir.glob('18*.yaml')))
    if args.limit:
        kien_files = kien_files[:args.limit]

    extracted = []
    skipped_has_location = 0
    skipped_no_match = 0

    for entry_path in kien_files:
        # Read the entry here as well so that "already has a location" can be
        # counted separately from "no place matched".
        with open(entry_path, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        # An empty or non-mapping YAML document cannot yield a location.
        if not isinstance(entry, dict):
            skipped_no_match += 1
            continue

        if entry.get('locations'):
            skipped_has_location += 1
            continue

        result = process_entry(entry_path, dry_run=args.dry_run)
        if result:
            extracted.append(result)
            # Separator added: name and city used to be printed glued together.
            print(f"{result['file']}: {result['org_name']} → {result['location']['city']}")
        else:
            skipped_no_match += 1

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f" - Entries with locations extracted: {len(extracted)}")
    print(f" - Entries already had locations: {skipped_has_location}")
    print(f" - Entries with no place match: {skipped_no_match}")

    if extracted and args.dry_run:
        print("\nExtracted locations:")
        for e in extracted:
            print(f" {e['org_name']} → {e['location']['city']} ({e['resolution']['method']})")


if __name__ == '__main__':
    main()