340 lines
14 KiB
Python
340 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Extract location data from KIEN organization names.

Many KIEN organizations have place names embedded in their names, e.g.:
- "Harddraverijvereniging Venhuizen" → Venhuizen
- "Stichting Kortebaandraverij Hoofddorp" → Hoofddorp
- "Vereniging Gondelvaart Giethoorn" → Giethoorn

This script extracts these locations and geocodes them using GeoNames.
When the GeoNames database file is missing, or has no matching populated
place, the hardcoded coordinates from DUTCH_PLACES are used instead.
"""
|
|
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Dict, Any, Tuple
|
|
|
|
# Dutch place name patterns - places that commonly appear in org names.
# Format: 'pattern': (city_name, province_code, lat, lon, is_regional)
#   pattern       - text searched for inside the organization name
#   city_name     - city recorded for the match (may differ from the pattern,
#                   e.g. an adjective or neighbourhood maps to its city)
#   province_code - two-letter code, see PROVINCE_CODES
#   lat, lon      - fallback coordinates in decimal degrees
#   is_regional   - True means it's a province/region reference (or an
#                   adjective/keyword), not a specific city
# Patterns are matched as case-insensitive substrings of the organization
# name, scanning this dict in insertion order, so entry order is significant.
DUTCH_PLACES = {
    # Specific cities/towns
    'Venhuizen': ('Venhuizen', 'NH', 52.6333, 5.2167, False),
    'Helmond': ('Helmond', 'NB', 51.4833, 5.6500, False),
    'Ravenstein': ('Ravenstein', 'NB', 51.7833, 5.6500, False),
    'Banholt': ('Banholt', 'LI', 50.7833, 5.8833, False),
    'Noorbeek': ('Noorbeek', 'LI', 50.7667, 5.8000, False),
    'Haarzuilens': ('Haarzuilens', 'UT', 52.1167, 4.9833, False),
    'Terschelling': ('Terschelling', 'FR', 53.4000, 5.3500, False),
    'Denekamp': ('Denekamp', 'OV', 52.3833, 7.0000, False),
    'Doesburg': ('Doesburg', 'GE', 52.0167, 6.1333, False),
    'Kerkrade': ('Kerkrade', 'LI', 50.8667, 6.0667, False),
    'Oosterhout': ('Oosterhout', 'NB', 51.6500, 4.8667, False),
    'Margraten': ('Margraten', 'LI', 50.8167, 5.8167, False),
    'Ameland': ('Ameland', 'FR', 53.4500, 5.7500, False),
    'Didam': ('Didam', 'GE', 51.9333, 6.1333, False),
    'Voorschoten': ('Voorschoten', 'ZH', 52.1333, 4.4500, False),
    'Alphen': ('Alphen aan den Rijn', 'ZH', 52.1333, 4.6667, False),
    'Houten': ('Houten', 'UT', 52.0333, 5.1667, False),
    'Drogeham': ('Drogeham', 'FR', 53.1167, 6.0667, False),
    'Goor': ('Goor', 'OV', 52.2333, 6.5833, False),
    'Naarden': ('Naarden', 'NH', 52.2833, 5.1500, False),
    'Warmond': ('Warmond', 'ZH', 52.2000, 4.5000, False),
    'Nootdorp': ('Nootdorp', 'ZH', 52.0500, 4.3833, False),
    'IJmuiden': ('IJmuiden', 'NH', 52.4667, 4.6167, False),
    'Hoofddorp': ('Hoofddorp', 'NH', 52.3000, 4.6833, False),
    'Sittard': ('Sittard', 'LI', 51.0000, 5.8667, False),
    'Brielle': ('Brielle', 'ZH', 51.9000, 4.1667, False),
    'Espelo': ('Espelo', 'OV', 52.3833, 6.3667, False),
    'Alblasserdam': ('Alblasserdam', 'ZH', 51.8667, 4.6667, False),
    'Sinoutskerke': ('Sinoutskerke', 'ZE', 51.5000, 3.7500, False),
    'Cothen': ('Cothen', 'UT', 52.0000, 5.3000, False),
    'Giethoorn': ('Giethoorn', 'OV', 52.7333, 6.0833, False),
    'Scheveningen': ('Den Haag', 'ZH', 52.1000, 4.2667, False),  # Scheveningen → Den Haag
    'Woerden': ('Woerden', 'UT', 52.0833, 4.8833, False),
    'Workum': ('Workum', 'FR', 52.9833, 5.4500, False),
    'Rotterdam': ('Rotterdam', 'ZH', 51.9167, 4.5000, False),
    'Amsterdam': ('Amsterdam', 'NH', 52.3667, 4.9000, False),
    'Rijssen': ('Rijssen', 'OV', 52.3000, 6.5167, False),
    'Vollenhoofse': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Vollenhove': ('Vollenhove', 'OV', 52.6833, 5.9500, False),
    'Groningen': ('Groningen', 'GR', 53.2167, 6.5667, False),
    'Alkmaar': ('Alkmaar', 'NH', 52.6333, 4.7500, False),

    # Regional/provincial references (is_regional=True) - these organizations operate across a region
    'Grunneger': ('Groningen', 'GR', 53.2167, 6.5667, True),  # Groningen dialect
    'Drentse': ('Assen', 'DR', 52.9925, 6.5625, True),  # Drenthe province → capital
    'Drenthe': ('Assen', 'DR', 52.9925, 6.5625, True),
    'Limburgse': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg → capital
    'Limburg': ('Maastricht', 'LI', 50.8514, 5.6910, True),
    'Brabantse': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),  # Noord-Brabant → capital
    'Noord-Brabant': ("'s-Hertogenbosch", 'NB', 51.6978, 5.3037, True),
    'Alkmaars': ('Alkmaar', 'NH', 52.6333, 4.7500, True),  # City adjective
    'Hogeland': ('Uithuizen', 'GR', 53.4000, 6.6667, True),  # Het Hogeland municipality
    'Goors': ('Goor', 'OV', 52.2333, 6.5833, True),  # Goor adjective
    'Rotterdamse': ('Rotterdam', 'ZH', 51.9167, 4.5000, True),  # Rotterdam adjective

    # City neighborhoods - mapped to the city they belong to
    'Floradorp': ('Amsterdam', 'NH', 52.4000, 4.9333, False),  # Amsterdam neighborhood
    'Kralingen': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Rotterdam neighborhood
    'Kralingse': ('Rotterdam', 'ZH', 51.9333, 4.5167, False),  # Kralingen adjective

    # Additional places from KIEN analysis
    'Hellemonds': ('Helmond', 'NB', 51.4833, 5.6500, True),  # Helmond dialect adjective
    'Grolse': ('Groenlo', 'GE', 52.0417, 6.6167, True),  # Groenlo adjective
    'Groenlo': ('Groenlo', 'GE', 52.0417, 6.6167, False),
    'Grou': ('Grou', 'FR', 53.0917, 5.8333, False),  # Frisian village
    'De Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Kwakel': ('De Kwakel', 'NH', 52.2333, 4.8000, False),
    'Airborne': ('Oosterbeek', 'GE', 51.9833, 5.8500, True),  # Airborne = Arnhem/Oosterbeek area
    'Oosterbeek': ('Oosterbeek', 'GE', 51.9833, 5.8500, False),
    'Renkum': ('Renkum', 'GE', 51.9667, 5.7500, False),
    'Schinderhannes': ('Maastricht', 'LI', 50.8514, 5.6910, True),  # Limburg folklore figure
    'Lanenkaatsen': ('Sint Nicolaasga', 'FR', 52.9000, 5.5333, True),  # Frisian sport

    # Frisian sport/event keywords - mapped to an associated Frisian town
    'Skûtsjesilen': ('Sneek', 'FR', 53.0333, 5.6583, True),  # Frisian sailing race
    'Fierljep': ('Winsum', 'FR', 53.2833, 5.5500, True),  # Frisian sport → origin location

    # More Netherlands cities
    'Arnhem': ('Arnhem', 'GE', 51.9833, 5.9167, False),
    'Utrecht': ('Utrecht', 'UT', 52.0908, 5.1222, False),
    'Den Haag': ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    "'s-Gravenhage": ('Den Haag', 'ZH', 52.0705, 4.3007, False),
    'Eindhoven': ('Eindhoven', 'NB', 51.4416, 5.4697, False),
    'Maastricht': ('Maastricht', 'LI', 50.8514, 5.6910, False),
    'Nijmegen': ('Nijmegen', 'GE', 51.8425, 5.8528, False),
    'Leiden': ('Leiden', 'ZH', 52.1601, 4.4970, False),
    'Haarlem': ('Haarlem', 'NH', 52.3874, 4.6462, False),
    'Delft': ('Delft', 'ZH', 52.0116, 4.3571, False),

    # Dam reference (Amsterdam)
    'op de Dam': ('Amsterdam', 'NH', 52.3730, 4.8932, False),
}
|
|
|
|
# Two-letter province abbreviations used throughout this file, keyed to the
# full province name (listed alphabetically by code).
PROVINCE_CODES = dict(
    DR='Drenthe',
    FL='Flevoland',
    FR='Friesland',
    GE='Gelderland',
    GR='Groningen',
    LI='Limburg',
    NB='Noord-Brabant',
    NH='Noord-Holland',
    OV='Overijssel',
    UT='Utrecht',
    ZE='Zeeland',
    ZH='Zuid-Holland',
)
|
|
|
|
# GeoNames database path (SQLite file queried by lookup_geonames).
# The default is the original developer-machine location; set the GEONAMES_DB
# environment variable to point the script at a database stored elsewhere.
# An unset or empty variable falls back to the default.
GEONAMES_DB = Path(
    os.environ.get('GEONAMES_DB')
    or '/Users/kempersc/apps/glam/data/reference/geonames.db'
)
|
|
|
|
|
|
def extract_place_from_name(
    org_name: str,
    places: Optional[Dict[str, Tuple[str, str, float, float, bool]]] = None,
) -> Optional[Tuple[str, str, float, float, bool]]:
    """
    Extract a place name from an organization name.

    Patterns are matched as case-insensitive substrings of ``org_name``.
    When several patterns match, the one that appears first in the table
    wins, except that a pattern is ignored if a longer matching pattern
    contains it. Without that rule 'Alkmaar' would always shadow 'Alkmaars'
    (and 'Goor' would shadow 'Goors'), so the adjective entries and their
    ``is_regional`` flag could never be returned.

    Args:
        org_name: Organization name to search; empty or None yields None.
        places: Optional pattern table in the DUTCH_PLACES format.
            Defaults to DUTCH_PLACES.

    Returns (city_name, province_code, lat, lon, is_regional) or None.
    """
    if not org_name:
        return None
    if places is None:
        places = DUTCH_PLACES

    # Lower-case the haystack once instead of once per pattern.
    haystack = org_name.lower()
    matched = [pattern for pattern in places if pattern.lower() in haystack]

    for pattern in matched:
        needle = pattern.lower()
        shadowed = any(
            len(other) > len(pattern) and needle in other.lower()
            for other in matched
        )
        if not shadowed:
            return tuple(places[pattern])

    return None
|
|
|
|
|
|
def lookup_geonames(
    place_name: str,
    country_code: str = 'NL',
    db_path: Optional[Path] = None,
) -> Optional[Dict[str, Any]]:
    """Look up a place in the GeoNames database.

    Runs a case-insensitive exact match of ``place_name`` against the
    ``name`` and ``ascii_name`` columns of the ``cities`` table, restricted
    to ``country_code`` and to populated-place feature codes. If several
    rows match, the one with the largest population is returned.

    Args:
        place_name: Place name to search for; empty or None yields None.
        country_code: Value compared against the ``country_code`` column.
        db_path: Optional path of the SQLite file. Defaults to GEONAMES_DB.

    Returns:
        A dict with keys geonames_id, name, ascii_name, admin1_code,
        latitude, longitude, population and feature_code, or None when the
        database file does not exist or no row matches.

    Raises:
        sqlite3.Error: If the file exists but cannot be queried.
    """
    database = GEONAMES_DB if db_path is None else Path(db_path)
    if not place_name or not database.exists():
        return None

    columns = (
        'geonames_id', 'name', 'ascii_name', 'admin1_code',
        'latitude', 'longitude', 'population', 'feature_code',
    )

    conn = sqlite3.connect(str(database))
    try:
        # NOTE: SQLite's built-in LOWER() only folds ASCII characters, so
        # names with non-ASCII letters must match `name` in the same case or
        # match through `ascii_name`.
        row = conn.execute("""
            SELECT geonames_id, name, ascii_name, admin1_code, latitude, longitude,
                   population, feature_code
            FROM cities
            WHERE country_code = ?
              AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (country_code, place_name, place_name)).fetchone()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if row is None:
        return None
    return dict(zip(columns, row))
|
|
|
|
|
|
def get_region_code(admin1_code: str) -> str:
    """Map a GeoNames admin1 code onto the matching Dutch province code.

    Codes that are not in the table produce the placeholder 'XX'.
    """
    province_by_admin1 = {
        '01': 'DR',
        '02': 'FR',
        '03': 'GE',
        '04': 'GR',
        '05': 'LI',
        '06': 'NB',
        '07': 'NH',
        '09': 'UT',
        '10': 'ZE',
        '11': 'ZH',
        '15': 'OV',
        '16': 'FL',
    }
    try:
        return province_by_admin1[admin1_code]
    except KeyError:
        # Unknown code (including None): fall back to the placeholder.
        return 'XX'
|
|
|
|
|
|
def _build_location_records(
    org_name: str,
    place_info: Tuple[str, str, float, float, bool],
    geonames_info: Optional[Dict[str, Any]],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build the (location, resolution) dicts for one matched place.

    GeoNames data is used when available; otherwise the hardcoded values
    carried in ``place_info`` are used.
    """
    city_name, province_code, lat, lon, is_regional = place_info
    timestamp = datetime.now(timezone.utc).isoformat()

    if geonames_info:
        # Use GeoNames data
        location = {
            'city': geonames_info['name'],
            'country': 'NL',
            'latitude': geonames_info['latitude'],
            'longitude': geonames_info['longitude'],
        }
        resolution = {
            'method': 'NAME_EXTRACTION_GEONAMES',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'geonames_id': geonames_info['geonames_id'],
            'geonames_name': geonames_info['name'],
            'feature_code': geonames_info['feature_code'],
            'population': geonames_info['population'],
            'admin1_code': geonames_info['admin1_code'],
            'region_code': get_region_code(geonames_info['admin1_code']),
            'extraction_timestamp': timestamp,
        }
    else:
        # Use hardcoded data
        location = {
            'city': city_name,
            'country': 'NL',
            'latitude': lat,
            'longitude': lon,
        }
        resolution = {
            'method': 'NAME_EXTRACTION_HARDCODED',
            'extracted_from': org_name,
            'matched_place': city_name,
            'is_regional': is_regional,
            'region_code': province_code,
            'extraction_timestamp': timestamp,
        }

    return location, resolution


def _append_provenance_note(entry: Dict[str, Any], note: str) -> None:
    """Append ``note`` to entry['provenance']['notes'], creating missing levels."""
    # A key that is present but null in the YAML is treated like a missing key.
    provenance = entry.get('provenance')
    if provenance is None:
        provenance = entry['provenance'] = {}
    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    notes.append(note)


def process_entry(entry_path: Path, dry_run: bool = True) -> Optional[Dict[str, Any]]:
    """
    Process a single KIEN entry file.

    The entry's organization name is read from ``original_entry.organisatie``
    and searched for a known place. On a match the location is resolved via
    GeoNames, falling back to the hardcoded coordinates. Unless ``dry_run``
    is set, the entry file is rewritten with ``locations``,
    ``location_resolution`` and an extra provenance note.

    Args:
        entry_path: Path of the YAML entry file.
        dry_run: When True (the default) the file is only read, never written.

    Returns:
        A dict with keys file, org_name, location and resolution if a
        location was extracted. None if the entry is empty or not a mapping,
        already has locations, has no usable organization name, or its name
        contains no known place.
    """
    with open(entry_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    # An empty file loads as None and a scalar/list document is not an entry;
    # neither can be processed.
    if not isinstance(entry, dict):
        return None

    # Skip if already has locations
    if entry.get('locations'):
        return None

    # `original_entry` may be present but null, so do not chain .get() blindly.
    original_entry = entry.get('original_entry')
    org_name = original_entry.get('organisatie') if isinstance(original_entry, dict) else None
    if not org_name or not isinstance(org_name, str):
        return None

    # Try to extract place from name
    place_info = extract_place_from_name(org_name)
    if not place_info:
        return None

    city_name = place_info[0]

    # Try to look up in GeoNames for better accuracy
    geonames_info = lookup_geonames(city_name)
    location, resolution = _build_location_records(org_name, place_info, geonames_info)

    if not dry_run:
        # Update the entry
        entry['locations'] = [location]
        entry['location_resolution'] = resolution

        # Add provenance note
        _append_provenance_note(
            entry,
            f"Location extracted from organization name '{org_name}' - matched place '{city_name}' ({resolution['method']})",
        )

        with open(entry_path, 'w', encoding='utf-8') as f:
            yaml.dump(entry, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return {
        'file': entry_path.name,
        'org_name': org_name,
        'location': location,
        'resolution': resolution,
    }
|
|
|
|
|
|
def main():
    """Command-line entry point: extract locations for all KIEN entry files.

    Scans the entries directory for ``17*.yaml`` and ``18*.yaml`` files,
    runs process_entry on each one that has no locations yet, and prints a
    per-file line plus a summary. Files are modified unless ``--dry-run``
    is given.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Extract locations from KIEN organization names')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser.add_argument(
        '--entries-dir',
        type=Path,
        default=Path('/Users/kempersc/apps/glam/data/nde/enriched/entries'),
        help='Directory containing the entry YAML files (default: %(default)s)',
    )
    args = parser.parse_args()

    entries_dir = args.entries_dir

    # Find KIEN entries (17xx and 18xx range)
    kien_files = sorted([*entries_dir.glob('17*.yaml'), *entries_dir.glob('18*.yaml')])

    if args.limit:
        kien_files = kien_files[:args.limit]

    extracted = []
    skipped_has_location = 0
    skipped_no_match = 0
    skipped_invalid = 0

    for entry_path in kien_files:
        # Check if already has locations
        with open(entry_path, 'r', encoding='utf-8') as f:
            entry = yaml.safe_load(f)

        # An empty file loads as None; skip anything that is not a mapping
        # instead of crashing on the membership test below.
        if not isinstance(entry, dict):
            skipped_invalid += 1
            continue

        if entry.get('locations'):
            skipped_has_location += 1
            continue

        result = process_entry(entry_path, dry_run=args.dry_run)

        if result:
            extracted.append(result)
            print(f"✓ {result['file']}: {result['org_name']} → {result['location']['city']}")
        else:
            skipped_no_match += 1

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f"  - Entries with locations extracted: {len(extracted)}")
    print(f"  - Entries already had locations: {skipped_has_location}")
    print(f"  - Entries with no place match: {skipped_no_match}")
    if skipped_invalid:
        # Only reported when it happens, so normal output is unchanged.
        print(f"  - Entries skipped (empty or not a mapping): {skipped_invalid}")

    if extracted and args.dry_run:
        print("\nExtracted locations:")
        for e in extracted:
            print(f"  {e['org_name']} → {e['location']['city']} ({e['resolution']['method']})")
|
|
|
|
|
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|