#!/usr/bin/env python3
"""
Belgian city enrichment v2 - with city name aliases.
"""
import re
import sqlite3
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# Belgian city aliases (Dutch names → GeoNames names)
BELGIAN_CITY_ALIASES = {
    'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
    'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
    'sint-stevens-woluwe': 'Sint-Stevens-Woluwe',
    'oostende': 'Ostend',
    'gent': 'Gent',
    'brugge': 'Brugge',
    'brussel': 'Brussels',
    'antwerpen': 'Antwerpen',
    'luik': 'Liège',
    'liège': 'Liège',
    'leuven': 'Leuven',
    'mechelen': 'Mechelen',
    'aalst': 'Aalst',
    'hasselt': 'Hasselt',
    'kortrijk': 'Kortrijk',
    'sint-niklaas': 'Sint-Niklaas',
    'genk': 'Genk',
    'roeselare': 'Roeselare',
    # Merged municipalities (2019)
    'kluisbergen': 'Kluisbergen',
    'lievegem': 'Nevele',  # Lievegem was created from Nevele, Waarschoot, Zomergem, Lovendegem
    'kruisem': 'Kruishoutem',  # Kruisem was created from Kruishoutem and Zingem
    'lierde': 'Sint-Maria-Lierde',
    'maarkedal': 'Etikhove',  # Maarkedal includes Etikhove
    # Other
    'de haan': 'De Haan',
    'lint': 'Lint',
    'herne': 'Herne',
}

# Belgian admin1 mapping (GeoNames → ISO 3166-2:BE)
BELGIAN_ADMIN1_MAP = {
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}

# Column order of the SELECTs in lookup_city(); shared so both query paths
# build the result dict the same way.
_CITY_COLUMNS = (
    'name', 'ascii_name', 'admin1_name',
    'latitude', 'longitude', 'geonames_id', 'population',
)


def normalize_city_name(name):
    """Normalize a city name for alias lookup.

    Lowercases, strips diacritics (NFD decomposition, dropping combining
    marks) and surrounding whitespace.

    Args:
        name: Raw city name; may be None or empty.

    Returns:
        Normalized string, or None for falsy input.
    """
    if not name:
        return None
    decomposed = unicodedata.normalize('NFD', name.lower())
    stripped = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return stripped.strip()


# BUGFIX: normalize_city_name() strips diacritics, so an accented alias key
# such as 'liège' could never equal the normalized input ('liege').
# Normalizing the keys once at import time makes every alias reachable.
_ALIASES_NORMALIZED = {
    normalize_city_name(key): value
    for key, value in BELGIAN_CITY_ALIASES.items()
}


def lookup_city(city_name, conn):
    """Look up a Belgian city in the GeoNames database, with alias support.

    Resolution order: alias table (normalized key), exact case-insensitive
    match on name/ascii_name, then substring match — always restricted to
    country_code 'BE' and preferring the most populous hit.

    Args:
        city_name: City name as found in source data; may be None/empty.
        conn: Open sqlite3 connection with a `cities` table.

    Returns:
        Dict with keys name, ascii_name, admin1_name, latitude, longitude,
        geonames_id, population — or None when nothing matches.
    """
    if not city_name:
        return None

    normalized = normalize_city_name(city_name)
    lookup_name = _ALIASES_NORMALIZED.get(normalized, city_name)

    cursor = conn.cursor()
    # Exact case-insensitive match first, then a LIKE substring fallback.
    attempts = (
        ("LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?)", lookup_name),
        ("name LIKE ? OR ascii_name LIKE ?", f"%{lookup_name}%"),
    )
    for condition, param in attempts:
        cursor.execute(
            f"""
            SELECT name, ascii_name, admin1_name, latitude, longitude,
                   geonames_id, population
            FROM cities
            WHERE country_code='BE' AND ({condition})
            ORDER BY population DESC
            LIMIT 1
            """,
            (param, param),
        )
        row = cursor.fetchone()
        if row:
            return dict(zip(_CITY_COLUMNS, row))
    return None


def generate_city_code(city_name):
    """Generate a 3-letter uppercase city code.

    Diacritics are stripped and non-letters (except spaces/hyphens) removed.
    Single word: first three letters. Leading article ('de', 'le', ...):
    article initial plus first two letters of the next word. Otherwise:
    initials of the first three words.

    Args:
        city_name: City name to encode.

    Returns:
        Up-to-3-character uppercase code ('' if nothing alphabetic remains).
    """
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()
    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    # <=1 also guards empty input, which previously raised IndexError below.
    if len(words) <= 1:
        return clean[:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()


def main():
    """Scan Belgian XXX custodian files and report GeoNames city matches."""
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment v2")
    print("=" * 50)

    conn = sqlite3.connect(str(geonames_db))
    try:
        # Find Belgian XXX files
        xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
        print(f"Found {len(xxx_files)} Belgian XXX files")

        updated = 0
        not_found = []

        for file_path in xxx_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Check if there's city info in the file already.
            # NOTE(review): the city originally came from the ISIL-website
            # scraper; we only re-read what that pass left in the file.
            city_match = re.search(r'city(?:_name)?:\s*([^\n]+)', content)
            if not city_match:
                continue

            city_name = city_match.group(1).strip().strip('"\'')
            if not city_name or city_name == 'XXX':
                continue

            geo_data = lookup_city(city_name, conn)
            if geo_data:
                print(f"✓ {file_path.name}: {city_name} → {geo_data['name']}")
                updated += 1
                # Would update file here
            else:
                not_found.append((file_path.name, city_name))

        print(f"\nUpdated: {updated}")
        print(f"Not found: {len(not_found)}")
        if not_found:
            print("\nCities not found:")
            for fname, city in not_found[:20]:
                print(f"  {fname}: {city}")
    finally:
        # Close even if globbing/lookup raises (original leaked on error).
        conn.close()


if __name__ == '__main__':
    main()