#!/usr/bin/env python3
"""
Fix remaining Belgian XXX files by re-scraping ISIL website with correct city extraction.

For every custodian YAML file whose name matches ``BE-*-XXX-*.yaml`` the script:

1. reads the Belgian ISIL code from the file,
2. scrapes the ISIL registry page for a "POSTCODE City" address fragment,
3. resolves the city in the local GeoNames SQLite database,
4. rewrites the ``BE-XX-XXX-`` placeholder GHCID (in the file and its name).

Run with ``--dry-run`` to see what would be resolved without touching files.
"""

import re
import sqlite3
import sys
import time
import unicodedata
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from urllib.request import Request, urlopen

# Belgian admin1 mapping (GeoNames admin1 name -> region code used in GHCIDs)
BELGIAN_ADMIN1_MAP = {
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}

# City name aliases (Dutch → GeoNames); keys are lower-cased
CITY_ALIASES = {
    'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
    'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
    'oostende': 'Ostend',
    'brussel': 'Brussels',
    'bruxelles': 'Brussels',
}

# Placeholder prefix carried by GHCIDs / file names whose city is unresolved.
_PLACEHOLDER_PREFIX = 'BE-XX-XXX-'

# Address pattern: "Street 123, POSTCODE City"
_ADDRESS_RE = re.compile(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)')
# Alternative pattern: bare "POSTCODE City"
_POSTCODE_CITY_RE = re.compile(r'(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)')

# ISIL identifier inside a custodian file.  The negative lookahead rejects
# values that continue with "-..." (e.g. a GHCID such as BE-XX-XXX-L-ABC that
# is also stored as an identifier_value), which would otherwise be truncated
# to the bogus code "BE-XX".
_ISIL_RE = re.compile(r'identifier_value:\s*[\'"]?(BE-\w+)(?![\w-])')


def scrape_isil_city(isil_code):
    """Scrape city from Belgian ISIL website.

    Args:
        isil_code: ISIL identifier, e.g. ``BE-ABC00``; appended to the
            registry base URL.

    Returns:
        ``(city, postal_code)`` as strings, or ``(None, None)`` when the page
        could not be fetched or no "POSTCODE City" fragment was found.
    """
    url = f"https://isil.kbr.be/{isil_code}"
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 GLAM-Scraper/1.0'})
        with urlopen(req, timeout=10) as response:
            # Honour the declared charset and never discard a whole page
            # because of a single undecodable byte.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Deliberate best-effort boundary: one failing page must not abort
        # the whole batch, so the error is reported and the file is skipped.
        print(f" Error scraping {isil_code}: {e}")
        return None, None

    # Prefer the comma-anchored address form; fall back to any postcode/city.
    for pattern in (_ADDRESS_RE, _POSTCODE_CITY_RE):
        match = pattern.search(html)
        if match:
            city = match.group(2).strip()
            if city:
                return city, match.group(1)

    return None, None


def lookup_city(city_name, conn):
    """Look up city in GeoNames.

    Args:
        city_name: City name as scraped (may be a Dutch alias listed in
            ``CITY_ALIASES``); ``None`` or empty yields ``None``.
        conn: Open ``sqlite3`` connection to a database with a ``cities``
            table.

    Returns:
        Dict with ``name``, ``ascii_name``, ``admin1_name``, ``latitude``,
        ``longitude``, ``geonames_id`` and ``population`` of the most
        populous matching Belgian place, or ``None`` if nothing matches.
    """
    if not city_name:
        return None

    # Collapse runs of whitespace (the scrape regex may capture newlines).
    cleaned = ' '.join(city_name.split())
    if not cleaned:
        return None

    # Check alias
    lookup_name = CITY_ALIASES.get(cleaned.lower(), cleaned)

    row = conn.execute(
        """
        SELECT name, ascii_name, admin1_name, latitude, longitude,
               geonames_id, population
        FROM cities
        WHERE country_code='BE'
          AND (LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?))
          AND feature_code NOT IN ('PPLX')
        ORDER BY population DESC
        LIMIT 1
        """,
        (lookup_name, lookup_name),
    ).fetchone()

    if row is None:
        return None

    keys = ('name', 'ascii_name', 'admin1_name', 'latitude', 'longitude',
            'geonames_id', 'population')
    return dict(zip(keys, row))


def generate_city_code(city_name):
    """Generate 3-letter city code.

    Rules (diacritics are stripped first):
      * one word            -> its first three characters,
      * leading article     -> article initial + first two letters of the
                               following word,
      * several words       -> initials of the first three words.

    Returns ``'XXX'`` (the unresolved placeholder) when the name contains no
    usable letters instead of raising ``IndexError``.
    """
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    if not words:
        return 'XXX'

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if len(words) == 1:
        # Slice the word itself so leading whitespace cannot leak into the code.
        return words[0][:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()


def _build_history_entry(indent, ghcid, timestamp, method, geo_data):
    """Return a newline-terminated YAML list item for ``ghcid_history``."""
    return (
        f"{indent}- ghcid: {ghcid}\n"
        f"{indent}  valid_from: '{timestamp}'\n"
        f"{indent}  reason: City resolved via {method} - {geo_data['name']} "
        f"(GeoNames ID {geo_data['geonames_id']})\n"
    )


def update_file(file_path, geo_data, method='ISIL_SCRAPE'):
    """Update custodian file with city data.

    Replaces the ``BE-XX-XXX-`` placeholder GHCID with one built from the
    resolved region/city codes, prepends a ``ghcid_history`` entry and renames
    the file accordingly.

    Args:
        file_path: Path of the custodian YAML file.
        geo_data: Dict as returned by ``lookup_city``.
        method: Label recorded in the history entry's ``reason``.

    Returns:
        ``True`` if the file was rewritten; ``False`` if it has no
        ``ghcid_current``, its GHCID is not a ``BE-XX-XXX-`` placeholder, or
        the renamed file would overwrite an existing one (nothing is written
        in any of these cases).
    """
    file_path = Path(file_path)
    content = file_path.read_text(encoding='utf-8')

    city_code = generate_city_code(geo_data['name'])
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_name'], 'XX')
    new_prefix = f'BE-{region_code}-{city_code}-'

    # Update GHCID
    old_ghcid_match = re.search(r'ghcid_current:\s*([^\n]+)', content)
    if not old_ghcid_match:
        return False

    old_ghcid = old_ghcid_match.group(1).strip()
    if not old_ghcid.startswith(_PLACEHOLDER_PREFIX):
        return False
    new_ghcid = new_prefix + old_ghcid[len(_PLACEHOLDER_PREFIX):]
    if new_ghcid == old_ghcid:
        return False

    # Work out the target name first: Path.rename silently replaces an
    # existing file on POSIX, so refuse before touching anything.
    new_path = file_path.with_name(
        file_path.name.replace(_PLACEHOLDER_PREFIX, new_prefix)
    )
    if new_path != file_path and new_path.exists():
        print(f" Not renaming {file_path.name}: {new_path.name} already exists")
        return False

    # Update content
    for key in ('ghcid_current', 'ghcid_original', 'identifier_value', 'ghcid'):
        # NOTE: the plain 'ghcid' key also rewrites existing history entries
        # that still carry the placeholder ID (original behaviour, kept).
        content = content.replace(f'{key}: {old_ghcid}', f'{key}: {new_ghcid}')

    # Update location_resolution (callable replacement: no escape processing)
    content = re.sub(r'region_code:\s*XX',
                     lambda _m: f'region_code: {region_code}', content)
    content = re.sub(r'city_code:\s*XXX',
                     lambda _m: f'city_code: {city_code}', content)

    # Add resolution details at the top of ghcid_history
    history_match = re.search(r'^([ \t]*)ghcid_history:[ \t]*\n', content,
                              re.MULTILINE)
    if history_match:
        insert_pos = history_match.end()
        # Reuse the indentation of the existing first list item so the new
        # entry lines up with its siblings; otherwise nest under the key.
        sibling = re.match(r'([ \t]*)- ', content[insert_pos:])
        indent = sibling.group(1) if sibling else history_match.group(1) + '  '
        timestamp = datetime.now(timezone.utc).isoformat()
        entry = _build_history_entry(indent, new_ghcid, timestamp, method,
                                     geo_data)
        content = content[:insert_pos] + entry + content[insert_pos:]

    file_path.write_text(content, encoding='utf-8')

    # Rename file
    if new_path != file_path:
        file_path.rename(new_path)

    return True


def main():
    """Resolve cities for all Belgian XXX custodian files.

    Pass ``--dry-run`` on the command line to scrape and look up cities
    without modifying any file.
    """
    dry_run = '--dry-run' in sys.argv

    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Fix Script")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE\n")

    # sqlite3.connect would silently create an empty database here.
    if not geonames_db.is_file():
        sys.exit(f"GeoNames database not found: {geonames_db}")

    xxx_files = sorted(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} Belgian XXX files\n")

    updated = 0
    not_found = []

    with closing(sqlite3.connect(str(geonames_db))) as conn:
        for file_path in xxx_files:
            content = file_path.read_text(encoding='utf-8')

            # Get ISIL code
            isil_match = _ISIL_RE.search(content)
            if not isil_match:
                print(f"- {file_path.name}: no ISIL identifier, skipped")
                continue
            isil_code = isil_match.group(1)

            # Scrape city from website, then look it up in GeoNames
            city, _postal = scrape_isil_city(isil_code)
            geo_data = lookup_city(city, conn) if city else None

            if not city:
                print(f"✗ {file_path.name}: No city found for {isil_code}")
                not_found.append((file_path.name, isil_code, 'scrape failed'))
            elif not geo_data:
                print(f"? {file_path.name}: {city} not in GeoNames")
                not_found.append((file_path.name, isil_code, city))
            elif dry_run:
                print(f"✓ {file_path.name}: {isil_code} → {city} ({geo_data['name']})")
            elif update_file(file_path, geo_data):
                print(f"✓ Updated: {file_path.name} → {geo_data['name']}")
                updated += 1
            else:
                print(f"- {file_path.name}: unchanged "
                      f"(no BE-XX-XXX GHCID or target file exists)")

            time.sleep(1)  # Rate limit: one request per second

    print(f"\n{'=' * 50}")
    print(f"Updated: {updated}")
    print(f"Not found: {len(not_found)}")

    if not_found:
        print("\nNot resolved:")
        for fname, isil, city in not_found:
            print(f" {fname}: {isil} → {city}")


if __name__ == '__main__':
    main()