#!/usr/bin/env python3 """ Enrich Belgian custodian files with city data from ISIL registry. Strategy: 1. First try to get city from enriched source file (fast) 2. If not found, scrape the Belgian ISIL website (slow, 1 req/sec) Usage: python scripts/enrich_belgian_cities.py [--dry-run] """ import os import re import sqlite3 import sys import time import urllib.request from datetime import datetime, timezone from pathlib import Path # Belgian admin1 codes (GeoNames uses BRU, VLG, WAL) BELGIAN_ADMIN1_MAP = { 'BRU': 'BRU', # Brussels Capital Region 'VLG': 'VLG', # Flanders (Vlaanderen) 'WAL': 'WAL', # Wallonia (Wallonië) } # Belgian city name aliases (Dutch/French variants) BELGIAN_CITY_ALIASES = { 'Brussel': 'Brussels', 'Bruxelles': 'Brussels', 'Antwerpen': 'Antwerpen', 'Anvers': 'Antwerpen', 'Gent': 'Gent', 'Gand': 'Gent', 'Luik': 'Liège', 'Liege': 'Liège', 'Bergen': 'Mons', 'Namen': 'Namur', 'Mechelen': 'Mechelen', 'Malines': 'Mechelen', 'Leuven': 'Leuven', 'Louvain': 'Leuven', 'Elsene': 'Ixelles', 'Ukkel': 'Uccle', 'Oudergem': 'Auderghem', 'Watermaal-Bosvoorde': 'Watermael-Boitsfort', 'Sint-Gillis': 'Saint-Gilles', 'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean', 'Schaarbeek': 'Schaerbeek', 'Etterbeek': 'Etterbeek', 'Vorst': 'Forest', 'Anderlecht': 'Anderlecht', 'Jette': 'Jette', 'Koekelberg': 'Koekelberg', 'Evere': 'Evere', 'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre', 'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert', 'Ganshoren': 'Ganshoren', } def load_isil_city_lookup(enriched_file: str) -> dict: """Load ISIL -> city mapping from enriched Belgian ISIL file.""" with open(enriched_file, 'r', encoding='utf-8') as f: content = f.read() # Split by 'id:' at start of line entries = re.split(r'\n(?=id: BE-)', content) lookup = {} for entry in entries[1:]: # Skip header # Extract ISIL isil_match = re.search(r'^id: (BE-\w+)', entry) if not isil_match: continue isil = isil_match.group(1) # Extract city from locations section city_match = re.search(r'locations:\s*\n-\s*city:\s*(\S.*)', entry) if city_match: city = city_match.group(1).strip() lookup[isil] = city return lookup def load_isil_source_urls(enriched_file: str) -> dict: """Load ISIL -> source_url mapping for web scraping fallback.""" with open(enriched_file, 'r', encoding='utf-8') as f: content = f.read() entries = re.split(r'\n(?=id: BE-)', content) lookup = {} for entry in entries[1:]: isil_match = re.search(r'^id: (BE-\w+)', entry) url_match = re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', entry) if isil_match and url_match: lookup[isil_match.group(1)] = url_match.group(1) return lookup def scrape_city_from_isil_website(url: str) -> str | None: """Scrape city from Belgian ISIL website.""" try: req = urllib.request.Request(url, headers={'User-Agent': 'GLAM-Enricher/1.0'}) with urllib.request.urlopen(req, timeout=10) as response: html = response.read().decode('utf-8') # Look for address pattern: "Street, POSTCODE City" # Belgian postal codes are 4 digits address_match = re.search(r'Walk up adress.*?]*>([^<]+)', html, re.DOTALL | re.IGNORECASE) if address_match: address = address_match.group(1) # Parse city from address: "Veldstraat 53, 9910 Knesselare" city_match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', address) if city_match: city = city_match.group(2).strip() # Clean up trailing HTML entities city = re.sub(r'&\w+;.*$', '', city).strip() return city return None except Exception as e: print(f" Error scraping {url}: {e}") return None def generate_city_code(city_name: str) -> str: """Generate 3-letter city code from city name.""" import unicodedata normalized = unicodedata.normalize('NFD', city_name) ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn') # Clean up clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name) words = clean.split() if len(words) == 1: return words[0][:3].upper() else: if len(words) == 2: return (words[0][0] + words[1][:2]).upper() else: return ''.join(w[0] for w in words[:3]).upper() def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None: """Look up city in GeoNames database.""" cursor = conn.cursor() # Check aliases first normalized_name = BELGIAN_CITY_ALIASES.get(city_name, city_name) # Try exact match first cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code FROM cities WHERE country_code = 'BE' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)) ORDER BY population DESC LIMIT 1 ''', (normalized_name, normalized_name)) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } # Try original name if alias was used if normalized_name != city_name: cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code FROM cities WHERE country_code = 'BE' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)) ORDER BY population DESC LIMIT 1 ''', (city_name, city_name)) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } # Try fuzzy match with LIKE cursor.execute(''' SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code FROM cities WHERE country_code = 'BE' AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?)) ORDER BY population DESC LIMIT 1 ''', (f'{city_name}%', f'{city_name}%')) row = cursor.fetchone() if row: return { 'name': row[0], 'ascii_name': row[1], 'admin1_code': row[2], 'admin1_name': row[3], 'latitude': row[4], 'longitude': row[5], 'geonames_id': row[6], 'population': row[7], 'feature_code': row[8], } return None def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, dry_run: bool = False) -> bool: """Update a custodian file with city data.""" with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Extract current GHCID ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content) if not ghcid_match: print(f" WARNING: No ghcid_current found in {file_path.name}") return False old_ghcid = ghcid_match.group(1) # Generate new GHCID components region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code']) city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name']) # Build new GHCID: BE-XX-XXX-{type}-{abbrev}[-suffix] parts = old_ghcid.split('-') if len(parts) >= 5: type_code = parts[3] abbrev_and_suffix = '-'.join(parts[4:]) new_ghcid = f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}" else: print(f" WARNING: Unexpected GHCID format: {old_ghcid}") return False if old_ghcid == new_ghcid: return False # Calculate new filename old_filename = file_path.name new_filename = old_filename.replace(old_ghcid, new_ghcid) new_file_path = file_path.parent / new_filename # Update content new_content = content.replace(old_ghcid, new_ghcid) # Update location_resolution section old_resolution = re.search( r'location_resolution:\s*\n((?:\s+\S.*\n)*)', new_content ) if old_resolution: new_resolution = f"""location_resolution: country_code: BE region_code: {region_code} region_name: {geo_data['admin1_name']} city_code: {city_code} city_name: {geo_data['name']} geonames_id: {geo_data['geonames_id']} feature_code: {geo_data['feature_code']} latitude: {geo_data['latitude']} longitude: {geo_data['longitude']} method: BELGIAN_ISIL_REGISTRY resolution_date: '{datetime.now(timezone.utc).isoformat()}' """ new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():] # Add GHCID history entry timestamp = datetime.now(timezone.utc).isoformat() history_entry = f""" - ghcid: {new_ghcid} valid_from: '{timestamp}' reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code}) """ history_match = re.search(r'ghcid_history:\s*\n', new_content) if history_match: insert_pos = history_match.end() new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:] if dry_run: print(f" DRY RUN: Would rename {old_filename} -> {new_filename}") print(f" GHCID: {old_ghcid} -> {new_ghcid}") return True # Write updated content with open(file_path, 'w', encoding='utf-8') as f: f.write(new_content) # Rename file if new_file_path != file_path: file_path.rename(new_file_path) return True def main(): dry_run = '--dry-run' in sys.argv # Paths base_dir = Path(__file__).parent.parent custodian_dir = base_dir / 'data' / 'custodian' enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml' geonames_db = base_dir / 'data' / 'reference' / 'geonames.db' print("Belgian City Enrichment Script") print("=" * 50) if dry_run: print("DRY RUN MODE - No changes will be made") # Load lookups print(f"\nLoading ISIL city lookup from {enriched_file.name}...") isil_city_lookup = load_isil_city_lookup(str(enriched_file)) isil_url_lookup = load_isil_source_urls(str(enriched_file)) print(f" Found {len(isil_city_lookup)} ISIL codes with city data") print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs") # Connect to GeoNames print(f"\nConnecting to GeoNames database...") conn = sqlite3.connect(str(geonames_db)) # Find Belgian XXX files print(f"\nFinding Belgian custodian files with XXX placeholder...") xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml')) print(f" Found {len(xxx_files)} files to process") # Process files updated = 0 no_isil = 0 no_city = 0 no_geonames = 0 scraped = 0 errors = 0 not_found_cities = [] for file_path in xxx_files: try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Find ISIL code isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content) if not isil_match: no_isil += 1 continue isil_code = isil_match.group(1) # Strategy 1: Look up city from enriched file city_name = isil_city_lookup.get(isil_code) # Strategy 2: Scrape from website if not in lookup if not city_name and isil_code in isil_url_lookup: url = isil_url_lookup[isil_code] print(f" Scraping {isil_code} from {url}...") city_name = scrape_city_from_isil_website(url) if city_name: scraped += 1 print(f" Found: {city_name}") time.sleep(1) # Rate limit if not city_name: no_city += 1 continue # Look up in GeoNames geo_data = lookup_city_in_geonames(city_name, conn) if not geo_data: no_geonames += 1 not_found_cities.append((file_path.name, isil_code, city_name)) continue # Update file if update_custodian_file(file_path, city_name, geo_data, dry_run): updated += 1 if not dry_run: print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})") except Exception as e: errors += 1 print(f" ERROR processing {file_path.name}: {e}") conn.close() # Summary print("\n" + "=" * 50) print("SUMMARY") print("=" * 50) print(f"Total XXX files: {len(xxx_files)}") print(f"Updated: {updated}") print(f"Scraped from website: {scraped}") print(f"No ISIL in file: {no_isil}") print(f"No city found: {no_city}") print(f"City not in GeoNames: {no_geonames}") print(f"Errors: {errors}") print(f"Remaining XXX: {len(xxx_files) - updated}") if not_found_cities: print(f"\nCities not found in GeoNames:") for fname, isil, city in not_found_cities[:20]: print(f" {isil}: {city}") if len(not_found_cities) > 20: print(f" ... and {len(not_found_cities) - 20} more") # Generate report timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md' with open(report_path, 'w') as f: f.write(f"# Belgian City Enrichment Report\n\n") f.write(f"**Date**: {datetime.now().isoformat()}\n") f.write(f"**Dry Run**: {dry_run}\n\n") f.write(f"## Summary\n\n") f.write(f"| Metric | Count |\n") f.write(f"|--------|-------|\n") f.write(f"| Total XXX files | {len(xxx_files)} |\n") f.write(f"| Updated | {updated} |\n") f.write(f"| Scraped from website | {scraped} |\n") f.write(f"| No ISIL in file | {no_isil} |\n") f.write(f"| No city found | {no_city} |\n") f.write(f"| City not in GeoNames | {no_geonames} |\n") f.write(f"| Errors | {errors} |\n") f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n") if not_found_cities: f.write(f"\n## Cities Not Found in GeoNames\n\n") f.write(f"| File | ISIL | City |\n") f.write(f"|------|------|------|\n") for fname, isil, city in not_found_cities: f.write(f"| {fname} | {isil} | {city} |\n") print(f"\nReport written to: {report_path}") if __name__ == '__main__': main()