#!/usr/bin/env python3
"""
Enrich Belgian custodian files with city data from ISIL registry.
Strategy:
1. First try to get city from enriched source file (fast)
2. If not found, scrape the Belgian ISIL website (slow, 1 req/sec)
Usage:
python scripts/enrich_belgian_cities.py [--dry-run]
"""
import os
import re
import sqlite3
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# Belgian admin1 codes (GeoNames uses BRU, VLG, WAL).
# Currently an identity mapping; kept as a dict so any non-GeoNames region
# spelling can be normalised here later without touching callers.
BELGIAN_ADMIN1_MAP = {
    'BRU': 'BRU',  # Brussels Capital Region
    'VLG': 'VLG',  # Flanders (Vlaanderen)
    'WAL': 'WAL',  # Wallonia (Wallonië)
}

# Belgian city name aliases (Dutch/French variants) -> canonical GeoNames name.
# Identity entries (e.g. 'Antwerpen', 'Etterbeek') are redundant with the
# `.get(name, name)` fallback in lookup_city_in_geonames, but keep the table
# self-documenting about which spellings are considered canonical.
BELGIAN_CITY_ALIASES = {
    'Brussel': 'Brussels',
    'Bruxelles': 'Brussels',
    'Antwerpen': 'Antwerpen',
    'Anvers': 'Antwerpen',
    'Gent': 'Gent',
    'Gand': 'Gent',
    'Luik': 'Liège',
    'Liege': 'Liège',
    'Bergen': 'Mons',
    'Namen': 'Namur',
    'Mechelen': 'Mechelen',
    'Malines': 'Mechelen',
    'Leuven': 'Leuven',
    'Louvain': 'Leuven',
    # Brussels-region municipalities: Dutch name -> French/GeoNames name.
    'Elsene': 'Ixelles',
    'Ukkel': 'Uccle',
    'Oudergem': 'Auderghem',
    'Watermaal-Bosvoorde': 'Watermael-Boitsfort',
    'Sint-Gillis': 'Saint-Gilles',
    'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean',
    'Schaarbeek': 'Schaerbeek',
    'Etterbeek': 'Etterbeek',
    'Vorst': 'Forest',
    'Anderlecht': 'Anderlecht',
    'Jette': 'Jette',
    'Koekelberg': 'Koekelberg',
    'Evere': 'Evere',
    'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre',
    'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert',
    'Ganshoren': 'Ganshoren',
}
def load_isil_city_lookup(enriched_file: str) -> dict:
    """Build an ISIL-code -> city-name dict from the enriched Belgian ISIL file."""
    with open(enriched_file, encoding='utf-8') as fh:
        raw = fh.read()

    isil_pat = re.compile(r'^id: (BE-\w+)')
    city_pat = re.compile(r'locations:\s*\n-\s*city:\s*(\S.*)')

    mapping = {}
    # Records begin at lines starting with "id: BE-"; the first chunk from
    # the split is the file header and carries no record.
    for record in re.split(r'\n(?=id: BE-)', raw)[1:]:
        isil = isil_pat.match(record)
        if isil is None:
            continue
        # City lives in the record's locations section, first list item.
        city = city_pat.search(record)
        if city is not None:
            mapping[isil.group(1)] = city.group(1).strip()
    return mapping
def load_isil_source_urls(enriched_file: str) -> dict:
    """Build an ISIL-code -> isil.kbr.be URL dict (used as scraping fallback)."""
    with open(enriched_file, encoding='utf-8') as fh:
        raw = fh.read()

    # Pair up the id and source_url matches per record; keep only records
    # where both are present. Records start at "id: BE-"; chunk 0 is header.
    pairs = (
        (re.search(r'^id: (BE-\w+)', rec),
         re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', rec))
        for rec in re.split(r'\n(?=id: BE-)', raw)[1:]
    )
    return {isil.group(1): url.group(1) for isil, url in pairs if isil and url}
def scrape_city_from_isil_website(url: str) -> str | None:
    """Scrape the city name from a Belgian ISIL registry page.

    Fetches *url*, locates the walk-up address block and returns the city
    parsed from a "Street, POSTCODE City" address (Belgian postal codes are
    4 digits). Best effort: returns None on any failure; errors are printed,
    never raised.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'GLAM-Enricher/1.0'})
        with urllib.request.urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')
        # BUG FIX: the original pattern was a broken (unterminated) string
        # literal with its '<...>' fragments garbled away. Reconstructed
        # intent: after the "Walk up adress" label (sic — the site's own
        # spelling), capture the text content of the next HTML element.
        # TODO(review): confirm against the actual markup on isil.kbr.be.
        address_match = re.search(
            r'Walk up adress.*?<[^>]*>([^<]+)<',
            html, re.DOTALL | re.IGNORECASE)
        if address_match:
            address = address_match.group(1)
            # Parse city from an address like "Veldstraat 53, 9910 Knesselare".
            city_match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', address)
            if city_match:
                city = city_match.group(2).strip()
                # Drop any trailing HTML entity that leaked into the capture.
                city = re.sub(r'&\w+;.*$', '', city).strip()
                return city
        return None
    except Exception as e:
        # Deliberately broad: scraping is a best-effort fallback, so report
        # and continue rather than abort the whole run.
        print(f" Error scraping {url}: {e}")
        return None
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    Rules: one word -> its first three letters; two words -> first letter of
    word one plus first two letters of word two; three or more words -> the
    initials of the first three. Diacritics are stripped first, so e.g.
    'Liège' -> 'LIE'. An empty/unlettered name yields '' (matches the
    original behavior).
    """
    import unicodedata  # local import, mirroring the original's placement

    # Strip diacritics: NFD-decompose, then drop combining marks (category Mn).
    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Keep only letters, whitespace and hyphens, then split on whitespace
    # (hyphenated names like 'Watermael-Boitsfort' count as a single word).
    words = re.sub(r'[^a-zA-Z\s-]', '', ascii_name).split()

    # Idiom fix: guard returns instead of the original's nested if/else.
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
# Columns returned by the Belgian city lookup, in SELECT order.
_GEONAMES_CITY_KEYS = (
    'name', 'ascii_name', 'admin1_code', 'admin1_name',
    'latitude', 'longitude', 'geonames_id', 'population', 'feature_code',
)

# Parameterised lookup template. {op} is filled with '=' or 'LIKE' — both are
# internal constants, never user input — while the city value itself is always
# bound via '?' placeholders.
_GEONAMES_CITY_SQL = '''
    SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude,
           geonames_id, population, feature_code
    FROM cities
    WHERE country_code = 'BE'
    AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
    AND (LOWER(name) {op} LOWER(?) OR LOWER(ascii_name) {op} LOWER(?))
    ORDER BY population DESC
    LIMIT 1
'''


def _query_belgian_city(cursor: sqlite3.Cursor, op: str, value: str) -> dict | None:
    """Run one city lookup; return the most populous match as a dict, or None."""
    cursor.execute(_GEONAMES_CITY_SQL.format(op=op), (value, value))
    row = cursor.fetchone()
    return dict(zip(_GEONAMES_CITY_KEYS, row)) if row else None


def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
    """Look up a Belgian city in the GeoNames database.

    Tries, in order: exact match on the alias-normalised name; exact match on
    the original name (only when an alias was applied); then a prefix LIKE
    match on the original name. Each attempt returns the most populous
    populated place (PPL* feature codes). Returns a column dict or None.

    Refactor: the original repeated the same query and row->dict conversion
    three times (~60 duplicated lines); behavior and attempt order unchanged.
    """
    cursor = conn.cursor()
    # Normalise Dutch/French spelling variants to the canonical GeoNames name.
    normalized_name = BELGIAN_CITY_ALIASES.get(city_name, city_name)

    attempts = [('=', normalized_name)]
    if normalized_name != city_name:
        attempts.append(('=', city_name))
    # Fuzzy fallback deliberately uses the original name, as before.
    attempts.append(('LIKE', f'{city_name}%'))

    for op, value in attempts:
        match = _query_belgian_city(cursor, op, value)
        if match:
            return match
    return None
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, dry_run: bool = False) -> bool:
    """Update a custodian file with city data.

    Rewrites the file's GHCID (format BE-{region}-{city}-{type}-{abbrev}
    [-suffix]) using the resolved region/city codes, replaces the
    location_resolution section, prepends a ghcid_history entry, and renames
    the file to match the new GHCID. Returns True when a change was made
    (or, in dry-run mode, would have been made).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Extract current GHCID
    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        print(f" WARNING: No ghcid_current found in {file_path.name}")
        return False
    old_ghcid = ghcid_match.group(1)
    # Generate new GHCID components
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])
    # Build new GHCID: BE-XX-XXX-{type}-{abbrev}[-suffix]
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        type_code = parts[3]
        # Everything past the type code may itself contain hyphens; rejoin it.
        abbrev_and_suffix = '-'.join(parts[4:])
        new_ghcid = f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    else:
        print(f" WARNING: Unexpected GHCID format: {old_ghcid}")
        return False
    if old_ghcid == new_ghcid:
        # City code already resolved; nothing to change.
        return False
    # Calculate new filename
    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename
    # Update content. NOTE: this replaces EVERY occurrence of the old GHCID
    # in the file (ghcid_current, history, cross-references), not just one key.
    new_content = content.replace(old_ghcid, new_ghcid)
    # Update location_resolution section: the regex matches the key plus all
    # following indented lines, which the replacement below overwrites.
    old_resolution = re.search(
        r'location_resolution:\s*\n((?:\s+\S.*\n)*)',
        new_content
    )
    if old_resolution:
        # NOTE(review): nesting indentation inside this literal appears to
        # have been lost upstream — the child keys should presumably be
        # indented under location_resolution:; confirm the emitted YAML.
        new_resolution = f"""location_resolution:
country_code: BE
region_code: {region_code}
region_name: {geo_data['admin1_name']}
city_code: {city_code}
city_name: {geo_data['name']}
geonames_id: {geo_data['geonames_id']}
feature_code: {geo_data['feature_code']}
latitude: {geo_data['latitude']}
longitude: {geo_data['longitude']}
method: BELGIAN_ISIL_REGISTRY
resolution_date: '{datetime.now(timezone.utc).isoformat()}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]
    # Add GHCID history entry, inserted directly under the ghcid_history key
    # (i.e. newest entry first). Same upstream-indentation caveat as above.
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f""" - ghcid: {new_ghcid}
valid_from: '{timestamp}'
reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code})
"""
    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]
    if dry_run:
        # Report the would-be change without touching the filesystem.
        print(f" DRY RUN: Would rename {old_filename} -> {new_filename}")
        print(f" GHCID: {old_ghcid} -> {new_ghcid}")
        return True
    # Write updated content
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)
    # Rename file only when the name actually changed.
    if new_file_path != file_path:
        file_path.rename(new_file_path)
    return True
def main():
    """CLI entry point: enrich BE-*-XXX-* custodian files with city data.

    Pass --dry-run on the command line to preview changes without touching
    any files (the summary report is still written).
    """
    dry_run = '--dry-run' in sys.argv

    # Paths — the script is expected to live in <repo>/scripts/.
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment Script")
    print("=" * 50)
    if dry_run:
        print("DRY RUN MODE - No changes will be made")

    # Load lookups from the enriched source file (fast path + scrape fallback).
    print(f"\nLoading ISIL city lookup from {enriched_file.name}...")
    isil_city_lookup = load_isil_city_lookup(str(enriched_file))
    isil_url_lookup = load_isil_source_urls(str(enriched_file))
    print(f" Found {len(isil_city_lookup)} ISIL codes with city data")
    print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs")

    # Connect to GeoNames
    print("\nConnecting to GeoNames database...")
    conn = sqlite3.connect(str(geonames_db))

    # Find Belgian files whose GHCID still carries the XXX city placeholder.
    print("\nFinding Belgian custodian files with XXX placeholder...")
    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files to process")

    # Counters for the summary/report.
    updated = 0
    no_isil = 0
    no_city = 0
    no_geonames = 0
    scraped = 0
    errors = 0
    not_found_cities = []

    # Fix: ensure the connection is closed even if the loop raises.
    try:
        for file_path in xxx_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Find ISIL code
                isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content)
                if not isil_match:
                    no_isil += 1
                    continue
                isil_code = isil_match.group(1)
                # Strategy 1: look the city up in the enriched file (fast).
                city_name = isil_city_lookup.get(isil_code)
                # Strategy 2: scrape the ISIL website (slow, rate-limited).
                if not city_name and isil_code in isil_url_lookup:
                    url = isil_url_lookup[isil_code]
                    print(f" Scraping {isil_code} from {url}...")
                    city_name = scrape_city_from_isil_website(url)
                    if city_name:
                        scraped += 1
                        print(f" Found: {city_name}")
                    time.sleep(1)  # Rate limit: 1 request/second.
                if not city_name:
                    no_city += 1
                    continue
                # Resolve against GeoNames.
                geo_data = lookup_city_in_geonames(city_name, conn)
                if not geo_data:
                    no_geonames += 1
                    not_found_cities.append((file_path.name, isil_code, city_name))
                    continue
                # Update the file (dry-run is handled inside).
                if update_custodian_file(file_path, city_name, geo_data, dry_run):
                    updated += 1
                    if not dry_run:
                        print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})")
            except Exception as e:
                # Per-file failures are counted and reported; the run continues.
                errors += 1
                print(f" ERROR processing {file_path.name}: {e}")
    finally:
        conn.close()

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f"Scraped from website: {scraped}")
    print(f"No ISIL in file: {no_isil}")
    print(f"No city found: {no_city}")
    print(f"City not in GeoNames: {no_geonames}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    if not_found_cities:
        print("\nCities not found in GeoNames:")
        for fname, isil, city in not_found_cities[:20]:
            print(f" {isil}: {city}")
        if len(not_found_cities) > 20:
            print(f" ... and {len(not_found_cities) - 20} more")

    # Generate report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md'
    # Fix: create the reports directory if missing, and write UTF-8 explicitly
    # (city names contain accented characters).
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Belgian City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write("## Summary\n\n")
        f.write("| Metric | Count |\n")
        f.write("|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| Scraped from website | {scraped} |\n")
        f.write(f"| No ISIL in file | {no_isil} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| City not in GeoNames | {no_geonames} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")
        if not_found_cities:
            f.write("\n## Cities Not Found in GeoNames\n\n")
            f.write("| File | ISIL | City |\n")
            f.write("|------|------|------|\n")
            for fname, isil, city in not_found_cities:
                f.write(f"| {fname} | {isil} | {city} |\n")
    print(f"\nReport written to: {report_path}")


if __name__ == '__main__':
    main()