glam/scripts/enrich_belgian_v2.py
kempersc e45c1a3c85 feat(scripts): add city enrichment and location resolution utilities
Enrichment scripts for country-specific city data:
- enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py
- enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py
- enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py

Location resolution utilities:
- resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames
- resolve_cities_wikidata.py - Use Wikidata P131 for city resolution
- resolve_country_codes.py - Standardize country codes
- resolve_cz_xx_regions.py - Fix Czech XX region codes
- resolve_locations_by_name.py - Name-based location lookup
- resolve_regions_from_city.py - Derive regions from city data
- update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data

CH-Annotator integration:
- create_custodian_from_ch_annotator.py - Create custodians from annotations
- add_ch_annotator_location_claims.py - Add location claims
- extract_locations_ch_annotator.py - Extract locations from annotations

Migration and fixes:
- migrate_egyptian_from_ch.py - Migrate Egyptian data
- migrate_web_archives.py - Migrate web archive data
- fix_belgian_cities.py - Fix Belgian city data
2025-12-07 14:26:59 +01:00

185 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Belgian city enrichment v2 - with city name aliases.
"""
import re
import sqlite3
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
# Belgian city aliases (Dutch names → GeoNames names)
BELGIAN_CITY_ALIASES = {
'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
'sint-stevens-woluwe': 'Sint-Stevens-Woluwe',
'oostende': 'Ostend',
'gent': 'Gent',
'brugge': 'Brugge',
'brussel': 'Brussels',
'antwerpen': 'Antwerpen',
'luik': 'Liège',
'liège': 'Liège',
'leuven': 'Leuven',
'mechelen': 'Mechelen',
'aalst': 'Aalst',
'hasselt': 'Hasselt',
'kortrijk': 'Kortrijk',
'sint-niklaas': 'Sint-Niklaas',
'genk': 'Genk',
'roeselare': 'Roeselare',
# Merged municipalities (2019)
'kluisbergen': 'Kluisbergen',
'lievegem': 'Nevele', # Lievegem was created from Nevele, Waarschoot, Zomergem, Lovendegem
'kruisem': 'Kruishoutem', # Kruisem was created from Kruishoutem and Zingem
'lierde': 'Sint-Maria-Lierde',
'maarkedal': 'Etikhove', # Maarkedal includes Etikhove
# Other
'de haan': 'De Haan',
'lint': 'Lint',
'herne': 'Herne',
}
# Translation table from the GeoNames admin1 (region) name to the
# three-letter ISO 3166-2:BE region code.
BELGIAN_ADMIN1_MAP = {
    # Both spellings below resolve to the same region code.
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}
def normalize_city_name(name):
    """Return a lookup key for *name*: lower-cased, accent-free and trimmed.

    A falsy *name* (``None`` or the empty string) yields ``None``.
    """
    if name:
        # Decompose so that accents become separate combining marks ('Mn'),
        # which can then be filtered out character by character.
        decomposed = unicodedata.normalize('NFD', name.lower())
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
        return ''.join(base_chars).strip()
    return None
def lookup_city(city_name, conn):
    """Look up a Belgian city in the GeoNames ``cities`` table, with alias support.

    The name is first translated through ``BELGIAN_CITY_ALIASES``. The
    resulting name is matched case-insensitively against ``name`` and
    ``ascii_name``; if nothing matches, it is retried as a substring match.
    When several rows match, the one with the largest population is returned.

    Args:
        city_name: City name as found in the source data.
        conn: Open ``sqlite3`` connection to the GeoNames database.

    Returns:
        A dict with the keys ``name``, ``ascii_name``, ``admin1_name``,
        ``latitude``, ``longitude``, ``geonames_id`` and ``population``, or
        ``None`` when the name is empty/blank or no row matches.
    """
    if not city_name:
        return None

    normalized = normalize_city_name(city_name)
    if not normalized:
        # Whitespace-only input: there is nothing meaningful to search for.
        return None

    # Alias keys may be stored with or without diacritics, so probe both the
    # accent-free form and the plain lower-cased form before falling back to
    # the name as given.
    lookup_name = city_name.strip()
    for alias_key in (normalized, lookup_name.lower()):
        if alias_key in BELGIAN_CITY_ALIASES:
            lookup_name = BELGIAN_CITY_ALIASES[alias_key]
            break

    fields = ('name', 'ascii_name', 'admin1_name', 'latitude', 'longitude',
              'geonames_id', 'population')

    exact_sql = """
        SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population
        FROM cities
        WHERE country_code='BE' AND (LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?))
        ORDER BY population DESC LIMIT 1
    """
    partial_sql = """
        SELECT name, ascii_name, admin1_name, latitude, longitude, geonames_id, population
        FROM cities
        WHERE country_code='BE'
          AND (name LIKE ? ESCAPE '\\' OR ascii_name LIKE ? ESCAPE '\\')
        ORDER BY population DESC LIMIT 1
    """

    # '%' and '_' are LIKE wildcards; escape them so a name containing either
    # is matched literally instead of widening the search.
    escaped = (lookup_name.replace('\\', '\\\\')
               .replace('%', '\\%')
               .replace('_', '\\_'))
    attempts = (
        (exact_sql, lookup_name),       # 1. exact, case-insensitive
        (partial_sql, f"%{escaped}%"),  # 2. substring fallback
    )

    cursor = conn.cursor()
    try:
        for sql, param in attempts:
            cursor.execute(sql, (param, param))
            row = cursor.fetchone()
            if row is not None:
                return dict(zip(fields, row))
    finally:
        cursor.close()
    return None
def generate_city_code(city_name):
    """Generate an upper-case city code of up to three letters from a city name.

    Diacritics are stripped and every character other than ASCII letters,
    whitespace and hyphens is dropped before the code is derived:

    * one word: its first three letters (hyphenated names count as one word);
    * leading article (de, het, le, la, ...): the article's initial plus the
      first two letters of the following word;
    * otherwise: the initials of the first three words.

    Args:
        city_name: City name, possibly accented.

    Returns:
        The city code, or ``'XXX'`` (the unknown-city placeholder used
        elsewhere in this script) when the name is empty or contains no
        usable letters.
    """
    if not city_name:
        return 'XXX'

    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    words = re.sub(r'[^a-zA-Z\s-]', '', ascii_name).split()
    if not words:
        # Nothing left after cleaning (e.g. digits only).
        return 'XXX'

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}
    if len(words) == 1:
        # Slice the word itself, not the cleaned string, so surrounding
        # whitespace cannot leak into the code.
        return words[0][:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
def main():
    """Report which Belgian 'XXX' custodian files can be resolved to a GeoNames city.

    Scans ``data/custodian/BE-*-XXX-*.yaml`` (relative to the repository root,
    one level above this script's directory), pulls the first ``city:`` /
    ``city_name:`` value out of each file and looks it up in
    ``data/reference/geonames.db``.

    This is a dry run: matches are counted and printed, but no file is
    modified. Up to 20 unresolved city names are listed at the end.
    """
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment v2")
    print("=" * 50)

    # sqlite3.connect() would silently create an empty database at a missing
    # path and only fail later with "no such table"; stop early instead.
    if not geonames_db.is_file():
        print(f"GeoNames database not found: {geonames_db}")
        return

    city_pattern = re.compile(r'city(?:_name)?:\s*([^\n]+)')

    # Find Belgian XXX files (sorted so the report is reproducible).
    xxx_files = sorted(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} Belgian XXX files")

    updated = 0
    not_found = []

    conn = sqlite3.connect(str(geonames_db))
    try:
        for file_path in xxx_files:
            content = file_path.read_text(encoding='utf-8')

            # Check if there's city info in the file already.
            city_match = city_pattern.search(content)
            if not city_match:
                continue

            city_name = city_match.group(1).strip().strip('"\'')
            if not city_name or city_name == 'XXX':
                continue

            geo_data = lookup_city(city_name, conn)
            if geo_data:
                print(f"{file_path.name}: {city_name} → {geo_data['name']}")
                # Counts resolvable files; the file itself is not rewritten yet.
                updated += 1
            else:
                not_found.append((file_path.name, city_name))
    finally:
        # Release the database handle even if a file or query fails.
        conn.close()

    print(f"\nUpdated: {updated}")
    print(f"Not found: {len(not_found)}")
    if not_found:
        print("\nCities not found:")
        for fname, city in not_found[:20]:
            print(f" {fname}: {city}")
# Script entry point: run the enrichment report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()