glam/scripts/enrich_belgian_v2.py

#!/usr/bin/env python3
"""
Belgian city enrichment v2 - with city name aliases.
"""

import re
import sqlite3
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# Belgian city aliases (Dutch names → GeoNames names).
#
# Keys must be written the way normalize_city_name() produces them: lowercase
# with diacritics stripped. lookup_city() probes this dict with the normalized
# form only, so a key containing an accent can never match.
BELGIAN_CITY_ALIASES = {
    'sint-lambrechts-woluwe': 'Woluwe-Saint-Lambert',
    'sint-pieters-woluwe': 'Woluwe-Saint-Pierre',
    'sint-stevens-woluwe': 'Sint-Stevens-Woluwe',
    'oostende': 'Ostend',
    'gent': 'Gent',
    'brugge': 'Brugge',
    'brussel': 'Brussels',
    'antwerpen': 'Antwerpen',
    'luik': 'Liège',
    'liege': 'Liège',  # normalized form of "Liège"; this is the key that actually matches
    'liège': 'Liège',  # unreachable via normalized lookups; kept for direct dict access
    'leuven': 'Leuven',
    'mechelen': 'Mechelen',
    'aalst': 'Aalst',
    'hasselt': 'Hasselt',
    'kortrijk': 'Kortrijk',
    'sint-niklaas': 'Sint-Niklaas',
    'genk': 'Genk',
    'roeselare': 'Roeselare',
    # Merged municipalities (1977 and 2019 fusions), mapped to a constituent
    # locality because the merged name itself may not be listed as a place.
    'kluisbergen': 'Kluisbergen',
    # Lievegem was created in 2019 from Lovendegem, Waarschoot and Zomergem.
    # (Nevele, used here previously, merged into Deinze instead.)
    'lievegem': 'Lovendegem',
    'kruisem': 'Kruishoutem',  # Kruisem was created from Kruishoutem and Zingem
    'lierde': 'Sint-Maria-Lierde',
    'maarkedal': 'Etikhove',  # Maarkedal includes Etikhove
    # Other
    'de haan': 'De Haan',
    'lint': 'Lint',
    'herne': 'Herne',
}

# Belgian admin1 mapping (GeoNames admin1 names → ISO 3166-2:BE region codes,
# written without the "BE-" country prefix).
# 'Brussels Capital' and 'Brussels' both map to BRU — presumably to cover two
# spellings of the region name in the GeoNames data; verify against the database.
# NOTE(review): nothing in the code visible here reads this mapping.
BELGIAN_ADMIN1_MAP = {
    'Brussels Capital': 'BRU',
    'Brussels': 'BRU',
    'Flanders': 'VLG',
    'Wallonia': 'WAL',
}

def normalize_city_name(name):
    """Return the lookup key for a city name, or None for an empty/missing name.

    The key is the lowercased name with combining marks (Unicode category
    ``Mn``) removed after NFD decomposition and surrounding whitespace
    stripped, e.g. ``"Liège"`` becomes ``"liege"``.
    """
    if not name:
        return None
    decomposed = unicodedata.normalize('NFD', name.lower())
    # After NFD, an accented letter is a base letter followed by its marks;
    # keeping everything except the marks leaves the bare base letters.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip()

# Columns selected from the ``cities`` table, in result-row order. The same
# names are used as the keys of the dict returned by lookup_city().
_CITY_COLUMNS = (
    'name',
    'ascii_name',
    'admin1_name',
    'latitude',
    'longitude',
    'geonames_id',
    'population',
)


def _row_to_city(row):
    """Map a ``cities`` result row onto a dict keyed by column name."""
    return dict(zip(_CITY_COLUMNS, row))


def _escape_like(text):
    """Escape LIKE wildcards in *text* for use with ``ESCAPE '\\'``."""
    return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def lookup_city(city_name, conn):
    """Look up a Belgian city in GeoNames with alias support.

    The name is first translated through BELGIAN_CITY_ALIASES (keyed by its
    normalized form); otherwise it is used as given, stripped of surrounding
    whitespace. Two queries are tried against the ``cities`` table,
    restricted to ``country_code='BE'``:

    1. a case-insensitive exact match on ``name`` or ``ascii_name``;
    2. a substring (``LIKE``) match on ``name`` or ``ascii_name``.

    When several rows match, the one with the largest population wins.

    Args:
        city_name: City name as found in the source data.
        conn: Open sqlite3 connection to the GeoNames database.

    Returns:
        A dict with keys ``name``, ``ascii_name``, ``admin1_name``,
        ``latitude``, ``longitude``, ``geonames_id`` and ``population``,
        or None when the name is empty/blank or nothing matches.
    """
    if not city_name:
        return None

    normalized = normalize_city_name(city_name)
    lookup_name = BELGIAN_CITY_ALIASES.get(normalized, city_name).strip()
    if not lookup_name:
        # A blank name would turn the substring pattern into '%%', which
        # matches every city; treat it as "not found" instead.
        return None

    # SQLite's LOWER() only folds ASCII letters, so input such as "LIÈGE"
    # matches neither LOWER(name) nor LOWER(ascii_name). Comparing the
    # lowercased, diacritic-free key against ascii_name covers that case.
    ascii_key = normalize_city_name(lookup_name)

    select_from = (
        f"SELECT {', '.join(_CITY_COLUMNS)} FROM cities WHERE country_code='BE'"
    )
    most_populous = "ORDER BY population DESC LIMIT 1"

    cursor = conn.cursor()
    try:
        # Try exact match
        cursor.execute(
            f"{select_from} AND (LOWER(name)=LOWER(?) OR LOWER(ascii_name)=LOWER(?)"
            f" OR LOWER(ascii_name)=?) {most_populous}",
            (lookup_name, lookup_name, ascii_key),
        )
        row = cursor.fetchone()
        if row:
            return _row_to_city(row)

        # Try partial match. '%' and '_' inside the name are escaped so they
        # are matched literally rather than acting as wildcards.
        pattern = f"%{_escape_like(lookup_name)}%"
        cursor.execute(
            f"{select_from} AND (name LIKE ? ESCAPE '\\' OR ascii_name LIKE ? ESCAPE '\\')"
            f" {most_populous}",
            (pattern, pattern),
        )
        row = cursor.fetchone()
        if row:
            return _row_to_city(row)
    finally:
        cursor.close()

    return None

def generate_city_code(city_name):
    """Generate a 3-letter city code from a city name.

    Diacritics are stripped and every character other than ASCII letters,
    whitespace and hyphens is dropped; the remainder is split on whitespace.

    * one word: its first three characters (``"Gent"`` → ``"GEN"``);
    * leading article (de, het, le, la, les, den, der, des): the article's
      first letter plus the first two letters of the next word
      (``"De Haan"`` → ``"DHA"``);
    * otherwise: the initials of the first three words.

    Args:
        city_name: City name, possibly accented or padded with whitespace.

    Returns:
        The upper-cased code. It can be shorter than three characters for
        very short names. ``"XXX"`` is returned when the name is empty or
        has no alphabetic words — the same placeholder this script's file
        glob and city check treat as "unknown".
    """
    if not city_name:
        return 'XXX'

    decomposed = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    words = re.sub(r'[^a-zA-Z\s-]', '', ascii_name).split()
    if not words:
        # Nothing usable left (e.g. digits or punctuation only); indexing
        # words[0] below would raise IndexError.
        return 'XXX'

    articles = {'de', 'het', 'le', 'la', 'les', 'den', 'der', 'des'}

    if len(words) == 1:
        # Slice the word itself, not the unsplit string, so leading
        # whitespace in the input cannot leak into the code.
        return words[0][:3].upper()
    if words[0].lower() in articles:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()

def main():
    """Report which Belgian "XXX" custodian files have a city GeoNames can resolve.

    Scans ``data/custodian/BE-*-XXX-*.yaml`` (relative to the directory above
    this script's directory), takes the first ``city:`` / ``city_name:`` value
    found in each file and looks it up with lookup_city(). This is a dry run:
    no file is modified; only a summary is printed.

    Raises:
        SystemExit: If the GeoNames database file does not exist.
    """
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment v2")
    print("=" * 50)

    # sqlite3.connect() silently creates an empty database when the path does
    # not exist, which would only surface later as "no such table: cities".
    if not geonames_db.is_file():
        raise SystemExit(f"GeoNames database not found: {geonames_db}")

    # [ \t]* instead of \s*: \s also matches newlines, so an empty "city:" key
    # would otherwise capture the following line as the city name.
    city_pattern = re.compile(r'city(?:_name)?:[ \t]*([^\n]+)')

    # Find Belgian XXX files (sorted so the report order is reproducible)
    xxx_files = sorted(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f"Found {len(xxx_files)} Belgian XXX files")

    # Counts files whose city resolved; nothing is written back yet.
    updated = 0
    not_found = []

    conn = sqlite3.connect(str(geonames_db))
    try:
        for file_path in xxx_files:
            content = file_path.read_text(encoding='utf-8')

            # Check if there's city info in the file already
            city_match = city_pattern.search(content)
            if not city_match:
                continue

            city_name = city_match.group(1).strip().strip('"\'')
            if not city_name or city_name == 'XXX':
                continue

            geo_data = lookup_city(city_name, conn)
            if geo_data:
                print(f"✓ {file_path.name}: {city_name} → {geo_data['name']}")
                updated += 1
                # Would update file here
            else:
                not_found.append((file_path.name, city_name))
    finally:
        # Release the database handle even if a read or lookup fails.
        conn.close()

    print(f"\nUpdated: {updated}")
    print(f"Not found: {len(not_found)}")
    if not_found:
        print("\nCities not found:")
        for fname, city in not_found[:20]:
            print(f"  {fname}: {city}")

if __name__ == '__main__':
    main()