Enrichment scripts for country-specific city data: - enrich_austrian_cities.py, enrich_belgian_cities.py, enrich_belgian_v2.py - enrich_bulgarian_cities.py, enrich_czech_cities.py, enrich_czech_cities_fast.py - enrich_japanese_cities.py, enrich_swiss_isil_cities.py, enrich_cities_google.py Location resolution utilities: - resolve_cities_from_file_coords.py - Resolve cities using coordinates in filenames - resolve_cities_wikidata.py - Use Wikidata P131 for city resolution - resolve_country_codes.py - Standardize country codes - resolve_cz_xx_regions.py - Fix Czech XX region codes - resolve_locations_by_name.py - Name-based location lookup - resolve_regions_from_city.py - Derive regions from city data - update_ghcid_with_geonames.py - Update GHCIDs with GeoNames data CH-Annotator integration: - create_custodian_from_ch_annotator.py - Create custodians from annotations - add_ch_annotator_location_claims.py - Add location claims - extract_locations_ch_annotator.py - Extract locations from annotations Migration and fixes: - migrate_egyptian_from_ch.py - Migrate Egyptian data - migrate_web_archives.py - Migrate web archive data - fix_belgian_cities.py - Fix Belgian city data
465 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Belgian custodian files with city data from ISIL registry.
|
|
|
|
Strategy:
|
|
1. First try to get city from enriched source file (fast)
|
|
2. If not found, scrape the Belgian ISIL website (slow, 1 req/sec)
|
|
|
|
Usage:
|
|
python scripts/enrich_belgian_cities.py [--dry-run]
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Belgian admin1 codes (GeoNames uses BRU, VLG, WAL)
# Currently an identity mapping; kept as a dict so non-GeoNames region codes
# could be translated here later without touching call sites.
BELGIAN_ADMIN1_MAP = {
    'BRU': 'BRU',  # Brussels Capital Region
    'VLG': 'VLG',  # Flanders (Vlaanderen)
    'WAL': 'WAL',  # Wallonia (Wallonië)
}

# Belgian city name aliases (Dutch/French variants)
# Maps spellings seen in the ISIL registry to the name looked up in the local
# GeoNames database (see lookup_city_in_geonames); identity entries pin the
# preferred canonical spelling.
BELGIAN_CITY_ALIASES = {
    'Brussel': 'Brussels',
    'Bruxelles': 'Brussels',
    'Antwerpen': 'Antwerpen',
    'Anvers': 'Antwerpen',
    'Gent': 'Gent',
    'Gand': 'Gent',
    'Luik': 'Liège',
    'Liege': 'Liège',
    'Bergen': 'Mons',
    'Namen': 'Namur',
    'Mechelen': 'Mechelen',
    'Malines': 'Mechelen',
    'Leuven': 'Leuven',
    'Louvain': 'Leuven',
    'Elsene': 'Ixelles',
    'Ukkel': 'Uccle',
    'Oudergem': 'Auderghem',
    'Watermaal-Bosvoorde': 'Watermael-Boitsfort',
    'Sint-Gillis': 'Saint-Gilles',
    'Sint-Jans-Molenbeek': 'Molenbeek-Saint-Jean',
    'Schaarbeek': 'Schaerbeek',
    'Etterbeek': 'Etterbeek',
    'Vorst': 'Forest',
    'Anderlecht': 'Anderlecht',
    'Jette': 'Jette',
    'Koekelberg': 'Koekelberg',
    'Evere': 'Evere',
    'Sint-Pieters-Woluwe': 'Woluwe-Saint-Pierre',
    'Sint-Lambrechts-Woluwe': 'Woluwe-Saint-Lambert',
    'Ganshoren': 'Ganshoren',
}
|
|
|
|
|
|
def load_isil_city_lookup(enriched_file: str) -> dict:
    """Load ISIL -> city mapping from enriched Belgian ISIL file.

    Records in the enriched dump each begin with an ``id: BE-...`` line;
    the text is split on those boundaries and the first city listed under a
    record's ``locations:`` section is kept.
    """
    with open(enriched_file, 'r', encoding='utf-8') as fh:
        text = fh.read()

    id_pattern = re.compile(r'^id: (BE-\w+)')
    city_pattern = re.compile(r'locations:\s*\n-\s*city:\s*(\S.*)')

    mapping = {}
    # Whatever precedes the first "id: BE-" line is a header chunk; drop it.
    for record in re.split(r'\n(?=id: BE-)', text)[1:]:
        id_found = id_pattern.search(record)
        if id_found is None:
            continue
        city_found = city_pattern.search(record)
        if city_found is not None:
            mapping[id_found.group(1)] = city_found.group(1).strip()

    return mapping
|
|
|
|
|
|
def load_isil_source_urls(enriched_file: str) -> dict:
    """Load ISIL -> source_url mapping for web scraping fallback.

    Only records that carry both an ``id: BE-...`` line and an
    ``source_url: https://isil.kbr.be/...`` line contribute an entry.
    """
    with open(enriched_file, 'r', encoding='utf-8') as fh:
        records = re.split(r'\n(?=id: BE-)', fh.read())[1:]  # drop header chunk

    matched = (
        (re.search(r'^id: (BE-\w+)', rec),
         re.search(r'source_url:\s*(https://isil\.kbr\.be/\S+)', rec))
        for rec in records
    )
    return {i.group(1): u.group(1) for i, u in matched if i and u}
|
|
|
|
|
|
def scrape_city_from_isil_website(url: str) -> str | None:
    """Scrape city from Belgian ISIL website.

    Fetches *url*, locates the address table cell after "Walk up adress"
    (NOTE(review): "adress" is presumably the site's own spelling — confirm
    against the live markup before "fixing" it), then pulls the city out of
    a ", <4-digit postcode> City" tail.  Returns the city name, or None when
    no address parses or the request fails; errors are printed, not raised.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'GLAM-Enricher/1.0'})
        with urllib.request.urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')

        # Look for address pattern: "Street, POSTCODE City"
        # Belgian postal codes are 4 digits
        address_match = re.search(r'Walk up adress.*?<td class="output"[^>]*>([^<]+)</td>', html, re.DOTALL | re.IGNORECASE)
        if address_match:
            address = address_match.group(1)
            # Parse city from address: "Veldstraat 53, 9910 Knesselare"
            city_match = re.search(r',\s*(\d{4})\s+([A-Za-zÀ-ÿ\s\-\']+)', address)
            if city_match:
                city = city_match.group(2).strip()
                # Clean up trailing HTML entities
                city = re.sub(r'&\w+;.*$', '', city).strip()
                return city

        return None
    except Exception as e:
        # Best-effort scraper: report and move on so one bad page does not
        # abort the whole enrichment run.
        print(f" Error scraping {url}: {e}")
        return None
|
|
|
|
|
|
def generate_city_code(city_name: str) -> str:
    """Generate a 3-letter uppercase city code from a city name.

    The name is transliterated to ASCII (NFD decomposition, combining marks
    dropped), reduced to letters/whitespace/hyphens, and split on whitespace:

    * one word  -> first three letters            ("Gent" -> "GEN")
    * two words -> 1st initial + 2 of the 2nd     ("La Louviere" -> "LLO")
    * 3+ words  -> initials of the first three    ("Braine le Comte" -> "BLC")

    Returns ``'XXX'`` (the placeholder code) when no letters remain after
    cleaning — previously this raised IndexError on empty/numeric input.
    """
    import unicodedata

    # Strip diacritics: decompose, then drop combining marks (category 'Mn').
    normalized = unicodedata.normalize('NFD', city_name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

    # Keep only letters, whitespace and hyphens before splitting into words.
    clean = re.sub(r'[^a-zA-Z\s-]', '', ascii_name)
    words = clean.split()

    if not words:
        # Nothing usable in the name; fall back to the XXX placeholder.
        return 'XXX'
    if len(words) == 1:
        return words[0][:3].upper()
    if len(words) == 2:
        return (words[0][0] + words[1][:2]).upper()
    return ''.join(w[0] for w in words[:3]).upper()
|
|
|
|
|
|
def lookup_city_in_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
|
|
"""Look up city in GeoNames database."""
|
|
cursor = conn.cursor()
|
|
|
|
# Check aliases first
|
|
normalized_name = BELGIAN_CITY_ALIASES.get(city_name, city_name)
|
|
|
|
# Try exact match first
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (normalized_name, normalized_name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
# Try original name if alias was used
|
|
if normalized_name != city_name:
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (city_name, city_name))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
# Try fuzzy match with LIKE
|
|
cursor.execute('''
|
|
SELECT name, ascii_name, admin1_code, admin1_name, latitude, longitude, geonames_id, population, feature_code
|
|
FROM cities
|
|
WHERE country_code = 'BE'
|
|
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
|
|
AND (LOWER(name) LIKE LOWER(?) OR LOWER(ascii_name) LIKE LOWER(?))
|
|
ORDER BY population DESC
|
|
LIMIT 1
|
|
''', (f'{city_name}%', f'{city_name}%'))
|
|
|
|
row = cursor.fetchone()
|
|
if row:
|
|
return {
|
|
'name': row[0],
|
|
'ascii_name': row[1],
|
|
'admin1_code': row[2],
|
|
'admin1_name': row[3],
|
|
'latitude': row[4],
|
|
'longitude': row[5],
|
|
'geonames_id': row[6],
|
|
'population': row[7],
|
|
'feature_code': row[8],
|
|
}
|
|
|
|
return None
|
|
|
|
|
|
def update_custodian_file(file_path: Path, city_name: str, geo_data: dict, dry_run: bool = False) -> bool:
    """Rewrite one custodian YAML file with resolved city data.

    Builds a new GHCID from the GeoNames result (region code + generated
    3-letter city code), replaces every occurrence of the old GHCID in the
    file text, rewrites the location_resolution block, inserts a new
    ghcid_history entry directly under the ``ghcid_history:`` key (newest
    first), and renames the file to match the new GHCID.

    Returns True when the file was (or, in dry-run mode, would be) changed;
    False when the file is malformed or the GHCID is already correct.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract current GHCID
    ghcid_match = re.search(r'ghcid_current:\s*(\S+)', content)
    if not ghcid_match:
        print(f" WARNING: No ghcid_current found in {file_path.name}")
        return False

    old_ghcid = ghcid_match.group(1)

    # Generate new GHCID components
    region_code = BELGIAN_ADMIN1_MAP.get(geo_data['admin1_code'], geo_data['admin1_code'])
    city_code = generate_city_code(geo_data['ascii_name'] or geo_data['name'])

    # Build new GHCID: BE-XX-XXX-{type}-{abbrev}[-suffix]
    parts = old_ghcid.split('-')
    if len(parts) >= 5:
        type_code = parts[3]
        # The abbrev/suffix tail may itself contain hyphens, so rejoin it.
        abbrev_and_suffix = '-'.join(parts[4:])
        new_ghcid = f"BE-{region_code}-{city_code}-{type_code}-{abbrev_and_suffix}"
    else:
        print(f" WARNING: Unexpected GHCID format: {old_ghcid}")
        return False

    if old_ghcid == new_ghcid:
        # City/region already resolved to the same code; nothing to do.
        return False

    # Calculate new filename
    old_filename = file_path.name
    new_filename = old_filename.replace(old_ghcid, new_ghcid)
    new_file_path = file_path.parent / new_filename

    # Update content (replaces *every* occurrence of the old GHCID,
    # including references inside existing history entries).
    new_content = content.replace(old_ghcid, new_ghcid)

    # Update location_resolution section: the regex captures the key line
    # plus all of its indented child lines, which are replaced wholesale.
    old_resolution = re.search(
        r'location_resolution:\s*\n((?:\s+\S.*\n)*)',
        new_content
    )

    if old_resolution:
        # NOTE(review): the indentation inside this YAML template was
        # reconstructed from a whitespace-mangled copy of the source —
        # confirm it matches the custodian files' actual layout.
        new_resolution = f"""location_resolution:
  country_code: BE
  region_code: {region_code}
  region_name: {geo_data['admin1_name']}
  city_code: {city_code}
  city_name: {geo_data['name']}
  geonames_id: {geo_data['geonames_id']}
  feature_code: {geo_data['feature_code']}
  latitude: {geo_data['latitude']}
  longitude: {geo_data['longitude']}
  method: BELGIAN_ISIL_REGISTRY
  resolution_date: '{datetime.now(timezone.utc).isoformat()}'
"""
        new_content = new_content[:old_resolution.start()] + new_resolution + new_content[old_resolution.end():]

    # Add GHCID history entry
    timestamp = datetime.now(timezone.utc).isoformat()
    history_entry = f""" - ghcid: {new_ghcid}
   valid_from: '{timestamp}'
   reason: City enrichment from Belgian ISIL registry - {city_name} resolved to {geo_data['name']} ({region_code})
"""

    history_match = re.search(r'ghcid_history:\s*\n', new_content)
    if history_match:
        insert_pos = history_match.end()
        new_content = new_content[:insert_pos] + history_entry + new_content[insert_pos:]

    if dry_run:
        print(f" DRY RUN: Would rename {old_filename} -> {new_filename}")
        print(f" GHCID: {old_ghcid} -> {new_ghcid}")
        return True

    # Write updated content
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    # Rename file
    if new_file_path != file_path:
        file_path.rename(new_file_path)

    return True
|
|
|
|
|
|
def main():
    """Enrich Belgian custodian files that still carry the XXX city placeholder.

    For each ``BE-*-XXX-*.yaml`` file under data/custodian: read its ISIL
    code, resolve the city via the enriched ISIL dump (falling back to
    scraping the Belgian ISIL website at <= 1 request/sec), look the city up
    in the local GeoNames SQLite database, then rewrite and rename the file.
    A markdown report is written to reports/ at the end (in dry-run mode too,
    flagged as such).  Pass ``--dry-run`` to preview changes.
    """
    dry_run = '--dry-run' in sys.argv

    # Paths (repository layout: this script lives in <repo>/scripts/)
    base_dir = Path(__file__).parent.parent
    custodian_dir = base_dir / 'data' / 'custodian'
    enriched_file = base_dir / 'data' / 'instances' / 'belgium_isil_institutions_enriched.yaml'
    geonames_db = base_dir / 'data' / 'reference' / 'geonames.db'

    print("Belgian City Enrichment Script")
    print("=" * 50)

    if dry_run:
        print("DRY RUN MODE - No changes will be made")

    # Load lookups
    print(f"\nLoading ISIL city lookup from {enriched_file.name}...")
    isil_city_lookup = load_isil_city_lookup(str(enriched_file))
    isil_url_lookup = load_isil_source_urls(str(enriched_file))
    print(f" Found {len(isil_city_lookup)} ISIL codes with city data")
    print(f" Found {len(isil_url_lookup)} ISIL codes with source URLs")

    # Connect to GeoNames
    print(f"\nConnecting to GeoNames database...")
    conn = sqlite3.connect(str(geonames_db))

    # Find Belgian XXX files
    print(f"\nFinding Belgian custodian files with XXX placeholder...")
    xxx_files = list(custodian_dir.glob('BE-*-XXX-*.yaml'))
    print(f" Found {len(xxx_files)} files to process")

    # Counters for the summary / report.
    updated = 0
    no_isil = 0
    no_city = 0
    no_geonames = 0
    scraped = 0
    errors = 0
    not_found_cities = []

    try:
        for file_path in xxx_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Find ISIL code
                isil_match = re.search(r'identifier_value:\s*(BE-\w+)', content)
                if not isil_match:
                    no_isil += 1
                    continue

                isil_code = isil_match.group(1)

                # Strategy 1: Look up city from enriched file (fast path)
                city_name = isil_city_lookup.get(isil_code)

                # Strategy 2: Scrape from website if not in lookup
                if not city_name and isil_code in isil_url_lookup:
                    url = isil_url_lookup[isil_code]
                    print(f" Scraping {isil_code} from {url}...")
                    city_name = scrape_city_from_isil_website(url)
                    if city_name:
                        scraped += 1
                        print(f" Found: {city_name}")
                    time.sleep(1)  # Rate limit: max 1 request/sec to the registry

                if not city_name:
                    no_city += 1
                    continue

                # Look up in GeoNames
                geo_data = lookup_city_in_geonames(city_name, conn)
                if not geo_data:
                    no_geonames += 1
                    not_found_cities.append((file_path.name, isil_code, city_name))
                    continue

                # Update file
                if update_custodian_file(file_path, city_name, geo_data, dry_run):
                    updated += 1
                    if not dry_run:
                        print(f" Updated: {file_path.name} -> {city_name} ({geo_data['admin1_code']})")

            except Exception as e:
                # Per-file guard: one bad file must not abort the whole run.
                errors += 1
                print(f" ERROR processing {file_path.name}: {e}")
    finally:
        # Close the DB even if an unexpected error escapes the loop.
        conn.close()

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Updated: {updated}")
    print(f"Scraped from website: {scraped}")
    print(f"No ISIL in file: {no_isil}")
    print(f"No city found: {no_city}")
    print(f"City not in GeoNames: {no_geonames}")
    print(f"Errors: {errors}")
    print(f"Remaining XXX: {len(xxx_files) - updated}")

    if not_found_cities:
        print(f"\nCities not found in GeoNames:")
        for fname, isil, city in not_found_cities[:20]:
            print(f" {isil}: {city}")
        if len(not_found_cities) > 20:
            print(f" ... and {len(not_found_cities) - 20} more")

    # Generate report; ensure the reports directory exists first so a fresh
    # checkout does not crash here after doing all the work above.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_path = base_dir / 'reports' / f'BELGIAN_CITY_ENRICHMENT_{timestamp}.md'
    report_path.parent.mkdir(parents=True, exist_ok=True)

    with open(report_path, 'w') as f:
        f.write(f"# Belgian City Enrichment Report\n\n")
        f.write(f"**Date**: {datetime.now().isoformat()}\n")
        f.write(f"**Dry Run**: {dry_run}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"| Metric | Count |\n")
        f.write(f"|--------|-------|\n")
        f.write(f"| Total XXX files | {len(xxx_files)} |\n")
        f.write(f"| Updated | {updated} |\n")
        f.write(f"| Scraped from website | {scraped} |\n")
        f.write(f"| No ISIL in file | {no_isil} |\n")
        f.write(f"| No city found | {no_city} |\n")
        f.write(f"| City not in GeoNames | {no_geonames} |\n")
        f.write(f"| Errors | {errors} |\n")
        f.write(f"| Remaining XXX | {len(xxx_files) - updated} |\n")

        if not_found_cities:
            f.write(f"\n## Cities Not Found in GeoNames\n\n")
            f.write(f"| File | ISIL | City |\n")
            f.write(f"|------|------|------|\n")
            for fname, isil, city in not_found_cities:
                f.write(f"| {fname} | {isil} | {city} |\n")

    print(f"\nReport written to: {report_path}")
|
|
|
|
|
|
# Script entry point: run the enrichment when executed directly.
if __name__ == '__main__':
    main()
|