#!/usr/bin/env python3 """ Load International Administrative Boundaries into PostGIS Supports multiple data sources: - GADM (Global Administrative Areas) - Primary global source - Natural Earth - Simplified global boundaries - CBS (Netherlands) - Official Dutch municipality boundaries - OSM Boundaries - OpenStreetMap extracts Usage: python load_boundaries_postgis.py --source gadm --country NL python load_boundaries_postgis.py --source cbs python load_boundaries_postgis.py --source natural_earth --level 1 python load_boundaries_postgis.py --countries JP,CZ,DE,BE --source gadm """ import argparse import json import logging import os import sys from datetime import date from pathlib import Path from typing import Optional, List, Dict, Any from urllib.request import urlretrieve import zipfile import tempfile # Third-party imports try: import psycopg2 from psycopg2.extras import execute_values import geopandas as gpd from shapely import wkt from shapely.geometry import shape, mapping except ImportError as e: print(f"Missing dependency: {e}") print("Install with: pip install psycopg2-binary geopandas shapely") sys.exit(1) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # ============================================================================ # Configuration # ============================================================================ # Database connection (from environment or defaults) DB_HOST = os.getenv('POSTGRES_HOST', 'localhost') DB_PORT = os.getenv('POSTGRES_PORT', '5432') DB_NAME = os.getenv('POSTGRES_DB', 'glam') DB_USER = os.getenv('POSTGRES_USER', 'glam_api') DB_PASSWORD = os.getenv('POSTGRES_PASSWORD', '') # Data source URLs GADM_BASE_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_0.json" GADM_ADM1_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_1.json" GADM_ADM2_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_2.json" NATURAL_EARTH_URL = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip" # ISO 3166-1 alpha-2 to alpha-3 mapping for common countries ISO_A2_TO_A3 = { 'NL': 'NLD', 'DE': 'DEU', 'BE': 'BEL', 'FR': 'FRA', 'GB': 'GBR', 'JP': 'JPN', 'CZ': 'CZE', 'CH': 'CHE', 'AT': 'AUT', 'PL': 'POL', 'IT': 'ITA', 'ES': 'ESP', 'PT': 'PRT', 'DK': 'DNK', 'SE': 'SWE', 'NO': 'NOR', 'FI': 'FIN', 'IE': 'IRL', 'LU': 'LUX', 'BR': 'BRA', 'MX': 'MEX', 'AR': 'ARG', 'CL': 'CHL', 'CO': 'COL', 'EG': 'EGY', 'PS': 'PSE', 'IL': 'ISR', 'DZ': 'DZA', 'AU': 'AUS', 'NZ': 'NZL', 'KR': 'KOR', 'TW': 'TWN', 'CN': 'CHN', 'IN': 'IND', 'RU': 'RUS', 'UA': 'UKR', 'TR': 'TUR', 'GR': 'GRC', 'BG': 'BGR', 'RO': 'ROU', 'HU': 'HUN', 'SK': 'SVK', 'SI': 'SVN', 'HR': 'HRV', 'RS': 'SRB', 'US': 'USA', 'CA': 'CAN', 'ZA': 'ZAF', 'LB': 'LBN', } # ============================================================================ # Database Functions # ============================================================================ def get_db_connection(): """Get PostgreSQL connection with PostGIS.""" return psycopg2.connect( host=DB_HOST, port=DB_PORT, dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD ) def get_source_id(conn, source_code: str) -> int: """Get or create data source ID.""" with conn.cursor() as cur: cur.execute( "SELECT id FROM boundary_data_sources WHERE source_code = %s", (source_code,) ) row = cur.fetchone() if row: return row[0] # Source not found - this shouldn't happen if schema is properly initialized raise ValueError(f"Unknown data source: {source_code}") def get_or_create_country(conn, iso_a2: str, iso_a3: str, name: str, geom_wkt: str, source_id: int) -> int: """Get or create country record, return ID.""" with conn.cursor() as cur: # Check if exists cur.execute( """SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL""", (iso_a2,) ) row = cur.fetchone() if row: return row[0] # Insert new cur.execute( """INSERT INTO boundary_countries (iso_a2, iso_a3, country_name, geom, centroid, source_id, valid_from) VALUES (%s, %s, %s, ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), %s, CURRENT_DATE) RETURNING id""", (iso_a2, iso_a3, name, geom_wkt, geom_wkt, source_id) ) return cur.fetchone()[0] # ============================================================================ # GADM Loader # ============================================================================ def download_gadm(iso_a3: str, level: int, cache_dir: Path) -> Optional[Path]: """Download GADM GeoJSON for a country and admin level.""" cache_dir.mkdir(parents=True, exist_ok=True) if level == 0: url = GADM_BASE_URL.format(iso3=iso_a3) elif level == 1: url = GADM_ADM1_URL.format(iso3=iso_a3) elif level == 2: url = GADM_ADM2_URL.format(iso3=iso_a3) else: logger.error(f"Unsupported GADM level: {level}") return None filename = f"gadm41_{iso_a3}_{level}.json" filepath = cache_dir / filename if filepath.exists(): logger.info(f"Using cached: {filepath}") return filepath logger.info(f"Downloading: {url}") try: urlretrieve(url, filepath) return filepath except Exception as e: logger.error(f"Failed to download {url}: {e}") return None def load_gadm_country(conn, iso_a2: str, cache_dir: Path) -> Dict[str, int]: """Load GADM data for a country (levels 0, 1, 2).""" iso_a3 = ISO_A2_TO_A3.get(iso_a2) if not iso_a3: logger.error(f"Unknown country code: {iso_a2}") return {'admin0': 0, 'admin1': 0, 'admin2': 0} source_id = get_source_id(conn, 'GADM') counts = {'admin0': 0, 'admin1': 0, 'admin2': 0} # Level 0 - Country boundary level0_file = download_gadm(iso_a3, 0, cache_dir) if level0_file: gdf = gpd.read_file(level0_file) for _, row in gdf.iterrows(): geom_wkt = row.geometry.wkt country_name = row.get('COUNTRY') or row.get('NAME_0') or iso_a2 country_id = get_or_create_country( conn, iso_a2, iso_a3, str(country_name), geom_wkt, source_id ) counts['admin0'] = 1 logger.info(f"Loaded country: {iso_a2} (id={country_id})") # Level 1 - Admin1 (states/provinces) level1_file = download_gadm(iso_a3, 1, cache_dir) if level1_file: gdf = gpd.read_file(level1_file) # Get country ID with conn.cursor() as cur: cur.execute( "SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL", (iso_a2,) ) country_row = cur.fetchone() if not country_row: logger.warning(f"Country {iso_a2} not found, skipping admin1") else: country_id = country_row[0] for _, row in gdf.iterrows(): admin1_code = row.get('GID_1', row.get('HASC_1', '')) admin1_name = row.get('NAME_1', '') admin1_type = row.get('TYPE_1', 'Region') geom_wkt = row.geometry.wkt # Generate ISO 3166-2 code iso_3166_2 = f"{iso_a2}-{admin1_code.split('.')[-1][:3].upper()}" if admin1_code else None cur.execute( """INSERT INTO boundary_admin1 (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type, geom, centroid, source_id, source_feature_id, valid_from) VALUES (%s, %s, %s, %s, %s, ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), %s, %s, CURRENT_DATE) ON CONFLICT (country_id, admin1_code, valid_to) DO UPDATE SET admin1_name = EXCLUDED.admin1_name, geom = EXCLUDED.geom, updated_at = NOW()""", (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type, geom_wkt, geom_wkt, source_id, row.get('GID_1')) ) counts['admin1'] += 1 conn.commit() logger.info(f"Loaded {counts['admin1']} admin1 regions for {iso_a2}") # Level 2 - Admin2 (municipalities/districts) level2_file = download_gadm(iso_a3, 2, cache_dir) if level2_file: gdf = gpd.read_file(level2_file) with conn.cursor() as cur: # Get country ID cur.execute( "SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL", (iso_a2,) ) country_row = cur.fetchone() if not country_row: logger.warning(f"Country {iso_a2} not found, skipping admin2") else: country_id = country_row[0] for _, row in gdf.iterrows(): admin1_code = row.get('GID_1', '') admin2_code = row.get('GID_2', row.get('HASC_2', '')) admin2_name = row.get('NAME_2', '') admin2_type = row.get('TYPE_2', 'Municipality') geom_wkt = row.geometry.wkt # Find admin1 ID cur.execute( """SELECT id FROM boundary_admin1 WHERE country_id = %s AND admin1_code = %s AND valid_to IS NULL""", (country_id, admin1_code) ) admin1_row = cur.fetchone() admin1_id = admin1_row[0] if admin1_row else None cur.execute( """INSERT INTO boundary_admin2 (admin1_id, country_id, admin2_code, admin2_name, admin2_type, geom, centroid, source_id, source_feature_id, valid_from) VALUES (%s, %s, %s, %s, %s, ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), %s, %s, CURRENT_DATE) ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE SET admin2_name = EXCLUDED.admin2_name, geom = EXCLUDED.geom, updated_at = NOW()""", (admin1_id, country_id, admin2_code, admin2_name, admin2_type, geom_wkt, geom_wkt, source_id, row.get('GID_2')) ) counts['admin2'] += 1 conn.commit() logger.info(f"Loaded {counts['admin2']} admin2 regions for {iso_a2}") return counts # ============================================================================ # CBS Netherlands Loader # ============================================================================ def load_cbs_netherlands(conn, geojson_path: Path) -> Dict[str, int]: """Load CBS municipality boundaries for Netherlands.""" source_id = get_source_id(conn, 'CBS') counts = {'admin2': 0} if not geojson_path.exists(): logger.error(f"CBS GeoJSON not found: {geojson_path}") return counts gdf = gpd.read_file(geojson_path) logger.info(f"Loaded {len(gdf)} features from CBS data") with conn.cursor() as cur: # Ensure Netherlands country exists cur.execute( "SELECT id FROM boundary_countries WHERE iso_a2 = 'NL' AND valid_to IS NULL" ) country_row = cur.fetchone() if not country_row: logger.warning("Netherlands not found in countries table, creating...") cur.execute( """INSERT INTO boundary_countries (iso_a2, iso_a3, country_name, valid_from, source_id) VALUES ('NL', 'NLD', 'Netherlands', CURRENT_DATE, %s) RETURNING id""", (source_id,) ) country_id = cur.fetchone()[0] else: country_id = country_row[0] for _, row in gdf.iterrows(): # CBS field names vary - try common patterns # Known formats: 'code'/'naam', 'gemeentecode'/'gemeentenaam', 'GM_CODE'/'GM_NAAM' gemeente_code = row.get('code', row.get('gemeentecode', row.get('statcode', row.get('GM_CODE', '')))) gemeente_name = row.get('naam', row.get('gemeentenaam', row.get('statnaam', row.get('GM_NAAM', '')))) provincie_code = row.get('provincieCode', row.get('provincie_code', '')) provincie_naam = row.get('provincieNaam', row.get('provincie_naam', '')) if not gemeente_name: continue geom_wkt = row.geometry.wkt # Determine admin1 (province) from gemeente code or name # CBS codes: GM0363 = Amsterdam (NH), GM0518 = 's-Gravenhage (ZH) # For now, leave admin1_id as NULL - can be enriched later cur.execute( """INSERT INTO boundary_admin2 (admin1_id, country_id, admin2_code, cbs_gemeente_code, admin2_name, admin2_type, geom, centroid, source_id, valid_from) VALUES (NULL, %s, %s, %s, %s, 'Municipality', ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), %s, CURRENT_DATE) ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE SET admin2_name = EXCLUDED.admin2_name, cbs_gemeente_code = EXCLUDED.cbs_gemeente_code, geom = EXCLUDED.geom, updated_at = NOW()""", (country_id, gemeente_code, gemeente_code, gemeente_name, geom_wkt, geom_wkt, source_id) ) counts['admin2'] += 1 conn.commit() logger.info(f"Loaded {counts['admin2']} CBS municipalities for Netherlands") return counts # ============================================================================ # GeoNames Settlements Loader # ============================================================================ def load_geonames_settlements(conn, tsv_path: Path, country_codes: Optional[List[str]] = None) -> int: """Load GeoNames settlements from allCountries.txt or country-specific file.""" if not tsv_path.exists(): logger.error(f"GeoNames file not found: {tsv_path}") return 0 count = 0 batch = [] batch_size = 5000 logger.info(f"Loading GeoNames settlements from {tsv_path}") with open(tsv_path, 'r', encoding='utf-8') as f: for line in f: parts = line.strip().split('\t') if len(parts) < 19: continue geonames_id = int(parts[0]) name = parts[1] ascii_name = parts[2] alternate_names = parts[3] latitude = float(parts[4]) longitude = float(parts[5]) feature_class = parts[6] feature_code = parts[7] country_code = parts[8] admin1_code = parts[10] admin2_code = parts[11] admin3_code = parts[12] admin4_code = parts[13] population = int(parts[14]) if parts[14] else 0 elevation = int(parts[15]) if parts[15] and parts[15] != '' else None timezone = parts[17] modification_date = parts[18] # Filter by country if specified if country_codes and country_code not in country_codes: continue # Only load populated places (P class) if feature_class != 'P': continue # Filter to main settlement types if feature_code not in ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG'): continue batch.append(( geonames_id, name, ascii_name, alternate_names, latitude, longitude, feature_class, feature_code, country_code, admin1_code, admin2_code, admin3_code, admin4_code, population, elevation, timezone, modification_date )) if len(batch) >= batch_size: _insert_geonames_batch(conn, batch) count += len(batch) logger.info(f"Loaded {count} settlements...") batch = [] # Final batch if batch: _insert_geonames_batch(conn, batch) count += len(batch) logger.info(f"Loaded {count} GeoNames settlements total") return count def _insert_geonames_batch(conn, batch: List[tuple]): """Insert batch of GeoNames settlements.""" with conn.cursor() as cur: execute_values( cur, """INSERT INTO geonames_settlements (geonames_id, name, ascii_name, alternate_names, latitude, longitude, geom, feature_class, feature_code, country_code, admin1_code, admin2_code, admin3_code, admin4_code, population, elevation, timezone, modification_date) VALUES %s ON CONFLICT (geonames_id) DO UPDATE SET name = EXCLUDED.name, population = EXCLUDED.population, modification_date = EXCLUDED.modification_date""", [( r[0], r[1], r[2], r[3], r[4], r[5], f"SRID=4326;POINT({r[5]} {r[4]})", r[6], r[7], r[8], r[9], r[10], r[11], r[12], r[13], r[14], r[15], r[16] ) for r in batch], template="""(%s, %s, %s, %s, %s, %s, ST_GeomFromEWKT(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" ) conn.commit() # ============================================================================ # Main # ============================================================================ def main(): parser = argparse.ArgumentParser(description='Load administrative boundaries into PostGIS') parser.add_argument('--source', choices=['gadm', 'cbs', 'natural_earth', 'geonames'], required=True, help='Data source to load') parser.add_argument('--country', type=str, help='Single country ISO-2 code (e.g., NL)') parser.add_argument('--countries', type=str, help='Comma-separated country codes (e.g., NL,DE,BE)') parser.add_argument('--file', type=str, help='Path to data file (for CBS, GeoNames)') parser.add_argument('--cache-dir', type=str, default='./data/boundary_cache', help='Directory for caching downloaded files') parser.add_argument('--dry-run', action='store_true', help='Show what would be done') args = parser.parse_args() # Determine countries to load countries = [] if args.country: countries = [args.country.upper()] elif args.countries: countries = [c.strip().upper() for c in args.countries.split(',')] cache_dir = Path(args.cache_dir) if args.dry_run: logger.info(f"DRY RUN - Source: {args.source}, Countries: {countries or 'all'}") return # Connect to database try: conn = get_db_connection() logger.info(f"Connected to PostgreSQL: {DB_HOST}:{DB_PORT}/{DB_NAME}") except Exception as e: logger.error(f"Failed to connect to database: {e}") sys.exit(1) try: if args.source == 'gadm': if not countries: logger.error("--country or --countries required for GADM source") sys.exit(1) total_counts = {'admin0': 0, 'admin1': 0, 'admin2': 0} for iso_a2 in countries: logger.info(f"Loading GADM data for {iso_a2}...") counts = load_gadm_country(conn, iso_a2, cache_dir) for k, v in counts.items(): total_counts[k] += v logger.info(f"GADM load complete: {total_counts}") elif args.source == 'cbs': if not args.file: # Default path default_path = Path('frontend/public/data/netherlands_municipalities.geojson') if default_path.exists(): args.file = str(default_path) else: logger.error("--file required for CBS source (or place file at frontend/public/data/netherlands_municipalities.geojson)") sys.exit(1) counts = load_cbs_netherlands(conn, Path(args.file)) logger.info(f"CBS load complete: {counts}") elif args.source == 'geonames': if not args.file: logger.error("--file required for GeoNames source (path to allCountries.txt or country file)") sys.exit(1) count = load_geonames_settlements(conn, Path(args.file), countries or None) logger.info(f"GeoNames load complete: {count} settlements") elif args.source == 'natural_earth': logger.error("Natural Earth loader not yet implemented") sys.exit(1) finally: conn.close() if __name__ == '__main__': main()