glam/scripts/load_boundaries_postgis.py
kempersc 83ab098cf7 feat: add PostGIS international boundary architecture
Add schema and tooling for storing administrative boundaries in PostGIS:
- 002_postgis_boundaries.sql: Complete PostGIS schema with:
  - boundary_countries (ISO 3166-1)
  - boundary_admin1 (states/provinces/regions)
  - boundary_admin2 (municipalities/districts)
  - boundary_historical (HALC pre-modern territories)
  - custodian_service_areas (computed werkgebied geometries)
  - geonames_settlements (reverse geocoding)
  - Spatial functions: find_admin_for_point, find_nearest_settlement
  - Views for API access

- load_boundaries_postgis.py: Python loader supporting:
  - GADM (Global Administrative Areas) - primary global source
  - CBS (Dutch municipality boundaries)
  - GeoNames settlements for reverse geocoding
  - Cached downloads and upsert logic

- POSTGIS_BOUNDARY_ARCHITECTURE.md: Design documentation

This replaces the static GeoJSON approach for international coverage.
2025-12-07 14:34:39 +01:00

552 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Load International Administrative Boundaries into PostGIS
Supports multiple data sources:
- GADM (Global Administrative Areas) - Primary global source
- Natural Earth - Simplified global boundaries
- CBS (Netherlands) - Official Dutch municipality boundaries
- OSM Boundaries - OpenStreetMap extracts
Usage:
python load_boundaries_postgis.py --source gadm --country NL
python load_boundaries_postgis.py --source cbs
python load_boundaries_postgis.py --source natural_earth --level 1
python load_boundaries_postgis.py --countries JP,CZ,DE,BE --source gadm
"""
import argparse
import json
import logging
import os
import sys
from datetime import date
from pathlib import Path
from typing import Optional, List, Dict, Any
from urllib.request import urlretrieve
import zipfile
import tempfile
# Third-party imports -- fail fast with an actionable install hint if any
# of the geo/DB dependencies are missing.
try:
    import psycopg2
    from psycopg2.extras import execute_values
    import geopandas as gpd
    from shapely import wkt
    from shapely.geometry import shape, mapping
except ImportError as e:
    # Diagnostics go to stderr so they don't pollute piped/captured stdout.
    print(f"Missing dependency: {e}", file=sys.stderr)
    print("Install with: pip install psycopg2-binary geopandas shapely", file=sys.stderr)
    sys.exit(1)
# Configure root logging once at import time: timestamped INFO-level lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by all loader functions below.
logger = logging.getLogger(__name__)
# ============================================================================
# Configuration
# ============================================================================
# Database connection (from environment or defaults)
DB_HOST = os.getenv('POSTGRES_HOST', 'localhost')
DB_PORT = os.getenv('POSTGRES_PORT', '5432')
DB_NAME = os.getenv('POSTGRES_DB', 'glam')
DB_USER = os.getenv('POSTGRES_USER', 'glam_api')
DB_PASSWORD = os.getenv('POSTGRES_PASSWORD', '')
# Data source URLs
# GADM v4.1 per-country GeoJSON endpoints; {iso3} is the ISO 3166-1 alpha-3
# code. One URL template per administrative level (0 = country, 1 = state/
# province, 2 = municipality/district).
GADM_BASE_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_0.json"
GADM_ADM1_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_1.json"
GADM_ADM2_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_2.json"
# Natural Earth 1:110m countries archive (loader not yet implemented in main).
NATURAL_EARTH_URL = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
# ISO 3166-1 alpha-2 to alpha-3 mapping for common countries
# (GADM file names are keyed by alpha-3; the CLI accepts alpha-2).
ISO_A2_TO_A3 = {
    'NL': 'NLD', 'DE': 'DEU', 'BE': 'BEL', 'FR': 'FRA', 'GB': 'GBR',
    'JP': 'JPN', 'CZ': 'CZE', 'CH': 'CHE', 'AT': 'AUT', 'PL': 'POL',
    'IT': 'ITA', 'ES': 'ESP', 'PT': 'PRT', 'DK': 'DNK', 'SE': 'SWE',
    'NO': 'NOR', 'FI': 'FIN', 'IE': 'IRL', 'LU': 'LUX', 'BR': 'BRA',
    'MX': 'MEX', 'AR': 'ARG', 'CL': 'CHL', 'CO': 'COL', 'EG': 'EGY',
    'PS': 'PSE', 'IL': 'ISR', 'DZ': 'DZA', 'AU': 'AUS', 'NZ': 'NZL',
    'KR': 'KOR', 'TW': 'TWN', 'CN': 'CHN', 'IN': 'IND', 'RU': 'RUS',
    'UA': 'UKR', 'TR': 'TUR', 'GR': 'GRC', 'BG': 'BGR', 'RO': 'ROU',
    'HU': 'HUN', 'SK': 'SVK', 'SI': 'SVN', 'HR': 'HRV', 'RS': 'SRB',
    'US': 'USA', 'CA': 'CAN', 'ZA': 'ZAF',
}
# ============================================================================
# Database Functions
# ============================================================================
def get_db_connection():
    """Open a new PostgreSQL connection using the module-level settings.

    Connection parameters come from the POSTGRES_* environment variables
    read at import time. The caller owns the connection and must close it.
    """
    params = {
        'host': DB_HOST,
        'port': DB_PORT,
        'dbname': DB_NAME,
        'user': DB_USER,
        'password': DB_PASSWORD,
    }
    return psycopg2.connect(**params)
def get_source_id(conn, source_code: str) -> int:
    """Return the ID of a registered boundary data source.

    Sources are expected to be seeded by the schema migration; an unknown
    code therefore indicates a misconfigured database.

    Raises:
        ValueError: if ``source_code`` is not present in the table.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id FROM boundary_data_sources WHERE source_code = %s",
            (source_code,)
        )
        result = cur.fetchone()
    if result is not None:
        return result[0]
    # Source not found - this shouldn't happen if schema is properly initialized
    raise ValueError(f"Unknown data source: {source_code}")
def get_or_create_country(conn, iso_a2: str, iso_a3: str, name: str,
                          geom_wkt: str, source_id: int) -> int:
    """Return the ID of the active country row, inserting one when absent.

    "Active" means ``valid_to IS NULL``. The geometry WKT is stored both as
    the boundary and (via ST_Centroid) as the centroid. Does not commit;
    the caller controls the transaction.
    """
    with conn.cursor() as cur:
        # Look for an already-active record first.
        cur.execute(
            """SELECT id FROM boundary_countries
            WHERE iso_a2 = %s AND valid_to IS NULL""",
            (iso_a2,)
        )
        existing = cur.fetchone()
        if existing is not None:
            return existing[0]
        # Nothing active for this code: insert a fresh record valid from today.
        cur.execute(
            """INSERT INTO boundary_countries
            (iso_a2, iso_a3, country_name, geom, centroid, source_id, valid_from)
            VALUES (%s, %s, %s, ST_GeomFromText(%s, 4326),
            ST_Centroid(ST_GeomFromText(%s, 4326)), %s, CURRENT_DATE)
            RETURNING id""",
            (iso_a2, iso_a3, name, geom_wkt, geom_wkt, source_id)
        )
        return cur.fetchone()[0]
# ============================================================================
# GADM Loader
# ============================================================================
def download_gadm(iso_a3: str, level: int, cache_dir: Path) -> Optional[Path]:
    """Download (or reuse a cached copy of) a GADM GeoJSON file.

    Args:
        iso_a3: ISO 3166-1 alpha-3 country code (GADM files are keyed by it).
        level: Administrative level 0, 1 or 2.
        cache_dir: Directory where downloads are cached; created if needed.

    Returns:
        Path to the local file, or None on unsupported level / download error.
    """
    # Table-driven URL selection instead of an if/elif chain.
    url_templates = {0: GADM_BASE_URL, 1: GADM_ADM1_URL, 2: GADM_ADM2_URL}
    template = url_templates.get(level)
    if template is None:
        logger.error(f"Unsupported GADM level: {level}")
        return None
    cache_dir.mkdir(parents=True, exist_ok=True)
    url = template.format(iso3=iso_a3)
    filepath = cache_dir / f"gadm41_{iso_a3}_{level}.json"
    if filepath.exists():
        logger.info(f"Using cached: {filepath}")
        return filepath
    logger.info(f"Downloading: {url}")
    try:
        urlretrieve(url, filepath)
        return filepath
    except Exception as e:
        # urlretrieve can leave a partially written file behind; remove it so
        # the next run doesn't mistake the fragment for a valid cache entry.
        try:
            filepath.unlink()
        except OSError:
            pass
        logger.error(f"Failed to download {url}: {e}")
        return None
def _active_country_id(conn, iso_a2: str) -> Optional[int]:
    """Return the ID of the currently valid (valid_to IS NULL) country row, or None."""
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL",
            (iso_a2,)
        )
        row = cur.fetchone()
    return row[0] if row else None


def _load_gadm_admin1(conn, iso_a2: str, geojson_file: Path, source_id: int) -> int:
    """Upsert admin1 (state/province) features from a GADM level-1 file; return count."""
    country_id = _active_country_id(conn, iso_a2)
    if country_id is None:
        logger.warning(f"Country {iso_a2} not found, skipping admin1")
        return 0
    gdf = gpd.read_file(geojson_file)
    loaded = 0
    with conn.cursor() as cur:
        for _, row in gdf.iterrows():
            admin1_code = row.get('GID_1', row.get('HASC_1', ''))
            admin1_name = row.get('NAME_1', '')
            admin1_type = row.get('TYPE_1', 'Region')
            geom_wkt = row.geometry.wkt
            # Generate an ISO 3166-2-style code from the last GID segment
            # (e.g. "NLD.1_1" -> "NL-1_1"[:3] upper-cased).
            iso_3166_2 = f"{iso_a2}-{admin1_code.split('.')[-1][:3].upper()}" if admin1_code else None
            cur.execute(
                """INSERT INTO boundary_admin1
                (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type,
                geom, centroid, source_id, source_feature_id, valid_from)
                VALUES (%s, %s, %s, %s, %s,
                ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)),
                %s, %s, CURRENT_DATE)
                ON CONFLICT (country_id, admin1_code, valid_to) DO UPDATE
                SET admin1_name = EXCLUDED.admin1_name,
                geom = EXCLUDED.geom,
                updated_at = NOW()""",
                (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type,
                 geom_wkt, geom_wkt, source_id, row.get('GID_1'))
            )
            loaded += 1
    conn.commit()
    return loaded


def _load_gadm_admin2(conn, iso_a2: str, geojson_file: Path, source_id: int) -> int:
    """Upsert admin2 (municipality/district) features from a GADM level-2 file; return count."""
    country_id = _active_country_id(conn, iso_a2)
    if country_id is None:
        logger.warning(f"Country {iso_a2} not found, skipping admin2")
        return 0
    gdf = gpd.read_file(geojson_file)
    loaded = 0
    with conn.cursor() as cur:
        for _, row in gdf.iterrows():
            admin1_code = row.get('GID_1', '')
            admin2_code = row.get('GID_2', row.get('HASC_2', ''))
            admin2_name = row.get('NAME_2', '')
            admin2_type = row.get('TYPE_2', 'Municipality')
            geom_wkt = row.geometry.wkt
            # Resolve the parent admin1 row; NULL parent is tolerated so that
            # admin2 data can still be loaded when level 1 is missing.
            cur.execute(
                """SELECT id FROM boundary_admin1
                WHERE country_id = %s AND admin1_code = %s AND valid_to IS NULL""",
                (country_id, admin1_code)
            )
            admin1_row = cur.fetchone()
            admin1_id = admin1_row[0] if admin1_row else None
            cur.execute(
                """INSERT INTO boundary_admin2
                (admin1_id, country_id, admin2_code, admin2_name, admin2_type,
                geom, centroid, source_id, source_feature_id, valid_from)
                VALUES (%s, %s, %s, %s, %s,
                ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)),
                %s, %s, CURRENT_DATE)
                ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE
                SET admin2_name = EXCLUDED.admin2_name,
                geom = EXCLUDED.geom,
                updated_at = NOW()""",
                (admin1_id, country_id, admin2_code, admin2_name, admin2_type,
                 geom_wkt, geom_wkt, source_id, row.get('GID_2'))
            )
            loaded += 1
    conn.commit()
    return loaded


def load_gadm_country(conn, iso_a2: str, cache_dir: Path) -> Dict[str, int]:
    """Load GADM data for a country (levels 0, 1, 2).

    Downloads (or reuses cached) GADM GeoJSON for each level and upserts the
    features. Returns a dict of per-level feature counts:
    ``{'admin0': n, 'admin1': n, 'admin2': n}``.
    """
    counts = {'admin0': 0, 'admin1': 0, 'admin2': 0}
    iso_a3 = ISO_A2_TO_A3.get(iso_a2)
    if not iso_a3:
        logger.error(f"Unknown country code: {iso_a2}")
        return counts
    source_id = get_source_id(conn, 'GADM')
    # Level 0 - Country boundary
    level0_file = download_gadm(iso_a3, 0, cache_dir)
    if level0_file:
        gdf = gpd.read_file(level0_file)
        for _, row in gdf.iterrows():
            geom_wkt = row.geometry.wkt
            # GADM field naming differs between releases; fall back gracefully.
            country_name = row.get('COUNTRY') or row.get('NAME_0') or iso_a2
            country_id = get_or_create_country(
                conn, iso_a2, iso_a3,
                str(country_name),
                geom_wkt, source_id
            )
            counts['admin0'] = 1
            logger.info(f"Loaded country: {iso_a2} (id={country_id})")
    # Level 1 - Admin1 (states/provinces)
    level1_file = download_gadm(iso_a3, 1, cache_dir)
    if level1_file:
        counts['admin1'] = _load_gadm_admin1(conn, iso_a2, level1_file, source_id)
        logger.info(f"Loaded {counts['admin1']} admin1 regions for {iso_a2}")
    # Level 2 - Admin2 (municipalities/districts)
    level2_file = download_gadm(iso_a3, 2, cache_dir)
    if level2_file:
        counts['admin2'] = _load_gadm_admin2(conn, iso_a2, level2_file, source_id)
        logger.info(f"Loaded {counts['admin2']} admin2 regions for {iso_a2}")
    return counts
# ============================================================================
# CBS Netherlands Loader
# ============================================================================
def load_cbs_netherlands(conn, geojson_path: Path) -> Dict[str, int]:
    """Load CBS municipality boundaries for Netherlands.

    Reads the CBS GeoJSON, ensures an active Netherlands country row exists,
    and upserts every named municipality as an admin2 record. Returns a dict
    with the number of municipalities loaded under the ``'admin2'`` key.
    """
    source_id = get_source_id(conn, 'CBS')
    counts = {'admin2': 0}
    if not geojson_path.exists():
        logger.error(f"CBS GeoJSON not found: {geojson_path}")
        return counts
    frame = gpd.read_file(geojson_path)
    logger.info(f"Loaded {len(frame)} features from CBS data")
    with conn.cursor() as cur:
        # Resolve the active Netherlands row, creating a stub when missing.
        cur.execute(
            "SELECT id FROM boundary_countries WHERE iso_a2 = 'NL' AND valid_to IS NULL"
        )
        existing = cur.fetchone()
        if existing:
            country_id = existing[0]
        else:
            logger.warning("Netherlands not found in countries table, creating...")
            cur.execute(
                """INSERT INTO boundary_countries (iso_a2, iso_a3, country_name, valid_from, source_id)
                VALUES ('NL', 'NLD', 'Netherlands', CURRENT_DATE, %s)
                RETURNING id""",
                (source_id,)
            )
            country_id = cur.fetchone()[0]
        for _, feature in frame.iterrows():
            # CBS field names vary between datasets - try common patterns.
            gemeente_code = feature.get('gemeentecode', feature.get('statcode', feature.get('GM_CODE', '')))
            gemeente_name = feature.get('gemeentenaam', feature.get('statnaam', feature.get('GM_NAAM', '')))
            if not gemeente_name:
                continue
            geom_wkt = feature.geometry.wkt
            # Province (admin1) linkage is left NULL for now - it can be
            # enriched later (e.g. GM0363 = Amsterdam -> NH).
            cur.execute(
                """INSERT INTO boundary_admin2
                (admin1_id, country_id, admin2_code, cbs_gemeente_code, admin2_name,
                admin2_type, geom, centroid, source_id, valid_from)
                VALUES (NULL, %s, %s, %s, %s, 'Municipality',
                ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)),
                %s, CURRENT_DATE)
                ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE
                SET admin2_name = EXCLUDED.admin2_name,
                cbs_gemeente_code = EXCLUDED.cbs_gemeente_code,
                geom = EXCLUDED.geom,
                updated_at = NOW()""",
                (country_id, gemeente_code, gemeente_code, gemeente_name,
                 geom_wkt, geom_wkt, source_id)
            )
            counts['admin2'] += 1
    conn.commit()
    logger.info(f"Loaded {counts['admin2']} CBS municipalities for Netherlands")
    return counts
# ============================================================================
# GeoNames Settlements Loader
# ============================================================================
#: GeoNames feature codes accepted as "main" settlement types
#: (populated places plus capitals / seats of administration).
_SETTLEMENT_FEATURE_CODES = frozenset(
    ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG'))


def load_geonames_settlements(conn, tsv_path: Path, country_codes: Optional[List[str]] = None) -> int:
    """Load GeoNames settlements from allCountries.txt or country-specific file.

    Only populated places (feature class 'P') with a mainstream feature code
    are loaded; rows are batched and upserted via _insert_geonames_batch.

    Args:
        conn: Open PostgreSQL connection.
        tsv_path: Path to a GeoNames tab-separated dump.
        country_codes: Optional whitelist of ISO alpha-2 codes to keep.

    Returns:
        Number of settlements inserted/updated.
    """
    if not tsv_path.exists():
        logger.error(f"GeoNames file not found: {tsv_path}")
        return 0
    count = 0
    batch = []
    batch_size = 5000
    logger.info(f"Loading GeoNames settlements from {tsv_path}")
    with open(tsv_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            # Only strip the newline: a full strip() would eat trailing tabs
            # and silently drop rows whose last fields are empty.
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 19:
                continue
            # Apply the cheap string filters before any numeric parsing.
            country_code = parts[8]
            if country_codes and country_code not in country_codes:
                continue
            feature_class = parts[6]
            # Only load populated places (P class)
            if feature_class != 'P':
                continue
            feature_code = parts[7]
            # Filter to main settlement types
            if feature_code not in _SETTLEMENT_FEATURE_CODES:
                continue
            try:
                geonames_id = int(parts[0])
                latitude = float(parts[4])
                longitude = float(parts[5])
                population = int(parts[14]) if parts[14] else 0
                elevation = int(parts[15]) if parts[15] else None
            except ValueError:
                # A malformed numeric field should not abort the whole load.
                logger.warning(f"Skipping malformed GeoNames line {line_no}")
                continue
            batch.append((
                geonames_id, parts[1], parts[2], parts[3],
                latitude, longitude,
                feature_class, feature_code,
                country_code, parts[10], parts[11], parts[12], parts[13],
                population, elevation, parts[17], parts[18]
            ))
            if len(batch) >= batch_size:
                _insert_geonames_batch(conn, batch)
                count += len(batch)
                logger.info(f"Loaded {count} settlements...")
                batch = []
    # Final batch
    if batch:
        _insert_geonames_batch(conn, batch)
        count += len(batch)
    logger.info(f"Loaded {count} GeoNames settlements total")
    return count
def _insert_geonames_batch(conn, batch: List[tuple]):
    """Insert batch of GeoNames settlements.

    Each tuple in *batch* must be ordered exactly as produced by
    load_geonames_settlements: (geonames_id, name, ascii_name,
    alternate_names, latitude, longitude, feature_class, feature_code,
    country_code, admin1_code, admin2_code, admin3_code, admin4_code,
    population, elevation, timezone, modification_date).
    Commits the transaction after the insert.
    """
    with conn.cursor() as cur:
        execute_values(
            cur,
            """INSERT INTO geonames_settlements
            (geonames_id, name, ascii_name, alternate_names,
            latitude, longitude, geom,
            feature_class, feature_code,
            country_code, admin1_code, admin2_code, admin3_code, admin4_code,
            population, elevation, timezone, modification_date)
            VALUES %s
            ON CONFLICT (geonames_id) DO UPDATE
            SET name = EXCLUDED.name,
            population = EXCLUDED.population,
            modification_date = EXCLUDED.modification_date""",
            [(
                r[0], r[1], r[2], r[3],
                # EWKT points are POINT(lon lat): r[5] is longitude, r[4] latitude.
                r[4], r[5], f"SRID=4326;POINT({r[5]} {r[4]})",
                r[6], r[7],
                r[8], r[9], r[10], r[11], r[12],
                r[13], r[14], r[15], r[16]
            ) for r in batch],
            # The template maps one placeholder per column; the geom column
            # parses the prebuilt EWKT string via ST_GeomFromEWKT.
            template="""(%s, %s, %s, %s, %s, %s, ST_GeomFromEWKT(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        )
    conn.commit()
# ============================================================================
# Main
# ============================================================================
def main():
    """CLI entry point: parse arguments, connect to PostGIS, dispatch to a loader."""
    parser = argparse.ArgumentParser(description='Load administrative boundaries into PostGIS')
    parser.add_argument('--source', choices=['gadm', 'cbs', 'natural_earth', 'geonames'],
                        required=True, help='Data source to load')
    parser.add_argument('--country', type=str, help='Single country ISO-2 code (e.g., NL)')
    parser.add_argument('--countries', type=str, help='Comma-separated country codes (e.g., NL,DE,BE)')
    parser.add_argument('--file', type=str, help='Path to data file (for CBS, GeoNames)')
    parser.add_argument('--cache-dir', type=str, default='./data/boundary_cache',
                        help='Directory for caching downloaded files')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
    args = parser.parse_args()
    # Determine countries to load; --country takes precedence over --countries.
    countries = []
    if args.country:
        countries = [args.country.upper()]
    elif args.countries:
        countries = [c.strip().upper() for c in args.countries.split(',')]
    cache_dir = Path(args.cache_dir)
    if args.dry_run:
        logger.info(f"DRY RUN - Source: {args.source}, Countries: {countries or 'all'}")
        return
    # Connect to database
    try:
        conn = get_db_connection()
        logger.info(f"Connected to PostgreSQL: {DB_HOST}:{DB_PORT}/{DB_NAME}")
    except Exception as e:
        logger.error(f"Failed to connect to database: {e}")
        sys.exit(1)
    try:
        if args.source == 'gadm':
            # GADM is organized per country, so at least one code is required.
            if not countries:
                logger.error("--country or --countries required for GADM source")
                sys.exit(1)
            total_counts = {'admin0': 0, 'admin1': 0, 'admin2': 0}
            for iso_a2 in countries:
                logger.info(f"Loading GADM data for {iso_a2}...")
                counts = load_gadm_country(conn, iso_a2, cache_dir)
                for k, v in counts.items():
                    total_counts[k] += v
            logger.info(f"GADM load complete: {total_counts}")
        elif args.source == 'cbs':
            if not args.file:
                # Default path
                default_path = Path('frontend/public/data/netherlands_municipalities.geojson')
                if default_path.exists():
                    args.file = str(default_path)
                else:
                    logger.error("--file required for CBS source (or place file at frontend/public/data/netherlands_municipalities.geojson)")
                    sys.exit(1)
            counts = load_cbs_netherlands(conn, Path(args.file))
            logger.info(f"CBS load complete: {counts}")
        elif args.source == 'geonames':
            if not args.file:
                logger.error("--file required for GeoNames source (path to allCountries.txt or country file)")
                sys.exit(1)
            count = load_geonames_settlements(conn, Path(args.file), countries or None)
            logger.info(f"GeoNames load complete: {count} settlements")
        elif args.source == 'natural_earth':
            logger.error("Natural Earth loader not yet implemented")
            sys.exit(1)
    finally:
        # sys.exit raises SystemExit, so the connection is closed on every path.
        conn.close()


if __name__ == '__main__':
    main()