diff --git a/docs/POSTGIS_BOUNDARY_ARCHITECTURE.md b/docs/POSTGIS_BOUNDARY_ARCHITECTURE.md new file mode 100644 index 0000000000..c9fb4394ed --- /dev/null +++ b/docs/POSTGIS_BOUNDARY_ARCHITECTURE.md @@ -0,0 +1,189 @@ +# PostGIS International Boundary Architecture + +## Overview + +This document describes the PostGIS-based architecture for storing and querying international administrative boundaries to compute heritage custodian service areas ("werkgebied"). + +## Problem Statement + +The current implementation uses static GeoJSON files for Netherlands-only boundaries: +- `netherlands_provinces.geojson` - 12 provinces +- `netherlands_municipalities.geojson` - ~350 municipalities +- `netherlands_historical_1500.geojson` - HALC historical territories + +This approach **does not scale** for international coverage (Japan, Czechia, Germany, Belgium, Brazil, etc.) because: +1. GeoJSON files are too large for client-side loading (Germany has 400+ districts) +2. No server-side spatial queries (point-in-polygon, intersection) +3. No temporal versioning for boundary changes +4. No consistent administrative hierarchy across countries + +## Solution Architecture + +### Database Schema + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ PostGIS Database │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────┐ ┌─────────────────────┐ │ +│ │ boundary_countries │────▶│ boundary_admin1 │ │ +│ │ (ISO 3166-1) │ │ (States/Provinces) │ │ +│ │ - iso_a2: NL, DE... │ │ - iso_3166_2 │ │ +│ │ - geom: POLYGON │ │ - geom: POLYGON │ │ +│ └─────────────────────┘ └──────────┬──────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────┐ │ +│ │ boundary_admin2 │ │ +│ │ (Municipalities) │ │ +│ │ - geonames_id │ │ +│ │ - cbs_gemeente_code│ │ +│ │ - geom: POLYGON │ │ +│ └──────────┬──────────┘ │ +│ │ │ +│ ┌─────────────────────┐ │ │ +│ │ boundary_historical │ │ │ +│ │ (HALC, pre-modern) │ │ │ +│ │ - halc_adm1_code │ │ │ +│ │ - period_start/end │ │ │ +│ └─────────────────────┘ │ │ +│ ▼ │ +│ ┌───────────────────────────┐ │ +│ │ custodian_service_areas │ │ +│ │ (Computed werkgebied) │ │ +│ │ - ghcid: NL-NH-HAA-A-NHA │ │ +│ │ - admin2_ids: [1,2,3...] │ │ +│ │ - geom: MULTIPOLYGON │ │ +│ └───────────────────────────┘ │ +│ │ +│ ┌─────────────────────┐ │ +│ │ geonames_settlements│ (For reverse geocoding) │ +│ │ - geonames_id │ │ +│ │ - name, ascii_name │ │ +│ │ - geom: POINT │ │ +│ └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Data Sources + +| Source | Coverage | License | Admin Levels | Use Case | +|--------|----------|---------|--------------|----------| +| **GADM** | Global | CC-BY-NC | 0, 1, 2, 3+ | Primary global boundaries | +| **Natural Earth** | Global | Public Domain | 0, 1 | Simplified country shapes | +| **CBS** | Netherlands | CC-BY-4.0 | 2 (gemeente) | Official NL municipalities | +| **HALC** | Low Countries | Academic | Historical | Pre-1800 territories | +| **OSM** | Global | ODbL | Variable | Crowdsourced, current | +| **Eurostat** | Europe | Eurostat | NUTS/LAU | EU statistical regions | + +### API Endpoints + +The PostGIS database will be exposed via REST API: + +``` +GET /api/boundaries/countries +GET /api/boundaries/countries/{iso_a2} +GET /api/boundaries/admin1/{iso_a2} +GET /api/boundaries/admin2/{iso_a2}/{admin1_code} +GET /api/boundaries/point?lon={lon}&lat={lat} +GET /api/boundaries/service-area/{ghcid} +GET /api/boundaries/service-area/{ghcid}/geojson +``` + +### Frontend Integration + +The frontend will: +1. Fetch service area GeoJSON via API (not static files) +2. Use MapLibre vector tiles for admin boundaries (optional optimization) +3. Cache frequently-accessed boundaries in browser +4. Support historical boundary display with temporal filtering + +```typescript +// Example: Fetch service area for a custodian +const response = await fetch(`/api/boundaries/service-area/${ghcid}/geojson`); +const geojson = await response.json(); +map.getSource('werkgebied').setData(geojson); +``` + +### Temporal Versioning + +Boundaries change over time (municipal mergers, etc.). The schema supports: + +```sql +-- Current boundary (valid_to IS NULL) +SELECT * FROM boundary_admin2 WHERE cbs_gemeente_code = 'GM0363' AND valid_to IS NULL; + +-- Historical boundary (before merger) +SELECT * FROM boundary_admin2 WHERE cbs_gemeente_code = 'GM0363' AND valid_to = '2001-01-01'; +``` + +### Service Area Computation + +Service areas are computed from admin units: + +```sql +-- Compute service area for Noord-Hollands Archief (serves Haarlem + region) +SELECT ST_Union(geom) AS service_area_geom +FROM boundary_admin2 +WHERE id IN (SELECT unnest(admin2_ids) FROM custodian_service_areas WHERE ghcid = 'NL-NH-HAA-A-NHA'); +``` + +Or pre-computed and stored: + +```sql +-- Pre-compute and cache +INSERT INTO custodian_service_areas (ghcid, service_area_name, geom, admin2_ids) +VALUES ( + 'NL-NH-HAA-A-NHA', + 'Noord-Hollands Archief Werkgebied', + compute_service_area_geometry(ARRAY[123, 124, 125, 126]), -- admin2 IDs + ARRAY[123, 124, 125, 126] +); +``` + +## Implementation Plan + +### Phase 1: Schema & Initial Data (Current Sprint) +- [x] Create PostGIS schema (`002_postgis_boundaries.sql`) +- [x] Create boundary loading script (`load_boundaries_postgis.py`) +- [ ] Load Netherlands boundaries (CBS + provinces) +- [ ] Load HALC historical boundaries +- [ ] Migrate existing GeoJSON data + +### Phase 2: International Expansion +- [ ] Load GADM for priority countries: JP, CZ, DE, BE, CH, AT +- [ ] Load GeoNames settlements for reverse geocoding +- [ ] Create API endpoints for boundary queries +- [ ] Update frontend to use API instead of static files + +### Phase 3: Service Area Management +- [ ] Compute service areas for existing custodians +- [ ] Create admin UI for service area editing +- [ ] Implement temporal boundary display +- [ ] Add vector tile generation (optional optimization) + +## Files Created + +| File | Description | +|------|-------------| +| `infrastructure/sql/002_postgis_boundaries.sql` | PostGIS schema for boundaries | +| `scripts/load_boundaries_postgis.py` | Python script to load boundary data | +| `docs/POSTGIS_BOUNDARY_ARCHITECTURE.md` | This document | + +## Dependencies + +- PostgreSQL 14+ with PostGIS 3.3+ +- Python: `psycopg2`, `geopandas`, `shapely` +- GADM data (downloaded on demand) +- CBS GeoJSON (existing in `frontend/public/data/`) + +## Migration from Static GeoJSON + +The current static GeoJSON approach will be **deprecated** but **not immediately removed**: + +1. PostGIS becomes the source of truth for boundaries +2. API serves boundary GeoJSON on demand +3. Static files remain as fallback for development +4. Frontend gradually migrates to API-based loading diff --git a/infrastructure/sql/002_postgis_boundaries.sql b/infrastructure/sql/002_postgis_boundaries.sql new file mode 100644 index 0000000000..6d33cd546d --- /dev/null +++ b/infrastructure/sql/002_postgis_boundaries.sql @@ -0,0 +1,622 @@ +-- PostGIS International Boundary Schema for Heritage Custodian Service Areas +-- Migration: 002_postgis_boundaries.sql +-- Created: 2025-12-07 +-- +-- Stores administrative boundaries for computing heritage custodian service areas +-- Supports global coverage with multiple data sources (GADM, Natural Earth, OSM, CBS) +-- Enables spatial queries: point-in-polygon, intersection, containment + +-- ============================================================================ +-- Enable PostGIS Extension +-- ============================================================================ + +CREATE EXTENSION IF NOT EXISTS postgis; +CREATE EXTENSION IF NOT EXISTS postgis_topology; + +-- ============================================================================ +-- Data Source Registry +-- ============================================================================ +-- Track data sources for provenance and updates + +CREATE TABLE IF NOT EXISTS boundary_data_sources ( + id SERIAL PRIMARY KEY, + source_code VARCHAR(50) NOT NULL UNIQUE, -- e.g., "GADM", "CBS", "OSM", "HALC" + source_name VARCHAR(255) NOT NULL, -- Full name + source_url TEXT, -- Official website + license_type VARCHAR(100), -- e.g., "CC-BY-4.0", "ODbL" + license_url TEXT, + description TEXT, + coverage_scope VARCHAR(50), -- "GLOBAL", "EUROPE", "NETHERLANDS", etc. + last_updated DATE, + attribution_required TEXT, -- Required attribution text + metadata JSONB DEFAULT '{}'::jsonb, + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +-- Insert known data sources +INSERT INTO boundary_data_sources (source_code, source_name, source_url, license_type, coverage_scope, description) VALUES + ('GADM', 'Global Administrative Areas Database', 'https://gadm.org', 'CC-BY-NC-4.0', 'GLOBAL', 'Global administrative boundaries at multiple levels (country, region, district)'), + ('NATURAL_EARTH', 'Natural Earth', 'https://www.naturalearthdata.com', 'Public Domain', 'GLOBAL', 'Free vector and raster map data at 1:10m, 1:50m, and 1:110m scales'), + ('OSM', 'OpenStreetMap', 'https://www.openstreetmap.org', 'ODbL', 'GLOBAL', 'Crowdsourced global geographic data'), + ('CBS', 'CBS Wijken en Buurten', 'https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische-data', 'CC-BY-4.0', 'NETHERLANDS', 'Official Dutch municipality boundaries from CBS'), + ('HALC', 'Historical Atlas of the Low Countries', NULL, 'Academic', 'LOW_COUNTRIES', 'Historical administrative boundaries for Netherlands, Belgium, Luxembourg'), + ('EUROSTAT', 'Eurostat NUTS/LAU', 'https://ec.europa.eu/eurostat/web/gisco', 'Eurostat', 'EUROPE', 'European statistical regions (NUTS) and local administrative units (LAU)') +ON CONFLICT (source_code) DO NOTHING; + +-- ============================================================================ +-- Countries Table +-- ============================================================================ +-- ISO 3166-1 countries with boundaries + +CREATE TABLE IF NOT EXISTS boundary_countries ( + id SERIAL PRIMARY KEY, + iso_a2 CHAR(2) NOT NULL, -- ISO 3166-1 alpha-2 (e.g., "NL", "DE") + iso_a3 CHAR(3) NOT NULL, -- ISO 3166-1 alpha-3 (e.g., "NLD", "DEU") + iso_n3 CHAR(3), -- ISO 3166-1 numeric (e.g., "528") + country_name VARCHAR(255) NOT NULL, -- English name + country_name_local VARCHAR(255), -- Native language name + + -- Geometry + geom GEOMETRY(MULTIPOLYGON, 4326), -- WGS84 boundary polygon + centroid GEOMETRY(POINT, 4326), -- Computed centroid + bbox BOX2D, -- Bounding box for quick filtering + + -- Metadata + source_id INTEGER REFERENCES boundary_data_sources(id), + source_feature_id VARCHAR(100), -- ID in source dataset + area_km2 NUMERIC, -- Computed area + + -- Versioning + valid_from DATE, + valid_to DATE, -- NULL = current + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + UNIQUE(iso_a2, valid_to) -- Allow multiple versions, only one current +); + +CREATE INDEX IF NOT EXISTS idx_boundary_countries_iso_a2 ON boundary_countries(iso_a2); +CREATE INDEX IF NOT EXISTS idx_boundary_countries_geom ON boundary_countries USING GIST(geom); +CREATE INDEX IF NOT EXISTS idx_boundary_countries_centroid ON boundary_countries USING GIST(centroid); +CREATE INDEX IF NOT EXISTS idx_boundary_countries_valid ON boundary_countries(valid_from, valid_to); + +-- ============================================================================ +-- Administrative Level 1 (States/Provinces/Regions) +-- ============================================================================ +-- ISO 3166-2 regions + +CREATE TABLE IF NOT EXISTS boundary_admin1 ( + id SERIAL PRIMARY KEY, + country_id INTEGER REFERENCES boundary_countries(id), + + -- Identifiers + iso_3166_2 VARCHAR(10), -- e.g., "NL-NH", "DE-BY", "JP-13" + admin1_code VARCHAR(20) NOT NULL, -- Internal code (varies by source) + geonames_id INTEGER, -- GeoNames ID for linking + wikidata_id VARCHAR(20), -- Wikidata Q-number + + -- Names + admin1_name VARCHAR(255) NOT NULL, -- English/standard name + admin1_name_local VARCHAR(255), -- Native language name + admin1_type VARCHAR(100), -- e.g., "Province", "State", "Prefecture", "Land" + + -- Geometry + geom GEOMETRY(MULTIPOLYGON, 4326), + centroid GEOMETRY(POINT, 4326), + bbox BOX2D, + + -- Metadata + source_id INTEGER REFERENCES boundary_data_sources(id), + source_feature_id VARCHAR(100), + area_km2 NUMERIC, + + -- Versioning + valid_from DATE, + valid_to DATE, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + UNIQUE(country_id, admin1_code, valid_to) +); + +CREATE INDEX IF NOT EXISTS idx_boundary_admin1_country ON boundary_admin1(country_id); +CREATE INDEX IF NOT EXISTS idx_boundary_admin1_iso ON boundary_admin1(iso_3166_2); +CREATE INDEX IF NOT EXISTS idx_boundary_admin1_geonames ON boundary_admin1(geonames_id); +CREATE INDEX IF NOT EXISTS idx_boundary_admin1_geom ON boundary_admin1 USING GIST(geom); +CREATE INDEX IF NOT EXISTS idx_boundary_admin1_centroid ON boundary_admin1 USING GIST(centroid); + +-- ============================================================================ +-- Administrative Level 2 (Districts/Counties/Municipalities) +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS boundary_admin2 ( + id SERIAL PRIMARY KEY, + admin1_id INTEGER REFERENCES boundary_admin1(id), + country_id INTEGER REFERENCES boundary_countries(id), + + -- Identifiers + admin2_code VARCHAR(50) NOT NULL, -- Internal code + nuts_code VARCHAR(10), -- NUTS code (EU) + lau_code VARCHAR(20), -- LAU code (EU) + geonames_id INTEGER, + wikidata_id VARCHAR(20), + + -- CBS-specific (Netherlands) + cbs_gemeente_code VARCHAR(10), -- Dutch municipality code (GM0363) + + -- Names + admin2_name VARCHAR(255) NOT NULL, + admin2_name_local VARCHAR(255), + admin2_type VARCHAR(100), -- e.g., "Municipality", "District", "County" + + -- Geometry + geom GEOMETRY(MULTIPOLYGON, 4326), + centroid GEOMETRY(POINT, 4326), + bbox BOX2D, + + -- Metadata + source_id INTEGER REFERENCES boundary_data_sources(id), + source_feature_id VARCHAR(100), + area_km2 NUMERIC, + population INTEGER, -- If available from source + + -- Versioning + valid_from DATE, + valid_to DATE, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + UNIQUE(admin1_id, admin2_code, valid_to) +); + +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_admin1 ON boundary_admin2(admin1_id); +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_country ON boundary_admin2(country_id); +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_geonames ON boundary_admin2(geonames_id); +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_cbs ON boundary_admin2(cbs_gemeente_code); +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_geom ON boundary_admin2 USING GIST(geom); +CREATE INDEX IF NOT EXISTS idx_boundary_admin2_centroid ON boundary_admin2 USING GIST(centroid); + +-- ============================================================================ +-- Historical Boundaries +-- ============================================================================ +-- Pre-modern administrative boundaries (e.g., HALC 1500 data) + +CREATE TABLE IF NOT EXISTS boundary_historical ( + id SERIAL PRIMARY KEY, + + -- Identifiers + historical_id VARCHAR(100) NOT NULL, -- Source-specific ID + halc_adm1_code VARCHAR(20), -- HALC ADM1 territory code (e.g., "VI") + halc_adm2_name VARCHAR(255), -- HALC ADM2 district name + wikidata_id VARCHAR(20), + + -- Location context + modern_country_iso_a2 CHAR(2), -- Modern country this is within + + -- Names + territory_name VARCHAR(255) NOT NULL, + territory_name_historical VARCHAR(255), -- Name in historical period + territory_type VARCHAR(100), -- e.g., "County", "Duchy", "Lordship" + + -- Temporal extent + period_start SMALLINT, -- Year (e.g., 1500) + period_end SMALLINT, -- Year (e.g., 1795) + period_description VARCHAR(255), -- e.g., "Ancien Regime", "Medieval" + + -- Geometry + geom GEOMETRY(MULTIPOLYGON, 4326), + centroid GEOMETRY(POINT, 4326), + bbox BOX2D, + + -- Metadata + source_id INTEGER REFERENCES boundary_data_sources(id), + source_feature_id VARCHAR(100), + area_km2 NUMERIC, + certainty VARCHAR(50), -- "HIGH", "MEDIUM", "LOW", "APPROXIMATE" + notes TEXT, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + UNIQUE(historical_id, source_id) +); + +CREATE INDEX IF NOT EXISTS idx_boundary_historical_halc ON boundary_historical(halc_adm1_code); +CREATE INDEX IF NOT EXISTS idx_boundary_historical_country ON boundary_historical(modern_country_iso_a2); +CREATE INDEX IF NOT EXISTS idx_boundary_historical_period ON boundary_historical(period_start, period_end); +CREATE INDEX IF NOT EXISTS idx_boundary_historical_geom ON boundary_historical USING GIST(geom); + +-- ============================================================================ +-- Custodian Service Areas (Computed/Custom) +-- ============================================================================ +-- Stores pre-computed or custom service area geometries for heritage custodians + +CREATE TABLE IF NOT EXISTS custodian_service_areas ( + id SERIAL PRIMARY KEY, + + -- Custodian link (GHCID) + ghcid VARCHAR(100) NOT NULL, -- e.g., "NL-NH-HAA-A-NHA" + ghcid_uuid UUID, -- UUID v5 of GHCID + + -- Service area identity + service_area_name VARCHAR(255) NOT NULL, + service_area_type VARCHAR(50) NOT NULL, -- From ServiceAreaTypeEnum + + -- Geometry (can be computed union or custom-drawn) + geom GEOMETRY(MULTIPOLYGON, 4326), + centroid GEOMETRY(POINT, 4326), + bbox BOX2D, + + -- Composition (which admin units make up this service area) + admin2_ids INTEGER[], -- Array of boundary_admin2.id + admin1_ids INTEGER[], -- Array of boundary_admin1.id + historical_ids INTEGER[], -- Array of boundary_historical.id + + -- Properties + is_historical BOOLEAN DEFAULT FALSE, + is_custom BOOLEAN DEFAULT FALSE, -- True if hand-drawn, not computed from admin units + + -- Temporal extent + valid_from DATE, + valid_to DATE, -- NULL = current + + -- Display properties + display_fill_color VARCHAR(20) DEFAULT '#3498db', + display_fill_opacity NUMERIC(3,2) DEFAULT 0.20, + display_line_color VARCHAR(20) DEFAULT '#2980b9', + display_line_width NUMERIC(3,1) DEFAULT 2.0, + + -- Metadata + source_dataset VARCHAR(100), -- Data provenance + notes TEXT, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + UNIQUE(ghcid, valid_to) +); + +CREATE INDEX IF NOT EXISTS idx_custodian_sa_ghcid ON custodian_service_areas(ghcid); +CREATE INDEX IF NOT EXISTS idx_custodian_sa_uuid ON custodian_service_areas(ghcid_uuid); +CREATE INDEX IF NOT EXISTS idx_custodian_sa_geom ON custodian_service_areas USING GIST(geom); +CREATE INDEX IF NOT EXISTS idx_custodian_sa_type ON custodian_service_areas(service_area_type); +CREATE INDEX IF NOT EXISTS idx_custodian_sa_historical ON custodian_service_areas(is_historical); + +-- ============================================================================ +-- GeoNames Settlements (for reverse geocoding and linking) +-- ============================================================================ +-- Subset of GeoNames populated places for settlement lookups + +CREATE TABLE IF NOT EXISTS geonames_settlements ( + geonames_id INTEGER PRIMARY KEY, + + -- Names + name VARCHAR(200) NOT NULL, + ascii_name VARCHAR(200), + alternate_names TEXT, -- Comma-separated + + -- Location + latitude NUMERIC(10, 6) NOT NULL, + longitude NUMERIC(10, 6) NOT NULL, + geom GEOMETRY(POINT, 4326), -- Computed from lat/lon + + -- Classification + feature_class CHAR(1), -- 'P' = populated place + feature_code VARCHAR(10), -- 'PPL', 'PPLA', 'PPLC', etc. + + -- Administrative hierarchy + country_code CHAR(2), + admin1_code VARCHAR(20), -- GeoNames admin1 + admin2_code VARCHAR(80), + admin3_code VARCHAR(20), + admin4_code VARCHAR(20), + + -- Metadata + population INTEGER, + elevation INTEGER, + timezone VARCHAR(40), + modification_date DATE, + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_geonames_country ON geonames_settlements(country_code); +CREATE INDEX IF NOT EXISTS idx_geonames_admin1 ON geonames_settlements(country_code, admin1_code); +CREATE INDEX IF NOT EXISTS idx_geonames_feature ON geonames_settlements(feature_class, feature_code); +CREATE INDEX IF NOT EXISTS idx_geonames_geom ON geonames_settlements USING GIST(geom); +CREATE INDEX IF NOT EXISTS idx_geonames_name ON geonames_settlements(name); +CREATE INDEX IF NOT EXISTS idx_geonames_ascii ON geonames_settlements(ascii_name); +CREATE INDEX IF NOT EXISTS idx_geonames_population ON geonames_settlements(population DESC); + +-- ============================================================================ +-- Spatial Functions +-- ============================================================================ + +-- Function: Find admin units containing a point +CREATE OR REPLACE FUNCTION find_admin_for_point( + p_lon NUMERIC, + p_lat NUMERIC, + p_country_code CHAR(2) DEFAULT NULL +) +RETURNS TABLE ( + admin_level INTEGER, + admin_code VARCHAR, + admin_name VARCHAR, + iso_code VARCHAR, + geonames_id INTEGER +) AS $$ +BEGIN + RETURN QUERY + + -- Admin Level 1 + SELECT + 1::INTEGER AS admin_level, + a.admin1_code, + a.admin1_name, + a.iso_3166_2, + a.geonames_id + FROM boundary_admin1 a + JOIN boundary_countries c ON a.country_id = c.id + WHERE ST_Contains(a.geom, ST_SetSRID(ST_Point(p_lon, p_lat), 4326)) + AND a.valid_to IS NULL + AND (p_country_code IS NULL OR c.iso_a2 = p_country_code) + + UNION ALL + + -- Admin Level 2 + SELECT + 2::INTEGER, + a.admin2_code, + a.admin2_name, + a.nuts_code, + a.geonames_id + FROM boundary_admin2 a + JOIN boundary_countries c ON a.country_id = c.id + WHERE ST_Contains(a.geom, ST_SetSRID(ST_Point(p_lon, p_lat), 4326)) + AND a.valid_to IS NULL + AND (p_country_code IS NULL OR c.iso_a2 = p_country_code) + + ORDER BY admin_level; +END; +$$ LANGUAGE plpgsql; + +-- Function: Find nearest settlement +CREATE OR REPLACE FUNCTION find_nearest_settlement( + p_lon NUMERIC, + p_lat NUMERIC, + p_country_code CHAR(2) DEFAULT NULL, + p_max_distance_km NUMERIC DEFAULT 50 +) +RETURNS TABLE ( + geonames_id INTEGER, + name VARCHAR, + feature_code VARCHAR, + population INTEGER, + distance_km NUMERIC +) AS $$ +DECLARE + v_point GEOMETRY; +BEGIN + v_point := ST_SetSRID(ST_Point(p_lon, p_lat), 4326); + + RETURN QUERY + SELECT + g.geonames_id, + g.name, + g.feature_code, + g.population, + ROUND((ST_Distance(g.geom::geography, v_point::geography) / 1000)::NUMERIC, 2) AS distance_km + FROM geonames_settlements g + WHERE g.feature_class = 'P' + AND g.feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG') + AND (p_country_code IS NULL OR g.country_code = p_country_code) + AND ST_DWithin(g.geom::geography, v_point::geography, p_max_distance_km * 1000) + ORDER BY g.geom <-> v_point + LIMIT 5; +END; +$$ LANGUAGE plpgsql; + +-- Function: Compute service area from admin units +CREATE OR REPLACE FUNCTION compute_service_area_geometry( + p_admin2_ids INTEGER[], + p_admin1_ids INTEGER[] DEFAULT NULL +) +RETURNS GEOMETRY AS $$ +DECLARE + v_geom GEOMETRY; +BEGIN + -- Union all admin2 geometries + IF p_admin2_ids IS NOT NULL AND array_length(p_admin2_ids, 1) > 0 THEN + SELECT ST_Union(geom) INTO v_geom + FROM boundary_admin2 + WHERE id = ANY(p_admin2_ids) + AND valid_to IS NULL; + END IF; + + -- Add admin1 geometries if specified + IF p_admin1_ids IS NOT NULL AND array_length(p_admin1_ids, 1) > 0 THEN + SELECT ST_Union(COALESCE(v_geom, 'GEOMETRYCOLLECTION EMPTY'::GEOMETRY), ST_Union(geom)) + INTO v_geom + FROM boundary_admin1 + WHERE id = ANY(p_admin1_ids) + AND valid_to IS NULL; + END IF; + + RETURN v_geom; +END; +$$ LANGUAGE plpgsql; + +-- Function: Get service area GeoJSON for a custodian +CREATE OR REPLACE FUNCTION get_custodian_service_area_geojson( + p_ghcid VARCHAR, + p_include_historical BOOLEAN DEFAULT FALSE +) +RETURNS JSONB AS $$ +DECLARE + v_result JSONB; +BEGIN + SELECT jsonb_build_object( + 'type', 'FeatureCollection', + 'features', COALESCE(jsonb_agg( + jsonb_build_object( + 'type', 'Feature', + 'geometry', ST_AsGeoJSON(sa.geom)::jsonb, + 'properties', jsonb_build_object( + 'ghcid', sa.ghcid, + 'name', sa.service_area_name, + 'type', sa.service_area_type, + 'is_historical', sa.is_historical, + 'valid_from', sa.valid_from, + 'valid_to', sa.valid_to, + 'fill_color', sa.display_fill_color, + 'fill_opacity', sa.display_fill_opacity, + 'line_color', sa.display_line_color, + 'line_width', sa.display_line_width + ) + ) + ), '[]'::jsonb) + ) + INTO v_result + FROM custodian_service_areas sa + WHERE sa.ghcid = p_ghcid + AND (p_include_historical OR sa.is_historical = FALSE); + + RETURN v_result; +END; +$$ LANGUAGE plpgsql; + +-- ============================================================================ +-- Views for API Access +-- ============================================================================ + +-- Current admin1 boundaries +CREATE OR REPLACE VIEW v_current_admin1 AS +SELECT + a.id, + c.iso_a2 AS country_code, + a.iso_3166_2, + a.admin1_code, + a.admin1_name, + a.admin1_type, + a.geonames_id, + a.wikidata_id, + ST_AsGeoJSON(a.geom)::jsonb AS geometry, + ST_AsGeoJSON(a.centroid)::jsonb AS centroid, + a.area_km2, + s.source_code AS data_source +FROM boundary_admin1 a +JOIN boundary_countries c ON a.country_id = c.id +LEFT JOIN boundary_data_sources s ON a.source_id = s.id +WHERE a.valid_to IS NULL AND c.valid_to IS NULL; + +-- Current admin2 boundaries +CREATE OR REPLACE VIEW v_current_admin2 AS +SELECT + a2.id, + c.iso_a2 AS country_code, + a1.iso_3166_2 AS admin1_iso, + a2.admin2_code, + a2.admin2_name, + a2.admin2_type, + a2.cbs_gemeente_code, + a2.nuts_code, + a2.lau_code, + a2.geonames_id, + a2.wikidata_id, + ST_AsGeoJSON(a2.geom)::jsonb AS geometry, + ST_AsGeoJSON(a2.centroid)::jsonb AS centroid, + a2.area_km2, + a2.population, + s.source_code AS data_source +FROM boundary_admin2 a2 +JOIN boundary_admin1 a1 ON a2.admin1_id = a1.id +JOIN boundary_countries c ON a2.country_id = c.id +LEFT JOIN boundary_data_sources s ON a2.source_id = s.id +WHERE a2.valid_to IS NULL AND a1.valid_to IS NULL AND c.valid_to IS NULL; + +-- Custodian service areas +CREATE OR REPLACE VIEW v_custodian_service_areas AS +SELECT + sa.ghcid, + sa.ghcid_uuid, + sa.service_area_name, + sa.service_area_type, + sa.is_historical, + sa.is_custom, + sa.valid_from, + sa.valid_to, + ST_AsGeoJSON(sa.geom)::jsonb AS geometry, + ST_AsGeoJSON(sa.centroid)::jsonb AS centroid, + sa.display_fill_color, + sa.display_fill_opacity, + sa.display_line_color, + sa.display_line_width, + sa.source_dataset, + sa.notes +FROM custodian_service_areas sa +WHERE sa.valid_to IS NULL; + +-- ============================================================================ +-- Statistics +-- ============================================================================ + +CREATE OR REPLACE VIEW v_boundary_stats AS +SELECT + 'countries' AS table_name, + COUNT(*) AS total_rows, + COUNT(*) FILTER (WHERE valid_to IS NULL) AS current_rows +FROM boundary_countries +UNION ALL +SELECT 'admin1', COUNT(*), COUNT(*) FILTER (WHERE valid_to IS NULL) FROM boundary_admin1 +UNION ALL +SELECT 'admin2', COUNT(*), COUNT(*) FILTER (WHERE valid_to IS NULL) FROM boundary_admin2 +UNION ALL +SELECT 'historical', COUNT(*), COUNT(*) FROM boundary_historical +UNION ALL +SELECT 'service_areas', COUNT(*), COUNT(*) FILTER (WHERE valid_to IS NULL) FROM custodian_service_areas +UNION ALL +SELECT 'geonames_settlements', COUNT(*), COUNT(*) FROM geonames_settlements; + +-- Country coverage summary +CREATE OR REPLACE VIEW v_boundary_coverage AS +SELECT + c.iso_a2, + c.country_name, + COUNT(DISTINCT a1.id) AS admin1_count, + COUNT(DISTINCT a2.id) AS admin2_count, + (SELECT COUNT(*) FROM geonames_settlements g WHERE g.country_code = c.iso_a2) AS settlement_count, + (SELECT COUNT(*) FROM custodian_service_areas sa WHERE sa.ghcid LIKE c.iso_a2 || '-%') AS service_area_count +FROM boundary_countries c +LEFT JOIN boundary_admin1 a1 ON a1.country_id = c.id AND a1.valid_to IS NULL +LEFT JOIN boundary_admin2 a2 ON a2.country_id = c.id AND a2.valid_to IS NULL +WHERE c.valid_to IS NULL +GROUP BY c.id, c.iso_a2, c.country_name +ORDER BY c.country_name; + +-- ============================================================================ +-- Permissions +-- ============================================================================ + +GRANT SELECT ON ALL TABLES IN SCHEMA public TO glam_api; +GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO glam_api; +GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO glam_api; + +-- ============================================================================ +-- Table Comments +-- ============================================================================ + +COMMENT ON TABLE boundary_data_sources IS 'Registry of data sources for boundary data (GADM, CBS, OSM, etc.)'; +COMMENT ON TABLE boundary_countries IS 'Country boundaries with ISO 3166-1 codes'; +COMMENT ON TABLE boundary_admin1 IS 'First-level administrative divisions (states, provinces, regions)'; +COMMENT ON TABLE boundary_admin2 IS 'Second-level administrative divisions (municipalities, districts, counties)'; +COMMENT ON TABLE boundary_historical IS 'Historical administrative boundaries (e.g., HALC 1500 data)'; +COMMENT ON TABLE custodian_service_areas IS 'Pre-computed or custom service area geometries for heritage custodians'; +COMMENT ON TABLE geonames_settlements IS 'GeoNames populated places for settlement lookups'; + +COMMENT ON FUNCTION find_admin_for_point IS 'Find administrative units containing a geographic point'; +COMMENT ON FUNCTION find_nearest_settlement IS 'Find nearest GeoNames settlement to a point'; +COMMENT ON FUNCTION compute_service_area_geometry IS 'Compute union geometry from admin unit IDs'; +COMMENT ON FUNCTION get_custodian_service_area_geojson IS 'Get GeoJSON for a custodian service area'; diff --git a/scripts/load_boundaries_postgis.py b/scripts/load_boundaries_postgis.py new file mode 100644 index 0000000000..e04e1ced1b --- /dev/null +++ b/scripts/load_boundaries_postgis.py @@ -0,0 +1,552 @@ +#!/usr/bin/env python3 +""" +Load International Administrative Boundaries into PostGIS + +Supports multiple data sources: +- GADM (Global Administrative Areas) - Primary global source +- Natural Earth - Simplified global boundaries +- CBS (Netherlands) - Official Dutch municipality boundaries +- OSM Boundaries - OpenStreetMap extracts + +Usage: + python load_boundaries_postgis.py --source gadm --country NL + python load_boundaries_postgis.py --source cbs + python load_boundaries_postgis.py --source natural_earth --level 1 + python load_boundaries_postgis.py --countries JP,CZ,DE,BE --source gadm +""" + +import argparse +import json +import logging +import os +import sys +from datetime import date +from pathlib import Path +from typing import Optional, List, Dict, Any +from urllib.request import urlretrieve +import zipfile +import tempfile + +# Third-party imports +try: + import psycopg2 + from psycopg2.extras import execute_values + import geopandas as gpd + from shapely import wkt + from shapely.geometry import shape, mapping +except ImportError as e: + print(f"Missing dependency: {e}") + print("Install with: pip install psycopg2-binary geopandas shapely") + sys.exit(1) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# ============================================================================ +# Configuration +# ============================================================================ + +# Database connection (from environment or defaults) +DB_HOST = os.getenv('POSTGRES_HOST', 'localhost') +DB_PORT = os.getenv('POSTGRES_PORT', '5432') +DB_NAME = os.getenv('POSTGRES_DB', 'glam') +DB_USER = os.getenv('POSTGRES_USER', 'glam_api') +DB_PASSWORD = os.getenv('POSTGRES_PASSWORD', '') + +# Data source URLs +GADM_BASE_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_0.json" +GADM_ADM1_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_1.json" +GADM_ADM2_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{iso3}_2.json" + +NATURAL_EARTH_URL = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip" + +# ISO 3166-1 alpha-2 to alpha-3 mapping for common countries +ISO_A2_TO_A3 = { + 'NL': 'NLD', 'DE': 'DEU', 'BE': 'BEL', 'FR': 'FRA', 'GB': 'GBR', + 'JP': 'JPN', 'CZ': 'CZE', 'CH': 'CHE', 'AT': 'AUT', 'PL': 'POL', + 'IT': 'ITA', 'ES': 'ESP', 'PT': 'PRT', 'DK': 'DNK', 'SE': 'SWE', + 'NO': 'NOR', 'FI': 'FIN', 'IE': 'IRL', 'LU': 'LUX', 'BR': 'BRA', + 'MX': 'MEX', 'AR': 'ARG', 'CL': 'CHL', 'CO': 'COL', 'EG': 'EGY', + 'PS': 'PSE', 'IL': 'ISR', 'DZ': 'DZA', 'AU': 'AUS', 'NZ': 'NZL', + 'KR': 'KOR', 'TW': 'TWN', 'CN': 'CHN', 'IN': 'IND', 'RU': 'RUS', + 'UA': 'UKR', 'TR': 'TUR', 'GR': 'GRC', 'BG': 'BGR', 'RO': 'ROU', + 'HU': 'HUN', 'SK': 'SVK', 'SI': 'SVN', 'HR': 'HRV', 'RS': 'SRB', + 'US': 'USA', 'CA': 'CAN', 'ZA': 'ZAF', +} + + +# ============================================================================ +# Database Functions +# ============================================================================ + +def get_db_connection(): + """Get PostgreSQL connection with PostGIS.""" + return psycopg2.connect( + host=DB_HOST, + port=DB_PORT, + dbname=DB_NAME, + user=DB_USER, + password=DB_PASSWORD + ) + + +def get_source_id(conn, source_code: str) -> int: + """Get or create data source ID.""" + with conn.cursor() as cur: + cur.execute( + "SELECT id FROM boundary_data_sources WHERE source_code = %s", + (source_code,) + ) + row = cur.fetchone() + if row: + return row[0] + # Source not found - this shouldn't happen if schema is properly initialized + raise ValueError(f"Unknown data source: {source_code}") + + +def get_or_create_country(conn, iso_a2: str, iso_a3: str, name: str, + geom_wkt: str, source_id: int) -> int: + """Get or create country record, return ID.""" + with conn.cursor() as cur: + # Check if exists + cur.execute( + """SELECT id FROM boundary_countries + WHERE iso_a2 = %s AND valid_to IS NULL""", + (iso_a2,) + ) + row = cur.fetchone() + if row: + return row[0] + + # Insert new + cur.execute( + """INSERT INTO boundary_countries + (iso_a2, iso_a3, country_name, geom, centroid, source_id, valid_from) + VALUES (%s, %s, %s, ST_GeomFromText(%s, 4326), + ST_Centroid(ST_GeomFromText(%s, 4326)), %s, CURRENT_DATE) + RETURNING id""", + (iso_a2, iso_a3, name, geom_wkt, geom_wkt, source_id) + ) + return cur.fetchone()[0] + + +# ============================================================================ +# GADM Loader +# ============================================================================ + +def download_gadm(iso_a3: str, level: int, cache_dir: Path) -> Optional[Path]: + """Download GADM GeoJSON for a country and admin level.""" + cache_dir.mkdir(parents=True, exist_ok=True) + + if level == 0: + url = GADM_BASE_URL.format(iso3=iso_a3) + elif level == 1: + url = GADM_ADM1_URL.format(iso3=iso_a3) + elif level == 2: + url = GADM_ADM2_URL.format(iso3=iso_a3) + else: + logger.error(f"Unsupported GADM level: {level}") + return None + + filename = f"gadm41_{iso_a3}_{level}.json" + filepath = cache_dir / filename + + if filepath.exists(): + logger.info(f"Using cached: {filepath}") + return filepath + + logger.info(f"Downloading: {url}") + try: + urlretrieve(url, filepath) + return filepath + except Exception as e: + logger.error(f"Failed to download {url}: {e}") + return None + + +def load_gadm_country(conn, iso_a2: str, cache_dir: Path) -> Dict[str, int]: + """Load GADM data for a country (levels 0, 1, 2).""" + iso_a3 = ISO_A2_TO_A3.get(iso_a2) + if not iso_a3: + logger.error(f"Unknown country code: {iso_a2}") + return {'admin0': 0, 'admin1': 0, 'admin2': 0} + + source_id = get_source_id(conn, 'GADM') + counts = {'admin0': 0, 'admin1': 0, 'admin2': 0} + + # Level 0 - Country boundary + level0_file = download_gadm(iso_a3, 0, cache_dir) + if level0_file: + gdf = gpd.read_file(level0_file) + for _, row in gdf.iterrows(): + geom_wkt = row.geometry.wkt + country_name = row.get('COUNTRY') or row.get('NAME_0') or iso_a2 + country_id = get_or_create_country( + conn, iso_a2, iso_a3, + str(country_name), + geom_wkt, source_id + ) + counts['admin0'] = 1 + logger.info(f"Loaded country: {iso_a2} (id={country_id})") + + # Level 1 - Admin1 (states/provinces) + level1_file = download_gadm(iso_a3, 1, cache_dir) + if level1_file: + gdf = gpd.read_file(level1_file) + + # Get country ID + with conn.cursor() as cur: + cur.execute( + "SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL", + (iso_a2,) + ) + country_row = cur.fetchone() + if not country_row: + logger.warning(f"Country {iso_a2} not found, skipping admin1") + else: + country_id = country_row[0] + + for _, row in gdf.iterrows(): + admin1_code = row.get('GID_1', row.get('HASC_1', '')) + admin1_name = row.get('NAME_1', '') + admin1_type = row.get('TYPE_1', 'Region') + geom_wkt = row.geometry.wkt + + # Generate ISO 3166-2 code + iso_3166_2 = f"{iso_a2}-{admin1_code.split('.')[-1][:3].upper()}" if admin1_code else None + + cur.execute( + """INSERT INTO boundary_admin1 + (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type, + geom, centroid, source_id, source_feature_id, valid_from) + VALUES (%s, %s, %s, %s, %s, + ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), + %s, %s, CURRENT_DATE) + ON CONFLICT (country_id, admin1_code, valid_to) DO UPDATE + SET admin1_name = EXCLUDED.admin1_name, + geom = EXCLUDED.geom, + updated_at = NOW()""", + (country_id, iso_3166_2, admin1_code, admin1_name, admin1_type, + geom_wkt, geom_wkt, source_id, row.get('GID_1')) + ) + counts['admin1'] += 1 + + conn.commit() + logger.info(f"Loaded {counts['admin1']} admin1 regions for {iso_a2}") + + # Level 2 - Admin2 (municipalities/districts) + level2_file = download_gadm(iso_a3, 2, cache_dir) + if level2_file: + gdf = gpd.read_file(level2_file) + + with conn.cursor() as cur: + # Get country ID + cur.execute( + "SELECT id FROM boundary_countries WHERE iso_a2 = %s AND valid_to IS NULL", + (iso_a2,) + ) + country_row = cur.fetchone() + if not country_row: + logger.warning(f"Country {iso_a2} not found, skipping admin2") + else: + country_id = country_row[0] + + for _, row in gdf.iterrows(): + admin1_code = row.get('GID_1', '') + admin2_code = row.get('GID_2', row.get('HASC_2', '')) + admin2_name = row.get('NAME_2', '') + admin2_type = row.get('TYPE_2', 'Municipality') + geom_wkt = row.geometry.wkt + + # Find admin1 ID + cur.execute( + """SELECT id FROM boundary_admin1 + WHERE country_id = %s AND admin1_code = %s AND valid_to IS NULL""", + (country_id, admin1_code) + ) + admin1_row = cur.fetchone() + admin1_id = admin1_row[0] if admin1_row else None + + cur.execute( + """INSERT INTO boundary_admin2 + (admin1_id, country_id, admin2_code, admin2_name, admin2_type, + geom, centroid, source_id, source_feature_id, valid_from) + VALUES (%s, %s, %s, %s, %s, + ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), + %s, %s, CURRENT_DATE) + ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE + SET admin2_name = EXCLUDED.admin2_name, + geom = EXCLUDED.geom, + updated_at = NOW()""", + (admin1_id, country_id, admin2_code, admin2_name, admin2_type, + geom_wkt, geom_wkt, source_id, row.get('GID_2')) + ) + counts['admin2'] += 1 + + conn.commit() + logger.info(f"Loaded {counts['admin2']} admin2 regions for {iso_a2}") + + return counts + + +# ============================================================================ +# CBS Netherlands Loader +# ============================================================================ + +def load_cbs_netherlands(conn, geojson_path: Path) -> Dict[str, int]: + """Load CBS municipality boundaries for Netherlands.""" + source_id = get_source_id(conn, 'CBS') + counts = {'admin2': 0} + + if not geojson_path.exists(): + logger.error(f"CBS GeoJSON not found: {geojson_path}") + return counts + + gdf = gpd.read_file(geojson_path) + logger.info(f"Loaded {len(gdf)} features from CBS data") + + with conn.cursor() as cur: + # Ensure Netherlands country exists + cur.execute( + "SELECT id FROM boundary_countries WHERE iso_a2 = 'NL' AND valid_to IS NULL" + ) + country_row = cur.fetchone() + if not country_row: + logger.warning("Netherlands not found in countries table, creating...") + cur.execute( + """INSERT INTO boundary_countries (iso_a2, iso_a3, country_name, valid_from, source_id) + VALUES ('NL', 'NLD', 'Netherlands', CURRENT_DATE, %s) + RETURNING id""", + (source_id,) + ) + country_id = cur.fetchone()[0] + else: + country_id = country_row[0] + + for _, row in gdf.iterrows(): + # CBS field names vary - try common patterns + gemeente_code = row.get('gemeentecode', row.get('statcode', row.get('GM_CODE', ''))) + gemeente_name = row.get('gemeentenaam', row.get('statnaam', row.get('GM_NAAM', ''))) + + if not gemeente_name: + continue + + geom_wkt = row.geometry.wkt + + # Determine admin1 (province) from gemeente code or name + # CBS codes: GM0363 = Amsterdam (NH), GM0518 = 's-Gravenhage (ZH) + # For now, leave admin1_id as NULL - can be enriched later + + cur.execute( + """INSERT INTO boundary_admin2 + (admin1_id, country_id, admin2_code, cbs_gemeente_code, admin2_name, + admin2_type, geom, centroid, source_id, valid_from) + VALUES (NULL, %s, %s, %s, %s, 'Municipality', + ST_GeomFromText(%s, 4326), ST_Centroid(ST_GeomFromText(%s, 4326)), + %s, CURRENT_DATE) + ON CONFLICT (admin1_id, admin2_code, valid_to) DO UPDATE + SET admin2_name = EXCLUDED.admin2_name, + cbs_gemeente_code = EXCLUDED.cbs_gemeente_code, + geom = EXCLUDED.geom, + updated_at = NOW()""", + (country_id, gemeente_code, gemeente_code, gemeente_name, + geom_wkt, geom_wkt, source_id) + ) + counts['admin2'] += 1 + + conn.commit() + logger.info(f"Loaded {counts['admin2']} CBS municipalities for Netherlands") + + return counts + + +# ============================================================================ +# GeoNames Settlements Loader +# ============================================================================ + +def load_geonames_settlements(conn, tsv_path: Path, country_codes: Optional[List[str]] = None) -> int: + """Load GeoNames settlements from allCountries.txt or country-specific file.""" + if not tsv_path.exists(): + logger.error(f"GeoNames file not found: {tsv_path}") + return 0 + + count = 0 + batch = [] + batch_size = 5000 + + logger.info(f"Loading GeoNames settlements from {tsv_path}") + + with open(tsv_path, 'r', encoding='utf-8') as f: + for line in f: + parts = line.strip().split('\t') + if len(parts) < 19: + continue + + geonames_id = int(parts[0]) + name = parts[1] + ascii_name = parts[2] + alternate_names = parts[3] + latitude = float(parts[4]) + longitude = float(parts[5]) + feature_class = parts[6] + feature_code = parts[7] + country_code = parts[8] + admin1_code = parts[10] + admin2_code = parts[11] + admin3_code = parts[12] + admin4_code = parts[13] + population = int(parts[14]) if parts[14] else 0 + elevation = int(parts[15]) if parts[15] and parts[15] != '' else None + timezone = parts[17] + modification_date = parts[18] + + # Filter by country if specified + if country_codes and country_code not in country_codes: + continue + + # Only load populated places (P class) + if feature_class != 'P': + continue + + # Filter to main settlement types + if feature_code not in ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG'): + continue + + batch.append(( + geonames_id, name, ascii_name, alternate_names, + latitude, longitude, + feature_class, feature_code, + country_code, admin1_code, admin2_code, admin3_code, admin4_code, + population, elevation, timezone, modification_date + )) + + if len(batch) >= batch_size: + _insert_geonames_batch(conn, batch) + count += len(batch) + logger.info(f"Loaded {count} settlements...") + batch = [] + + # Final batch + if batch: + _insert_geonames_batch(conn, batch) + count += len(batch) + + logger.info(f"Loaded {count} GeoNames settlements total") + return count + + +def _insert_geonames_batch(conn, batch: List[tuple]): + """Insert batch of GeoNames settlements.""" + with conn.cursor() as cur: + execute_values( + cur, + """INSERT INTO geonames_settlements + (geonames_id, name, ascii_name, alternate_names, + latitude, longitude, geom, + feature_class, feature_code, + country_code, admin1_code, admin2_code, admin3_code, admin4_code, + population, elevation, timezone, modification_date) + VALUES %s + ON CONFLICT (geonames_id) DO UPDATE + SET name = EXCLUDED.name, + population = EXCLUDED.population, + modification_date = EXCLUDED.modification_date""", + [( + r[0], r[1], r[2], r[3], + r[4], r[5], f"SRID=4326;POINT({r[5]} {r[4]})", + r[6], r[7], + r[8], r[9], r[10], r[11], r[12], + r[13], r[14], r[15], r[16] + ) for r in batch], + template="""(%s, %s, %s, %s, %s, %s, ST_GeomFromEWKT(%s), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" + ) + conn.commit() + + +# ============================================================================ +# Main +# ============================================================================ + +def main(): + parser = argparse.ArgumentParser(description='Load administrative boundaries into PostGIS') + parser.add_argument('--source', choices=['gadm', 'cbs', 'natural_earth', 'geonames'], + required=True, help='Data source to load') + parser.add_argument('--country', type=str, help='Single country ISO-2 code (e.g., NL)') + parser.add_argument('--countries', type=str, help='Comma-separated country codes (e.g., NL,DE,BE)') + parser.add_argument('--file', type=str, help='Path to data file (for CBS, GeoNames)') + parser.add_argument('--cache-dir', type=str, default='./data/boundary_cache', + help='Directory for caching downloaded files') + parser.add_argument('--dry-run', action='store_true', help='Show what would be done') + + args = parser.parse_args() + + # Determine countries to load + countries = [] + if args.country: + countries = [args.country.upper()] + elif args.countries: + countries = [c.strip().upper() for c in args.countries.split(',')] + + cache_dir = Path(args.cache_dir) + + if args.dry_run: + logger.info(f"DRY RUN - Source: {args.source}, Countries: {countries or 'all'}") + return + + # Connect to database + try: + conn = get_db_connection() + logger.info(f"Connected to PostgreSQL: {DB_HOST}:{DB_PORT}/{DB_NAME}") + except Exception as e: + logger.error(f"Failed to connect to database: {e}") + sys.exit(1) + + try: + if args.source == 'gadm': + if not countries: + logger.error("--country or --countries required for GADM source") + sys.exit(1) + + total_counts = {'admin0': 0, 'admin1': 0, 'admin2': 0} + for iso_a2 in countries: + logger.info(f"Loading GADM data for {iso_a2}...") + counts = load_gadm_country(conn, iso_a2, cache_dir) + for k, v in counts.items(): + total_counts[k] += v + + logger.info(f"GADM load complete: {total_counts}") + + elif args.source == 'cbs': + if not args.file: + # Default path + default_path = Path('frontend/public/data/netherlands_municipalities.geojson') + if default_path.exists(): + args.file = str(default_path) + else: + logger.error("--file required for CBS source (or place file at frontend/public/data/netherlands_municipalities.geojson)") + sys.exit(1) + + counts = load_cbs_netherlands(conn, Path(args.file)) + logger.info(f"CBS load complete: {counts}") + + elif args.source == 'geonames': + if not args.file: + logger.error("--file required for GeoNames source (path to allCountries.txt or country file)") + sys.exit(1) + + count = load_geonames_settlements(conn, Path(args.file), countries or None) + logger.info(f"GeoNames load complete: {count} settlements") + + elif args.source == 'natural_earth': + logger.error("Natural Earth loader not yet implemented") + sys.exit(1) + + finally: + conn.close() + + +if __name__ == '__main__': + main()