#!/usr/bin/env python3
"""
Migrate GeoJSON files to PostGIS database.

This script loads:
1. Netherlands provinces (netherlands_provinces.geojson)
2. Netherlands municipalities (netherlands_municipalities_simplified.geojson)
3. Historical boundaries (netherlands_historical_*.geojson)
4. NDE institutions (nde_institutions.json)

Usage:
    python migrate_geojson_to_postgis.py --host localhost --db glam_geo --data-dir ../../frontend/public/data

Requirements:
    pip install psycopg2-binary
"""

import argparse
import json
import logging
import os
import sys
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional

# Try to import psycopg2
try:
    import psycopg2
    from psycopg2.extras import execute_values, Json
except ImportError:
    print("Error: psycopg2 not installed. Run: pip install psycopg2-binary")
    sys.exit(1)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Province code mapping from CBS statcode to ISO 3166-2
PROVINCE_ISO_CODES = {
    "PV20": "GR",  # Groningen
    "PV21": "FR",  # Fryslân
    "PV22": "DR",  # Drenthe
    "PV23": "OV",  # Overijssel
    "PV24": "FL",  # Flevoland
    "PV25": "GE",  # Gelderland
    "PV26": "UT",  # Utrecht
    "PV27": "NH",  # Noord-Holland
    "PV28": "ZH",  # Zuid-Holland
    "PV29": "ZE",  # Zeeland
    "PV30": "NB",  # Noord-Brabant
    "PV31": "LI",  # Limburg
}

# Province name to code mapping
PROVINCE_NAME_TO_CODE = {
    "Groningen": "GR",
    "Friesland": "FR",
    "Fryslân": "FR",
    "Drenthe": "DR",
    "Overijssel": "OV",
    "Flevoland": "FL",
    "Gelderland": "GE",
    "Utrecht": "UT",
    "Noord-Holland": "NH",
    "Zuid-Holland": "ZH",
    "Zeeland": "ZE",
    "Noord-Brabant": "NB",
    "Limburg": "LI",
}


class GeoJSONMigrator:
    """Migrate GeoJSON files to PostGIS."""

    def __init__(self, host: str, port: int, database: str, user: str, password: str):
        self.conn_params = {
            "host": host,
            "port": port,
            "database": database,
            "user": user,
            "password": password,
        }
        self.conn = None
        # Maps province_code, iso_code AND name -> provinces.id, so later
        # stages can resolve a province from whichever identifier they have.
        self.province_id_map: Dict[str, int] = {}

    def connect(self) -> None:
        """Connect to the database."""
        logger.info(f"Connecting to {self.conn_params['database']}@{self.conn_params['host']}...")
        self.conn = psycopg2.connect(**self.conn_params)
        # Explicit transactions: each migrate_* method commits its own batch.
        self.conn.autocommit = False
        logger.info("Connected successfully")

    def close(self) -> None:
        """Close database connection."""
        if self.conn:
            self.conn.close()
            logger.info("Connection closed")

    def load_geojson(self, filepath: Path) -> Dict[str, Any]:
        """Load and parse a GeoJSON file."""
        logger.info(f"Loading {filepath}...")
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data.get('features', []))} features")
        return data

    def migrate_provinces(self, data_dir: Path) -> None:
        """Load provinces from GeoJSON.

        Populates self.province_id_map (keyed by statcode, ISO code and name)
        for use by the municipality and institution migrations.
        """
        filepath = data_dir / "netherlands_provinces.geojson"
        if not filepath.exists():
            logger.warning(f"Provinces file not found: {filepath}")
            return

        geojson = self.load_geojson(filepath)
        cursor = self.conn.cursor()

        # Clear existing data
        cursor.execute("TRUNCATE provinces CASCADE")

        for feature in geojson.get('features', []):
            props = feature.get('properties', {})
            geom = feature.get('geometry')

            province_code = props.get('statcode', '')  # e.g., "PV27"
            name = props.get('statnaam', '')
            iso_code = PROVINCE_ISO_CODES.get(province_code, '')

            if not province_code or not name:
                continue

            # Area is computed in EPSG:28992 (RD New, metres) so the m²→km²
            # conversion is accurate for the Netherlands.
            cursor.execute("""
                INSERT INTO provinces (province_code, iso_code, name, country_code, geom, centroid, area_km2)
                VALUES (%s, %s, %s, 'NL',
                        ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                        ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                        ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000)
                RETURNING id
            """, (province_code, iso_code, name,
                  json.dumps(geom), json.dumps(geom), json.dumps(geom)))

            province_id = cursor.fetchone()[0]
            # Three aliases per province; the count below divides by 3.
            self.province_id_map[province_code] = province_id
            self.province_id_map[iso_code] = province_id
            self.province_id_map[name] = province_id

        self.conn.commit()
        logger.info(f"Migrated {len(self.province_id_map) // 3} provinces")

    def migrate_municipalities(self, data_dir: Path) -> None:
        """Load municipalities from GeoJSON."""
        # Prefer simplified version for smaller file size
        filepath = data_dir / "netherlands_municipalities_simplified.geojson"
        if not filepath.exists():
            filepath = data_dir / "netherlands_municipalities.geojson"
        if not filepath.exists():
            logger.warning("Municipalities file not found")
            return

        geojson = self.load_geojson(filepath)
        cursor = self.conn.cursor()

        # Clear existing data
        cursor.execute("TRUNCATE municipalities CASCADE")

        count = 0
        for feature in geojson.get('features', []):
            props = feature.get('properties', {})
            geom = feature.get('geometry')

            municipality_code = props.get('code', '')
            name = props.get('naam', '')
            province_code = props.get('provincieCode', '')
            province_name = props.get('provincieNaam', '')

            if not municipality_code or not name:
                continue

            # Get province_id from map (try CBS statcode first, then name)
            province_id = (self.province_id_map.get(f"PV{province_code}")
                           or self.province_id_map.get(province_name))

            try:
                # Savepoint so a bad row can be discarded without aborting the
                # transaction. A full conn.rollback() here would also undo the
                # TRUNCATE and every earlier insert in this batch.
                cursor.execute("SAVEPOINT muni_insert")
                cursor.execute("""
                    INSERT INTO municipalities (municipality_code, name, province_id, country_code, geom, centroid, area_km2)
                    VALUES (%s, %s, %s, 'NL',
                            ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                            ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                            ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000)
                """, (municipality_code, name, province_id,
                      json.dumps(geom), json.dumps(geom), json.dumps(geom)))
                cursor.execute("RELEASE SAVEPOINT muni_insert")
                count += 1
            except Exception as e:
                logger.error(f"Error inserting municipality {name}: {e}")
                cursor.execute("ROLLBACK TO SAVEPOINT muni_insert")
                continue

        self.conn.commit()
        logger.info(f"Migrated {count} municipalities")

    def migrate_historical_boundaries(self, data_dir: Path) -> None:
        """Load historical boundaries from GeoJSON files."""
        historical_files = [
            ("netherlands_historical_1500_simplified.geojson", 1500, "territory"),
            ("netherlands_historical_territories_1500.geojson", 1500, "territory"),
            ("netherlands_historical_adm2_1500.geojson", 1500, "county"),
        ]

        cursor = self.conn.cursor()
        cursor.execute("TRUNCATE historical_boundaries")

        total_count = 0
        for filename, year, boundary_type in historical_files:
            filepath = data_dir / filename
            if not filepath.exists():
                logger.warning(f"Historical file not found: {filepath}")
                continue

            geojson = self.load_geojson(filepath)
            count = 0
            for feature in geojson.get('features', []):
                props = feature.get('properties', {})
                geom = feature.get('geometry')

                # Handle different property names across files
                boundary_code = props.get('ID', props.get('id', props.get('code', str(count))))
                name = props.get('NAME', props.get('name', props.get('naam', f'Unknown_{count}')))

                if not geom:
                    continue

                try:
                    # Savepoint per row: without it, the first failed insert
                    # aborts the transaction and every later insert fails with
                    # "current transaction is aborted".
                    cursor.execute("SAVEPOINT hist_insert")
                    cursor.execute("""
                        INSERT INTO historical_boundaries
                            (boundary_code, name, boundary_type, reference_year, country_code,
                             geom, centroid, area_km2, source_dataset)
                        VALUES (%s, %s, %s, %s, 'NL',
                                ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                                ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                                ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000,
                                %s)
                    """, (str(boundary_code), name, boundary_type, year,
                          json.dumps(geom), json.dumps(geom), json.dumps(geom),
                          filename))
                    cursor.execute("RELEASE SAVEPOINT hist_insert")
                    count += 1
                except Exception as e:
                    logger.error(f"Error inserting historical boundary {name}: {e}")
                    cursor.execute("ROLLBACK TO SAVEPOINT hist_insert")
                    continue

            total_count += count
            logger.info(f"Loaded {count} boundaries from {filename}")

        self.conn.commit()
        logger.info(f"Migrated {total_count} historical boundaries total")

    def migrate_institutions(self, data_dir: Path) -> None:
        """Load NDE institutions from JSON."""
        filepath = data_dir / "nde_institutions.json"
        if not filepath.exists():
            logger.warning(f"Institutions file not found: {filepath}")
            return

        logger.info(f"Loading {filepath}...")
        with open(filepath, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        logger.info(f"Loaded {len(institutions)} institutions")

        cursor = self.conn.cursor()
        cursor.execute("TRUNCATE institutions CASCADE")
        self.conn.commit()  # Commit the truncate

        count = 0
        errors = 0
        for inst in institutions:
            try:
                # Use savepoint for each record to allow recovery from errors
                cursor.execute("SAVEPOINT inst_insert")

                # Extract GHCID data
                ghcid_data = inst.get('ghcid', {})
                ghcid_current = ghcid_data.get('current', '')
                ghcid_uuid_str = ghcid_data.get('uuid', '')
                ghcid_numeric = ghcid_data.get('numeric')

                if not ghcid_current:
                    # Generate from other identifiers
                    ghcid_current = f"NL-XX-XXX-{inst.get('type', 'U')}-{count}"

                # Parse UUID; fall back to a fresh one on missing/invalid input
                try:
                    ghcid_uuid = uuid.UUID(ghcid_uuid_str) if ghcid_uuid_str else uuid.uuid4()
                except ValueError:
                    ghcid_uuid = uuid.uuid4()

                # Get coordinates
                lat = inst.get('lat')
                lon = inst.get('lon')

                # Get province_id
                province_name = inst.get('province', '')
                province_id = self.province_id_map.get(province_name)

                # Institution type (single character).
                # NOTE(review): `in` is a substring test, so a multi-character
                # type that happens to be a substring (e.g. "GL") also passes;
                # presumably inputs are single characters — confirm upstream.
                inst_type = inst.get('type', 'U')
                if inst_type not in 'GLAMORCUBESFIXPHDNT':
                    inst_type = 'U'

                cursor.execute("""
                    INSERT INTO institutions (
                        ghcid_current, ghcid_uuid, ghcid_numeric,
                        name, name_verified, name_source,
                        institution_type, type_name, wikidata_types,
                        geom, address, city, province, province_id, country_code,
                        description, website, phone,
                        wikidata_id, google_place_id, isil_code,
                        reviews, rating, total_ratings, photos,
                        genealogiewerkbalk, business_status,
                        founding_year, founding_decade
                    ) VALUES (
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        ST_SetSRID(ST_Point(%s, %s), 4326), %s, %s, %s, %s, 'NL',
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s,
                        %s, %s
                    )
                """, (
                    ghcid_current, str(ghcid_uuid), ghcid_numeric,
                    inst.get('name', ''), inst.get('verified_name'), inst.get('name_source'),
                    inst_type, inst.get('type_name'), inst.get('wikidata_types'),
                    lon, lat,  # ST_Point takes (x, y) = (lon, lat)
                    inst.get('address'), inst.get('city'), province_name, province_id,
                    inst.get('description'), inst.get('website'), inst.get('phone'),
                    inst.get('wikidata_id'), inst.get('google_place_id'), inst.get('isil_code'),
                    Json(inst.get('reviews')) if inst.get('reviews') else None,
                    inst.get('rating'), inst.get('total_ratings'),
                    Json(inst.get('photos')) if inst.get('photos') else None,
                    Json(inst.get('genealogiewerkbalk')) if inst.get('genealogiewerkbalk') else None,
                    inst.get('business_status'),
                    inst.get('founding_year'), inst.get('founding_decade')
                ))
                cursor.execute("RELEASE SAVEPOINT inst_insert")
                count += 1
            except Exception as e:
                cursor.execute("ROLLBACK TO SAVEPOINT inst_insert")
                errors += 1
                if errors < 10:
                    logger.error(f"Error inserting institution {inst.get('name', 'Unknown')}: {e}")
                continue

        self.conn.commit()
        logger.info(f"Migrated {count} institutions ({errors} errors)")

    def update_institution_admin_links(self) -> None:
        """Update institution province_id and municipality_id based on geometry."""
        logger.info("Updating institution administrative links...")
        cursor = self.conn.cursor()

        # Update province_id based on point-in-polygon
        cursor.execute("""
            UPDATE institutions i
            SET province_id = p.id
            FROM provinces p
            WHERE ST_Contains(p.geom, i.geom)
              AND i.province_id IS NULL
        """)
        province_updates = cursor.rowcount

        # Update municipality_id based on point-in-polygon
        cursor.execute("""
            UPDATE institutions i
            SET municipality_id = m.id
            FROM municipalities m
            WHERE ST_Contains(m.geom, i.geom)
              AND i.municipality_id IS NULL
        """)
        municipality_updates = cursor.rowcount

        self.conn.commit()
        logger.info(f"Updated {province_updates} province links, {municipality_updates} municipality links")

    def print_stats(self) -> None:
        """Print migration statistics."""
        cursor = self.conn.cursor()

        tables = ['provinces', 'municipalities', 'historical_boundaries', 'institutions']
        print("\n" + "=" * 50)
        print("Migration Statistics")
        print("=" * 50)
        for table in tables:
            # Table names come from the fixed list above, not user input, so
            # interpolating them into the query is safe here.
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            count = cursor.fetchone()[0]
            print(f"  {table}: {count:,} records")

        # Institution type breakdown
        cursor.execute("""
            SELECT institution_type, type_name, COUNT(*) as count
            FROM institutions
            GROUP BY institution_type, type_name
            ORDER BY count DESC
        """)
        print("\nInstitutions by type:")
        for row in cursor.fetchall():
            print(f"  {row[0]} ({row[1]}): {row[2]:,}")
        print("=" * 50 + "\n")


def main():
    parser = argparse.ArgumentParser(description="Migrate GeoJSON to PostGIS")
    parser.add_argument("--host", default="localhost", help="Database host")
    parser.add_argument("--port", type=int, default=5432, help="Database port")
    parser.add_argument("--db", default="glam_geo", help="Database name")
    parser.add_argument("--user", default="postgres", help="Database user")
    parser.add_argument("--password", default="", help="Database password")
    parser.add_argument("--data-dir", default="../../frontend/public/data",
                        help="Path to GeoJSON data directory")
    parser.add_argument("--only", choices=['provinces', 'municipalities', 'historical', 'institutions'],
                        help="Only migrate specific data type")
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        # Try relative to script location
        data_dir = Path(__file__).parent.parent.parent / "frontend" / "public" / "data"
    if not data_dir.exists():
        logger.error(f"Data directory not found: {data_dir}")
        sys.exit(1)

    logger.info(f"Using data directory: {data_dir}")

    migrator = GeoJSONMigrator(
        host=args.host,
        port=args.port,
        database=args.db,
        user=args.user,
        password=args.password
    )

    try:
        migrator.connect()

        if args.only:
            if args.only == 'provinces':
                migrator.migrate_provinces(data_dir)
            elif args.only == 'municipalities':
                migrator.migrate_municipalities(data_dir)
            elif args.only == 'historical':
                migrator.migrate_historical_boundaries(data_dir)
            elif args.only == 'institutions':
                migrator.migrate_institutions(data_dir)
        else:
            # Full migration in order (provinces first: later stages need
            # province_id_map populated)
            migrator.migrate_provinces(data_dir)
            migrator.migrate_municipalities(data_dir)
            migrator.migrate_historical_boundaries(data_dir)
            migrator.migrate_institutions(data_dir)
            migrator.update_institution_admin_links()

        migrator.print_stats()
    except Exception as e:
        logger.error(f"Migration failed: {e}")
        raise
    finally:
        migrator.close()


if __name__ == "__main__":
    main()