#!/usr/bin/env python3
"""
Migrate GeoJSON files to PostGIS database.

This script loads:
1. Netherlands provinces (netherlands_provinces.geojson)
2. Netherlands municipalities (netherlands_municipalities_simplified.geojson)
3. Historical boundaries (netherlands_historical_*.geojson)
4. NDE institutions (nde_institutions.json)

Usage:
    python migrate_geojson_to_postgis.py --host localhost --db glam_geo --data-dir ../../frontend/public/data

Requirements:
    pip install psycopg2-binary
"""
# Standard library imports, grouped and alphabetized.
import argparse
import json
import logging
import os
import sys
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional

# psycopg2 is the only third-party dependency; fail fast with a helpful
# message if it is missing rather than crashing later mid-migration.
try:
    import psycopg2
    from psycopg2.extras import execute_values, Json
except ImportError:
    print("Error: psycopg2 not installed. Run: pip install psycopg2-binary")
    sys.exit(1)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# One row per Dutch province: (CBS statcode, ISO 3166-2:NL code, canonical name).
_PROVINCE_TABLE = [
    ("PV20", "GR", "Groningen"),
    ("PV21", "FR", "Fryslân"),
    ("PV22", "DR", "Drenthe"),
    ("PV23", "OV", "Overijssel"),
    ("PV24", "FL", "Flevoland"),
    ("PV25", "GE", "Gelderland"),
    ("PV26", "UT", "Utrecht"),
    ("PV27", "NH", "Noord-Holland"),
    ("PV28", "ZH", "Zuid-Holland"),
    ("PV29", "ZE", "Zeeland"),
    ("PV30", "NB", "Noord-Brabant"),
    ("PV31", "LI", "Limburg"),
]

# Province code mapping from CBS statcode to ISO 3166-2
PROVINCE_ISO_CODES = {statcode: iso for statcode, iso, _name in _PROVINCE_TABLE}

# Province name to code mapping; "Friesland" is kept as an alias for the
# official Frisian spelling "Fryslân".
PROVINCE_NAME_TO_CODE = {name: iso for _statcode, iso, name in _PROVINCE_TABLE}
PROVINCE_NAME_TO_CODE["Friesland"] = "FR"
class GeoJSONMigrator:
    """Migrate GeoJSON files to PostGIS.

    Loads provinces, municipalities, historical boundaries and NDE
    institutions into their respective tables, replacing any existing
    rows. Call connect() before any migrate_* method and close() when
    done. Per-record failures are isolated with savepoints so one bad
    feature does not abort or discard the rest of a batch.
    """

    def __init__(self, host: str, port: int, database: str, user: str, password: str):
        # Connection parameters are stored so connect() can be deferred.
        self.conn_params = {
            "host": host,
            "port": port,
            "database": database,
            "user": user,
            "password": password,
        }
        self.conn = None
        # Maps statcode, ISO code AND name -> provinces.id so later
        # loaders can resolve a province from whichever key they have.
        self.province_id_map = {}

    def connect(self):
        """Connect to the database (autocommit off; we commit per table)."""
        logger.info(f"Connecting to {self.conn_params['database']}@{self.conn_params['host']}...")
        self.conn = psycopg2.connect(**self.conn_params)
        self.conn.autocommit = False
        logger.info("Connected successfully")

    def close(self):
        """Close the database connection if one was opened."""
        if self.conn:
            self.conn.close()
            logger.info("Connection closed")

    def load_geojson(self, filepath: Path) -> Dict[str, Any]:
        """Load and parse a GeoJSON file, logging the feature count."""
        logger.info(f"Loading {filepath}...")
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data.get('features', []))} features")
        return data

    def migrate_provinces(self, data_dir: Path):
        """Load provinces from GeoJSON and populate province_id_map."""
        filepath = data_dir / "netherlands_provinces.geojson"
        if not filepath.exists():
            logger.warning(f"Provinces file not found: {filepath}")
            return

        geojson = self.load_geojson(filepath)
        cursor = self.conn.cursor()

        # Replace any previous load; CASCADE clears dependent rows too.
        cursor.execute("TRUNCATE provinces CASCADE")

        count = 0
        for feature in geojson.get('features', []):
            props = feature.get('properties', {})
            geom = feature.get('geometry')

            province_code = props.get('statcode', '')  # e.g., "PV27"
            name = props.get('statnaam', '')
            iso_code = PROVINCE_ISO_CODES.get(province_code, '')

            # Skip features missing identifying fields or a geometry.
            if not province_code or not name or not geom:
                continue

            geom_json = json.dumps(geom)
            # area_km2: reproject to RD New (EPSG:28992, meter units) so
            # ST_Area yields m^2, then convert to km^2.
            cursor.execute("""
                INSERT INTO provinces (province_code, iso_code, name, country_code, geom, centroid, area_km2)
                VALUES (%s, %s, %s, 'NL',
                        ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                        ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                        ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000)
                RETURNING id
            """, (province_code, iso_code, name, geom_json, geom_json, geom_json))

            province_id = cursor.fetchone()[0]
            count += 1
            # Register every alias; skip an empty iso_code so the '' key
            # does not silently collide across provinces.
            self.province_id_map[province_code] = province_id
            if iso_code:
                self.province_id_map[iso_code] = province_id
            self.province_id_map[name] = province_id

        self.conn.commit()
        logger.info(f"Migrated {count} provinces")

    def migrate_municipalities(self, data_dir: Path):
        """Load municipalities from GeoJSON into the municipalities table."""
        # Prefer simplified version for smaller file size
        filepath = data_dir / "netherlands_municipalities_simplified.geojson"
        if not filepath.exists():
            filepath = data_dir / "netherlands_municipalities.geojson"
        if not filepath.exists():
            logger.warning(f"Municipalities file not found in {data_dir}")
            return

        geojson = self.load_geojson(filepath)
        cursor = self.conn.cursor()

        # Clear existing data
        cursor.execute("TRUNCATE municipalities CASCADE")

        count = 0
        for feature in geojson.get('features', []):
            props = feature.get('properties', {})
            geom = feature.get('geometry')

            municipality_code = props.get('code', '')
            name = props.get('naam', '')
            province_code = props.get('provincieCode', '')
            province_name = props.get('provincieNaam', '')

            if not municipality_code or not name or not geom:
                continue

            # Resolve province_id: the CBS municipality file carries a bare
            # numeric province code, so try "PV<code>" first, then the name.
            province_id = (self.province_id_map.get(f"PV{province_code}") or
                           self.province_id_map.get(province_name))

            geom_json = json.dumps(geom)
            try:
                # Savepoint per record: a plain connection rollback here
                # would discard every previously inserted municipality.
                cursor.execute("SAVEPOINT muni_insert")
                cursor.execute("""
                    INSERT INTO municipalities (municipality_code, name, province_id, country_code, geom, centroid, area_km2)
                    VALUES (%s, %s, %s, 'NL',
                            ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                            ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                            ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000)
                """, (municipality_code, name, province_id, geom_json, geom_json, geom_json))
                cursor.execute("RELEASE SAVEPOINT muni_insert")
                count += 1
            except Exception as e:
                logger.error(f"Error inserting municipality {name}: {e}")
                # Roll back only this record; keep the rest of the batch.
                cursor.execute("ROLLBACK TO SAVEPOINT muni_insert")
                continue

        self.conn.commit()
        logger.info(f"Migrated {count} municipalities")

    def migrate_historical_boundaries(self, data_dir: Path):
        """Load historical boundaries from GeoJSON files."""
        # (filename, reference year, boundary type) per source file.
        historical_files = [
            ("netherlands_historical_1500_simplified.geojson", 1500, "territory"),
            ("netherlands_historical_territories_1500.geojson", 1500, "territory"),
            ("netherlands_historical_adm2_1500.geojson", 1500, "county"),
        ]

        cursor = self.conn.cursor()
        cursor.execute("TRUNCATE historical_boundaries")

        total_count = 0
        for filename, year, boundary_type in historical_files:
            filepath = data_dir / filename
            if not filepath.exists():
                logger.warning(f"Historical file not found: {filepath}")
                continue

            geojson = self.load_geojson(filepath)
            count = 0

            for feature in geojson.get('features', []):
                props = feature.get('properties', {})
                geom = feature.get('geometry')

                # Handle different property names across files; fall back
                # to the running index so records are never skipped for a
                # missing id/name alone.
                boundary_code = props.get('ID', props.get('id', props.get('code', str(count))))
                name = props.get('NAME', props.get('name', props.get('naam', f'Unknown_{count}')))

                if not geom:
                    continue

                geom_json = json.dumps(geom)
                try:
                    # Savepoint per record: without it a failed insert leaves
                    # the transaction aborted and every later insert fails.
                    cursor.execute("SAVEPOINT hist_insert")
                    cursor.execute("""
                        INSERT INTO historical_boundaries
                        (boundary_code, name, boundary_type, reference_year, country_code, geom, centroid, area_km2, source_dataset)
                        VALUES (%s, %s, %s, %s, 'NL',
                                ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326),
                                ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326)),
                                ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON(%s), 4326), 28992)) / 1000000,
                                %s)
                    """, (str(boundary_code), name, boundary_type, year,
                          geom_json, geom_json, geom_json, filename))
                    cursor.execute("RELEASE SAVEPOINT hist_insert")
                    count += 1
                except Exception as e:
                    logger.error(f"Error inserting historical boundary {name}: {e}")
                    cursor.execute("ROLLBACK TO SAVEPOINT hist_insert")
                    continue

            total_count += count
            logger.info(f"Loaded {count} boundaries from {filename}")

        self.conn.commit()
        logger.info(f"Migrated {total_count} historical boundaries total")

    def migrate_institutions(self, data_dir: Path):
        """Load NDE institutions from JSON into the institutions table."""
        filepath = data_dir / "nde_institutions.json"
        if not filepath.exists():
            logger.warning(f"Institutions file not found: {filepath}")
            return

        logger.info(f"Loading {filepath}...")
        with open(filepath, 'r', encoding='utf-8') as f:
            institutions = json.load(f)
        logger.info(f"Loaded {len(institutions)} institutions")

        cursor = self.conn.cursor()
        cursor.execute("TRUNCATE institutions CASCADE")
        self.conn.commit()  # Commit the truncate

        count = 0
        errors = 0

        for inst in institutions:
            try:
                # One savepoint per record (issued once, at the top of the
                # try) so a failed insert rolls back only this record.
                cursor.execute("SAVEPOINT inst_insert")

                # Extract GHCID data
                ghcid_data = inst.get('ghcid', {})
                ghcid_current = ghcid_data.get('current', '')
                ghcid_uuid_str = ghcid_data.get('uuid', '')
                ghcid_numeric = ghcid_data.get('numeric')

                if not ghcid_current:
                    # Generate a placeholder from the type and running index.
                    ghcid_current = f"NL-XX-XXX-{inst.get('type', 'U')}-{count}"

                # Parse UUID, falling back to a random one on bad input.
                try:
                    ghcid_uuid = uuid.UUID(ghcid_uuid_str) if ghcid_uuid_str else uuid.uuid4()
                except ValueError:
                    ghcid_uuid = uuid.uuid4()

                # Coordinates may be absent; ST_Point then fails and the
                # record is skipped via the savepoint.
                lat = inst.get('lat')
                lon = inst.get('lon')

                # Resolve province_id from the map built by migrate_provinces
                # (None if provinces were not migrated first).
                province_name = inst.get('province', '')
                province_id = self.province_id_map.get(province_name)

                # Institution type is a single character; anything outside
                # the known set is coerced to 'U' (unknown).
                inst_type = inst.get('type', 'U')
                if inst_type not in 'GLAMORCUBESFIXPHDNT':
                    inst_type = 'U'

                # ST_Point takes (x, y) = (lon, lat).
                cursor.execute("""
                    INSERT INTO institutions (
                        ghcid_current, ghcid_uuid, ghcid_numeric,
                        name, name_verified, name_source,
                        institution_type, type_name, wikidata_types,
                        geom, address, city, province, province_id, country_code,
                        description, website, phone,
                        wikidata_id, google_place_id, isil_code,
                        reviews, rating, total_ratings, photos,
                        genealogiewerkbalk, business_status, founding_year, founding_decade
                    ) VALUES (
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        ST_SetSRID(ST_Point(%s, %s), 4326), %s, %s, %s, %s, 'NL',
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s, %s
                    )
                """, (
                    ghcid_current, str(ghcid_uuid), ghcid_numeric,
                    inst.get('name', ''), inst.get('verified_name'), inst.get('name_source'),
                    inst_type, inst.get('type_name'), inst.get('wikidata_types'),
                    lon, lat, inst.get('address'), inst.get('city'), province_name, province_id,
                    inst.get('description'), inst.get('website'), inst.get('phone'),
                    inst.get('wikidata_id'), inst.get('google_place_id'), inst.get('isil_code'),
                    # JSON columns: wrap with Json only when a value exists.
                    Json(inst.get('reviews')) if inst.get('reviews') else None,
                    inst.get('rating'), inst.get('total_ratings'),
                    Json(inst.get('photos')) if inst.get('photos') else None,
                    Json(inst.get('genealogiewerkbalk')) if inst.get('genealogiewerkbalk') else None,
                    inst.get('business_status'),
                    inst.get('founding_year'), inst.get('founding_decade')
                ))
                cursor.execute("RELEASE SAVEPOINT inst_insert")
                count += 1

            except Exception as e:
                cursor.execute("ROLLBACK TO SAVEPOINT inst_insert")
                errors += 1
                if errors < 10:  # cap log noise when failures are bulk
                    logger.error(f"Error inserting institution {inst.get('name', 'Unknown')}: {e}")
                continue

        self.conn.commit()
        logger.info(f"Migrated {count} institutions ({errors} errors)")

    def update_institution_admin_links(self):
        """Update institution province_id and municipality_id based on geometry."""
        logger.info("Updating institution administrative links...")
        cursor = self.conn.cursor()

        # Fill in province_id by point-in-polygon, only where it is still
        # NULL (records matched by name keep their original link).
        cursor.execute("""
            UPDATE institutions i
            SET province_id = p.id
            FROM provinces p
            WHERE ST_Contains(p.geom, i.geom)
            AND i.province_id IS NULL
        """)
        province_updates = cursor.rowcount

        # Same for municipality_id.
        cursor.execute("""
            UPDATE institutions i
            SET municipality_id = m.id
            FROM municipalities m
            WHERE ST_Contains(m.geom, i.geom)
            AND i.municipality_id IS NULL
        """)
        municipality_updates = cursor.rowcount

        self.conn.commit()
        logger.info(f"Updated {province_updates} province links, {municipality_updates} municipality links")

    def print_stats(self):
        """Print migration statistics."""
        cursor = self.conn.cursor()

        # Table names are from this fixed list, not user input, so the
        # f-string interpolation below is safe.
        tables = ['provinces', 'municipalities', 'historical_boundaries', 'institutions']
        print("\n" + "="*50)
        print("Migration Statistics")
        print("="*50)

        for table in tables:
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            count = cursor.fetchone()[0]
            print(f"  {table}: {count:,} records")

        # Institution type breakdown
        cursor.execute("""
            SELECT institution_type, type_name, COUNT(*) as count
            FROM institutions
            GROUP BY institution_type, type_name
            ORDER BY count DESC
        """)
        print("\nInstitutions by type:")
        for row in cursor.fetchall():
            print(f"  {row[0]} ({row[1]}): {row[2]:,}")

        print("="*50 + "\n")
def main():
    """CLI entry point: parse arguments and run the requested migrations."""
    parser = argparse.ArgumentParser(description="Migrate GeoJSON to PostGIS")
    parser.add_argument("--host", default="localhost", help="Database host")
    parser.add_argument("--port", type=int, default=5432, help="Database port")
    parser.add_argument("--db", default="glam_geo", help="Database name")
    parser.add_argument("--user", default="postgres", help="Database user")
    parser.add_argument("--password", default="", help="Database password")
    parser.add_argument("--data-dir", default="../../frontend/public/data",
                        help="Path to GeoJSON data directory")
    parser.add_argument("--only", choices=['provinces', 'municipalities', 'historical', 'institutions'],
                        help="Only migrate specific data type")
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        # Fall back to the repo-relative location next to this script.
        data_dir = Path(__file__).parent.parent.parent / "frontend" / "public" / "data"

    if not data_dir.exists():
        logger.error(f"Data directory not found: {data_dir}")
        sys.exit(1)

    logger.info(f"Using data directory: {data_dir}")

    migrator = GeoJSONMigrator(
        host=args.host,
        port=args.port,
        database=args.db,
        user=args.user,
        password=args.password,
    )

    try:
        migrator.connect()

        if args.only:
            # Dispatch table keeps the single-type path flat.
            steps = {
                'provinces': migrator.migrate_provinces,
                'municipalities': migrator.migrate_municipalities,
                'historical': migrator.migrate_historical_boundaries,
                'institutions': migrator.migrate_institutions,
            }
            steps[args.only](data_dir)
        else:
            # Full migration in dependency order: provinces first so the
            # province id map is populated for the later loaders.
            migrator.migrate_provinces(data_dir)
            migrator.migrate_municipalities(data_dir)
            migrator.migrate_historical_boundaries(data_dir)
            migrator.migrate_institutions(data_dir)
            migrator.update_institution_admin_links()

        migrator.print_stats()

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        raise
    finally:
        migrator.close()


if __name__ == "__main__":
    main()