glam/scripts/generate_archive_werkgebied_mapping.py
2025-12-03 17:38:46 +01:00

199 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Generate archive-to-municipalities werkgebied mapping from genealogiewerkbalk data.
This creates a JSON file mapping each archive (municipal or provincial) to the
municipalities it serves. This is used by the frontend to display werkgebied polygons.
Input: data/nde/enriched/sources/genealogiewerkbalk_municipality_archives.csv
Output: frontend/public/data/archive_werkgebied_mapping.json
"""
import csv
import json
from pathlib import Path
from collections import defaultdict
# Paths — resolved relative to the repository root (one level above scripts/),
# so the script works regardless of the current working directory.
_REPO_ROOT = Path(__file__).parent.parent
DATA_DIR = _REPO_ROOT / "data" / "nde" / "enriched" / "sources"
CSV_FILE = DATA_DIR / "genealogiewerkbalk_municipality_archives.csv"
OUTPUT_DIR = _REPO_ROOT / "frontend" / "public" / "data"
OUTPUT_FILE = OUTPUT_DIR / "archive_werkgebied_mapping.json"
def load_csv(csv_path=None):
    """Load the genealogiewerkbalk CSV data.

    Args:
        csv_path: Optional path to a CSV file to read. Defaults to the
            module-level ``CSV_FILE`` (backward compatible with the old
            zero-argument call).

    Returns:
        list[dict]: one dict per data row, keyed by the header columns.
    """
    path = CSV_FILE if csv_path is None else csv_path
    # newline="" is required by the csv module so quoted fields containing
    # newlines are parsed correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))
def _slugify(name):
    """Normalize an archive name into an ID fragment: lowercase, spaces and hyphens to underscores."""
    return name.lower().replace(" ", "_").replace("-", "_")


def _add_municipality(archive, code, name):
    """Append {code, name} to *archive*'s werkgebied unless that code is already present."""
    if all(m["code"] != code for m in archive["municipalities"]):
        archive["municipalities"].append({"code": code, "name": name})


def build_archive_mapping(municipalities):
    """
    Build mapping from archive identifier to municipalities served.

    Args:
        municipalities: list of CSV row dicts (one per municipality).

    Returns dict with structure:
    {
        "archives": {
            "<archive_id>": {
                "name": "Archive Name",
                "website": "https://...",
                "isil": "NL-...",
                "type": "municipal" | "provincial",
                "municipalities": [
                    {"code": "0363", "name": "Amsterdam"},
                    ...
                ]
            }
        },
        "municipality_to_archives": {
            "<gemeente_code>": {
                "municipal_archive_id": "<archive_id>",
                "provincial_archive_id": "<archive_id>"
            }
        }
    }

    Note: municipalities are deduplicated by code for BOTH municipal and
    provincial archives (the original only deduplicated provincial entries,
    so duplicate CSV rows produced duplicate municipal werkgebied entries).
    """
    archives = {}
    municipality_to_archives = {}

    for row in municipalities:
        gemeente_code = row.get("gemeentecode", "").strip()
        gemeente_naam = row.get("gemeentenaam", "").strip()
        if not gemeente_code:
            # Rows without a municipality code cannot be mapped; skip them.
            continue

        # --- Municipal archive ---
        archief_gemeente = row.get("archief_gemeente", "").strip()
        isil = row.get("isil", "").strip()
        website_gemeente = row.get("website_gemeentearchief", "").strip()
        if archief_gemeente:
            # Prefer the ISIL as a stable unique ID; the CSV uses values
            # starting with "geen" ("none" in Dutch) to mark a missing ISIL.
            has_isil = bool(isil) and not isil.startswith("geen")
            archive_id = isil if has_isil else f"gem_{_slugify(archief_gemeente)}"
            entry = archives.setdefault(archive_id, {
                "name": archief_gemeente,
                "website": website_gemeente,
                "isil": isil if has_isil else None,
                "type": "municipal",
                "municipalities": [],
            })
            _add_municipality(entry, gemeente_code, gemeente_naam)
            municipality_to_archives.setdefault(gemeente_code, {})[
                "municipal_archive_id"] = archive_id

        # --- Provincial archive ---
        archief_provincie = row.get("archief_provincie", "").strip()
        website_provincie = row.get("website_provinciaal_archief", "").strip()
        provincie_code = row.get("provinciecode", "").strip()
        provincie_naam = row.get("provincienaam", "").strip()
        if archief_provincie:
            prov_archive_id = f"prov_{_slugify(archief_provincie)}"
            prov_entry = archives.setdefault(prov_archive_id, {
                "name": archief_provincie,
                "website": website_provincie,
                "isil": None,  # Provincial archives may not have ISIL in this dataset
                "type": "provincial",
                "province_code": provincie_code,
                "province_name": provincie_naam,
                "municipalities": [],
            })
            _add_municipality(prov_entry, gemeente_code, gemeente_naam)
            municipality_to_archives.setdefault(gemeente_code, {})[
                "provincial_archive_id"] = prov_archive_id

    return {
        "archives": archives,
        "municipality_to_archives": municipality_to_archives
    }
def add_statistics(mapping):
    """Attach a ``statistics`` key with summary counts to *mapping* and return it.

    Adds total/municipal/provincial archive counts, the number of mapped
    municipalities, and the ten archives serving the most municipalities.
    """
    archive_entries = mapping["archives"]
    types = [entry["type"] for entry in archive_entries.values()]

    # Rank every archive by how many municipalities it serves, largest first.
    ranked = sorted(
        (
            {"id": archive_id, "name": entry["name"], "count": len(entry["municipalities"])}
            for archive_id, entry in archive_entries.items()
        ),
        key=lambda item: item["count"],
        reverse=True,
    )

    mapping["statistics"] = {
        "total_archives": len(archive_entries),
        "municipal_archives": types.count("municipal"),
        "provincial_archives": types.count("provincial"),
        "total_municipalities": len(mapping["municipality_to_archives"]),
        "archives_by_size": ranked[:10],
    }
    return mapping
def main():
    """Read the genealogiewerkbalk CSV, build the werkgebied mapping, and write it as JSON."""
    print(f"Loading {CSV_FILE}...")
    records = load_csv()
    print(f"Loaded {len(records)} municipality records")

    print("Building archive-to-municipalities mapping...")
    result = build_archive_mapping(records)

    print("Adding statistics...")
    result = add_statistics(result)

    # Make sure the frontend data directory exists before writing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {OUTPUT_FILE}...")
    OUTPUT_FILE.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Human-readable summary for the console.
    summary = result["statistics"]
    print(f"\nDone!")
    print(f" Total archives: {summary['total_archives']}")
    print(f" - Municipal: {summary['municipal_archives']}")
    print(f" - Provincial: {summary['provincial_archives']}")
    print(f" Total municipalities: {summary['total_municipalities']}")
    print(f"\n Top 5 archives by werkgebied size:")
    for entry in summary["archives_by_size"][:5]:
        print(f" - {entry['name']}: {entry['count']} municipalities")


if __name__ == "__main__":
    main()