glam/scripts/generate_archive_werkgebied_mapping.py
2025-12-03 17:38:46 +01:00

199 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Generate archive-to-municipalities werkgebied mapping from genealogiewerkbalk data.
This creates a JSON file mapping each archive (municipal or provincial) to the
municipalities it serves. This is used by the frontend to display werkgebied polygons.
Input: data/nde/enriched/sources/genealogiewerkbalk_municipality_archives.csv
Output: frontend/public/data/archive_werkgebied_mapping.json
"""
import csv
import json
from pathlib import Path
from collections import defaultdict
# Paths — resolved relative to the repository root (one level above scripts/),
# so the script works regardless of the current working directory.
_REPO_ROOT = Path(__file__).parent.parent
DATA_DIR = _REPO_ROOT / "data" / "nde" / "enriched" / "sources"
CSV_FILE = DATA_DIR / "genealogiewerkbalk_municipality_archives.csv"
OUTPUT_DIR = _REPO_ROOT / "frontend" / "public" / "data"
OUTPUT_FILE = OUTPUT_DIR / "archive_werkgebied_mapping.json"
def load_csv(csv_path=None):
    """Load the genealogiewerkbalk CSV data.

    Args:
        csv_path: Optional path to a CSV file to read. Defaults to the
            module-level ``CSV_FILE`` (backward compatible with the old
            zero-argument call).

    Returns:
        list[dict]: one dict per data row, keyed by the header columns.
    """
    path = CSV_FILE if csv_path is None else csv_path
    # newline="" is required by the csv module so quoted fields containing
    # newlines are parsed correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))
def _slugify(name):
    """Normalize an archive name into an ID fragment: lowercase, spaces and hyphens to underscores."""
    return name.lower().replace(" ", "_").replace("-", "_")


def _add_municipality(archive, code, name):
    """Append {code, name} to *archive*'s werkgebied unless that code is already present."""
    if all(m["code"] != code for m in archive["municipalities"]):
        archive["municipalities"].append({"code": code, "name": name})


def build_archive_mapping(municipalities):
    """
    Build mapping from archive identifier to municipalities served.

    Args:
        municipalities: list of CSV row dicts (one per municipality).

    Returns dict with structure:
    {
        "archives": {
            "<archive_id>": {
                "name": "Archive Name",
                "website": "https://...",
                "isil": "NL-...",
                "type": "municipal" | "provincial",
                "municipalities": [
                    {"code": "0363", "name": "Amsterdam"},
                    ...
                ]
            }
        },
        "municipality_to_archives": {
            "<gemeente_code>": {
                "municipal_archive_id": "<archive_id>",
                "provincial_archive_id": "<archive_id>"
            }
        }
    }

    Note: municipalities are deduplicated by code for BOTH municipal and
    provincial archives (the original only deduplicated provincial entries,
    so duplicate CSV rows produced duplicate municipal werkgebied entries).
    """
    archives = {}
    municipality_to_archives = {}

    for row in municipalities:
        gemeente_code = row.get("gemeentecode", "").strip()
        gemeente_naam = row.get("gemeentenaam", "").strip()
        if not gemeente_code:
            # Rows without a municipality code cannot be mapped; skip them.
            continue

        # --- Municipal archive ---
        archief_gemeente = row.get("archief_gemeente", "").strip()
        isil = row.get("isil", "").strip()
        website_gemeente = row.get("website_gemeentearchief", "").strip()
        if archief_gemeente:
            # Prefer the ISIL as a stable unique ID; the CSV uses values
            # starting with "geen" ("none" in Dutch) to mark a missing ISIL.
            has_isil = bool(isil) and not isil.startswith("geen")
            archive_id = isil if has_isil else f"gem_{_slugify(archief_gemeente)}"
            entry = archives.setdefault(archive_id, {
                "name": archief_gemeente,
                "website": website_gemeente,
                "isil": isil if has_isil else None,
                "type": "municipal",
                "municipalities": [],
            })
            _add_municipality(entry, gemeente_code, gemeente_naam)
            municipality_to_archives.setdefault(gemeente_code, {})[
                "municipal_archive_id"] = archive_id

        # --- Provincial archive ---
        archief_provincie = row.get("archief_provincie", "").strip()
        website_provincie = row.get("website_provinciaal_archief", "").strip()
        provincie_code = row.get("provinciecode", "").strip()
        provincie_naam = row.get("provincienaam", "").strip()
        if archief_provincie:
            prov_archive_id = f"prov_{_slugify(archief_provincie)}"
            prov_entry = archives.setdefault(prov_archive_id, {
                "name": archief_provincie,
                "website": website_provincie,
                "isil": None,  # Provincial archives may not have ISIL in this dataset
                "type": "provincial",
                "province_code": provincie_code,
                "province_name": provincie_naam,
                "municipalities": [],
            })
            _add_municipality(prov_entry, gemeente_code, gemeente_naam)
            municipality_to_archives.setdefault(gemeente_code, {})[
                "provincial_archive_id"] = prov_archive_id

    return {
        "archives": archives,
        "municipality_to_archives": municipality_to_archives
    }
def add_statistics(mapping):
    """Attach a ``statistics`` key with summary counts to *mapping* and return it.

    Adds total/municipal/provincial archive counts, the number of mapped
    municipalities, and the ten archives serving the most municipalities.
    """
    archive_entries = mapping["archives"]
    types = [entry["type"] for entry in archive_entries.values()]

    # Rank every archive by how many municipalities it serves, largest first.
    ranked = sorted(
        (
            {"id": archive_id, "name": entry["name"], "count": len(entry["municipalities"])}
            for archive_id, entry in archive_entries.items()
        ),
        key=lambda item: item["count"],
        reverse=True,
    )

    mapping["statistics"] = {
        "total_archives": len(archive_entries),
        "municipal_archives": types.count("municipal"),
        "provincial_archives": types.count("provincial"),
        "total_municipalities": len(mapping["municipality_to_archives"]),
        "archives_by_size": ranked[:10],
    }
    return mapping
def main():
    """Read the genealogiewerkbalk CSV, build the werkgebied mapping, and write it as JSON."""
    print(f"Loading {CSV_FILE}...")
    records = load_csv()
    print(f"Loaded {len(records)} municipality records")

    print("Building archive-to-municipalities mapping...")
    result = build_archive_mapping(records)

    print("Adding statistics...")
    result = add_statistics(result)

    # Make sure the frontend data directory exists before writing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {OUTPUT_FILE}...")
    OUTPUT_FILE.write_text(
        json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Human-readable summary for the console.
    summary = result["statistics"]
    print(f"\nDone!")
    print(f" Total archives: {summary['total_archives']}")
    print(f" - Municipal: {summary['municipal_archives']}")
    print(f" - Provincial: {summary['provincial_archives']}")
    print(f" Total municipalities: {summary['total_municipalities']}")
    print(f"\n Top 5 archives by werkgebied size:")
    for entry in summary["archives_by_size"][:5]:
        print(f" - {entry['name']}: {entry['count']} municipalities")


if __name__ == "__main__":
    main()