199 lines
7.4 KiB
Python
199 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate archive-to-municipalities werkgebied mapping from genealogiewerkbalk data.
|
|
|
|
This creates a JSON file mapping each archive (municipal or provincial) to the
|
|
municipalities it serves. This is used by the frontend to display werkgebied polygons.
|
|
|
|
Input: data/nde/enriched/sources/genealogiewerkbalk_municipality_archives.csv
|
|
Output: frontend/public/data/archive_werkgebied_mapping.json
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# Filesystem locations. Everything is resolved relative to the parent of the
# directory that contains this script.
_BASE_DIR = Path(__file__).parent.parent

# Input: directory and file name of the genealogiewerkbalk CSV.
DATA_DIR = _BASE_DIR.joinpath("data", "nde", "enriched", "sources")
CSV_FILE = DATA_DIR.joinpath("genealogiewerkbalk_municipality_archives.csv")

# Output: directory and file name of the generated JSON mapping.
OUTPUT_DIR = _BASE_DIR.joinpath("frontend", "public", "data")
OUTPUT_FILE = OUTPUT_DIR.joinpath("archive_werkgebied_mapping.json")
|
|
|
|
|
|
def load_csv():
    """Load the genealogiewerkbalk CSV data.

    Reads ``CSV_FILE`` as UTF-8 and parses it with ``csv.DictReader``, so the
    first line of the file supplies the column names.

    Returns:
        list[dict]: One dict per data row, keyed by the header names, in
        file order.

    Raises:
        OSError: If ``CSV_FILE`` cannot be opened.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(CSV_FILE, "r", encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))
|
|
|
|
|
|
def _field(row, key):
    """Return ``row[key]`` stripped of whitespace; missing or ``None`` becomes ``""``."""
    # csv.DictReader fills columns missing from a short row with None, so
    # row.get(key, "") alone does not guarantee a string.
    return (row.get(key) or "").strip()


def _name_to_id(prefix, name):
    """Build an archive ID: *prefix*, then the lowercased name with spaces/hyphens as underscores."""
    return f"{prefix}_{name.lower().replace(' ', '_').replace('-', '_')}"


def _add_to_werkgebied(archive, seen_codes, code, name):
    """Append a municipality to *archive* unless its code is already in *seen_codes*."""
    if code in seen_codes:
        return
    seen_codes.add(code)
    archive["municipalities"].append({"code": code, "name": name})


def build_archive_mapping(municipalities):
    """
    Build mapping from archive identifier to municipalities served.

    Args:
        municipalities: Iterable of row dicts (as produced by ``load_csv``).
            Rows with an empty or missing ``gemeentecode`` are skipped.
            Field values may be ``None``; they are treated as empty strings.

    Returns dict with structure:
    {
        "archives": {
            "<archive_id>": {
                "name": "Archive Name",
                "website": "https://...",
                "isil": "NL-..." or None,
                "type": "municipal" | "provincial",
                # provincial entries additionally carry
                # "province_code" and "province_name"
                "municipalities": [
                    {"code": "0363", "name": "Amsterdam"},
                    ...
                ]
            }
        },
        "municipality_to_archives": {
            "<gemeente_code>": {
                "municipal_archive_id": "<archive_id>",
                "provincial_archive_id": "<archive_id>"
            }
        }
    }

    Each municipality code appears at most once per archive, even when the
    input contains repeated rows. Archive metadata (name, website, ...) is
    taken from the first row that mentions the archive.
    """
    archives = {}
    municipality_to_archives = {}
    # Municipality codes already listed per archive ID. Kept outside the
    # output structure so duplicate checks are O(1) without extra JSON keys.
    seen = defaultdict(set)

    for row in municipalities:
        gemeente_code = _field(row, "gemeentecode")
        if not gemeente_code:
            continue
        gemeente_naam = _field(row, "gemeentenaam")

        # Process municipal archive
        archief_gemeente = _field(row, "archief_gemeente")
        if archief_gemeente:
            isil = _field(row, "isil")
            # Values starting with "geen" are treated as "no ISIL available".
            has_isil = bool(isil) and not isil.startswith("geen")
            # Prefer the ISIL as unique ID; fall back to the normalized name.
            archive_id = isil if has_isil else _name_to_id("gem", archief_gemeente)

            if archive_id not in archives:
                archives[archive_id] = {
                    "name": archief_gemeente,
                    "website": _field(row, "website_gemeentearchief"),
                    "isil": isil if has_isil else None,
                    "type": "municipal",
                    "municipalities": [],
                }

            _add_to_werkgebied(
                archives[archive_id], seen[archive_id], gemeente_code, gemeente_naam
            )
            municipality_to_archives.setdefault(gemeente_code, {})[
                "municipal_archive_id"
            ] = archive_id

        # Process provincial archive
        archief_provincie = _field(row, "archief_provincie")
        if archief_provincie:
            prov_archive_id = _name_to_id("prov", archief_provincie)

            if prov_archive_id not in archives:
                archives[prov_archive_id] = {
                    "name": archief_provincie,
                    "website": _field(row, "website_provinciaal_archief"),
                    "isil": None,  # Provincial archives may not have ISIL in this dataset
                    "type": "provincial",
                    "province_code": _field(row, "provinciecode"),
                    "province_name": _field(row, "provincienaam"),
                    "municipalities": [],
                }

            _add_to_werkgebied(
                archives[prov_archive_id],
                seen[prov_archive_id],
                gemeente_code,
                gemeente_naam,
            )
            municipality_to_archives.setdefault(gemeente_code, {})[
                "provincial_archive_id"
            ] = prov_archive_id

    return {
        "archives": archives,
        "municipality_to_archives": municipality_to_archives,
    }
|
|
|
|
|
|
def add_statistics(mapping):
    """Attach a ``"statistics"`` summary to *mapping* and return it.

    The summary holds the total archive count, the counts of archives whose
    ``"type"`` is ``"municipal"`` / ``"provincial"``, the number of entries
    in ``"municipality_to_archives"``, and the ten archives with the most
    municipalities (largest first; ties keep their original order).

    The input dict is modified in place; the same object is returned.
    """
    archive_table = mapping["archives"]
    kinds = [entry["type"] for entry in archive_table.values()]

    # Rank every archive by the size of its werkgebied, biggest first.
    ranked = sorted(
        (
            {
                "id": archive_id,
                "name": entry["name"],
                "count": len(entry["municipalities"]),
            }
            for archive_id, entry in archive_table.items()
        ),
        key=lambda item: item["count"],
        reverse=True,
    )

    mapping["statistics"] = {
        "total_archives": len(archive_table),
        "municipal_archives": kinds.count("municipal"),
        "provincial_archives": kinds.count("provincial"),
        "total_municipalities": len(mapping["municipality_to_archives"]),
        # Only the top 10 archives are kept.
        "archives_by_size": ranked[:10],
    }
    return mapping
|
|
|
|
|
|
def main():
    """Run the full pipeline: load the CSV, build the mapping, write the JSON.

    Progress messages and a short summary (archive counts and the five
    archives with the largest werkgebied) are printed to stdout. The output
    directory is created if it does not exist yet.
    """
    print(f"Loading {CSV_FILE}...")
    rows = load_csv()
    print(f"Loaded {len(rows)} municipality records")

    print("Building archive-to-municipalities mapping...")
    result = build_archive_mapping(rows)

    print("Adding statistics...")
    result = add_statistics(result)

    # The target directory may not exist on a fresh checkout.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(result, out, ensure_ascii=False, indent=2)

    # Summary report
    summary = result["statistics"]
    print("\nDone!")
    print(f" Total archives: {summary['total_archives']}")
    print(f" - Municipal: {summary['municipal_archives']}")
    print(f" - Provincial: {summary['provincial_archives']}")
    print(f" Total municipalities: {summary['total_municipalities']}")
    print("\n Top 5 archives by werkgebied size:")
    for entry in summary["archives_by_size"][:5]:
        print(f" - {entry['name']}: {entry['count']} municipalities")
|
|
|
|
|
|
# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|