#!/usr/bin/env python3
"""
Generate archive-to-municipalities werkgebied mapping from genealogiewerkbalk data.

This creates a JSON file mapping each archive (municipal or provincial) to the
municipalities it serves. This is used by the frontend to display werkgebied
polygons.

Input: data/nde/enriched/sources/genealogiewerkbalk_municipality_archives.csv
Output: frontend/public/data/archive_werkgebied_mapping.json
"""

import csv
import json
from pathlib import Path
from collections import defaultdict

# Paths
DATA_DIR = Path(__file__).parent.parent / "data" / "nde" / "enriched" / "sources"
CSV_FILE = DATA_DIR / "genealogiewerkbalk_municipality_archives.csv"
OUTPUT_DIR = Path(__file__).parent.parent / "frontend" / "public" / "data"
OUTPUT_FILE = OUTPUT_DIR / "archive_werkgebied_mapping.json"


def load_csv():
    """Load the genealogiewerkbalk CSV data.

    Returns:
        A list with one dict per CSV data row, keyed by the header names.
    """
    # newline="" is what the csv module requires so that quoted fields with
    # embedded newlines are parsed correctly. "utf-8-sig" reads plain UTF-8
    # unchanged but drops a leading BOM, which would otherwise become part of
    # the first header name and make every lookup of that column miss.
    with open(CSV_FILE, "r", encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))


def _field(row, key):
    """Return the stripped value of *key* in *row*, or "" if missing or None."""
    # csv.DictReader fills the missing columns of a short row with None, so a
    # plain row.get(key, "") is not enough to guarantee a string.
    return (row.get(key) or "").strip()


def _name_to_id(name):
    """Normalize an archive name for use in an ID: lowercase, ' ' and '-' -> '_'."""
    return name.lower().replace(" ", "_").replace("-", "_")


def _add_municipality(archive, seen_codes, code, name):
    """Append a municipality to *archive* unless its code is already listed."""
    if code in seen_codes:
        return
    seen_codes.add(code)
    archive["municipalities"].append({"code": code, "name": name})


def build_archive_mapping(municipalities):
    """
    Build mapping from archive identifier to municipalities served.

    Args:
        municipalities: iterable of row dicts as produced by load_csv().
            Rows without a "gemeentecode" are skipped. Missing or None
            fields are treated as empty strings.

    Returns dict with structure:
    {
        "archives": {
            "<archive_id>": {
                "name": "Archive Name",
                "website": "https://...",
                "isil": "NL-...",
                "type": "municipal" | "provincial",
                "municipalities": [
                    {"code": "0363", "name": "Amsterdam"},
                    ...
                ]
            }
        },
        "municipality_to_archives": {
            "<gemeentecode>": {
                "municipal_archive_id": "<archive_id>",
                "provincial_archive_id": "<archive_id>"
            }
        }
    }

    Provincial archive entries additionally carry "province_code" and
    "province_name". Each municipality code appears at most once per archive,
    even if the input contains repeated rows.
    """
    archives = {}
    municipality_to_archives = {}
    # archive_id -> gemeente codes already listed for that archive. Kept
    # outside the output structure so the duplicate check is a set lookup.
    seen_codes = defaultdict(set)

    for row in municipalities:
        gemeente_code = _field(row, "gemeentecode")
        if not gemeente_code:
            continue
        gemeente_naam = _field(row, "gemeentenaam")

        # Process municipal archive
        archief_gemeente = _field(row, "archief_gemeente")
        if archief_gemeente:
            isil = _field(row, "isil")
            # Values starting with "geen" are placeholders, not real ISIL codes.
            has_isil = bool(isil) and not isil.startswith("geen")

            # Create unique archive ID (prefer ISIL, fallback to normalized name)
            archive_id = isil if has_isil else f"gem_{_name_to_id(archief_gemeente)}"

            # Initialize archive if not exists
            if archive_id not in archives:
                archives[archive_id] = {
                    "name": archief_gemeente,
                    "website": _field(row, "website_gemeentearchief"),
                    "isil": isil if has_isil else None,
                    "type": "municipal",
                    "municipalities": [],
                }

            # Add municipality to this archive's werkgebied (no duplicates)
            _add_municipality(
                archives[archive_id],
                seen_codes[archive_id],
                gemeente_code,
                gemeente_naam,
            )

            # Track which archive serves this municipality
            municipality_to_archives.setdefault(gemeente_code, {})[
                "municipal_archive_id"
            ] = archive_id

        # Process provincial archive
        archief_provincie = _field(row, "archief_provincie")
        if archief_provincie:
            # Create provincial archive ID
            prov_archive_id = f"prov_{_name_to_id(archief_provincie)}"

            if prov_archive_id not in archives:
                archives[prov_archive_id] = {
                    "name": archief_provincie,
                    "website": _field(row, "website_provinciaal_archief"),
                    # Provincial archives may not have ISIL in this dataset
                    "isil": None,
                    "type": "provincial",
                    "province_code": _field(row, "provinciecode"),
                    "province_name": _field(row, "provincienaam"),
                    "municipalities": [],
                }

            # Add municipality to provincial archive werkgebied (no duplicates)
            _add_municipality(
                archives[prov_archive_id],
                seen_codes[prov_archive_id],
                gemeente_code,
                gemeente_naam,
            )

            # Track provincial archive for municipality
            municipality_to_archives.setdefault(gemeente_code, {})[
                "provincial_archive_id"
            ] = prov_archive_id

    return {
        "archives": archives,
        "municipality_to_archives": municipality_to_archives,
    }


def add_statistics(mapping):
    """Add summary statistics to the mapping.

    Args:
        mapping: dict as returned by build_archive_mapping().

    Returns:
        The same *mapping* object, with a "statistics" key added in place.
    """
    archives = mapping["archives"]

    # Archives ordered by number of municipalities served, largest first.
    # The sort is stable, so ties keep their insertion order.
    archive_sizes = sorted(
        (
            {"id": k, "name": v["name"], "count": len(v["municipalities"])}
            for k, v in archives.items()
        ),
        key=lambda x: x["count"],
        reverse=True,
    )

    mapping["statistics"] = {
        "total_archives": len(archives),
        "municipal_archives": sum(
            1 for a in archives.values() if a["type"] == "municipal"
        ),
        "provincial_archives": sum(
            1 for a in archives.values() if a["type"] == "provincial"
        ),
        "total_municipalities": len(mapping["municipality_to_archives"]),
        # Top 10 archives by number of municipalities served
        "archives_by_size": archive_sizes[:10],
    }
    return mapping


def main():
    """Read the CSV, build the mapping with statistics and write it as JSON."""
    print(f"Loading {CSV_FILE}...")
    municipalities = load_csv()
    print(f"Loaded {len(municipalities)} municipality records")

    print("Building archive-to-municipalities mapping...")
    mapping = build_archive_mapping(municipalities)

    print("Adding statistics...")
    mapping = add_statistics(mapping)

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Write mapping
    print(f"Writing to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

    # Print summary
    stats = mapping["statistics"]
    print("\nDone!")
    print(f" Total archives: {stats['total_archives']}")
    print(f" - Municipal: {stats['municipal_archives']}")
    print(f" - Provincial: {stats['provincial_archives']}")
    print(f" Total municipalities: {stats['total_municipalities']}")
    print("\n Top 5 archives by werkgebied size:")
    for arch in stats["archives_by_size"][:5]:
        print(f" - {arch['name']}: {arch['count']} municipalities")


if __name__ == "__main__":
    main()