glam/scripts/generate_werkgebied_mapping.py
2025-12-05 15:30:23 +01:00

162 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
Generate archive_werkgebied_mapping.json from NDE institutions data.
Uses genealogiewerkbalk data to create a comprehensive mapping of:
- Archives to municipalities they serve
- Municipalities to their archives (municipal + provincial)
"""
import json
from pathlib import Path
from collections import defaultdict
def _slugify(name):
    """Return the archive ID (slug) derived from an archive *name*.

    The name is lower-cased, spaces become hyphens and apostrophes are
    dropped, e.g. ``"Regionaal Archief 't Test"`` ->
    ``"regionaal-archief-t-test"``.
    """
    return name.lower().replace(' ', '-').replace("'", "")


def _register_archive(archives, arch_info, arch_type, mun_code, mun_name,
                      extra=None):
    """Record an archive and one municipality it serves.

    Args:
        archives: Mapping of archive ID -> archive record; updated in place.
        arch_info: The archive dict from the genealogiewerkbalk data
            (``name``/``website``/``isil``). May be ``None`` or empty.
        arch_type: Value stored under ``'type'`` when the archive is new.
        mun_code: Code of the municipality served by the archive.
        mun_name: Name of that municipality.
        extra: Optional additional fields stored on a newly created record
            (inserted between ``'type'`` and ``'municipalities'``).

    Returns:
        The archive ID, or ``None`` when *arch_info* carries no name and
        nothing was recorded.
    """
    arch_name = (arch_info or {}).get('name')
    if not arch_name:
        return None

    arch_id = _slugify(arch_name)
    if arch_id not in archives:
        record = {
            'name': arch_name,
            'website': arch_info.get('website'),
            'isil': arch_info.get('isil'),
            'type': arch_type,
        }
        record.update(extra or {})
        record['municipalities'] = []
        archives[arch_id] = record

    # Each municipality is listed at most once per archive.
    served = archives[arch_id]['municipalities']
    if all(entry['code'] != mun_code for entry in served):
        served.append({'code': mun_code, 'name': mun_name})
    return arch_id


def _build_mapping(institutions):
    """Build the werkgebied mapping structure from the institution list.

    Args:
        institutions: Iterable of institution dicts as loaded from the NDE
            institutions JSON file.

    Returns:
        A dict with the keys ``'archives'``, ``'municipality_to_archives'``
        and ``'statistics'``, ready to be serialised as JSON.
    """
    archives = {}  # archive_id -> archive data
    # mun_code -> {municipal_archive_id, provincial_archive_id}
    municipality_to_archives = defaultdict(dict)

    # Pass 1: archives described by the genealogiewerkbalk data.
    for inst in institutions:
        # ``or {}`` (instead of a .get default) also covers explicit JSON
        # nulls, which would otherwise raise AttributeError on the next .get.
        gwb = inst.get('genealogiewerkbalk') or {}
        mun = gwb.get('municipality') or {}
        mun_code = mun.get('code')
        if not mun_code:
            continue
        mun_name = mun.get('name')

        municipal_id = _register_archive(
            archives, gwb.get('municipal_archive'), 'municipal',
            mun_code, mun_name,
        )
        if municipal_id is not None:
            municipality_to_archives[mun_code]['municipal_archive_id'] = municipal_id

        prov = gwb.get('province') or {}
        provincial_id = _register_archive(
            archives, gwb.get('provincial_archive'), 'provincial',
            mun_code, mun_name,
            extra={
                'province_code': prov.get('code'),
                'province_name': prov.get('name'),
            },
        )
        if provincial_id is not None:
            municipality_to_archives[mun_code]['provincial_archive_id'] = provincial_id

    # Pass 2: institutions with an ISIL code whose name looks like an
    # archive but that were not found via genealogiewerkbalk.
    for inst in institutions:
        isil_data = inst.get('isil')
        isil_code = isil_data.get('code') if isinstance(isil_data, dict) else None
        name = inst.get('name') or ''  # tolerate a null name
        lowered = name.lower()
        if not isil_code or not ('archief' in lowered or 'archiv' in lowered):
            continue

        arch_id = _slugify(name)
        if arch_id in archives:
            continue  # already known from pass 1 (or an earlier institution)

        archives[arch_id] = {
            'name': name,
            'website': inst.get('website'),
            'isil': isil_code,
            'type': 'municipal',  # Assume municipal if not known
            'municipalities': [],
        }
        city = inst.get('city', '')
        if city:
            # The municipality code is unknown here; record the city name only.
            archives[arch_id]['municipalities'].append({
                'code': None,
                'name': city,
            })

    # Statistics
    municipal_count = sum(1 for a in archives.values() if a['type'] == 'municipal')
    provincial_count = sum(1 for a in archives.values() if a['type'] == 'provincial')
    archives_by_size = sorted(
        (
            {'id': arch_id, 'name': data['name'], 'count': len(data['municipalities'])}
            for arch_id, data in archives.items()
        ),
        key=lambda item: item['count'],
        reverse=True,
    )

    return {
        'archives': archives,
        'municipality_to_archives': dict(municipality_to_archives),
        'statistics': {
            'total_archives': len(archives),
            'municipal_archives': municipal_count,
            'provincial_archives': provincial_count,
            'total_municipalities': len(municipality_to_archives),
            'archives_by_size': archives_by_size[:20],  # Top 20
        },
    }


def main(nde_path=None, output_path=None):
    """Generate the archive werkgebied mapping JSON file.

    Reads the NDE institutions JSON, builds the archive <-> municipality
    mapping, writes it as JSON and prints a short summary.

    Args:
        nde_path: Path of the institutions JSON to read. Defaults to
            ``frontend/public/data/nde_institutions.json`` relative to the
            parent of this script's directory.
        output_path: Path of the JSON file to write. Defaults to
            ``frontend/public/data/archive_werkgebied_mapping.json`` in the
            same location.
    """
    default_dir = Path(__file__).parent.parent / "frontend/public/data"
    if nde_path is None:
        nde_path = default_dir / "nde_institutions.json"
    if output_path is None:
        output_path = default_dir / "archive_werkgebied_mapping.json"

    # Explicit UTF-8: the data contains non-ASCII names and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(nde_path, encoding="utf-8") as f:
        institutions = json.load(f)
    print(f"Loaded {len(institutions)} institutions")

    output = _build_mapping(institutions)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them.
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    stats = output['statistics']
    print("\nGenerated werkgebied mapping:")
    print(f" Total archives: {stats['total_archives']}")
    print(f" Municipal archives: {stats['municipal_archives']}")
    print(f" Provincial archives: {stats['provincial_archives']}")
    print(f" Municipalities covered: {stats['total_municipalities']}")
    print("\nTop archives by size:")
    for a in stats['archives_by_size'][:10]:
        print(f" {a['name']}: {a['count']} municipalities")
    print(f"\nOutput written to: {output_path}")
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()