162 lines
6.2 KiB
Python
162 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
Generate archive_werkgebied_mapping.json from NDE institutions data.

Uses genealogiewerkbalk data to create a comprehensive mapping of:
- Archives to the municipalities they serve
- Municipalities to their archives (municipal + provincial)
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
def _slugify(name):
    """Derive an archive ID from a display name.

    The name is lower-cased, spaces become hyphens and apostrophes are dropped.
    """
    return name.lower().replace(' ', '-').replace("'", "")


def _add_municipality(archive, mun_code, mun_name):
    """Append a municipality to ``archive`` unless its code is already listed."""
    if all(m['code'] != mun_code for m in archive['municipalities']):
        archive['municipalities'].append({'code': mun_code, 'name': mun_name})


def _collect_archives(institutions):
    """Build the archive and municipality lookup tables.

    Args:
        institutions: list of institution dicts as loaded from the NDE JSON.

    Returns:
        A ``(archives, municipality_to_archives)`` tuple where ``archives``
        maps archive ID -> archive record and ``municipality_to_archives``
        maps municipality code -> ``{municipal_archive_id, provincial_archive_id}``
        (each key present only when such an archive was found).
    """
    archives = {}
    municipality_to_archives = defaultdict(dict)

    # Pass 1: archives described by the genealogiewerkbalk data.
    for inst in institutions:
        # ``or {}`` also covers keys that are present but explicitly null.
        gwb = inst.get('genealogiewerkbalk') or {}
        mun = gwb.get('municipality') or {}
        mun_code = mun.get('code')
        mun_name = mun.get('name')
        if not mun_code:
            continue

        prov = gwb.get('province') or {}
        sources = (
            ('municipal', gwb.get('municipal_archive') or {}, {}),
            ('provincial', gwb.get('provincial_archive') or {}, {
                'province_code': prov.get('code'),
                'province_name': prov.get('name'),
            }),
        )
        for arch_type, arch, extra in sources:
            arch_name = arch.get('name')
            if not arch_name:
                continue

            arch_id = _slugify(arch_name)
            if arch_id not in archives:
                # The first occurrence of an archive decides its metadata.
                archives[arch_id] = {
                    'name': arch_name,
                    'website': arch.get('website'),
                    'isil': arch.get('isil'),
                    'type': arch_type,
                    **extra,
                    'municipalities': [],
                }

            _add_municipality(archives[arch_id], mun_code, mun_name)
            municipality_to_archives[mun_code][f'{arch_type}_archive_id'] = arch_id

    # Pass 2: institutions with an ISIL code whose name looks like an archive
    # but which were not found through genealogiewerkbalk.
    for inst in institutions:
        isil_data = inst.get('isil')
        isil_code = isil_data.get('code') if isinstance(isil_data, dict) else None
        name = inst.get('name') or ''
        lowered = name.lower()
        if not isil_code or not ('archief' in lowered or 'archiv' in lowered):
            continue

        arch_id = _slugify(name)
        if arch_id in archives:
            continue

        # No municipality code is available here, so only the city is recorded.
        city = inst.get('city', '')
        archives[arch_id] = {
            'name': name,
            'website': inst.get('website'),
            'isil': isil_code,
            'type': 'municipal',  # Assume municipal if not known
            'municipalities': [{'code': None, 'name': city}] if city else [],
        }

    return archives, municipality_to_archives


def main(nde_path=None, output_path=None):
    """Generate the archive werkgebied mapping JSON and print a summary.

    Args:
        nde_path: path of the NDE institutions JSON to read. Defaults to
            ``frontend/public/data/nde_institutions.json`` two levels above
            this script.
        output_path: path of the mapping JSON to write. Defaults to
            ``frontend/public/data/archive_werkgebied_mapping.json`` in the
            same location.

    Both files are read/written as UTF-8 because the output is dumped with
    ``ensure_ascii=False`` and must not depend on the platform's locale.
    """
    data_dir = Path(__file__).parent.parent / "frontend/public/data"
    nde_path = data_dir / "nde_institutions.json" if nde_path is None else Path(nde_path)
    output_path = (
        data_dir / "archive_werkgebied_mapping.json"
        if output_path is None
        else Path(output_path)
    )

    # Load NDE institutions
    with open(nde_path, encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"Loaded {len(institutions)} institutions")

    archives, municipality_to_archives = _collect_archives(institutions)

    # Build statistics
    municipal_archives = [a for a in archives.values() if a['type'] == 'municipal']
    provincial_archives = [a for a in archives.values() if a['type'] == 'provincial']

    # Sort archives by municipality count (largest first; ties keep insertion order)
    archives_by_size = sorted(
        (
            {'id': arch_id, 'name': arch['name'], 'count': len(arch['municipalities'])}
            for arch_id, arch in archives.items()
        ),
        key=lambda entry: entry['count'],
        reverse=True,
    )

    # Create output
    output = {
        'archives': archives,
        'municipality_to_archives': dict(municipality_to_archives),
        'statistics': {
            'total_archives': len(archives),
            'municipal_archives': len(municipal_archives),
            'provincial_archives': len(provincial_archives),
            'total_municipalities': len(municipality_to_archives),
            'archives_by_size': archives_by_size[:20],  # Top 20
        },
    }

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("\nGenerated werkgebied mapping:")
    print(f"  Total archives: {len(archives)}")
    print(f"  Municipal archives: {len(municipal_archives)}")
    print(f"  Provincial archives: {len(provincial_archives)}")
    print(f"  Municipalities covered: {len(municipality_to_archives)}")
    print("\nTop archives by size:")
    for a in archives_by_size[:10]:
        print(f"  {a['name']}: {a['count']} municipalities")

    print(f"\nOutput written to: {output_path}")
|
|
|
|
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|