589 lines
21 KiB
Python
589 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich NDE entries with Genealogiewerkbalk municipality archive data.
|
|
|
|
This script enriches NDE entries with data from the Genealogiewerkbalk.nl
|
|
municipality archives registry, which maps Dutch municipalities to their
|
|
responsible archives with ISIL codes, websites, and provincial archive info.
|
|
|
|
Data source:
|
|
https://docs.google.com/spreadsheets/d/1rS_Z5L6L2vvfGLS6eHI8wfyiwB-KUfHEr7W1VNY3rpg/export?format=csv
|
|
|
|
Matching strategy:
|
|
1. Match by municipality name from original_entry.plaatsnaam_bezoekadres
|
|
2. Match by Google Maps administrative_area_level_2 (gemeente)
|
|
3. Match by Google Maps locality that maps to a municipality
|
|
|
|
Usage:
|
|
python scripts/enrich_nde_genealogiewerkbalk.py
|
|
python scripts/enrich_nde_genealogiewerkbalk.py --dry-run
|
|
python scripts/enrich_nde_genealogiewerkbalk.py --entry 0016
|
|
python scripts/enrich_nde_genealogiewerkbalk.py --refresh-csv
|
|
|
|
Environment:
|
|
No special environment variables required.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import csv
|
|
import yaml
|
|
import argparse
|
|
import logging
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from difflib import SequenceMatcher
|
|
import urllib.request
|
|
import unicodedata
|
|
|
|
# Set up logging: timestamped INFO-level output on stderr for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths — resolved relative to the repository root (one level above scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
# Per-entry YAML files that this script enriches in place.
ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
# Directory holding downloaded source datasets.
SOURCES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "sources"
# Local cache of the Genealogiewerkbalk municipality-archives registry.
CSV_FILE = SOURCES_DIR / "genealogiewerkbalk_municipality_archives.csv"
# Google Sheets CSV export of the registry (see module docstring).
CSV_URL = "https://docs.google.com/spreadsheets/d/1rS_Z5L6L2vvfGLS6eHI8wfyiwB-KUfHEr7W1VNY3rpg/export?format=csv"
|
|
|
# Known municipality name aliases (normalized form -> canonical normalized form)
|
|
# The canonical form must match what's in the Genealogiewerkbalk CSV after normalization
|
|
# Known municipality name aliases (normalized form -> canonical normalized form)
# The canonical form must match what's in the Genealogiewerkbalk CSV after normalization
MUNICIPALITY_ALIASES = {
    # Den Haag / 's-Gravenhage
    "den haag": "gravenhage",
    "the hague": "gravenhage",
    # Scheveningen is part of Den Haag
    "scheveningen": "gravenhage",
    "scheveingen": "gravenhage",
    "loosduinen": "gravenhage",
    # Voorburg is now part of Leidschendam-Voorburg
    "voorburg": "leidschendam voorburg",
    # Villages that are parts of municipalities - Bergen (NH)
    "egmond aan zee": "bergen (nh.)",
    "egmond binnen": "bergen (nh.)",
    "egmond aan den hoef": "bergen (nh.)",
    "bergen": "bergen (nh.)",  # Default Bergen to NH (most heritage institutions are there)
    # Schagen area
    "callantsoog": "schagen",
    "sint maarten": "schagen",
    # Frisian name variants
    "haren": "groningen",  # Haren merged with Groningen in 2019
    "zuidwolde": "de wolden",
    "de knipe": "heerenveen",
    # Other common variants
    "krommenie": "zaanstad",
    "spaarndam": "haarlem",
    "midwoud": "medemblik",
    "hoogblokland": "vijfheerenlanden",
    "hoogblokland hoornaar noordeloos": "vijfheerenlanden",
    "ouddorp": "goeree overflakkee",
    # Noord-Brabant villages
    "berlicum": "sint michielsgestel",
    "berlicum middelrode": "sint michielsgestel",
    "oeffelt": "berg en dal",
    # Limburg villages
    "helden": "peel en maas",
    # Zeeland villages
    "wissekerke": "noord beveland",
}


def normalize_municipality_name(name: str) -> str:
    """Normalize a municipality name for matching against the registry.

    Handles:
    - Case insensitivity
    - Dutch articles and prefixes ('s-, gemeente, gem.)
    - Common abbreviations
    - Unicode normalization (NFKC)
    - Apostrophes, hyphens, and extra whitespace
    - Known aliases (Den Haag -> 's-Gravenhage, village -> municipality, etc.)

    Args:
        name: Raw municipality, village, or place name (may be empty).

    Returns:
        The normalized name, or "" for falsy input.
    """
    if not name:
        return ""

    # Unicode normalize so composed/compatibility forms compare equal
    name = unicodedata.normalize('NFKC', name)

    name = name.lower().strip()

    # Handle 's- prefix (e.g., 's-Gravenhage -> gravenhage)
    if name.startswith("'s-"):
        name = name[3:]
    elif name.startswith("'s "):
        name = name[3:]

    # Remove common administrative prefixes that might vary between sources
    prefixes_to_remove = ['gemeente ', 'gem. ', 'gem ']
    for prefix in prefixes_to_remove:
        if name.startswith(prefix):
            name = name[len(prefix):]

    # Replace hyphens with spaces, then collapse any run of whitespace to a
    # single space (the original `.replace(' ', ' ')` was a no-op and left
    # double spaces behind, e.g. after prefix stripping). split() with no
    # argument also strips leading/trailing whitespace.
    name = ' '.join(name.replace('-', ' ').split())

    # Remove trailing periods (abbreviated forms)
    name = name.rstrip('.')

    name = name.strip()

    # Apply known aliases to map villages/variants to their municipality
    if name in MUNICIPALITY_ALIASES:
        name = MUNICIPALITY_ALIASES[name]

    return name
|
|
|
|
|
|
def load_genealogiewerkbalk_data(csv_path: Path) -> Dict[str, Dict[str, Any]]:
    """Load the Genealogiewerkbalk CSV into a lookup dictionary.

    Args:
        csv_path: Path to the locally cached registry CSV.

    Returns:
        Dict mapping normalized municipality names to their registry data.
        Empty when the file is missing (a warning is logged).
    """
    lookup: Dict[str, Dict[str, Any]] = {}

    if not csv_path.exists():
        logger.warning(f"CSV file not found: {csv_path}")
        return lookup

    with open(csv_path, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            municipality_name = row.get('gemeentenaam', '').strip()
            # Skip rows without a municipality name (blank/summary lines).
            if not municipality_name:
                continue

            # "geen*" values mark municipalities without a real ISIL code.
            isil_code = row.get('isil', '').strip()
            usable_isil = bool(isil_code) and not isil_code.startswith('geen')

            record = {
                'gemeentenaam': municipality_name,
                'gemeentecode': row.get('gemeentecode', '').strip(),
                'archief_gemeente': row.get('archief_gemeente', '').strip(),
                'isil': isil_code if usable_isil else None,
                'isil_raw': isil_code,  # keep original for reference
                'extra_info': row.get('extra_info', '').strip(),
                'website_gemeentearchief': row.get('website_gemeentearchief', '').strip(),
                'provincienaam': row.get('provincienaam', '').strip(),
                'provinciecode': row.get('provinciecode', '').strip(),
                'archief_provincie': row.get('archief_provincie', '').strip(),
                'website_provinciaal_archief': row.get('website_provinciaal_archief', '').strip(),
            }
            # Key by normalized name so entry matching is tolerant of variants.
            lookup[normalize_municipality_name(municipality_name)] = record

    logger.info(f"Loaded {len(lookup)} municipalities from Genealogiewerkbalk CSV")
    return lookup
|
|
|
|
|
|
def _match_by_name(
    raw_name: str,
    municipalities: Dict[str, Dict[str, Any]],
    exact_method: str,
    fuzzy_method: str,
    base_confidence: float,
    fuzzy_threshold: float = 0.85
) -> Optional[Tuple[Dict[str, Any], str, float]]:
    """Try to match one candidate name: exact normalized lookup, then fuzzy.

    Args:
        raw_name: Candidate municipality/place name (unnormalized).
        municipalities: Lookup dictionary keyed by normalized name.
        exact_method: match_method label reported for an exact hit.
        fuzzy_method: match_method label reported for a fuzzy hit.
        base_confidence: Confidence for an exact hit; fuzzy hits report
            similarity_score * base_confidence.
        fuzzy_threshold: Minimum similarity to accept a fuzzy hit.

    Returns:
        (matched_data, match_method, confidence) on success, else None.
    """
    norm = normalize_municipality_name(raw_name)
    if norm in municipalities:
        return municipalities[norm], exact_method, base_confidence

    best_key, score = fuzzy_match_municipality(norm, municipalities)
    if best_key and score >= fuzzy_threshold:
        return municipalities[best_key], fuzzy_method, score * base_confidence
    return None


def find_municipality_match(
    entry: Dict[str, Any],
    municipalities: Dict[str, Dict[str, Any]]
) -> Tuple[Optional[Dict[str, Any]], str, float]:
    """Find matching municipality for an entry.

    Tries several data sources in decreasing order of reliability; each
    source is tried exact-first, then fuzzy (see _match_by_name).

    Args:
        entry: The NDE entry data
        municipalities: Lookup dictionary of municipality data

    Returns:
        Tuple of (matched_data, match_method, confidence_score);
        (None, 'no_match', 0.0) when nothing matched.
    """
    # Strategy 1: Match by plaatsnaam_bezoekadres (exact hit is confidence 1.0)
    plaatsnaam = entry.get('original_entry', {}).get('plaatsnaam_bezoekadres', '')
    if plaatsnaam:
        result = _match_by_name(
            plaatsnaam, municipalities,
            'plaatsnaam_bezoekadres', 'plaatsnaam_fuzzy', 1.0)
        if result:
            return result

    # Strategy 2: Match by Google Maps administrative_area_level_2 (gemeente)
    google_data = entry.get('google_maps_enrichment', {})
    address_components = google_data.get('address_components', [])
    for component in address_components:
        if 'administrative_area_level_2' in component.get('types', []):
            result = _match_by_name(
                component.get('long_name', ''), municipalities,
                'google_maps_admin2', 'google_maps_admin2_fuzzy', 0.95)
            if result:
                return result

    # Strategy 3: Match by Google Maps locality. Localities are often villages
    # rather than municipalities, so the fuzzy threshold is stricter (0.90).
    for component in address_components:
        if 'locality' in component.get('types', []):
            result = _match_by_name(
                component.get('long_name', ''), municipalities,
                'google_maps_locality', 'google_maps_locality_fuzzy', 0.85,
                fuzzy_threshold=0.90)
            if result:
                return result

    # Strategy 4: Match by web_enrichment.claims municipality
    for claim in entry.get('web_enrichment', {}).get('claims', []):
        if claim.get('claim_type') == 'municipality':
            gemeente = claim.get('claim_value', '')
            if gemeente:
                result = _match_by_name(
                    gemeente, municipalities,
                    'web_claim_municipality', 'web_claim_municipality_fuzzy', 0.90)
                if result:
                    return result

    # Strategies 5-7: single 'municipality' fields from other enrichment
    # sections, in priority order (manual overrides are most trusted: 0.95).
    fallback_sources = [
        ('location', 'location_municipality', 0.90),
        ('manual_location_override', 'manual_override_municipality', 0.95),
        ('zcbs_enrichment', 'zcbs_municipality', 0.90),
    ]
    for section_key, method, confidence in fallback_sources:
        candidate = entry.get(section_key, {}).get('municipality', '')
        if candidate:
            result = _match_by_name(
                candidate, municipalities, method, method + '_fuzzy', confidence)
            if result:
                return result

    return None, 'no_match', 0.0
|
|
|
|
|
|
def fuzzy_match_municipality(
    search_term: str,
    municipalities: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Tuple[Optional[str], float]:
    """Find the best fuzzy match for a normalized municipality name.

    Scans every key with difflib.SequenceMatcher and keeps the highest
    score at or above `threshold` (first key wins on ties).

    Args:
        search_term: Normalized name to look up (may be empty).
        municipalities: Lookup dictionary keyed by normalized name.
        threshold: Minimum similarity ratio to accept.

    Returns:
        Tuple of (matched_key, similarity_score), or (None, 0.0) when no
        key clears the threshold or the search term is empty.
    """
    if not search_term:
        return None, 0.0

    top_key: Optional[str] = None
    top_score = 0.0

    for candidate in municipalities:
        similarity = SequenceMatcher(None, search_term, candidate).ratio()
        # Require strictly-better to keep the first of equally-good keys.
        if similarity >= threshold and similarity > top_score:
            top_key, top_score = candidate, similarity

    return top_key, top_score
|
|
|
|
|
|
def create_enrichment_section(
    match_data: Dict[str, Any],
    match_method: str,
    confidence: float
) -> Dict[str, Any]:
    """Build the genealogiewerkbalk_enrichment section for an entry.

    Args:
        match_data: Municipality record from the registry lookup.
        match_method: Which matching strategy produced the hit.
        confidence: Match confidence in [0, 1]; stored rounded to 4 places.

    Returns:
        Dict ready to be stored under 'genealogiewerkbalk_enrichment'.
    """
    # Municipal archive sub-section; a "geen*" ISIL is parsed to None, so the
    # raw text is preserved as a note when it differs from the parsed value.
    municipal_archive: Dict[str, Any] = {
        'name': match_data['archief_gemeente'],
        'website': match_data['website_gemeentearchief'] or None,
        'isil': match_data['isil'],
    }

    enrichment: Dict[str, Any] = {
        'source': 'Genealogiewerkbalk.nl Municipality Archives Registry',
        'source_url': 'https://www.genealogiewerkbalk.nl/archieven.html',
        'data_url': CSV_URL,
        'data_tier': 'TIER_2_VERIFIED',
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'match_method': match_method,
        'match_confidence': round(confidence, 4),
        'municipality': {
            'name': match_data['gemeentenaam'],
            'code': match_data['gemeentecode'],
        },
        'municipal_archive': municipal_archive,
        'province': {
            'name': match_data['provincienaam'],
            'code': match_data['provinciecode'],
        },
        'provincial_archive': {
            'name': match_data['archief_provincie'],
            'website': match_data['website_provinciaal_archief'] or None,
        },
    }

    # Optional free-text note from the registry.
    if match_data.get('extra_info'):
        enrichment['extra_info'] = match_data['extra_info']

    # Keep the raw "geen*" ISIL text for reference when it was parsed away.
    raw_isil = match_data.get('isil_raw')
    if raw_isil and raw_isil != match_data['isil']:
        municipal_archive['isil_note'] = raw_isil

    return enrichment
|
|
|
|
|
|
def update_provenance(entry: Dict[str, Any], match_method: str) -> None:
    """Record the Genealogiewerkbalk source in the entry's provenance block.

    Creates the provenance skeleton when absent, registers the registry as a
    source with the list of claims it contributed, and ensures the
    TIER_2_VERIFIED summary mentions it exactly once. Mutates `entry`.

    Args:
        entry: Full entry dict (modified in place).
        match_method: Matching strategy label to record.
    """
    if 'provenance' not in entry:
        entry['provenance'] = {
            'schema_version': '1.0.0',
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'sources': {}
        }

    source_record = {
        'source_type': 'genealogiewerkbalk_registry',
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'data_url': CSV_URL,
        'match_method': match_method,
        'claims_extracted': [
            'municipality_name',
            'municipality_code',
            'municipal_archive_name',
            'municipal_archive_website',
            'municipal_archive_isil',
            'province_name',
            'province_code',
            'provincial_archive_name',
            'provincial_archive_website',
        ]
    }
    # Replace (not append) any previous genealogiewerkbalk source record.
    entry['provenance'].setdefault('sources', {})['genealogiewerkbalk'] = [source_record]

    # Make sure the registry appears exactly once in the tier-2 summary.
    tier_summary = entry['provenance'].setdefault('data_tier_summary', {})
    verified = tier_summary.setdefault('TIER_2_VERIFIED', [])
    if 'genealogiewerkbalk_registry' not in verified:
        verified.append('genealogiewerkbalk_registry')
|
|
|
|
|
|
def refresh_csv() -> bool:
    """Download the registry CSV from Google Sheets into SOURCES_DIR.

    Returns:
        True when the file downloaded and parses as CSV; False on any
        failure (the error is logged, not raised).
    """
    logger.info(f"Downloading fresh CSV from: {CSV_URL}")

    try:
        SOURCES_DIR.mkdir(parents=True, exist_ok=True)

        urllib.request.urlretrieve(CSV_URL, CSV_FILE)

        # Sanity check: the downloaded file must be readable as CSV rows.
        with open(CSV_FILE, 'r', encoding='utf-8') as handle:
            row_count = sum(1 for _ in csv.DictReader(handle))

        logger.info(f"Downloaded CSV with {row_count} municipalities")
        return True

    except Exception as exc:
        # Boundary: any network/filesystem error is reported as a failure.
        logger.error(f"Failed to download CSV: {exc}")
        return False
|
|
|
|
|
|
def process_entry(
    entry_path: Path,
    municipalities: Dict[str, Dict[str, Any]],
    dry_run: bool = False,
    force: bool = False
) -> Tuple[str, Optional[str]]:
    """Process a single entry file: match, enrich, and write back.

    Args:
        entry_path: Path to the entry's YAML file.
        municipalities: Lookup dictionary of municipality data.
        dry_run: When True, report the would-be match without writing.
        force: When True, re-enrich even if already enriched.

    Returns:
        Tuple of (status, match_info).
        status: 'enriched', 'would_enrich', 'already_enriched',
        'no_match', or 'error'.
    """
    try:
        with open(entry_path, 'r', encoding='utf-8') as handle:
            entry = yaml.safe_load(handle)

        if not entry:
            return 'error', 'Empty file'

        # Skip entries that already carry the enrichment unless forced.
        if 'genealogiewerkbalk_enrichment' in entry and not force:
            return 'already_enriched', None

        match_data, match_method, confidence = find_municipality_match(entry, municipalities)
        if not match_data:
            return 'no_match', None

        enrichment = create_enrichment_section(match_data, match_method, confidence)

        summary = f"{match_data['gemeentenaam']} -> {match_data['archief_gemeente']}"
        if dry_run:
            return 'would_enrich', f"{summary} ({match_method}, {confidence:.2f})"

        # Attach enrichment + provenance, then persist the entry.
        entry['genealogiewerkbalk_enrichment'] = enrichment
        update_provenance(entry, match_method)

        with open(entry_path, 'w', encoding='utf-8') as handle:
            yaml.dump(entry, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)

        return 'enriched', f"{summary} ({match_method})"

    except Exception as exc:
        # Per-file boundary: one bad file must not abort the whole run.
        logger.error(f"Error processing {entry_path.name}: {exc}")
        return 'error', str(exc)
|
|
|
|
|
|
def main():
    """CLI entry point: parse args, load registry data, enrich entries, summarize."""
    parser = argparse.ArgumentParser(
        description='Enrich NDE entries with Genealogiewerkbalk municipality archive data'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--entry', type=str,
                        help='Process only a specific entry (e.g., "0016" or "0016_Q81181377")')
    parser.add_argument('--force', action='store_true',
                        help='Re-enrich even if already enriched')
    parser.add_argument('--refresh-csv', action='store_true',
                        help='Download fresh CSV before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed output')
    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    # A fresh download is needed when requested explicitly or on first run.
    if args.refresh_csv or not CSV_FILE.exists():
        if not refresh_csv():
            logger.error("Failed to get CSV data")
            sys.exit(1)

    municipalities = load_genealogiewerkbalk_data(CSV_FILE)
    if not municipalities:
        logger.error("No municipality data loaded")
        sys.exit(1)

    # Select either one entry (prefix glob) or all entry files.
    if args.entry:
        pattern = f"{args.entry}*.yaml"
        entry_files = list(ENTRIES_DIR.glob(pattern))
        if not entry_files:
            logger.error(f"No entry files found matching: {pattern}")
            sys.exit(1)
    else:
        entry_files = sorted(ENTRIES_DIR.glob("*.yaml"))

    logger.info(f"Processing {len(entry_files)} entry files...")

    stats = {
        'total': len(entry_files),
        'enriched': 0,          # counts both real and dry-run enrichments
        'already_enriched': 0,
        'no_match': 0,
        'error': 0,
    }

    for entry_path in entry_files:
        status, info = process_entry(
            entry_path,
            municipalities,
            dry_run=args.dry_run,
            force=args.force
        )

        if status in ('enriched', 'would_enrich'):
            stats['enriched'] += 1
            prefix = '[DRY-RUN] Would enrich' if args.dry_run else 'Enriched'
            logger.info(f"{prefix}: {entry_path.name} - {info}")
        elif status == 'already_enriched':
            stats['already_enriched'] += 1
            if args.verbose:
                logger.debug(f"Already enriched: {entry_path.name}")
        elif status == 'no_match':
            stats['no_match'] += 1
            if args.verbose:
                logger.debug(f"No match: {entry_path.name}")
        elif status == 'error':
            stats['error'] += 1
            logger.warning(f"Error: {entry_path.name} - {info}")

    # Final run summary.
    logger.info("\n=== Enrichment Summary ===")
    logger.info(f"Total files: {stats['total']}")
    logger.info(f"Enriched: {stats['enriched']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"No match: {stats['no_match']}")
    logger.info(f"Errors: {stats['error']}")

    if args.dry_run:
        logger.info("\n[DRY-RUN] No changes were made.")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|