# glam/scripts/enrich_nde_genealogiewerkbalk.py
# Snapshot retrieved 2025-12-03 17:38:46 +01:00 (589 lines, 21 KiB, Python)
#!/usr/bin/env python3
"""
Enrich NDE entries with Genealogiewerkbalk municipality archive data.
This script enriches NDE entries with data from the Genealogiewerkbalk.nl
municipality archives registry, which maps Dutch municipalities to their
responsible archives with ISIL codes, websites, and provincial archive info.
Data source:
https://docs.google.com/spreadsheets/d/1rS_Z5L6L2vvfGLS6eHI8wfyiwB-KUfHEr7W1VNY3rpg/export?format=csv
Matching strategy:
1. Match by municipality name from original_entry.plaatsnaam_bezoekadres
2. Match by Google Maps administrative_area_level_2 (gemeente)
3. Match by Google Maps locality that maps to a municipality
Usage:
python scripts/enrich_nde_genealogiewerkbalk.py
python scripts/enrich_nde_genealogiewerkbalk.py --dry-run
python scripts/enrich_nde_genealogiewerkbalk.py --entry 0016
python scripts/enrich_nde_genealogiewerkbalk.py --refresh-csv
Environment:
No special environment variables required.
"""
import os
import sys
import csv
import yaml
import argparse
import logging
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
from difflib import SequenceMatcher
import urllib.request
import unicodedata
# Set up logging: timestamped INFO-level output on the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths, resolved relative to the repository root (parent of scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
ENTRIES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "entries"
SOURCES_DIR = PROJECT_ROOT / "data" / "nde" / "enriched" / "sources"
# Local cache of the Genealogiewerkbalk registry CSV.
CSV_FILE = SOURCES_DIR / "genealogiewerkbalk_municipality_archives.csv"
# Google Sheets CSV export of the registry (see module docstring).
CSV_URL = "https://docs.google.com/spreadsheets/d/1rS_Z5L6L2vvfGLS6eHI8wfyiwB-KUfHEr7W1VNY3rpg/export?format=csv"
# Known municipality name aliases (normalized form -> canonical normalized form)
# The canonical form must match what's in the Genealogiewerkbalk CSV after normalization
MUNICIPALITY_ALIASES = {
    # Den Haag / 's-Gravenhage
    "den haag": "gravenhage",
    "the hague": "gravenhage",
    # Scheveningen is part of Den Haag
    "scheveningen": "gravenhage",
    "scheveingen": "gravenhage",
    "loosduinen": "gravenhage",
    # Voorburg is now part of Leidschendam-Voorburg
    "voorburg": "leidschendam voorburg",
    # Villages that are parts of municipalities - Bergen (NH)
    "egmond aan zee": "bergen (nh.)",
    "egmond binnen": "bergen (nh.)",
    "egmond aan den hoef": "bergen (nh.)",
    "bergen": "bergen (nh.)",  # Default Bergen to NH (most heritage institutions are there)
    # Schagen area
    "callantsoog": "schagen",
    "sint maarten": "schagen",
    # Frisian name variants
    "haren": "groningen",  # Haren merged with Groningen in 2019
    "zuidwolde": "de wolden",
    "de knipe": "heerenveen",
    # Other common variants
    "krommenie": "zaanstad",
    "spaarndam": "haarlem",
    "midwoud": "medemblik",
    "hoogblokland": "vijfheerenlanden",
    "hoogblokland hoornaar noordeloos": "vijfheerenlanden",
    "ouddorp": "goeree overflakkee",
    # Noord-Brabant villages
    "berlicum": "sint michielsgestel",
    "berlicum middelrode": "sint michielsgestel",
    "oeffelt": "berg en dal",
    # Limburg villages
    "helden": "peel en maas",
    # Zeeland villages
    "wissekerke": "noord beveland",
}


def normalize_municipality_name(name: str) -> str:
    """Normalize a municipality name into a lookup key.

    Handles:
    - Case insensitivity
    - Dutch articles and prefixes ('s-, gemeente, gem.)
    - Unicode normalization (NFKC)
    - Hyphens and runs of whitespace
    - Trailing periods
    - Known aliases (Den Haag -> 's-Gravenhage, etc.)

    Args:
        name: Raw municipality or place name; may be empty.

    Returns:
        Normalized lowercase key, or "" for empty input.
    """
    if not name:
        return ""
    # Unicode normalize so compatibility characters compare equal.
    name = unicodedata.normalize('NFKC', name)
    name = name.lower().strip()
    # Handle 's- prefix (e.g., 's-Gravenhage -> gravenhage).
    if name.startswith("'s-") or name.startswith("'s "):
        name = name[3:]
    # Remove common prefixes that might vary.
    for prefix in ('gemeente ', 'gem. ', 'gem '):
        if name.startswith(prefix):
            name = name[len(prefix):]
    # Treat hyphens as spaces, then collapse any run of whitespace to a
    # single space.  (The original `.replace(' ', ' ')` was a no-op bug:
    # it replaced a space with a space, so double spaces survived and
    # broke alias/exact lookups.)
    name = ' '.join(name.replace('-', ' ').split())
    # Remove trailing periods.
    name = name.rstrip('.').strip()
    # Apply known aliases to map villages/variants to their municipality.
    return MUNICIPALITY_ALIASES.get(name, name)
def load_genealogiewerkbalk_data(csv_path: Path) -> Dict[str, Dict[str, Any]]:
    """Load the Genealogiewerkbalk CSV into a lookup dictionary.

    Args:
        csv_path: Path to the locally cached registry CSV.

    Returns:
        Dict mapping normalized municipality names to their data;
        empty when the file is missing.
    """
    lookup: Dict[str, Dict[str, Any]] = {}
    if not csv_path.exists():
        logger.warning(f"CSV file not found: {csv_path}")
        return lookup

    def cell(row: Dict[str, str], column: str) -> str:
        # Missing columns behave like empty cells.
        return row.get(column, '').strip()

    with open(csv_path, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            gemeente = cell(row, 'gemeentenaam')
            if not gemeente:
                continue
            # "geen*" codes mean the archive has no real ISIL assigned.
            isil = cell(row, 'isil')
            usable_isil = bool(isil) and not isil.startswith('geen')
            lookup[normalize_municipality_name(gemeente)] = {
                'gemeentenaam': gemeente,
                'gemeentecode': cell(row, 'gemeentecode'),
                'archief_gemeente': cell(row, 'archief_gemeente'),
                'isil': isil if usable_isil else None,
                'isil_raw': isil,  # Keep original for reference
                'extra_info': cell(row, 'extra_info'),
                'website_gemeentearchief': cell(row, 'website_gemeentearchief'),
                'provincienaam': cell(row, 'provincienaam'),
                'provinciecode': cell(row, 'provinciecode'),
                'archief_provincie': cell(row, 'archief_provincie'),
                'website_provinciaal_archief': cell(row, 'website_provinciaal_archief'),
            }
    logger.info(f"Loaded {len(lookup)} municipalities from Genealogiewerkbalk CSV")
    return lookup
def find_municipality_match(
    entry: Dict[str, Any],
    municipalities: Dict[str, Dict[str, Any]]
) -> Tuple[Optional[Dict[str, Any]], str, float]:
    """Find matching municipality for an entry.

    Candidate names are tried in decreasing order of reliability; each
    candidate is looked up exactly first, then fuzzily (see _try_match).

    Args:
        entry: The NDE entry data.
        municipalities: Lookup of normalized municipality name -> data,
            as built by load_genealogiewerkbalk_data().

    Returns:
        Tuple of (matched_data, match_method, confidence_score), or
        (None, 'no_match', 0.0) when nothing matched.
    """
    # Strategy 1: place name from the original NDE entry (most direct).
    plaatsnaam = entry.get('original_entry', {}).get('plaatsnaam_bezoekadres', '')
    result = _try_match(plaatsnaam, municipalities,
                        'plaatsnaam_bezoekadres', 'plaatsnaam_fuzzy', 1.0)
    if result:
        return result

    # Strategy 2: Google Maps administrative_area_level_2 (= gemeente).
    google_data = entry.get('google_maps_enrichment', {})
    address_components = google_data.get('address_components', [])
    for component in address_components:
        if 'administrative_area_level_2' in component.get('types', []):
            result = _try_match(component.get('long_name', ''), municipalities,
                                'google_maps_admin2', 'google_maps_admin2_fuzzy', 0.95)
            if result:
                return result

    # Strategy 3: Google Maps locality (some localities are also
    # municipalities).  Higher fuzzy threshold: localities are noisier.
    for component in address_components:
        if 'locality' in component.get('types', []):
            result = _try_match(component.get('long_name', ''), municipalities,
                                'google_maps_locality', 'google_maps_locality_fuzzy',
                                0.85, fuzzy_threshold=0.90)
            if result:
                return result

    # Strategy 4: municipality claims extracted by web enrichment.
    for claim in entry.get('web_enrichment', {}).get('claims', []):
        if claim.get('claim_type') == 'municipality':
            result = _try_match(claim.get('claim_value', ''), municipalities,
                                'web_claim_municipality',
                                'web_claim_municipality_fuzzy', 0.90)
            if result:
                return result

    # Strategies 5-7: 'municipality' fields from other entry sections.
    for section, exact_method, confidence in (
        ('location', 'location_municipality', 0.90),
        ('manual_location_override', 'manual_override_municipality', 0.95),
        ('zcbs_enrichment', 'zcbs_municipality', 0.90),
    ):
        name = entry.get(section, {}).get('municipality', '')
        result = _try_match(name, municipalities,
                            exact_method, exact_method + '_fuzzy', confidence)
        if result:
            return result

    return None, 'no_match', 0.0


def _try_match(
    raw_name: str,
    municipalities: Dict[str, Dict[str, Any]],
    exact_method: str,
    fuzzy_method: str,
    confidence: float,
    fuzzy_threshold: float = 0.85,
) -> Optional[Tuple[Dict[str, Any], str, float]]:
    """Try one candidate name against the municipality lookup.

    Shared by every matching strategy in find_municipality_match (the
    original repeated this exact/fuzzy pattern seven times inline).

    Args:
        raw_name: Candidate name (not yet normalized); empty never matches.
        municipalities: Normalized name -> data lookup.
        exact_method: Method label reported for an exact match.
        fuzzy_method: Method label reported for a fuzzy match.
        confidence: Confidence for an exact match; a fuzzy match reports
            its similarity score scaled by this value.
        fuzzy_threshold: Minimum similarity score for a fuzzy match.

    Returns:
        (matched_data, method, confidence) on success, else None.
    """
    if not raw_name:
        return None
    normalized = normalize_municipality_name(raw_name)
    if normalized in municipalities:
        return municipalities[normalized], exact_method, confidence
    best_key, score = fuzzy_match_municipality(normalized, municipalities)
    if best_key and score >= fuzzy_threshold:
        return municipalities[best_key], fuzzy_method, score * confidence
    return None
def fuzzy_match_municipality(
    search_term: str,
    municipalities: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Tuple[Optional[str], float]:
    """Find the best fuzzy match for a municipality name.

    Args:
        search_term: Normalized name to search for.
        municipalities: Lookup whose keys are normalized names.
        threshold: Minimum SequenceMatcher ratio to accept.

    Returns:
        Tuple of (matched_key, similarity_score), or (None, 0.0) when
        the search term is empty or nothing reaches the threshold.
    """
    if not search_term or not municipalities:
        return None, 0.0
    scored = (
        (key, SequenceMatcher(None, search_term, key).ratio())
        for key in municipalities
    )
    # max() keeps the first key among ties, matching dict iteration order.
    winner, top_score = max(scored, key=lambda pair: pair[1])
    if top_score >= threshold:
        return winner, top_score
    return None, 0.0
def create_enrichment_section(
    match_data: Dict[str, Any],
    match_method: str,
    confidence: float
) -> Dict[str, Any]:
    """Build the genealogiewerkbalk_enrichment section for an entry.

    Args:
        match_data: Municipality record from the Genealogiewerkbalk lookup.
        match_method: Strategy name that produced the match.
        confidence: Match confidence score.

    Returns:
        Dict ready to store under the entry's
        'genealogiewerkbalk_enrichment' key.
    """
    municipal_archive = {
        'name': match_data['archief_gemeente'],
        'website': match_data['website_gemeentearchief'] or None,
        'isil': match_data['isil'],
    }
    section = {
        'source': 'Genealogiewerkbalk.nl Municipality Archives Registry',
        'source_url': 'https://www.genealogiewerkbalk.nl/archieven.html',
        'data_url': CSV_URL,
        'data_tier': 'TIER_2_VERIFIED',
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'match_method': match_method,
        'match_confidence': round(confidence, 4),
        # Municipality info
        'municipality': {
            'name': match_data['gemeentenaam'],
            'code': match_data['gemeentecode'],
        },
        # Municipal archive info
        'municipal_archive': municipal_archive,
        # Province info
        'province': {
            'name': match_data['provincienaam'],
            'code': match_data['provinciecode'],
        },
        # Provincial archive info
        'provincial_archive': {
            'name': match_data['archief_provincie'],
            'website': match_data['website_provinciaal_archief'] or None,
        },
    }
    # Add extra info if present
    if match_data.get('extra_info'):
        section['extra_info'] = match_data['extra_info']
    # Keep the raw "geen*" code as a note when it differs from the parsed ISIL
    if match_data.get('isil_raw') and match_data['isil_raw'] != match_data['isil']:
        municipal_archive['isil_note'] = match_data['isil_raw']
    return section
def update_provenance(entry: Dict[str, Any], match_method: str) -> None:
    """Record the Genealogiewerkbalk source in the entry's provenance.

    Creates the provenance scaffold when absent, registers (or replaces)
    the genealogiewerkbalk source record with its extracted claims, and
    tags the source under TIER_2_VERIFIED in the data tier summary.

    Args:
        entry: Entry dict, mutated in place.
        match_method: Strategy name that produced the match.
    """
    if 'provenance' not in entry:
        entry['provenance'] = {
            'schema_version': '1.0.0',
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'sources': {}
        }
    provenance = entry['provenance']

    # Register the genealogiewerkbalk source (single-element history list).
    provenance.setdefault('sources', {})['genealogiewerkbalk'] = [{
        'source_type': 'genealogiewerkbalk_registry',
        'fetch_timestamp': datetime.now(timezone.utc).isoformat(),
        'data_url': CSV_URL,
        'match_method': match_method,
        'claims_extracted': [
            'municipality_name',
            'municipality_code',
            'municipal_archive_name',
            'municipal_archive_website',
            'municipal_archive_isil',
            'province_name',
            'province_code',
            'provincial_archive_name',
            'provincial_archive_website',
        ]
    }]

    # Tag the source in the tier summary exactly once.
    tier_summary = provenance.setdefault('data_tier_summary', {})
    tier_2 = tier_summary.setdefault('TIER_2_VERIFIED', [])
    if 'genealogiewerkbalk_registry' not in tier_2:
        tier_2.append('genealogiewerkbalk_registry')
def refresh_csv() -> bool:
    """Download a fresh registry CSV from Google Sheets into CSV_FILE.

    Returns:
        True when the download and a sanity read-back succeeded,
        False otherwise (the error is logged, not raised).
    """
    logger.info(f"Downloading fresh CSV from: {CSV_URL}")
    try:
        # Make sure the target directory exists before writing.
        SOURCES_DIR.mkdir(parents=True, exist_ok=True)
        urllib.request.urlretrieve(CSV_URL, CSV_FILE)
        # Sanity check: the downloaded file must parse as CSV rows.
        with open(CSV_FILE, 'r', encoding='utf-8') as handle:
            row_count = sum(1 for _ in csv.DictReader(handle))
        logger.info(f"Downloaded CSV with {row_count} municipalities")
        return True
    except Exception as e:
        logger.error(f"Failed to download CSV: {e}")
        return False
def process_entry(
    entry_path: Path,
    municipalities: Dict[str, Dict[str, Any]],
    dry_run: bool = False,
    force: bool = False
) -> Tuple[str, Optional[str]]:
    """Process a single entry YAML file.

    Args:
        entry_path: Path to the entry file.
        municipalities: Normalized name -> municipality data lookup.
        dry_run: When True, report the match without writing.
        force: When True, re-enrich even if already enriched.

    Returns:
        Tuple of (status, match_info); status is one of 'enriched',
        'would_enrich' (dry run), 'already_enriched', 'no_match', 'error'.
    """
    try:
        with open(entry_path, 'r', encoding='utf-8') as handle:
            entry = yaml.safe_load(handle)
        if not entry:
            return 'error', 'Empty file'

        # Skip entries that were enriched before, unless forced.
        if 'genealogiewerkbalk_enrichment' in entry and not force:
            return 'already_enriched', None

        match_data, match_method, confidence = find_municipality_match(entry, municipalities)
        if not match_data:
            return 'no_match', None

        gemeente = match_data['gemeentenaam']
        archive = match_data['archief_gemeente']
        if dry_run:
            return 'would_enrich', f"{gemeente} -> {archive} ({match_method}, {confidence:.2f})"

        # Attach the enrichment, record provenance, and persist the entry.
        entry['genealogiewerkbalk_enrichment'] = create_enrichment_section(
            match_data, match_method, confidence)
        update_provenance(entry, match_method)
        with open(entry_path, 'w', encoding='utf-8') as handle:
            yaml.dump(entry, handle, allow_unicode=True,
                      default_flow_style=False, sort_keys=False)
        return 'enriched', f"{gemeente} -> {archive} ({match_method})"
    except Exception as e:
        logger.error(f"Error processing {entry_path.name}: {e}")
        return 'error', str(e)
def main():
    """CLI entry point: parse args, load registry data, enrich entries."""
    parser = argparse.ArgumentParser(
        description='Enrich NDE entries with Genealogiewerkbalk municipality archive data'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    parser.add_argument('--entry', type=str,
                        help='Process only a specific entry (e.g., "0016" or "0016_Q81181377")')
    parser.add_argument('--force', action='store_true',
                        help='Re-enrich even if already enriched')
    parser.add_argument('--refresh-csv', action='store_true',
                        help='Download fresh CSV before processing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed output')
    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    # Refresh the cached CSV when requested, or when no cache exists yet.
    if args.refresh_csv or not CSV_FILE.exists():
        if not refresh_csv():
            logger.error("Failed to get CSV data")
            sys.exit(1)

    municipalities = load_genealogiewerkbalk_data(CSV_FILE)
    if not municipalities:
        logger.error("No municipality data loaded")
        sys.exit(1)

    # Select the entry files to process.
    if args.entry:
        pattern = f"{args.entry}*.yaml"
        entry_files = list(ENTRIES_DIR.glob(pattern))
        if not entry_files:
            logger.error(f"No entry files found matching: {pattern}")
            sys.exit(1)
    else:
        entry_files = sorted(ENTRIES_DIR.glob("*.yaml"))

    logger.info(f"Processing {len(entry_files)} entry files...")

    stats = {
        'total': len(entry_files),
        'enriched': 0,
        'already_enriched': 0,
        'no_match': 0,
        'error': 0,
    }
    for entry_path in entry_files:
        status, info = process_entry(
            entry_path, municipalities,
            dry_run=args.dry_run, force=args.force
        )
        if status in ('enriched', 'would_enrich'):
            stats['enriched'] += 1
            prefix = '[DRY-RUN] Would enrich' if args.dry_run else 'Enriched'
            logger.info(f"{prefix}: {entry_path.name} - {info}")
        elif status == 'already_enriched':
            stats['already_enriched'] += 1
            if args.verbose:
                logger.debug(f"Already enriched: {entry_path.name}")
        elif status == 'no_match':
            stats['no_match'] += 1
            if args.verbose:
                logger.debug(f"No match: {entry_path.name}")
        elif status == 'error':
            stats['error'] += 1
            logger.warning(f"Error: {entry_path.name} - {info}")

    # Summary
    logger.info("\n=== Enrichment Summary ===")
    logger.info(f"Total files: {stats['total']}")
    logger.info(f"Enriched: {stats['enriched']}")
    logger.info(f"Already enriched: {stats['already_enriched']}")
    logger.info(f"No match: {stats['no_match']}")
    logger.info(f"Errors: {stats['error']}")
    if args.dry_run:
        logger.info("\n[DRY-RUN] No changes were made.")


if __name__ == '__main__':
    main()