#!/usr/bin/env python3
"""
Load all custodian YAML files into DuckLake as raw enriched data.
v3 - Enhanced coordinate extraction with cascading fallback across 10 known patterns.
This script reads all YAML files from data/custodian/ and uploads them
to DuckLake as a single table for the bronze/raw data lake layer.
Usage:
python scripts/load_custodians_to_ducklake_v3.py [--dry-run] [--limit N]
API Endpoint: http://localhost:8765/upload
COORDINATE EXTRACTION PRIORITY (highest to lowest):
1. manual_location_override.coordinates - User-verified corrections
2. google_maps_enrichment.coordinates - Most accurate for operational institutions
3. wikidata_enrichment.wikidata_coordinates - Good coverage with precision
4. wikidata_enrichment.coordinates - Verified Wikidata data
5. ghcid.location_resolution.source_coordinates - GeoNames-resolved
6. ghcid.location_resolution (direct) - Early GeoNames integration
7. original_entry.locations[0] - CH-Annotator files
8. locations[0] - Root-level locations array
9. location (root level) - Single location object
10. Do NOT use: google_maps_enrichment_invalid, unesco nearby sites
PROVENANCE TRACKING:
Each coordinate extraction records the source for debugging.
"""
import argparse
import json
import os
import sys
import tempfile
from pathlib import Path

import requests
import yaml

# Configuration
DUCKLAKE_URL = os.getenv("DUCKLAKE_URL", "http://localhost:8765")
DEFAULT_CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
TABLE_NAME = "custodians_raw"
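# The target server can be overridden per run without editing this file
# (the host/port below is illustrative):
#
#   DUCKLAKE_URL=http://10.0.0.5:8765 python scripts/load_custodians_to_ducklake_v3.py --dry-run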

def extract_coordinates(data: dict) -> tuple[float | None, float | None, str]:
    """
    Extract coordinates using cascading priority fallback.

    Returns:
        Tuple of (latitude, longitude, source) where source indicates
        which pattern was used for provenance tracking.
    """
    # Priority 1: manual_location_override (user-verified corrections)
    override = data.get("manual_location_override", {})
    if override:
        coords = override.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "manual_location_override"

    # Priority 2: google_maps_enrichment (most accurate for operational institutions)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        coords = gm.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "google_maps_enrichment"

    # Priority 3: wikidata_enrichment.wikidata_coordinates (with precision)
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # First try wikidata_coordinates (most common pattern)
        wd_coords = wd.get("wikidata_coordinates", {})
        if wd_coords:
            lat = wd_coords.get("latitude")
            lon = wd_coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.wikidata_coordinates"

        # Priority 4: wikidata_enrichment.coordinates (alternative structure)
        coords = wd.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.coordinates"

    # Priority 5: ghcid.location_resolution.source_coordinates (GeoNames-resolved)
    ghcid = data.get("ghcid", {})
    if ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            # Try source_coordinates first
            src_coords = loc_res.get("source_coordinates", {})
            if src_coords:
                lat = src_coords.get("latitude")
                lon = src_coords.get("longitude")
                if lat is not None and lon is not None:
                    return lat, lon, "ghcid.location_resolution.source_coordinates"

            # Priority 6: ghcid.location_resolution direct (older format)
            lat = loc_res.get("latitude")
            lon = loc_res.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "ghcid.location_resolution"

    # Priority 7: original_entry.locations[0] (CH-Annotator files)
    original = data.get("original_entry", {})
    if original:
        locations = original.get("locations", [])
        if isinstance(locations, list) and locations:
            loc = locations[0]
            lat = loc.get("latitude")
            lon = loc.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "original_entry.locations[0]"

    # Priority 8: locations[0] (root-level locations array)
    locations = data.get("locations", [])
    if isinstance(locations, list) and locations:
        loc = locations[0]
        lat = loc.get("latitude")
        lon = loc.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "locations[0]"
        # Some files have coordinates nested under "coordinates"
        coords = loc.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "locations[0].coordinates"

    # Priority 9: location (root-level single object)
    location = data.get("location", {})
    if location and isinstance(location, dict):
        lat = location.get("latitude")
        lon = location.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "location"

    # No coordinates found
    return None, None, ""
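# A minimal sketch of the cascade (coordinate values are made up): when both
# Google Maps and Wikidata coordinates are present, the Google Maps pair wins
# because it sits higher in the priority order.
#
#   >>> extract_coordinates({
#   ...     "google_maps_enrichment": {"coordinates": {"latitude": 52.37, "longitude": 4.89}},
#   ...     "wikidata_enrichment": {"wikidata_coordinates": {"latitude": 52.0, "longitude": 5.0}},
#   ... })
#   (52.37, 4.89, 'google_maps_enrichment')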

def extract_city_country(data: dict) -> tuple[str, str]:
    """
    Extract city and country using cascading fallback.

    Returns:
        Tuple of (city, country)
    """
    city = ""
    country = ""

    # Priority 1: Google Maps address components (most accurate)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "locality" in types and not city:
                city = comp.get("long_name", "")
            if "country" in types and not country:
                country = comp.get("short_name", "")

    # Priority 2: ghcid.location_resolution
    if not city or not country:
        ghcid = data.get("ghcid", {})
        loc_res = ghcid.get("location_resolution", {}) if ghcid else {}
        if loc_res:
            if not city:
                city = (
                    loc_res.get("geonames_name", "")
                    or loc_res.get("city_name", "")
                    or loc_res.get("google_maps_locality", "")
                )
            if not country:
                country = loc_res.get("country_code", "")

    # Priority 3: location (root level)
    if not city or not country:
        location = data.get("location", {})
        if location:
            if not city:
                city = location.get("city", "")
            if not country:
                country = location.get("country", "")

    # Priority 4: locations[0] (root level array)
    if not city or not country:
        locations = data.get("locations", [])
        if locations and len(locations) > 0:
            loc = locations[0]
            if not city:
                city = loc.get("city", "")
            if not country:
                country = loc.get("country", "")

    # Priority 5: original_entry.locations[0]
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            locations = original.get("locations", [])
            if locations and len(locations) > 0:
                loc = locations[0]
                if not city:
                    city = loc.get("city", "")
                if not country:
                    country = loc.get("country", "")

    # Priority 6: original_entry direct fields
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            if not city:
                city = original.get("plaatsnaam_bezoekadres", "") or original.get("city", "")
            if not country:
                country = original.get("country", "")

    # Ultimate fallback: extract country from GHCID string (first 2 chars)
    if not country:
        ghcid_current = data.get("ghcid", {}).get("ghcid_current", "")
        if ghcid_current and len(ghcid_current) >= 2:
            country = ghcid_current[:2]

    return city, country
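# Sketch of the ultimate fallback (illustrative GHCID value): a record with no
# address data at all but a GHCID like "NL-000123" still yields a country code.
#
#   >>> extract_city_country({"ghcid": {"ghcid_current": "NL-000123"}})
#   ('', 'NL')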

def extract_financial_documents(data: dict) -> dict:
    """
    Extract financial document metadata from web_claims.

    Extracts counts and latest URLs for:
    - annual_report_url (jaarverslag)
    - financial_statement_url (jaarstukken)
    - anbi_publication_url (ANBI)
    - policy_document_url (beleidsplan)

    Returns:
        Dict with has_*, count, and latest_*_url fields
    """
    result = {
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
    }
    web_claims = data.get("web_claims", {})
    if not web_claims:
        return result

    claims = web_claims.get("claims", [])
    if not claims:
        return result

    # Group claims by type and collect year info
    annual_reports = []
    financial_statements = []
    anbi_publications = []
    policy_documents = []
    year_claims = {}  # Map URL to year

    for claim in claims:
        claim_type = claim.get("claim_type", "")
        claim_value = claim.get("claim_value", "")
        if claim_type == "financial_document_year":
            # Year claims have raw_value like "year=2024, from=annual_report_url, url=..."
            raw = claim.get("raw_value", "")
            if "url=" in raw:
                url_part = raw.split("url=")[-1]
                try:
                    year = int(claim_value)
                    year_claims[url_part] = year
                except (ValueError, TypeError):
                    pass
        elif claim_type == "annual_report_url":
            annual_reports.append(claim_value)
        elif claim_type == "financial_statement_url":
            financial_statements.append(claim_value)
        elif claim_type == "anbi_publication_url":
            anbi_publications.append(claim_value)
        elif claim_type == "policy_document_url":
            policy_documents.append(claim_value)

    # Set counts and has_* flags
    result["annual_report_count"] = len(annual_reports)
    result["financial_statement_count"] = len(financial_statements)
    result["anbi_publication_count"] = len(anbi_publications)
    result["policy_document_count"] = len(policy_documents)
    result["has_annual_report"] = len(annual_reports) > 0
    result["has_financial_statement"] = len(financial_statements) > 0
    result["has_anbi_publication"] = len(anbi_publications) > 0
    result["has_policy_document"] = len(policy_documents) > 0

    # Find latest by year (or just use first if no year info)
    def get_latest_with_year(urls: list, year_map: dict) -> tuple:
        """Return (url, year) for the document with the highest year."""
        if not urls:
            return "", None
        # Fall back to the first URL when no year info is available
        best_url = urls[0]
        best_year = None
        for url in urls:
            year = year_map.get(url)
            if year is not None:
                if best_year is None or year > best_year:
                    best_year = year
                    best_url = url
        return best_url, best_year

    url, year = get_latest_with_year(annual_reports, year_claims)
    result["latest_annual_report_url"] = url
    result["latest_annual_report_year"] = year
    result["latest_financial_statement_url"], _ = get_latest_with_year(financial_statements, year_claims)
    result["latest_policy_document_url"], _ = get_latest_with_year(policy_documents, year_claims)
    result["latest_anbi_publication_url"], _ = get_latest_with_year(anbi_publications, year_claims)
    return result
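# Sketch of the year matching (URLs are made up): a financial_document_year
# claim links a year to a URL via its raw_value, so the annual report whose
# claim_value matches that URL picks up the year.
#
#   >>> docs = extract_financial_documents({"web_claims": {"claims": [
#   ...     {"claim_type": "annual_report_url",
#   ...      "claim_value": "https://example.org/jv-2024.pdf"},
#   ...     {"claim_type": "financial_document_year", "claim_value": "2024",
#   ...      "raw_value": "year=2024, from=annual_report_url, url=https://example.org/jv-2024.pdf"},
#   ... ]}})
#   >>> docs["latest_annual_report_year"], docs["annual_report_count"]
#   (2024, 1)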

def extract_top_level_fields(data: dict) -> dict:
    """
    Extract and flatten top-level fields from YAML data.

    Complex nested objects are stored as JSON strings.
    """
    record = {
        # File identification
        "file_name": data.get("_file_name", ""),
        "ghcid_current": "",
        "record_id": "",
        # Original entry (flattened)
        "org_name": "",
        "org_type": "",
        "institution_type": "",
        "wikidata_id": "",
        # Enrichment status
        "enrichment_status": data.get("enrichment_status", ""),
        "processing_timestamp": data.get("processing_timestamp", ""),
        # Location (with cascading extraction)
        "latitude": None,
        "longitude": None,
        "coordinate_source": "",  # NEW: Provenance for coordinates
        "formatted_address": "",
        "city": "",
        "country": "",
        "postal_code": "",
        "region": "",
        # Custodian name consensus
        "custodian_name": "",
        "custodian_name_confidence": None,
        # Emic name (native language name)
        "emic_name": "",
        "name_language": "",
        # Ratings
        "google_rating": None,
        "google_total_ratings": None,
        # Financial transparency (denormalized from web_claims)
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
        # Temporal data
        "timespan_begin": None,
        "timespan_end": None,
        "destruction_date": None,
        "founding_date": None,
        "dissolution_date": None,
        "wikidata_inception": None,
        # Complex nested objects as JSON strings
        "original_entry_json": "",
        "wikidata_enrichment_json": "",
        "google_maps_enrichment_json": "",
        "web_enrichment_json": "",
        "web_claims_json": "",
        "ghcid_json": "",
        "identifiers_json": "",
        "provenance_json": "",
        "genealogiewerkbalk_json": "",
        "digital_platforms_json": "",
        # Additional JSON objects
        "service_area_json": "",
        "timespan_json": "",
        "time_of_destruction_json": "",
        "conflict_status_json": "",
        "temporal_extent_json": "",
        "youtube_enrichment_json": "",
        "unesco_mow_enrichment_json": "",
        "ch_annotator_json": "",
        "location_json": "",
    }
    # Extract GHCID
    ghcid = data.get("ghcid", {})
    if ghcid:
        record["ghcid_current"] = ghcid.get("ghcid_current", "")
        record["record_id"] = ghcid.get("record_id", "")
        record["ghcid_json"] = json.dumps(ghcid, ensure_ascii=False, default=str)

    # Extract original entry
    original = data.get("original_entry", {})
    if original:
        # Try multiple name fields
        record["org_name"] = original.get("organisatie", "") or original.get("name", "") or ""
        org_types = original.get("type", [])
        record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
        record["institution_type"] = original.get("institution_type", "")
        record["wikidata_id"] = original.get("wikidata_id", "")
        record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)

    # Extract coordinates using cascading priority
    lat, lon, coord_source = extract_coordinates(data)
    record["latitude"] = lat
    record["longitude"] = lon
    record["coordinate_source"] = coord_source

    # Extract city and country using cascading priority
    city, country = extract_city_country(data)
    record["city"] = city
    record["country"] = country

    # Extract additional location data from Google Maps
    gm = data.get("google_maps_enrichment", {})
    if gm:
        record["formatted_address"] = gm.get("formatted_address", "")
        record["google_rating"] = gm.get("rating")
        record["google_total_ratings"] = gm.get("total_ratings")
        # Extract postal code from address components
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "postal_code" in types:
                record["postal_code"] = comp.get("long_name", "")
            if "administrative_area_level_1" in types:
                record["region"] = comp.get("short_name", "")
        record["google_maps_enrichment_json"] = json.dumps(gm, ensure_ascii=False, default=str)

    # Fallback region from GHCID location_resolution
    if not record["region"] and ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            record["region"] = loc_res.get("region_code", "")

    # Extract custodian name consensus and emic_name
    cn = data.get("custodian_name", {})
    if cn:
        record["custodian_name"] = cn.get("claim_value", "")
        record["custodian_name_confidence"] = cn.get("confidence")
        # Extract emic_name (native language name)
        record["emic_name"] = cn.get("emic_name", "") or cn.get("claim_value_arabic", "") or ""

    # Infer name_language from country code
    language_map = {
        "NL": "nl", "BE": "nl", "DE": "de", "AT": "de", "CH": "de",
        "FR": "fr", "ES": "es", "IT": "it", "PT": "pt", "BR": "pt",
        "GB": "en", "US": "en", "AU": "en", "CA": "en", "IE": "en",
        "PL": "pl", "CZ": "cs", "HU": "hu", "RU": "ru", "JP": "ja",
        "CN": "zh", "KR": "ko", "AR": "ar", "PS": "ar", "IL": "he",
        "GR": "el", "TR": "tr", "SE": "sv", "NO": "no", "DK": "da",
        "FI": "fi", "RO": "ro", "BG": "bg", "UA": "uk", "BY": "be",
    }
    record["name_language"] = language_map.get(record["country"], "")

    # Extract timespan data
    timespan = data.get("timespan", {})
    if timespan:
        record["timespan_begin"] = timespan.get("begin_of_the_begin", "")
        record["timespan_end"] = timespan.get("end_of_the_end", "")
        record["timespan_json"] = json.dumps(timespan, ensure_ascii=False, default=str)

    # Extract time_of_destruction
    tod = data.get("time_of_destruction", {})
    if tod:
        record["destruction_date"] = tod.get("date", "")
        record["time_of_destruction_json"] = json.dumps(tod, ensure_ascii=False, default=str)

    # Extract conflict_status
    cs = data.get("conflict_status", {})
    if cs:
        if not record["destruction_date"]:
            record["destruction_date"] = cs.get("date", "")
        record["conflict_status_json"] = json.dumps(cs, ensure_ascii=False, default=str)

    # Extract wikidata inception/dissolution
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # Try multiple paths for inception date
        wikidata_inception = (
            wd.get("wikidata_inception", "")
            or wd.get("wikidata_founded", "")
            or wd.get("wikidata_temporal", {}).get("inception", "")
        )
        record["wikidata_inception"] = wikidata_inception
        # Use wikidata_inception as founding_date fallback
        if wikidata_inception and not record["founding_date"]:
            record["founding_date"] = wikidata_inception
        # Try multiple paths for dissolution date
        wikidata_dissolution = (
            wd.get("wikidata_dissolution", "")
            or wd.get("wikidata_dissolved", "")
        )
        if wikidata_dissolution and not record["dissolution_date"]:
            record["dissolution_date"] = wikidata_dissolution
        record["wikidata_enrichment_json"] = json.dumps(wd, ensure_ascii=False, default=str)

    # Extract service_area
    sa = data.get("service_area", {})
    if sa:
        record["service_area_json"] = json.dumps(sa, ensure_ascii=False, default=str)

    # Extract youtube_enrichment
    yt = data.get("youtube_enrichment", {})
    if yt:
        record["youtube_enrichment_json"] = json.dumps(yt, ensure_ascii=False, default=str)

    # Extract unesco_mow_enrichment (UNESCO Memory of the World inscriptions)
    unesco_mow = data.get("unesco_mow_enrichment", {})
    if unesco_mow:
        record["unesco_mow_enrichment_json"] = json.dumps(unesco_mow, ensure_ascii=False, default=str)

    # Extract CH-Annotator data
    ch = data.get("ch_annotator", {})
    if ch:
        record["ch_annotator_json"] = json.dumps(ch, ensure_ascii=False, default=str)

    # Extract location as JSON
    location = data.get("location", {})
    if location:
        record["location_json"] = json.dumps(location, ensure_ascii=False, default=str)

    # Store other complex objects as JSON
    if data.get("web_enrichment"):
        record["web_enrichment_json"] = json.dumps(
            data["web_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("web_claims"):
        record["web_claims_json"] = json.dumps(
            data["web_claims"], ensure_ascii=False, default=str
        )

    # Extract denormalized financial document fields
    fin_docs = extract_financial_documents(data)
    record.update(fin_docs)

    if data.get("identifiers"):
        record["identifiers_json"] = json.dumps(
            data["identifiers"], ensure_ascii=False, default=str
        )
    if data.get("provenance"):
        record["provenance_json"] = json.dumps(
            data["provenance"], ensure_ascii=False, default=str
        )
    if data.get("genealogiewerkbalk_enrichment"):
        record["genealogiewerkbalk_json"] = json.dumps(
            data["genealogiewerkbalk_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("digital_platforms"):
        record["digital_platforms_json"] = json.dumps(
            data["digital_platforms"], ensure_ascii=False, default=str
        )

    return record
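# The *_json columns round-trip back to Python structures downstream, e.g.
# (illustrative input):
#
#   >>> rec = extract_top_level_fields({"ghcid": {"ghcid_current": "NL-000123"}})
#   >>> json.loads(rec["ghcid_json"])["ghcid_current"]
#   'NL-000123'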

def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
    """Load all YAML files from directory and convert to records."""
    records = []
    yaml_files = sorted(directory.glob("*.yaml"))
    if limit:
        yaml_files = yaml_files[:limit]
    print(f"Loading {len(yaml_files)} YAML files from {directory}...")

    coord_stats = {
        "total": 0,
        "with_coords": 0,
        "sources": {},
    }

    for i, yaml_file in enumerate(yaml_files):
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if data:
                data["_file_name"] = yaml_file.name
                record = extract_top_level_fields(data)
                records.append(record)
                # Track coordinate extraction stats
                coord_stats["total"] += 1
                if record.get("latitude") is not None and record.get("longitude") is not None:
                    coord_stats["with_coords"] += 1
                    source = record.get("coordinate_source", "unknown")
                    coord_stats["sources"][source] = coord_stats["sources"].get(source, 0) + 1
            if (i + 1) % 1000 == 0:
                print(f"  Processed {i + 1}/{len(yaml_files)} files...")
        except Exception as e:
            print(f"  Warning: Failed to load {yaml_file.name}: {e}")
            continue

    print(f"\nSuccessfully loaded {len(records)} records")
    print("\nCoordinate extraction statistics:")
    print(f"  Total files: {coord_stats['total']}")
    total = coord_stats["total"] if coord_stats["total"] > 0 else 1
    pct = 100 * coord_stats["with_coords"] / total
    print(f"  With coordinates: {coord_stats['with_coords']} ({pct:.1f}%)")
    print("  Coordinate sources:")
    for source, count in sorted(coord_stats["sources"].items(), key=lambda x: -x[1]):
        print(f"    {source}: {count}")
    return records

def upload_to_ducklake(records: list[dict], table_name: str, mode: str = "replace") -> dict:
    """Upload records to DuckLake via API."""
    print(f"\nUploading {len(records)} records to DuckLake table '{table_name}'...")

    # Write records to a temporary JSON file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(records, f, ensure_ascii=False, default=str)
        temp_path = f.name

    try:
        # Upload via multipart form
        with open(temp_path, "rb") as f:
            files = {"file": ("custodians.json", f, "application/json")}
            data = {"table_name": table_name, "mode": mode}
            response = requests.post(
                f"{DUCKLAKE_URL}/upload",
                files=files,
                data=data,
                timeout=600,
            )
        response.raise_for_status()
        result = response.json()
        print(f"Upload successful: {result}")
        return result
    finally:
        os.unlink(temp_path)
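# The POST above is the moral equivalent of the following curl call (assuming
# the server accepts a standard multipart form upload, as this code does):
#
#   curl -X POST http://localhost:8765/upload \
#        -F "file=@custodians.json;type=application/json" \
#        -F "table_name=custodians_raw" -F "mode=replace"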

def check_ducklake_status() -> bool:
    """Check if DuckLake server is running."""
    try:
        response = requests.get(f"{DUCKLAKE_URL}/", timeout=5)
        response.raise_for_status()
        status = response.json()
        print(f"DuckLake status: {status.get('status', 'unknown')}")
        print(f"  Tables: {status.get('tables', 0)}")
        print(f"  Total rows: {status.get('total_rows', 0)}")
        print(f"  Snapshots: {status.get('snapshots', 0)}")
        return status.get("status") == "healthy"
    except Exception as e:
        print(f"Error connecting to DuckLake at {DUCKLAKE_URL}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Load custodian YAML files to DuckLake (v3)")
    parser.add_argument("--dry-run", action="store_true", help="Only load and validate, don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--mode", choices=["replace", "append", "create"], default="replace",
                        help="Upload mode (default: replace)")
    parser.add_argument("--directory", "-d", type=Path, default=None,
                        help="Directory containing custodian YAML files (default: auto-detect)")
    args = parser.parse_args()

    # Determine custodian directory
    custodian_dir = args.directory if args.directory else DEFAULT_CUSTODIAN_DIR

    print("=" * 60)
    print("DuckLake Custodian Data Loader v3")
    print("  Enhanced coordinate extraction with cascading fallback")
    print("=" * 60)
    print(f"Source directory: {custodian_dir}")
    print(f"Target table: {TABLE_NAME}")
    print(f"DuckLake URL: {DUCKLAKE_URL}")
    print(f"Mode: {'DRY RUN' if args.dry_run else args.mode.upper()}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print("=" * 60)

    # Check if directory exists
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # Count YAML files
    yaml_count = len(list(custodian_dir.glob("*.yaml")))
    print(f"Found {yaml_count} YAML files")
    if yaml_count == 0:
        print("No YAML files found. Exiting.")
        sys.exit(0)

    # Load YAML files
    records = load_yaml_files(custodian_dir, limit=args.limit)
    if not records:
        print("No records loaded. Exiting.")
        sys.exit(1)

    # Show sample record
    print("\nSample record (first):")
    sample = records[0]
    for key in [
        "file_name", "ghcid_current", "custodian_name", "emic_name",
        "city", "country", "latitude", "longitude", "coordinate_source",
    ]:
        print(f"  {key}: {sample.get(key, 'N/A')}")

    if args.dry_run:
        print("\n[DRY RUN] Would upload to DuckLake. Exiting without upload.")
        return

    # Check DuckLake status
    print("\nChecking DuckLake connection...")
    if not check_ducklake_status():
        print("Error: DuckLake is not available. Please start the server.")
        print("  cd backend/ducklake && python main.py")
        sys.exit(1)

    # Upload to DuckLake
    result = upload_to_ducklake(records, TABLE_NAME, mode=args.mode)

    print("\n" + "=" * 60)
    print("Load complete!")
    print(f"  Table: {TABLE_NAME}")
    print(f"  Rows: {result.get('rows_inserted', len(records))}")
    print(f"  Snapshot ID: {result.get('snapshot_id', 'N/A')}")
    print("=" * 60)

if __name__ == "__main__":
    main()