glam/backend/postgres/load_custodian_data.py
2025-12-21 00:01:54 +01:00

1134 lines
52 KiB
Python

#!/usr/bin/env python3
"""
Load ALL custodian data from data/custodian/ into PostgreSQL/PostGIS
This script replaces load_nde_data.py and reads directly from the authoritative
custodian YAML files, preserving ALL rich metadata including:
- Google Maps enrichment (ratings, reviews, photos, opening hours)
- YouTube enrichment (channel info, videos)
- Web claims (social media, logos, descriptions)
- Genealogiewerkbalk data
- GHCID identifiers and history
- Provenance tracking
- Temporal extent and successor organizations
Usage:
python load_custodian_data.py [--drop-existing] [--limit N]
"""
import argparse
import asyncio
import json
import os
from pathlib import Path
from datetime import date, datetime, timezone
from typing import Any, Dict, List, Optional
import sys
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
import asyncpg
except ImportError:
print("Error: asyncpg not installed. Run: pip install asyncpg")
sys.exit(1)
try:
import yaml
try:
from yaml import CSafeLoader as SafeLoader
except ImportError:
from yaml import SafeLoader
except ImportError:
print("Error: PyYAML not installed. Run: pip install pyyaml")
sys.exit(1)
# Configuration
# Server path: /mnt/data/custodian/
# Local path: {project_root}/data/custodian/
# Prefer the production mount when it exists, else the repo checkout.
# The CUSTODIAN_DIR env var overrides both.
DEFAULT_CUSTODIAN_DIR = "/mnt/data/custodian" if os.path.exists("/mnt/data/custodian") else str(project_root / "data" / "custodian")
CUSTODIAN_DIR = Path(os.getenv("CUSTODIAN_DIR", DEFAULT_CUSTODIAN_DIR))
# Single database config (for backward compatibility)
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam_heritage")
POSTGRES_USER = os.getenv("POSTGRES_USER", "kempersc")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
# Multi-database configuration for production
# Production has two databases that need identical custodian data:
# - glam: Main custodian data storage
# - glam_geo: PostGIS geo API for bronhouder.nl map
# The GEO_* variables fall back to the plain POSTGRES_* values, so a
# single-host deployment only needs one set of env vars.
DATABASES = {
    'glam': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam"),
        'user': os.getenv("POSTGRES_USER", "glam_api"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
    'glam_geo': {
        'host': os.getenv("GEO_POSTGRES_HOST", os.getenv("POSTGRES_HOST", "localhost")),
        'port': int(os.getenv("GEO_POSTGRES_PORT", os.getenv("POSTGRES_PORT", "5432"))),
        'database': os.getenv("GEO_POSTGRES_DB", "glam_geo"),
        'user': os.getenv("GEO_POSTGRES_USER", os.getenv("POSTGRES_USER", "glam_api")),
        'password': os.getenv("GEO_POSTGRES_PASSWORD", os.getenv("POSTGRES_PASSWORD", "")),
    },
    # Local development database
    'glam_heritage': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam_heritage"),
        'user': os.getenv("POSTGRES_USER", "kempersc"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
}
# Institution type mappings
# One-letter GLAMORCUBESFIXPHDNT classification codes -> map marker hex color.
# NOTE(review): 'P'/'T' share '#ff5722' and 'X'/'H' share '#607d8b' — looks
# deliberate, but confirm against the frontend legend.
TYPE_COLORS = {
    'G': '#00bcd4', 'L': '#2ecc71', 'A': '#3498db', 'M': '#e74c3c',
    'O': '#f39c12', 'R': '#1abc9c', 'C': '#795548', 'U': '#9e9e9e',
    'B': '#4caf50', 'E': '#ff9800', 'S': '#9b59b6', 'F': '#95a5a6',
    'I': '#673ab7', 'X': '#607d8b', 'P': '#ff5722', 'H': '#607d8b',
    'D': '#34495e', 'N': '#e91e63', 'T': '#ff5722',
}
# Human-readable label for each one-letter type code above.
TYPE_NAMES = {
    'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
    'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
    'B': 'Botanical', 'E': 'Education', 'S': 'Society', 'F': 'Features',
    'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy sites',
    'D': 'Digital', 'N': 'NGO', 'T': 'Taste/smell',
}
# DDL executed when --drop-existing is passed: destroys and recreates the
# custodians table and its secondary indexes.
# FIX: dropped the explicit `CREATE UNIQUE INDEX idx_custodians_ghcid` —
# the column-level `ghcid TEXT UNIQUE` constraint already creates a unique
# index (which is what ON CONFLICT (ghcid) resolves against), so the extra
# statement just built a second, redundant index on the same column.
CREATE_TABLE_SQL = """
-- Drop existing table if requested
DROP TABLE IF EXISTS custodians CASCADE;
CREATE TABLE custodians (
    id SERIAL PRIMARY KEY,
    -- ═══════════════════════════════════════════════════════════════
    -- CORE IDENTITY
    -- ═══════════════════════════════════════════════════════════════
    name TEXT NOT NULL,
    verified_name TEXT,
    name_source TEXT,
    emic_name TEXT, -- Name in original/native language
    -- ═══════════════════════════════════════════════════════════════
    -- CLASSIFICATION (GLAMORCUBESFIXPHDNT)
    -- ═══════════════════════════════════════════════════════════════
    type CHAR(1),
    type_name TEXT,
    color VARCHAR(10),
    ch_annotator_hypernym TEXT, -- e.g., GRP
    ch_annotator_subtype TEXT, -- e.g., GRP.HER.MUS
    ontology_class TEXT, -- e.g., schema:Museum
    -- ═══════════════════════════════════════════════════════════════
    -- GHCID (Global Heritage Custodian Identifier)
    -- ═══════════════════════════════════════════════════════════════
    ghcid TEXT UNIQUE,
    ghcid_uuid UUID,
    ghcid_uuid_sha256 UUID,
    ghcid_numeric NUMERIC(20),
    record_id UUID,
    ghcid_original TEXT,
    ghcid_history JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- LOCATION
    -- ═══════════════════════════════════════════════════════════════
    lat DOUBLE PRECISION,
    lon DOUBLE PRECISION,
    city TEXT,
    region TEXT,
    region_code TEXT,
    country TEXT,
    country_code CHAR(2),
    street_address TEXT,
    postal_code TEXT,
    formatted_address TEXT,
    geonames_id INTEGER,
    -- ═══════════════════════════════════════════════════════════════
    -- EXTERNAL IDENTIFIERS
    -- ═══════════════════════════════════════════════════════════════
    wikidata_id TEXT,
    isil_code TEXT,
    viaf_id TEXT,
    google_place_id TEXT,
    kvk_number TEXT,
    ico_number TEXT, -- Czech business identifier
    sigla TEXT, -- Czech library code
    -- All identifiers as JSONB array
    identifiers JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- BASIC METADATA
    -- ═══════════════════════════════════════════════════════════════
    website TEXT,
    email TEXT,
    phone TEXT,
    phone_international TEXT,
    description TEXT,
    -- ═══════════════════════════════════════════════════════════════
    -- GOOGLE MAPS ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    rating REAL,
    total_ratings INTEGER,
    business_status TEXT,
    google_maps_url TEXT,
    street_view_url TEXT,
    -- Opening hours (weekday_text array and periods)
    opening_hours JSONB,
    open_now BOOLEAN,
    -- Reviews (full array with author, rating, text, time)
    reviews JSONB,
    -- Photos (URLs and metadata)
    photos JSONB,
    photo_urls TEXT[],
    -- Full Google Maps enrichment backup
    google_maps_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- WIKIDATA ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    wikidata_label_nl TEXT,
    wikidata_label_en TEXT,
    wikidata_description_nl TEXT,
    wikidata_description_en TEXT,
    wikidata_types JSONB, -- instance_of (P31) values
    wikidata_inception TEXT,
    wikidata_coordinates JSONB,
    wikidata_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- YOUTUBE ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    youtube_channel_id TEXT,
    youtube_channel_url TEXT,
    youtube_subscriber_count INTEGER,
    youtube_video_count INTEGER,
    youtube_view_count BIGINT,
    youtube_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- WEB CLAIMS (extracted from institutional websites)
    -- ═══════════════════════════════════════════════════════════════
    social_facebook TEXT,
    social_twitter TEXT,
    social_instagram TEXT,
    social_linkedin TEXT,
    social_youtube TEXT,
    logo_url TEXT,
    web_claims JSONB,
    web_archives JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- GENEALOGIEWERKBALK (Dutch genealogy resources)
    -- ═══════════════════════════════════════════════════════════════
    genealogiewerkbalk JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- ISIL REGISTRIES
    -- ═══════════════════════════════════════════════════════════════
    nan_isil_enrichment JSONB, -- Nationaal Archief ISIL
    kb_enrichment JSONB, -- KB Netherlands Library Network
    -- ═══════════════════════════════════════════════════════════════
    -- MUSEUM REGISTER & ZCBS
    -- ═══════════════════════════════════════════════════════════════
    museum_register JSONB,
    zcbs_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- TEMPORAL EXTENT & HISTORY
    -- ═══════════════════════════════════════════════════════════════
    founding_year INTEGER,
    founding_date DATE,
    dissolution_year INTEGER,
    dissolution_date DATE,
    temporal_extent JSONB,
    successor_organization JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- PROVENANCE
    -- ═══════════════════════════════════════════════════════════════
    data_source TEXT,
    data_tier TEXT,
    extraction_date TIMESTAMPTZ,
    confidence_score REAL,
    provenance JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- CH-ANNOTATOR METADATA
    -- ═══════════════════════════════════════════════════════════════
    ch_annotator JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- ORIGINAL ENTRY (full backup)
    -- ═══════════════════════════════════════════════════════════════
    original_entry JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- TIMESTAMPS
    -- ═══════════════════════════════════════════════════════════════
    source_file TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- ═══════════════════════════════════════════════════════════════
-- INDEXES
-- ═══════════════════════════════════════════════════════════════
-- Core identity
CREATE INDEX idx_custodians_name ON custodians(name);
CREATE INDEX idx_custodians_name_gin ON custodians USING GIN (to_tsvector('simple', name));
-- Classification
CREATE INDEX idx_custodians_type ON custodians(type);
CREATE INDEX idx_custodians_type_name ON custodians(type_name);
-- GHCID (ghcid itself is already indexed by its UNIQUE column constraint)
CREATE INDEX idx_custodians_ghcid_uuid ON custodians(ghcid_uuid);
CREATE INDEX idx_custodians_record_id ON custodians(record_id);
-- Location
CREATE INDEX idx_custodians_city ON custodians(city);
CREATE INDEX idx_custodians_region ON custodians(region);
CREATE INDEX idx_custodians_country_code ON custodians(country_code);
CREATE INDEX idx_custodians_geonames_id ON custodians(geonames_id);
-- External identifiers
CREATE INDEX idx_custodians_wikidata_id ON custodians(wikidata_id);
CREATE INDEX idx_custodians_isil_code ON custodians(isil_code);
CREATE INDEX idx_custodians_google_place_id ON custodians(google_place_id);
-- Ratings
CREATE INDEX idx_custodians_rating ON custodians(rating);
-- Provenance
CREATE INDEX idx_custodians_data_source ON custodians(data_source);
CREATE INDEX idx_custodians_data_tier ON custodians(data_tier);
"""
# Expression index for point queries; only works when PostGIS is installed,
# so the caller wraps its execution in try/except.
SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_custodians_geom ON custodians USING GIST (
    ST_SetSRID(ST_MakePoint(lon, lat), 4326)
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""
def extract_coordinates(entry: Dict) -> tuple[Optional[float], Optional[float]]:
    """Extract (lat, lon) from an entry, trying sources in priority order.

    Priority: Google Maps enrichment, Wikidata enrichment, first item of the
    `locations` array, then a bare `location` object.

    FIX: the old truthiness checks (`coords.get('latitude') and ...`) threw
    away legitimate 0.0 coordinates (equator / prime meridian); we now test
    `is not None` (empty strings still count as missing, as before).
    Non-dict items in `locations` are skipped instead of raising.
    """
    def _pair(obj: Any) -> Optional[tuple]:
        # Return (lat, lon) when both values are present; 0.0 is valid.
        if not isinstance(obj, dict):
            return None
        lat = obj.get('latitude')
        lon = obj.get('longitude')
        if lat is None or lon is None or lat == '' or lon == '':
            return None
        return lat, lon

    # Priority 1: Google Maps coordinates
    pair = _pair(entry.get('google_maps_enrichment', {}).get('coordinates', {}))
    if pair:
        return pair
    # Priority 2: Wikidata coordinates
    pair = _pair(entry.get('wikidata_enrichment', {}).get('wikidata_coordinates', {}))
    if pair:
        return pair
    # Priority 3: first element of the locations array
    locations = entry.get('locations', entry.get('original_entry', {}).get('locations', []))
    if isinstance(locations, list) and locations:
        pair = _pair(locations[0])
        if pair:
            return pair
    # Priority 4: bare location object
    pair = _pair(entry.get('location', {}))
    if pair:
        return pair
    return None, None
def _parse_flexible_date(raw: Any) -> tuple[Optional[int], Optional[date]]:
    """Parse a loosely-formatted date value into (year, full_date).

    Accepts full ISO datetimes ("2001-01-01T00:00:00Z"), plain dates
    ("2001-01-01", including partial dates like "1940-11-00" where "00"
    means unknown month/day), or bare years ("1876"). Either element of
    the returned tuple may be None; unparseable input yields (None, None).
    """
    if raw is None:
        return None, None
    try:
        text = str(raw).strip()
        if not text or text.lower() in ('null', 'none'):
            return None, None
        # Full ISO datetime (2001-01-01T00:00:00Z)
        if 'T' in text:
            dt = datetime.fromisoformat(text.replace('Z', '+00:00'))
            return dt.year, dt.date()
        # Date only (2001-01-01 or partial 1940-11-00)
        if '-' in text:
            year = int(text.split('-')[0])
            parts = text.split('-')
            full_date = None
            # '00' month/day markers mean "unknown": keep the year only.
            if len(parts) >= 3 and parts[1] != '00' and parts[2] != '00':
                try:
                    full_date = date(year, int(parts[1]), int(parts[2]))
                except ValueError:
                    pass  # Invalid calendar date, keep year only
            return year, full_date
        # Bare year
        if text.isdigit():
            return int(text), None
    except (ValueError, TypeError, AttributeError):
        pass  # Could not parse at all
    return None, None


def extract_temporal_data(entry: Dict, wikidata: Dict) -> Dict[str, Any]:
    """Extract temporal data from timespan (CIDOC-CRM E52_Time-Span) or wikidata_inception.

    Priority for founding date:
    1. timespan.begin_of_the_begin (CIDOC-CRM P82a)
    2. wikidata_inception from wikidata_enrichment
    Priority for dissolution date:
    1. timespan.begin_of_the_end (CIDOC-CRM P81b)

    Returns dict with: founding_year, founding_date, dissolution_year,
    dissolution_date, temporal_extent. The duplicated parsing of the two
    date fields is factored into _parse_flexible_date.
    """
    timespan = entry.get('timespan', {})
    result = {
        'founding_year': None,
        'founding_date': None,
        'dissolution_year': None,
        'dissolution_date': None,
        'temporal_extent': None,
    }
    # Founding - priority: timespan.begin_of_the_begin > wikidata_inception
    founding_raw = timespan.get('begin_of_the_begin') or wikidata.get('wikidata_inception')
    if founding_raw:
        result['founding_year'], result['founding_date'] = _parse_flexible_date(founding_raw)
    # Dissolution from timespan.begin_of_the_end (no Wikidata fallback)
    dissolution_raw = timespan.get('begin_of_the_end')
    if dissolution_raw:
        result['dissolution_year'], result['dissolution_date'] = _parse_flexible_date(dissolution_raw)
    # Store full timespan object if present (for CIDOC-CRM compliance),
    # otherwise fall back to entry.temporal_extent.
    result['temporal_extent'] = timespan if timespan else entry.get('temporal_extent')
    return result
def extract_custodian_data(entry: Dict, source_file: str) -> Dict[str, Any]:
    """Flatten one custodian YAML entry into a dict of `custodians` column values.

    Pulls data from every enrichment section (Google Maps, Wikidata, YouTube,
    web claims, GHCID, provenance, CH-Annotator) with explicit priority rules
    for name, institution type, coordinates and city.

    Args:
        entry: Parsed YAML document for one custodian.
        source_file: Name of the YAML file the entry came from (provenance).

    Returns:
        Dict keyed by column name; missing data maps to None.
    """
    original = entry.get('original_entry', {})
    google_maps = entry.get('google_maps_enrichment', {})
    wikidata = entry.get('wikidata_enrichment', {})
    youtube = entry.get('youtube_enrichment', {})
    web_claims = entry.get('web_claims', {})
    web_enrichment = entry.get('web_enrichment', {})
    ghcid_data = entry.get('ghcid', {})
    provenance = entry.get('provenance', {})
    custodian_name = entry.get('custodian_name', {})
    ch_annotator = entry.get('ch_annotator', {})
    # Get coordinates
    lat, lon = extract_coordinates(entry)
    # Get institution type - comprehensive extraction from multiple sources
    # Priority: 1) GHCID type letter, 2) original_entry.type, 3) original_entry.institution_type,
    # 4) type_organisatie, 5) CH-Annotator entity_classification, 6) Wikidata instance_of
    inst_type = 'U'  # Default to Unknown
    # 1. Extract from GHCID (most reliable - already normalized)
    ghcid_current = ghcid_data.get('ghcid_current', '')
    if ghcid_current and len(ghcid_current.split('-')) >= 4:
        ghcid_type = ghcid_current.split('-')[3]  # e.g., NL-NH-AMS-M-RM -> M
        if ghcid_type and len(ghcid_type) == 1 and ghcid_type in TYPE_COLORS:
            inst_type = ghcid_type
    # 2. If still Unknown, try original_entry.type (Dutch NDE format - list of letters)
    if inst_type == 'U':
        types = original.get('type', [])
        if isinstance(types, list) and types:
            # Dutch format: type: [M] or type: [M, A]
            first_type = types[0] if types else None
            if first_type and len(str(first_type)) == 1 and str(first_type).upper() in TYPE_COLORS:
                inst_type = str(first_type).upper()
        elif isinstance(types, str) and types:
            # CH-Annotator format: type: GRP.HER.MUS
            if types.startswith('GRP.HER.'):
                ch_type_map = {
                    'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                    'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X', 'GRP.HER': 'U'
                }
                inst_type = ch_type_map.get(types, 'U')
            elif len(types) == 1 and types.upper() in TYPE_COLORS:
                inst_type = types.upper()
    # 3. Try original_entry.institution_type (CH-Annotator full name format)
    if inst_type == 'U':
        inst_type_str = original.get('institution_type', '')
        if inst_type_str:
            type_map = {v.upper(): k for k, v in TYPE_NAMES.items()}
            # Also add common variations
            type_map.update({
                'GALLERY': 'G', 'LIBRARY': 'L', 'ARCHIVE': 'A', 'MUSEUM': 'M',
                'OFFICIAL': 'O', 'OFFICIAL_INSTITUTION': 'O', 'RESEARCH': 'R',
                'RESEARCH_CENTER': 'R', 'CORPORATION': 'C', 'UNKNOWN': 'U',
                'BOTANICAL': 'B', 'BOTANICAL_ZOO': 'B', 'EDUCATION': 'E',
                'EDUCATION_PROVIDER': 'E', 'SOCIETY': 'S', 'COLLECTING_SOCIETY': 'S',
                'FEATURES': 'F', 'INTANGIBLE': 'I', 'INTANGIBLE_HERITAGE_GROUP': 'I',
                'MIXED': 'X', 'PERSONAL': 'P', 'PERSONAL_COLLECTION': 'P',
                'HOLY_SITES': 'H', 'DIGITAL': 'D', 'DIGITAL_PLATFORM': 'D',
                'NGO': 'N', 'TASTE_SMELL': 'T',
            })
            inst_type = type_map.get(inst_type_str.upper(), 'U')
    # 4. Try type_organisatie (Dutch NDE CSV field)
    if inst_type == 'U':
        type_org = original.get('type_organisatie', '')
        if type_org:
            type_org_map = {
                'museum': 'M', 'archief': 'A', 'bibliotheek': 'L',
                'galerie': 'G', 'onderzoek': 'R', 'erfgoed': 'O',
                'onderwijs': 'E', 'vereniging': 'S', 'stichting': 'N',
            }
            inst_type = type_org_map.get(type_org.lower(), 'U')
    # 5. Try CH-Annotator entity_classification
    if inst_type == 'U' and ch_annotator:
        entity_class = ch_annotator.get('entity_classification', {})
        subtype = entity_class.get('subtype', '')
        if subtype:
            ch_subtype_map = {
                'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X',
            }
            inst_type = ch_subtype_map.get(subtype, 'U')
    # 6. Try to infer from Wikidata instance_of
    if inst_type == 'U' and wikidata:
        wikidata_types = wikidata.get('wikidata_instance_of', [])
        if isinstance(wikidata_types, list):
            # Map common Wikidata types to GLAMORCUBESFIXPHDNT
            wd_type_map = {
                'museum': 'M', 'art museum': 'M', 'history museum': 'M',
                'natural history museum': 'M', 'science museum': 'M',
                'archive': 'A', 'national archive': 'A', 'state archive': 'A',
                'library': 'L', 'public library': 'L', 'national library': 'L',
                'research institute': 'R', 'research center': 'R',
                'university': 'E', 'college': 'E', 'school': 'E',
                'botanical garden': 'B', 'zoo': 'B', 'aquarium': 'B',
                'art gallery': 'G', 'gallery': 'G',
                'organization': 'N', 'non-profit organization': 'N',
                'foundation': 'N', 'association': 'S', 'society': 'S',
                'church': 'H', 'monastery': 'H', 'temple': 'H', 'mosque': 'H',
            }
            for wdt in wikidata_types:
                wdt_lower = str(wdt).lower() if wdt else ''
                for pattern, type_code in wd_type_map.items():
                    if pattern in wdt_lower:
                        inst_type = type_code
                        break
                if inst_type != 'U':
                    break
    # Get name with priority
    name = (
        custodian_name.get('claim_value') or
        wikidata.get('wikidata_label_nl') or
        original.get('organisatie') or
        original.get('name') or
        'Unknown Institution'
    )
    # Get location info
    locations = entry.get('locations', original.get('locations', []))
    first_loc = locations[0] if locations and isinstance(locations, list) else {}
    if not isinstance(first_loc, dict):
        first_loc = {}  # guard against scalar/list items in `locations`
    # BUG FIX: the trailing conditional used to capture the whole or-chain
    # ((a or b or c) if has_short_address else ''), so `city` was always ''
    # whenever Google Maps had no short_address. Parenthesize the fallback
    # so it only governs the short_address expression.
    city = (
        original.get('plaatsnaam_bezoekadres') or
        first_loc.get('city') or
        (google_maps.get('short_address', '').split(',')[-1].strip()
         if google_maps.get('short_address') else '')
    )
    # Extract identifiers - handle both list of dicts and flat dict formats
    identifiers = entry.get('identifiers', [])
    wikidata_id = None
    isil_code = None
    viaf_id = None
    kvk_number = None
    ico_number = None
    sigla = None
    # Handle flat dict format (e.g., {wikidata_id: "Q123", anbi: true})
    if isinstance(identifiers, dict):
        wikidata_id = identifiers.get('wikidata_id')
        isil_code = identifiers.get('isil_code') or identifiers.get('isil')
        viaf_id = identifiers.get('viaf_id') or identifiers.get('viaf')
        kvk_number = identifiers.get('kvk_number') or identifiers.get('kvk')
        ico_number = identifiers.get('ico_number') or identifiers.get('ico')
        sigla = identifiers.get('sigla')
        # NOTE(review): flat-dict identifiers are NOT preserved in the
        # `identifiers` column (only the extracted scalar fields survive) —
        # confirm this is intentional before changing.
        identifiers = []  # Clear to skip the loop
    for ident in identifiers:
        if not isinstance(ident, dict):
            continue  # Skip non-dict items
        scheme = ident.get('identifier_scheme', '')
        value = ident.get('identifier_value', '')
        if scheme == 'Wikidata':
            wikidata_id = value
        elif scheme == 'ISIL':
            isil_code = value
        elif scheme == 'VIAF':
            viaf_id = value
        elif scheme == 'KvK':
            kvk_number = value
        elif scheme == 'IČO':
            ico_number = value
        elif scheme == 'Sigla':
            sigla = value
    # Extract social media from web claims
    social = {}
    if web_claims.get('claims'):
        for claim in web_claims['claims']:
            ct = claim.get('claim_type', '')
            cv = claim.get('claim_value', '')
            if ct == 'social_facebook':
                social['facebook'] = cv
            elif ct == 'social_twitter':
                social['twitter'] = cv
            elif ct == 'social_instagram':
                social['instagram'] = cv
            elif ct == 'social_linkedin':
                social['linkedin'] = cv
            elif ct == 'social_youtube':
                social['youtube'] = cv
            elif ct == 'logo':
                social['logo'] = cv
    # Extract YouTube data - handle both nested (Dutch) and flat (UNESCO) formats
    # Nested: youtube_enrichment.channel.channel_id
    # Flat: youtube_enrichment.channel_id
    if youtube.get('channel'):
        # Nested format (Dutch files)
        yt_channel = youtube.get('channel', {})
    elif youtube.get('channel_id'):
        # Flat format (UNESCO/other files) - use youtube dict directly
        yt_channel = youtube
    else:
        yt_channel = {}
    # Build result
    return {
        # Core identity
        'name': name,
        'verified_name': custodian_name.get('claim_value'),
        'name_source': custodian_name.get('extraction_method'),
        'emic_name': custodian_name.get('emic_name'),
        # Classification
        'type': inst_type,
        'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
        'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
        'ch_annotator_hypernym': ch_annotator.get('entity_classification', {}).get('hypernym'),
        'ch_annotator_subtype': ch_annotator.get('entity_classification', {}).get('subtype'),
        'ontology_class': ch_annotator.get('entity_classification', {}).get('ontology_class'),
        # GHCID
        'ghcid': ghcid_data.get('ghcid_current'),
        'ghcid_uuid': ghcid_data.get('ghcid_uuid'),
        'ghcid_uuid_sha256': ghcid_data.get('ghcid_uuid_sha256'),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'record_id': ghcid_data.get('record_id'),
        'ghcid_original': ghcid_data.get('ghcid_original'),
        'ghcid_history': ghcid_data.get('ghcid_history'),
        # Location
        'lat': lat,
        'lon': lon,
        'city': city,
        'region': first_loc.get('region') or ghcid_data.get('location_resolution', {}).get('region_name'),
        'region_code': ensure_str(ghcid_data.get('location_resolution', {}).get('region_code')),
        'country': ensure_str(first_loc.get('country')),
        'country_code': ensure_str(ghcid_data.get('location_resolution', {}).get('country_code')),
        'street_address': first_loc.get('street_address') or google_maps.get('formatted_address'),
        'postal_code': first_loc.get('postal_code'),
        'formatted_address': google_maps.get('formatted_address'),
        'geonames_id': ghcid_data.get('location_resolution', {}).get('geonames_id') or ghcid_data.get('geonames_id'),
        # External identifiers
        'wikidata_id': wikidata_id or wikidata.get('wikidata_entity_id'),
        'isil_code': isil_code or original.get('isil-code_na'),
        'viaf_id': viaf_id,
        'google_place_id': google_maps.get('place_id'),
        'kvk_number': kvk_number,
        'ico_number': ico_number,
        'sigla': sigla,
        'identifiers': identifiers,
        # Basic metadata
        'website': ensure_str(google_maps.get('website') or wikidata.get('wikidata_official_website') or original.get('webadres_organisatie')),
        'email': None,  # Extract from web_claims if present
        'phone': google_maps.get('phone_local'),
        'phone_international': google_maps.get('phone_international'),
        'description': wikidata.get('wikidata_description_nl') or wikidata.get('wikidata_description_en'),
        # Google Maps enrichment
        'rating': google_maps.get('rating'),
        'total_ratings': google_maps.get('total_ratings'),
        'business_status': google_maps.get('business_status'),
        'google_maps_url': google_maps.get('google_maps_url'),
        'street_view_url': google_maps.get('street_view_url'),
        'opening_hours': google_maps.get('opening_hours'),
        'open_now': google_maps.get('opening_hours', {}).get('open_now') if isinstance(google_maps.get('opening_hours'), dict) else None,
        'reviews': google_maps.get('reviews'),
        'photos': google_maps.get('photos_metadata'),
        'photo_urls': google_maps.get('photo_urls'),
        'google_maps_enrichment': google_maps if google_maps else None,
        # Wikidata enrichment
        'wikidata_label_nl': wikidata.get('wikidata_label_nl'),
        'wikidata_label_en': wikidata.get('wikidata_label_en'),
        'wikidata_description_nl': wikidata.get('wikidata_description_nl'),
        'wikidata_description_en': wikidata.get('wikidata_description_en'),
        'wikidata_types': wikidata.get('wikidata_instance_of'),
        'wikidata_inception': str(wikidata.get('wikidata_inception')) if wikidata.get('wikidata_inception') else None,
        'wikidata_coordinates': wikidata.get('wikidata_coordinates'),
        'wikidata_enrichment': wikidata if wikidata else None,
        # YouTube enrichment
        'youtube_channel_id': ensure_str(yt_channel.get('channel_id')),
        'youtube_channel_url': ensure_str(yt_channel.get('channel_url')),
        'youtube_subscriber_count': yt_channel.get('subscriber_count'),
        'youtube_video_count': yt_channel.get('video_count'),
        'youtube_view_count': yt_channel.get('view_count'),
        'youtube_enrichment': youtube if youtube.get('status') == 'SUCCESS' else None,
        # Web claims
        'social_facebook': social.get('facebook'),
        'social_twitter': social.get('twitter'),
        'social_instagram': social.get('instagram'),
        'social_linkedin': social.get('linkedin'),
        'social_youtube': social.get('youtube'),
        'logo_url': social.get('logo'),
        'web_claims': web_claims if web_claims.get('claims') else None,
        'web_archives': web_enrichment.get('web_archives'),
        # Genealogiewerkbalk
        'genealogiewerkbalk': entry.get('genealogiewerkbalk_enrichment'),
        # ISIL registries
        'nan_isil_enrichment': entry.get('nan_isil_enrichment'),
        'kb_enrichment': entry.get('kb_enrichment'),
        # Museum register & ZCBS
        'museum_register': entry.get('museum_register_enrichment'),
        'zcbs_enrichment': entry.get('zcbs_enrichment'),
        # Temporal extent - extract from timespan (CIDOC-CRM E52_Time-Span) or wikidata_inception
        **extract_temporal_data(entry, wikidata),
        'successor_organization': entry.get('successor_organization'),
        # Provenance
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': parse_datetime(provenance.get('extraction_date')),
        'confidence_score': provenance.get('confidence_score'),
        'provenance': provenance if provenance else None,
        # CH-Annotator
        'ch_annotator': ch_annotator if ch_annotator else None,
        # Original entry
        'original_entry': original if original else None,
        # Source file
        'source_file': source_file,
    }
def to_json(value: Any) -> Optional[str]:
    """Serialize *value* to a JSON string, or return None for None.

    date/datetime instances anywhere in the structure are rendered as
    ISO-8601 strings; any other unserializable type raises TypeError.
    """
    if value is None:
        return None

    def _default(obj):
        # json.dumps calls this for objects it cannot serialize natively.
        # datetime is a subclass of date, so one isinstance check covers both.
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    return json.dumps(value, default=_default)
def parse_datetime(value: Any) -> Optional[datetime]:
    """Parse a timezone-aware datetime from a string, or pass one through.

    Naive ISO strings are assumed to be UTC. Strings carrying an explicit
    offset keep it.

    FIX: the old '+'/'Z' sniffing missed negative offsets ("-05:00" has
    neither character), so those fell into the naive branch and had their
    offset silently clobbered to UTC. We now parse first and only attach
    UTC when fromisoformat produced a naive result.

    Returns None for None, unparseable strings, and non-str/non-datetime
    input.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        try:
            # 'Z' suffix is not accepted by fromisoformat before 3.11.
            dt = datetime.fromisoformat(value.replace('Z', '+00:00'))
        except (ValueError, AttributeError):
            return None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    return None
def ensure_str(value: Any) -> Optional[str]:
    """Coerce *value* to a string, or None when there is nothing usable.

    - None and booleans map to None (YAML sometimes yields literal
      true/false where a URL or code was expected).
    - Lists collapse to their first element, coerced recursively, so
      nested values like [True] or [None] still come back as None.
      (FIX: the old code had two identical ternary branches and returned
      the raw first element, which could be a non-str.)
    - Everything else goes through str().
    """
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, list):
        # Take the first item (e.g. first URL) and normalize it the same way.
        return ensure_str(value[0]) if value else None
    return str(value)
async def load_data_to_database(
db_name: str,
db_config: Dict[str, Any],
yaml_files: List[Path],
drop_existing: bool = False,
) -> Dict[str, int]:
"""Load custodian data into a single database.
Args:
db_name: Name of the database (for logging)
db_config: Database connection configuration
yaml_files: List of YAML files to process
drop_existing: Whether to drop and recreate the table
Returns:
Dict with counts: processed, skipped, errors, total
"""
print(f"\n{'='*60}")
print(f"Loading data to: {db_name}")
print(f"{'='*60}")
print(f"Connecting to PostgreSQL at {db_config['host']}:{db_config['port']}/{db_config['database']}...")
conn = await asyncpg.connect(
host=db_config['host'],
port=db_config['port'],
database=db_config['database'],
user=db_config['user'],
password=db_config['password'],
)
processed = 0
skipped = 0
errors = 0
try:
if drop_existing:
print("Creating custodians table (dropping existing)...")
await conn.execute(CREATE_TABLE_SQL)
# Try to create spatial index
try:
await conn.execute(SPATIAL_INDEX_SQL)
print(" Created spatial index (PostGIS)")
except Exception as e:
print(f" Skipped spatial index (PostGIS not available): {e}")
print(f"Processing {len(yaml_files)} custodian files...")
# Prepare INSERT statement with expanded ON CONFLICT to update all web claims fields
insert_sql = """
INSERT INTO custodians (
name, verified_name, name_source, emic_name,
type, type_name, color, ch_annotator_hypernym, ch_annotator_subtype, ontology_class,
ghcid, ghcid_uuid, ghcid_uuid_sha256, ghcid_numeric, record_id, ghcid_original, ghcid_history,
lat, lon, city, region, region_code, country, country_code, street_address, postal_code, formatted_address, geonames_id,
wikidata_id, isil_code, viaf_id, google_place_id, kvk_number, ico_number, sigla, identifiers,
website, email, phone, phone_international, description,
rating, total_ratings, business_status, google_maps_url, street_view_url,
opening_hours, open_now, reviews, photos, photo_urls, google_maps_enrichment,
wikidata_label_nl, wikidata_label_en, wikidata_description_nl, wikidata_description_en,
wikidata_types, wikidata_inception, wikidata_coordinates, wikidata_enrichment,
youtube_channel_id, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_view_count, youtube_enrichment,
social_facebook, social_twitter, social_instagram, social_linkedin, social_youtube, logo_url, web_claims, web_archives,
genealogiewerkbalk, nan_isil_enrichment, kb_enrichment, museum_register, zcbs_enrichment,
founding_year, founding_date, dissolution_year, dissolution_date, temporal_extent, successor_organization,
data_source, data_tier, extraction_date, confidence_score, provenance,
ch_annotator, original_entry, source_file
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, $15, $16, $17,
$18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28,
$29, $30, $31, $32, $33, $34, $35, $36,
$37, $38, $39, $40, $41,
$42, $43, $44, $45, $46,
$47, $48, $49, $50, $51, $52,
$53, $54, $55, $56,
$57, $58, $59, $60,
$61, $62, $63, $64, $65, $66,
$67, $68, $69, $70, $71, $72, $73, $74,
$75, $76, $77, $78, $79,
$80, $81, $82, $83, $84, $85,
$86, $87, $88, $89, $90,
$91, $92, $93
)
ON CONFLICT (ghcid) DO UPDATE SET
name = EXCLUDED.name,
verified_name = EXCLUDED.verified_name,
emic_name = EXCLUDED.emic_name,
type = EXCLUDED.type,
type_name = EXCLUDED.type_name,
lat = EXCLUDED.lat,
lon = EXCLUDED.lon,
city = EXCLUDED.city,
region = EXCLUDED.region,
country_code = EXCLUDED.country_code,
website = EXCLUDED.website,
description = EXCLUDED.description,
rating = EXCLUDED.rating,
total_ratings = EXCLUDED.total_ratings,
reviews = EXCLUDED.reviews,
photos = EXCLUDED.photos,
photo_urls = EXCLUDED.photo_urls,
opening_hours = EXCLUDED.opening_hours,
google_maps_enrichment = EXCLUDED.google_maps_enrichment,
wikidata_enrichment = EXCLUDED.wikidata_enrichment,
youtube_channel_id = EXCLUDED.youtube_channel_id,
youtube_channel_url = EXCLUDED.youtube_channel_url,
youtube_subscriber_count = EXCLUDED.youtube_subscriber_count,
youtube_video_count = EXCLUDED.youtube_video_count,
youtube_view_count = EXCLUDED.youtube_view_count,
youtube_enrichment = EXCLUDED.youtube_enrichment,
social_facebook = EXCLUDED.social_facebook,
social_twitter = EXCLUDED.social_twitter,
social_instagram = EXCLUDED.social_instagram,
social_linkedin = EXCLUDED.social_linkedin,
social_youtube = EXCLUDED.social_youtube,
logo_url = EXCLUDED.logo_url,
web_claims = EXCLUDED.web_claims,
web_archives = EXCLUDED.web_archives,
updated_at = NOW()
"""
for i, yaml_file in enumerate(yaml_files):
if (i + 1) % 1000 == 0 or i == 0:
print(f" Processing {i + 1}/{len(yaml_files)}...")
try:
with open(yaml_file, 'r', encoding='utf-8') as f:
entry = yaml.load(f, Loader=SafeLoader)
if not entry:
skipped += 1
continue
data = extract_custodian_data(entry, yaml_file.name)
# Skip if no GHCID
if not data['ghcid']:
skipped += 1
continue
# Build values tuple
values = (
data['name'], data['verified_name'], data['name_source'], data['emic_name'],
data['type'], data['type_name'], data['color'], data['ch_annotator_hypernym'], data['ch_annotator_subtype'], data['ontology_class'],
data['ghcid'], data['ghcid_uuid'], data['ghcid_uuid_sha256'], data['ghcid_numeric'], data['record_id'], data['ghcid_original'], to_json(data['ghcid_history']),
data['lat'], data['lon'], data['city'], data['region'], data['region_code'], data['country'], data['country_code'], data['street_address'], data['postal_code'], data['formatted_address'], data['geonames_id'],
data['wikidata_id'], data['isil_code'], data['viaf_id'], data['google_place_id'], data['kvk_number'], data['ico_number'], data['sigla'], to_json(data['identifiers']),
data['website'], data['email'], data['phone'], data['phone_international'], data['description'],
data['rating'], data['total_ratings'], data['business_status'], data['google_maps_url'], data['street_view_url'],
to_json(data['opening_hours']), data['open_now'], to_json(data['reviews']), to_json(data['photos']), data['photo_urls'], to_json(data['google_maps_enrichment']),
data['wikidata_label_nl'], data['wikidata_label_en'], data['wikidata_description_nl'], data['wikidata_description_en'],
to_json(data['wikidata_types']), data['wikidata_inception'], to_json(data['wikidata_coordinates']), to_json(data['wikidata_enrichment']),
data['youtube_channel_id'], data['youtube_channel_url'], data['youtube_subscriber_count'], data['youtube_video_count'], data['youtube_view_count'], to_json(data['youtube_enrichment']),
data['social_facebook'], data['social_twitter'], data['social_instagram'], data['social_linkedin'], data['social_youtube'], data['logo_url'], to_json(data['web_claims']), to_json(data['web_archives']),
to_json(data['genealogiewerkbalk']), to_json(data['nan_isil_enrichment']), to_json(data['kb_enrichment']), to_json(data['museum_register']), to_json(data['zcbs_enrichment']),
data['founding_year'], data['founding_date'], data['dissolution_year'], data['dissolution_date'], to_json(data['temporal_extent']), to_json(data['successor_organization']),
data['data_source'], data['data_tier'], data['extraction_date'], data['confidence_score'], to_json(data['provenance']),
to_json(data['ch_annotator']), to_json(data['original_entry']), data['source_file'],
)
await conn.execute(insert_sql, *values)
processed += 1
except Exception as e:
errors += 1
if errors <= 10:
print(f" Error processing {yaml_file.name}: {e}")
elif errors == 11:
print(" ... suppressing further error messages")
# Get final count
count = await conn.fetchval("SELECT COUNT(*) FROM custodians")
print(f"\n [{db_name}] LOAD COMPLETE")
print(f" Files processed: {processed}")
print(f" Files skipped: {skipped}")
print(f" Errors: {errors}")
print(f" Total in DB: {count}")
return {'processed': processed, 'skipped': skipped, 'errors': errors, 'total': count}
finally:
await conn.close()
async def load_data(
    drop_existing: bool = False,
    limit: Optional[int] = None,
    databases: Optional[List[str]] = None,
):
    """Load every custodian YAML file into one or more PostgreSQL databases.

    Args:
        drop_existing: Drop and recreate the target tables before loading.
        limit: If given, only the first *limit* files are processed.
        databases: Names of configured databases to load into. When None,
            a single database described by the POSTGRES_* environment
            variables is used (backward-compatible behavior).
    """
    # Discover the source files once; the same list feeds every target database.
    print(f"\nReading custodian files from: {CUSTODIAN_DIR}")
    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} custodian files")
    if limit:
        yaml_files = yaml_files[:limit]
        print(f"Processing first {limit} files only")

    if databases is None:
        # Legacy single-database path driven by environment variables.
        print("\nUsing single-database mode (backward compatible)")
        await load_data_to_database(
            db_name=POSTGRES_DB,
            db_config={
                'host': POSTGRES_HOST,
                'port': POSTGRES_PORT,
                'database': POSTGRES_DB,
                'user': POSTGRES_USER,
                'password': POSTGRES_PASSWORD,
            },
            yaml_files=yaml_files,
            drop_existing=drop_existing,
        )
        return

    # Multi-database mode: load each configured target in turn, collecting a
    # per-database outcome for the summary printed at the end.
    print(f"\nUsing multi-database mode: {', '.join(databases)}")
    outcomes: Dict[str, Dict[str, Any]] = {}
    for target in databases:
        if target not in DATABASES:
            print(f"\nERROR: Unknown database '{target}'. Available: {', '.join(DATABASES.keys())}")
            continue
        try:
            outcomes[target] = await load_data_to_database(
                db_name=target,
                db_config=DATABASES[target],
                yaml_files=yaml_files,
                drop_existing=drop_existing,
            )
        except Exception as exc:
            # Keep going so one failing database does not block the others.
            print(f"\nERROR loading to {target}: {exc}")
            outcomes[target] = {'error': str(exc)}

    print(f"\n{'='*60}")
    print("MULTI-DATABASE LOAD SUMMARY")
    print(f"{'='*60}")
    for target, outcome in outcomes.items():
        if 'error' in outcome:
            print(f" {target}: FAILED - {outcome['error']}")
        else:
            print(f" {target}: {outcome['processed']} processed, {outcome['total']} total in DB")
def main():
    """Command-line entry point: parse arguments and run the async loader."""
    parser = argparse.ArgumentParser(description="Load custodian data into PostgreSQL")
    parser.add_argument("--drop-existing", action="store_true",
                        help="Drop existing table and recreate")
    parser.add_argument("--limit", type=int,
                        help="Limit number of files to process (for testing)")
    parser.add_argument(
        "--databases",
        type=str,
        help=(
            "Comma-separated list of databases to load (e.g., 'glam,glam_geo'). "
            "If not specified, uses single-database mode with env vars. "
            f"Available: {', '.join(DATABASES.keys())}"
        ),
    )
    args = parser.parse_args()

    # An absent/empty --databases keeps the backward-compatible single-DB mode.
    db_list = [name.strip() for name in args.databases.split(',')] if args.databases else None

    asyncio.run(load_data(
        drop_existing=args.drop_existing,
        limit=args.limit,
        databases=db_list,
    ))
# Script entry point: only run the loader when invoked directly, not on import.
if __name__ == "__main__":
    main()