glam/backend/postgres/load_custodian_data.py
2025-12-21 00:01:54 +01:00

1134 lines
52 KiB
Python

#!/usr/bin/env python3
"""
Load ALL custodian data from data/custodian/ into PostgreSQL/PostGIS
This script replaces load_nde_data.py and reads directly from the authoritative
custodian YAML files, preserving ALL rich metadata including:
- Google Maps enrichment (ratings, reviews, photos, opening hours)
- YouTube enrichment (channel info, videos)
- Web claims (social media, logos, descriptions)
- Genealogiewerkbalk data
- GHCID identifiers and history
- Provenance tracking
- Temporal extent and successor organizations
Usage:
python load_custodian_data.py [--drop-existing] [--limit N]
"""
import argparse
import asyncio
import json
import os
from pathlib import Path
from datetime import date, datetime, timezone
from typing import Any, Dict, List, Optional
import sys
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
try:
import asyncpg
except ImportError:
print("Error: asyncpg not installed. Run: pip install asyncpg")
sys.exit(1)
try:
import yaml
try:
from yaml import CSafeLoader as SafeLoader
except ImportError:
from yaml import SafeLoader
except ImportError:
print("Error: PyYAML not installed. Run: pip install pyyaml")
sys.exit(1)
# Configuration
# Server path: /mnt/data/custodian/
# Local path: {project_root}/data/custodian/
# Prefer the production mount when it exists, else the repo checkout.
# The CUSTODIAN_DIR env var overrides both.
DEFAULT_CUSTODIAN_DIR = "/mnt/data/custodian" if os.path.exists("/mnt/data/custodian") else str(project_root / "data" / "custodian")
CUSTODIAN_DIR = Path(os.getenv("CUSTODIAN_DIR", DEFAULT_CUSTODIAN_DIR))
# Single database config (for backward compatibility)
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam_heritage")
POSTGRES_USER = os.getenv("POSTGRES_USER", "kempersc")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
# Multi-database configuration for production
# Production has two databases that need identical custodian data:
# - glam: Main custodian data storage
# - glam_geo: PostGIS geo API for bronhouder.nl map
# The GEO_* variables fall back to the plain POSTGRES_* values, so a
# single-host deployment only needs one set of env vars.
DATABASES = {
    'glam': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam"),
        'user': os.getenv("POSTGRES_USER", "glam_api"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
    'glam_geo': {
        'host': os.getenv("GEO_POSTGRES_HOST", os.getenv("POSTGRES_HOST", "localhost")),
        'port': int(os.getenv("GEO_POSTGRES_PORT", os.getenv("POSTGRES_PORT", "5432"))),
        'database': os.getenv("GEO_POSTGRES_DB", "glam_geo"),
        'user': os.getenv("GEO_POSTGRES_USER", os.getenv("POSTGRES_USER", "glam_api")),
        'password': os.getenv("GEO_POSTGRES_PASSWORD", os.getenv("POSTGRES_PASSWORD", "")),
    },
    # Local development database
    'glam_heritage': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam_heritage"),
        'user': os.getenv("POSTGRES_USER", "kempersc"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
}
# Institution type mappings
# One-letter GLAMORCUBESFIXPHDNT classification codes -> map marker hex color.
# NOTE(review): 'P'/'T' share '#ff5722' and 'X'/'H' share '#607d8b' — looks
# deliberate, but confirm against the frontend legend.
TYPE_COLORS = {
    'G': '#00bcd4', 'L': '#2ecc71', 'A': '#3498db', 'M': '#e74c3c',
    'O': '#f39c12', 'R': '#1abc9c', 'C': '#795548', 'U': '#9e9e9e',
    'B': '#4caf50', 'E': '#ff9800', 'S': '#9b59b6', 'F': '#95a5a6',
    'I': '#673ab7', 'X': '#607d8b', 'P': '#ff5722', 'H': '#607d8b',
    'D': '#34495e', 'N': '#e91e63', 'T': '#ff5722',
}
# Human-readable label for each one-letter type code above.
TYPE_NAMES = {
    'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
    'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
    'B': 'Botanical', 'E': 'Education', 'S': 'Society', 'F': 'Features',
    'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy sites',
    'D': 'Digital', 'N': 'NGO', 'T': 'Taste/smell',
}
# DDL executed when --drop-existing is passed: destroys and recreates the
# custodians table and its secondary indexes.
# FIX: dropped the explicit `CREATE UNIQUE INDEX idx_custodians_ghcid` —
# the column-level `ghcid TEXT UNIQUE` constraint already creates a unique
# index (which is what ON CONFLICT (ghcid) resolves against), so the extra
# statement just built a second, redundant index on the same column.
CREATE_TABLE_SQL = """
-- Drop existing table if requested
DROP TABLE IF EXISTS custodians CASCADE;
CREATE TABLE custodians (
    id SERIAL PRIMARY KEY,
    -- ═══════════════════════════════════════════════════════════════
    -- CORE IDENTITY
    -- ═══════════════════════════════════════════════════════════════
    name TEXT NOT NULL,
    verified_name TEXT,
    name_source TEXT,
    emic_name TEXT, -- Name in original/native language
    -- ═══════════════════════════════════════════════════════════════
    -- CLASSIFICATION (GLAMORCUBESFIXPHDNT)
    -- ═══════════════════════════════════════════════════════════════
    type CHAR(1),
    type_name TEXT,
    color VARCHAR(10),
    ch_annotator_hypernym TEXT, -- e.g., GRP
    ch_annotator_subtype TEXT, -- e.g., GRP.HER.MUS
    ontology_class TEXT, -- e.g., schema:Museum
    -- ═══════════════════════════════════════════════════════════════
    -- GHCID (Global Heritage Custodian Identifier)
    -- ═══════════════════════════════════════════════════════════════
    ghcid TEXT UNIQUE,
    ghcid_uuid UUID,
    ghcid_uuid_sha256 UUID,
    ghcid_numeric NUMERIC(20),
    record_id UUID,
    ghcid_original TEXT,
    ghcid_history JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- LOCATION
    -- ═══════════════════════════════════════════════════════════════
    lat DOUBLE PRECISION,
    lon DOUBLE PRECISION,
    city TEXT,
    region TEXT,
    region_code TEXT,
    country TEXT,
    country_code CHAR(2),
    street_address TEXT,
    postal_code TEXT,
    formatted_address TEXT,
    geonames_id INTEGER,
    -- ═══════════════════════════════════════════════════════════════
    -- EXTERNAL IDENTIFIERS
    -- ═══════════════════════════════════════════════════════════════
    wikidata_id TEXT,
    isil_code TEXT,
    viaf_id TEXT,
    google_place_id TEXT,
    kvk_number TEXT,
    ico_number TEXT, -- Czech business identifier
    sigla TEXT, -- Czech library code
    -- All identifiers as JSONB array
    identifiers JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- BASIC METADATA
    -- ═══════════════════════════════════════════════════════════════
    website TEXT,
    email TEXT,
    phone TEXT,
    phone_international TEXT,
    description TEXT,
    -- ═══════════════════════════════════════════════════════════════
    -- GOOGLE MAPS ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    rating REAL,
    total_ratings INTEGER,
    business_status TEXT,
    google_maps_url TEXT,
    street_view_url TEXT,
    -- Opening hours (weekday_text array and periods)
    opening_hours JSONB,
    open_now BOOLEAN,
    -- Reviews (full array with author, rating, text, time)
    reviews JSONB,
    -- Photos (URLs and metadata)
    photos JSONB,
    photo_urls TEXT[],
    -- Full Google Maps enrichment backup
    google_maps_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- WIKIDATA ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    wikidata_label_nl TEXT,
    wikidata_label_en TEXT,
    wikidata_description_nl TEXT,
    wikidata_description_en TEXT,
    wikidata_types JSONB, -- instance_of (P31) values
    wikidata_inception TEXT,
    wikidata_coordinates JSONB,
    wikidata_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- YOUTUBE ENRICHMENT
    -- ═══════════════════════════════════════════════════════════════
    youtube_channel_id TEXT,
    youtube_channel_url TEXT,
    youtube_subscriber_count INTEGER,
    youtube_video_count INTEGER,
    youtube_view_count BIGINT,
    youtube_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- WEB CLAIMS (extracted from institutional websites)
    -- ═══════════════════════════════════════════════════════════════
    social_facebook TEXT,
    social_twitter TEXT,
    social_instagram TEXT,
    social_linkedin TEXT,
    social_youtube TEXT,
    logo_url TEXT,
    web_claims JSONB,
    web_archives JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- GENEALOGIEWERKBALK (Dutch genealogy resources)
    -- ═══════════════════════════════════════════════════════════════
    genealogiewerkbalk JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- ISIL REGISTRIES
    -- ═══════════════════════════════════════════════════════════════
    nan_isil_enrichment JSONB, -- Nationaal Archief ISIL
    kb_enrichment JSONB, -- KB Netherlands Library Network
    -- ═══════════════════════════════════════════════════════════════
    -- MUSEUM REGISTER & ZCBS
    -- ═══════════════════════════════════════════════════════════════
    museum_register JSONB,
    zcbs_enrichment JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- TEMPORAL EXTENT & HISTORY
    -- ═══════════════════════════════════════════════════════════════
    founding_year INTEGER,
    founding_date DATE,
    dissolution_year INTEGER,
    dissolution_date DATE,
    temporal_extent JSONB,
    successor_organization JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- PROVENANCE
    -- ═══════════════════════════════════════════════════════════════
    data_source TEXT,
    data_tier TEXT,
    extraction_date TIMESTAMPTZ,
    confidence_score REAL,
    provenance JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- CH-ANNOTATOR METADATA
    -- ═══════════════════════════════════════════════════════════════
    ch_annotator JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- ORIGINAL ENTRY (full backup)
    -- ═══════════════════════════════════════════════════════════════
    original_entry JSONB,
    -- ═══════════════════════════════════════════════════════════════
    -- TIMESTAMPS
    -- ═══════════════════════════════════════════════════════════════
    source_file TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- ═══════════════════════════════════════════════════════════════
-- INDEXES
-- ═══════════════════════════════════════════════════════════════
-- Core identity
CREATE INDEX idx_custodians_name ON custodians(name);
CREATE INDEX idx_custodians_name_gin ON custodians USING GIN (to_tsvector('simple', name));
-- Classification
CREATE INDEX idx_custodians_type ON custodians(type);
CREATE INDEX idx_custodians_type_name ON custodians(type_name);
-- GHCID (ghcid itself is already indexed by its UNIQUE column constraint)
CREATE INDEX idx_custodians_ghcid_uuid ON custodians(ghcid_uuid);
CREATE INDEX idx_custodians_record_id ON custodians(record_id);
-- Location
CREATE INDEX idx_custodians_city ON custodians(city);
CREATE INDEX idx_custodians_region ON custodians(region);
CREATE INDEX idx_custodians_country_code ON custodians(country_code);
CREATE INDEX idx_custodians_geonames_id ON custodians(geonames_id);
-- External identifiers
CREATE INDEX idx_custodians_wikidata_id ON custodians(wikidata_id);
CREATE INDEX idx_custodians_isil_code ON custodians(isil_code);
CREATE INDEX idx_custodians_google_place_id ON custodians(google_place_id);
-- Ratings
CREATE INDEX idx_custodians_rating ON custodians(rating);
-- Provenance
CREATE INDEX idx_custodians_data_source ON custodians(data_source);
CREATE INDEX idx_custodians_data_tier ON custodians(data_tier);
"""
# Expression index for point queries; only works when PostGIS is installed,
# so the caller wraps its execution in try/except.
SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_custodians_geom ON custodians USING GIST (
    ST_SetSRID(ST_MakePoint(lon, lat), 4326)
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""
def extract_coordinates(entry: Dict) -> tuple[Optional[float], Optional[float]]:
    """Extract (lat, lon) from an entry, trying sources in priority order.

    Priority: Google Maps enrichment, Wikidata enrichment, first item of the
    `locations` array, then a bare `location` object.

    FIX: the old truthiness checks (`coords.get('latitude') and ...`) threw
    away legitimate 0.0 coordinates (equator / prime meridian); we now test
    `is not None` (empty strings still count as missing, as before).
    Non-dict items in `locations` are skipped instead of raising.
    """
    def _pair(obj: Any) -> Optional[tuple]:
        # Return (lat, lon) when both values are present; 0.0 is valid.
        if not isinstance(obj, dict):
            return None
        lat = obj.get('latitude')
        lon = obj.get('longitude')
        if lat is None or lon is None or lat == '' or lon == '':
            return None
        return lat, lon

    # Priority 1: Google Maps coordinates
    pair = _pair(entry.get('google_maps_enrichment', {}).get('coordinates', {}))
    if pair:
        return pair
    # Priority 2: Wikidata coordinates
    pair = _pair(entry.get('wikidata_enrichment', {}).get('wikidata_coordinates', {}))
    if pair:
        return pair
    # Priority 3: first element of the locations array
    locations = entry.get('locations', entry.get('original_entry', {}).get('locations', []))
    if isinstance(locations, list) and locations:
        pair = _pair(locations[0])
        if pair:
            return pair
    # Priority 4: bare location object
    pair = _pair(entry.get('location', {}))
    if pair:
        return pair
    return None, None
def _parse_flexible_date(raw: Any) -> tuple[Optional[int], Optional[date]]:
    """Parse a loosely-formatted date value into (year, full_date).

    Accepts full ISO datetimes ("2001-01-01T00:00:00Z"), plain dates
    ("2001-01-01", including partial dates like "1940-11-00" where "00"
    means unknown month/day), or bare years ("1876"). Either element of
    the returned tuple may be None; unparseable input yields (None, None).
    """
    if raw is None:
        return None, None
    try:
        text = str(raw).strip()
        if not text or text.lower() in ('null', 'none'):
            return None, None
        # Full ISO datetime (2001-01-01T00:00:00Z)
        if 'T' in text:
            dt = datetime.fromisoformat(text.replace('Z', '+00:00'))
            return dt.year, dt.date()
        # Date only (2001-01-01 or partial 1940-11-00)
        if '-' in text:
            year = int(text.split('-')[0])
            parts = text.split('-')
            full_date = None
            # '00' month/day markers mean "unknown": keep the year only.
            if len(parts) >= 3 and parts[1] != '00' and parts[2] != '00':
                try:
                    full_date = date(year, int(parts[1]), int(parts[2]))
                except ValueError:
                    pass  # Invalid calendar date, keep year only
            return year, full_date
        # Bare year
        if text.isdigit():
            return int(text), None
    except (ValueError, TypeError, AttributeError):
        pass  # Could not parse at all
    return None, None


def extract_temporal_data(entry: Dict, wikidata: Dict) -> Dict[str, Any]:
    """Extract temporal data from timespan (CIDOC-CRM E52_Time-Span) or wikidata_inception.

    Priority for founding date:
    1. timespan.begin_of_the_begin (CIDOC-CRM P82a)
    2. wikidata_inception from wikidata_enrichment
    Priority for dissolution date:
    1. timespan.begin_of_the_end (CIDOC-CRM P81b)

    Returns dict with: founding_year, founding_date, dissolution_year,
    dissolution_date, temporal_extent. The duplicated parsing of the two
    date fields is factored into _parse_flexible_date.
    """
    timespan = entry.get('timespan', {})
    result = {
        'founding_year': None,
        'founding_date': None,
        'dissolution_year': None,
        'dissolution_date': None,
        'temporal_extent': None,
    }
    # Founding - priority: timespan.begin_of_the_begin > wikidata_inception
    founding_raw = timespan.get('begin_of_the_begin') or wikidata.get('wikidata_inception')
    if founding_raw:
        result['founding_year'], result['founding_date'] = _parse_flexible_date(founding_raw)
    # Dissolution from timespan.begin_of_the_end (no Wikidata fallback)
    dissolution_raw = timespan.get('begin_of_the_end')
    if dissolution_raw:
        result['dissolution_year'], result['dissolution_date'] = _parse_flexible_date(dissolution_raw)
    # Store full timespan object if present (for CIDOC-CRM compliance),
    # otherwise fall back to entry.temporal_extent.
    result['temporal_extent'] = timespan if timespan else entry.get('temporal_extent')
    return result
def extract_custodian_data(entry: Dict, source_file: str) -> Dict[str, Any]:
    """Flatten one custodian YAML entry into a dict of `custodians` column values.

    Pulls data from every enrichment section (Google Maps, Wikidata, YouTube,
    web claims, GHCID, provenance, CH-Annotator) with explicit priority rules
    for name, institution type, coordinates and city.

    Args:
        entry: Parsed YAML document for one custodian.
        source_file: Name of the YAML file the entry came from (provenance).

    Returns:
        Dict keyed by column name; missing data maps to None.
    """
    original = entry.get('original_entry', {})
    google_maps = entry.get('google_maps_enrichment', {})
    wikidata = entry.get('wikidata_enrichment', {})
    youtube = entry.get('youtube_enrichment', {})
    web_claims = entry.get('web_claims', {})
    web_enrichment = entry.get('web_enrichment', {})
    ghcid_data = entry.get('ghcid', {})
    provenance = entry.get('provenance', {})
    custodian_name = entry.get('custodian_name', {})
    ch_annotator = entry.get('ch_annotator', {})
    # Get coordinates
    lat, lon = extract_coordinates(entry)
    # Get institution type - comprehensive extraction from multiple sources
    # Priority: 1) GHCID type letter, 2) original_entry.type, 3) original_entry.institution_type,
    # 4) type_organisatie, 5) CH-Annotator entity_classification, 6) Wikidata instance_of
    inst_type = 'U'  # Default to Unknown
    # 1. Extract from GHCID (most reliable - already normalized)
    ghcid_current = ghcid_data.get('ghcid_current', '')
    if ghcid_current and len(ghcid_current.split('-')) >= 4:
        ghcid_type = ghcid_current.split('-')[3]  # e.g., NL-NH-AMS-M-RM -> M
        if ghcid_type and len(ghcid_type) == 1 and ghcid_type in TYPE_COLORS:
            inst_type = ghcid_type
    # 2. If still Unknown, try original_entry.type (Dutch NDE format - list of letters)
    if inst_type == 'U':
        types = original.get('type', [])
        if isinstance(types, list) and types:
            # Dutch format: type: [M] or type: [M, A]
            first_type = types[0] if types else None
            if first_type and len(str(first_type)) == 1 and str(first_type).upper() in TYPE_COLORS:
                inst_type = str(first_type).upper()
        elif isinstance(types, str) and types:
            # CH-Annotator format: type: GRP.HER.MUS
            if types.startswith('GRP.HER.'):
                ch_type_map = {
                    'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                    'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X', 'GRP.HER': 'U'
                }
                inst_type = ch_type_map.get(types, 'U')
            elif len(types) == 1 and types.upper() in TYPE_COLORS:
                inst_type = types.upper()
    # 3. Try original_entry.institution_type (CH-Annotator full name format)
    if inst_type == 'U':
        inst_type_str = original.get('institution_type', '')
        if inst_type_str:
            type_map = {v.upper(): k for k, v in TYPE_NAMES.items()}
            # Also add common variations
            type_map.update({
                'GALLERY': 'G', 'LIBRARY': 'L', 'ARCHIVE': 'A', 'MUSEUM': 'M',
                'OFFICIAL': 'O', 'OFFICIAL_INSTITUTION': 'O', 'RESEARCH': 'R',
                'RESEARCH_CENTER': 'R', 'CORPORATION': 'C', 'UNKNOWN': 'U',
                'BOTANICAL': 'B', 'BOTANICAL_ZOO': 'B', 'EDUCATION': 'E',
                'EDUCATION_PROVIDER': 'E', 'SOCIETY': 'S', 'COLLECTING_SOCIETY': 'S',
                'FEATURES': 'F', 'INTANGIBLE': 'I', 'INTANGIBLE_HERITAGE_GROUP': 'I',
                'MIXED': 'X', 'PERSONAL': 'P', 'PERSONAL_COLLECTION': 'P',
                'HOLY_SITES': 'H', 'DIGITAL': 'D', 'DIGITAL_PLATFORM': 'D',
                'NGO': 'N', 'TASTE_SMELL': 'T',
            })
            inst_type = type_map.get(inst_type_str.upper(), 'U')
    # 4. Try type_organisatie (Dutch NDE CSV field)
    if inst_type == 'U':
        type_org = original.get('type_organisatie', '')
        if type_org:
            type_org_map = {
                'museum': 'M', 'archief': 'A', 'bibliotheek': 'L',
                'galerie': 'G', 'onderzoek': 'R', 'erfgoed': 'O',
                'onderwijs': 'E', 'vereniging': 'S', 'stichting': 'N',
            }
            inst_type = type_org_map.get(type_org.lower(), 'U')
    # 5. Try CH-Annotator entity_classification
    if inst_type == 'U' and ch_annotator:
        entity_class = ch_annotator.get('entity_classification', {})
        subtype = entity_class.get('subtype', '')
        if subtype:
            ch_subtype_map = {
                'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X',
            }
            inst_type = ch_subtype_map.get(subtype, 'U')
    # 6. Try to infer from Wikidata instance_of
    if inst_type == 'U' and wikidata:
        wikidata_types = wikidata.get('wikidata_instance_of', [])
        if isinstance(wikidata_types, list):
            # Map common Wikidata types to GLAMORCUBESFIXPHDNT
            wd_type_map = {
                'museum': 'M', 'art museum': 'M', 'history museum': 'M',
                'natural history museum': 'M', 'science museum': 'M',
                'archive': 'A', 'national archive': 'A', 'state archive': 'A',
                'library': 'L', 'public library': 'L', 'national library': 'L',
                'research institute': 'R', 'research center': 'R',
                'university': 'E', 'college': 'E', 'school': 'E',
                'botanical garden': 'B', 'zoo': 'B', 'aquarium': 'B',
                'art gallery': 'G', 'gallery': 'G',
                'organization': 'N', 'non-profit organization': 'N',
                'foundation': 'N', 'association': 'S', 'society': 'S',
                'church': 'H', 'monastery': 'H', 'temple': 'H', 'mosque': 'H',
            }
            for wdt in wikidata_types:
                wdt_lower = str(wdt).lower() if wdt else ''
                for pattern, type_code in wd_type_map.items():
                    if pattern in wdt_lower:
                        inst_type = type_code
                        break
                if inst_type != 'U':
                    break
    # Get name with priority
    name = (
        custodian_name.get('claim_value') or
        wikidata.get('wikidata_label_nl') or
        original.get('organisatie') or
        original.get('name') or
        'Unknown Institution'
    )
    # Get location info
    locations = entry.get('locations', original.get('locations', []))
    first_loc = locations[0] if locations and isinstance(locations, list) else {}
    if not isinstance(first_loc, dict):
        first_loc = {}  # guard against scalar/list items in `locations`
    # BUG FIX: the trailing conditional used to capture the whole or-chain
    # ((a or b or c) if has_short_address else ''), so `city` was always ''
    # whenever Google Maps had no short_address. Parenthesize the fallback
    # so it only governs the short_address expression.
    city = (
        original.get('plaatsnaam_bezoekadres') or
        first_loc.get('city') or
        (google_maps.get('short_address', '').split(',')[-1].strip()
         if google_maps.get('short_address') else '')
    )
    # Extract identifiers - handle both list of dicts and flat dict formats
    identifiers = entry.get('identifiers', [])
    wikidata_id = None
    isil_code = None
    viaf_id = None
    kvk_number = None
    ico_number = None
    sigla = None
    # Handle flat dict format (e.g., {wikidata_id: "Q123", anbi: true})
    if isinstance(identifiers, dict):
        wikidata_id = identifiers.get('wikidata_id')
        isil_code = identifiers.get('isil_code') or identifiers.get('isil')
        viaf_id = identifiers.get('viaf_id') or identifiers.get('viaf')
        kvk_number = identifiers.get('kvk_number') or identifiers.get('kvk')
        ico_number = identifiers.get('ico_number') or identifiers.get('ico')
        sigla = identifiers.get('sigla')
        # NOTE(review): flat-dict identifiers are NOT preserved in the
        # `identifiers` column (only the extracted scalar fields survive) —
        # confirm this is intentional before changing.
        identifiers = []  # Clear to skip the loop
    for ident in identifiers:
        if not isinstance(ident, dict):
            continue  # Skip non-dict items
        scheme = ident.get('identifier_scheme', '')
        value = ident.get('identifier_value', '')
        if scheme == 'Wikidata':
            wikidata_id = value
        elif scheme == 'ISIL':
            isil_code = value
        elif scheme == 'VIAF':
            viaf_id = value
        elif scheme == 'KvK':
            kvk_number = value
        elif scheme == 'IČO':
            ico_number = value
        elif scheme == 'Sigla':
            sigla = value
    # Extract social media from web claims
    social = {}
    if web_claims.get('claims'):
        for claim in web_claims['claims']:
            ct = claim.get('claim_type', '')
            cv = claim.get('claim_value', '')
            if ct == 'social_facebook':
                social['facebook'] = cv
            elif ct == 'social_twitter':
                social['twitter'] = cv
            elif ct == 'social_instagram':
                social['instagram'] = cv
            elif ct == 'social_linkedin':
                social['linkedin'] = cv
            elif ct == 'social_youtube':
                social['youtube'] = cv
            elif ct == 'logo':
                social['logo'] = cv
    # Extract YouTube data - handle both nested (Dutch) and flat (UNESCO) formats
    # Nested: youtube_enrichment.channel.channel_id
    # Flat: youtube_enrichment.channel_id
    if youtube.get('channel'):
        # Nested format (Dutch files)
        yt_channel = youtube.get('channel', {})
    elif youtube.get('channel_id'):
        # Flat format (UNESCO/other files) - use youtube dict directly
        yt_channel = youtube
    else:
        yt_channel = {}
    # Build result
    return {
        # Core identity
        'name': name,
        'verified_name': custodian_name.get('claim_value'),
        'name_source': custodian_name.get('extraction_method'),
        'emic_name': custodian_name.get('emic_name'),
        # Classification
        'type': inst_type,
        'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
        'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
        'ch_annotator_hypernym': ch_annotator.get('entity_classification', {}).get('hypernym'),
        'ch_annotator_subtype': ch_annotator.get('entity_classification', {}).get('subtype'),
        'ontology_class': ch_annotator.get('entity_classification', {}).get('ontology_class'),
        # GHCID
        'ghcid': ghcid_data.get('ghcid_current'),
        'ghcid_uuid': ghcid_data.get('ghcid_uuid'),
        'ghcid_uuid_sha256': ghcid_data.get('ghcid_uuid_sha256'),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'record_id': ghcid_data.get('record_id'),
        'ghcid_original': ghcid_data.get('ghcid_original'),
        'ghcid_history': ghcid_data.get('ghcid_history'),
        # Location
        'lat': lat,
        'lon': lon,
        'city': city,
        'region': first_loc.get('region') or ghcid_data.get('location_resolution', {}).get('region_name'),
        'region_code': ensure_str(ghcid_data.get('location_resolution', {}).get('region_code')),
        'country': ensure_str(first_loc.get('country')),
        'country_code': ensure_str(ghcid_data.get('location_resolution', {}).get('country_code')),
        'street_address': first_loc.get('street_address') or google_maps.get('formatted_address'),
        'postal_code': first_loc.get('postal_code'),
        'formatted_address': google_maps.get('formatted_address'),
        'geonames_id': ghcid_data.get('location_resolution', {}).get('geonames_id') or ghcid_data.get('geonames_id'),
        # External identifiers
        'wikidata_id': wikidata_id or wikidata.get('wikidata_entity_id'),
        'isil_code': isil_code or original.get('isil-code_na'),
        'viaf_id': viaf_id,
        'google_place_id': google_maps.get('place_id'),
        'kvk_number': kvk_number,
        'ico_number': ico_number,
        'sigla': sigla,
        'identifiers': identifiers,
        # Basic metadata
        'website': ensure_str(google_maps.get('website') or wikidata.get('wikidata_official_website') or original.get('webadres_organisatie')),
        'email': None,  # Extract from web_claims if present
        'phone': google_maps.get('phone_local'),
        'phone_international': google_maps.get('phone_international'),
        'description': wikidata.get('wikidata_description_nl') or wikidata.get('wikidata_description_en'),
        # Google Maps enrichment
        'rating': google_maps.get('rating'),
        'total_ratings': google_maps.get('total_ratings'),
        'business_status': google_maps.get('business_status'),
        'google_maps_url': google_maps.get('google_maps_url'),
        'street_view_url': google_maps.get('street_view_url'),
        'opening_hours': google_maps.get('opening_hours'),
        'open_now': google_maps.get('opening_hours', {}).get('open_now') if isinstance(google_maps.get('opening_hours'), dict) else None,
        'reviews': google_maps.get('reviews'),
        'photos': google_maps.get('photos_metadata'),
        'photo_urls': google_maps.get('photo_urls'),
        'google_maps_enrichment': google_maps if google_maps else None,
        # Wikidata enrichment
        'wikidata_label_nl': wikidata.get('wikidata_label_nl'),
        'wikidata_label_en': wikidata.get('wikidata_label_en'),
        'wikidata_description_nl': wikidata.get('wikidata_description_nl'),
        'wikidata_description_en': wikidata.get('wikidata_description_en'),
        'wikidata_types': wikidata.get('wikidata_instance_of'),
        'wikidata_inception': str(wikidata.get('wikidata_inception')) if wikidata.get('wikidata_inception') else None,
        'wikidata_coordinates': wikidata.get('wikidata_coordinates'),
        'wikidata_enrichment': wikidata if wikidata else None,
        # YouTube enrichment
        'youtube_channel_id': ensure_str(yt_channel.get('channel_id')),
        'youtube_channel_url': ensure_str(yt_channel.get('channel_url')),
        'youtube_subscriber_count': yt_channel.get('subscriber_count'),
        'youtube_video_count': yt_channel.get('video_count'),
        'youtube_view_count': yt_channel.get('view_count'),
        'youtube_enrichment': youtube if youtube.get('status') == 'SUCCESS' else None,
        # Web claims
        'social_facebook': social.get('facebook'),
        'social_twitter': social.get('twitter'),
        'social_instagram': social.get('instagram'),
        'social_linkedin': social.get('linkedin'),
        'social_youtube': social.get('youtube'),
        'logo_url': social.get('logo'),
        'web_claims': web_claims if web_claims.get('claims') else None,
        'web_archives': web_enrichment.get('web_archives'),
        # Genealogiewerkbalk
        'genealogiewerkbalk': entry.get('genealogiewerkbalk_enrichment'),
        # ISIL registries
        'nan_isil_enrichment': entry.get('nan_isil_enrichment'),
        'kb_enrichment': entry.get('kb_enrichment'),
        # Museum register & ZCBS
        'museum_register': entry.get('museum_register_enrichment'),
        'zcbs_enrichment': entry.get('zcbs_enrichment'),
        # Temporal extent - extract from timespan (CIDOC-CRM E52_Time-Span) or wikidata_inception
        **extract_temporal_data(entry, wikidata),
        'successor_organization': entry.get('successor_organization'),
        # Provenance
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': parse_datetime(provenance.get('extraction_date')),
        'confidence_score': provenance.get('confidence_score'),
        'provenance': provenance if provenance else None,
        # CH-Annotator
        'ch_annotator': ch_annotator if ch_annotator else None,
        # Original entry
        'original_entry': original if original else None,
        # Source file
        'source_file': source_file,
    }
def to_json(value: Any) -> Optional[str]:
    """Serialize *value* to a JSON string, or return None for None.

    date/datetime instances anywhere in the structure are rendered as
    ISO-8601 strings; any other unserializable type raises TypeError.
    """
    if value is None:
        return None

    def _default(obj):
        # json.dumps calls this for objects it cannot serialize natively.
        # datetime is a subclass of date, so one isinstance check covers both.
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    return json.dumps(value, default=_default)
def parse_datetime(value: Any) -> Optional[datetime]:
    """Parse a timezone-aware datetime from a string, or pass one through.

    Naive ISO strings are assumed to be UTC. Strings carrying an explicit
    offset keep it.

    FIX: the old '+'/'Z' sniffing missed negative offsets ("-05:00" has
    neither character), so those fell into the naive branch and had their
    offset silently clobbered to UTC. We now parse first and only attach
    UTC when fromisoformat produced a naive result.

    Returns None for None, unparseable strings, and non-str/non-datetime
    input.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        try:
            # 'Z' suffix is not accepted by fromisoformat before 3.11.
            dt = datetime.fromisoformat(value.replace('Z', '+00:00'))
        except (ValueError, AttributeError):
            return None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    return None
def ensure_str(value: Any) -> Optional[str]:
    """Coerce *value* to a string, or None when there is nothing usable.

    - None and booleans map to None (YAML sometimes yields literal
      true/false where a URL or code was expected).
    - Lists collapse to their first element, coerced recursively, so
      nested values like [True] or [None] still come back as None.
      (FIX: the old code had two identical ternary branches and returned
      the raw first element, which could be a non-str.)
    - Everything else goes through str().
    """
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, list):
        # Take the first item (e.g. first URL) and normalize it the same way.
        return ensure_str(value[0]) if value else None
    return str(value)
async def load_data_to_database(
db_name: str,
db_config: Dict[str, Any],
yaml_files: List[Path],
drop_existing: bool = False,
) -> Dict[str, int]:
"""Load custodian data into a single database.
Args:
db_name: Name of the database (for logging)
db_config: Database connection configuration
yaml_files: List of YAML files to process
drop_existing: Whether to drop and recreate the table
Returns:
Dict with counts: processed, skipped, errors, total
"""
print(f"\n{'='*60}")
print(f"Loading data to: {db_name}")
print(f"{'='*60}")
print(f"Connecting to PostgreSQL at {db_config['host']}:{db_config['port']}/{db_config['database']}...")
conn = await asyncpg.connect(
host=db_config['host'],
port=db_config['port'],
database=db_config['database'],
user=db_config['user'],
password=db_config['password'],
)
processed = 0
skipped = 0
errors = 0
try:
if drop_existing:
print("Creating custodians table (dropping existing)...")
await conn.execute(CREATE_TABLE_SQL)
# Try to create spatial index
try:
await conn.execute(SPATIAL_INDEX_SQL)
print(" Created spatial index (PostGIS)")
except Exception as e:
print(f" Skipped spatial index (PostGIS not available): {e}")
print(f"Processing {len(yaml_files)} custodian files...")
# Prepare INSERT statement with expanded ON CONFLICT to update all web claims fields
insert_sql = """
INSERT INTO custodians (
name, verified_name, name_source, emic_name,
type, type_name, color, ch_annotator_hypernym, ch_annotator_subtype, ontology_class,
ghcid, ghcid_uuid, ghcid_uuid_sha256, ghcid_numeric, record_id, ghcid_original, ghcid_history,
lat, lon, city, region, region_code, country, country_code, street_address, postal_code, formatted_address, geonames_id,
wikidata_id, isil_code, viaf_id, google_place_id, kvk_number, ico_number, sigla, identifiers,
website, email, phone, phone_international, description,
rating, total_ratings, business_status, google_maps_url, street_view_url,
opening_hours, open_now, reviews, photos, photo_urls, google_maps_enrichment,
wikidata_label_nl, wikidata_label_en, wikidata_description_nl, wikidata_description_en,
wikidata_types, wikidata_inception, wikidata_coordinates, wikidata_enrichment,
youtube_channel_id, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_view_count, youtube_enrichment,
social_facebook, social_twitter, social_instagram, social_linkedin, social_youtube, logo_url, web_claims, web_archives,
genealogiewerkbalk, nan_isil_enrichment, kb_enrichment, museum_register, zcbs_enrichment,
founding_year, founding_date, dissolution_year, dissolution_date, temporal_extent, successor_organization,
data_source, data_tier, extraction_date, confidence_score, provenance,
ch_annotator, original_entry, source_file
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, $15, $16, $17,
$18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28,
$29, $30, $31, $32, $33, $34, $35, $36,
$37, $38, $39, $40, $41,
$42, $43, $44, $45, $46,
$47, $48, $49, $50, $51, $52,
$53, $54, $55, $56,
$57, $58, $59, $60,
$61, $62, $63, $64, $65, $66,
$67, $68, $69, $70, $71, $72, $73, $74,
$75, $76, $77, $78, $79,
$80, $81, $82, $83, $84, $85,
$86, $87, $88, $89, $90,
$91, $92, $93
)
ON CONFLICT (ghcid) DO UPDATE SET
name = EXCLUDED.name,
verified_name = EXCLUDED.verified_name,
emic_name = EXCLUDED.emic_name,
type = EXCLUDED.type,
type_name = EXCLUDED.type_name,
lat = EXCLUDED.lat,
lon = EXCLUDED.lon,
city = EXCLUDED.city,
region = EXCLUDED.region,
country_code = EXCLUDED.country_code,
website = EXCLUDED.website,
description = EXCLUDED.description,
rating = EXCLUDED.rating,
total_ratings = EXCLUDED.total_ratings,
reviews = EXCLUDED.reviews,
photos = EXCLUDED.photos,
photo_urls = EXCLUDED.photo_urls,
opening_hours = EXCLUDED.opening_hours,
google_maps_enrichment = EXCLUDED.google_maps_enrichment,
wikidata_enrichment = EXCLUDED.wikidata_enrichment,
youtube_channel_id = EXCLUDED.youtube_channel_id,
youtube_channel_url = EXCLUDED.youtube_channel_url,
youtube_subscriber_count = EXCLUDED.youtube_subscriber_count,
youtube_video_count = EXCLUDED.youtube_video_count,
youtube_view_count = EXCLUDED.youtube_view_count,
youtube_enrichment = EXCLUDED.youtube_enrichment,
social_facebook = EXCLUDED.social_facebook,
social_twitter = EXCLUDED.social_twitter,
social_instagram = EXCLUDED.social_instagram,
social_linkedin = EXCLUDED.social_linkedin,
social_youtube = EXCLUDED.social_youtube,
logo_url = EXCLUDED.logo_url,
web_claims = EXCLUDED.web_claims,
web_archives = EXCLUDED.web_archives,
updated_at = NOW()
"""
for i, yaml_file in enumerate(yaml_files):
if (i + 1) % 1000 == 0 or i == 0:
print(f" Processing {i + 1}/{len(yaml_files)}...")
try:
with open(yaml_file, 'r', encoding='utf-8') as f:
entry = yaml.load(f, Loader=SafeLoader)
if not entry:
skipped += 1
continue
data = extract_custodian_data(entry, yaml_file.name)
# Skip if no GHCID
if not data['ghcid']:
skipped += 1
continue
# Build values tuple
values = (
data['name'], data['verified_name'], data['name_source'], data['emic_name'],
data['type'], data['type_name'], data['color'], data['ch_annotator_hypernym'], data['ch_annotator_subtype'], data['ontology_class'],
data['ghcid'], data['ghcid_uuid'], data['ghcid_uuid_sha256'], data['ghcid_numeric'], data['record_id'], data['ghcid_original'], to_json(data['ghcid_history']),
data['lat'], data['lon'], data['city'], data['region'], data['region_code'], data['country'], data['country_code'], data['street_address'], data['postal_code'], data['formatted_address'], data['geonames_id'],
data['wikidata_id'], data['isil_code'], data['viaf_id'], data['google_place_id'], data['kvk_number'], data['ico_number'], data['sigla'], to_json(data['identifiers']),
data['website'], data['email'], data['phone'], data['phone_international'], data['description'],
data['rating'], data['total_ratings'], data['business_status'], data['google_maps_url'], data['street_view_url'],
to_json(data['opening_hours']), data['open_now'], to_json(data['reviews']), to_json(data['photos']), data['photo_urls'], to_json(data['google_maps_enrichment']),
data['wikidata_label_nl'], data['wikidata_label_en'], data['wikidata_description_nl'], data['wikidata_description_en'],
to_json(data['wikidata_types']), data['wikidata_inception'], to_json(data['wikidata_coordinates']), to_json(data['wikidata_enrichment']),
data['youtube_channel_id'], data['youtube_channel_url'], data['youtube_subscriber_count'], data['youtube_video_count'], data['youtube_view_count'], to_json(data['youtube_enrichment']),
data['social_facebook'], data['social_twitter'], data['social_instagram'], data['social_linkedin'], data['social_youtube'], data['logo_url'], to_json(data['web_claims']), to_json(data['web_archives']),
to_json(data['genealogiewerkbalk']), to_json(data['nan_isil_enrichment']), to_json(data['kb_enrichment']), to_json(data['museum_register']), to_json(data['zcbs_enrichment']),
data['founding_year'], data['founding_date'], data['dissolution_year'], data['dissolution_date'], to_json(data['temporal_extent']), to_json(data['successor_organization']),
data['data_source'], data['data_tier'], data['extraction_date'], data['confidence_score'], to_json(data['provenance']),
to_json(data['ch_annotator']), to_json(data['original_entry']), data['source_file'],
)
await conn.execute(insert_sql, *values)
processed += 1
except Exception as e:
errors += 1
if errors <= 10:
print(f" Error processing {yaml_file.name}: {e}")
elif errors == 11:
print(" ... suppressing further error messages")
# Get final count
count = await conn.fetchval("SELECT COUNT(*) FROM custodians")
print(f"\n [{db_name}] LOAD COMPLETE")
print(f" Files processed: {processed}")
print(f" Files skipped: {skipped}")
print(f" Errors: {errors}")
print(f" Total in DB: {count}")
return {'processed': processed, 'skipped': skipped, 'errors': errors, 'total': count}
finally:
await conn.close()
async def load_data(
    drop_existing: bool = False,
    limit: Optional[int] = None,
    databases: Optional[List[str]] = None,
):
    """Load every custodian YAML file into one or more PostgreSQL databases.

    Args:
        drop_existing: Drop and recreate the target tables before loading.
        limit: If given, only the first *limit* files are processed.
        databases: Names of configured databases to load into. When None,
            a single database described by the POSTGRES_* environment
            variables is used (backward-compatible behavior).
    """
    # Discover the source files once; the same list feeds every target database.
    print(f"\nReading custodian files from: {CUSTODIAN_DIR}")
    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} custodian files")
    if limit:
        yaml_files = yaml_files[:limit]
        print(f"Processing first {limit} files only")

    if databases is None:
        # Legacy single-database path driven by environment variables.
        print("\nUsing single-database mode (backward compatible)")
        await load_data_to_database(
            db_name=POSTGRES_DB,
            db_config={
                'host': POSTGRES_HOST,
                'port': POSTGRES_PORT,
                'database': POSTGRES_DB,
                'user': POSTGRES_USER,
                'password': POSTGRES_PASSWORD,
            },
            yaml_files=yaml_files,
            drop_existing=drop_existing,
        )
        return

    # Multi-database mode: load each configured target in turn, collecting a
    # per-database outcome for the summary printed at the end.
    print(f"\nUsing multi-database mode: {', '.join(databases)}")
    outcomes: Dict[str, Dict[str, Any]] = {}
    for target in databases:
        if target not in DATABASES:
            print(f"\nERROR: Unknown database '{target}'. Available: {', '.join(DATABASES.keys())}")
            continue
        try:
            outcomes[target] = await load_data_to_database(
                db_name=target,
                db_config=DATABASES[target],
                yaml_files=yaml_files,
                drop_existing=drop_existing,
            )
        except Exception as exc:
            # Keep going so one failing database does not block the others.
            print(f"\nERROR loading to {target}: {exc}")
            outcomes[target] = {'error': str(exc)}

    print(f"\n{'='*60}")
    print("MULTI-DATABASE LOAD SUMMARY")
    print(f"{'='*60}")
    for target, outcome in outcomes.items():
        if 'error' in outcome:
            print(f" {target}: FAILED - {outcome['error']}")
        else:
            print(f" {target}: {outcome['processed']} processed, {outcome['total']} total in DB")
def main():
    """Command-line entry point: parse arguments and run the async loader."""
    parser = argparse.ArgumentParser(description="Load custodian data into PostgreSQL")
    parser.add_argument("--drop-existing", action="store_true",
                        help="Drop existing table and recreate")
    parser.add_argument("--limit", type=int,
                        help="Limit number of files to process (for testing)")
    parser.add_argument(
        "--databases",
        type=str,
        help=(
            "Comma-separated list of databases to load (e.g., 'glam,glam_geo'). "
            "If not specified, uses single-database mode with env vars. "
            f"Available: {', '.join(DATABASES.keys())}"
        ),
    )
    args = parser.parse_args()

    # An absent/empty --databases keeps the backward-compatible single-DB mode.
    db_list = [name.strip() for name in args.databases.split(',')] if args.databases else None

    asyncio.run(load_data(
        drop_existing=args.drop_existing,
        limit=args.limit,
        databases=db_list,
    ))
# Script entry point: only run the loader when invoked directly, not on import.
if __name__ == "__main__":
    main()