#!/usr/bin/env python3
"""
Load ALL custodian data from data/custodian/ into PostgreSQL/PostGIS.

This script replaces load_nde_data.py and reads directly from the authoritative
custodian YAML files, preserving ALL rich metadata including:

- Google Maps enrichment (ratings, reviews, photos, opening hours)
- YouTube enrichment (channel info, videos)
- Web claims (social media, logos, descriptions)
- Genealogiewerkbalk data
- GHCID identifiers and history
- Provenance tracking
- Temporal extent and successor organizations

Usage:
    python load_custodian_data.py [--drop-existing] [--limit N]
"""

import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Dict, List, Optional
|
|
import sys
|
|
|
|
# Add project root to path
|
|
project_root = Path(__file__).parent.parent.parent
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
try:
|
|
import asyncpg
|
|
except ImportError:
|
|
print("Error: asyncpg not installed. Run: pip install asyncpg")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
import yaml
|
|
try:
|
|
from yaml import CSafeLoader as SafeLoader
|
|
except ImportError:
|
|
from yaml import SafeLoader
|
|
except ImportError:
|
|
print("Error: PyYAML not installed. Run: pip install pyyaml")
|
|
sys.exit(1)
|
|
|
|
|
|
# Configuration
|
|
# Server path: /mnt/data/custodian/
|
|
# Local path: {project_root}/data/custodian/
|
|
DEFAULT_CUSTODIAN_DIR = "/mnt/data/custodian" if os.path.exists("/mnt/data/custodian") else str(project_root / "data" / "custodian")
|
|
CUSTODIAN_DIR = Path(os.getenv("CUSTODIAN_DIR", DEFAULT_CUSTODIAN_DIR))
|
|
|
|
# Single database config (for backward compatibility)
|
|
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
|
|
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
|
|
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam_heritage")
|
|
POSTGRES_USER = os.getenv("POSTGRES_USER", "kempersc")
|
|
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
|
|
|
|
# Multi-database configuration for production
|
|
# Production has two databases that need identical custodian data:
|
|
# - glam: Main custodian data storage
|
|
# - glam_geo: PostGIS geo API for bronhouder.nl map
|
|
DATABASES = {
|
|
'glam': {
|
|
'host': os.getenv("POSTGRES_HOST", "localhost"),
|
|
'port': int(os.getenv("POSTGRES_PORT", "5432")),
|
|
'database': os.getenv("POSTGRES_DB", "glam"),
|
|
'user': os.getenv("POSTGRES_USER", "glam_api"),
|
|
'password': os.getenv("POSTGRES_PASSWORD", ""),
|
|
},
|
|
'glam_geo': {
|
|
'host': os.getenv("GEO_POSTGRES_HOST", os.getenv("POSTGRES_HOST", "localhost")),
|
|
'port': int(os.getenv("GEO_POSTGRES_PORT", os.getenv("POSTGRES_PORT", "5432"))),
|
|
'database': os.getenv("GEO_POSTGRES_DB", "glam_geo"),
|
|
'user': os.getenv("GEO_POSTGRES_USER", os.getenv("POSTGRES_USER", "glam_api")),
|
|
'password': os.getenv("GEO_POSTGRES_PASSWORD", os.getenv("POSTGRES_PASSWORD", "")),
|
|
},
|
|
# Local development database
|
|
'glam_heritage': {
|
|
'host': os.getenv("POSTGRES_HOST", "localhost"),
|
|
'port': int(os.getenv("POSTGRES_PORT", "5432")),
|
|
'database': os.getenv("POSTGRES_DB", "glam_heritage"),
|
|
'user': os.getenv("POSTGRES_USER", "kempersc"),
|
|
'password': os.getenv("POSTGRES_PASSWORD", ""),
|
|
},
|
|
}
|
|
|
|
|
|
# Institution type mappings
|
|
TYPE_COLORS = {
|
|
'G': '#00bcd4', 'L': '#2ecc71', 'A': '#3498db', 'M': '#e74c3c',
|
|
'O': '#f39c12', 'R': '#1abc9c', 'C': '#795548', 'U': '#9e9e9e',
|
|
'B': '#4caf50', 'E': '#ff9800', 'S': '#9b59b6', 'F': '#95a5a6',
|
|
'I': '#673ab7', 'X': '#607d8b', 'P': '#ff5722', 'H': '#607d8b',
|
|
'D': '#34495e', 'N': '#e91e63', 'T': '#ff5722',
|
|
}
|
|
|
|
TYPE_NAMES = {
|
|
'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum',
|
|
'O': 'Official', 'R': 'Research', 'C': 'Corporation', 'U': 'Unknown',
|
|
'B': 'Botanical', 'E': 'Education', 'S': 'Society', 'F': 'Features',
|
|
'I': 'Intangible', 'X': 'Mixed', 'P': 'Personal', 'H': 'Holy sites',
|
|
'D': 'Digital', 'N': 'NGO', 'T': 'Taste/smell',
|
|
}
|
|
|
|
|
|
CREATE_TABLE_SQL = """
|
|
-- Drop existing table if requested
|
|
DROP TABLE IF EXISTS custodians CASCADE;
|
|
|
|
CREATE TABLE custodians (
|
|
id SERIAL PRIMARY KEY,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- CORE IDENTITY
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
name TEXT NOT NULL,
|
|
verified_name TEXT,
|
|
name_source TEXT,
|
|
emic_name TEXT, -- Name in original/native language
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- CLASSIFICATION (GLAMORCUBESFIXPHDNT)
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
type CHAR(1),
|
|
type_name TEXT,
|
|
color VARCHAR(10),
|
|
ch_annotator_hypernym TEXT, -- e.g., GRP
|
|
ch_annotator_subtype TEXT, -- e.g., GRP.HER.MUS
|
|
ontology_class TEXT, -- e.g., schema:Museum
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- GHCID (Global Heritage Custodian Identifier)
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
ghcid TEXT UNIQUE,
|
|
ghcid_uuid UUID,
|
|
ghcid_uuid_sha256 UUID,
|
|
ghcid_numeric NUMERIC(20),
|
|
record_id UUID,
|
|
ghcid_original TEXT,
|
|
ghcid_history JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- LOCATION
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
lat DOUBLE PRECISION,
|
|
lon DOUBLE PRECISION,
|
|
city TEXT,
|
|
region TEXT,
|
|
region_code TEXT,
|
|
country TEXT,
|
|
country_code CHAR(2),
|
|
street_address TEXT,
|
|
postal_code TEXT,
|
|
formatted_address TEXT,
|
|
geonames_id INTEGER,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- EXTERNAL IDENTIFIERS
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
wikidata_id TEXT,
|
|
isil_code TEXT,
|
|
viaf_id TEXT,
|
|
google_place_id TEXT,
|
|
kvk_number TEXT,
|
|
ico_number TEXT, -- Czech business identifier
|
|
sigla TEXT, -- Czech library code
|
|
|
|
-- All identifiers as JSONB array
|
|
identifiers JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- BASIC METADATA
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
website TEXT,
|
|
email TEXT,
|
|
phone TEXT,
|
|
phone_international TEXT,
|
|
description TEXT,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- GOOGLE MAPS ENRICHMENT
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
rating REAL,
|
|
total_ratings INTEGER,
|
|
business_status TEXT,
|
|
google_maps_url TEXT,
|
|
street_view_url TEXT,
|
|
|
|
-- Opening hours (weekday_text array and periods)
|
|
opening_hours JSONB,
|
|
open_now BOOLEAN,
|
|
|
|
-- Reviews (full array with author, rating, text, time)
|
|
reviews JSONB,
|
|
|
|
-- Photos (URLs and metadata)
|
|
photos JSONB,
|
|
photo_urls TEXT[],
|
|
|
|
-- Full Google Maps enrichment backup
|
|
google_maps_enrichment JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- WIKIDATA ENRICHMENT
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
wikidata_label_nl TEXT,
|
|
wikidata_label_en TEXT,
|
|
wikidata_description_nl TEXT,
|
|
wikidata_description_en TEXT,
|
|
wikidata_types JSONB, -- instance_of (P31) values
|
|
wikidata_inception TEXT,
|
|
wikidata_coordinates JSONB,
|
|
wikidata_enrichment JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- YOUTUBE ENRICHMENT
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
youtube_channel_id TEXT,
|
|
youtube_channel_url TEXT,
|
|
youtube_subscriber_count INTEGER,
|
|
youtube_video_count INTEGER,
|
|
youtube_view_count BIGINT,
|
|
youtube_enrichment JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- WEB CLAIMS (extracted from institutional websites)
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
social_facebook TEXT,
|
|
social_twitter TEXT,
|
|
social_instagram TEXT,
|
|
social_linkedin TEXT,
|
|
social_youtube TEXT,
|
|
logo_url TEXT,
|
|
web_claims JSONB,
|
|
web_archives JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- GENEALOGIEWERKBALK (Dutch genealogy resources)
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
genealogiewerkbalk JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- ISIL REGISTRIES
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
nan_isil_enrichment JSONB, -- Nationaal Archief ISIL
|
|
kb_enrichment JSONB, -- KB Netherlands Library Network
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- MUSEUM REGISTER & ZCBS
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
museum_register JSONB,
|
|
zcbs_enrichment JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- TEMPORAL EXTENT & HISTORY
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
founding_year INTEGER,
|
|
founding_date DATE,
|
|
dissolution_year INTEGER,
|
|
dissolution_date DATE,
|
|
temporal_extent JSONB,
|
|
successor_organization JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- PROVENANCE
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
data_source TEXT,
|
|
data_tier TEXT,
|
|
extraction_date TIMESTAMPTZ,
|
|
confidence_score REAL,
|
|
provenance JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- CH-ANNOTATOR METADATA
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
ch_annotator JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- ORIGINAL ENTRY (full backup)
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
original_entry JSONB,
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- TIMESTAMPS
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
source_file TEXT,
|
|
created_at TIMESTAMPTZ DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ DEFAULT NOW()
|
|
);
|
|
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
-- INDEXES
|
|
-- ═══════════════════════════════════════════════════════════════
|
|
|
|
-- Core identity
|
|
CREATE INDEX idx_custodians_name ON custodians(name);
|
|
CREATE INDEX idx_custodians_name_gin ON custodians USING GIN (to_tsvector('simple', name));
|
|
|
|
-- Classification
|
|
CREATE INDEX idx_custodians_type ON custodians(type);
|
|
CREATE INDEX idx_custodians_type_name ON custodians(type_name);
|
|
|
|
-- GHCID
|
|
CREATE UNIQUE INDEX idx_custodians_ghcid ON custodians(ghcid);
|
|
CREATE INDEX idx_custodians_ghcid_uuid ON custodians(ghcid_uuid);
|
|
CREATE INDEX idx_custodians_record_id ON custodians(record_id);
|
|
|
|
-- Location
|
|
CREATE INDEX idx_custodians_city ON custodians(city);
|
|
CREATE INDEX idx_custodians_region ON custodians(region);
|
|
CREATE INDEX idx_custodians_country_code ON custodians(country_code);
|
|
CREATE INDEX idx_custodians_geonames_id ON custodians(geonames_id);
|
|
|
|
-- External identifiers
|
|
CREATE INDEX idx_custodians_wikidata_id ON custodians(wikidata_id);
|
|
CREATE INDEX idx_custodians_isil_code ON custodians(isil_code);
|
|
CREATE INDEX idx_custodians_google_place_id ON custodians(google_place_id);
|
|
|
|
-- Ratings
|
|
CREATE INDEX idx_custodians_rating ON custodians(rating);
|
|
|
|
-- Provenance
|
|
CREATE INDEX idx_custodians_data_source ON custodians(data_source);
|
|
CREATE INDEX idx_custodians_data_tier ON custodians(data_tier);
|
|
"""
|
|
|
|
|
|
SPATIAL_INDEX_SQL = """
|
|
-- Create a spatial index (requires PostGIS)
|
|
CREATE INDEX IF NOT EXISTS idx_custodians_geom ON custodians USING GIST (
|
|
ST_SetSRID(ST_MakePoint(lon, lat), 4326)
|
|
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
|
|
"""
|
|
|
|
|
|
def extract_coordinates(entry: Dict) -> tuple[Optional[float], Optional[float]]:
|
|
"""Extract lat/lon from entry with priority order."""
|
|
lat, lon = None, None
|
|
|
|
# Priority 1: Google Maps coordinates
|
|
google_maps = entry.get('google_maps_enrichment', {})
|
|
coords = google_maps.get('coordinates', {})
|
|
if coords.get('latitude') and coords.get('longitude'):
|
|
return coords['latitude'], coords['longitude']
|
|
|
|
# Priority 2: Wikidata coordinates
|
|
wd = entry.get('wikidata_enrichment', {})
|
|
wd_coords = wd.get('wikidata_coordinates', {})
|
|
if wd_coords.get('latitude') and wd_coords.get('longitude'):
|
|
return wd_coords['latitude'], wd_coords['longitude']
|
|
|
|
# Priority 3: locations array
|
|
locations = entry.get('locations', entry.get('original_entry', {}).get('locations', []))
|
|
if locations and isinstance(locations, list) and len(locations) > 0:
|
|
loc = locations[0]
|
|
if loc.get('latitude') and loc.get('longitude'):
|
|
return loc['latitude'], loc['longitude']
|
|
|
|
# Priority 4: location object
|
|
location = entry.get('location', {})
|
|
if location.get('latitude') and location.get('longitude'):
|
|
return location['latitude'], location['longitude']
|
|
|
|
return None, None
|
|
|
|
|
|
def extract_custodian_data(entry: Dict, source_file: str) -> Dict[str, Any]:
|
|
"""Extract all relevant data from a custodian YAML entry."""
|
|
|
|
original = entry.get('original_entry', {})
|
|
google_maps = entry.get('google_maps_enrichment', {})
|
|
wikidata = entry.get('wikidata_enrichment', {})
|
|
youtube = entry.get('youtube_enrichment', {})
|
|
web_claims = entry.get('web_claims', {})
|
|
web_enrichment = entry.get('web_enrichment', {})
|
|
ghcid_data = entry.get('ghcid', {})
|
|
provenance = entry.get('provenance', {})
|
|
custodian_name = entry.get('custodian_name', {})
|
|
ch_annotator = entry.get('ch_annotator', {})
|
|
|
|
# Get coordinates
|
|
lat, lon = extract_coordinates(entry)
|
|
|
|
# Get institution type - comprehensive extraction from multiple sources
|
|
# Priority: 1) GHCID type letter, 2) original_entry.type, 3) original_entry.institution_type,
|
|
# 4) type_organisatie, 5) CH-Annotator entity_classification, 6) Wikidata instance_of
|
|
inst_type = 'U' # Default to Unknown
|
|
|
|
# 1. Extract from GHCID (most reliable - already normalized)
|
|
ghcid_current = ghcid_data.get('ghcid_current', '')
|
|
if ghcid_current and len(ghcid_current.split('-')) >= 4:
|
|
ghcid_type = ghcid_current.split('-')[3] # e.g., NL-NH-AMS-M-RM -> M
|
|
if ghcid_type and len(ghcid_type) == 1 and ghcid_type in TYPE_COLORS:
|
|
inst_type = ghcid_type
|
|
|
|
# 2. If still Unknown, try original_entry.type (Dutch NDE format - list of letters)
|
|
if inst_type == 'U':
|
|
types = original.get('type', [])
|
|
if isinstance(types, list) and types:
|
|
# Dutch format: type: [M] or type: [M, A]
|
|
first_type = types[0] if types else None
|
|
if first_type and len(str(first_type)) == 1 and str(first_type).upper() in TYPE_COLORS:
|
|
inst_type = str(first_type).upper()
|
|
elif isinstance(types, str) and types:
|
|
# CH-Annotator format: type: GRP.HER.MUS
|
|
if types.startswith('GRP.HER.'):
|
|
ch_type_map = {
|
|
'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
|
|
'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X', 'GRP.HER': 'U'
|
|
}
|
|
inst_type = ch_type_map.get(types, 'U')
|
|
elif len(types) == 1 and types.upper() in TYPE_COLORS:
|
|
inst_type = types.upper()
|
|
|
|
# 3. Try original_entry.institution_type (CH-Annotator full name format)
|
|
if inst_type == 'U':
|
|
inst_type_str = original.get('institution_type', '')
|
|
if inst_type_str:
|
|
type_map = {v.upper(): k for k, v in TYPE_NAMES.items()}
|
|
# Also add common variations
|
|
type_map.update({
|
|
'GALLERY': 'G', 'LIBRARY': 'L', 'ARCHIVE': 'A', 'MUSEUM': 'M',
|
|
'OFFICIAL': 'O', 'OFFICIAL_INSTITUTION': 'O', 'RESEARCH': 'R',
|
|
'RESEARCH_CENTER': 'R', 'CORPORATION': 'C', 'UNKNOWN': 'U',
|
|
'BOTANICAL': 'B', 'BOTANICAL_ZOO': 'B', 'EDUCATION': 'E',
|
|
'EDUCATION_PROVIDER': 'E', 'SOCIETY': 'S', 'COLLECTING_SOCIETY': 'S',
|
|
'FEATURES': 'F', 'INTANGIBLE': 'I', 'INTANGIBLE_HERITAGE_GROUP': 'I',
|
|
'MIXED': 'X', 'PERSONAL': 'P', 'PERSONAL_COLLECTION': 'P',
|
|
'HOLY_SITES': 'H', 'DIGITAL': 'D', 'DIGITAL_PLATFORM': 'D',
|
|
'NGO': 'N', 'TASTE_SMELL': 'T',
|
|
})
|
|
inst_type = type_map.get(inst_type_str.upper(), 'U')
|
|
|
|
# 4. Try type_organisatie (Dutch NDE CSV field)
|
|
if inst_type == 'U':
|
|
type_org = original.get('type_organisatie', '')
|
|
if type_org:
|
|
type_org_map = {
|
|
'museum': 'M', 'archief': 'A', 'bibliotheek': 'L',
|
|
'galerie': 'G', 'onderzoek': 'R', 'erfgoed': 'O',
|
|
'onderwijs': 'E', 'vereniging': 'S', 'stichting': 'N',
|
|
}
|
|
inst_type = type_org_map.get(type_org.lower(), 'U')
|
|
|
|
# 5. Try CH-Annotator entity_classification
|
|
if inst_type == 'U' and ch_annotator:
|
|
entity_class = ch_annotator.get('entity_classification', {})
|
|
subtype = entity_class.get('subtype', '')
|
|
if subtype:
|
|
ch_subtype_map = {
|
|
'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
|
|
'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X',
|
|
}
|
|
inst_type = ch_subtype_map.get(subtype, 'U')
|
|
|
|
# 6. Try to infer from Wikidata instance_of
|
|
if inst_type == 'U' and wikidata:
|
|
wikidata_types = wikidata.get('wikidata_instance_of', [])
|
|
if isinstance(wikidata_types, list):
|
|
# Map common Wikidata types to GLAMORCUBESFIXPHDNT
|
|
wd_type_map = {
|
|
'museum': 'M', 'art museum': 'M', 'history museum': 'M',
|
|
'natural history museum': 'M', 'science museum': 'M',
|
|
'archive': 'A', 'national archive': 'A', 'state archive': 'A',
|
|
'library': 'L', 'public library': 'L', 'national library': 'L',
|
|
'research institute': 'R', 'research center': 'R',
|
|
'university': 'E', 'college': 'E', 'school': 'E',
|
|
'botanical garden': 'B', 'zoo': 'B', 'aquarium': 'B',
|
|
'art gallery': 'G', 'gallery': 'G',
|
|
'organization': 'N', 'non-profit organization': 'N',
|
|
'foundation': 'N', 'association': 'S', 'society': 'S',
|
|
'church': 'H', 'monastery': 'H', 'temple': 'H', 'mosque': 'H',
|
|
}
|
|
for wdt in wikidata_types:
|
|
wdt_lower = str(wdt).lower() if wdt else ''
|
|
for pattern, type_code in wd_type_map.items():
|
|
if pattern in wdt_lower:
|
|
inst_type = type_code
|
|
break
|
|
if inst_type != 'U':
|
|
break
|
|
|
|
# Get name with priority
|
|
name = (
|
|
custodian_name.get('claim_value') or
|
|
wikidata.get('wikidata_label_nl') or
|
|
original.get('organisatie') or
|
|
original.get('name') or
|
|
'Unknown Institution'
|
|
)
|
|
|
|
# Get location info
|
|
locations = entry.get('locations', original.get('locations', []))
|
|
first_loc = locations[0] if locations and isinstance(locations, list) else {}
|
|
|
|
city = (
|
|
original.get('plaatsnaam_bezoekadres') or
|
|
first_loc.get('city') or
|
|
google_maps.get('short_address', '').split(',')[-1].strip() if google_maps.get('short_address') else ''
|
|
)
|
|
|
|
# Extract identifiers
|
|
identifiers = entry.get('identifiers', [])
|
|
wikidata_id = None
|
|
isil_code = None
|
|
viaf_id = None
|
|
kvk_number = None
|
|
ico_number = None
|
|
sigla = None
|
|
|
|
for ident in identifiers:
|
|
scheme = ident.get('identifier_scheme', '')
|
|
value = ident.get('identifier_value', '')
|
|
if scheme == 'Wikidata':
|
|
wikidata_id = value
|
|
elif scheme == 'ISIL':
|
|
isil_code = value
|
|
elif scheme == 'VIAF':
|
|
viaf_id = value
|
|
elif scheme == 'KvK':
|
|
kvk_number = value
|
|
elif scheme == 'IČO':
|
|
ico_number = value
|
|
elif scheme == 'Sigla':
|
|
sigla = value
|
|
|
|
# Extract social media from web claims
|
|
social = {}
|
|
if web_claims.get('claims'):
|
|
for claim in web_claims['claims']:
|
|
ct = claim.get('claim_type', '')
|
|
cv = claim.get('claim_value', '')
|
|
if ct == 'social_facebook':
|
|
social['facebook'] = cv
|
|
elif ct == 'social_twitter':
|
|
social['twitter'] = cv
|
|
elif ct == 'social_instagram':
|
|
social['instagram'] = cv
|
|
elif ct == 'social_linkedin':
|
|
social['linkedin'] = cv
|
|
elif ct == 'social_youtube':
|
|
social['youtube'] = cv
|
|
elif ct == 'logo':
|
|
social['logo'] = cv
|
|
|
|
# Extract YouTube data - handle both nested (Dutch) and flat (UNESCO) formats
|
|
# Nested: youtube_enrichment.channel.channel_id
|
|
# Flat: youtube_enrichment.channel_id
|
|
if youtube.get('channel'):
|
|
# Nested format (Dutch files)
|
|
yt_channel = youtube.get('channel', {})
|
|
elif youtube.get('channel_id'):
|
|
# Flat format (UNESCO/other files) - use youtube dict directly
|
|
yt_channel = youtube
|
|
else:
|
|
yt_channel = {}
|
|
|
|
# Build result
|
|
return {
|
|
# Core identity
|
|
'name': name,
|
|
'verified_name': custodian_name.get('claim_value'),
|
|
'name_source': custodian_name.get('extraction_method'),
|
|
'emic_name': custodian_name.get('emic_name'),
|
|
|
|
# Classification
|
|
'type': inst_type,
|
|
'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
|
|
'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
|
|
'ch_annotator_hypernym': ch_annotator.get('entity_classification', {}).get('hypernym'),
|
|
'ch_annotator_subtype': ch_annotator.get('entity_classification', {}).get('subtype'),
|
|
'ontology_class': ch_annotator.get('entity_classification', {}).get('ontology_class'),
|
|
|
|
# GHCID
|
|
'ghcid': ghcid_data.get('ghcid_current'),
|
|
'ghcid_uuid': ghcid_data.get('ghcid_uuid'),
|
|
'ghcid_uuid_sha256': ghcid_data.get('ghcid_uuid_sha256'),
|
|
'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
|
|
'record_id': ghcid_data.get('record_id'),
|
|
'ghcid_original': ghcid_data.get('ghcid_original'),
|
|
'ghcid_history': ghcid_data.get('ghcid_history'),
|
|
|
|
# Location
|
|
'lat': lat,
|
|
'lon': lon,
|
|
'city': city,
|
|
'region': first_loc.get('region') or ghcid_data.get('location_resolution', {}).get('region_name'),
|
|
'region_code': ensure_str(ghcid_data.get('location_resolution', {}).get('region_code')),
|
|
'country': ensure_str(first_loc.get('country')),
|
|
'country_code': ensure_str(ghcid_data.get('location_resolution', {}).get('country_code')),
|
|
'street_address': first_loc.get('street_address') or google_maps.get('formatted_address'),
|
|
'postal_code': first_loc.get('postal_code'),
|
|
'formatted_address': google_maps.get('formatted_address'),
|
|
'geonames_id': ghcid_data.get('location_resolution', {}).get('geonames_id') or ghcid_data.get('geonames_id'),
|
|
|
|
# External identifiers
|
|
'wikidata_id': wikidata_id or wikidata.get('wikidata_entity_id'),
|
|
'isil_code': isil_code or original.get('isil-code_na'),
|
|
'viaf_id': viaf_id,
|
|
'google_place_id': google_maps.get('place_id'),
|
|
'kvk_number': kvk_number,
|
|
'ico_number': ico_number,
|
|
'sigla': sigla,
|
|
'identifiers': identifiers,
|
|
|
|
# Basic metadata
|
|
'website': ensure_str(google_maps.get('website') or wikidata.get('wikidata_official_website') or original.get('webadres_organisatie')),
|
|
'email': None, # Extract from web_claims if present
|
|
'phone': google_maps.get('phone_local'),
|
|
'phone_international': google_maps.get('phone_international'),
|
|
'description': wikidata.get('wikidata_description_nl') or wikidata.get('wikidata_description_en'),
|
|
|
|
# Google Maps enrichment
|
|
'rating': google_maps.get('rating'),
|
|
'total_ratings': google_maps.get('total_ratings'),
|
|
'business_status': google_maps.get('business_status'),
|
|
'google_maps_url': google_maps.get('google_maps_url'),
|
|
'street_view_url': google_maps.get('street_view_url'),
|
|
'opening_hours': google_maps.get('opening_hours'),
|
|
'open_now': google_maps.get('opening_hours', {}).get('open_now'),
|
|
'reviews': google_maps.get('reviews'),
|
|
'photos': google_maps.get('photos_metadata'),
|
|
'photo_urls': google_maps.get('photo_urls'),
|
|
'google_maps_enrichment': google_maps if google_maps else None,
|
|
|
|
# Wikidata enrichment
|
|
'wikidata_label_nl': wikidata.get('wikidata_label_nl'),
|
|
'wikidata_label_en': wikidata.get('wikidata_label_en'),
|
|
'wikidata_description_nl': wikidata.get('wikidata_description_nl'),
|
|
'wikidata_description_en': wikidata.get('wikidata_description_en'),
|
|
'wikidata_types': wikidata.get('wikidata_instance_of'),
|
|
'wikidata_inception': str(wikidata.get('wikidata_inception')) if wikidata.get('wikidata_inception') else None,
|
|
'wikidata_coordinates': wikidata.get('wikidata_coordinates'),
|
|
'wikidata_enrichment': wikidata if wikidata else None,
|
|
|
|
# YouTube enrichment
|
|
'youtube_channel_id': yt_channel.get('channel_id'),
|
|
'youtube_channel_url': yt_channel.get('channel_url'),
|
|
'youtube_subscriber_count': yt_channel.get('subscriber_count'),
|
|
'youtube_video_count': yt_channel.get('video_count'),
|
|
'youtube_view_count': yt_channel.get('view_count'),
|
|
'youtube_enrichment': youtube if youtube.get('status') == 'SUCCESS' else None,
|
|
|
|
# Web claims
|
|
'social_facebook': social.get('facebook'),
|
|
'social_twitter': social.get('twitter'),
|
|
'social_instagram': social.get('instagram'),
|
|
'social_linkedin': social.get('linkedin'),
|
|
'social_youtube': social.get('youtube'),
|
|
'logo_url': social.get('logo'),
|
|
'web_claims': web_claims if web_claims.get('claims') else None,
|
|
'web_archives': web_enrichment.get('web_archives'),
|
|
|
|
# Genealogiewerkbalk
|
|
'genealogiewerkbalk': entry.get('genealogiewerkbalk_enrichment'),
|
|
|
|
# ISIL registries
|
|
'nan_isil_enrichment': entry.get('nan_isil_enrichment'),
|
|
'kb_enrichment': entry.get('kb_enrichment'),
|
|
|
|
# Museum register & ZCBS
|
|
'museum_register': entry.get('museum_register_enrichment'),
|
|
'zcbs_enrichment': entry.get('zcbs_enrichment'),
|
|
|
|
# Temporal extent
|
|
'founding_year': None, # TODO: extract from wikidata_inception
|
|
'founding_date': None,
|
|
'dissolution_year': None,
|
|
'dissolution_date': None,
|
|
'temporal_extent': entry.get('temporal_extent'),
|
|
'successor_organization': entry.get('successor_organization'),
|
|
|
|
# Provenance
|
|
'data_source': provenance.get('data_source'),
|
|
'data_tier': provenance.get('data_tier'),
|
|
'extraction_date': parse_datetime(provenance.get('extraction_date')),
|
|
'confidence_score': provenance.get('confidence_score'),
|
|
'provenance': provenance if provenance else None,
|
|
|
|
# CH-Annotator
|
|
'ch_annotator': ch_annotator if ch_annotator else None,
|
|
|
|
# Original entry
|
|
'original_entry': original if original else None,
|
|
|
|
# Source file
|
|
'source_file': source_file,
|
|
}
|
|
|
|
|
|
def to_json(value: Any) -> Optional[str]:
|
|
"""Convert value to JSON string, handling None."""
|
|
if value is None:
|
|
return None
|
|
return json.dumps(value)
|
|
|
|
|
|
def parse_datetime(value: Any) -> Optional[datetime]:
|
|
"""Parse datetime from string or return None."""
|
|
if value is None:
|
|
return None
|
|
if isinstance(value, datetime):
|
|
return value
|
|
if isinstance(value, str):
|
|
try:
|
|
# Try ISO format with timezone
|
|
if '+' in value or 'Z' in value:
|
|
return datetime.fromisoformat(value.replace('Z', '+00:00'))
|
|
# Try ISO format without timezone
|
|
return datetime.fromisoformat(value).replace(tzinfo=timezone.utc)
|
|
except (ValueError, AttributeError):
|
|
return None
|
|
return None
|
|
|
|
|
|
def ensure_str(value: Any) -> Optional[str]:
|
|
"""Ensure value is a string or None."""
|
|
if value is None:
|
|
return None
|
|
if isinstance(value, bool):
|
|
return None # False/True shouldn't be stored as strings
|
|
if isinstance(value, list):
|
|
# Return first item if list, or join
|
|
if len(value) == 0:
|
|
return None
|
|
return value[0] if len(value) == 1 else value[0] # Take first URL
|
|
if isinstance(value, (int, float)):
|
|
return str(value)
|
|
return str(value)
|
|
|
|
|
|
async def load_data_to_database(
|
|
db_name: str,
|
|
db_config: Dict[str, Any],
|
|
yaml_files: List[Path],
|
|
drop_existing: bool = False,
|
|
) -> Dict[str, int]:
|
|
"""Load custodian data into a single database.
|
|
|
|
Args:
|
|
db_name: Name of the database (for logging)
|
|
db_config: Database connection configuration
|
|
yaml_files: List of YAML files to process
|
|
drop_existing: Whether to drop and recreate the table
|
|
|
|
Returns:
|
|
Dict with counts: processed, skipped, errors, total
|
|
"""
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"Loading data to: {db_name}")
|
|
print(f"{'='*60}")
|
|
print(f"Connecting to PostgreSQL at {db_config['host']}:{db_config['port']}/{db_config['database']}...")
|
|
|
|
conn = await asyncpg.connect(
|
|
host=db_config['host'],
|
|
port=db_config['port'],
|
|
database=db_config['database'],
|
|
user=db_config['user'],
|
|
password=db_config['password'],
|
|
)
|
|
|
|
processed = 0
|
|
skipped = 0
|
|
errors = 0
|
|
|
|
try:
|
|
if drop_existing:
|
|
print("Creating custodians table (dropping existing)...")
|
|
await conn.execute(CREATE_TABLE_SQL)
|
|
|
|
# Try to create spatial index
|
|
try:
|
|
await conn.execute(SPATIAL_INDEX_SQL)
|
|
print(" Created spatial index (PostGIS)")
|
|
except Exception as e:
|
|
print(f" Skipped spatial index (PostGIS not available): {e}")
|
|
|
|
print(f"Processing {len(yaml_files)} custodian files...")
|
|
|
|
# Prepare INSERT statement with expanded ON CONFLICT to update all web claims fields
|
|
insert_sql = """
|
|
INSERT INTO custodians (
|
|
name, verified_name, name_source, emic_name,
|
|
type, type_name, color, ch_annotator_hypernym, ch_annotator_subtype, ontology_class,
|
|
ghcid, ghcid_uuid, ghcid_uuid_sha256, ghcid_numeric, record_id, ghcid_original, ghcid_history,
|
|
lat, lon, city, region, region_code, country, country_code, street_address, postal_code, formatted_address, geonames_id,
|
|
wikidata_id, isil_code, viaf_id, google_place_id, kvk_number, ico_number, sigla, identifiers,
|
|
website, email, phone, phone_international, description,
|
|
rating, total_ratings, business_status, google_maps_url, street_view_url,
|
|
opening_hours, open_now, reviews, photos, photo_urls, google_maps_enrichment,
|
|
wikidata_label_nl, wikidata_label_en, wikidata_description_nl, wikidata_description_en,
|
|
wikidata_types, wikidata_inception, wikidata_coordinates, wikidata_enrichment,
|
|
youtube_channel_id, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_view_count, youtube_enrichment,
|
|
social_facebook, social_twitter, social_instagram, social_linkedin, social_youtube, logo_url, web_claims, web_archives,
|
|
genealogiewerkbalk, nan_isil_enrichment, kb_enrichment, museum_register, zcbs_enrichment,
|
|
founding_year, founding_date, dissolution_year, dissolution_date, temporal_extent, successor_organization,
|
|
data_source, data_tier, extraction_date, confidence_score, provenance,
|
|
ch_annotator, original_entry, source_file
|
|
) VALUES (
|
|
$1, $2, $3, $4,
|
|
$5, $6, $7, $8, $9, $10,
|
|
$11, $12, $13, $14, $15, $16, $17,
|
|
$18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28,
|
|
$29, $30, $31, $32, $33, $34, $35, $36,
|
|
$37, $38, $39, $40, $41,
|
|
$42, $43, $44, $45, $46,
|
|
$47, $48, $49, $50, $51, $52,
|
|
$53, $54, $55, $56,
|
|
$57, $58, $59, $60,
|
|
$61, $62, $63, $64, $65, $66,
|
|
$67, $68, $69, $70, $71, $72, $73, $74,
|
|
$75, $76, $77, $78, $79,
|
|
$80, $81, $82, $83, $84, $85,
|
|
$86, $87, $88, $89, $90,
|
|
$91, $92, $93
|
|
)
|
|
ON CONFLICT (ghcid) DO UPDATE SET
|
|
name = EXCLUDED.name,
|
|
verified_name = EXCLUDED.verified_name,
|
|
emic_name = EXCLUDED.emic_name,
|
|
type = EXCLUDED.type,
|
|
type_name = EXCLUDED.type_name,
|
|
lat = EXCLUDED.lat,
|
|
lon = EXCLUDED.lon,
|
|
city = EXCLUDED.city,
|
|
region = EXCLUDED.region,
|
|
country_code = EXCLUDED.country_code,
|
|
website = EXCLUDED.website,
|
|
description = EXCLUDED.description,
|
|
rating = EXCLUDED.rating,
|
|
total_ratings = EXCLUDED.total_ratings,
|
|
reviews = EXCLUDED.reviews,
|
|
photos = EXCLUDED.photos,
|
|
photo_urls = EXCLUDED.photo_urls,
|
|
opening_hours = EXCLUDED.opening_hours,
|
|
google_maps_enrichment = EXCLUDED.google_maps_enrichment,
|
|
wikidata_enrichment = EXCLUDED.wikidata_enrichment,
|
|
youtube_enrichment = EXCLUDED.youtube_enrichment,
|
|
social_facebook = EXCLUDED.social_facebook,
|
|
social_twitter = EXCLUDED.social_twitter,
|
|
social_instagram = EXCLUDED.social_instagram,
|
|
social_linkedin = EXCLUDED.social_linkedin,
|
|
social_youtube = EXCLUDED.social_youtube,
|
|
logo_url = EXCLUDED.logo_url,
|
|
web_claims = EXCLUDED.web_claims,
|
|
web_archives = EXCLUDED.web_archives,
|
|
updated_at = NOW()
|
|
"""
|
|
|
|
for i, yaml_file in enumerate(yaml_files):
|
|
if (i + 1) % 1000 == 0 or i == 0:
|
|
print(f" Processing {i + 1}/{len(yaml_files)}...")
|
|
|
|
try:
|
|
with open(yaml_file, 'r', encoding='utf-8') as f:
|
|
entry = yaml.load(f, Loader=SafeLoader)
|
|
|
|
if not entry:
|
|
skipped += 1
|
|
continue
|
|
|
|
data = extract_custodian_data(entry, yaml_file.name)
|
|
|
|
# Skip if no GHCID
|
|
if not data['ghcid']:
|
|
skipped += 1
|
|
continue
|
|
|
|
# Build values tuple
|
|
values = (
|
|
data['name'], data['verified_name'], data['name_source'], data['emic_name'],
|
|
data['type'], data['type_name'], data['color'], data['ch_annotator_hypernym'], data['ch_annotator_subtype'], data['ontology_class'],
|
|
data['ghcid'], data['ghcid_uuid'], data['ghcid_uuid_sha256'], data['ghcid_numeric'], data['record_id'], data['ghcid_original'], to_json(data['ghcid_history']),
|
|
data['lat'], data['lon'], data['city'], data['region'], data['region_code'], data['country'], data['country_code'], data['street_address'], data['postal_code'], data['formatted_address'], data['geonames_id'],
|
|
data['wikidata_id'], data['isil_code'], data['viaf_id'], data['google_place_id'], data['kvk_number'], data['ico_number'], data['sigla'], to_json(data['identifiers']),
|
|
data['website'], data['email'], data['phone'], data['phone_international'], data['description'],
|
|
data['rating'], data['total_ratings'], data['business_status'], data['google_maps_url'], data['street_view_url'],
|
|
to_json(data['opening_hours']), data['open_now'], to_json(data['reviews']), to_json(data['photos']), data['photo_urls'], to_json(data['google_maps_enrichment']),
|
|
data['wikidata_label_nl'], data['wikidata_label_en'], data['wikidata_description_nl'], data['wikidata_description_en'],
|
|
to_json(data['wikidata_types']), data['wikidata_inception'], to_json(data['wikidata_coordinates']), to_json(data['wikidata_enrichment']),
|
|
data['youtube_channel_id'], data['youtube_channel_url'], data['youtube_subscriber_count'], data['youtube_video_count'], data['youtube_view_count'], to_json(data['youtube_enrichment']),
|
|
data['social_facebook'], data['social_twitter'], data['social_instagram'], data['social_linkedin'], data['social_youtube'], data['logo_url'], to_json(data['web_claims']), to_json(data['web_archives']),
|
|
to_json(data['genealogiewerkbalk']), to_json(data['nan_isil_enrichment']), to_json(data['kb_enrichment']), to_json(data['museum_register']), to_json(data['zcbs_enrichment']),
|
|
data['founding_year'], data['founding_date'], data['dissolution_year'], data['dissolution_date'], to_json(data['temporal_extent']), to_json(data['successor_organization']),
|
|
data['data_source'], data['data_tier'], data['extraction_date'], data['confidence_score'], to_json(data['provenance']),
|
|
to_json(data['ch_annotator']), to_json(data['original_entry']), data['source_file'],
|
|
)
|
|
|
|
await conn.execute(insert_sql, *values)
|
|
processed += 1
|
|
|
|
except Exception as e:
|
|
errors += 1
|
|
if errors <= 10:
|
|
print(f" Error processing {yaml_file.name}: {e}")
|
|
elif errors == 11:
|
|
print(" ... suppressing further error messages")
|
|
|
|
# Get final count
|
|
count = await conn.fetchval("SELECT COUNT(*) FROM custodians")
|
|
|
|
print(f"\n [{db_name}] LOAD COMPLETE")
|
|
print(f" Files processed: {processed}")
|
|
print(f" Files skipped: {skipped}")
|
|
print(f" Errors: {errors}")
|
|
print(f" Total in DB: {count}")
|
|
|
|
return {'processed': processed, 'skipped': skipped, 'errors': errors, 'total': count}
|
|
|
|
finally:
|
|
await conn.close()
|
|
|
|
|
|
async def load_data(
    drop_existing: bool = False,
    limit: Optional[int] = None,
    databases: Optional[List[str]] = None,
) -> None:
    """Load all custodian data into one or more PostgreSQL databases.

    Args:
        drop_existing: Whether to drop and recreate tables.
        limit: Optional cap on the number of YAML files to process.
            ``0`` is honoured and processes no files.
        databases: List of database names (keys of ``DATABASES``) to load
            into. If None, uses single-database mode with environment
            variables (backward compatible).
    """
    # Discover the authoritative custodian YAML files (sorted for a
    # deterministic processing order across runs).
    print(f"\nReading custodian files from: {CUSTODIAN_DIR}")
    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    total_files = len(yaml_files)
    print(f"Found {total_files} custodian files")

    # Fix: explicit None check so `--limit 0` is respected.  The previous
    # truthiness test (`if limit:`) silently ignored a zero limit and
    # processed every file.
    if limit is not None:
        yaml_files = yaml_files[:limit]
        print(f"Processing first {limit} files only")

    # Single database mode (backward compatible with env-var configuration).
    if databases is None:
        print("\nUsing single-database mode (backward compatible)")
        db_config = {
            'host': POSTGRES_HOST,
            'port': POSTGRES_PORT,
            'database': POSTGRES_DB,
            'user': POSTGRES_USER,
            'password': POSTGRES_PASSWORD,
        }
        await load_data_to_database(
            db_name=POSTGRES_DB,
            db_config=db_config,
            yaml_files=yaml_files,
            drop_existing=drop_existing,
        )
        return

    # Multi-database mode: load the same file set into each named database.
    print(f"\nUsing multi-database mode: {', '.join(databases)}")

    results = {}
    for db_name in databases:
        if db_name not in DATABASES:
            print(f"\nERROR: Unknown database '{db_name}'. Available: {', '.join(DATABASES.keys())}")
            # Fix: record the failure so the summary below reports it
            # instead of silently omitting the database.
            results[db_name] = {'error': 'unknown database'}
            continue

        db_config = DATABASES[db_name]
        try:
            result = await load_data_to_database(
                db_name=db_name,
                db_config=db_config,
                yaml_files=yaml_files,
                drop_existing=drop_existing,
            )
            results[db_name] = result
        except Exception as e:
            # Keep loading the remaining databases; the failure is
            # surfaced in the summary instead of aborting the run.
            print(f"\nERROR loading to {db_name}: {e}")
            results[db_name] = {'error': str(e)}

    # Summary across all requested databases.
    print(f"\n{'='*60}")
    print("MULTI-DATABASE LOAD SUMMARY")
    print(f"{'='*60}")
    for db_name, result in results.items():
        if 'error' in result:
            print(f"  {db_name}: FAILED - {result['error']}")
        else:
            print(f"  {db_name}: {result['processed']} processed, {result['total']} total in DB")
|
def main():
    """Parse command-line arguments and run the async loader."""
    parser = argparse.ArgumentParser(description="Load custodian data into PostgreSQL")
    parser.add_argument(
        "--drop-existing",
        action="store_true",
        help="Drop existing table and recreate",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of files to process (for testing)",
    )
    parser.add_argument(
        "--databases",
        type=str,
        help="Comma-separated list of databases to load (e.g., 'glam,glam_geo'). "
        "If not specified, uses single-database mode with env vars. "
        f"Available: {', '.join(DATABASES.keys())}"
    )
    args = parser.parse_args()

    # Split the optional comma-separated database list; None keeps the
    # loader in single-database (env-var) mode.
    db_names = (
        [part.strip() for part in args.databases.split(',')]
        if args.databases
        else None
    )

    asyncio.run(
        load_data(
            drop_existing=args.drop_existing,
            limit=args.limit,
            databases=db_names,
        )
    )
|
|
|
|
|
|
# Script entry point: only runs when executed directly, not on import.
if __name__ == "__main__":
    main()
|