#!/usr/bin/env python3
"""
Load ALL custodian data from data/custodian/ into PostgreSQL/PostGIS

This script replaces load_nde_data.py and reads directly from the
authoritative custodian YAML files, preserving ALL rich metadata including:

- Google Maps enrichment (ratings, reviews, photos, opening hours)
- YouTube enrichment (channel info, videos)
- Web claims (social media, logos, descriptions)
- Genealogiewerkbalk data
- GHCID identifiers and history
- Provenance tracking
- Temporal extent and successor organizations

Usage:
    python load_custodian_data.py [--drop-existing] [--limit N]
"""

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# Make project-local packages importable when run as a script.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Third-party dependencies are imported defensively so a missing package
# produces an actionable message instead of a bare traceback.
try:
    import asyncpg
except ImportError:
    print("Error: asyncpg not installed. Run: pip install asyncpg")
    sys.exit(1)

try:
    import yaml
    try:
        # Prefer the C-accelerated loader when libyaml is available.
        from yaml import CSafeLoader as SafeLoader
    except ImportError:
        from yaml import SafeLoader
except ImportError:
    print("Error: PyYAML not installed. Run: pip install pyyaml")
    sys.exit(1)

# ── Configuration ───────────────────────────────────────────────────────
# Server path: /mnt/data/custodian/
# Local path:  {project_root}/data/custodian/
DEFAULT_CUSTODIAN_DIR = (
    "/mnt/data/custodian"
    if os.path.exists("/mnt/data/custodian")
    else str(project_root / "data" / "custodian")
)
CUSTODIAN_DIR = Path(os.getenv("CUSTODIAN_DIR", DEFAULT_CUSTODIAN_DIR))

# Single database config (for backward compatibility)
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "glam_heritage")
POSTGRES_USER = os.getenv("POSTGRES_USER", "kempersc")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")

# Multi-database configuration for production.
# Production has two databases that need identical custodian data:
#   - glam:     main custodian data storage
#   - glam_geo: PostGIS geo API for bronhouder.nl map
DATABASES = {
    'glam': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam"),
        'user': os.getenv("POSTGRES_USER", "glam_api"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
    'glam_geo': {
        'host': os.getenv("GEO_POSTGRES_HOST", os.getenv("POSTGRES_HOST", "localhost")),
        'port': int(os.getenv("GEO_POSTGRES_PORT", os.getenv("POSTGRES_PORT", "5432"))),
        'database': os.getenv("GEO_POSTGRES_DB", "glam_geo"),
        'user': os.getenv("GEO_POSTGRES_USER", os.getenv("POSTGRES_USER", "glam_api")),
        'password': os.getenv("GEO_POSTGRES_PASSWORD", os.getenv("POSTGRES_PASSWORD", "")),
    },
    # Local development database
    'glam_heritage': {
        'host': os.getenv("POSTGRES_HOST", "localhost"),
        'port': int(os.getenv("POSTGRES_PORT", "5432")),
        'database': os.getenv("POSTGRES_DB", "glam_heritage"),
        'user': os.getenv("POSTGRES_USER", "kempersc"),
        'password': os.getenv("POSTGRES_PASSWORD", ""),
    },
}
# Institution type table (GLAMORCUBESFIXPHDNT): one row per type code.
# TYPE_COLORS and TYPE_NAMES are derived from it so they can never drift
# out of sync.
_TYPE_TABLE = (
    ('G', 'Gallery',     '#00bcd4'),
    ('L', 'Library',     '#2ecc71'),
    ('A', 'Archive',     '#3498db'),
    ('M', 'Museum',      '#e74c3c'),
    ('O', 'Official',    '#f39c12'),
    ('R', 'Research',    '#1abc9c'),
    ('C', 'Corporation', '#795548'),
    ('U', 'Unknown',     '#9e9e9e'),
    ('B', 'Botanical',   '#4caf50'),
    ('E', 'Education',   '#ff9800'),
    ('S', 'Society',     '#9b59b6'),
    ('F', 'Features',    '#95a5a6'),
    ('I', 'Intangible',  '#673ab7'),
    ('X', 'Mixed',       '#607d8b'),
    ('P', 'Personal',    '#ff5722'),
    ('H', 'Holy sites',  '#607d8b'),
    ('D', 'Digital',     '#34495e'),
    ('N', 'NGO',         '#e91e63'),
    ('T', 'Taste/smell', '#ff5722'),
)

TYPE_COLORS = {code: color for code, _name, color in _TYPE_TABLE}
TYPE_NAMES = {code: name for code, name, _color in _TYPE_TABLE}

CREATE_TABLE_SQL = """
-- Drop existing table if requested
DROP TABLE IF EXISTS custodians CASCADE;

CREATE TABLE custodians (
    id SERIAL PRIMARY KEY,

    -- ═══ CORE IDENTITY ═══
    name TEXT NOT NULL,
    verified_name TEXT,
    name_source TEXT,
    emic_name TEXT,              -- Name in original/native language

    -- ═══ CLASSIFICATION (GLAMORCUBESFIXPHDNT) ═══
    type CHAR(1),
    type_name TEXT,
    color VARCHAR(10),
    ch_annotator_hypernym TEXT,  -- e.g., GRP
    ch_annotator_subtype TEXT,   -- e.g., GRP.HER.MUS
    ontology_class TEXT,         -- e.g., schema:Museum

    -- ═══ GHCID (Global Heritage Custodian Identifier) ═══
    ghcid TEXT UNIQUE,
    ghcid_uuid UUID,
    ghcid_uuid_sha256 UUID,
    ghcid_numeric NUMERIC(20),
    record_id UUID,
    ghcid_original TEXT,
    ghcid_history JSONB,

    -- ═══ LOCATION ═══
    lat DOUBLE PRECISION,
    lon DOUBLE PRECISION,
    city TEXT,
    region TEXT,
    region_code TEXT,
    country TEXT,
    country_code CHAR(2),
    street_address TEXT,
    postal_code TEXT,
    formatted_address TEXT,
    geonames_id INTEGER,

    -- ═══ EXTERNAL IDENTIFIERS ═══
    wikidata_id TEXT,
    isil_code TEXT,
    viaf_id TEXT,
    google_place_id TEXT,
    kvk_number TEXT,
    ico_number TEXT,             -- Czech business identifier
    sigla TEXT,                  -- Czech library code
    identifiers JSONB,           -- All identifiers as JSONB array

    -- ═══ BASIC METADATA ═══
    website TEXT,
    email TEXT,
    phone TEXT,
    phone_international TEXT,
    description TEXT,

    -- ═══ GOOGLE MAPS ENRICHMENT ═══
    rating REAL,
    total_ratings INTEGER,
    business_status TEXT,
    google_maps_url TEXT,
    street_view_url TEXT,
    opening_hours JSONB,         -- weekday_text array and periods
    open_now BOOLEAN,
    reviews JSONB,               -- full array: author, rating, text, time
    photos JSONB,                -- URLs and metadata
    photo_urls TEXT[],
    google_maps_enrichment JSONB, -- full enrichment backup

    -- ═══ WIKIDATA ENRICHMENT ═══
    wikidata_label_nl TEXT,
    wikidata_label_en TEXT,
    wikidata_description_nl TEXT,
    wikidata_description_en TEXT,
    wikidata_types JSONB,        -- instance_of (P31) values
    wikidata_inception TEXT,
    wikidata_coordinates JSONB,
    wikidata_enrichment JSONB,

    -- ═══ YOUTUBE ENRICHMENT ═══
    youtube_channel_id TEXT,
    youtube_channel_url TEXT,
    youtube_subscriber_count INTEGER,
    youtube_video_count INTEGER,
    youtube_view_count BIGINT,
    youtube_enrichment JSONB,

    -- ═══ WEB CLAIMS (extracted from institutional websites) ═══
    social_facebook TEXT,
    social_twitter TEXT,
    social_instagram TEXT,
    social_linkedin TEXT,
    social_youtube TEXT,
    logo_url TEXT,
    web_claims JSONB,
    web_archives JSONB,

    -- ═══ GENEALOGIEWERKBALK (Dutch genealogy resources) ═══
    genealogiewerkbalk JSONB,

    -- ═══ ISIL REGISTRIES ═══
    nan_isil_enrichment JSONB,   -- Nationaal Archief ISIL
    kb_enrichment JSONB,         -- KB Netherlands Library Network

    -- ═══ MUSEUM REGISTER & ZCBS ═══
    museum_register JSONB,
    zcbs_enrichment JSONB,

    -- ═══ TEMPORAL EXTENT & HISTORY ═══
    founding_year INTEGER,
    founding_date DATE,
    dissolution_year INTEGER,
    dissolution_date DATE,
    temporal_extent JSONB,
    successor_organization JSONB,

    -- ═══ PROVENANCE ═══
    data_source TEXT,
    data_tier TEXT,
    extraction_date TIMESTAMPTZ,
    confidence_score REAL,
    provenance JSONB,

    -- ═══ CH-ANNOTATOR METADATA ═══
    ch_annotator JSONB,

    -- ═══ ORIGINAL ENTRY (full backup) ═══
    original_entry JSONB,

    -- ═══ TIMESTAMPS ═══
    source_file TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- ═══ INDEXES ═══

-- Core identity
CREATE INDEX idx_custodians_name ON custodians(name);
CREATE INDEX idx_custodians_name_gin ON custodians USING GIN (to_tsvector('simple', name));

-- Classification
CREATE INDEX idx_custodians_type ON custodians(type);
CREATE INDEX idx_custodians_type_name ON custodians(type_name);

-- GHCID
CREATE UNIQUE INDEX idx_custodians_ghcid ON custodians(ghcid);
CREATE INDEX idx_custodians_ghcid_uuid ON custodians(ghcid_uuid);
CREATE INDEX idx_custodians_record_id ON custodians(record_id);

-- Location
CREATE INDEX idx_custodians_city ON custodians(city);
CREATE INDEX idx_custodians_region ON custodians(region);
CREATE INDEX idx_custodians_country_code ON custodians(country_code);
CREATE INDEX idx_custodians_geonames_id ON custodians(geonames_id);

-- External identifiers
CREATE INDEX idx_custodians_wikidata_id ON custodians(wikidata_id);
CREATE INDEX idx_custodians_isil_code ON custodians(isil_code);
CREATE INDEX idx_custodians_google_place_id ON custodians(google_place_id);

-- Ratings
CREATE INDEX idx_custodians_rating ON custodians(rating);

-- Provenance
CREATE INDEX idx_custodians_data_source ON custodians(data_source);
CREATE INDEX idx_custodians_data_tier ON custodians(data_tier);
"""

SPATIAL_INDEX_SQL = """
-- Create a spatial index (requires PostGIS)
CREATE INDEX IF NOT EXISTS idx_custodians_geom ON custodians USING GIST (
    ST_SetSRID(ST_MakePoint(lon, lat), 4326)
) WHERE lat IS NOT NULL AND lon IS NOT NULL;
"""
def extract_coordinates(entry: Dict) -> tuple[Optional[float], Optional[float]]:
    """Return (lat, lon) for a custodian entry, or (None, None).

    Sources are tried in priority order:
      1. Google Maps enrichment coordinates
      2. Wikidata enrichment coordinates
      3. First element of the ``locations`` array (entry-level, falling
         back to ``original_entry.locations``)
      4. Singular ``location`` object

    Fixes over the previous version: sections that are present but
    explicitly null in the YAML (e.g. ``wikidata_enrichment: null``) no
    longer raise AttributeError — they are treated as absent via
    ``or {}`` guards — and the dead initial ``lat, lon`` assignment was
    removed.

    NOTE: a coordinate equal to 0 is treated as missing (truthiness
    check), deliberately preserving the existing behavior where 0/0
    marks "unknown".
    """
    # Priority 1: Google Maps coordinates
    coords = (entry.get('google_maps_enrichment') or {}).get('coordinates') or {}
    if coords.get('latitude') and coords.get('longitude'):
        return coords['latitude'], coords['longitude']

    # Priority 2: Wikidata coordinates
    wd_coords = (entry.get('wikidata_enrichment') or {}).get('wikidata_coordinates') or {}
    if wd_coords.get('latitude') and wd_coords.get('longitude'):
        return wd_coords['latitude'], wd_coords['longitude']

    # Priority 3: locations array (null-safe on original_entry)
    locations = entry.get('locations') or (entry.get('original_entry') or {}).get('locations', [])
    if isinstance(locations, list) and locations:
        loc = locations[0] or {}
        if loc.get('latitude') and loc.get('longitude'):
            return loc['latitude'], loc['longitude']

    # Priority 4: location object
    location = entry.get('location') or {}
    if location.get('latitude') and location.get('longitude'):
        return location['latitude'], location['longitude']

    return None, None
def _infer_institution_type(
    ghcid_data: Dict,
    original: Dict,
    ch_annotator: Dict,
    wikidata: Dict,
) -> str:
    """Infer the single-letter GLAMORCUBESFIXPHDNT type code.

    Sources are tried in priority order; returns 'U' (Unknown) when
    nothing matches:
      1. GHCID type letter (most reliable — already normalized)
      2. original_entry.type (Dutch NDE list of letters, or
         CH-Annotator dotted string like GRP.HER.MUS)
      3. original_entry.institution_type (full-name format)
      4. original_entry.type_organisatie (Dutch NDE CSV field)
      5. CH-Annotator entity_classification subtype
      6. Wikidata instance_of labels
    """
    # 1. GHCID, e.g. NL-NH-AMS-M-RM -> M
    ghcid_current = ghcid_data.get('ghcid_current', '')
    if ghcid_current and len(ghcid_current.split('-')) >= 4:
        ghcid_type = ghcid_current.split('-')[3]
        if ghcid_type and len(ghcid_type) == 1 and ghcid_type in TYPE_COLORS:
            return ghcid_type

    # 2. original_entry.type
    types = original.get('type', [])
    if isinstance(types, list) and types:
        # Dutch format: type: [M] or type: [M, A]
        first_type = types[0]
        if first_type and len(str(first_type)) == 1 and str(first_type).upper() in TYPE_COLORS:
            return str(first_type).upper()
    elif isinstance(types, str) and types:
        # CH-Annotator format: type: GRP.HER.MUS
        if types.startswith('GRP.HER.'):
            ch_type_map = {
                'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X', 'GRP.HER': 'U',
            }
            mapped = ch_type_map.get(types, 'U')
            if mapped != 'U':
                return mapped
        elif len(types) == 1 and types.upper() in TYPE_COLORS:
            return types.upper()

    # 3. original_entry.institution_type (full-name format)
    inst_type_str = original.get('institution_type', '')
    if inst_type_str:
        type_map = {v.upper(): k for k, v in TYPE_NAMES.items()}
        # Also add common variations
        type_map.update({
            'GALLERY': 'G', 'LIBRARY': 'L', 'ARCHIVE': 'A', 'MUSEUM': 'M',
            'OFFICIAL': 'O', 'OFFICIAL_INSTITUTION': 'O',
            'RESEARCH': 'R', 'RESEARCH_CENTER': 'R',
            'CORPORATION': 'C', 'UNKNOWN': 'U',
            'BOTANICAL': 'B', 'BOTANICAL_ZOO': 'B',
            'EDUCATION': 'E', 'EDUCATION_PROVIDER': 'E',
            'SOCIETY': 'S', 'COLLECTING_SOCIETY': 'S',
            'FEATURES': 'F',
            'INTANGIBLE': 'I', 'INTANGIBLE_HERITAGE_GROUP': 'I',
            'MIXED': 'X',
            'PERSONAL': 'P', 'PERSONAL_COLLECTION': 'P',
            'HOLY_SITES': 'H',
            'DIGITAL': 'D', 'DIGITAL_PLATFORM': 'D',
            'NGO': 'N', 'TASTE_SMELL': 'T',
        })
        mapped = type_map.get(inst_type_str.upper(), 'U')
        if mapped != 'U':
            return mapped

    # 4. type_organisatie (Dutch NDE CSV field)
    type_org = original.get('type_organisatie', '')
    if type_org:
        type_org_map = {
            'museum': 'M', 'archief': 'A', 'bibliotheek': 'L',
            'galerie': 'G', 'onderzoek': 'R', 'erfgoed': 'O',
            'onderwijs': 'E', 'vereniging': 'S', 'stichting': 'N',
        }
        mapped = type_org_map.get(type_org.lower(), 'U')
        if mapped != 'U':
            return mapped

    # 5. CH-Annotator entity_classification subtype
    if ch_annotator:
        subtype = (ch_annotator.get('entity_classification') or {}).get('subtype', '')
        if subtype:
            ch_subtype_map = {
                'GRP.HER.GAL': 'G', 'GRP.HER.LIB': 'L', 'GRP.HER.ARC': 'A',
                'GRP.HER.MUS': 'M', 'GRP.HER.MIX': 'X',
            }
            mapped = ch_subtype_map.get(subtype, 'U')
            if mapped != 'U':
                return mapped

    # 6. Infer from Wikidata instance_of labels (substring match, first hit wins)
    if wikidata:
        wikidata_types = wikidata.get('wikidata_instance_of', [])
        if isinstance(wikidata_types, list):
            wd_type_map = {
                'museum': 'M', 'art museum': 'M', 'history museum': 'M',
                'natural history museum': 'M', 'science museum': 'M',
                'archive': 'A', 'national archive': 'A', 'state archive': 'A',
                'library': 'L', 'public library': 'L', 'national library': 'L',
                'research institute': 'R', 'research center': 'R',
                'university': 'E', 'college': 'E', 'school': 'E',
                'botanical garden': 'B', 'zoo': 'B', 'aquarium': 'B',
                'art gallery': 'G', 'gallery': 'G',
                'organization': 'N', 'non-profit organization': 'N',
                'foundation': 'N',
                'association': 'S', 'society': 'S',
                'church': 'H', 'monastery': 'H', 'temple': 'H', 'mosque': 'H',
            }
            for wdt in wikidata_types:
                wdt_lower = str(wdt).lower() if wdt else ''
                for pattern, type_code in wd_type_map.items():
                    if pattern in wdt_lower:
                        return type_code

    return 'U'


def extract_custodian_data(entry: Dict, source_file: str) -> Dict[str, Any]:
    """Flatten a custodian YAML entry into a dict of database column values.

    Args:
        entry: Parsed custodian YAML document.
        source_file: Name of the file the entry came from (provenance).

    Returns:
        Dict keyed by custodians-table column name.
    """
    # ``or {}`` guards against sections that are present but null in YAML.
    original = entry.get('original_entry') or {}
    google_maps = entry.get('google_maps_enrichment') or {}
    wikidata = entry.get('wikidata_enrichment') or {}
    youtube = entry.get('youtube_enrichment') or {}
    web_claims = entry.get('web_claims') or {}
    web_enrichment = entry.get('web_enrichment') or {}
    ghcid_data = entry.get('ghcid') or {}
    provenance = entry.get('provenance') or {}
    custodian_name = entry.get('custodian_name') or {}
    ch_annotator = entry.get('ch_annotator') or {}

    # Get coordinates
    lat, lon = extract_coordinates(entry)

    # Get institution type (six-source priority chain)
    inst_type = _infer_institution_type(ghcid_data, original, ch_annotator, wikidata)

    # Name priority: verified claim > Dutch Wikidata label > NDE fields.
    name = (
        custodian_name.get('claim_value')
        or wikidata.get('wikidata_label_nl')
        or original.get('organisatie')
        or original.get('name')
        or 'Unknown Institution'
    )

    # Get location info
    locations = entry.get('locations', original.get('locations', []))
    first_loc = locations[0] if locations and isinstance(locations, list) else {}
    if not isinstance(first_loc, dict):
        first_loc = {}

    # BUG FIX: the previous version wrote
    #     a or b or c if cond else ''
    # which Python parses as ``(a or b or c) if cond else ''`` — so any
    # entry without a Google short_address lost its NDE/location city
    # too.  Parenthesize the conditional so it only governs the Google
    # fallback.
    city = (
        original.get('plaatsnaam_bezoekadres')
        or first_loc.get('city')
        or (google_maps.get('short_address', '').split(',')[-1].strip()
            if google_maps.get('short_address') else '')
    )

    # Pull well-known identifier schemes out of the identifiers array.
    identifiers = entry.get('identifiers') or []
    wikidata_id = isil_code = viaf_id = kvk_number = ico_number = sigla = None
    for ident in identifiers:
        scheme = ident.get('identifier_scheme', '')
        value = ident.get('identifier_value', '')
        if scheme == 'Wikidata':
            wikidata_id = value
        elif scheme == 'ISIL':
            isil_code = value
        elif scheme == 'VIAF':
            viaf_id = value
        elif scheme == 'KvK':
            kvk_number = value
        elif scheme == 'IČO':
            ico_number = value
        elif scheme == 'Sigla':
            sigla = value

    # Extract social media / logo links from web claims.
    social: Dict[str, str] = {}
    claim_keys = {
        'social_facebook': 'facebook',
        'social_twitter': 'twitter',
        'social_instagram': 'instagram',
        'social_linkedin': 'linkedin',
        'social_youtube': 'youtube',
        'logo': 'logo',
    }
    for claim in web_claims.get('claims') or []:
        key = claim_keys.get(claim.get('claim_type', ''))
        if key:
            social[key] = claim.get('claim_value', '')

    # YouTube enrichment comes in two shapes:
    #   nested (Dutch files):  youtube_enrichment.channel.channel_id
    #   flat (UNESCO/other):   youtube_enrichment.channel_id
    if youtube.get('channel'):
        yt_channel = youtube.get('channel') or {}
    elif youtube.get('channel_id'):
        yt_channel = youtube
    else:
        yt_channel = {}

    classification = ch_annotator.get('entity_classification') or {}
    loc_res = ghcid_data.get('location_resolution') or {}

    # Build result
    return {
        # Core identity
        'name': name,
        'verified_name': custodian_name.get('claim_value'),
        'name_source': custodian_name.get('extraction_method'),
        'emic_name': custodian_name.get('emic_name'),
        # Classification
        'type': inst_type,
        'type_name': TYPE_NAMES.get(inst_type, 'Unknown'),
        'color': TYPE_COLORS.get(inst_type, '#9e9e9e'),
        'ch_annotator_hypernym': classification.get('hypernym'),
        'ch_annotator_subtype': classification.get('subtype'),
        'ontology_class': classification.get('ontology_class'),
        # GHCID
        'ghcid': ghcid_data.get('ghcid_current'),
        'ghcid_uuid': ghcid_data.get('ghcid_uuid'),
        'ghcid_uuid_sha256': ghcid_data.get('ghcid_uuid_sha256'),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric'),
        'record_id': ghcid_data.get('record_id'),
        'ghcid_original': ghcid_data.get('ghcid_original'),
        'ghcid_history': ghcid_data.get('ghcid_history'),
        # Location
        'lat': lat,
        'lon': lon,
        'city': city,
        'region': first_loc.get('region') or loc_res.get('region_name'),
        'region_code': ensure_str(loc_res.get('region_code')),
        'country': ensure_str(first_loc.get('country')),
        'country_code': ensure_str(loc_res.get('country_code')),
        'street_address': first_loc.get('street_address') or google_maps.get('formatted_address'),
        'postal_code': first_loc.get('postal_code'),
        'formatted_address': google_maps.get('formatted_address'),
        'geonames_id': loc_res.get('geonames_id') or ghcid_data.get('geonames_id'),
        # External identifiers
        'wikidata_id': wikidata_id or wikidata.get('wikidata_entity_id'),
        'isil_code': isil_code or original.get('isil-code_na'),
        'viaf_id': viaf_id,
        'google_place_id': google_maps.get('place_id'),
        'kvk_number': kvk_number,
        'ico_number': ico_number,
        'sigla': sigla,
        'identifiers': identifiers,
        # Basic metadata
        'website': ensure_str(
            google_maps.get('website')
            or wikidata.get('wikidata_official_website')
            or original.get('webadres_organisatie')
        ),
        'email': None,  # Extract from web_claims if present
        'phone': google_maps.get('phone_local'),
        'phone_international': google_maps.get('phone_international'),
        'description': wikidata.get('wikidata_description_nl') or wikidata.get('wikidata_description_en'),
        # Google Maps enrichment
        'rating': google_maps.get('rating'),
        'total_ratings': google_maps.get('total_ratings'),
        'business_status': google_maps.get('business_status'),
        'google_maps_url': google_maps.get('google_maps_url'),
        'street_view_url': google_maps.get('street_view_url'),
        'opening_hours': google_maps.get('opening_hours'),
        'open_now': (google_maps.get('opening_hours') or {}).get('open_now'),
        'reviews': google_maps.get('reviews'),
        'photos': google_maps.get('photos_metadata'),
        'photo_urls': google_maps.get('photo_urls'),
        'google_maps_enrichment': google_maps or None,
        # Wikidata enrichment
        'wikidata_label_nl': wikidata.get('wikidata_label_nl'),
        'wikidata_label_en': wikidata.get('wikidata_label_en'),
        'wikidata_description_nl': wikidata.get('wikidata_description_nl'),
        'wikidata_description_en': wikidata.get('wikidata_description_en'),
        'wikidata_types': wikidata.get('wikidata_instance_of'),
        'wikidata_inception': str(wikidata.get('wikidata_inception')) if wikidata.get('wikidata_inception') else None,
        'wikidata_coordinates': wikidata.get('wikidata_coordinates'),
        'wikidata_enrichment': wikidata or None,
        # YouTube enrichment
        'youtube_channel_id': yt_channel.get('channel_id'),
        'youtube_channel_url': yt_channel.get('channel_url'),
        'youtube_subscriber_count': yt_channel.get('subscriber_count'),
        'youtube_video_count': yt_channel.get('video_count'),
        'youtube_view_count': yt_channel.get('view_count'),
        'youtube_enrichment': youtube if youtube.get('status') == 'SUCCESS' else None,
        # Web claims
        'social_facebook': social.get('facebook'),
        'social_twitter': social.get('twitter'),
        'social_instagram': social.get('instagram'),
        'social_linkedin': social.get('linkedin'),
        'social_youtube': social.get('youtube'),
        'logo_url': social.get('logo'),
        'web_claims': web_claims if web_claims.get('claims') else None,
        'web_archives': web_enrichment.get('web_archives'),
        # Genealogiewerkbalk
        'genealogiewerkbalk': entry.get('genealogiewerkbalk_enrichment'),
        # ISIL registries
        'nan_isil_enrichment': entry.get('nan_isil_enrichment'),
        'kb_enrichment': entry.get('kb_enrichment'),
        # Museum register & ZCBS
        'museum_register': entry.get('museum_register_enrichment'),
        'zcbs_enrichment': entry.get('zcbs_enrichment'),
        # Temporal extent
        'founding_year': None,  # TODO: extract from wikidata_inception
        'founding_date': None,
        'dissolution_year': None,
        'dissolution_date': None,
        'temporal_extent': entry.get('temporal_extent'),
        'successor_organization': entry.get('successor_organization'),
        # Provenance
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': parse_datetime(provenance.get('extraction_date')),
        'confidence_score': provenance.get('confidence_score'),
        'provenance': provenance or None,
        # CH-Annotator
        'ch_annotator': ch_annotator or None,
        # Original entry
        'original_entry': original or None,
        # Source file
        'source_file': source_file,
    }
def to_json(value: Any) -> Optional[str]:
    """Serialize *value* to a JSON string; pass None through unchanged."""
    if value is None:
        return None
    return json.dumps(value)


def parse_datetime(value: Any) -> Optional[datetime]:
    """Parse *value* into a timezone-aware datetime, or return None.

    Accepts datetime instances (returned as-is) and ISO-8601 strings.
    Naive timestamps are assumed to be UTC; explicit offsets are
    preserved.  Unparseable input yields None.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        try:
            # fromisoformat handles offsets, but (before 3.11) not the
            # 'Z' suffix — normalize it first.
            dt = datetime.fromisoformat(value.replace('Z', '+00:00'))
        except (ValueError, AttributeError):
            return None
        # BUG FIX: the previous version checked only for '+' or 'Z' and
        # stamped UTC onto everything else, clobbering negative offsets
        # like '-05:00'.  Only assume UTC when the value is truly naive.
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    return None


def ensure_str(value: Any) -> Optional[str]:
    """Coerce *value* to a string suitable for a TEXT column, or None.

    Booleans are rejected (True/False are not meaningful text), lists
    collapse to their first element (e.g. "take the first URL"), and
    anything else is stringified.
    """
    if value is None:
        return None
    # bool must be tested before numeric coercion: bool subclasses int.
    if isinstance(value, bool):
        return None
    if isinstance(value, list):
        if not value:
            return None
        # Recurse so a non-string first element is still normalized
        # (the previous version returned the raw element, and its
        # `value[0] if len == 1 else value[0]` ternary was a no-op).
        return ensure_str(value[0])
    return str(value)
async def load_data_to_database(
    db_name: str,
    db_config: Dict[str, Any],
    yaml_files: List[Path],
    drop_existing: bool = False,
) -> Dict[str, int]:
    """Load custodian data into a single database.

    Args:
        db_name: Name of the database (for logging)
        db_config: Database connection configuration
        yaml_files: List of YAML files to process
        drop_existing: Whether to drop and recreate the table

    Returns:
        Dict with counts: processed, skipped, errors, total
    """
    print(f"\n{'='*60}")
    print(f"Loading data to: {db_name}")
    print(f"{'='*60}")
    print(f"Connecting to PostgreSQL at {db_config['host']}:{db_config['port']}/{db_config['database']}...")

    conn = await asyncpg.connect(
        host=db_config['host'],
        port=db_config['port'],
        database=db_config['database'],
        user=db_config['user'],
        password=db_config['password'],
    )

    processed = 0
    skipped = 0
    errors = 0

    try:
        if drop_existing:
            print("Creating custodians table (dropping existing)...")
            await conn.execute(CREATE_TABLE_SQL)
            # The spatial index requires PostGIS; degrade gracefully without it.
            try:
                await conn.execute(SPATIAL_INDEX_SQL)
                print("  Created spatial index (PostGIS)")
            except Exception as e:
                print(f"  Skipped spatial index (PostGIS not available): {e}")

        print(f"Processing {len(yaml_files)} custodian files...")

        # One canonical column list drives the INSERT column names, the
        # $1..$n placeholders AND the per-row values tuple, so the three
        # can never drift out of sync.
        columns = [
            'name', 'verified_name', 'name_source', 'emic_name',
            'type', 'type_name', 'color',
            'ch_annotator_hypernym', 'ch_annotator_subtype', 'ontology_class',
            'ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric',
            'record_id', 'ghcid_original', 'ghcid_history',
            'lat', 'lon', 'city', 'region', 'region_code', 'country',
            'country_code', 'street_address', 'postal_code',
            'formatted_address', 'geonames_id',
            'wikidata_id', 'isil_code', 'viaf_id', 'google_place_id',
            'kvk_number', 'ico_number', 'sigla', 'identifiers',
            'website', 'email', 'phone', 'phone_international', 'description',
            'rating', 'total_ratings', 'business_status', 'google_maps_url',
            'street_view_url', 'opening_hours', 'open_now', 'reviews',
            'photos', 'photo_urls', 'google_maps_enrichment',
            'wikidata_label_nl', 'wikidata_label_en',
            'wikidata_description_nl', 'wikidata_description_en',
            'wikidata_types', 'wikidata_inception', 'wikidata_coordinates',
            'wikidata_enrichment',
            'youtube_channel_id', 'youtube_channel_url',
            'youtube_subscriber_count', 'youtube_video_count',
            'youtube_view_count', 'youtube_enrichment',
            'social_facebook', 'social_twitter', 'social_instagram',
            'social_linkedin', 'social_youtube', 'logo_url',
            'web_claims', 'web_archives', 'genealogiewerkbalk',
            'nan_isil_enrichment', 'kb_enrichment', 'museum_register',
            'zcbs_enrichment',
            'founding_year', 'founding_date', 'dissolution_year',
            'dissolution_date', 'temporal_extent', 'successor_organization',
            'data_source', 'data_tier', 'extraction_date',
            'confidence_score', 'provenance', 'ch_annotator',
            'original_entry', 'source_file',
        ]
        # Columns whose Python value must be serialized to a JSON string
        # before being handed to asyncpg (JSONB columns).
        json_columns = {
            'ghcid_history', 'identifiers', 'opening_hours', 'reviews',
            'photos', 'google_maps_enrichment', 'wikidata_types',
            'wikidata_coordinates', 'wikidata_enrichment',
            'youtube_enrichment', 'web_claims', 'web_archives',
            'genealogiewerkbalk', 'nan_isil_enrichment', 'kb_enrichment',
            'museum_register', 'zcbs_enrichment', 'temporal_extent',
            'successor_organization', 'provenance', 'ch_annotator',
            'original_entry',
        }
        placeholders = ', '.join(f'${i}' for i in range(1, len(columns) + 1))
        # Upsert on GHCID; the DO UPDATE list refreshes identity, location
        # and all enrichment / web-claims fields.
        insert_sql = f"""
        INSERT INTO custodians ({', '.join(columns)})
        VALUES ({placeholders})
        ON CONFLICT (ghcid) DO UPDATE SET
            name = EXCLUDED.name,
            verified_name = EXCLUDED.verified_name,
            emic_name = EXCLUDED.emic_name,
            type = EXCLUDED.type,
            type_name = EXCLUDED.type_name,
            lat = EXCLUDED.lat,
            lon = EXCLUDED.lon,
            city = EXCLUDED.city,
            region = EXCLUDED.region,
            country_code = EXCLUDED.country_code,
            website = EXCLUDED.website,
            description = EXCLUDED.description,
            rating = EXCLUDED.rating,
            total_ratings = EXCLUDED.total_ratings,
            reviews = EXCLUDED.reviews,
            photos = EXCLUDED.photos,
            photo_urls = EXCLUDED.photo_urls,
            opening_hours = EXCLUDED.opening_hours,
            google_maps_enrichment = EXCLUDED.google_maps_enrichment,
            wikidata_enrichment = EXCLUDED.wikidata_enrichment,
            youtube_enrichment = EXCLUDED.youtube_enrichment,
            social_facebook = EXCLUDED.social_facebook,
            social_twitter = EXCLUDED.social_twitter,
            social_instagram = EXCLUDED.social_instagram,
            social_linkedin = EXCLUDED.social_linkedin,
            social_youtube = EXCLUDED.social_youtube,
            logo_url = EXCLUDED.logo_url,
            web_claims = EXCLUDED.web_claims,
            web_archives = EXCLUDED.web_archives,
            updated_at = NOW()
        """

        for index, yaml_file in enumerate(yaml_files):
            if (index + 1) % 1000 == 0 or index == 0:
                print(f"  Processing {index + 1}/{len(yaml_files)}...")
            try:
                with open(yaml_file, 'r', encoding='utf-8') as fh:
                    entry = yaml.load(fh, Loader=SafeLoader)
                if not entry:
                    skipped += 1
                    continue

                data = extract_custodian_data(entry, yaml_file.name)

                # GHCID is the upsert key; rows without one cannot be loaded.
                if not data['ghcid']:
                    skipped += 1
                    continue

                values = tuple(
                    to_json(data[col]) if col in json_columns else data[col]
                    for col in columns
                )
                await conn.execute(insert_sql, *values)
                processed += 1
            except Exception as e:
                errors += 1
                if errors <= 10:
                    print(f"  Error processing {yaml_file.name}: {e}")
                elif errors == 11:
                    print("  ... suppressing further error messages")

        # Get final count
        count = await conn.fetchval("SELECT COUNT(*) FROM custodians")
        print(f"\n  [{db_name}] LOAD COMPLETE")
        print(f"  Files processed: {processed}")
        print(f"  Files skipped: {skipped}")
        print(f"  Errors: {errors}")
        print(f"  Total in DB: {count}")
        return {'processed': processed, 'skipped': skipped, 'errors': errors, 'total': count}
    finally:
        await conn.close()
async def load_data(
    drop_existing: bool = False,
    limit: Optional[int] = None,
    databases: Optional[List[str]] = None,
):
    """Load all custodian data into one or more PostgreSQL databases.

    Args:
        drop_existing: Whether to drop and recreate tables
        limit: Optional limit on number of files to process
        databases: List of database names to load into. If None, uses
            single-database mode with environment variables (backward
            compatible).
    """
    # Collect the input files once; every target database gets the same set.
    print(f"\nReading custodian files from: {CUSTODIAN_DIR}")
    yaml_files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
    print(f"Found {len(yaml_files)} custodian files")

    if limit:
        yaml_files = yaml_files[:limit]
        print(f"Processing first {limit} files only")

    # Single database mode (backward compatible)
    if databases is None:
        print("\nUsing single-database mode (backward compatible)")
        await load_data_to_database(
            db_name=POSTGRES_DB,
            db_config={
                'host': POSTGRES_HOST,
                'port': POSTGRES_PORT,
                'database': POSTGRES_DB,
                'user': POSTGRES_USER,
                'password': POSTGRES_PASSWORD,
            },
            yaml_files=yaml_files,
            drop_existing=drop_existing,
        )
        return

    # Multi-database mode: load the same files into each named target.
    print(f"\nUsing multi-database mode: {', '.join(databases)}")
    results = {}
    for db_name in databases:
        if db_name not in DATABASES:
            print(f"\nERROR: Unknown database '{db_name}'. Available: {', '.join(DATABASES.keys())}")
            continue
        try:
            results[db_name] = await load_data_to_database(
                db_name=db_name,
                db_config=DATABASES[db_name],
                yaml_files=yaml_files,
                drop_existing=drop_existing,
            )
        except Exception as e:
            # A failure on one target must not abort the remaining loads.
            print(f"\nERROR loading to {db_name}: {e}")
            results[db_name] = {'error': str(e)}

    # Summary
    print(f"\n{'='*60}")
    print("MULTI-DATABASE LOAD SUMMARY")
    print(f"{'='*60}")
    for db_name, result in results.items():
        if 'error' in result:
            print(f"  {db_name}: FAILED - {result['error']}")
        else:
            print(f"  {db_name}: {result['processed']} processed, {result['total']} total in DB")
def main():
    """Parse command-line arguments and run the async loader."""
    parser = argparse.ArgumentParser(description="Load custodian data into PostgreSQL")
    parser.add_argument("--drop-existing", action="store_true",
                        help="Drop existing table and recreate")
    parser.add_argument("--limit", type=int,
                        help="Limit number of files to process (for testing)")
    parser.add_argument(
        "--databases",
        type=str,
        help="Comma-separated list of databases to load (e.g., 'glam,glam_geo'). "
             "If not specified, uses single-database mode with env vars. "
             f"Available: {', '.join(DATABASES.keys())}"
    )
    args = parser.parse_args()

    # None keeps the backward-compatible single-database path.
    databases = None
    if args.databases:
        databases = [db.strip() for db in args.databases.split(',')]

    asyncio.run(load_data(
        drop_existing=args.drop_existing,
        limit=args.limit,
        databases=databases,
    ))


if __name__ == "__main__":
    main()