#!/usr/bin/env python3
|
|
"""
|
|
Load all custodian YAML files into DuckLake as raw enriched data.
|
|
|
|
v3 - Enhanced coordinate extraction with cascading fallback across 10 known patterns.
|
|
|
|
This script reads all YAML files from data/custodian/ and uploads them
|
|
to DuckLake as a single table for the bronze/raw data lake layer.
|
|
|
|
Usage:
|
|
python scripts/load_custodians_to_ducklake_v3.py [--dry-run] [--limit N]
|
|
|
|
API Endpoint: http://localhost:8765/upload
|
|
|
|
COORDINATE EXTRACTION PRIORITY (highest to lowest):
|
|
1. manual_location_override.coordinates - User-verified corrections
|
|
2. google_maps_enrichment.coordinates - Most accurate for operational institutions
|
|
3. wikidata_enrichment.wikidata_coordinates - Good coverage with precision
|
|
4. wikidata_enrichment.coordinates - Verified Wikidata data
|
|
5. ghcid.location_resolution.source_coordinates - GeoNames-resolved
|
|
6. ghcid.location_resolution (direct) - Early GeoNames integration
|
|
7. original_entry.locations[0] - CH-Annotator files
|
|
8. locations[0] - Root-level locations array
|
|
9. location (root level) - Single location object
|
|
10. Do NOT use: google_maps_enrichment_invalid, unesco nearby sites
|
|
|
|
PROVENANCE TRACKING:
|
|
Each coordinate extraction records the source for debugging.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
import yaml
|
|
|
|
# Configuration
|
|
DUCKLAKE_URL = os.getenv("DUCKLAKE_URL", "http://localhost:8765")
|
|
DEFAULT_CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
|
|
TABLE_NAME = "custodians_raw"
|
|
|
|
|
|
def extract_coordinates(data: dict) -> tuple[float | None, float | None, str]:
|
|
"""
|
|
Extract coordinates using cascading priority fallback.
|
|
|
|
Returns:
|
|
Tuple of (latitude, longitude, source) where source indicates
|
|
which pattern was used for provenance tracking.
|
|
"""
|
|
|
|
# Priority 1: manual_location_override (user-verified corrections)
|
|
override = data.get("manual_location_override", {})
|
|
if override:
|
|
coords = override.get("coordinates", {})
|
|
lat = coords.get("latitude")
|
|
lon = coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "manual_location_override"
|
|
|
|
# Priority 2: google_maps_enrichment (most accurate for operational institutions)
|
|
gm = data.get("google_maps_enrichment", {})
|
|
if gm:
|
|
coords = gm.get("coordinates", {})
|
|
lat = coords.get("latitude")
|
|
lon = coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "google_maps_enrichment"
|
|
|
|
# Priority 3: wikidata_enrichment.wikidata_coordinates (with precision)
|
|
wd = data.get("wikidata_enrichment", {})
|
|
if wd:
|
|
# First try wikidata_coordinates (most common pattern)
|
|
wd_coords = wd.get("wikidata_coordinates", {})
|
|
if wd_coords:
|
|
lat = wd_coords.get("latitude")
|
|
lon = wd_coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "wikidata_enrichment.wikidata_coordinates"
|
|
|
|
# Priority 4: wikidata_enrichment.coordinates (alternative structure)
|
|
coords = wd.get("coordinates", {})
|
|
if coords:
|
|
lat = coords.get("latitude")
|
|
lon = coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "wikidata_enrichment.coordinates"
|
|
|
|
# Priority 5: ghcid.location_resolution.source_coordinates (GeoNames-resolved)
|
|
ghcid = data.get("ghcid", {})
|
|
if ghcid:
|
|
loc_res = ghcid.get("location_resolution", {})
|
|
if loc_res:
|
|
# Try source_coordinates first
|
|
src_coords = loc_res.get("source_coordinates", {})
|
|
if src_coords:
|
|
lat = src_coords.get("latitude")
|
|
lon = src_coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "ghcid.location_resolution.source_coordinates"
|
|
|
|
# Priority 6: ghcid.location_resolution direct (older format)
|
|
lat = loc_res.get("latitude")
|
|
lon = loc_res.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "ghcid.location_resolution"
|
|
|
|
# Priority 7: original_entry.locations[0] (CH-Annotator files)
|
|
original = data.get("original_entry", {})
|
|
if original:
|
|
locations = original.get("locations", [])
|
|
if locations and isinstance(locations, list) and len(locations) > 0:
|
|
loc = locations[0]
|
|
lat = loc.get("latitude")
|
|
lon = loc.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "original_entry.locations[0]"
|
|
|
|
# Priority 8: locations[0] (root-level locations array)
|
|
locations = data.get("locations", [])
|
|
if locations and isinstance(locations, list) and len(locations) > 0:
|
|
loc = locations[0]
|
|
lat = loc.get("latitude")
|
|
lon = loc.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "locations[0]"
|
|
|
|
# Some files have coordinates nested under "coordinates"
|
|
coords = loc.get("coordinates", {})
|
|
if coords:
|
|
lat = coords.get("latitude")
|
|
lon = coords.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "locations[0].coordinates"
|
|
|
|
# Priority 9: location (root-level single object)
|
|
location = data.get("location", {})
|
|
if location and isinstance(location, dict):
|
|
lat = location.get("latitude")
|
|
lon = location.get("longitude")
|
|
if lat is not None and lon is not None:
|
|
return lat, lon, "location"
|
|
|
|
# No coordinates found
|
|
return None, None, ""
|
|
|
|
|
|
def extract_city_country(data: dict) -> tuple[str, str]:
    """
    Extract city and country using cascading fallback.

    Each known location pattern is consulted in decreasing order of
    trust, filling in whichever of city/country is still missing, so
    the two values may come from different sources.

    Args:
        data: Parsed YAML dict for a single custodian file.

    Returns:
        Tuple of (city, country); either may be "" when no source
        provided it. As a last resort the country is derived from the
        two-letter prefix of the GHCID identifier.
    """
    city = ""
    country = ""

    def _fill_from(loc) -> None:
        # Fill missing city/country from a {"city": ..., "country": ...} dict.
        nonlocal city, country
        if not isinstance(loc, dict):
            return
        if not city:
            city = loc.get("city", "")
        if not country:
            country = loc.get("country", "")

    # Priority 1: Google Maps address components (most accurate)
    gm = data.get("google_maps_enrichment") or {}
    if isinstance(gm, dict):
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "locality" in types and not city:
                city = comp.get("long_name", "")
            if "country" in types and not country:
                country = comp.get("short_name", "")

    # Priority 2: ghcid.location_resolution (GeoNames-backed)
    if not city or not country:
        ghcid = data.get("ghcid") or {}
        loc_res = ghcid.get("location_resolution") if isinstance(ghcid, dict) else None
        if loc_res:
            if not city:
                city = (
                    loc_res.get("geonames_name", "")
                    or loc_res.get("city_name", "")
                    or loc_res.get("google_maps_locality", "")
                )
            if not country:
                country = loc_res.get("country_code", "")

    # Priority 3: location (root level single object)
    if not city or not country:
        _fill_from(data.get("location"))

    # Priority 4: locations[0] (root level array)
    if not city or not country:
        locations = data.get("locations") or []
        if isinstance(locations, list) and locations:
            _fill_from(locations[0])

    # Priorities 5-6: original_entry.locations[0], then its direct fields.
    if not city or not country:
        original = data.get("original_entry") or {}
        if isinstance(original, dict) and original:
            locations = original.get("locations") or []
            if isinstance(locations, list) and locations:
                _fill_from(locations[0])
            if not city:
                # "plaatsnaam_bezoekadres" is the Dutch source field for
                # the visiting-address town.
                city = original.get("plaatsnaam_bezoekadres", "") or original.get("city", "")
            if not country:
                country = original.get("country", "")

    # Ultimate fallback: GHCID strings start with a two-letter country code.
    # Guarded with `or {}` so a YAML `ghcid: null` cannot raise here (the
    # previous version called .get on a possible None at this point).
    if not country:
        ghcid_current = (data.get("ghcid") or {}).get("ghcid_current", "")
        if ghcid_current and len(ghcid_current) >= 2:
            country = ghcid_current[:2]

    return city, country
|
|
|
|
|
|
def extract_financial_documents(data: dict) -> dict:
    """
    Extract financial document metadata from web_claims.

    Scans the claim list for four document-type URL claims
    (jaarverslag, jaarstukken, ANBI publication, beleidsplan) plus
    "financial_document_year" claims that associate a URL with a
    publication year, then denormalizes everything into flat fields.

    Returns:
        Dict with has_*, *_count, and latest_*_url fields; the annual
        report additionally gets latest_annual_report_year.
    """
    result = {
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
    }

    claims = (data.get("web_claims") or {}).get("claims") or []
    if not claims:
        return result

    # Bucket URL claims by claim_type; collect URL -> year mappings.
    buckets: dict[str, list] = {
        "annual_report_url": [],
        "financial_statement_url": [],
        "anbi_publication_url": [],
        "policy_document_url": [],
    }
    url_years: dict[str, int] = {}

    for claim in claims:
        ctype = claim.get("claim_type", "")
        cvalue = claim.get("claim_value", "")

        if ctype == "financial_document_year":
            # raw_value looks like "year=2024, from=annual_report_url, url=..."
            raw = claim.get("raw_value", "")
            if "url=" in raw:
                url_part = raw.split("url=")[-1]
                try:
                    url_years[url_part] = int(cvalue)
                except (ValueError, TypeError):
                    pass
        elif ctype in buckets:
            buckets[ctype].append(cvalue)

    def _latest(urls: list) -> tuple:
        """Return (url, year) for the document with the highest known year."""
        if not urls:
            return "", None
        best_url, best_year = urls[0], None
        for url in urls:
            year = url_years.get(url)
            if year is not None and (best_year is None or year > best_year):
                best_url, best_year = url, year
        return best_url, best_year

    # Denormalize each bucket into its has_/count/latest fields.
    prefixes = {
        "annual_report_url": "annual_report",
        "financial_statement_url": "financial_statement",
        "anbi_publication_url": "anbi_publication",
        "policy_document_url": "policy_document",
    }
    for claim_key, prefix in prefixes.items():
        urls = buckets[claim_key]
        result[f"{prefix}_count"] = len(urls)
        result[f"has_{prefix}"] = len(urls) > 0
        url, year = _latest(urls)
        result[f"latest_{prefix}_url"] = url
        if prefix == "annual_report":
            # Only the annual report exposes its year as a column.
            result["latest_annual_report_year"] = year

    return result
|
|
|
|
|
|
def extract_top_level_fields(data: dict) -> dict:
    """
    Extract and flatten top-level fields from YAML data.

    Builds a single flat record suitable for a columnar table: scalar
    fields are promoted to columns (with cascading fallbacks for
    coordinates, city/country and dates), while complex nested objects
    are preserved verbatim as JSON strings in *_json columns.

    Args:
        data: Parsed YAML dict for one custodian file; expected to carry
            an injected "_file_name" key (added by the caller before the
            call - see load_yaml_files).

    Returns:
        Flat dict with a fixed schema; every key is always present so
        the uploaded table keeps a stable shape across files.
    """
    # Fixed schema, pre-initialized with neutral defaults.
    record = {
        # File identification
        "file_name": data.get("_file_name", ""),
        "ghcid_current": "",
        "record_id": "",

        # Original entry (flattened)
        "org_name": "",
        "org_type": "",
        "institution_type": "",
        "wikidata_id": "",

        # Enrichment status
        "enrichment_status": data.get("enrichment_status", ""),
        "processing_timestamp": data.get("processing_timestamp", ""),

        # Location (filled via cascading extraction below)
        "latitude": None,
        "longitude": None,
        "coordinate_source": "",  # Provenance: which pattern produced lat/lon
        "formatted_address": "",
        "city": "",
        "country": "",
        "postal_code": "",
        "region": "",

        # Custodian name consensus
        "custodian_name": "",
        "custodian_name_confidence": None,

        # Emic name (native language name)
        "emic_name": "",
        "name_language": "",

        # Ratings
        "google_rating": None,
        "google_total_ratings": None,

        # Financial transparency (denormalized from web_claims)
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",

        # Temporal data
        "timespan_begin": None,
        "timespan_end": None,
        "destruction_date": None,
        "founding_date": None,
        "dissolution_date": None,
        "wikidata_inception": None,

        # Complex nested objects as JSON strings
        "original_entry_json": "",
        "wikidata_enrichment_json": "",
        "google_maps_enrichment_json": "",
        "web_enrichment_json": "",
        "web_claims_json": "",
        "ghcid_json": "",
        "identifiers_json": "",
        "provenance_json": "",
        "genealogiewerkbalk_json": "",
        "digital_platforms_json": "",

        # Additional JSON objects
        "service_area_json": "",
        "timespan_json": "",
        "time_of_destruction_json": "",
        "conflict_status_json": "",
        # NOTE(review): temporal_extent_json is never populated in this
        # function; kept (as "") for schema stability.
        "temporal_extent_json": "",
        "youtube_enrichment_json": "",
        "unesco_mow_enrichment_json": "",
        "ch_annotator_json": "",
        "location_json": "",
    }

    # Extract GHCID (stable identifier block; also kept as raw JSON)
    ghcid = data.get("ghcid", {})
    if ghcid:
        record["ghcid_current"] = ghcid.get("ghcid_current", "")
        record["record_id"] = ghcid.get("record_id", "")
        record["ghcid_json"] = json.dumps(ghcid, ensure_ascii=False, default=str)

    # Extract original entry ("organisatie" is the Dutch source field for name)
    original = data.get("original_entry", {})
    if original:
        # Try multiple name fields
        record["org_name"] = original.get("organisatie", "") or original.get("name", "") or ""
        # "type" may be a list (joined with commas) or a scalar.
        org_types = original.get("type", [])
        record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
        record["institution_type"] = original.get("institution_type", "")
        record["wikidata_id"] = original.get("wikidata_id", "")
        record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)

    # Extract coordinates using the cascading priority helper; the
    # source label is stored for provenance/debugging.
    lat, lon, coord_source = extract_coordinates(data)
    record["latitude"] = lat
    record["longitude"] = lon
    record["coordinate_source"] = coord_source

    # Extract city and country using cascading priority
    city, country = extract_city_country(data)
    record["city"] = city
    record["country"] = country

    # Extract additional location data from Google Maps
    gm = data.get("google_maps_enrichment", {})
    if gm:
        record["formatted_address"] = gm.get("formatted_address", "")
        record["google_rating"] = gm.get("rating")
        record["google_total_ratings"] = gm.get("total_ratings")

        # Extract postal code and region from address components
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "postal_code" in types:
                record["postal_code"] = comp.get("long_name", "")
            if "administrative_area_level_1" in types:
                record["region"] = comp.get("short_name", "")

        record["google_maps_enrichment_json"] = json.dumps(gm, ensure_ascii=False, default=str)

    # Fallback region from GHCID location_resolution
    if not record["region"] and ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            record["region"] = loc_res.get("region_code", "")

    # Extract custodian name consensus and emic_name
    cn = data.get("custodian_name", {})
    if cn:
        record["custodian_name"] = cn.get("claim_value", "")
        record["custodian_name_confidence"] = cn.get("confidence")
        # Emic name = name in the institution's own language; falls back
        # to the Arabic claim variant when present.
        record["emic_name"] = cn.get("emic_name", "") or cn.get("claim_value_arabic", "") or ""

    # Infer name_language from country code.
    # NOTE(review): rough heuristic - multilingual countries (e.g. BE,
    # CH, CA) are mapped to a single language here.
    language_map = {
        "NL": "nl", "BE": "nl", "DE": "de", "AT": "de", "CH": "de",
        "FR": "fr", "ES": "es", "IT": "it", "PT": "pt", "BR": "pt",
        "GB": "en", "US": "en", "AU": "en", "CA": "en", "IE": "en",
        "PL": "pl", "CZ": "cs", "HU": "hu", "RU": "ru", "JP": "ja",
        "CN": "zh", "KR": "ko", "AR": "ar", "PS": "ar", "IL": "he",
        "GR": "el", "TR": "tr", "SE": "sv", "NO": "no", "DK": "da",
        "FI": "fi", "RO": "ro", "BG": "bg", "UA": "uk", "BY": "be",
    }
    record["name_language"] = language_map.get(record["country"], "")

    # Extract timespan data (CIDOC-style begin/end bounds)
    timespan = data.get("timespan", {})
    if timespan:
        record["timespan_begin"] = timespan.get("begin_of_the_begin", "")
        record["timespan_end"] = timespan.get("end_of_the_end", "")
        record["timespan_json"] = json.dumps(timespan, ensure_ascii=False, default=str)

    # Extract time_of_destruction
    tod = data.get("time_of_destruction", {})
    if tod:
        record["destruction_date"] = tod.get("date", "")
        record["time_of_destruction_json"] = json.dumps(tod, ensure_ascii=False, default=str)

    # Extract conflict_status; its date only fills destruction_date when
    # time_of_destruction did not already provide one.
    cs = data.get("conflict_status", {})
    if cs:
        if not record["destruction_date"]:
            record["destruction_date"] = cs.get("date", "")
        record["conflict_status_json"] = json.dumps(cs, ensure_ascii=False, default=str)

    # Extract wikidata inception/dissolution
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # Try multiple paths for inception date (shape varies by file age)
        wikidata_inception = (
            wd.get("wikidata_inception", "") or
            wd.get("wikidata_founded", "") or
            wd.get("wikidata_temporal", {}).get("inception", "")
        )
        record["wikidata_inception"] = wikidata_inception

        # Use wikidata_inception as founding_date fallback
        if wikidata_inception and not record["founding_date"]:
            record["founding_date"] = wikidata_inception

        # Try multiple paths for dissolution date
        wikidata_dissolution = (
            wd.get("wikidata_dissolution", "") or
            wd.get("wikidata_dissolved", "")
        )
        if wikidata_dissolution and not record["dissolution_date"]:
            record["dissolution_date"] = wikidata_dissolution

        record["wikidata_enrichment_json"] = json.dumps(wd, ensure_ascii=False, default=str)

    # Extract service_area
    sa = data.get("service_area", {})
    if sa:
        record["service_area_json"] = json.dumps(sa, ensure_ascii=False, default=str)

    # Extract youtube_enrichment
    yt = data.get("youtube_enrichment", {})
    if yt:
        record["youtube_enrichment_json"] = json.dumps(yt, ensure_ascii=False, default=str)

    # Extract unesco_mow_enrichment (UNESCO Memory of the World inscriptions)
    unesco_mow = data.get("unesco_mow_enrichment", {})
    if unesco_mow:
        record["unesco_mow_enrichment_json"] = json.dumps(unesco_mow, ensure_ascii=False, default=str)

    # Extract CH-Annotator data
    ch = data.get("ch_annotator", {})
    if ch:
        record["ch_annotator_json"] = json.dumps(ch, ensure_ascii=False, default=str)

    # Extract location as JSON
    location = data.get("location", {})
    if location:
        record["location_json"] = json.dumps(location, ensure_ascii=False, default=str)

    # Store other complex objects as JSON
    if data.get("web_enrichment"):
        record["web_enrichment_json"] = json.dumps(
            data["web_enrichment"], ensure_ascii=False, default=str
        )

    if data.get("web_claims"):
        record["web_claims_json"] = json.dumps(
            data["web_claims"], ensure_ascii=False, default=str
        )

    # Extract denormalized financial document fields (overwrites the
    # has_*/count/latest_* defaults set above).
    fin_docs = extract_financial_documents(data)
    record.update(fin_docs)

    if data.get("identifiers"):
        record["identifiers_json"] = json.dumps(
            data["identifiers"], ensure_ascii=False, default=str
        )

    if data.get("provenance"):
        record["provenance_json"] = json.dumps(
            data["provenance"], ensure_ascii=False, default=str
        )

    if data.get("genealogiewerkbalk_enrichment"):
        record["genealogiewerkbalk_json"] = json.dumps(
            data["genealogiewerkbalk_enrichment"], ensure_ascii=False, default=str
        )

    if data.get("digital_platforms"):
        record["digital_platforms_json"] = json.dumps(
            data["digital_platforms"], ensure_ascii=False, default=str
        )

    return record
|
|
|
|
|
|
def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
    """Load all YAML files from directory and convert to flat records.

    Files are processed in sorted name order; unparseable files are
    skipped with a warning. Progress is printed every 1000 files, and a
    summary of coordinate-extraction coverage is printed at the end.
    """
    yaml_files = sorted(directory.glob("*.yaml"))
    if limit:
        yaml_files = yaml_files[:limit]

    print(f"Loading {len(yaml_files)} YAML files from {directory}...")

    records: list[dict] = []
    coord_stats = {"total": 0, "with_coords": 0, "sources": {}}

    for idx, yaml_file in enumerate(yaml_files):
        try:
            with open(yaml_file, "r", encoding="utf-8") as handle:
                parsed = yaml.safe_load(handle)

            if parsed:
                # Inject the filename so the record can be traced back.
                parsed["_file_name"] = yaml_file.name
                flat = extract_top_level_fields(parsed)
                records.append(flat)

                # Tally how often each coordinate pattern fired.
                coord_stats["total"] += 1
                has_pair = (flat.get("latitude") is not None
                            and flat.get("longitude") is not None)
                if has_pair:
                    coord_stats["with_coords"] += 1
                    src = flat.get("coordinate_source", "unknown")
                    coord_stats["sources"][src] = coord_stats["sources"].get(src, 0) + 1

            if (idx + 1) % 1000 == 0:
                print(f"  Processed {idx + 1}/{len(yaml_files)} files...")

        except Exception as e:
            print(f"  Warning: Failed to load {yaml_file.name}: {e}")
            continue

    print(f"\nSuccessfully loaded {len(records)} records")
    print("\nCoordinate extraction statistics:")
    print(f"  Total files: {coord_stats['total']}")
    denominator = coord_stats['total'] if coord_stats['total'] > 0 else 1
    pct = 100 * coord_stats['with_coords'] / denominator
    print(f"  With coordinates: {coord_stats['with_coords']} ({pct:.1f}%)")
    print("  Coordinate sources:")
    for source, count in sorted(coord_stats["sources"].items(), key=lambda x: -x[1]):
        print(f"    {source}: {count}")

    return records
|
|
|
|
|
|
def upload_to_ducklake(records: list[dict], table_name: str, mode: str = "replace") -> dict:
    """Upload records to DuckLake via its HTTP /upload endpoint.

    Serializes the records to a temporary JSON file, posts it as a
    multipart form upload, and returns the server's JSON response.
    The temporary file is always removed, even on failure.
    """
    print(f"\nUploading {len(records)} records to DuckLake table '{table_name}'...")

    # Spool the payload to disk so requests can stream it as a file part.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
        json.dump(records, tmp, ensure_ascii=False, default=str)
        temp_path = tmp.name

    try:
        with open(temp_path, "rb") as payload:
            response = requests.post(
                f"{DUCKLAKE_URL}/upload",
                files={"file": ("custodians.json", payload, "application/json")},
                data={"table_name": table_name, "mode": mode},
                timeout=600,
            )

        # Raise on HTTP error codes before trusting the body.
        response.raise_for_status()
        result = response.json()
        print(f"Upload successful: {result}")
        return result

    finally:
        os.unlink(temp_path)
|
|
|
|
|
def check_ducklake_status() -> bool:
    """Check if DuckLake server is running.

    Queries the server root endpoint, prints a short status summary,
    and returns True only when the server reports itself "healthy".
    Any connection or parsing failure is reported and yields False.
    """
    try:
        response = requests.get(f"{DUCKLAKE_URL}/", timeout=5)
        status = response.json()
        print(f"DuckLake status: {status.get('status', 'unknown')}")
        print(f"  Tables: {status.get('tables', 0)}")
        print(f"  Total rows: {status.get('total_rows', 0)}")
        print(f"  Snapshots: {status.get('snapshots', 0)}")
        return status.get("status") == "healthy"
    except Exception as e:
        print(f"Error connecting to DuckLake at {DUCKLAKE_URL}: {e}")
        return False
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, load YAML files, upload to DuckLake."""
    parser = argparse.ArgumentParser(description="Load custodian YAML files to DuckLake (v3)")
    parser.add_argument("--dry-run", action="store_true", help="Only load and validate, don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--mode", choices=["replace", "append", "create"], default="replace",
                        help="Upload mode (default: replace)")
    parser.add_argument("--directory", "-d", type=Path, default=None,
                        help="Directory containing custodian YAML files (default: auto-detect)")
    args = parser.parse_args()

    # Fall back to the repo-relative default when no directory was given.
    custodian_dir = args.directory if args.directory else DEFAULT_CUSTODIAN_DIR

    banner = "=" * 60
    print(banner)
    print("DuckLake Custodian Data Loader v3")
    print("  Enhanced coordinate extraction with cascading fallback")
    print(banner)
    print(f"Source directory: {custodian_dir}")
    print(f"Target table: {TABLE_NAME}")
    print(f"DuckLake URL: {DUCKLAKE_URL}")
    print(f"Mode: {'DRY RUN' if args.dry_run else args.mode.upper()}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print(banner)

    # Bail out early if there is nothing to load.
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    yaml_count = len(list(custodian_dir.glob("*.yaml")))
    print(f"Found {yaml_count} YAML files")
    if yaml_count == 0:
        print("No YAML files found. Exiting.")
        sys.exit(0)

    records = load_yaml_files(custodian_dir, limit=args.limit)
    if not records:
        print("No records loaded. Exiting.")
        sys.exit(1)

    # Preview key fields of the first record for a quick sanity check.
    print("\nSample record (first):")
    sample = records[0]
    preview_keys = ("file_name", "ghcid_current", "custodian_name", "emic_name",
                    "city", "country", "latitude", "longitude", "coordinate_source")
    for key in preview_keys:
        print(f"  {key}: {sample.get(key, 'N/A')}")

    if args.dry_run:
        print("\n[DRY RUN] Would upload to DuckLake. Exiting without upload.")
        return

    # Verify the server is reachable before attempting the upload.
    print("\nChecking DuckLake connection...")
    if not check_ducklake_status():
        print("Error: DuckLake is not available. Please start the server.")
        print("  cd backend/ducklake && python main.py")
        sys.exit(1)

    result = upload_to_ducklake(records, TABLE_NAME, mode=args.mode)

    print("\n" + banner)
    print("Load complete!")
    print(f"  Table: {TABLE_NAME}")
    print(f"  Rows: {result.get('rows_inserted', len(records))}")
    print(f"  Snapshot ID: {result.get('snapshot_id', 'N/A')}")
    print(banner)
|
|
|
|
|
|
# Script entry point: run the loader only when executed directly, so the
# module stays importable without side effects.
if __name__ == "__main__":
    main()
|