#!/usr/bin/env python3
"""
Load all custodian YAML files into DuckLake as raw enriched data.
v3 - Enhanced coordinate extraction with cascading fallback across 10 known patterns.
This script reads all YAML files from data/custodian/ and uploads them
to DuckLake as a single table for the bronze/raw data lake layer.
Usage:
python scripts/load_custodians_to_ducklake_v3.py [--dry-run] [--limit N]
API Endpoint: http://localhost:8765/upload
COORDINATE EXTRACTION PRIORITY (highest to lowest):
1. manual_location_override.coordinates - User-verified corrections
2. google_maps_enrichment.coordinates - Most accurate for operational institutions
3. wikidata_enrichment.wikidata_coordinates - Good coverage with precision
4. wikidata_enrichment.coordinates - Verified Wikidata data
5. ghcid.location_resolution.source_coordinates - GeoNames-resolved
6. ghcid.location_resolution (direct) - Early GeoNames integration
7. original_entry.locations[0] - CH-Annotator files
8. locations[0] - Root-level locations array
9. location (root level) - Single location object
10. Do NOT use: google_maps_enrichment_invalid, unesco nearby sites
PROVENANCE TRACKING:
Each coordinate extraction records the source for debugging.
"""
import argparse
import json
import os
import sys
import tempfile
from pathlib import Path

import requests
import yaml

# Configuration
DUCKLAKE_URL = os.getenv("DUCKLAKE_URL", "http://localhost:8765")
DEFAULT_CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
TABLE_NAME = "custodians_raw"
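# The target server can be overridden per run without editing this file
# (the host/port below is illustrative):
#
#   DUCKLAKE_URL=http://10.0.0.5:8765 python scripts/load_custodians_to_ducklake_v3.py --dry-run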

def extract_coordinates(data: dict) -> tuple[float | None, float | None, str]:
    """
    Extract coordinates using cascading priority fallback.

    Returns:
        Tuple of (latitude, longitude, source) where source indicates
        which pattern was used for provenance tracking.
    """
    # Priority 1: manual_location_override (user-verified corrections)
    override = data.get("manual_location_override", {})
    if override:
        coords = override.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "manual_location_override"

    # Priority 2: google_maps_enrichment (most accurate for operational institutions)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        coords = gm.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "google_maps_enrichment"

    # Priority 3: wikidata_enrichment.wikidata_coordinates (with precision)
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # First try wikidata_coordinates (most common pattern)
        wd_coords = wd.get("wikidata_coordinates", {})
        if wd_coords:
            lat = wd_coords.get("latitude")
            lon = wd_coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.wikidata_coordinates"

        # Priority 4: wikidata_enrichment.coordinates (alternative structure)
        coords = wd.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.coordinates"

    # Priority 5: ghcid.location_resolution.source_coordinates (GeoNames-resolved)
    ghcid = data.get("ghcid", {})
    if ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            # Try source_coordinates first
            src_coords = loc_res.get("source_coordinates", {})
            if src_coords:
                lat = src_coords.get("latitude")
                lon = src_coords.get("longitude")
                if lat is not None and lon is not None:
                    return lat, lon, "ghcid.location_resolution.source_coordinates"

            # Priority 6: ghcid.location_resolution direct (older format)
            lat = loc_res.get("latitude")
            lon = loc_res.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "ghcid.location_resolution"

    # Priority 7: original_entry.locations[0] (CH-Annotator files)
    original = data.get("original_entry", {})
    if original:
        locations = original.get("locations", [])
        if isinstance(locations, list) and locations:
            loc = locations[0]
            lat = loc.get("latitude")
            lon = loc.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "original_entry.locations[0]"

    # Priority 8: locations[0] (root-level locations array)
    locations = data.get("locations", [])
    if isinstance(locations, list) and locations:
        loc = locations[0]
        lat = loc.get("latitude")
        lon = loc.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "locations[0]"
        # Some files have coordinates nested under "coordinates"
        coords = loc.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "locations[0].coordinates"

    # Priority 9: location (root-level single object)
    location = data.get("location", {})
    if location and isinstance(location, dict):
        lat = location.get("latitude")
        lon = location.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "location"

    # No coordinates found
    return None, None, ""
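# A minimal sketch of the cascade (coordinate values are made up): when both
# Google Maps and Wikidata coordinates are present, the Google Maps pair wins
# because it sits higher in the priority order.
#
#   >>> extract_coordinates({
#   ...     "google_maps_enrichment": {"coordinates": {"latitude": 52.37, "longitude": 4.89}},
#   ...     "wikidata_enrichment": {"wikidata_coordinates": {"latitude": 52.0, "longitude": 5.0}},
#   ... })
#   (52.37, 4.89, 'google_maps_enrichment')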

def extract_city_country(data: dict) -> tuple[str, str]:
    """
    Extract city and country using cascading fallback.

    Returns:
        Tuple of (city, country)
    """
    city = ""
    country = ""

    # Priority 1: Google Maps address components (most accurate)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "locality" in types and not city:
                city = comp.get("long_name", "")
            if "country" in types and not country:
                country = comp.get("short_name", "")

    # Priority 2: ghcid.location_resolution
    if not city or not country:
        ghcid = data.get("ghcid", {})
        loc_res = ghcid.get("location_resolution", {}) if ghcid else {}
        if loc_res:
            if not city:
                city = (
                    loc_res.get("geonames_name", "")
                    or loc_res.get("city_name", "")
                    or loc_res.get("google_maps_locality", "")
                )
            if not country:
                country = loc_res.get("country_code", "")

    # Priority 3: location (root level)
    if not city or not country:
        location = data.get("location", {})
        if location:
            if not city:
                city = location.get("city", "")
            if not country:
                country = location.get("country", "")

    # Priority 4: locations[0] (root level array)
    if not city or not country:
        locations = data.get("locations", [])
        if locations and len(locations) > 0:
            loc = locations[0]
            if not city:
                city = loc.get("city", "")
            if not country:
                country = loc.get("country", "")

    # Priority 5: original_entry.locations[0]
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            locations = original.get("locations", [])
            if locations and len(locations) > 0:
                loc = locations[0]
                if not city:
                    city = loc.get("city", "")
                if not country:
                    country = loc.get("country", "")

    # Priority 6: original_entry direct fields
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            if not city:
                city = original.get("plaatsnaam_bezoekadres", "") or original.get("city", "")
            if not country:
                country = original.get("country", "")

    # Ultimate fallback: extract country from GHCID string (first 2 chars)
    if not country:
        ghcid_current = data.get("ghcid", {}).get("ghcid_current", "")
        if ghcid_current and len(ghcid_current) >= 2:
            country = ghcid_current[:2]

    return city, country
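# Sketch of the ultimate fallback (illustrative GHCID value): a record with no
# address data at all but a GHCID like "NL-000123" still yields a country code.
#
#   >>> extract_city_country({"ghcid": {"ghcid_current": "NL-000123"}})
#   ('', 'NL')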

def extract_financial_documents(data: dict) -> dict:
    """
    Extract financial document metadata from web_claims.

    Extracts counts and latest URLs for:
    - annual_report_url (jaarverslag)
    - financial_statement_url (jaarstukken)
    - anbi_publication_url (ANBI)
    - policy_document_url (beleidsplan)

    Returns:
        Dict with has_*, count, and latest_*_url fields
    """
    result = {
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
    }
    web_claims = data.get("web_claims", {})
    if not web_claims:
        return result

    claims = web_claims.get("claims", [])
    if not claims:
        return result

    # Group claims by type and collect year info
    annual_reports = []
    financial_statements = []
    anbi_publications = []
    policy_documents = []
    year_claims = {}  # Map URL to year

    for claim in claims:
        claim_type = claim.get("claim_type", "")
        claim_value = claim.get("claim_value", "")
        if claim_type == "financial_document_year":
            # Year claims have raw_value like "year=2024, from=annual_report_url, url=..."
            raw = claim.get("raw_value", "")
            if "url=" in raw:
                url_part = raw.split("url=")[-1]
                try:
                    year = int(claim_value)
                    year_claims[url_part] = year
                except (ValueError, TypeError):
                    pass
        elif claim_type == "annual_report_url":
            annual_reports.append(claim_value)
        elif claim_type == "financial_statement_url":
            financial_statements.append(claim_value)
        elif claim_type == "anbi_publication_url":
            anbi_publications.append(claim_value)
        elif claim_type == "policy_document_url":
            policy_documents.append(claim_value)

    # Set counts and has_* flags
    result["annual_report_count"] = len(annual_reports)
    result["financial_statement_count"] = len(financial_statements)
    result["anbi_publication_count"] = len(anbi_publications)
    result["policy_document_count"] = len(policy_documents)
    result["has_annual_report"] = len(annual_reports) > 0
    result["has_financial_statement"] = len(financial_statements) > 0
    result["has_anbi_publication"] = len(anbi_publications) > 0
    result["has_policy_document"] = len(policy_documents) > 0

    # Find latest by year (or just use first if no year info)
    def get_latest_with_year(urls: list, year_map: dict) -> tuple:
        """Return (url, year) for the document with the highest year."""
        if not urls:
            return "", None
        # Fall back to the first URL when no year info is available
        best_url = urls[0]
        best_year = None
        for url in urls:
            year = year_map.get(url)
            if year is not None:
                if best_year is None or year > best_year:
                    best_year = year
                    best_url = url
        return best_url, best_year

    url, year = get_latest_with_year(annual_reports, year_claims)
    result["latest_annual_report_url"] = url
    result["latest_annual_report_year"] = year
    result["latest_financial_statement_url"], _ = get_latest_with_year(financial_statements, year_claims)
    result["latest_policy_document_url"], _ = get_latest_with_year(policy_documents, year_claims)
    result["latest_anbi_publication_url"], _ = get_latest_with_year(anbi_publications, year_claims)
    return result
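# Sketch of the year matching (URLs are made up): a financial_document_year
# claim links a year to a URL via its raw_value, so the annual report whose
# claim_value matches that URL picks up the year.
#
#   >>> docs = extract_financial_documents({"web_claims": {"claims": [
#   ...     {"claim_type": "annual_report_url",
#   ...      "claim_value": "https://example.org/jv-2024.pdf"},
#   ...     {"claim_type": "financial_document_year", "claim_value": "2024",
#   ...      "raw_value": "year=2024, from=annual_report_url, url=https://example.org/jv-2024.pdf"},
#   ... ]}})
#   >>> docs["latest_annual_report_year"], docs["annual_report_count"]
#   (2024, 1)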

def extract_top_level_fields(data: dict) -> dict:
    """
    Extract and flatten top-level fields from YAML data.

    Complex nested objects are stored as JSON strings.
    """
    record = {
        # File identification
        "file_name": data.get("_file_name", ""),
        "ghcid_current": "",
        "record_id": "",
        # Original entry (flattened)
        "org_name": "",
        "org_type": "",
        "institution_type": "",
        "wikidata_id": "",
        # Enrichment status
        "enrichment_status": data.get("enrichment_status", ""),
        "processing_timestamp": data.get("processing_timestamp", ""),
        # Location (with cascading extraction)
        "latitude": None,
        "longitude": None,
        "coordinate_source": "",  # NEW: Provenance for coordinates
        "formatted_address": "",
        "city": "",
        "country": "",
        "postal_code": "",
        "region": "",
        # Custodian name consensus
        "custodian_name": "",
        "custodian_name_confidence": None,
        # Emic name (native language name)
        "emic_name": "",
        "name_language": "",
        # Ratings
        "google_rating": None,
        "google_total_ratings": None,
        # Financial transparency (denormalized from web_claims)
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
        # Temporal data
        "timespan_begin": None,
        "timespan_end": None,
        "destruction_date": None,
        "founding_date": None,
        "dissolution_date": None,
        "wikidata_inception": None,
        # Complex nested objects as JSON strings
        "original_entry_json": "",
        "wikidata_enrichment_json": "",
        "google_maps_enrichment_json": "",
        "web_enrichment_json": "",
        "web_claims_json": "",
        "ghcid_json": "",
        "identifiers_json": "",
        "provenance_json": "",
        "genealogiewerkbalk_json": "",
        "digital_platforms_json": "",
        # Additional JSON objects
        "service_area_json": "",
        "timespan_json": "",
        "time_of_destruction_json": "",
        "conflict_status_json": "",
        "temporal_extent_json": "",
        "youtube_enrichment_json": "",
        "unesco_mow_enrichment_json": "",
        "ch_annotator_json": "",
        "location_json": "",
    }
    # Extract GHCID
    ghcid = data.get("ghcid", {})
    if ghcid:
        record["ghcid_current"] = ghcid.get("ghcid_current", "")
        record["record_id"] = ghcid.get("record_id", "")
        record["ghcid_json"] = json.dumps(ghcid, ensure_ascii=False, default=str)

    # Extract original entry
    original = data.get("original_entry", {})
    if original:
        # Try multiple name fields
        record["org_name"] = original.get("organisatie", "") or original.get("name", "") or ""
        org_types = original.get("type", [])
        record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
        record["institution_type"] = original.get("institution_type", "")
        record["wikidata_id"] = original.get("wikidata_id", "")
        record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)

    # Extract coordinates using cascading priority
    lat, lon, coord_source = extract_coordinates(data)
    record["latitude"] = lat
    record["longitude"] = lon
    record["coordinate_source"] = coord_source

    # Extract city and country using cascading priority
    city, country = extract_city_country(data)
    record["city"] = city
    record["country"] = country

    # Extract additional location data from Google Maps
    gm = data.get("google_maps_enrichment", {})
    if gm:
        record["formatted_address"] = gm.get("formatted_address", "")
        record["google_rating"] = gm.get("rating")
        record["google_total_ratings"] = gm.get("total_ratings")
        # Extract postal code from address components
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "postal_code" in types:
                record["postal_code"] = comp.get("long_name", "")
            if "administrative_area_level_1" in types:
                record["region"] = comp.get("short_name", "")
        record["google_maps_enrichment_json"] = json.dumps(gm, ensure_ascii=False, default=str)

    # Fallback region from GHCID location_resolution
    if not record["region"] and ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            record["region"] = loc_res.get("region_code", "")

    # Extract custodian name consensus and emic_name
    cn = data.get("custodian_name", {})
    if cn:
        record["custodian_name"] = cn.get("claim_value", "")
        record["custodian_name_confidence"] = cn.get("confidence")
        # Extract emic_name (native language name)
        record["emic_name"] = cn.get("emic_name", "") or cn.get("claim_value_arabic", "") or ""

    # Infer name_language from country code
    language_map = {
        "NL": "nl", "BE": "nl", "DE": "de", "AT": "de", "CH": "de",
        "FR": "fr", "ES": "es", "IT": "it", "PT": "pt", "BR": "pt",
        "GB": "en", "US": "en", "AU": "en", "CA": "en", "IE": "en",
        "PL": "pl", "CZ": "cs", "HU": "hu", "RU": "ru", "JP": "ja",
        "CN": "zh", "KR": "ko", "AR": "ar", "PS": "ar", "IL": "he",
        "GR": "el", "TR": "tr", "SE": "sv", "NO": "no", "DK": "da",
        "FI": "fi", "RO": "ro", "BG": "bg", "UA": "uk", "BY": "be",
    }
    record["name_language"] = language_map.get(record["country"], "")

    # Extract timespan data
    timespan = data.get("timespan", {})
    if timespan:
        record["timespan_begin"] = timespan.get("begin_of_the_begin", "")
        record["timespan_end"] = timespan.get("end_of_the_end", "")
        record["timespan_json"] = json.dumps(timespan, ensure_ascii=False, default=str)

    # Extract time_of_destruction
    tod = data.get("time_of_destruction", {})
    if tod:
        record["destruction_date"] = tod.get("date", "")
        record["time_of_destruction_json"] = json.dumps(tod, ensure_ascii=False, default=str)

    # Extract conflict_status
    cs = data.get("conflict_status", {})
    if cs:
        if not record["destruction_date"]:
            record["destruction_date"] = cs.get("date", "")
        record["conflict_status_json"] = json.dumps(cs, ensure_ascii=False, default=str)

    # Extract wikidata inception/dissolution
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # Try multiple paths for inception date
        wikidata_inception = (
            wd.get("wikidata_inception", "")
            or wd.get("wikidata_founded", "")
            or wd.get("wikidata_temporal", {}).get("inception", "")
        )
        record["wikidata_inception"] = wikidata_inception
        # Use wikidata_inception as founding_date fallback
        if wikidata_inception and not record["founding_date"]:
            record["founding_date"] = wikidata_inception
        # Try multiple paths for dissolution date
        wikidata_dissolution = (
            wd.get("wikidata_dissolution", "")
            or wd.get("wikidata_dissolved", "")
        )
        if wikidata_dissolution and not record["dissolution_date"]:
            record["dissolution_date"] = wikidata_dissolution
        record["wikidata_enrichment_json"] = json.dumps(wd, ensure_ascii=False, default=str)

    # Extract service_area
    sa = data.get("service_area", {})
    if sa:
        record["service_area_json"] = json.dumps(sa, ensure_ascii=False, default=str)

    # Extract youtube_enrichment
    yt = data.get("youtube_enrichment", {})
    if yt:
        record["youtube_enrichment_json"] = json.dumps(yt, ensure_ascii=False, default=str)

    # Extract unesco_mow_enrichment (UNESCO Memory of the World inscriptions)
    unesco_mow = data.get("unesco_mow_enrichment", {})
    if unesco_mow:
        record["unesco_mow_enrichment_json"] = json.dumps(unesco_mow, ensure_ascii=False, default=str)

    # Extract CH-Annotator data
    ch = data.get("ch_annotator", {})
    if ch:
        record["ch_annotator_json"] = json.dumps(ch, ensure_ascii=False, default=str)

    # Extract location as JSON
    location = data.get("location", {})
    if location:
        record["location_json"] = json.dumps(location, ensure_ascii=False, default=str)

    # Store other complex objects as JSON
    if data.get("web_enrichment"):
        record["web_enrichment_json"] = json.dumps(
            data["web_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("web_claims"):
        record["web_claims_json"] = json.dumps(
            data["web_claims"], ensure_ascii=False, default=str
        )

    # Extract denormalized financial document fields
    fin_docs = extract_financial_documents(data)
    record.update(fin_docs)

    if data.get("identifiers"):
        record["identifiers_json"] = json.dumps(
            data["identifiers"], ensure_ascii=False, default=str
        )
    if data.get("provenance"):
        record["provenance_json"] = json.dumps(
            data["provenance"], ensure_ascii=False, default=str
        )
    if data.get("genealogiewerkbalk_enrichment"):
        record["genealogiewerkbalk_json"] = json.dumps(
            data["genealogiewerkbalk_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("digital_platforms"):
        record["digital_platforms_json"] = json.dumps(
            data["digital_platforms"], ensure_ascii=False, default=str
        )

    return record
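# The *_json columns round-trip back to Python structures downstream, e.g.
# (illustrative input):
#
#   >>> rec = extract_top_level_fields({"ghcid": {"ghcid_current": "NL-000123"}})
#   >>> json.loads(rec["ghcid_json"])["ghcid_current"]
#   'NL-000123'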

def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
    """Load all YAML files from directory and convert to records."""
    records = []
    yaml_files = sorted(directory.glob("*.yaml"))
    if limit:
        yaml_files = yaml_files[:limit]
    print(f"Loading {len(yaml_files)} YAML files from {directory}...")

    coord_stats = {
        "total": 0,
        "with_coords": 0,
        "sources": {},
    }

    for i, yaml_file in enumerate(yaml_files):
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if data:
                data["_file_name"] = yaml_file.name
                record = extract_top_level_fields(data)
                records.append(record)
                # Track coordinate extraction stats
                coord_stats["total"] += 1
                if record.get("latitude") is not None and record.get("longitude") is not None:
                    coord_stats["with_coords"] += 1
                    source = record.get("coordinate_source", "unknown")
                    coord_stats["sources"][source] = coord_stats["sources"].get(source, 0) + 1
            if (i + 1) % 1000 == 0:
                print(f"  Processed {i + 1}/{len(yaml_files)} files...")
        except Exception as e:
            print(f"  Warning: Failed to load {yaml_file.name}: {e}")
            continue

    print(f"\nSuccessfully loaded {len(records)} records")
    print("\nCoordinate extraction statistics:")
    print(f"  Total files: {coord_stats['total']}")
    total = coord_stats["total"] if coord_stats["total"] > 0 else 1
    pct = 100 * coord_stats["with_coords"] / total
    print(f"  With coordinates: {coord_stats['with_coords']} ({pct:.1f}%)")
    print("  Coordinate sources:")
    for source, count in sorted(coord_stats["sources"].items(), key=lambda x: -x[1]):
        print(f"    {source}: {count}")
    return records

def upload_to_ducklake(records: list[dict], table_name: str, mode: str = "replace") -> dict:
    """Upload records to DuckLake via API."""
    print(f"\nUploading {len(records)} records to DuckLake table '{table_name}'...")

    # Write records to a temporary JSON file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(records, f, ensure_ascii=False, default=str)
        temp_path = f.name

    try:
        # Upload via multipart form
        with open(temp_path, "rb") as f:
            files = {"file": ("custodians.json", f, "application/json")}
            data = {"table_name": table_name, "mode": mode}
            response = requests.post(
                f"{DUCKLAKE_URL}/upload",
                files=files,
                data=data,
                timeout=600,
            )
        response.raise_for_status()
        result = response.json()
        print(f"Upload successful: {result}")
        return result
    finally:
        os.unlink(temp_path)
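# The POST above is the moral equivalent of the following curl call (assuming
# the server accepts a standard multipart form upload, as this code does):
#
#   curl -X POST http://localhost:8765/upload \
#        -F "file=@custodians.json;type=application/json" \
#        -F "table_name=custodians_raw" -F "mode=replace"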

def check_ducklake_status() -> bool:
    """Check if DuckLake server is running."""
    try:
        response = requests.get(f"{DUCKLAKE_URL}/", timeout=5)
        response.raise_for_status()
        status = response.json()
        print(f"DuckLake status: {status.get('status', 'unknown')}")
        print(f"  Tables: {status.get('tables', 0)}")
        print(f"  Total rows: {status.get('total_rows', 0)}")
        print(f"  Snapshots: {status.get('snapshots', 0)}")
        return status.get("status") == "healthy"
    except Exception as e:
        print(f"Error connecting to DuckLake at {DUCKLAKE_URL}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Load custodian YAML files to DuckLake (v3)")
    parser.add_argument("--dry-run", action="store_true", help="Only load and validate, don't upload")
    parser.add_argument("--limit", type=int, help="Limit number of files to process")
    parser.add_argument("--mode", choices=["replace", "append", "create"], default="replace",
                        help="Upload mode (default: replace)")
    parser.add_argument("--directory", "-d", type=Path, default=None,
                        help="Directory containing custodian YAML files (default: auto-detect)")
    args = parser.parse_args()

    # Determine custodian directory
    custodian_dir = args.directory if args.directory else DEFAULT_CUSTODIAN_DIR

    print("=" * 60)
    print("DuckLake Custodian Data Loader v3")
    print("  Enhanced coordinate extraction with cascading fallback")
    print("=" * 60)
    print(f"Source directory: {custodian_dir}")
    print(f"Target table: {TABLE_NAME}")
    print(f"DuckLake URL: {DUCKLAKE_URL}")
    print(f"Mode: {'DRY RUN' if args.dry_run else args.mode.upper()}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print("=" * 60)

    # Check if directory exists
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # Count YAML files
    yaml_count = len(list(custodian_dir.glob("*.yaml")))
    print(f"Found {yaml_count} YAML files")
    if yaml_count == 0:
        print("No YAML files found. Exiting.")
        sys.exit(0)

    # Load YAML files
    records = load_yaml_files(custodian_dir, limit=args.limit)
    if not records:
        print("No records loaded. Exiting.")
        sys.exit(1)

    # Show sample record
    print("\nSample record (first):")
    sample = records[0]
    for key in [
        "file_name", "ghcid_current", "custodian_name", "emic_name",
        "city", "country", "latitude", "longitude", "coordinate_source",
    ]:
        print(f"  {key}: {sample.get(key, 'N/A')}")

    if args.dry_run:
        print("\n[DRY RUN] Would upload to DuckLake. Exiting without upload.")
        return

    # Check DuckLake status
    print("\nChecking DuckLake connection...")
    if not check_ducklake_status():
        print("Error: DuckLake is not available. Please start the server.")
        print("  cd backend/ducklake && python main.py")
        sys.exit(1)

    # Upload to DuckLake
    result = upload_to_ducklake(records, TABLE_NAME, mode=args.mode)

    print("\n" + "=" * 60)
    print("Load complete!")
    print(f"  Table: {TABLE_NAME}")
    print(f"  Rows: {result.get('rows_inserted', len(records))}")
    print(f"  Snapshot ID: {result.get('snapshot_id', 'N/A')}")
    print("=" * 60)

if __name__ == "__main__":
    main()