#!/usr/bin/env python3
"""
Load all custodian YAML files into DuckLake as raw enriched data.

v3 - Enhanced coordinate extraction with cascading fallback across 10 known patterns.

This script reads all YAML files from data/custodian/ and uploads them to
DuckLake as a single table for the bronze/raw data lake layer.

Usage:
    python scripts/load_custodians_to_ducklake_v3.py [--dry-run] [--limit N]

API Endpoint: http://localhost:8765/upload

COORDINATE EXTRACTION PRIORITY (highest to lowest):
 1. manual_location_override.coordinates - User-verified corrections
 2. google_maps_enrichment.coordinates - Most accurate for operational institutions
 3. wikidata_enrichment.wikidata_coordinates - Good coverage with precision
 4. wikidata_enrichment.coordinates - Verified Wikidata data
 5. ghcid.location_resolution.source_coordinates - GeoNames-resolved
 6. ghcid.location_resolution (direct) - Early GeoNames integration
 7. original_entry.locations[0] - CH-Annotator files
 8. locations[0] - Root-level locations array
 9. location (root level) - Single location object
10. Do NOT use: google_maps_enrichment_invalid, unesco nearby sites

PROVENANCE TRACKING:
Each coordinate extraction records the source for debugging.
"""

import argparse
import json
import os
import sys
import tempfile
from datetime import datetime
from pathlib import Path

import requests
import yaml

# Configuration
DUCKLAKE_URL = os.getenv("DUCKLAKE_URL", "http://localhost:8765")
DEFAULT_CUSTODIAN_DIR = Path(__file__).parent.parent / "data" / "custodian"
TABLE_NAME = "custodians_raw"


def extract_coordinates(data: dict) -> tuple[float | None, float | None, str]:
    """
    Extract coordinates using cascading priority fallback.

    Args:
        data: Parsed custodian YAML document (top-level mapping).

    Returns:
        Tuple of (latitude, longitude, source) where source indicates which
        pattern was used for provenance tracking. (None, None, "") when no
        coordinates are found anywhere.
    """
    # Priority 1: manual_location_override (user-verified corrections)
    override = data.get("manual_location_override", {})
    if override:
        coords = override.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "manual_location_override"

    # Priority 2: google_maps_enrichment (most accurate for operational institutions)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        coords = gm.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "google_maps_enrichment"

    # Priority 3: wikidata_enrichment.wikidata_coordinates (with precision)
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # First try wikidata_coordinates (most common pattern)
        wd_coords = wd.get("wikidata_coordinates", {})
        if wd_coords:
            lat = wd_coords.get("latitude")
            lon = wd_coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.wikidata_coordinates"

        # Priority 4: wikidata_enrichment.coordinates (alternative structure)
        coords = wd.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "wikidata_enrichment.coordinates"

    # Priority 5: ghcid.location_resolution.source_coordinates (GeoNames-resolved)
    ghcid = data.get("ghcid", {})
    if ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            # Try source_coordinates first
            src_coords = loc_res.get("source_coordinates", {})
            if src_coords:
                lat = src_coords.get("latitude")
                lon = src_coords.get("longitude")
                if lat is not None and lon is not None:
                    return lat, lon, "ghcid.location_resolution.source_coordinates"

            # Priority 6: ghcid.location_resolution direct (older format)
            lat = loc_res.get("latitude")
            lon = loc_res.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "ghcid.location_resolution"

    # Priority 7: original_entry.locations[0] (CH-Annotator files)
    original = data.get("original_entry", {})
    if original:
        locations = original.get("locations", [])
        if locations and isinstance(locations, list) and len(locations) > 0:
            loc = locations[0]
            lat = loc.get("latitude")
            lon = loc.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "original_entry.locations[0]"

    # Priority 8: locations[0] (root-level locations array)
    locations = data.get("locations", [])
    if locations and isinstance(locations, list) and len(locations) > 0:
        loc = locations[0]
        lat = loc.get("latitude")
        lon = loc.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "locations[0]"

        # Some files have coordinates nested under "coordinates"
        coords = loc.get("coordinates", {})
        if coords:
            lat = coords.get("latitude")
            lon = coords.get("longitude")
            if lat is not None and lon is not None:
                return lat, lon, "locations[0].coordinates"

    # Priority 9: location (root-level single object)
    location = data.get("location", {})
    if location and isinstance(location, dict):
        lat = location.get("latitude")
        lon = location.get("longitude")
        if lat is not None and lon is not None:
            return lat, lon, "location"

    # No coordinates found
    return None, None, ""


def extract_city_country(data: dict) -> tuple[str, str]:
    """
    Extract city and country using cascading fallback.

    Args:
        data: Parsed custodian YAML document (top-level mapping).

    Returns:
        Tuple of (city, country); either element may be "" when not found.
    """
    city = ""
    country = ""

    # Priority 1: Google Maps address components (most accurate)
    gm = data.get("google_maps_enrichment", {})
    if gm:
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "locality" in types and not city:
                city = comp.get("long_name", "")
            if "country" in types and not country:
                country = comp.get("short_name", "")

    # Priority 2: ghcid.location_resolution
    if not city or not country:
        ghcid = data.get("ghcid", {})
        loc_res = ghcid.get("location_resolution", {}) if ghcid else {}
        if loc_res:
            if not city:
                city = (
                    loc_res.get("geonames_name", "")
                    or loc_res.get("city_name", "")
                    or loc_res.get("google_maps_locality", "")
                )
            if not country:
                country = loc_res.get("country_code", "")

    # Priority 3: location (root level)
    if not city or not country:
        location = data.get("location", {})
        if location:
            if not city:
                city = location.get("city", "")
            if not country:
                country = location.get("country", "")

    # Priority 4: locations[0] (root level array)
    if not city or not country:
        locations = data.get("locations", [])
        if locations and len(locations) > 0:
            loc = locations[0]
            if not city:
                city = loc.get("city", "")
            if not country:
                country = loc.get("country", "")

    # Priority 5: original_entry.locations[0]
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            locations = original.get("locations", [])
            if locations and len(locations) > 0:
                loc = locations[0]
                if not city:
                    city = loc.get("city", "")
                if not country:
                    country = loc.get("country", "")

    # Priority 6: original_entry direct fields
    if not city or not country:
        original = data.get("original_entry", {})
        if original:
            if not city:
                city = original.get("plaatsnaam_bezoekadres", "") or original.get("city", "")
            if not country:
                country = original.get("country", "")

    # Ultimate fallback: Extract country from GHCID string (first 2 chars)
    if not country:
        ghcid_current = data.get("ghcid", {}).get("ghcid_current", "")
        if ghcid_current and len(ghcid_current) >= 2:
            country = ghcid_current[:2]

    return city, country


def extract_financial_documents(data: dict) -> dict:
    """
    Extract financial document metadata from web_claims.

    Extracts counts and latest URLs for:
    - annual_report_url (jaarverslag)
    - financial_statement_url (jaarstukken)
    - anbi_publication_url (ANBI)
    - policy_document_url (beleidsplan)

    Returns:
        Dict with has_*, count, and latest_*_url fields
    """
    result = {
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
    }

    web_claims = data.get("web_claims", {})
    if not web_claims:
        return result

    claims = web_claims.get("claims", [])
    if not claims:
        return result

    # Group claims by type and collect year info
    annual_reports = []
    financial_statements = []
    anbi_publications = []
    policy_documents = []
    year_claims = {}  # Map URL to year

    for claim in claims:
        claim_type = claim.get("claim_type", "")
        claim_value = claim.get("claim_value", "")

        if claim_type == "financial_document_year":
            # Year claims have raw_value like "year=2024, from=annual_report_url, url=..."
            raw = claim.get("raw_value", "")
            if "url=" in raw:
                url_part = raw.split("url=")[-1]
                try:
                    year = int(claim_value)
                    year_claims[url_part] = year
                except (ValueError, TypeError):
                    pass
        elif claim_type == "annual_report_url":
            annual_reports.append(claim_value)
        elif claim_type == "financial_statement_url":
            financial_statements.append(claim_value)
        elif claim_type == "anbi_publication_url":
            anbi_publications.append(claim_value)
        elif claim_type == "policy_document_url":
            policy_documents.append(claim_value)

    # Set counts and has_* flags
    result["annual_report_count"] = len(annual_reports)
    result["financial_statement_count"] = len(financial_statements)
    result["anbi_publication_count"] = len(anbi_publications)
    result["policy_document_count"] = len(policy_documents)
    result["has_annual_report"] = len(annual_reports) > 0
    result["has_financial_statement"] = len(financial_statements) > 0
    result["has_anbi_publication"] = len(anbi_publications) > 0
    result["has_policy_document"] = len(policy_documents) > 0

    # Find latest by year (or just use first if no year info)
    def get_latest_with_year(urls: list, year_map: dict) -> tuple:
        """Return (url, year) for the document with highest year."""
        if not urls:
            return "", None
        # Try to find one with year info
        best_url = urls[0]
        best_year = None
        for url in urls:
            year = year_map.get(url)
            if year is not None:
                if best_year is None or year > best_year:
                    best_year = year
                    best_url = url
        return best_url, best_year

    url, year = get_latest_with_year(annual_reports, year_claims)
    result["latest_annual_report_url"] = url
    result["latest_annual_report_year"] = year
    result["latest_financial_statement_url"], _ = get_latest_with_year(financial_statements, year_claims)
    result["latest_policy_document_url"], _ = get_latest_with_year(policy_documents, year_claims)
    result["latest_anbi_publication_url"], _ = get_latest_with_year(anbi_publications, year_claims)

    return result


def extract_top_level_fields(data: dict) -> dict:
    """
    Extract and flatten top-level fields from YAML data.

    Complex nested objects are stored as JSON strings so the resulting record
    is a flat dict suitable for a single DuckLake table row.
    """
    record = {
        # File identification
        "file_name": data.get("_file_name", ""),
        "ghcid_current": "",
        "record_id": "",
        # Original entry (flattened)
        "org_name": "",
        "org_type": "",
        "institution_type": "",
        "wikidata_id": "",
        # Enrichment status
        "enrichment_status": data.get("enrichment_status", ""),
        "processing_timestamp": data.get("processing_timestamp", ""),
        # Location (with cascading extraction)
        "latitude": None,
        "longitude": None,
        "coordinate_source": "",  # NEW: Provenance for coordinates
        "formatted_address": "",
        "city": "",
        "country": "",
        "postal_code": "",
        "region": "",
        # Custodian name consensus
        "custodian_name": "",
        "custodian_name_confidence": None,
        # Emic name (native language name)
        "emic_name": "",
        "name_language": "",
        # Ratings
        "google_rating": None,
        "google_total_ratings": None,
        # Financial transparency (denormalized from web_claims)
        "has_annual_report": False,
        "has_financial_statement": False,
        "has_anbi_publication": False,
        "has_policy_document": False,
        "annual_report_count": 0,
        "financial_statement_count": 0,
        "anbi_publication_count": 0,
        "policy_document_count": 0,
        "latest_annual_report_url": "",
        "latest_annual_report_year": None,
        "latest_financial_statement_url": "",
        "latest_policy_document_url": "",
        "latest_anbi_publication_url": "",
        # Temporal data
        "timespan_begin": None,
        "timespan_end": None,
        "destruction_date": None,
        "founding_date": None,
        "dissolution_date": None,
        "wikidata_inception": None,
        # Complex nested objects as JSON strings
        "original_entry_json": "",
        "wikidata_enrichment_json": "",
        "google_maps_enrichment_json": "",
        "web_enrichment_json": "",
        "web_claims_json": "",
        "ghcid_json": "",
        "identifiers_json": "",
        "provenance_json": "",
        "genealogiewerkbalk_json": "",
        "digital_platforms_json": "",
        # Additional JSON objects
        "service_area_json": "",
        "timespan_json": "",
        "time_of_destruction_json": "",
        "conflict_status_json": "",
        "temporal_extent_json": "",
        "youtube_enrichment_json": "",
        "unesco_mow_enrichment_json": "",
        "ch_annotator_json": "",
        "location_json": "",
    }

    # Extract GHCID
    ghcid = data.get("ghcid", {})
    if ghcid:
        record["ghcid_current"] = ghcid.get("ghcid_current", "")
        record["record_id"] = ghcid.get("record_id", "")
        record["ghcid_json"] = json.dumps(ghcid, ensure_ascii=False, default=str)

    # Extract original entry
    original = data.get("original_entry", {})
    if original:
        # Try multiple name fields
        record["org_name"] = original.get("organisatie", "") or original.get("name", "") or ""
        org_types = original.get("type", [])
        record["org_type"] = ",".join(org_types) if isinstance(org_types, list) else str(org_types)
        record["institution_type"] = original.get("institution_type", "")
        record["wikidata_id"] = original.get("wikidata_id", "")
        record["original_entry_json"] = json.dumps(original, ensure_ascii=False, default=str)

    # Extract coordinates using cascading priority
    lat, lon, coord_source = extract_coordinates(data)
    record["latitude"] = lat
    record["longitude"] = lon
    record["coordinate_source"] = coord_source

    # Extract city and country using cascading priority
    city, country = extract_city_country(data)
    record["city"] = city
    record["country"] = country

    # Extract additional location data from Google Maps
    gm = data.get("google_maps_enrichment", {})
    if gm:
        record["formatted_address"] = gm.get("formatted_address", "")
        record["google_rating"] = gm.get("rating")
        record["google_total_ratings"] = gm.get("total_ratings")
        # Extract postal code from address components
        for comp in gm.get("address_components", []):
            types = comp.get("types", [])
            if "postal_code" in types:
                record["postal_code"] = comp.get("long_name", "")
            if "administrative_area_level_1" in types:
                record["region"] = comp.get("short_name", "")
        record["google_maps_enrichment_json"] = json.dumps(gm, ensure_ascii=False, default=str)

    # Fallback region from GHCID location_resolution
    if not record["region"] and ghcid:
        loc_res = ghcid.get("location_resolution", {})
        if loc_res:
            record["region"] = loc_res.get("region_code", "")

    # Extract custodian name consensus and emic_name
    cn = data.get("custodian_name", {})
    if cn:
        record["custodian_name"] = cn.get("claim_value", "")
        record["custodian_name_confidence"] = cn.get("confidence")
        # Extract emic_name (native language name)
        record["emic_name"] = cn.get("emic_name", "") or cn.get("claim_value_arabic", "") or ""
        # Infer name_language from country code
        # NOTE(review): "AR" is the ISO code for Argentina (Spanish), not an
        # Arabic-speaking country — confirm this mapping is intentional.
        language_map = {
            "NL": "nl", "BE": "nl", "DE": "de", "AT": "de", "CH": "de",
            "FR": "fr", "ES": "es", "IT": "it", "PT": "pt", "BR": "pt",
            "GB": "en", "US": "en", "AU": "en", "CA": "en", "IE": "en",
            "PL": "pl", "CZ": "cs", "HU": "hu", "RU": "ru", "JP": "ja",
            "CN": "zh", "KR": "ko", "AR": "ar", "PS": "ar", "IL": "he",
            "GR": "el", "TR": "tr", "SE": "sv", "NO": "no", "DK": "da",
            "FI": "fi", "RO": "ro", "BG": "bg", "UA": "uk", "BY": "be",
        }
        record["name_language"] = language_map.get(record["country"], "")

    # Extract timespan data
    timespan = data.get("timespan", {})
    if timespan:
        record["timespan_begin"] = timespan.get("begin_of_the_begin", "")
        record["timespan_end"] = timespan.get("end_of_the_end", "")
        record["timespan_json"] = json.dumps(timespan, ensure_ascii=False, default=str)

    # Extract time_of_destruction
    tod = data.get("time_of_destruction", {})
    if tod:
        record["destruction_date"] = tod.get("date", "")
        record["time_of_destruction_json"] = json.dumps(tod, ensure_ascii=False, default=str)

    # Extract conflict_status
    cs = data.get("conflict_status", {})
    if cs:
        if not record["destruction_date"]:
            record["destruction_date"] = cs.get("date", "")
        record["conflict_status_json"] = json.dumps(cs, ensure_ascii=False, default=str)

    # Extract wikidata inception/dissolution
    wd = data.get("wikidata_enrichment", {})
    if wd:
        # Try multiple paths for inception date
        wikidata_inception = (
            wd.get("wikidata_inception", "")
            or wd.get("wikidata_founded", "")
            or wd.get("wikidata_temporal", {}).get("inception", "")
        )
        record["wikidata_inception"] = wikidata_inception
        # Use wikidata_inception as founding_date fallback
        if wikidata_inception and not record["founding_date"]:
            record["founding_date"] = wikidata_inception

        # Try multiple paths for dissolution date
        wikidata_dissolution = (
            wd.get("wikidata_dissolution", "")
            or wd.get("wikidata_dissolved", "")
        )
        if wikidata_dissolution and not record["dissolution_date"]:
            record["dissolution_date"] = wikidata_dissolution

        record["wikidata_enrichment_json"] = json.dumps(wd, ensure_ascii=False, default=str)

    # Extract service_area
    sa = data.get("service_area", {})
    if sa:
        record["service_area_json"] = json.dumps(sa, ensure_ascii=False, default=str)

    # Extract youtube_enrichment
    yt = data.get("youtube_enrichment", {})
    if yt:
        record["youtube_enrichment_json"] = json.dumps(yt, ensure_ascii=False, default=str)

    # Extract unesco_mow_enrichment (UNESCO Memory of the World inscriptions)
    unesco_mow = data.get("unesco_mow_enrichment", {})
    if unesco_mow:
        record["unesco_mow_enrichment_json"] = json.dumps(unesco_mow, ensure_ascii=False, default=str)

    # Extract CH-Annotator data
    ch = data.get("ch_annotator", {})
    if ch:
        record["ch_annotator_json"] = json.dumps(ch, ensure_ascii=False, default=str)

    # Extract location as JSON
    location = data.get("location", {})
    if location:
        record["location_json"] = json.dumps(location, ensure_ascii=False, default=str)

    # Store other complex objects as JSON
    if data.get("web_enrichment"):
        record["web_enrichment_json"] = json.dumps(
            data["web_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("web_claims"):
        record["web_claims_json"] = json.dumps(
            data["web_claims"], ensure_ascii=False, default=str
        )

    # Extract denormalized financial document fields
    fin_docs = extract_financial_documents(data)
    record.update(fin_docs)

    if data.get("identifiers"):
        record["identifiers_json"] = json.dumps(
            data["identifiers"], ensure_ascii=False, default=str
        )
    if data.get("provenance"):
        record["provenance_json"] = json.dumps(
            data["provenance"], ensure_ascii=False, default=str
        )
    if data.get("genealogiewerkbalk_enrichment"):
        record["genealogiewerkbalk_json"] = json.dumps(
            data["genealogiewerkbalk_enrichment"], ensure_ascii=False, default=str
        )
    if data.get("digital_platforms"):
        record["digital_platforms_json"] = json.dumps(
            data["digital_platforms"], ensure_ascii=False, default=str
        )

    return record


def load_yaml_files(directory: Path, limit: int | None = None) -> list[dict]:
    """Load all YAML files from directory and convert to flat records.

    Args:
        directory: Directory containing *.yaml custodian files.
        limit: Optional maximum number of files to process.

    Returns:
        List of flattened record dicts; files that fail to parse are skipped
        with a warning (best-effort load).
    """
    records = []
    yaml_files = sorted(directory.glob("*.yaml"))

    if limit:
        yaml_files = yaml_files[:limit]

    print(f"Loading {len(yaml_files)} YAML files from {directory}...")

    coord_stats = {
        "total": 0,
        "with_coords": 0,
        "sources": {}
    }

    for i, yaml_file in enumerate(yaml_files):
        try:
            with open(yaml_file, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if data:
                data["_file_name"] = yaml_file.name
                record = extract_top_level_fields(data)
                records.append(record)

                # Track coordinate extraction stats
                coord_stats["total"] += 1
                if record.get("latitude") is not None and record.get("longitude") is not None:
                    coord_stats["with_coords"] += 1
                    source = record.get("coordinate_source", "unknown")
                    coord_stats["sources"][source] = coord_stats["sources"].get(source, 0) + 1

            if (i + 1) % 1000 == 0:
                print(f"  Processed {i + 1}/{len(yaml_files)} files...")
        except Exception as e:
            # Best-effort: a single malformed file must not abort the whole load.
            print(f"  Warning: Failed to load {yaml_file.name}: {e}")
            continue

    print(f"\nSuccessfully loaded {len(records)} records")
    print("\nCoordinate extraction statistics:")
    print(f"  Total files: {coord_stats['total']}")
    # Guard against division by zero when no files were processed.
    total = coord_stats['total'] if coord_stats['total'] > 0 else 1
    pct = 100 * coord_stats['with_coords'] / total
    print(f"  With coordinates: {coord_stats['with_coords']} ({pct:.1f}%)")
    print("  Coordinate sources:")
    for source, count in sorted(coord_stats["sources"].items(), key=lambda x: -x[1]):
        print(f"    {source}: {count}")

    return records


def upload_to_ducklake(records: list[dict], table_name: str, mode: str = "replace") -> dict:
    """Upload records to DuckLake via API.

    Serializes the records to a temporary JSON file and POSTs it as a
    multipart upload. Raises requests.HTTPError on a non-2xx response.
    """
    print(f"\nUploading {len(records)} records to DuckLake table '{table_name}'...")

    # Write records to temporary JSON file.
    # encoding="utf-8" is required: ensure_ascii=False writes raw non-ASCII
    # characters, which would fail under a non-UTF-8 default locale.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, default=str)
        temp_path = f.name

    try:
        # Upload via multipart form
        with open(temp_path, "rb") as f:
            files = {"file": ("custodians.json", f, "application/json")}
            data = {"table_name": table_name, "mode": mode}
            response = requests.post(
                f"{DUCKLAKE_URL}/upload",
                files=files,
                data=data,
                timeout=600
            )
        response.raise_for_status()
        result = response.json()
        print(f"Upload successful: {result}")
        return result
    finally:
        os.unlink(temp_path)


def check_ducklake_status() -> bool:
    """Check if DuckLake server is running and healthy.

    Returns:
        True when the server responds with status "healthy", False otherwise
        (including connection errors and non-2xx responses).
    """
    try:
        response = requests.get(f"{DUCKLAKE_URL}/", timeout=5)
        # Treat HTTP errors as "not available" rather than half-parsing them.
        response.raise_for_status()
        status = response.json()
        print(f"DuckLake status: {status.get('status', 'unknown')}")
        print(f"  Tables: {status.get('tables', 0)}")
        print(f"  Total rows: {status.get('total_rows', 0)}")
        print(f"  Snapshots: {status.get('snapshots', 0)}")
        return status.get("status") == "healthy"
    except Exception as e:
        print(f"Error connecting to DuckLake at {DUCKLAKE_URL}: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Load custodian YAML files to DuckLake (v3)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Only load and validate, don't upload")
    parser.add_argument("--limit", type=int,
                        help="Limit number of files to process")
    parser.add_argument("--mode", choices=["replace", "append", "create"], default="replace",
                        help="Upload mode (default: replace)")
    parser.add_argument("--directory", "-d", type=Path, default=None,
                        help="Directory containing custodian YAML files (default: auto-detect)")
    args = parser.parse_args()

    # Determine custodian directory
    if args.directory:
        custodian_dir = args.directory
    else:
        custodian_dir = DEFAULT_CUSTODIAN_DIR

    print("=" * 60)
    print("DuckLake Custodian Data Loader v3")
    print("  Enhanced coordinate extraction with cascading fallback")
    print("=" * 60)
    print(f"Source directory: {custodian_dir}")
    print(f"Target table: {TABLE_NAME}")
    print(f"DuckLake URL: {DUCKLAKE_URL}")
    print(f"Mode: {'DRY RUN' if args.dry_run else args.mode.upper()}")
    if args.limit:
        print(f"Limit: {args.limit} files")
    print("=" * 60)

    # Check if directory exists
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        sys.exit(1)

    # Count YAML files
    yaml_count = len(list(custodian_dir.glob("*.yaml")))
    print(f"Found {yaml_count} YAML files")

    if yaml_count == 0:
        print("No YAML files found. Exiting.")
        sys.exit(0)

    # Load YAML files
    records = load_yaml_files(custodian_dir, limit=args.limit)

    if not records:
        print("No records loaded. Exiting.")
        sys.exit(1)

    # Show sample record
    print("\nSample record (first):")
    sample = records[0]
    for key in ["file_name", "ghcid_current", "custodian_name", "emic_name", "city",
                "country", "latitude", "longitude", "coordinate_source"]:
        print(f"  {key}: {sample.get(key, 'N/A')}")

    if args.dry_run:
        print("\n[DRY RUN] Would upload to DuckLake. Exiting without upload.")
        return

    # Check DuckLake status
    print("\nChecking DuckLake connection...")
    if not check_ducklake_status():
        print("Error: DuckLake is not available. Please start the server.")
        print("  cd backend/ducklake && python main.py")
        sys.exit(1)

    # Upload to DuckLake
    result = upload_to_ducklake(records, TABLE_NAME, mode=args.mode)

    print("\n" + "=" * 60)
    print("Load complete!")
    print(f"  Table: {TABLE_NAME}")
    print(f"  Rows: {result.get('rows_inserted', len(records))}")
    print(f"  Snapshot ID: {result.get('snapshot_id', 'N/A')}")
    print("=" * 60)


if __name__ == "__main__":
    main()