#!/usr/bin/env python3 """Quick DuckLake loader - resumes from current count.""" import json import yaml import httpx from pathlib import Path DUCKLAKE_URL = "http://localhost:8765" TABLE_NAME = "custodians_raw" BATCH_SIZE = 2000 # Smaller batches for reliability def extract_record(entry, file_name): """Extract minimal record for map display.""" original = entry.get('original_entry', {}) ghcid = entry.get('ghcid', {}) loc = ghcid.get('location_resolution', {}) gm = entry.get('google_maps_enrichment', {}) custodian_name = entry.get('custodian_name', {}) location = entry.get('location', {}) ch = entry.get('ch_annotator', {}) ec = ch.get('entity_classification', {}) name = custodian_name.get('claim_value') or original.get('name') or original.get('organisatie') or '' lat = location.get('latitude') or gm.get('latitude') or loc.get('latitude') lon = location.get('longitude') or gm.get('longitude') or loc.get('longitude') inst_type = ec.get('glamorcubesfixphdnt_primary') or '' if not inst_type and ghcid.get('ghcid_current'): parts = ghcid['ghcid_current'].split('-') if len(parts) >= 4 and len(parts[3]) == 1: inst_type = parts[3] if not inst_type: types = original.get('type', []) if isinstance(types, list) and types: inst_type = types[0] return { 'file_name': file_name, 'ghcid_current': ghcid.get('ghcid_current', ''), 'org_name': name, 'org_type': inst_type, 'institution_type': inst_type, 'latitude': lat, 'longitude': lon, 'city': location.get('city') or loc.get('city_name'), 'country': location.get('country') or loc.get('country_code'), 'custodian_name': name, } def upload_batch(records, mode="append"): with httpx.Client(timeout=120.0) as client: response = client.post( f"{DUCKLAKE_URL}/upload", files={"file": ("data.json", json.dumps(records), "application/json")}, data={"table_name": TABLE_NAME, "mode": mode}, ) return response.json() if response.status_code == 200 else None def get_current_count(): with httpx.Client() as client: response = client.post( f"{DUCKLAKE_URL}/query", json={"query": f"SELECT COUNT(*) FROM heritage.{TABLE_NAME}"} ) if response.status_code == 200: return response.json()['rows'][0][0] return 0 if __name__ == "__main__": custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian') yaml_files = sorted(custodian_dir.glob('*.yaml')) current = get_current_count() print(f"Current count in DuckLake: {current}") print(f"Total YAML files: {len(yaml_files)}") if current >= len(yaml_files): print("Already fully loaded!") exit(0) # Start from current position yaml_files = yaml_files[current:] print(f"Loading remaining {len(yaml_files)} files...") records = [] total = 0 for i, file_path in enumerate(yaml_files): try: with open(file_path) as f: entry = yaml.safe_load(f) if entry: records.append(extract_record(entry, file_path.name)) except: pass if len(records) >= BATCH_SIZE: result = upload_batch(records) if result: total += result.get('rows_inserted', 0) print(f" Batch {(i+1)//BATCH_SIZE}: +{result.get('rows_inserted')} (total: {current + total})") records = [] if records: result = upload_batch(records) if result: total += result.get('rows_inserted', 0) print(f" Final: +{result.get('rows_inserted')} (total: {current + total})") print(f"\nComplete! Added {total} rows. Total now: {current + total}")