feat(loaders): update DuckLake and TypeDB loaders with relation support

This commit is contained in:
kempersc 2025-12-08 15:00:14 +01:00
parent 486bbee813
commit 0938cce6cf
2 changed files with 487 additions and 2 deletions

View file

@ -392,6 +392,8 @@ def extract_top_level_fields(data: dict) -> dict:
if cn:
record["custodian_name"] = cn.get("claim_value", "")
record["custodian_name_confidence"] = cn.get("confidence")
record["emic_name"] = cn.get("emic_name", "")
record["name_language"] = cn.get("name_language", "")
# Store complex objects as JSON
if data.get("wikidata_enrichment"):
@ -644,8 +646,8 @@ def main():
# Show sample record
print("\nSample record (first):")
sample = records[0]
for key in ["file_name", "ghcid_current", "custodian_name", "city", "country",
"google_rating", "youtube_channel_id"]:
for key in ["file_name", "ghcid_current", "custodian_name", "emic_name", "name_language",
"city", "country", "google_rating", "youtube_channel_id"]:
value = sample.get(key, 'N/A')
if value == "" or value is None:
value = "(empty)"
@ -655,10 +657,12 @@ def main():
yt_count = sum(1 for r in records if r.get("youtube_channel_id"))
gm_count = sum(1 for r in records if r.get("google_place_id"))
coord_count = sum(1 for r in records if r.get("latitude") is not None)
emic_count = sum(1 for r in records if r.get("emic_name"))
print(f"\nEnrichment summary:")
print(f" With coordinates: {coord_count}/{len(records)}")
print(f" With Google Maps: {gm_count}/{len(records)}")
print(f" With YouTube: {yt_count}/{len(records)}")
print(f" With emic_name: {emic_count}/{len(records)}")
if args.dry_run:
print("\n[DRY RUN] Would upload to DuckLake. Exiting without upload.")

View file

@ -0,0 +1,481 @@
#!/usr/bin/env python3
"""
Load NDE institution data into TypeDB with proper relations (edges)
This script creates:
1. institution entities with their attributes
2. location entities with geographic data
3. identifier entities for GHCID, Wikidata, etc.
4. RELATIONS connecting these entities (for graph visualization)
Key relations created:
- located_at: institution location
- has_identifier: institution identifier
- affiliated_with: institution provincial_archive (genealogiewerkbalk data)
Requires: typedb-driver>=2.28.0,<3.0.0
Usage: python scripts/load_typedb_data_with_relations.py
"""
import json
import sys
from pathlib import Path
from typedb.driver import TypeDB, SessionType, TransactionType
# Configuration
TYPEDB_HOST = "localhost"
TYPEDB_PORT = 1729  # TypeDB core server default port
DATABASE = "glam"
# Path to NDE data (this script lives in scripts/, so parent.parent is the repo root)
DATA_FILE = Path(__file__).parent.parent / "frontend/public/data/nde_institutions.json"
def escape_string(s: object) -> str:
    """Escape a value for safe embedding in a double-quoted TypeQL string.

    Backslashes are doubled first, then double quotes are escaped, so the
    result can be interpolated between `"..."` in a TypeQL query.

    Args:
        s: any value; non-strings are converted with ``str()``.
           ``None`` yields the empty string.

    Returns:
        The escaped string (possibly empty).
    """
    if s is None:
        return ""
    # Order matters: escape backslashes before adding new ones for quotes.
    return str(s).replace("\\", "\\\\").replace('"', '\\"')
def define_schema(driver):
    """Define the TypeDB schema: entity, attribute, and relation types.

    The relation types (located_at, has_identifier, affiliated_with,
    same_province) are what produce edges in the graph visualization.

    Args:
        driver: an open TypeDB driver connected to the server.
    """
    print("Defining schema with relations...")
    # NOTE(review): TypeQL 2.x syntax ("sub entity", "plays rel:role") —
    # matches the typedb-driver<3.0.0 pin in the module docstring.
    schema = """
define
# === ENTITY TYPES ===
institution sub entity,
owns name,
owns institution_type,
owns website,
owns description,
owns founding_year,
owns rating,
owns phone,
owns address,
plays located_at:subject,
plays has_identifier:subject,
plays affiliated_with:member,
plays same_province:institution;
location sub entity,
owns city,
owns province,
owns country,
owns latitude,
owns longitude,
plays located_at:place,
plays same_province:location;
identifier sub entity,
owns identifier_scheme,
owns identifier_value,
owns identifier_url,
plays has_identifier:identifier;
archive sub entity,
owns name,
owns website,
owns isil_code,
plays affiliated_with:archive;
# === ATTRIBUTE TYPES ===
name sub attribute, value string;
institution_type sub attribute, value string;
website sub attribute, value string;
description sub attribute, value string;
founding_year sub attribute, value long;
rating sub attribute, value double;
phone sub attribute, value string;
address sub attribute, value string;
city sub attribute, value string;
province sub attribute, value string;
country sub attribute, value string;
latitude sub attribute, value double;
longitude sub attribute, value double;
identifier_scheme sub attribute, value string;
identifier_value sub attribute, value string;
identifier_url sub attribute, value string;
isil_code sub attribute, value string;
# === RELATION TYPES (These create edges in the graph!) ===
# Institution is located at a place
located_at sub relation,
relates subject,
relates place;
# Institution has an identifier
has_identifier sub relation,
relates subject,
relates identifier;
# Institution is affiliated with an archive
affiliated_with sub relation,
relates member,
relates archive;
# Institutions in the same province (inferred relation for graph connectivity)
same_province sub relation,
relates institution,
relates location;
"""
    # Schema changes require a SCHEMA session and an explicit commit.
    with driver.session(DATABASE, SessionType.SCHEMA) as session:
        with session.transaction(TransactionType.WRITE) as tx:
            tx.query.define(schema)
            tx.commit()
    print("✅ Schema defined with relations")
def clear_data(driver):
    """Best-effort wipe of all previously loaded relations and entities.

    Relations are deleted before the entities that play roles in them
    (TypeDB requires this order). Each delete is attempted independently
    so that a type missing from a fresh database does not abort the wipe.

    Args:
        driver: an open TypeDB driver connected to the server.
    """
    print("Clearing existing data...")
    # Delete relations first (TypeDB requires this order), then entities.
    relation_types = ("located_at", "has_identifier", "affiliated_with")
    entity_types = ("institution", "location", "identifier", "archive")
    with driver.session(DATABASE, SessionType.DATA) as session:
        with session.transaction(TransactionType.WRITE) as tx:
            for type_name in relation_types:
                try:
                    tx.query.delete(f"match $r isa {type_name}; delete $r isa {type_name};")
                except Exception:
                    # Deliberate best-effort: the type may not exist yet.
                    pass
            for type_name in entity_types:
                try:
                    tx.query.delete(f"match $x isa {type_name}; delete $x isa {type_name};")
                except Exception:
                    pass
            tx.commit()
    print("✅ Data cleared")
def load_data(driver, institutions: list):
    """Insert institutions plus related location/identifier/archive entities.

    For each institution that has a GHCID this creates:
      1. an `institution` entity with its scalar attributes,
      2. a shared `location` entity (deduplicated per "province:city") plus
         a `located_at` relation,
      3. `identifier` entities (GHCID, optionally Wikidata) with
         `has_identifier` relations,
      4. a shared `archive` entity plus an `affiliated_with` relation
         (from the genealogiewerkbalk data).

    Args:
        driver: an open TypeDB driver connected to the server.
        institutions: list of institution dicts from the NDE JSON export.

    Returns:
        Tuple ``(inserted, relations_created, errors)``.

    NOTE(review): relation inserts match the institution by `name`, so
    duplicate institution names would cross-link relations — confirm names
    are unique in the source data.
    """
    batch_size = 25  # Smaller batches: each institution issues several queries
    total = len(institutions)
    inserted = 0
    relations_created = 0
    errors = 0
    # Dedupe caches so shared locations/archives are inserted only once.
    locations_created = set()
    archives_created = set()
    with driver.session(DATABASE, SessionType.DATA) as session:
        for i in range(0, total, batch_size):
            batch = institutions[i:i + batch_size]
            # One write transaction per batch; committed after the loop below.
            with session.transaction(TransactionType.WRITE) as tx:
                for inst in batch:
                    try:
                        # Extract scalar fields (strings escaped for TypeQL).
                        name = escape_string(inst.get("name", "Unknown"))
                        inst_type = escape_string(inst.get("type_name", "Unknown"))
                        website = escape_string(inst.get("website", ""))
                        description = escape_string(inst.get("description", ""))
                        founding_year = inst.get("founding_year")
                        rating = inst.get("rating")
                        phone = escape_string(inst.get("phone", ""))
                        address_str = escape_string(inst.get("address", ""))
                        city = escape_string(inst.get("city", ""))
                        province = escape_string(inst.get("province", ""))
                        lat = inst.get("lat")
                        lon = inst.get("lon")
                        # GHCID is the unique key; skip entries without one
                        # (we cannot create unique references for them).
                        ghcid = ""
                        ghcid_data = inst.get("ghcid", {})
                        if ghcid_data:
                            ghcid = escape_string(ghcid_data.get("current", ""))
                        if not ghcid:
                            continue
                        # Wikidata ID (optional)
                        wikidata_id = escape_string(inst.get("wikidata_id", ""))
                        # === 1. INSERT INSTITUTION ===
                        # Optional attributes are appended only when present.
                        inst_parts = [f'insert $inst isa institution, has name "{name}"']
                        if inst_type:
                            inst_parts.append(f', has institution_type "{inst_type}"')
                        if website:
                            inst_parts.append(f', has website "{website}"')
                        if description:
                            inst_parts.append(f', has description "{description}"')
                        if founding_year:
                            inst_parts.append(f', has founding_year {int(founding_year)}')
                        if rating:
                            inst_parts.append(f', has rating {float(rating)}')
                        if phone:
                            inst_parts.append(f', has phone "{phone}"')
                        if address_str:
                            inst_parts.append(f', has address "{address_str}"')
                        inst_query = "".join(inst_parts) + ";"
                        tx.query.insert(inst_query)
                        inserted += 1
                        # === 2. INSERT LOCATION & CREATE RELATION ===
                        if lat is not None and lon is not None:
                            location_key = f"{province}:{city}" if city else f"{province}:unknown"
                            if location_key not in locations_created:
                                loc_parts = [f'insert $loc isa location']
                                loc_parts.append(f', has latitude {lat}')
                                loc_parts.append(f', has longitude {lon}')
                                loc_parts.append(', has country "NL"')
                                if city:
                                    loc_parts.append(f', has city "{city}"')
                                if province:
                                    loc_parts.append(f', has province "{province}"')
                                loc_query = "".join(loc_parts) + ";"
                                tx.query.insert(loc_query)
                                locations_created.add(location_key)
                            # Create located_at relation: match the institution
                            # by name and the location by its coordinates.
                            rel_query = f'''
                                match
                                    $inst isa institution, has name "{name}";
                                    $loc isa location, has latitude {lat}, has longitude {lon};
                                insert
                                    (subject: $inst, place: $loc) isa located_at;
                            '''
                            tx.query.insert(rel_query)
                            relations_created += 1
                        # === 3. INSERT IDENTIFIERS & CREATE RELATIONS ===
                        # GHCID identifier (ghcid is guaranteed non-empty by
                        # the `continue` guard above, so no extra check needed).
                        ghcid_query = f'''
                            insert $id isa identifier,
                                has identifier_scheme "GHCID",
                                has identifier_value "{ghcid}",
                                has identifier_url "";
                        '''
                        tx.query.insert(ghcid_query)
                        # Create has_identifier relation
                        id_rel_query = f'''
                            match
                                $inst isa institution, has name "{name}";
                                $id isa identifier, has identifier_value "{ghcid}";
                            insert
                                (subject: $inst, identifier: $id) isa has_identifier;
                        '''
                        tx.query.insert(id_rel_query)
                        relations_created += 1
                        # Wikidata identifier (optional)
                        if wikidata_id:
                            wikidata_query = f'''
                                insert $id isa identifier,
                                    has identifier_scheme "Wikidata",
                                    has identifier_value "{wikidata_id}",
                                    has identifier_url "https://www.wikidata.org/wiki/{wikidata_id}";
                            '''
                            tx.query.insert(wikidata_query)
                            # Create has_identifier relation
                            wd_rel_query = f'''
                                match
                                    $inst isa institution, has name "{name}";
                                    $id isa identifier, has identifier_value "{wikidata_id}";
                                insert
                                    (subject: $inst, identifier: $id) isa has_identifier;
                            '''
                            tx.query.insert(wd_rel_query)
                            relations_created += 1
                        # === 4. INSERT ARCHIVE AFFILIATIONS ===
                        genealogie = inst.get("genealogiewerkbalk", {})
                        if genealogie:
                            prov_archive = genealogie.get("provincial_archive", {})
                            if prov_archive:
                                archive_name = escape_string(prov_archive.get("name", ""))
                                archive_website = escape_string(prov_archive.get("website", ""))
                                if archive_name and archive_name not in archives_created:
                                    # Insert the archive entity once per name.
                                    archive_query = f'''
                                        insert $arch isa archive,
                                            has name "{archive_name}",
                                            has website "{archive_website}";
                                    '''
                                    tx.query.insert(archive_query)
                                    archives_created.add(archive_name)
                                if archive_name:
                                    # Create affiliated_with relation
                                    aff_query = f'''
                                        match
                                            $inst isa institution, has name "{name}";
                                            $arch isa archive, has name "{archive_name}";
                                        insert
                                            (member: $inst, archive: $arch) isa affiliated_with;
                                    '''
                                    tx.query.insert(aff_query)
                                    relations_created += 1
                    except Exception as e:
                        errors += 1
                        if errors <= 5:  # cap error spam on the console
                            print(f" ❌ Error inserting {inst.get('name', 'unknown')}: {e}")
                tx.commit()
            # Progress report after each committed batch.
            pct = (i + len(batch)) / total * 100
            print(f" Progress: {i + len(batch)}/{total} ({pct:.1f}%) - {inserted} institutions, {relations_created} relations, {errors} errors")
    return inserted, relations_created, errors
def verify_data(driver):
    """Print entity/relation counts and a small sample graph traversal.

    Read-only: opens a READ transaction and never mutates the database.

    Args:
        driver: an open TypeDB driver connected to the server.
    """
    print("\n🔍 Verifying loaded data...")
    with driver.session(DATABASE, SessionType.DATA) as session:
        with session.transaction(TransactionType.READ) as tx:
            # Count entities by exhausting each get-query result iterator.
            inst_count = sum(1 for _ in tx.query.get("match $x isa institution; get $x;"))
            loc_count = sum(1 for _ in tx.query.get("match $x isa location; get $x;"))
            id_count = sum(1 for _ in tx.query.get("match $x isa identifier; get $x;"))
            arch_count = sum(1 for _ in tx.query.get("match $x isa archive; get $x;"))
            # Count relations — these are the edges in the graph view.
            located_count = sum(1 for _ in tx.query.get("match $r isa located_at; get $r;"))
            has_id_count = sum(1 for _ in tx.query.get("match $r isa has_identifier; get $r;"))
            aff_count = sum(1 for _ in tx.query.get("match $r isa affiliated_with; get $r;"))
            print(f"""
📊 Data Summary:
ENTITIES:
Institutions: {inst_count}
Locations: {loc_count}
Identifiers: {id_count}
Archives: {arch_count}
RELATIONS (EDGES):
located_at: {located_count}
has_identifier: {has_id_count}
affiliated_with: {aff_count}
Total edges: {located_count + has_id_count + aff_count}
""")
            # Sample a graph query (institution joined to its location).
            print("📈 Sample graph query (institution → location):")
            sample = tx.query.get("""
match
$inst isa institution, has name $name;
(subject: $inst, place: $loc) isa located_at;
$loc has city $city, has province $prov;
get $name, $city, $prov;
limit 5;
""")
            for row in sample:
                # row.get(var) returns the bound concept or None.
                name = row.get("name")
                city = row.get("city")
                prov = row.get("prov")
                # NOTE(review): `prov` is not None-checked like name/city,
                # though the match pattern appears to always bind it — confirm.
                if name and city:
                    print(f"{name.get_value()}{city.get_value()}, {prov.get_value()}")
def main():
    """Entry point: load the NDE JSON, (re)build the TypeDB graph, verify it.

    Steps: read DATA_FILE, connect to TypeDB, create the database if it does
    not exist, define the schema, clear old data, load institutions with
    relations, then print summary counts. The driver is always closed,
    even when loading fails.
    """
    print("=" * 60)
    print("TypeDB Data Loader with Relations")
    print("=" * 60)
    print()
    # Load JSON data
    print(f"📂 Loading data from {DATA_FILE}...")
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        institutions = json.load(f)
    print(f" Found {len(institutions)} institutions")
    # Connect to TypeDB
    print(f"\n🔗 Connecting to TypeDB at {TYPEDB_HOST}:{TYPEDB_PORT}...")
    driver = TypeDB.core_driver(f"{TYPEDB_HOST}:{TYPEDB_PORT}")
    try:
        # Check database exists; create it on first run.
        db_names = [db.name for db in driver.databases.all()]
        if DATABASE not in db_names:
            print(f" Creating database '{DATABASE}'...")
            driver.databases.create(DATABASE)
        print(f" ✅ Connected to database '{DATABASE}'")
        # Define schema
        define_schema(driver)
        # Clear existing data so reruns do not duplicate entities.
        clear_data(driver)
        # Load data with relations
        print("\n📥 Loading data with relations...")
        inserted, relations, errors = load_data(driver, institutions)
        print(f"""
Loading complete!
Institutions inserted: {inserted}
Relations created: {relations}
Errors: {errors}
""")
        # Verify
        verify_data(driver)
    finally:
        # Always release the connection, even if loading failed.
        driver.close()
    print("\n" + "=" * 60)
    print("Done! Graph data ready for visualization at bronhouder.nl/database")
    print("=" * 60)


if __name__ == "__main__":
    main()