diff --git a/scripts/migrate_web_archives.py b/scripts/migrate_web_archives.py index 264f2ef923..862fe2f2fc 100644 --- a/scripts/migrate_web_archives.py +++ b/scripts/migrate_web_archives.py @@ -168,9 +168,14 @@ def build_ducklake_database(mapping: Dict[int, str]): con = duckdb.connect(str(DUCKLAKE_DB)) + # Drop and recreate tables to ensure schema is up to date + con.execute("DROP TABLE IF EXISTS web_claims") + con.execute("DROP TABLE IF EXISTS web_pages") + con.execute("DROP TABLE IF EXISTS web_archives") + # Create tables con.execute(""" - CREATE TABLE IF NOT EXISTS web_archives ( + CREATE TABLE web_archives ( ghcid VARCHAR PRIMARY KEY, entry_index INTEGER, domain VARCHAR, @@ -186,7 +191,7 @@ def build_ducklake_database(mapping: Dict[int, str]): """) con.execute(""" - CREATE TABLE IF NOT EXISTS web_pages ( + CREATE TABLE web_pages ( id INTEGER PRIMARY KEY, ghcid VARCHAR, page_title VARCHAR, @@ -198,7 +203,7 @@ def build_ducklake_database(mapping: Dict[int, str]): """) con.execute(""" - CREATE TABLE IF NOT EXISTS web_claims ( + CREATE TABLE web_claims ( id INTEGER PRIMARY KEY, ghcid VARCHAR, claim_id VARCHAR, @@ -217,9 +222,9 @@ def build_ducklake_database(mapping: Dict[int, str]): """) # Clear existing data - con.execute("DELETE FROM web_claims") - con.execute("DELETE FROM web_pages") - con.execute("DELETE FROM web_archives") + con.execute("-- Removed: DELETE FROM web_claims") + con.execute("-- Removed: DELETE FROM web_pages") + con.execute("-- Removed: DELETE FROM web_archives") page_id = 0 claim_id_counter = 0