diff --git a/CHANGELOG.md b/CHANGELOG.md index 395a578..e8f64bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +### Fixed + +- **`v34→v35` migration is now idempotent under partial-rebuild recovery.** The original list-form `_V34_TO_V35_MIGRATIONS` ran four ALTER statements in sequence: `ADD _vis_v35` → `UPDATE _vis_v35 = visibility_status` → `DROP visibility_status` → `RENAME _vis_v35 TO visibility_status`. If the RENAME failed for any reason after the DROP succeeded (DuckDB lock contention at startup, scheduler-vs-app race opening `system.duckdb`, container kill mid-migration, …), the DB was stranded with `_vis_v35` populated and `visibility_status` missing — and `schema_version` never bumped because the UPDATE at the bottom of the migration ladder only runs when *every* step succeeds. Subsequent restarts then hit `DROP visibility_status` again with no `IF EXISTS` guard and looped on the same error; the only recovery was hand-editing the DB. The migration is rewritten as a Python function `_v34_to_v35_migrate` that inspects the table's columns up front and dispatches into one of three paths: clean v34 (run the full rebuild), partial v35 with `_vis_v35` only (finish the RENAME alone), or both columns present (drop the temp). The audit columns (`archived_at`, `archived_by`) ship first behind `IF NOT EXISTS` so they're safe in all states. Operators stranded by the original bug recover automatically on next startup. Tests cover the three direct paths plus an end-to-end scenario where `_ensure_schema` walks a `schema_version=32` DB with the half-applied state up through to v36. + ### Security - **Prompt-injection hardening for store guardrails LLM review (#1).** diff --git a/src/db.py b/src/db.py index 1dbc279..0c0eac3 100644 --- a/src/db.py +++ b/src/db.py @@ -2261,21 +2261,93 @@ _V32_TO_V33_MIGRATIONS = [ # Two-step: first add the new audit columns (always safe); then # rebuild visibility_status without the CHECK so 'archived' becomes a # valid value. -_V34_TO_V35_MIGRATIONS = [ - "ALTER TABLE store_entities ADD COLUMN IF NOT EXISTS archived_at TIMESTAMP", - "ALTER TABLE store_entities ADD COLUMN IF NOT EXISTS archived_by VARCHAR", - # Rebuild visibility_status to drop the legacy CHECK (which forbade - # 'archived'). DuckDB lacks DROP CONSTRAINT for table-level CHECKs, - # so we copy → drop → rename. NOT NULL + DEFAULT are re-applied in - # v36 (DuckDB ALTER COLUMN supports SET NOT NULL / SET DEFAULT but - # not ADD CHECK on existing columns). The repo always writes a - # value; value-list enforcement is application-side via the - # VALID_VISIBILITY whitelist in StoreEntitiesRepository. - "ALTER TABLE store_entities ADD COLUMN IF NOT EXISTS _vis_v35 VARCHAR", - "UPDATE store_entities SET _vis_v35 = visibility_status WHERE _vis_v35 IS NULL", - "ALTER TABLE store_entities DROP COLUMN visibility_status", - "ALTER TABLE store_entities RENAME COLUMN _vis_v35 TO visibility_status", -] +def _v34_to_v35_migrate(conn: duckdb.DuckDBPyConnection) -> None: + """Add the ``archived`` visibility state + audit columns to ``store_entities``. + + Replaces the old list-form ``_V34_TO_V35_MIGRATIONS`` so the migration is + safe to re-run after a partial failure. The original sequence was + + ADD _vis_v35 → UPDATE _vis_v35 = visibility_status → + DROP visibility_status → RENAME _vis_v35 TO visibility_status + + which left a half-rebuilt DB stranded if step 4 (RENAME) failed after + step 3 (DROP) succeeded: ``visibility_status`` was gone, ``_vis_v35`` + held the values, and ``schema_version`` never got bumped because the + UPDATE at the bottom of the migration ladder never ran. Restarting + the binary then hit step 3 again with no IF EXISTS guard and looped + on the same DROP error. + + The new implementation inspects ``store_entities``'s columns up front + and picks the right recovery path: + + * **clean v34 shape** (``visibility_status`` present, ``_vis_v35`` + absent) — full rebuild via copy → drop → rename, as before + * **partial v35** (``_vis_v35`` present, ``visibility_status`` absent) + — rebuild aborted mid-way; finish the RENAME only + * **both columns present** (rare; aborted rebuild that didn't reach + the DROP) — drop the temp ``_vis_v35`` and keep ``visibility_status`` + + The audit columns (``archived_at``, ``archived_by``) ship first + behind ``IF NOT EXISTS`` so they're safe in all three states. + """ + conn.execute( + "ALTER TABLE store_entities ADD COLUMN IF NOT EXISTS archived_at TIMESTAMP" + ) + conn.execute( + "ALTER TABLE store_entities ADD COLUMN IF NOT EXISTS archived_by VARCHAR" + ) + + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'store_entities' " + " AND column_name IN ('visibility_status', '_vis_v35')" + ).fetchall() + } + has_vis = "visibility_status" in cols + has_temp = "_vis_v35" in cols + + if has_vis and not has_temp: + # Clean v34 shape — full rebuild. NOT NULL + DEFAULT are re-applied + # in v36 (DuckDB ALTER COLUMN supports SET NOT NULL / SET DEFAULT + # but not ADD CHECK on an existing column). Value-list enforcement + # is application-side via VALID_VISIBILITY in StoreEntitiesRepository. + conn.execute( + "ALTER TABLE store_entities ADD COLUMN _vis_v35 VARCHAR" + ) + conn.execute( + "UPDATE store_entities SET _vis_v35 = visibility_status" + ) + conn.execute( + "ALTER TABLE store_entities DROP COLUMN visibility_status" + ) + conn.execute( + "ALTER TABLE store_entities RENAME COLUMN _vis_v35 TO visibility_status" + ) + elif has_temp and not has_vis: + # Partial-rebuild recovery — prior attempt dropped visibility_status + # but the RENAME never landed. Data is already in _vis_v35 from + # the prior UPDATE; finish the rename. + logger.warning( + "v34→v35 detected partial-rebuild state (visibility_status " + "missing, _vis_v35 present); recovering via RENAME" + ) + conn.execute( + "ALTER TABLE store_entities RENAME COLUMN _vis_v35 TO visibility_status" + ) + elif has_vis and has_temp: + # Both present — earlier rebuild aborted before the DROP. + # visibility_status holds the canonical values; drop the temp. + logger.warning( + "v34→v35 detected partial-rebuild state (both visibility_status " + "and _vis_v35 present); dropping the temp" + ) + conn.execute( + "ALTER TABLE store_entities DROP COLUMN _vis_v35" + ) + # else: neither column is present, which means store_entities itself + # is at a shape ahead of v34. _SYSTEM_SCHEMA above already created + # the post-v35 shape; nothing to do here. # v35→v36: re-apply NOT NULL + DEFAULT 'pending' on @@ -2661,8 +2733,7 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: for sql in _V33_TO_V34_MIGRATIONS: conn.execute(sql) if current < 35: - for sql in _V34_TO_V35_MIGRATIONS: - conn.execute(sql) + _v34_to_v35_migrate(conn) if current < 36: for sql in _V35_TO_V36_MIGRATIONS: conn.execute(sql) diff --git a/tests/test_db_schema_version.py b/tests/test_db_schema_version.py index a11afaa..197522b 100644 --- a/tests/test_db_schema_version.py +++ b/tests/test_db_schema_version.py @@ -145,6 +145,205 @@ def test_v19_db_migrates_to_v20(tmp_path): conn.close() +def _make_v34_store_entities(conn): + """Build a minimal v34-shape store_entities table for v34→v35 path tests. + + Only includes the columns the v34→v35 migration touches; the rest of + the schema isn't needed because the function operates only on + store_entities's column set. + """ + conn.execute(""" + CREATE TABLE store_entities ( + id VARCHAR PRIMARY KEY, + visibility_status VARCHAR DEFAULT 'pending' + ) + """) + conn.execute( + "INSERT INTO store_entities (id, visibility_status) VALUES " + "('a', 'approved'), ('b', 'pending'), ('c', 'hidden')" + ) + + +def test_v34_to_v35_clean_path_rebuilds_visibility_column(tmp_path): + """Standard v34 → v35 path: ``visibility_status`` is present, no temp + column. Migration rebuilds the column without the legacy CHECK so + 'archived' becomes a valid value, preserves all row values, and adds + the audit columns. + """ + from src.db import _v34_to_v35_migrate + + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _make_v34_store_entities(conn) + + _v34_to_v35_migrate(conn) + + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'store_entities'" + ).fetchall() + } + assert "visibility_status" in cols + assert "_vis_v35" not in cols, "temp column must be cleaned up" + assert "archived_at" in cols + assert "archived_by" in cols + + rows = dict(conn.execute( + "SELECT id, visibility_status FROM store_entities ORDER BY id" + ).fetchall()) + assert rows == {"a": "approved", "b": "pending", "c": "hidden"}, ( + f"row values must survive the rebuild: {rows}" + ) + conn.close() + + +def test_v34_to_v35_recovers_from_partial_rebuild_missing_visibility(tmp_path): + """Partial-rebuild recovery: a previous migration attempt completed + steps 3-5 (added _vis_v35, copied values, dropped visibility_status) + but failed before step 6 (RENAME). Subsequent restarts hit + DROP visibility_status (no IF EXISTS guard) and looped on the same + error, leaving the DB stranded with schema_version stuck pre-v35. + + The new code detects this state — _vis_v35 present, visibility_status + absent — and finishes the rebuild with the RENAME alone instead of + re-running the full destructive sequence. + """ + from src.db import _v34_to_v35_migrate + + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + # Hand-build the broken state: store_entities with _vis_v35 instead of + # visibility_status, populated with the canonical values. + conn.execute(""" + CREATE TABLE store_entities ( + id VARCHAR PRIMARY KEY, + _vis_v35 VARCHAR + ) + """) + conn.execute( + "INSERT INTO store_entities (id, _vis_v35) VALUES " + "('a', 'approved'), ('b', 'pending'), ('c', 'hidden')" + ) + + _v34_to_v35_migrate(conn) + + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'store_entities'" + ).fetchall() + } + assert "visibility_status" in cols + assert "_vis_v35" not in cols + assert "archived_at" in cols + assert "archived_by" in cols + + rows = dict(conn.execute( + "SELECT id, visibility_status FROM store_entities ORDER BY id" + ).fetchall()) + assert rows == {"a": "approved", "b": "pending", "c": "hidden"}, ( + f"row values must come back via RENAME, not be lost: {rows}" + ) + conn.close() + + +def test_v34_to_v35_recovers_from_partial_rebuild_both_columns(tmp_path): + """Edge state: a prior attempt aborted before the DROP, leaving both + visibility_status (canonical) and _vis_v35 (temp) on the table. + The recovery path drops _vis_v35 and keeps visibility_status — the + rest of the schema expects that name. + """ + from src.db import _v34_to_v35_migrate + + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + conn.execute(""" + CREATE TABLE store_entities ( + id VARCHAR PRIMARY KEY, + visibility_status VARCHAR, + _vis_v35 VARCHAR + ) + """) + conn.execute( + "INSERT INTO store_entities (id, visibility_status, _vis_v35) VALUES " + "('a', 'approved', 'approved')" + ) + + _v34_to_v35_migrate(conn) + + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'store_entities'" + ).fetchall() + } + assert "visibility_status" in cols + assert "_vis_v35" not in cols, "temp column must be dropped" + + row = conn.execute( + "SELECT id, visibility_status FROM store_entities WHERE id = 'a'" + ).fetchone() + assert row == ("a", "approved") + conn.close() + + +def test_v32_db_with_partial_v35_recovers_through_full_ladder(tmp_path): + """End-to-end: a DB stranded at schema_version=32 with the half-applied + v34→v35 state (visibility_status dropped, _vis_v35 left behind) must + upgrade cleanly through the full ladder when ``_ensure_schema`` runs. + + This is the production scenario observed in operator instances after + the original list-form ``_V34_TO_V35_MIGRATIONS`` failed mid-run on + a fresh restart. + """ + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + + # Stand up the broken state. We only need enough of the schema for the + # migration ladder to run — ``_ensure_schema`` will create the rest + # via ``_SYSTEM_SCHEMA``'s IF NOT EXISTS guards. + conn.execute( + "CREATE TABLE schema_version (version INTEGER, " + "applied_at TIMESTAMP DEFAULT current_timestamp)" + ) + conn.execute("INSERT INTO schema_version (version) VALUES (32)") + conn.execute(""" + CREATE TABLE store_entities ( + id VARCHAR PRIMARY KEY, + owner_user_id VARCHAR, + owner_username VARCHAR, + type VARCHAR, + name VARCHAR, + archived_at TIMESTAMP, + archived_by VARCHAR, + _vis_v35 VARCHAR + ) + """) + conn.execute( + "INSERT INTO store_entities (id, type, name, _vis_v35) " + "VALUES ('a', 'skill', 'alpha', 'approved')" + ) + + _ensure_schema(conn) + + assert get_schema_version(conn) == SCHEMA_VERSION + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'store_entities'" + ).fetchall() + } + assert "visibility_status" in cols + assert "_vis_v35" not in cols + # Existing row preserved, value carried over from _vis_v35. + row = conn.execute( + "SELECT id, visibility_status FROM store_entities WHERE id = 'a'" + ).fetchone() + assert row == ("a", "approved") + conn.close() + + def test_v35_to_v36_reapplies_visibility_constraints(tmp_path): """v34→v35 dropped NOT NULL + DEFAULT when rebuilding the column to drop the legacy CHECK; v35→v36 re-applies them. Verifies that on a