The original list-form _V34_TO_V35_MIGRATIONS ran four ALTER
statements in sequence:
ADD _vis_v35 → UPDATE _vis_v35 = visibility_status →
DROP visibility_status → RENAME _vis_v35 TO visibility_status
If the RENAME failed for any reason after the DROP succeeded — DuckDB
lock contention at startup, scheduler-vs-app race opening
system.duckdb, container kill mid-migration, etc. — the DB was
stranded with _vis_v35 populated and visibility_status missing. The
schema_version row never bumped because the UPDATE at the bottom of
the migration ladder runs only when every step succeeded. Subsequent
restarts then hit DROP visibility_status again with no IF EXISTS
guard and looped on the same error; the only recovery was
hand-editing the DB.
Replace the list with a Python function _v34_to_v35_migrate that
inspects the table's columns up front and dispatches into one of
three paths:
* clean v34 (visibility_status present, _vis_v35 absent) — run the
  full rebuild
* partial v35 (_vis_v35 present, visibility_status absent) — finish
  the RENAME alone; the data is already in _vis_v35 from the prior
  UPDATE
* both columns present (rare; aborted before DROP) — drop the temp
  column and keep visibility_status
The audit columns (archived_at, archived_by) ship first behind
IF NOT EXISTS so they're safe in all states. Operators stranded by
the original bug now recover automatically on next startup.
Tests cover the three direct paths plus an end-to-end scenario where
_ensure_schema walks a schema_version=32 DB with the half-applied
state up through to v36.
Co-authored-by: Minas Arustamyan <arustamyan.minas@gmail.com>
375 lines
14 KiB
Python
375 lines
14 KiB
Python
"""v20 adds source_query column to table_registry.

Backs query_mode='materialized' for BigQuery: admin registers a SQL body
that the scheduler runs through the DuckDB BQ extension and writes as a
parquet to /data/extracts/bigquery/data/<id>.parquet.

The v19 step (#150) drops dataset_permissions, access_requests tables and
users.role, table_registry.is_public columns; v20 then ALTERs the post-v19
table_registry to add the source_query column.

This module also covers later ladder steps: the v34→v35 store_entities
visibility_status rebuild (including recovery from half-applied states) and
the v35→v36 re-application of NOT NULL + DEFAULT on that column.
"""
|
|
import duckdb
|
|
|
|
from src.db import SCHEMA_VERSION, _ensure_schema, get_schema_version
|
|
|
|
|
|
def test_schema_version_is_36():
    """Pin ``SCHEMA_VERSION`` to 36; the comments below log the recent steps."""
    # v27 → v28
    #   Explicit-install (Model B) for curated marketplace plugins.
    #   user_plugin_optouts row presence flips meaning from "excluded" to
    #   "subscribed"; the migration wipes existing rows so the inverted
    #   reading starts from a clean baseline. Also adds
    #   marketplace_plugins.created_at (per-plugin "newest first" sort on
    #   /marketplace), backfilled from the parent
    #   marketplace_registry.registered_at.
    #
    # v28 → v29
    #   /home page rollout — instance_templates singleton consolidation
    #   (welcome_template + claude_md_template merged) + new users.onboarded
    #   column. See tests/test_v29_home_migration.py for the exhaustive
    #   coverage of that step.
    #
    # v29 → v30
    #   news_template — single versioned table for the /home news perex +
    #   /news permalink page. See tests/test_news_template_repository.py.
    #
    # v30 → v31
    #   Session-pipeline framework — session_processor_state replaces
    #   session_extraction_state with a composite PK.
    #
    # v31 → v32
    #   Flea-market upload guardrails — adds store_entities.visibility_status
    #   and creates store_submissions.
    #
    # v32 → v33
    #   Forensic columns on store_submissions — file_size, bundle_sha256,
    #   bundle_purged_at. Underpins the persist-blocked-bundle behavior so
    #   admins can Rescan / Override / Download; a 30-day TTL purge clears
    #   the bytes while keeping the row + sha intact.
    #   See docs/STORE_GUARDRAILS.md.
    #
    # v33 → v34
    #   Drop store_submissions.retry_count — the counter mixed LLM error
    #   count + admin rescan count and was redundant with audit_log.
    #
    # v34 → v35
    #   store_entities gains the 'archived' visibility state plus the
    #   archived_at + archived_by audit columns. Owner soft-delete writes
    #   'archived'; existing user_store_installs keep serving the bundle
    #   through marketplace.zip / .git. Hard delete (DELETE ?hard=true)
    #   remains admin-only.
    #
    # v35 → v36 (PR #233 follow-up)
    #   Re-apply NOT NULL + DEFAULT 'pending' on
    #   store_entities.visibility_status, lost in the v34→v35 column
    #   rebuild. Without this, an INSERT that omits the column lands NULL →
    #   repo reads None → undefined behavior in the visibility gates. The
    #   value-list invariant remains enforced application-side (DuckDB
    #   ADD CHECK on an existing column is not supported).
    expected_version = 36
    assert SCHEMA_VERSION == expected_version
|
|
|
|
|
|
def test_v20_adds_source_query(tmp_path):
    """A DB built from scratch by ``_ensure_schema`` has ``source_query``.

    Checks that ``table_registry`` exposes the ``source_query`` column and
    that the stored schema version equals ``SCHEMA_VERSION``.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        _ensure_schema(conn)

        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'table_registry'"
            ).fetchall()
        }
        assert "source_query" in cols, f"source_query missing from {cols}"
        assert get_schema_version(conn) == SCHEMA_VERSION
    finally:
        conn.close()
|
|
|
|
|
|
def test_claude_md_template_seeded_in_instance_templates(tmp_path):
    """v23 introduced claude_md_template as a singleton table; v28 consolidates
    it into instance_templates keyed 'claude_md'. Post-v28 the legacy table is
    dropped — the canonical lookup is `instance_templates WHERE key='claude_md'`.

    See tests/test_v28_migration.py for the migration path coverage. This test
    just verifies the seeded row is present on a fresh install.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        _ensure_schema(conn)

        tables = {
            r[0] for r in conn.execute(
                "SELECT table_name FROM information_schema.tables "
                "WHERE table_schema = 'main'"
            ).fetchall()
        }
        assert "instance_templates" in tables
        assert "claude_md_template" not in tables, (
            "claude_md_template should be consolidated away post-v28"
        )

        row = conn.execute(
            "SELECT key, content FROM instance_templates WHERE key = 'claude_md'"
        ).fetchone()
        assert row is not None
        assert row[0] == "claude_md"
        assert row[1] is None  # default = no override
    finally:
        conn.close()
|
|
|
|
|
|
def test_v19_db_migrates_to_v20(tmp_path):
    """Pre-existing v19 DB (post-RBAC-drop) without source_query upgrades
    cleanly without losing data.

    The DB is hand-built at schema_version=19 with one ``table_registry``
    row; after ``_ensure_schema`` the version must equal ``SCHEMA_VERSION``,
    the ``source_query`` column must exist, and the row must still be there
    with ``source_query`` NULL.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        # Simulate a v19 DB at minimal but realistic shape: schema_version
        # row + a table_registry row in the post-v19 column shape (no
        # is_public column, since v19 finalize dropped it via the
        # table-rebuild idiom).
        conn.execute(
            "CREATE TABLE schema_version (version INTEGER, "
            "applied_at TIMESTAMP DEFAULT current_timestamp)"
        )
        conn.execute("INSERT INTO schema_version (version) VALUES (19)")
        conn.execute("""CREATE TABLE table_registry (
            id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL,
            source_type VARCHAR, bucket VARCHAR, source_table VARCHAR,
            sync_strategy VARCHAR DEFAULT 'full_refresh',
            query_mode VARCHAR DEFAULT 'local',
            sync_schedule VARCHAR, profile_after_sync BOOLEAN DEFAULT true,
            primary_key VARCHAR, folder VARCHAR, description TEXT,
            registered_by VARCHAR,
            registered_at TIMESTAMP DEFAULT current_timestamp
        )""")
        conn.execute("INSERT INTO table_registry (id, name) VALUES ('foo', 'foo')")

        _ensure_schema(conn)

        # Walked forward from 19 to the current SCHEMA_VERSION.
        assert get_schema_version(conn) == SCHEMA_VERSION
        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'table_registry'"
            ).fetchall()
        }
        assert "source_query" in cols
        # Existing row preserved, new column NULL
        row = conn.execute(
            "SELECT id, source_query FROM table_registry WHERE id='foo'"
        ).fetchone()
        assert row == ("foo", None)
    finally:
        conn.close()
|
|
|
|
|
|
def _make_v34_store_entities(conn):
|
|
"""Build a minimal v34-shape store_entities table for v34→v35 path tests.
|
|
|
|
Only includes the columns the v34→v35 migration touches; the rest of
|
|
the schema isn't needed because the function operates only on
|
|
store_entities's column set.
|
|
"""
|
|
conn.execute("""
|
|
CREATE TABLE store_entities (
|
|
id VARCHAR PRIMARY KEY,
|
|
visibility_status VARCHAR DEFAULT 'pending'
|
|
)
|
|
""")
|
|
conn.execute(
|
|
"INSERT INTO store_entities (id, visibility_status) VALUES "
|
|
"('a', 'approved'), ('b', 'pending'), ('c', 'hidden')"
|
|
)
|
|
|
|
|
|
def test_v34_to_v35_clean_path_rebuilds_visibility_column(tmp_path):
    """Standard v34 → v35 path: ``visibility_status`` is present, no temp
    column. Migration rebuilds the column without the legacy CHECK so
    'archived' becomes a valid value, preserves all row values, and adds
    the audit columns.
    """
    from src.db import _v34_to_v35_migrate

    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        _make_v34_store_entities(conn)

        _v34_to_v35_migrate(conn)

        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'store_entities'"
            ).fetchall()
        }
        assert "visibility_status" in cols
        assert "_vis_v35" not in cols, "temp column must be cleaned up"
        assert "archived_at" in cols
        assert "archived_by" in cols

        rows = dict(conn.execute(
            "SELECT id, visibility_status FROM store_entities ORDER BY id"
        ).fetchall())
        assert rows == {"a": "approved", "b": "pending", "c": "hidden"}, (
            f"row values must survive the rebuild: {rows}"
        )
    finally:
        conn.close()
|
|
|
|
|
|
def test_v34_to_v35_recovers_from_partial_rebuild_missing_visibility(tmp_path):
    """Partial-rebuild recovery: a previous migration attempt completed
    steps 3-5 (added _vis_v35, copied values, dropped visibility_status)
    but failed before step 6 (RENAME). Subsequent restarts hit
    DROP visibility_status (no IF EXISTS guard) and looped on the same
    error, leaving the DB stranded with schema_version stuck pre-v35.

    The new code detects this state — _vis_v35 present, visibility_status
    absent — and finishes the rebuild with the RENAME alone instead of
    re-running the full destructive sequence.
    """
    from src.db import _v34_to_v35_migrate

    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        # Hand-build the broken state: store_entities with _vis_v35 instead
        # of visibility_status, populated with the canonical values.
        conn.execute("""
            CREATE TABLE store_entities (
                id VARCHAR PRIMARY KEY,
                _vis_v35 VARCHAR
            )
        """)
        conn.execute(
            "INSERT INTO store_entities (id, _vis_v35) VALUES "
            "('a', 'approved'), ('b', 'pending'), ('c', 'hidden')"
        )

        _v34_to_v35_migrate(conn)

        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'store_entities'"
            ).fetchall()
        }
        assert "visibility_status" in cols
        assert "_vis_v35" not in cols
        assert "archived_at" in cols
        assert "archived_by" in cols

        rows = dict(conn.execute(
            "SELECT id, visibility_status FROM store_entities ORDER BY id"
        ).fetchall())
        assert rows == {"a": "approved", "b": "pending", "c": "hidden"}, (
            f"row values must come back via RENAME, not be lost: {rows}"
        )
    finally:
        conn.close()
|
|
|
|
|
|
def test_v34_to_v35_recovers_from_partial_rebuild_both_columns(tmp_path):
    """Edge state: a prior attempt aborted before the DROP, leaving both
    visibility_status (canonical) and _vis_v35 (temp) on the table.
    The recovery path drops _vis_v35 and keeps visibility_status — the
    rest of the schema expects that name.
    """
    from src.db import _v34_to_v35_migrate

    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        conn.execute("""
            CREATE TABLE store_entities (
                id VARCHAR PRIMARY KEY,
                visibility_status VARCHAR,
                _vis_v35 VARCHAR
            )
        """)
        conn.execute(
            "INSERT INTO store_entities (id, visibility_status, _vis_v35) VALUES "
            "('a', 'approved', 'approved')"
        )

        _v34_to_v35_migrate(conn)

        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'store_entities'"
            ).fetchall()
        }
        assert "visibility_status" in cols
        assert "_vis_v35" not in cols, "temp column must be dropped"

        row = conn.execute(
            "SELECT id, visibility_status FROM store_entities WHERE id = 'a'"
        ).fetchone()
        assert row == ("a", "approved")
    finally:
        conn.close()
|
|
|
|
|
|
def test_v32_db_with_partial_v35_recovers_through_full_ladder(tmp_path):
    """End-to-end: a DB stranded at schema_version=32 with the half-applied
    v34→v35 state (visibility_status dropped, _vis_v35 left behind) must
    upgrade cleanly through the full ladder when ``_ensure_schema`` runs.

    This is the production scenario observed in operator instances after
    the original list-form ``_V34_TO_V35_MIGRATIONS`` failed mid-run on
    a fresh restart.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        # Stand up the broken state. We only need enough of the schema for
        # the migration ladder to run — ``_ensure_schema`` will create the
        # rest via ``_SYSTEM_SCHEMA``'s IF NOT EXISTS guards.
        conn.execute(
            "CREATE TABLE schema_version (version INTEGER, "
            "applied_at TIMESTAMP DEFAULT current_timestamp)"
        )
        conn.execute("INSERT INTO schema_version (version) VALUES (32)")
        conn.execute("""
            CREATE TABLE store_entities (
                id VARCHAR PRIMARY KEY,
                owner_user_id VARCHAR,
                owner_username VARCHAR,
                type VARCHAR,
                name VARCHAR,
                archived_at TIMESTAMP,
                archived_by VARCHAR,
                _vis_v35 VARCHAR
            )
        """)
        conn.execute(
            "INSERT INTO store_entities (id, type, name, _vis_v35) "
            "VALUES ('a', 'skill', 'alpha', 'approved')"
        )

        _ensure_schema(conn)

        assert get_schema_version(conn) == SCHEMA_VERSION
        cols = {
            r[0] for r in conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = 'store_entities'"
            ).fetchall()
        }
        assert "visibility_status" in cols
        assert "_vis_v35" not in cols
        # Existing row preserved, value carried over from _vis_v35.
        row = conn.execute(
            "SELECT id, visibility_status FROM store_entities WHERE id = 'a'"
        ).fetchone()
        assert row == ("a", "approved")
    finally:
        conn.close()
|
|
|
|
|
|
def test_v35_to_v36_reapplies_visibility_constraints(tmp_path):
    """v34→v35 dropped NOT NULL + DEFAULT when rebuilding the column to
    drop the legacy CHECK; v35→v36 re-applies them.

    The goal is that an INSERT omitting visibility_status either inherits
    the default 'pending' or fails — never lands NULL. This test checks
    that through column metadata (``is_nullable`` and ``column_default``
    in information_schema) on a DB brought up by ``_ensure_schema``; it
    does not perform an INSERT itself.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # try/finally: close the connection even when an assertion fails.
    try:
        _ensure_schema(conn)
        assert get_schema_version(conn) == SCHEMA_VERSION

        cols = conn.execute(
            "SELECT column_name, is_nullable, column_default "
            "FROM information_schema.columns "
            "WHERE table_name = 'store_entities' "
            "  AND column_name = 'visibility_status'"
        ).fetchall()
        assert cols, "visibility_status column missing from store_entities"
        # The column name is fixed by the WHERE clause, so it is not needed.
        _, is_nullable, default_expr = cols[0]
        assert is_nullable == "NO", (
            f"visibility_status must be NOT NULL after v36; got is_nullable={is_nullable!r}"
        )
        # DuckDB renders the default as a quoted literal — match either form.
        assert default_expr is not None, "visibility_status DEFAULT must be set"
        assert "pending" in str(default_expr).lower(), (
            f"visibility_status DEFAULT must be 'pending'; got {default_expr!r}"
        )
    finally:
        conn.close()
|