fix(schema): code-review follow-ups for fac10b29

- _v23_to_v24_finalize: wrap row-update loop in BEGIN/COMMIT/ROLLBACK to match the project's transactional-finalizer pattern (compare _v12_to_v13_finalize, _v17_to_v18_finalize, _v18_to_v19_finalize). Pre-fix a process crash mid-loop left the schema_version unchanged but partially-converted rows persisted across restart — idempotent overall but inconsistent with project convention. - _v23_to_v24_finalize: re.sub replacement now uses a function-form (lambda) instead of an f-string, so any future project_id with a backslash sequence isn't misinterpreted as a group reference. - tests: add a Keboola-source materialized row case asserting the SELECT's source_type filter prevents non-BQ rewrites.
2026-05-04 19:32:24 +02:00 · 2026-05-04 19:32:24 +02:00 · ce108d4c6d
commit ce108d4c6d
parent fac10b29e4
2 changed files with 74 additions and 20 deletions
--- a/src/db.py
+++ b/src/db.py
@ -1696,6 +1696,18 @@ _V22_TO_V23_MIGRATIONS = [
 # migration time, log a warning per affected row and leave them — the
 # operator must configure data_source.bigquery.project, restart, and
 # the migration will fire on next start (idempotent).
 def _replace_for_v24(project_id: str):
    """Build a re.sub replacement function (not a string) so backslash
    sequences in `project_id` aren't interpreted as group references.
    GCP project IDs can't actually contain backslashes, but using a
    function-form replacement is the defensive idiom — it makes the
    intent explicit and removes the dependency on re.sub's replacement-
    string escaping rules."""
    def _repl(m):
        return f"`{project_id}.{m.group(1)}.{m.group(2)}`"
    return _repl
 def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
    import re as _re
@ -1714,26 +1726,35 @@ def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
        "AND source_type = 'bigquery'"
    ).fetchall()
-    for row_id, sq in rows:
+    if not rows:
-        if sq is None:
+        return  # Nothing to migrate; skip the transaction.
-            continue
+
-        if not project_id:
+    conn.execute("BEGIN TRANSACTION")
-            logger.warning(
+    try:
-                "v24 migration: skipping rewrite of source_query for row %r — "
+        for row_id, sq in rows:
-                "data_source.bigquery.project is not configured. Set it via "
+            if sq is None:
-                "/admin/server-config and restart the app to retry the "
+                continue
-                "migration.", row_id,
+            if not project_id:
-            )
+                logger.warning(
-            continue
+                    "v24 migration: skipping rewrite of source_query for row %r — "
-        new_sq = pattern.sub(rf'`{project_id}.\1.\2`', sq)
+                    "data_source.bigquery.project is not configured. Set it via "
-        if new_sq != sq:
+                    "/admin/server-config and restart the app to retry the "
-            conn.execute(
+                    "migration.", row_id,
-                "UPDATE table_registry SET source_query = ? WHERE id = ?",
+                )
-                [new_sq, row_id],
+                continue
-            )
+            new_sq = pattern.sub(_replace_for_v24(project_id), sq)
-            logger.info(
+            if new_sq != sq:
-                "v24 migration: rewrote source_query for row %r", row_id,
+                conn.execute(
-            )
+                    "UPDATE table_registry SET source_query = ? WHERE id = ?",
                    [new_sq, row_id],
                )
                logger.info(
                    "v24 migration: rewrote source_query for row %r", row_id,
                )
        conn.execute("COMMIT")
    except Exception:
        conn.execute("ROLLBACK")
        raise
 def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
--- a/tests/test_schema_v24_source_query_rewrite.py
+++ b/tests/test_schema_v24_source_query_rewrite.py
@ -124,3 +124,36 @@ def test_v24_logs_warning_when_project_not_configured(monkeypatch, caplog):
            )
        finally:
            conn.close()
 def test_v24_keboola_materialized_row_not_rewritten(monkeypatch):
    """Materialized rows with source_type != 'bigquery' must not be touched
    by v24. Keboola materialized has no notion of bq."ds"."tbl" syntax;
    the SELECT's source_type filter pins this contract.
    """
    with tempfile.TemporaryDirectory() as tmp:
        monkeypatch.setenv("DATA_DIR", tmp)
        monkeypatch.setattr(
            "app.instance_config.get_value",
            lambda *args, **kw: "prj-data" if args == ("data_source", "bigquery", "project") else kw.get("default"),
        )
        Path(tmp, "state").mkdir(parents=True, exist_ok=True)
        db_path = Path(tmp, "state", "system.duckdb")
        conn = duckdb.connect(str(db_path))
        try:
            _seed_v23(conn)
            # Keboola row that happens to contain `bq."..."` in its SQL
            # (admin error or copy-paste from a BQ row). Migration must
            # leave it alone — this is not the v24 contract.
            conn.execute(
                'INSERT INTO table_registry VALUES (?, ?, ?, ?, ?, ?, ?)',
                ["kb1", "kb1", "keboola", "materialized", "ds", "tbl",
                 'SELECT * FROM bq."ds"."tbl"'],
            )
            _ensure_schema(conn)
            row = conn.execute(
                "SELECT source_query FROM table_registry WHERE id='kb1'"
            ).fetchone()
            assert row[0] == 'SELECT * FROM bq."ds"."tbl"'
        finally:
            conn.close()