fix(schema): code-review follow-ups for fac10b29

- _v23_to_v24_finalize: wrap row-update loop in BEGIN/COMMIT/ROLLBACK
  to match the project's transactional-finalizer pattern (compare
  _v12_to_v13_finalize, _v17_to_v18_finalize, _v18_to_v19_finalize).
  Pre-fix a process crash mid-loop left the schema_version unchanged
  but partially-converted rows persisted across restart — idempotent
  overall but inconsistent with project convention.
- _v23_to_v24_finalize: re.sub replacement now uses a function-form
  (lambda) instead of an f-string, so any future project_id with a
  backslash sequence isn't misinterpreted as a group reference.
- tests: add a Keboola-source materialized row case asserting the
  SELECT's source_type filter prevents non-BQ rewrites.
This commit is contained in:
ZdenekSrotyr 2026-05-04 19:32:24 +02:00
parent fac10b29e4
commit ce108d4c6d
2 changed files with 74 additions and 20 deletions

View file

@ -1696,6 +1696,18 @@ _V22_TO_V23_MIGRATIONS = [
# migration time, log a warning per affected row and leave them — the # migration time, log a warning per affected row and leave them — the
# operator must configure data_source.bigquery.project, restart, and # operator must configure data_source.bigquery.project, restart, and
# the migration will fire on next start (idempotent). # the migration will fire on next start (idempotent).
def _replace_for_v24(project_id: str):
"""Build a re.sub replacement function (not a string) so backslash
sequences in `project_id` aren't interpreted as group references.
GCP project IDs can't actually contain backslashes, but using a
function-form replacement is the defensive idiom it makes the
intent explicit and removes the dependency on re.sub's replacement-
string escaping rules."""
def _repl(m):
return f"`{project_id}.{m.group(1)}.{m.group(2)}`"
return _repl
def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None: def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
import re as _re import re as _re
@ -1714,26 +1726,35 @@ def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
"AND source_type = 'bigquery'" "AND source_type = 'bigquery'"
).fetchall() ).fetchall()
for row_id, sq in rows: if not rows:
if sq is None: return # Nothing to migrate; skip the transaction.
continue
if not project_id: conn.execute("BEGIN TRANSACTION")
logger.warning( try:
"v24 migration: skipping rewrite of source_query for row %r" for row_id, sq in rows:
"data_source.bigquery.project is not configured. Set it via " if sq is None:
"/admin/server-config and restart the app to retry the " continue
"migration.", row_id, if not project_id:
) logger.warning(
continue "v24 migration: skipping rewrite of source_query for row %r"
new_sq = pattern.sub(rf'`{project_id}.\1.\2`', sq) "data_source.bigquery.project is not configured. Set it via "
if new_sq != sq: "/admin/server-config and restart the app to retry the "
conn.execute( "migration.", row_id,
"UPDATE table_registry SET source_query = ? WHERE id = ?", )
[new_sq, row_id], continue
) new_sq = pattern.sub(_replace_for_v24(project_id), sq)
logger.info( if new_sq != sq:
"v24 migration: rewrote source_query for row %r", row_id, conn.execute(
) "UPDATE table_registry SET source_query = ? WHERE id = ?",
[new_sq, row_id],
)
logger.info(
"v24 migration: rewrote source_query for row %r", row_id,
)
conn.execute("COMMIT")
except Exception:
conn.execute("ROLLBACK")
raise
def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:

View file

@ -124,3 +124,36 @@ def test_v24_logs_warning_when_project_not_configured(monkeypatch, caplog):
) )
finally: finally:
conn.close() conn.close()
def test_v24_keboola_materialized_row_not_rewritten(monkeypatch):
"""Materialized rows with source_type != 'bigquery' must not be touched
by v24. Keboola materialized has no notion of bq."ds"."tbl" syntax;
the SELECT's source_type filter pins this contract.
"""
with tempfile.TemporaryDirectory() as tmp:
monkeypatch.setenv("DATA_DIR", tmp)
monkeypatch.setattr(
"app.instance_config.get_value",
lambda *args, **kw: "prj-data" if args == ("data_source", "bigquery", "project") else kw.get("default"),
)
Path(tmp, "state").mkdir(parents=True, exist_ok=True)
db_path = Path(tmp, "state", "system.duckdb")
conn = duckdb.connect(str(db_path))
try:
_seed_v23(conn)
# Keboola row that happens to contain `bq."..."` in its SQL
# (admin error or copy-paste from a BQ row). Migration must
# leave it alone — this is not the v24 contract.
conn.execute(
'INSERT INTO table_registry VALUES (?, ?, ?, ?, ?, ?, ?)',
["kb1", "kb1", "keboola", "materialized", "ds", "tbl",
'SELECT * FROM bq."ds"."tbl"'],
)
_ensure_schema(conn)
row = conn.execute(
"SELECT source_query FROM table_registry WHERE id='kb1'"
).fetchone()
assert row[0] == 'SELECT * FROM bq."ds"."tbl"'
finally:
conn.close()