fix(schema): code-review follow-ups for fac10b29
- _v23_to_v24_finalize: wrap row-update loop in BEGIN/COMMIT/ROLLBACK to match the project's transactional-finalizer pattern (compare _v12_to_v13_finalize, _v17_to_v18_finalize, _v18_to_v19_finalize). Pre-fix a process crash mid-loop left the schema_version unchanged but partially-converted rows persisted across restart — idempotent overall but inconsistent with project convention. - _v23_to_v24_finalize: re.sub replacement now uses a function-form (lambda) instead of an f-string, so any future project_id with a backslash sequence isn't misinterpreted as a group reference. - tests: add a Keboola-source materialized row case asserting the SELECT's source_type filter prevents non-BQ rewrites.
This commit is contained in:
parent
fac10b29e4
commit
ce108d4c6d
2 changed files with 74 additions and 20 deletions
61
src/db.py
61
src/db.py
|
|
@ -1696,6 +1696,18 @@ _V22_TO_V23_MIGRATIONS = [
|
|||
# migration time, log a warning per affected row and leave them — the
|
||||
# operator must configure data_source.bigquery.project, restart, and
|
||||
# the migration will fire on next start (idempotent).
|
||||
def _replace_for_v24(project_id: str):
|
||||
"""Build a re.sub replacement function (not a string) so backslash
|
||||
sequences in `project_id` aren't interpreted as group references.
|
||||
GCP project IDs can't actually contain backslashes, but using a
|
||||
function-form replacement is the defensive idiom — it makes the
|
||||
intent explicit and removes the dependency on re.sub's replacement-
|
||||
string escaping rules."""
|
||||
def _repl(m):
|
||||
return f"`{project_id}.{m.group(1)}.{m.group(2)}`"
|
||||
return _repl
|
||||
|
||||
|
||||
def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
|
||||
import re as _re
|
||||
|
||||
|
|
@ -1714,26 +1726,35 @@ def _v23_to_v24_finalize(conn: duckdb.DuckDBPyConnection) -> None:
|
|||
"AND source_type = 'bigquery'"
|
||||
).fetchall()
|
||||
|
||||
for row_id, sq in rows:
|
||||
if sq is None:
|
||||
continue
|
||||
if not project_id:
|
||||
logger.warning(
|
||||
"v24 migration: skipping rewrite of source_query for row %r — "
|
||||
"data_source.bigquery.project is not configured. Set it via "
|
||||
"/admin/server-config and restart the app to retry the "
|
||||
"migration.", row_id,
|
||||
)
|
||||
continue
|
||||
new_sq = pattern.sub(rf'`{project_id}.\1.\2`', sq)
|
||||
if new_sq != sq:
|
||||
conn.execute(
|
||||
"UPDATE table_registry SET source_query = ? WHERE id = ?",
|
||||
[new_sq, row_id],
|
||||
)
|
||||
logger.info(
|
||||
"v24 migration: rewrote source_query for row %r", row_id,
|
||||
)
|
||||
if not rows:
|
||||
return # Nothing to migrate; skip the transaction.
|
||||
|
||||
conn.execute("BEGIN TRANSACTION")
|
||||
try:
|
||||
for row_id, sq in rows:
|
||||
if sq is None:
|
||||
continue
|
||||
if not project_id:
|
||||
logger.warning(
|
||||
"v24 migration: skipping rewrite of source_query for row %r — "
|
||||
"data_source.bigquery.project is not configured. Set it via "
|
||||
"/admin/server-config and restart the app to retry the "
|
||||
"migration.", row_id,
|
||||
)
|
||||
continue
|
||||
new_sq = pattern.sub(_replace_for_v24(project_id), sq)
|
||||
if new_sq != sq:
|
||||
conn.execute(
|
||||
"UPDATE table_registry SET source_query = ? WHERE id = ?",
|
||||
[new_sq, row_id],
|
||||
)
|
||||
logger.info(
|
||||
"v24 migration: rewrote source_query for row %r", row_id,
|
||||
)
|
||||
conn.execute("COMMIT")
|
||||
except Exception:
|
||||
conn.execute("ROLLBACK")
|
||||
raise
|
||||
|
||||
|
||||
def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
|
||||
|
|
|
|||
|
|
@ -124,3 +124,36 @@ def test_v24_logs_warning_when_project_not_configured(monkeypatch, caplog):
|
|||
)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_v24_keboola_materialized_row_not_rewritten(monkeypatch):
|
||||
"""Materialized rows with source_type != 'bigquery' must not be touched
|
||||
by v24. Keboola materialized has no notion of bq."ds"."tbl" syntax;
|
||||
the SELECT's source_type filter pins this contract.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
monkeypatch.setenv("DATA_DIR", tmp)
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.get_value",
|
||||
lambda *args, **kw: "prj-data" if args == ("data_source", "bigquery", "project") else kw.get("default"),
|
||||
)
|
||||
Path(tmp, "state").mkdir(parents=True, exist_ok=True)
|
||||
db_path = Path(tmp, "state", "system.duckdb")
|
||||
conn = duckdb.connect(str(db_path))
|
||||
try:
|
||||
_seed_v23(conn)
|
||||
# Keboola row that happens to contain `bq."..."` in its SQL
|
||||
# (admin error or copy-paste from a BQ row). Migration must
|
||||
# leave it alone — this is not the v24 contract.
|
||||
conn.execute(
|
||||
'INSERT INTO table_registry VALUES (?, ?, ?, ?, ?, ?, ?)',
|
||||
["kb1", "kb1", "keboola", "materialized", "ds", "tbl",
|
||||
'SELECT * FROM bq."ds"."tbl"'],
|
||||
)
|
||||
_ensure_schema(conn)
|
||||
row = conn.execute(
|
||||
"SELECT source_query FROM table_registry WHERE id='kb1'"
|
||||
).fetchone()
|
||||
assert row[0] == 'SELECT * FROM bq."ds"."tbl"'
|
||||
finally:
|
||||
conn.close()
|
||||
|
|
|
|||
Loading…
Reference in a new issue