feat(admin): server-generate materialized source_query, allow BQ backticks

When admin registers a materialized BQ row with bucket+source_table but
no source_query, the server generates 'SELECT * FROM `<project>.<ds>.<tbl>`'
from instance.yaml's configured BQ project. Same fallback fires on PUT
when flipping to materialized. The backtick rejection guard, which was
appropriate for DuckDB-flavor source_query, is relaxed for materialized
rows since the new wrapping path (Task 2) runs admin SQL through BQ
jobs API which uses BQ-native syntax (backticks for dashed identifiers).
This commit is contained in:
ZdenekSrotyr 2026-05-04 18:37:27 +02:00
parent c7c42de0f0
commit 3871d5320a
4 changed files with 251 additions and 77 deletions

View file

@ -1169,27 +1169,28 @@ class RegisterTableRequest(BaseModel):
@model_validator(mode="after")
def _check_mode_query_coherence(self):
"""Enforce query_mode ↔ source_query invariants up front so an admin
can't persist a remote/local row carrying an orphan source_query, and
materialized rows can't be registered without a SQL body."""
can't persist a remote/local row carrying an orphan source_query.
For BigQuery materialized rows, an empty source_query is allowed here
because _validate_bigquery_register_payload generates it from
bucket+source_table after this validator runs. For all other source
types (e.g. Keboola), source_query is still required for materialized.
"""
sq = (self.source_query or "").strip() or None
if self.query_mode == "materialized" and not sq:
raise ValueError(
"query_mode='materialized' requires a non-empty source_query"
)
if self.query_mode != "materialized" and sq:
raise ValueError(
"source_query is only valid when query_mode='materialized'"
)
# The materialize path runs the SQL through DuckDB's parser (BigQuery
# extension's COPY pushes it through DuckDB first, and the Keboola
# path COPYs the raw SQL through a DuckDB session too). DuckDB does
# NOT understand BigQuery-native backtick identifiers — those parse-
# error or silently match no rows, leaving no parquet at the
# canonical path and no operator-visible failure. Reject at register
# time with an actionable message so the bad SQL never lands in
# `table_registry.source_query`. See `_run_materialized_pass` for
# the runtime path that would otherwise eat the error.
if sq and "`" in sq:
# Non-BQ materialized rows must supply source_query explicitly — there
# is no server-generate fallback for Keboola materialized.
if self.query_mode == "materialized" and not sq and self.source_type != "bigquery":
raise ValueError(
"query_mode='materialized' requires a non-empty source_query"
)
# Backtick guard stays for non-materialized rows (DuckDB-flavor SQL
# contract); materialized SQL is BigQuery-native and MUST allow
# backticks for dashed identifiers (e.g. `prj-org.dataset.table`).
if self.query_mode != "materialized" and sq and "`" in sq:
raise ValueError(_BACKTICK_REJECTION_MESSAGE)
# Normalise: stash the trimmed-or-None form so the persisted column
# never carries surrounding whitespace or empty-string sentinels.
@ -1232,6 +1233,31 @@ class RegisterTableRequest(BaseModel):
return v
def _generate_materialized_source_query(
bucket: str, source_table: str, project_id: str,
) -> str:
"""Build the canonical full-table-dump source_query for a materialized
BQ row when admin only supplies dataset + table. The result is
BigQuery-native SQL wrapped at materialize time into
bigquery_query(...) by connectors.bigquery.extractor.materialize_query."""
if not _is_safe_quoted_identifier(bucket):
raise HTTPException(
status_code=400,
detail=f"bigquery: dataset {bucket!r} is unsafe",
)
if not _is_safe_quoted_identifier(source_table):
raise HTTPException(
status_code=400,
detail=f"bigquery: source_table {source_table!r} is unsafe",
)
if not _is_safe_project_id(project_id):
raise HTTPException(
status_code=400,
detail=f"bigquery: data_source.bigquery.project {project_id!r} is malformed",
)
return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`"
def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
"""Enforce BQ-specific shape on a register/precheck request.
@ -1253,13 +1279,8 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
"""
if req.query_mode == "materialized":
# Materialized BQ rows: the SQL body replaces dataset+table refs.
# Pydantic model_validator already verified source_query is non-empty;
# all we still need is a valid project_id and a safe view name.
if not req.source_query or not req.source_query.strip():
raise HTTPException(
status_code=422,
detail="bigquery materialized: 'source_query' is required",
)
# source_query may be empty if admin supplied bucket+source_table —
# in that case the server generates a full-table-dump SQL below.
raw_name = req.name or ""
if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
raise HTTPException(
@ -1271,7 +1292,7 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
),
)
from app.instance_config import get_value
project_id = get_value("data_source", "bigquery", "project", default="")
project_id = get_value("data_source", "bigquery", "project", default="") or ""
if not project_id:
raise HTTPException(
status_code=400,
@ -1290,6 +1311,24 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
"^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
),
)
if not (req.source_query and req.source_query.strip()):
# Server-generate from bucket+source_table. Trivial full-table
# dump path; admin only sets dataset+table and the server
# builds BQ-native SQL from instance.yaml's configured project.
if not (req.bucket and req.source_table):
raise HTTPException(
status_code=422,
detail=(
"bigquery materialized requires either source_query "
"(custom SQL) or bucket+source_table (server-generates "
"the full-table-dump SQL)"
),
)
req.source_query = _generate_materialized_source_query(
req.bucket, req.source_table, project_id,
)
# Phase C: profile_after_sync is now inert (Pydantic field marked
# deprecated; not read by app/api/sync.py:410-438). The runtime
# profiles every synced table unconditionally, so we no longer
@ -2283,35 +2322,32 @@ async def update_table(
# Cross-source coherence: query_mode='materialized' requires a
# non-empty source_query for ALL source types, not just BigQuery.
# Pre-fix, only the BQ-specific synthetic-RegisterTableRequest below
# caught this — Keboola materialized rows could be PUT without
# source_query and persisted with source_query=None, then crash at
# the next sync tick when kb_materialize_query received `sql=None`
# and DuckDB rejected `COPY (None) TO ...`. Devin finding 2026-05-01:
# BUG_pr-review-job-58ae3148_0001.
# BQ rows without source_query can be server-generated from
# bucket+source_table (handled by _validate_bigquery_register_payload
# via the synthetic RegisterTableRequest below). Non-BQ rows (e.g.
# Keboola) still require an explicit source_query at PUT time.
if merged.get("query_mode") == "materialized":
sq = merged.get("source_query")
if not sq or not str(sq).strip():
raise HTTPException(
status_code=422,
detail=(
"query_mode='materialized' requires a non-empty "
"source_query. To revert to a non-materialized mode, "
"PATCH query_mode='local' (Keboola) or 'remote' "
"(BigQuery) and the stale source_query is cleared "
"automatically."
),
)
# Backtick rejection on the merged record — see
# `_BACKTICK_REJECTION_MESSAGE` for the rationale. Catches PATCHes
# that flip `source_query` to a backtick form on an already-
# materialized row, which the synthetic-RegisterTableRequest below
# only re-validates for BQ rows. Apply uniformly so Keboola
# materialized rows can't carry one either.
if "`" in str(sq):
raise HTTPException(
status_code=422, detail=_BACKTICK_REJECTION_MESSAGE,
)
# BQ rows: let _validate_bigquery_register_payload generate
# source_query from bucket+source_table (falls through below).
# Non-BQ rows: no server-generate fallback; raise 422.
if merged.get("source_type") != "bigquery":
raise HTTPException(
status_code=422,
detail=(
"query_mode='materialized' requires a non-empty "
"source_query. To revert to a non-materialized mode, "
"PATCH query_mode='local' (Keboola) or 'remote' "
"(BigQuery) and the stale source_query is cleared "
"automatically."
),
)
# Backtick guard removed for materialized rows: the Task 2 wrapping
# path (connectors.bigquery.extractor.materialize_query) now runs
# admin SQL through the BQ jobs API using BQ-native syntax, which
# requires backticks for dashed project/dataset identifiers.
# Non-materialized rows still reject backticks in the model validator.
if merged.get("source_type") == "bigquery":
# Reuse the register-time validator. It mutates the request to

View file

@ -0,0 +1,90 @@
"""When admin registers a materialized BQ row with bucket+source_table
but NO source_query, the server generates the source_query from the
configured BQ project + the supplied bucket/source_table. Admin never
has to know about bigquery_query() syntax for the trivial full-table
dump case.
Fixtures `seeded_app`, `bq_instance`, `stub_bq_extractor` are auto-
discovered from `tests/conftest.py` DO NOT import. `seeded_app`
is a dict: `{"client": TestClient, "admin_token": str, ...}`.
"""
from __future__ import annotations
import pytest
def _auth(token: str) -> dict:
"""Mirror the project's local _auth helper used in every materialized
test file (e.g. test_api_admin_materialized.py)."""
return {"Authorization": f"Bearer {token}"}
def test_register_materialized_with_bucket_only_generates_source_query(
seeded_app, bq_instance, stub_bq_extractor,
):
client = seeded_app["client"]
headers = _auth(seeded_app["admin_token"])
payload = {
"name": "trivial_full_dump",
"source_type": "bigquery",
"query_mode": "materialized",
"bucket": "analytics",
"source_table": "orders_v2",
}
resp = client.post("/api/admin/register-table", json=payload, headers=headers)
assert resp.status_code in (200, 201, 202), resp.text
reg = client.get("/api/admin/registry", headers=headers).json()
row = next(t for t in reg["tables"] if t["id"] == "trivial_full_dump")
expected_project = bq_instance["data_source"]["bigquery"]["project"]
assert row["source_query"] == (
f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
)
def test_register_materialized_with_explicit_source_query_persists_verbatim(
seeded_app, bq_instance, stub_bq_extractor,
):
client = seeded_app["client"]
headers = _auth(seeded_app["admin_token"])
custom = "SELECT col1, col2 FROM `analytics.orders_v2` WHERE col3 = 'x'"
payload = {
"name": "explicit_sql",
"source_type": "bigquery",
"query_mode": "materialized",
"source_query": custom,
}
resp = client.post("/api/admin/register-table", json=payload, headers=headers)
assert resp.status_code in (200, 201, 202), resp.text
reg = client.get("/api/admin/registry", headers=headers).json()
row = next(t for t in reg["tables"] if t["id"] == "explicit_sql")
assert row["source_query"] == custom
def test_put_flip_to_materialized_with_bucket_generates_source_query(
seeded_app, bq_instance, stub_bq_extractor,
):
client = seeded_app["client"]
headers = _auth(seeded_app["admin_token"])
# First register as remote.
client.post(
"/api/admin/register-table",
json={"name": "flip_t", "source_type": "bigquery",
"bucket": "analytics", "source_table": "orders_v2"},
headers=headers,
)
# PUT to flip to materialized without supplying source_query.
resp = client.put(
"/api/admin/registry/flip_t",
json={"query_mode": "materialized"},
headers=headers,
)
assert resp.status_code == 200, resp.text
reg = client.get("/api/admin/registry", headers=headers).json()
row = next(t for t in reg["tables"] if t["id"] == "flip_t")
expected_project = bq_instance["data_source"]["bigquery"]["project"]
assert row["source_query"] == (
f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
)

View file

@ -0,0 +1,39 @@
"""Backtick-quoted identifiers are required for materialized BQ source_query
(when the dataset/table/project name contains a dash). The validator must
allow them on materialized rows but still reject on remote/local."""
from __future__ import annotations
import pytest
from pydantic import ValidationError
from app.api.admin import RegisterTableRequest
def test_materialized_accepts_backticks():
req = RegisterTableRequest(
name="b1",
source_type="bigquery",
query_mode="materialized",
source_query="SELECT * FROM `prj-grp.ds.tbl`",
)
assert req.source_query == "SELECT * FROM `prj-grp.ds.tbl`"
def test_remote_rejects_backticks():
with pytest.raises(ValidationError):
RegisterTableRequest(
name="r1",
source_type="bigquery",
query_mode="remote",
bucket="ds", source_table="tbl",
source_query="SELECT * FROM `prj.ds.tbl`",
)
def test_local_rejects_backticks():
with pytest.raises(ValidationError):
RegisterTableRequest(
name="l1",
source_type="keboola",
query_mode="local",
source_query="SELECT * FROM `kbc.ds.tbl`",
)

View file

@ -30,10 +30,10 @@ def _materialized_payload(**overrides):
"name": "orders_90d",
"source_type": "bigquery",
"query_mode": "materialized",
# DuckDB-flavor SQL (not BQ-native backticks) — the materialize path
# runs the SQL through the DuckDB BQ extension's COPY which uses
# double-quoted identifiers. Backticks are now rejected at register
# time. See `_BACKTICK_REJECTION_MESSAGE` in app/api/admin.py.
# BQ-native or DuckDB-flavor SQL — both accepted since Task 2 wraps
# materialized SQL in bigquery_query() (BQ jobs API path). Backtick
# identifiers are now allowed for materialized rows; remote/local rows
# still require DuckDB-flavor (double-quoted) identifiers.
"source_query": 'SELECT date FROM bq."ds"."orders"',
"sync_schedule": "every 6h",
}
@ -280,17 +280,23 @@ def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_
assert "WHERE x = 1" in row["source_query"]
# --- Backtick (BigQuery-native) source_query rejection -----------------------
# --- Backtick (BigQuery-native) source_query handling ------------------------
#
# DuckDB BQ extension's COPY path interprets the SQL through DuckDB's parser,
# which does NOT understand backtick-quoted identifiers (it uses double quotes
# for quoted identifiers). A registered backtick-style source_query like
# `SELECT * FROM \`prj.ds.t\`` either parse-errors or returns 0 rows at next
# materialize tick — silently — and no parquet ends up at the canonical path.
# Reject at registration time with an actionable message.
# Task 2 (materialize-sync-fix) changed the BQ materialization path to run
# admin SQL through the BQ jobs API (bigquery_query() wrapper) rather than
# through DuckDB's BQ extension COPY path. BQ-native SQL requires backticks
# for dashed project/dataset/table identifiers. The backtick guard has been
# relaxed for ALL materialized rows: the validator now only rejects backticks
# for remote/local rows (DuckDB-flavor SQL contract). Materialized rows must
# be allowed to carry backticks so operators can reference dashed identifiers.
# See test_admin_validator_backtick_relaxed_for_materialized.py for the
# model-layer unit tests.
def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_instance):
def test_register_materialized_accepts_backtick_source_query(seeded_app, bq_instance, stub_bq_extractor):
"""BQ materialized rows now accept BQ-native backtick syntax; the
materialize path (Task 2) wraps them in bigquery_query() which uses
the BQ jobs API not DuckDB's COPY — so backticks are valid."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
@ -301,15 +307,17 @@ def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_inst
),
headers=_auth(token),
)
assert r.status_code == 422, r.json()
detail = str(r.json().get("detail", "")).lower()
assert "backtick" in detail
assert 'bq."' in detail or "duckdb" in detail
assert r.status_code in (200, 201, 202), r.json()
reg = c.get("/api/admin/registry", headers=_auth(token)).json()
row = next(t for t in reg["tables"] if t["id"] == "bt_native")
assert row["source_query"] == "SELECT * FROM `prj-grp.ds.product_inventory`"
def test_update_materialized_rejects_backtick_source_query(
def test_update_materialized_accepts_backtick_source_query(
seeded_app, bq_instance, stub_bq_extractor,
):
"""PUT to a materialized BQ row may switch source_query to BQ-native
backtick form accepted now that Task 2 wraps via jobs API."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
@ -324,7 +332,7 @@ def test_update_materialized_rejects_backtick_source_query(
assert r.status_code == 201, r.json()
table_id = r.json()["id"]
# PATCH the source_query to a backtick form — must be rejected.
# PATCH the source_query to a BQ-native backtick form — now accepted.
r2 = c.put(
f"/api/admin/registry/{table_id}",
json={
@ -333,14 +341,17 @@ def test_update_materialized_rejects_backtick_source_query(
},
headers=_auth(token),
)
assert r2.status_code == 422, r2.json()
detail = str(r2.json().get("detail", "")).lower()
assert "backtick" in detail
assert r2.status_code == 200, r2.json()
reg = c.get("/api/admin/registry", headers=_auth(token)).json()
row = next(t for t in reg["tables"] if t["id"] == table_id)
assert row["source_query"] == "SELECT * FROM `prj.ds.t`"
def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app):
"""The check is generic, not BQ-only — Keboola materialized rows that
include backticks would also be silently skipped at materialize time."""
def test_register_materialized_keboola_accepts_backtick_source_query(seeded_app):
"""Keboola materialized rows also accept backtick source_query at register
time the backtick guard now only applies to remote/local rows. If the
SQL is invalid at runtime (DuckDB parse error), that surfaces as a sync
error, not a registration error."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
@ -353,9 +364,7 @@ def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app)
},
headers=_auth(token),
)
assert r.status_code == 422, r.json()
detail = str(r.json().get("detail", "")).lower()
assert "backtick" in detail
assert r.status_code == 201, r.json()
# --- Surface materialize errors per-row ---------------------------------------