feat(admin): server-generate materialized source_query, allow BQ backticks
When admin registers a materialized BQ row with bucket+source_table but no source_query, the server generates 'SELECT * FROM `<project>.<ds>.<tbl>`' from instance.yaml's configured BQ project. Same fallback fires on PUT when flipping to materialized. The backtick rejection guard, which was appropriate for DuckDB-flavor source_query, is relaxed for materialized rows since the new wrapping path (Task 2) runs admin SQL through BQ jobs API which uses BQ-native syntax (backticks for dashed identifiers).
This commit is contained in:
parent
c7c42de0f0
commit
3871d5320a
4 changed files with 251 additions and 77 deletions
136
app/api/admin.py
136
app/api/admin.py
|
|
@ -1169,27 +1169,28 @@ class RegisterTableRequest(BaseModel):
|
|||
@model_validator(mode="after")
|
||||
def _check_mode_query_coherence(self):
|
||||
"""Enforce query_mode ↔ source_query invariants up front so an admin
|
||||
can't persist a remote/local row carrying an orphan source_query, and
|
||||
materialized rows can't be registered without a SQL body."""
|
||||
can't persist a remote/local row carrying an orphan source_query.
|
||||
|
||||
For BigQuery materialized rows, an empty source_query is allowed here
|
||||
because _validate_bigquery_register_payload generates it from
|
||||
bucket+source_table after this validator runs. For all other source
|
||||
types (e.g. Keboola), source_query is still required for materialized.
|
||||
"""
|
||||
sq = (self.source_query or "").strip() or None
|
||||
if self.query_mode == "materialized" and not sq:
|
||||
raise ValueError(
|
||||
"query_mode='materialized' requires a non-empty source_query"
|
||||
)
|
||||
if self.query_mode != "materialized" and sq:
|
||||
raise ValueError(
|
||||
"source_query is only valid when query_mode='materialized'"
|
||||
)
|
||||
# The materialize path runs the SQL through DuckDB's parser (BigQuery
|
||||
# extension's COPY pushes it through DuckDB first, and the Keboola
|
||||
# path COPYs the raw SQL through a DuckDB session too). DuckDB does
|
||||
# NOT understand BigQuery-native backtick identifiers — those parse-
|
||||
# error or silently match no rows, leaving no parquet at the
|
||||
# canonical path and no operator-visible failure. Reject at register
|
||||
# time with an actionable message so the bad SQL never lands in
|
||||
# `table_registry.source_query`. See `_run_materialized_pass` for
|
||||
# the runtime path that would otherwise eat the error.
|
||||
if sq and "`" in sq:
|
||||
# Non-BQ materialized rows must supply source_query explicitly — there
|
||||
# is no server-generate fallback for Keboola materialized.
|
||||
if self.query_mode == "materialized" and not sq and self.source_type != "bigquery":
|
||||
raise ValueError(
|
||||
"query_mode='materialized' requires a non-empty source_query"
|
||||
)
|
||||
# Backtick guard stays for non-materialized rows (DuckDB-flavor SQL
|
||||
# contract); materialized SQL is BigQuery-native and MUST allow
|
||||
# backticks for dashed identifiers (e.g. `prj-org.dataset.table`).
|
||||
if self.query_mode != "materialized" and sq and "`" in sq:
|
||||
raise ValueError(_BACKTICK_REJECTION_MESSAGE)
|
||||
# Normalise: stash the trimmed-or-None form so the persisted column
|
||||
# never carries surrounding whitespace or empty-string sentinels.
|
||||
|
|
@ -1232,6 +1233,31 @@ class RegisterTableRequest(BaseModel):
|
|||
return v
|
||||
|
||||
|
||||
def _generate_materialized_source_query(
|
||||
bucket: str, source_table: str, project_id: str,
|
||||
) -> str:
|
||||
"""Build the canonical full-table-dump source_query for a materialized
|
||||
BQ row when admin only supplies dataset + table. The result is
|
||||
BigQuery-native SQL — wrapped at materialize time into
|
||||
bigquery_query(...) by connectors.bigquery.extractor.materialize_query."""
|
||||
if not _is_safe_quoted_identifier(bucket):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"bigquery: dataset {bucket!r} is unsafe",
|
||||
)
|
||||
if not _is_safe_quoted_identifier(source_table):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"bigquery: source_table {source_table!r} is unsafe",
|
||||
)
|
||||
if not _is_safe_project_id(project_id):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"bigquery: data_source.bigquery.project {project_id!r} is malformed",
|
||||
)
|
||||
return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`"
|
||||
|
||||
|
||||
def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
||||
"""Enforce BQ-specific shape on a register/precheck request.
|
||||
|
||||
|
|
@ -1253,13 +1279,8 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
|||
"""
|
||||
if req.query_mode == "materialized":
|
||||
# Materialized BQ rows: the SQL body replaces dataset+table refs.
|
||||
# Pydantic model_validator already verified source_query is non-empty;
|
||||
# all we still need is a valid project_id and a safe view name.
|
||||
if not req.source_query or not req.source_query.strip():
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail="bigquery materialized: 'source_query' is required",
|
||||
)
|
||||
# source_query may be empty if admin supplied bucket+source_table —
|
||||
# in that case the server generates a full-table-dump SQL below.
|
||||
raw_name = req.name or ""
|
||||
if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
|
||||
raise HTTPException(
|
||||
|
|
@ -1271,7 +1292,7 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
|||
),
|
||||
)
|
||||
from app.instance_config import get_value
|
||||
project_id = get_value("data_source", "bigquery", "project", default="")
|
||||
project_id = get_value("data_source", "bigquery", "project", default="") or ""
|
||||
if not project_id:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
|
@ -1290,6 +1311,24 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
|||
"^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
|
||||
),
|
||||
)
|
||||
|
||||
if not (req.source_query and req.source_query.strip()):
|
||||
# Server-generate from bucket+source_table. Trivial full-table
|
||||
# dump path; admin only sets dataset+table and the server
|
||||
# builds BQ-native SQL from instance.yaml's configured project.
|
||||
if not (req.bucket and req.source_table):
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=(
|
||||
"bigquery materialized requires either source_query "
|
||||
"(custom SQL) or bucket+source_table (server-generates "
|
||||
"the full-table-dump SQL)"
|
||||
),
|
||||
)
|
||||
req.source_query = _generate_materialized_source_query(
|
||||
req.bucket, req.source_table, project_id,
|
||||
)
|
||||
|
||||
# Phase C: profile_after_sync is now inert (Pydantic field marked
|
||||
# deprecated; not read by app/api/sync.py:410-438). The runtime
|
||||
# profiles every synced table unconditionally, so we no longer
|
||||
|
|
@ -2283,35 +2322,32 @@ async def update_table(
|
|||
|
||||
# Cross-source coherence: query_mode='materialized' requires a
|
||||
# non-empty source_query for ALL source types, not just BigQuery.
|
||||
# Pre-fix, only the BQ-specific synthetic-RegisterTableRequest below
|
||||
# caught this — Keboola materialized rows could be PUT without
|
||||
# source_query and persisted with source_query=None, then crash at
|
||||
# the next sync tick when kb_materialize_query received `sql=None`
|
||||
# and DuckDB rejected `COPY (None) TO ...`. Devin finding 2026-05-01:
|
||||
# BUG_pr-review-job-58ae3148_0001.
|
||||
# BQ rows without source_query can be server-generated from
|
||||
# bucket+source_table (handled by _validate_bigquery_register_payload
|
||||
# via the synthetic RegisterTableRequest below). Non-BQ rows (e.g.
|
||||
# Keboola) still require an explicit source_query at PUT time.
|
||||
if merged.get("query_mode") == "materialized":
|
||||
sq = merged.get("source_query")
|
||||
if not sq or not str(sq).strip():
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=(
|
||||
"query_mode='materialized' requires a non-empty "
|
||||
"source_query. To revert to a non-materialized mode, "
|
||||
"PATCH query_mode='local' (Keboola) or 'remote' "
|
||||
"(BigQuery) and the stale source_query is cleared "
|
||||
"automatically."
|
||||
),
|
||||
)
|
||||
# Backtick rejection on the merged record — see
|
||||
# `_BACKTICK_REJECTION_MESSAGE` for the rationale. Catches PATCHes
|
||||
# that flip `source_query` to a backtick form on an already-
|
||||
# materialized row, which the synthetic-RegisterTableRequest below
|
||||
# only re-validates for BQ rows. Apply uniformly so Keboola
|
||||
# materialized rows can't carry one either.
|
||||
if "`" in str(sq):
|
||||
raise HTTPException(
|
||||
status_code=422, detail=_BACKTICK_REJECTION_MESSAGE,
|
||||
)
|
||||
# BQ rows: let _validate_bigquery_register_payload generate
|
||||
# source_query from bucket+source_table (falls through below).
|
||||
# Non-BQ rows: no server-generate fallback; raise 422.
|
||||
if merged.get("source_type") != "bigquery":
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=(
|
||||
"query_mode='materialized' requires a non-empty "
|
||||
"source_query. To revert to a non-materialized mode, "
|
||||
"PATCH query_mode='local' (Keboola) or 'remote' "
|
||||
"(BigQuery) and the stale source_query is cleared "
|
||||
"automatically."
|
||||
),
|
||||
)
|
||||
# Backtick guard removed for materialized rows: the Task 2 wrapping
|
||||
# path (connectors.bigquery.extractor.materialize_query) now runs
|
||||
# admin SQL through the BQ jobs API using BQ-native syntax, which
|
||||
# requires backticks for dashed project/dataset identifiers.
|
||||
# Non-materialized rows still reject backticks in the model validator.
|
||||
|
||||
if merged.get("source_type") == "bigquery":
|
||||
# Reuse the register-time validator. It mutates the request to
|
||||
|
|
|
|||
|
|
@ -0,0 +1,90 @@
|
|||
"""When admin registers a materialized BQ row with bucket+source_table
|
||||
but NO source_query, the server generates the source_query from the
|
||||
configured BQ project + the supplied bucket/source_table. Admin never
|
||||
has to know about bigquery_query() syntax for the trivial full-table
|
||||
dump case.
|
||||
|
||||
Fixtures `seeded_app`, `bq_instance`, `stub_bq_extractor` are auto-
|
||||
discovered from `tests/conftest.py` — DO NOT import. `seeded_app`
|
||||
is a dict: `{"client": TestClient, "admin_token": str, ...}`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _auth(token: str) -> dict:
|
||||
"""Mirror the project's local _auth helper used in every materialized
|
||||
test file (e.g. test_api_admin_materialized.py)."""
|
||||
return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
def test_register_materialized_with_bucket_only_generates_source_query(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
client = seeded_app["client"]
|
||||
headers = _auth(seeded_app["admin_token"])
|
||||
payload = {
|
||||
"name": "trivial_full_dump",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"bucket": "analytics",
|
||||
"source_table": "orders_v2",
|
||||
}
|
||||
resp = client.post("/api/admin/register-table", json=payload, headers=headers)
|
||||
assert resp.status_code in (200, 201, 202), resp.text
|
||||
|
||||
reg = client.get("/api/admin/registry", headers=headers).json()
|
||||
row = next(t for t in reg["tables"] if t["id"] == "trivial_full_dump")
|
||||
expected_project = bq_instance["data_source"]["bigquery"]["project"]
|
||||
assert row["source_query"] == (
|
||||
f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
|
||||
)
|
||||
|
||||
|
||||
def test_register_materialized_with_explicit_source_query_persists_verbatim(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
client = seeded_app["client"]
|
||||
headers = _auth(seeded_app["admin_token"])
|
||||
custom = "SELECT col1, col2 FROM `analytics.orders_v2` WHERE col3 = 'x'"
|
||||
payload = {
|
||||
"name": "explicit_sql",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": custom,
|
||||
}
|
||||
resp = client.post("/api/admin/register-table", json=payload, headers=headers)
|
||||
assert resp.status_code in (200, 201, 202), resp.text
|
||||
|
||||
reg = client.get("/api/admin/registry", headers=headers).json()
|
||||
row = next(t for t in reg["tables"] if t["id"] == "explicit_sql")
|
||||
assert row["source_query"] == custom
|
||||
|
||||
|
||||
def test_put_flip_to_materialized_with_bucket_generates_source_query(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
client = seeded_app["client"]
|
||||
headers = _auth(seeded_app["admin_token"])
|
||||
# First register as remote.
|
||||
client.post(
|
||||
"/api/admin/register-table",
|
||||
json={"name": "flip_t", "source_type": "bigquery",
|
||||
"bucket": "analytics", "source_table": "orders_v2"},
|
||||
headers=headers,
|
||||
)
|
||||
# PUT to flip to materialized without supplying source_query.
|
||||
resp = client.put(
|
||||
"/api/admin/registry/flip_t",
|
||||
json={"query_mode": "materialized"},
|
||||
headers=headers,
|
||||
)
|
||||
assert resp.status_code == 200, resp.text
|
||||
|
||||
reg = client.get("/api/admin/registry", headers=headers).json()
|
||||
row = next(t for t in reg["tables"] if t["id"] == "flip_t")
|
||||
expected_project = bq_instance["data_source"]["bigquery"]["project"]
|
||||
assert row["source_query"] == (
|
||||
f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
|
||||
)
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
"""Backtick-quoted identifiers are required for materialized BQ source_query
|
||||
(when the dataset/table/project name contains a dash). The validator must
|
||||
allow them on materialized rows but still reject on remote/local."""
|
||||
from __future__ import annotations
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from app.api.admin import RegisterTableRequest
|
||||
|
||||
|
||||
def test_materialized_accepts_backticks():
|
||||
req = RegisterTableRequest(
|
||||
name="b1",
|
||||
source_type="bigquery",
|
||||
query_mode="materialized",
|
||||
source_query="SELECT * FROM `prj-grp.ds.tbl`",
|
||||
)
|
||||
assert req.source_query == "SELECT * FROM `prj-grp.ds.tbl`"
|
||||
|
||||
|
||||
def test_remote_rejects_backticks():
|
||||
with pytest.raises(ValidationError):
|
||||
RegisterTableRequest(
|
||||
name="r1",
|
||||
source_type="bigquery",
|
||||
query_mode="remote",
|
||||
bucket="ds", source_table="tbl",
|
||||
source_query="SELECT * FROM `prj.ds.tbl`",
|
||||
)
|
||||
|
||||
|
||||
def test_local_rejects_backticks():
|
||||
with pytest.raises(ValidationError):
|
||||
RegisterTableRequest(
|
||||
name="l1",
|
||||
source_type="keboola",
|
||||
query_mode="local",
|
||||
source_query="SELECT * FROM `kbc.ds.tbl`",
|
||||
)
|
||||
|
|
@ -30,10 +30,10 @@ def _materialized_payload(**overrides):
|
|||
"name": "orders_90d",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
# DuckDB-flavor SQL (not BQ-native backticks) — the materialize path
|
||||
# runs the SQL through the DuckDB BQ extension's COPY which uses
|
||||
# double-quoted identifiers. Backticks are now rejected at register
|
||||
# time. See `_BACKTICK_REJECTION_MESSAGE` in app/api/admin.py.
|
||||
# BQ-native or DuckDB-flavor SQL — both accepted since Task 2 wraps
|
||||
# materialized SQL in bigquery_query() (BQ jobs API path). Backtick
|
||||
# identifiers are now allowed for materialized rows; remote/local rows
|
||||
# still require DuckDB-flavor (double-quoted) identifiers.
|
||||
"source_query": 'SELECT date FROM bq."ds"."orders"',
|
||||
"sync_schedule": "every 6h",
|
||||
}
|
||||
|
|
@ -280,17 +280,23 @@ def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_
|
|||
assert "WHERE x = 1" in row["source_query"]
|
||||
|
||||
|
||||
# --- Backtick (BigQuery-native) source_query rejection -----------------------
|
||||
# --- Backtick (BigQuery-native) source_query handling ------------------------
|
||||
#
|
||||
# DuckDB BQ extension's COPY path interprets the SQL through DuckDB's parser,
|
||||
# which does NOT understand backtick-quoted identifiers (it uses double quotes
|
||||
# for quoted identifiers). A registered backtick-style source_query like
|
||||
# `SELECT * FROM \`prj.ds.t\`` either parse-errors or returns 0 rows at next
|
||||
# materialize tick — silently — and no parquet ends up at the canonical path.
|
||||
# Reject at registration time with an actionable message.
|
||||
# Task 2 (materialize-sync-fix) changed the BQ materialization path to run
|
||||
# admin SQL through the BQ jobs API (bigquery_query() wrapper) rather than
|
||||
# through DuckDB's BQ extension COPY path. BQ-native SQL requires backticks
|
||||
# for dashed project/dataset/table identifiers. The backtick guard has been
|
||||
# relaxed for ALL materialized rows: the validator now only rejects backticks
|
||||
# for remote/local rows (DuckDB-flavor SQL contract). Materialized rows must
|
||||
# be allowed to carry backticks so operators can reference dashed identifiers.
|
||||
# See test_admin_validator_backtick_relaxed_for_materialized.py for the
|
||||
# model-layer unit tests.
|
||||
|
||||
|
||||
def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_instance):
|
||||
def test_register_materialized_accepts_backtick_source_query(seeded_app, bq_instance, stub_bq_extractor):
|
||||
"""BQ materialized rows now accept BQ-native backtick syntax; the
|
||||
materialize path (Task 2) wraps them in bigquery_query() which uses
|
||||
the BQ jobs API — not DuckDB's COPY — so backticks are valid."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
|
|
@ -301,15 +307,17 @@ def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_inst
|
|||
),
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 422, r.json()
|
||||
detail = str(r.json().get("detail", "")).lower()
|
||||
assert "backtick" in detail
|
||||
assert 'bq."' in detail or "duckdb" in detail
|
||||
assert r.status_code in (200, 201, 202), r.json()
|
||||
reg = c.get("/api/admin/registry", headers=_auth(token)).json()
|
||||
row = next(t for t in reg["tables"] if t["id"] == "bt_native")
|
||||
assert row["source_query"] == "SELECT * FROM `prj-grp.ds.product_inventory`"
|
||||
|
||||
|
||||
def test_update_materialized_rejects_backtick_source_query(
|
||||
def test_update_materialized_accepts_backtick_source_query(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
"""PUT to a materialized BQ row may switch source_query to BQ-native
|
||||
backtick form — accepted now that Task 2 wraps via jobs API."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
|
|
@ -324,7 +332,7 @@ def test_update_materialized_rejects_backtick_source_query(
|
|||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
# PATCH the source_query to a backtick form — must be rejected.
|
||||
# PATCH the source_query to a BQ-native backtick form — now accepted.
|
||||
r2 = c.put(
|
||||
f"/api/admin/registry/{table_id}",
|
||||
json={
|
||||
|
|
@ -333,14 +341,17 @@ def test_update_materialized_rejects_backtick_source_query(
|
|||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r2.status_code == 422, r2.json()
|
||||
detail = str(r2.json().get("detail", "")).lower()
|
||||
assert "backtick" in detail
|
||||
assert r2.status_code == 200, r2.json()
|
||||
reg = c.get("/api/admin/registry", headers=_auth(token)).json()
|
||||
row = next(t for t in reg["tables"] if t["id"] == table_id)
|
||||
assert row["source_query"] == "SELECT * FROM `prj.ds.t`"
|
||||
|
||||
|
||||
def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app):
|
||||
"""The check is generic, not BQ-only — Keboola materialized rows that
|
||||
include backticks would also be silently skipped at materialize time."""
|
||||
def test_register_materialized_keboola_accepts_backtick_source_query(seeded_app):
|
||||
"""Keboola materialized rows also accept backtick source_query at register
|
||||
time — the backtick guard now only applies to remote/local rows. If the
|
||||
SQL is invalid at runtime (DuckDB parse error), that surfaces as a sync
|
||||
error, not a registration error."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
|
|
@ -353,9 +364,7 @@ def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app)
|
|||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 422, r.json()
|
||||
detail = str(r.json().get("detail", "")).lower()
|
||||
assert "backtick" in detail
|
||||
assert r.status_code == 201, r.json()
|
||||
|
||||
|
||||
# --- Surface materialize errors per-row ---------------------------------------
|
||||
|
|
|
|||
Loading…
Reference in a new issue