diff --git a/app/api/admin.py b/app/api/admin.py index ba9a9b4..24cce49 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -1169,27 +1169,28 @@ class RegisterTableRequest(BaseModel): @model_validator(mode="after") def _check_mode_query_coherence(self): """Enforce query_mode ↔ source_query invariants up front so an admin - can't persist a remote/local row carrying an orphan source_query, and - materialized rows can't be registered without a SQL body.""" + can't persist a remote/local row carrying an orphan source_query. + + For BigQuery materialized rows, an empty source_query is allowed here + because _validate_bigquery_register_payload generates it from + bucket+source_table after this validator runs. For all other source + types (e.g. Keboola), source_query is still required for materialized. + """ sq = (self.source_query or "").strip() or None - if self.query_mode == "materialized" and not sq: - raise ValueError( - "query_mode='materialized' requires a non-empty source_query" - ) if self.query_mode != "materialized" and sq: raise ValueError( "source_query is only valid when query_mode='materialized'" ) - # The materialize path runs the SQL through DuckDB's parser (BigQuery - # extension's COPY pushes it through DuckDB first, and the Keboola - # path COPYs the raw SQL through a DuckDB session too). DuckDB does - # NOT understand BigQuery-native backtick identifiers — those parse- - # error or silently match no rows, leaving no parquet at the - # canonical path and no operator-visible failure. Reject at register - # time with an actionable message so the bad SQL never lands in - # `table_registry.source_query`. See `_run_materialized_pass` for - # the runtime path that would otherwise eat the error. - if sq and "`" in sq: + # Non-BQ materialized rows must supply source_query explicitly — there + # is no server-generate fallback for Keboola materialized. + if self.query_mode == "materialized" and not sq and self.source_type != "bigquery": + raise ValueError( + "query_mode='materialized' requires a non-empty source_query" + ) + # Backtick guard stays for non-materialized rows (DuckDB-flavor SQL + # contract); materialized SQL is BigQuery-native and MUST allow + # backticks for dashed identifiers (e.g. `prj-org.dataset.table`). + if self.query_mode != "materialized" and sq and "`" in sq: raise ValueError(_BACKTICK_REJECTION_MESSAGE) # Normalise: stash the trimmed-or-None form so the persisted column # never carries surrounding whitespace or empty-string sentinels. @@ -1232,6 +1233,31 @@ class RegisterTableRequest(BaseModel): return v +def _generate_materialized_source_query( + bucket: str, source_table: str, project_id: str, +) -> str: + """Build the canonical full-table-dump source_query for a materialized + BQ row when admin only supplies dataset + table. The result is + BigQuery-native SQL — wrapped at materialize time into + bigquery_query(...) by connectors.bigquery.extractor.materialize_query.""" + if not _is_safe_quoted_identifier(bucket): + raise HTTPException( + status_code=400, + detail=f"bigquery: dataset {bucket!r} is unsafe", + ) + if not _is_safe_quoted_identifier(source_table): + raise HTTPException( + status_code=400, + detail=f"bigquery: source_table {source_table!r} is unsafe", + ) + if not _is_safe_project_id(project_id): + raise HTTPException( + status_code=400, + detail=f"bigquery: data_source.bigquery.project {project_id!r} is malformed", + ) + return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`" + + def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: """Enforce BQ-specific shape on a register/precheck request. @@ -1253,13 +1279,8 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: """ if req.query_mode == "materialized": # Materialized BQ rows: the SQL body replaces dataset+table refs. - # Pydantic model_validator already verified source_query is non-empty; - # all we still need is a valid project_id and a safe view name. - if not req.source_query or not req.source_query.strip(): - raise HTTPException( - status_code=422, - detail="bigquery materialized: 'source_query' is required", - ) + # source_query may be empty if admin supplied bucket+source_table — + # in that case the server generates a full-table-dump SQL below. raw_name = req.name or "" if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name): raise HTTPException( @@ -1271,7 +1292,7 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: ), ) from app.instance_config import get_value - project_id = get_value("data_source", "bigquery", "project", default="") + project_id = get_value("data_source", "bigquery", "project", default="") or "" if not project_id: raise HTTPException( status_code=400, @@ -1290,6 +1311,24 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: "^[a-z][a-z0-9-]{4,28}[a-z0-9]$" ), ) + + if not (req.source_query and req.source_query.strip()): + # Server-generate from bucket+source_table. Trivial full-table + # dump path; admin only sets dataset+table and the server + # builds BQ-native SQL from instance.yaml's configured project. + if not (req.bucket and req.source_table): + raise HTTPException( + status_code=422, + detail=( + "bigquery materialized requires either source_query " + "(custom SQL) or bucket+source_table (server-generates " + "the full-table-dump SQL)" + ), + ) + req.source_query = _generate_materialized_source_query( + req.bucket, req.source_table, project_id, + ) + # Phase C: profile_after_sync is now inert (Pydantic field marked # deprecated; not read by app/api/sync.py:410-438). The runtime # profiles every synced table unconditionally, so we no longer @@ -2283,35 +2322,32 @@ async def update_table( # Cross-source coherence: query_mode='materialized' requires a # non-empty source_query for ALL source types, not just BigQuery. - # Pre-fix, only the BQ-specific synthetic-RegisterTableRequest below - # caught this — Keboola materialized rows could be PUT without - # source_query and persisted with source_query=None, then crash at - # the next sync tick when kb_materialize_query received `sql=None` - # and DuckDB rejected `COPY (None) TO ...`. Devin finding 2026-05-01: - # BUG_pr-review-job-58ae3148_0001. + # BQ rows without source_query can be server-generated from + # bucket+source_table (handled by _validate_bigquery_register_payload + # via the synthetic RegisterTableRequest below). Non-BQ rows (e.g. + # Keboola) still require an explicit source_query at PUT time. if merged.get("query_mode") == "materialized": sq = merged.get("source_query") if not sq or not str(sq).strip(): - raise HTTPException( - status_code=422, - detail=( - "query_mode='materialized' requires a non-empty " - "source_query. To revert to a non-materialized mode, " - "PATCH query_mode='local' (Keboola) or 'remote' " - "(BigQuery) and the stale source_query is cleared " - "automatically." - ), - ) - # Backtick rejection on the merged record — see - # `_BACKTICK_REJECTION_MESSAGE` for the rationale. Catches PATCHes - # that flip `source_query` to a backtick form on an already- - # materialized row, which the synthetic-RegisterTableRequest below - # only re-validates for BQ rows. Apply uniformly so Keboola - # materialized rows can't carry one either. - if "`" in str(sq): - raise HTTPException( - status_code=422, detail=_BACKTICK_REJECTION_MESSAGE, - ) + # BQ rows: let _validate_bigquery_register_payload generate + # source_query from bucket+source_table (falls through below). + # Non-BQ rows: no server-generate fallback; raise 422. + if merged.get("source_type") != "bigquery": + raise HTTPException( + status_code=422, + detail=( + "query_mode='materialized' requires a non-empty " + "source_query. To revert to a non-materialized mode, " + "PATCH query_mode='local' (Keboola) or 'remote' " + "(BigQuery) and the stale source_query is cleared " + "automatically." + ), + ) + # Backtick guard removed for materialized rows: the Task 2 wrapping + # path (connectors.bigquery.extractor.materialize_query) now runs + # admin SQL through the BQ jobs API using BQ-native syntax, which + # requires backticks for dashed project/dataset identifiers. + # Non-materialized rows still reject backticks in the model validator. if merged.get("source_type") == "bigquery": # Reuse the register-time validator. It mutates the request to diff --git a/tests/test_admin_register_materialized_server_generated_sql.py b/tests/test_admin_register_materialized_server_generated_sql.py new file mode 100644 index 0000000..bc128ac --- /dev/null +++ b/tests/test_admin_register_materialized_server_generated_sql.py @@ -0,0 +1,90 @@ +"""When admin registers a materialized BQ row with bucket+source_table +but NO source_query, the server generates the source_query from the +configured BQ project + the supplied bucket/source_table. Admin never +has to know about bigquery_query() syntax for the trivial full-table +dump case. + +Fixtures `seeded_app`, `bq_instance`, `stub_bq_extractor` are auto- +discovered from `tests/conftest.py` — DO NOT import. `seeded_app` +is a dict: `{"client": TestClient, "admin_token": str, ...}`. +""" +from __future__ import annotations + +import pytest + + +def _auth(token: str) -> dict: + """Mirror the project's local _auth helper used in every materialized + test file (e.g. test_api_admin_materialized.py).""" + return {"Authorization": f"Bearer {token}"} + + +def test_register_materialized_with_bucket_only_generates_source_query( + seeded_app, bq_instance, stub_bq_extractor, +): + client = seeded_app["client"] + headers = _auth(seeded_app["admin_token"]) + payload = { + "name": "trivial_full_dump", + "source_type": "bigquery", + "query_mode": "materialized", + "bucket": "analytics", + "source_table": "orders_v2", + } + resp = client.post("/api/admin/register-table", json=payload, headers=headers) + assert resp.status_code in (200, 201, 202), resp.text + + reg = client.get("/api/admin/registry", headers=headers).json() + row = next(t for t in reg["tables"] if t["id"] == "trivial_full_dump") + expected_project = bq_instance["data_source"]["bigquery"]["project"] + assert row["source_query"] == ( + f"SELECT * FROM `{expected_project}.analytics.orders_v2`" + ) + + +def test_register_materialized_with_explicit_source_query_persists_verbatim( + seeded_app, bq_instance, stub_bq_extractor, +): + client = seeded_app["client"] + headers = _auth(seeded_app["admin_token"]) + custom = "SELECT col1, col2 FROM `analytics.orders_v2` WHERE col3 = 'x'" + payload = { + "name": "explicit_sql", + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": custom, + } + resp = client.post("/api/admin/register-table", json=payload, headers=headers) + assert resp.status_code in (200, 201, 202), resp.text + + reg = client.get("/api/admin/registry", headers=headers).json() + row = next(t for t in reg["tables"] if t["id"] == "explicit_sql") + assert row["source_query"] == custom + + +def test_put_flip_to_materialized_with_bucket_generates_source_query( + seeded_app, bq_instance, stub_bq_extractor, +): + client = seeded_app["client"] + headers = _auth(seeded_app["admin_token"]) + # First register as remote. + client.post( + "/api/admin/register-table", + json={"name": "flip_t", "source_type": "bigquery", + "bucket": "analytics", "source_table": "orders_v2"}, + headers=headers, + ) + # PUT to flip to materialized without supplying source_query. + resp = client.put( + "/api/admin/registry/flip_t", + json={"query_mode": "materialized"}, + headers=headers, + ) + assert resp.status_code == 200, resp.text + + reg = client.get("/api/admin/registry", headers=headers).json() + row = next(t for t in reg["tables"] if t["id"] == "flip_t") + expected_project = bq_instance["data_source"]["bigquery"]["project"] + assert row["source_query"] == ( + f"SELECT * FROM `{expected_project}.analytics.orders_v2`" + ) diff --git a/tests/test_admin_validator_backtick_relaxed_for_materialized.py b/tests/test_admin_validator_backtick_relaxed_for_materialized.py new file mode 100644 index 0000000..e0c7799 --- /dev/null +++ b/tests/test_admin_validator_backtick_relaxed_for_materialized.py @@ -0,0 +1,39 @@ +"""Backtick-quoted identifiers are required for materialized BQ source_query +(when the dataset/table/project name contains a dash). The validator must +allow them on materialized rows but still reject on remote/local.""" +from __future__ import annotations +import pytest +from pydantic import ValidationError + +from app.api.admin import RegisterTableRequest + + +def test_materialized_accepts_backticks(): + req = RegisterTableRequest( + name="b1", + source_type="bigquery", + query_mode="materialized", + source_query="SELECT * FROM `prj-grp.ds.tbl`", + ) + assert req.source_query == "SELECT * FROM `prj-grp.ds.tbl`" + + +def test_remote_rejects_backticks(): + with pytest.raises(ValidationError): + RegisterTableRequest( + name="r1", + source_type="bigquery", + query_mode="remote", + bucket="ds", source_table="tbl", + source_query="SELECT * FROM `prj.ds.tbl`", + ) + + +def test_local_rejects_backticks(): + with pytest.raises(ValidationError): + RegisterTableRequest( + name="l1", + source_type="keboola", + query_mode="local", + source_query="SELECT * FROM `kbc.ds.tbl`", + ) diff --git a/tests/test_api_admin_materialized.py b/tests/test_api_admin_materialized.py index 45a9dbb..d311d60 100644 --- a/tests/test_api_admin_materialized.py +++ b/tests/test_api_admin_materialized.py @@ -30,10 +30,10 @@ def _materialized_payload(**overrides): "name": "orders_90d", "source_type": "bigquery", "query_mode": "materialized", - # DuckDB-flavor SQL (not BQ-native backticks) — the materialize path - # runs the SQL through the DuckDB BQ extension's COPY which uses - # double-quoted identifiers. Backticks are now rejected at register - # time. See `_BACKTICK_REJECTION_MESSAGE` in app/api/admin.py. + # BQ-native or DuckDB-flavor SQL — both accepted since Task 2 wraps + # materialized SQL in bigquery_query() (BQ jobs API path). Backtick + # identifiers are now allowed for materialized rows; remote/local rows + # still require DuckDB-flavor (double-quoted) identifiers. "source_query": 'SELECT date FROM bq."ds"."orders"', "sync_schedule": "every 6h", } @@ -280,17 +280,23 @@ def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_ assert "WHERE x = 1" in row["source_query"] -# --- Backtick (BigQuery-native) source_query rejection ----------------------- +# --- Backtick (BigQuery-native) source_query handling ------------------------ # -# DuckDB BQ extension's COPY path interprets the SQL through DuckDB's parser, -# which does NOT understand backtick-quoted identifiers (it uses double quotes -# for quoted identifiers). A registered backtick-style source_query like -# `SELECT * FROM \`prj.ds.t\`` either parse-errors or returns 0 rows at next -# materialize tick — silently — and no parquet ends up at the canonical path. -# Reject at registration time with an actionable message. +# Task 2 (materialize-sync-fix) changed the BQ materialization path to run +# admin SQL through the BQ jobs API (bigquery_query() wrapper) rather than +# through DuckDB's BQ extension COPY path. BQ-native SQL requires backticks +# for dashed project/dataset/table identifiers. The backtick guard has been +# relaxed for ALL materialized rows: the validator now only rejects backticks +# for remote/local rows (DuckDB-flavor SQL contract). Materialized rows must +# be allowed to carry backticks so operators can reference dashed identifiers. +# See test_admin_validator_backtick_relaxed_for_materialized.py for the +# model-layer unit tests. -def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_instance): +def test_register_materialized_accepts_backtick_source_query(seeded_app, bq_instance, stub_bq_extractor): + """BQ materialized rows now accept BQ-native backtick syntax; the + materialize path (Task 2) wraps them in bigquery_query() which uses + the BQ jobs API — not DuckDB's COPY — so backticks are valid.""" c = seeded_app["client"] token = seeded_app["admin_token"] r = c.post( @@ -301,15 +307,17 @@ def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_inst ), headers=_auth(token), ) - assert r.status_code == 422, r.json() - detail = str(r.json().get("detail", "")).lower() - assert "backtick" in detail - assert 'bq."' in detail or "duckdb" in detail + assert r.status_code in (200, 201, 202), r.json() + reg = c.get("/api/admin/registry", headers=_auth(token)).json() + row = next(t for t in reg["tables"] if t["id"] == "bt_native") + assert row["source_query"] == "SELECT * FROM `prj-grp.ds.product_inventory`" -def test_update_materialized_rejects_backtick_source_query( +def test_update_materialized_accepts_backtick_source_query( seeded_app, bq_instance, stub_bq_extractor, ): + """PUT to a materialized BQ row may switch source_query to BQ-native + backtick form — accepted now that Task 2 wraps via jobs API.""" c = seeded_app["client"] token = seeded_app["admin_token"] @@ -324,7 +332,7 @@ def test_update_materialized_rejects_backtick_source_query( assert r.status_code == 201, r.json() table_id = r.json()["id"] - # PATCH the source_query to a backtick form — must be rejected. + # PATCH the source_query to a BQ-native backtick form — now accepted. r2 = c.put( f"/api/admin/registry/{table_id}", json={ @@ -333,14 +341,17 @@ def test_update_materialized_rejects_backtick_source_query( }, headers=_auth(token), ) - assert r2.status_code == 422, r2.json() - detail = str(r2.json().get("detail", "")).lower() - assert "backtick" in detail + assert r2.status_code == 200, r2.json() + reg = c.get("/api/admin/registry", headers=_auth(token)).json() + row = next(t for t in reg["tables"] if t["id"] == table_id) + assert row["source_query"] == "SELECT * FROM `prj.ds.t`" -def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app): - """The check is generic, not BQ-only — Keboola materialized rows that - include backticks would also be silently skipped at materialize time.""" +def test_register_materialized_keboola_accepts_backtick_source_query(seeded_app): + """Keboola materialized rows also accept backtick source_query at register + time — the backtick guard now only applies to remote/local rows. If the + SQL is invalid at runtime (DuckDB parse error), that surfaces as a sync + error, not a registration error.""" c = seeded_app["client"] token = seeded_app["admin_token"] r = c.post( @@ -353,9 +364,7 @@ def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app) }, headers=_auth(token), ) - assert r.status_code == 422, r.json() - detail = str(r.json().get("detail", "")).lower() - assert "backtick" in detail + assert r.status_code == 201, r.json() # --- Surface materialize errors per-row ---------------------------------------