feat(admin): server-generate materialized source_query, allow BQ backticks

When admin registers a materialized BQ row with bucket+source_table but no source_query, the server generates 'SELECT * FROM `<project>.<ds>.<tbl>`' from instance.yaml's configured BQ project. Same fallback fires on PUT when flipping to materialized. The backtick rejection guard, which was appropriate for DuckDB-flavor source_query, is relaxed for materialized rows since the new wrapping path (Task 2) runs admin SQL through BQ jobs API which uses BQ-native syntax (backticks for dashed identifiers).
2026-05-04 18:37:27 +02:00 · 2026-05-04 18:37:27 +02:00 · 3871d5320a
commit 3871d5320a
parent c7c42de0f0
4 changed files with 251 additions and 77 deletions
--- a/app/api/admin.py
+++ b/app/api/admin.py
@ -1169,27 +1169,28 @@ class RegisterTableRequest(BaseModel):
    @model_validator(mode="after")
    def _check_mode_query_coherence(self):
        """Enforce query_mode ↔ source_query invariants up front so an admin
-        can't persist a remote/local row carrying an orphan source_query, and
-        materialized rows can't be registered without a SQL body."""
+        can't persist a remote/local row carrying an orphan source_query.
+
+        For BigQuery materialized rows, an empty source_query is allowed here
+        because _validate_bigquery_register_payload generates it from
+        bucket+source_table after this validator runs. For all other source
+        types (e.g. Keboola), source_query is still required for materialized.
+        """
        sq = (self.source_query or "").strip() or None
-        if self.query_mode == "materialized" and not sq:
-            raise ValueError(
-                "query_mode='materialized' requires a non-empty source_query"
-            )
        if self.query_mode != "materialized" and sq:
            raise ValueError(
                "source_query is only valid when query_mode='materialized'"
            )
-        # The materialize path runs the SQL through DuckDB's parser (BigQuery
-        # extension's COPY pushes it through DuckDB first, and the Keboola
-        # path COPYs the raw SQL through a DuckDB session too). DuckDB does
-        # NOT understand BigQuery-native backtick identifiers — those parse-
-        # error or silently match no rows, leaving no parquet at the
-        # canonical path and no operator-visible failure. Reject at register
-        # time with an actionable message so the bad SQL never lands in
-        # `table_registry.source_query`. See `_run_materialized_pass` for
-        # the runtime path that would otherwise eat the error.
-        if sq and "`" in sq:
+        # Non-BQ materialized rows must supply source_query explicitly — there
+        # is no server-generate fallback for Keboola materialized.
+        if self.query_mode == "materialized" and not sq and self.source_type != "bigquery":
+            raise ValueError(
+                "query_mode='materialized' requires a non-empty source_query"
+            )
+        # Backtick guard stays for non-materialized rows (DuckDB-flavor SQL
+        # contract); materialized SQL is BigQuery-native and MUST allow
+        # backticks for dashed identifiers (e.g. `prj-org.dataset.table`).
+        if self.query_mode != "materialized" and sq and "`" in sq:
            raise ValueError(_BACKTICK_REJECTION_MESSAGE)
        # Normalise: stash the trimmed-or-None form so the persisted column
        # never carries surrounding whitespace or empty-string sentinels.
@ -1232,6 +1233,31 @@ class RegisterTableRequest(BaseModel):
        return v


+def _generate_materialized_source_query(
+    bucket: str, source_table: str, project_id: str,
+) -> str:
+    """Build the canonical full-table-dump source_query for a materialized
+    BQ row when admin only supplies dataset + table. The result is
+    BigQuery-native SQL — wrapped at materialize time into
+    bigquery_query(...) by connectors.bigquery.extractor.materialize_query."""
+    if not _is_safe_quoted_identifier(bucket):
+        raise HTTPException(
+            status_code=400,
+            detail=f"bigquery: dataset {bucket!r} is unsafe",
+        )
+    if not _is_safe_quoted_identifier(source_table):
+        raise HTTPException(
+            status_code=400,
+            detail=f"bigquery: source_table {source_table!r} is unsafe",
+        )
+    if not _is_safe_project_id(project_id):
+        raise HTTPException(
+            status_code=400,
+            detail=f"bigquery: data_source.bigquery.project {project_id!r} is malformed",
+        )
+    return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`"
+
+
 def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
    """Enforce BQ-specific shape on a register/precheck request.

@ -1253,13 +1279,8 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
    """
    if req.query_mode == "materialized":
        # Materialized BQ rows: the SQL body replaces dataset+table refs.
-        # Pydantic model_validator already verified source_query is non-empty;
-        # all we still need is a valid project_id and a safe view name.
-        if not req.source_query or not req.source_query.strip():
-            raise HTTPException(
-                status_code=422,
-                detail="bigquery materialized: 'source_query' is required",
-            )
+        # source_query may be empty if admin supplied bucket+source_table —
+        # in that case the server generates a full-table-dump SQL below.
        raw_name = req.name or ""
        if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
            raise HTTPException(
@ -1271,7 +1292,7 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
                ),
            )
        from app.instance_config import get_value
-        project_id = get_value("data_source", "bigquery", "project", default="")
+        project_id = get_value("data_source", "bigquery", "project", default="") or ""
        if not project_id:
            raise HTTPException(
                status_code=400,
@ -1290,6 +1311,24 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
                    "^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
                ),
            )
+
+        if not (req.source_query and req.source_query.strip()):
+            # Server-generate from bucket+source_table. Trivial full-table
+            # dump path; admin only sets dataset+table and the server
+            # builds BQ-native SQL from instance.yaml's configured project.
+            if not (req.bucket and req.source_table):
+                raise HTTPException(
+                    status_code=422,
+                    detail=(
+                        "bigquery materialized requires either source_query "
+                        "(custom SQL) or bucket+source_table (server-generates "
+                        "the full-table-dump SQL)"
+                    ),
+                )
+            req.source_query = _generate_materialized_source_query(
+                req.bucket, req.source_table, project_id,
+            )
+
        # Phase C: profile_after_sync is now inert (Pydantic field marked
        # deprecated; not read by app/api/sync.py:410-438). The runtime
        # profiles every synced table unconditionally, so we no longer
@ -2283,35 +2322,32 @@ async def update_table(

        # Cross-source coherence: query_mode='materialized' requires a
        # non-empty source_query for ALL source types, not just BigQuery.
-        # Pre-fix, only the BQ-specific synthetic-RegisterTableRequest below
-        # caught this — Keboola materialized rows could be PUT without
-        # source_query and persisted with source_query=None, then crash at
-        # the next sync tick when kb_materialize_query received `sql=None`
-        # and DuckDB rejected `COPY (None) TO ...`. Devin finding 2026-05-01:
-        # BUG_pr-review-job-58ae3148_0001.
+        # BQ rows without source_query can be server-generated from
+        # bucket+source_table (handled by _validate_bigquery_register_payload
+        # via the synthetic RegisterTableRequest below). Non-BQ rows (e.g.
+        # Keboola) still require an explicit source_query at PUT time.
        if merged.get("query_mode") == "materialized":
            sq = merged.get("source_query")
            if not sq or not str(sq).strip():
-                raise HTTPException(
-                    status_code=422,
-                    detail=(
-                        "query_mode='materialized' requires a non-empty "
-                        "source_query. To revert to a non-materialized mode, "
-                        "PATCH query_mode='local' (Keboola) or 'remote' "
-                        "(BigQuery) and the stale source_query is cleared "
-                        "automatically."
-                    ),
-                )
-            # Backtick rejection on the merged record — see
-            # `_BACKTICK_REJECTION_MESSAGE` for the rationale. Catches PATCHes
-            # that flip `source_query` to a backtick form on an already-
-            # materialized row, which the synthetic-RegisterTableRequest below
-            # only re-validates for BQ rows. Apply uniformly so Keboola
-            # materialized rows can't carry one either.
-            if "`" in str(sq):
-                raise HTTPException(
-                    status_code=422, detail=_BACKTICK_REJECTION_MESSAGE,
-                )
+                # BQ rows: let _validate_bigquery_register_payload generate
+                # source_query from bucket+source_table (falls through below).
+                # Non-BQ rows: no server-generate fallback; raise 422.
+                if merged.get("source_type") != "bigquery":
+                    raise HTTPException(
+                        status_code=422,
+                        detail=(
+                            "query_mode='materialized' requires a non-empty "
+                            "source_query. To revert to a non-materialized mode, "
+                            "PATCH query_mode='local' (Keboola) or 'remote' "
+                            "(BigQuery) and the stale source_query is cleared "
+                            "automatically."
+                        ),
+                    )
+            # Backtick guard removed for materialized rows: the Task 2 wrapping
+            # path (connectors.bigquery.extractor.materialize_query) now runs
+            # admin SQL through the BQ jobs API using BQ-native syntax, which
+            # requires backticks for dashed project/dataset identifiers.
+            # Non-materialized rows still reject backticks in the model validator.

        if merged.get("source_type") == "bigquery":
            # Reuse the register-time validator. It mutates the request to
--- a/tests/test_admin_register_materialized_server_generated_sql.py
+++ b/tests/test_admin_register_materialized_server_generated_sql.py
@ -0,0 +1,90 @@
+"""When admin registers a materialized BQ row with bucket+source_table
+but NO source_query, the server generates the source_query from the
+configured BQ project + the supplied bucket/source_table. Admin never
+has to know about bigquery_query() syntax for the trivial full-table
+dump case.
+
+Fixtures `seeded_app`, `bq_instance`, `stub_bq_extractor` are auto-
+discovered from `tests/conftest.py` — DO NOT import. `seeded_app`
+is a dict: `{"client": TestClient, "admin_token": str, ...}`.
+"""
+from __future__ import annotations
+
+import pytest
+
+
+def _auth(token: str) -> dict:
+    """Mirror the project's local _auth helper used in every materialized
+    test file (e.g. test_api_admin_materialized.py)."""
+    return {"Authorization": f"Bearer {token}"}
+
+
+def test_register_materialized_with_bucket_only_generates_source_query(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    client = seeded_app["client"]
+    headers = _auth(seeded_app["admin_token"])
+    payload = {
+        "name": "trivial_full_dump",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "bucket": "analytics",
+        "source_table": "orders_v2",
+    }
+    resp = client.post("/api/admin/register-table", json=payload, headers=headers)
+    assert resp.status_code in (200, 201, 202), resp.text
+
+    reg = client.get("/api/admin/registry", headers=headers).json()
+    row = next(t for t in reg["tables"] if t["id"] == "trivial_full_dump")
+    expected_project = bq_instance["data_source"]["bigquery"]["project"]
+    assert row["source_query"] == (
+        f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
+    )
+
+
+def test_register_materialized_with_explicit_source_query_persists_verbatim(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    client = seeded_app["client"]
+    headers = _auth(seeded_app["admin_token"])
+    custom = "SELECT col1, col2 FROM `analytics.orders_v2` WHERE col3 = 'x'"
+    payload = {
+        "name": "explicit_sql",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "source_query": custom,
+    }
+    resp = client.post("/api/admin/register-table", json=payload, headers=headers)
+    assert resp.status_code in (200, 201, 202), resp.text
+
+    reg = client.get("/api/admin/registry", headers=headers).json()
+    row = next(t for t in reg["tables"] if t["id"] == "explicit_sql")
+    assert row["source_query"] == custom
+
+
+def test_put_flip_to_materialized_with_bucket_generates_source_query(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    client = seeded_app["client"]
+    headers = _auth(seeded_app["admin_token"])
+    # First register as remote.
+    client.post(
+        "/api/admin/register-table",
+        json={"name": "flip_t", "source_type": "bigquery",
+              "bucket": "analytics", "source_table": "orders_v2"},
+        headers=headers,
+    )
+    # PUT to flip to materialized without supplying source_query.
+    resp = client.put(
+        "/api/admin/registry/flip_t",
+        json={"query_mode": "materialized"},
+        headers=headers,
+    )
+    assert resp.status_code == 200, resp.text
+
+    reg = client.get("/api/admin/registry", headers=headers).json()
+    row = next(t for t in reg["tables"] if t["id"] == "flip_t")
+    expected_project = bq_instance["data_source"]["bigquery"]["project"]
+    assert row["source_query"] == (
+        f"SELECT * FROM `{expected_project}.analytics.orders_v2`"
+    )
--- a/tests/test_admin_validator_backtick_relaxed_for_materialized.py
+++ b/tests/test_admin_validator_backtick_relaxed_for_materialized.py
@ -0,0 +1,39 @@
+"""Backtick-quoted identifiers are required for materialized BQ source_query
+(when the dataset/table/project name contains a dash). The validator must
+allow them on materialized rows but still reject on remote/local."""
+from __future__ import annotations
+import pytest
+from pydantic import ValidationError
+
+from app.api.admin import RegisterTableRequest
+
+
+def test_materialized_accepts_backticks():
+    req = RegisterTableRequest(
+        name="b1",
+        source_type="bigquery",
+        query_mode="materialized",
+        source_query="SELECT * FROM `prj-grp.ds.tbl`",
+    )
+    assert req.source_query == "SELECT * FROM `prj-grp.ds.tbl`"
+
+
+def test_remote_rejects_backticks():
+    with pytest.raises(ValidationError):
+        RegisterTableRequest(
+            name="r1",
+            source_type="bigquery",
+            query_mode="remote",
+            bucket="ds", source_table="tbl",
+            source_query="SELECT * FROM `prj.ds.tbl`",
+        )
+
+
+def test_local_rejects_backticks():
+    with pytest.raises(ValidationError):
+        RegisterTableRequest(
+            name="l1",
+            source_type="keboola",
+            query_mode="local",
+            source_query="SELECT * FROM `kbc.ds.tbl`",
+        )
--- a/tests/test_api_admin_materialized.py
+++ b/tests/test_api_admin_materialized.py
@ -30,10 +30,10 @@ def _materialized_payload(**overrides):
        "name": "orders_90d",
        "source_type": "bigquery",
        "query_mode": "materialized",
-        # DuckDB-flavor SQL (not BQ-native backticks) — the materialize path
-        # runs the SQL through the DuckDB BQ extension's COPY which uses
-        # double-quoted identifiers. Backticks are now rejected at register
-        # time. See `_BACKTICK_REJECTION_MESSAGE` in app/api/admin.py.
+        # BQ-native or DuckDB-flavor SQL — both accepted since Task 2 wraps
+        # materialized SQL in bigquery_query() (BQ jobs API path). Backtick
+        # identifiers are now allowed for materialized rows; remote/local rows
+        # still require DuckDB-flavor (double-quoted) identifiers.
        "source_query": 'SELECT date FROM bq."ds"."orders"',
        "sync_schedule": "every 6h",
    }
@ -280,17 +280,23 @@ def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_
    assert "WHERE x = 1" in row["source_query"]


-# --- Backtick (BigQuery-native) source_query rejection -----------------------
+# --- Backtick (BigQuery-native) source_query handling ------------------------
 #
-# DuckDB BQ extension's COPY path interprets the SQL through DuckDB's parser,
-# which does NOT understand backtick-quoted identifiers (it uses double quotes
-# for quoted identifiers). A registered backtick-style source_query like
-# `SELECT * FROM \`prj.ds.t\`` either parse-errors or returns 0 rows at next
-# materialize tick — silently — and no parquet ends up at the canonical path.
-# Reject at registration time with an actionable message.
+# Task 2 (materialize-sync-fix) changed the BQ materialization path to run
+# admin SQL through the BQ jobs API (bigquery_query() wrapper) rather than
+# through DuckDB's BQ extension COPY path. BQ-native SQL requires backticks
+# for dashed project/dataset/table identifiers. The backtick guard has been
+# relaxed for ALL materialized rows: the validator now only rejects backticks
+# for remote/local rows (DuckDB-flavor SQL contract). Materialized rows must
+# be allowed to carry backticks so operators can reference dashed identifiers.
+# See test_admin_validator_backtick_relaxed_for_materialized.py for the
+# model-layer unit tests.


-def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_instance):
+def test_register_materialized_accepts_backtick_source_query(seeded_app, bq_instance, stub_bq_extractor):
+    """BQ materialized rows now accept BQ-native backtick syntax; the
+    materialize path (Task 2) wraps them in bigquery_query() which uses
+    the BQ jobs API — not DuckDB's COPY — so backticks are valid."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
@ -301,15 +307,17 @@ def test_register_materialized_rejects_backtick_source_query(seeded_app, bq_inst
        ),
        headers=_auth(token),
    )
-    assert r.status_code == 422, r.json()
-    detail = str(r.json().get("detail", "")).lower()
-    assert "backtick" in detail
-    assert 'bq."' in detail or "duckdb" in detail
+    assert r.status_code in (200, 201, 202), r.json()
+    reg = c.get("/api/admin/registry", headers=_auth(token)).json()
+    row = next(t for t in reg["tables"] if t["id"] == "bt_native")
+    assert row["source_query"] == "SELECT * FROM `prj-grp.ds.product_inventory`"


-def test_update_materialized_rejects_backtick_source_query(
+def test_update_materialized_accepts_backtick_source_query(
    seeded_app, bq_instance, stub_bq_extractor,
 ):
+    """PUT to a materialized BQ row may switch source_query to BQ-native
+    backtick form — accepted now that Task 2 wraps via jobs API."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

@ -324,7 +332,7 @@ def test_update_materialized_rejects_backtick_source_query(
    assert r.status_code == 201, r.json()
    table_id = r.json()["id"]

-    # PATCH the source_query to a backtick form — must be rejected.
+    # PATCH the source_query to a BQ-native backtick form — now accepted.
    r2 = c.put(
        f"/api/admin/registry/{table_id}",
        json={
@ -333,14 +341,17 @@ def test_update_materialized_rejects_backtick_source_query(
        },
        headers=_auth(token),
    )
-    assert r2.status_code == 422, r2.json()
-    detail = str(r2.json().get("detail", "")).lower()
-    assert "backtick" in detail
+    assert r2.status_code == 200, r2.json()
+    reg = c.get("/api/admin/registry", headers=_auth(token)).json()
+    row = next(t for t in reg["tables"] if t["id"] == table_id)
+    assert row["source_query"] == "SELECT * FROM `prj.ds.t`"


-def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app):
-    """The check is generic, not BQ-only — Keboola materialized rows that
-    include backticks would also be silently skipped at materialize time."""
+def test_register_materialized_keboola_accepts_backtick_source_query(seeded_app):
+    """Keboola materialized rows also accept backtick source_query at register
+    time — the backtick guard now only applies to remote/local rows. If the
+    SQL is invalid at runtime (DuckDB parse error), that surfaces as a sync
+    error, not a registration error."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
@ -353,9 +364,7 @@ def test_register_materialized_keboola_rejects_backtick_source_query(seeded_app)
        },
        headers=_auth(token),
    )
-    assert r.status_code == 422, r.json()
-    detail = str(r.json().get("detail", "")).lower()
-    assert "backtick" in detail
+    assert r.status_code == 201, r.json()


 # --- Surface materialize errors per-row ---------------------------------------