SQL using only a full backtick path (`<proj>.<dataset>.<table>`) as the table reference had neither bare name_lookups nor direct bq.ds.tbl matches, so _rewrite_user_sql_for_bigquery_query's Skip 1 returned the original SQL unchanged. DuckDB then rejected the backtick syntax locally with "syntax error at or near `"" before the query ever reached BigQuery. Detect _BACKTICK_FULL_PATH matches in the rewriter and include them in the Skip 1 guard so the SQL gets wrapped in bigquery_query(). No identifier rewrite is needed — backtick paths are already BQ-native and _rewrite_bq_table_refs_to_native preserves them verbatim via its backtick-split pass. Closes #363
836 lines
31 KiB
Python
836 lines
31 KiB
Python
"""Unit tests for ``_rewrite_user_sql_for_bigquery_query``.

The helper rewrites user SQL referencing query_mode='remote' BigQuery
tables so the entire query ships to BQ via the DuckDB BQ extension's
``bigquery_query(<project>, <sql>)`` UDF — engaging WHERE / SELECT /
LIMIT predicate pushdown instead of falling through to ATTACH-catalog
mode (which opens a Storage Read API session over the whole table).

These tests pin down each conservative-skip rule plus the happy-path
rewrites. Edge cases (CTE shadowing, double-wrap, mixed-source JOIN)
are intentionally explicit so a future refactor doesn't quietly
loosen the guard.
"""
|
||
from __future__ import annotations
|
||
|
||
import pytest
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Test infrastructure: an in-memory DuckDB seeded with table_registry rows
# matching the shapes the production registry produces. Avoids the full app
# bootstrap path; the rewriter only needs ``conn.execute("SELECT * FROM
# table_registry ...")`` to resolve names.
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@pytest.fixture
def seeded_registry(tmp_path, monkeypatch):
    """Yield a system-DB connection rooted in a per-test ``DATA_DIR``.

    ``DATA_DIR`` is pointed at ``tmp_path`` and a ``state`` sub-directory is
    created there before the connection is opened. ``close_system_db()`` is
    called before opening (so a handle left over from a previous test's
    directory is dropped) and again once the test has finished; the files
    themselves disappear with ``tmp_path``.
    """
    from src.db import close_system_db, get_system_db

    state_dir = tmp_path / "state"
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    state_dir.mkdir(parents=True, exist_ok=True)

    # Drop any previously opened singleton before handing out a new handle.
    close_system_db()
    yield get_system_db()
    close_system_db()
|
||
|
||
|
||
def _register_bq_remote(conn, *, table_id, name, bucket, source_table):
    """Register a BigQuery row with ``query_mode='remote'`` via ``conn``."""
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(conn)
    row = {
        "id": table_id,
        "name": name,
        "source_type": "bigquery",
        "bucket": bucket,
        "source_table": source_table,
        "query_mode": "remote",
    }
    registry.register(**row)
|
||
|
||
|
||
def _register_local(conn, *, table_id, name, source_type="keboola"):
    """Register a ``query_mode='local'`` row in bucket ``bkt`` via ``conn``.

    The row's ``source_table`` is set to the same value as ``name``.
    """
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(conn)
    row = {
        "id": table_id,
        "name": name,
        "source_type": source_type,
        "bucket": "bkt",
        "source_table": name,
        "query_mode": "local",
    }
    registry.register(**row)
|
||
|
||
|
||
def _set_bq_project(monkeypatch, project="test-prj", billing=None):
    """Patch ``app.api.query.get_bq_access`` to return a stubbed ``BqAccess``.

    ``project`` becomes the data project (the one used in backtick paths).
    ``billing`` optionally names a separate billing project so cross-project
    deployments can be exercised; when it is falsy the data project is used
    for billing as well (the single-project case).
    """
    from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access

    billing_project = billing or project

    def _fake_client(projects):
        # Placeholder client object; this helper never talks to BigQuery.
        return object()

    stub_access = BqAccess(
        BqProjects(billing=billing_project, data=project),
        client_factory=_fake_client,
    )

    def _get_stub_access():
        return stub_access

    monkeypatch.setattr(
        "app.api.query.get_bq_access", _get_stub_access, raising=False,
    )
    get_bq_access.cache_clear()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Happy-path rewrites
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def test_simple_select_where_against_one_bq_table_rewrites(seeded_registry, monkeypatch):
    """Single-table SELECT-WHERE against a registered BQ remote row.

    The full SQL must come back wrapped in exactly one
    ``bigquery_query('project', '<rewritten>')`` call, with the bare-name
    reference translated to the BQ-native backtick form.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query

    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        "SELECT count(*) FROM ue WHERE event_date = '2026-01-01'",
        seeded_registry,
    )

    assert did_rewrite is True
    # Outer wrap must be a single bigquery_query() FROM-source. Count the
    # occurrences (as the CTE / subquery / multi-table tests in this module
    # do) so an accidental double-wrap fails here instead of slipping past a
    # plain substring check.
    assert rewritten.lower().count("bigquery_query(") == 1
    assert "test-prj" in rewritten
    # Inner SQL: bare name rewritten to backticked BQ-native path.
    assert "`test-prj.fin.ue`" in rewritten
    # Inner SQL is dollar-quoted (`$bqq_inner$ ... $bqq_inner$`), so
    # single quotes inside the WHERE predicate remain literal — no
    # doubling, no backslash escaping. Verifies the safer embedding form
    # introduced after the code review caught naive single-quote-only
    # escape doubling missing DuckDB backslash sequences.
    assert "$bqq_inner$" in rewritten
    assert "event_date = '2026-01-01'" in rewritten
|
||
|
||
|
||
def test_direct_bq_path_rewrites(seeded_registry, monkeypatch):
    """The DuckDB-flavoured direct ``bq.<ds>.<tbl>`` spelling is translated too.

    The wrapped SQL must carry the BQ-native backtick path and must no
    longer contain the user's original quoted ``bq`` reference.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _set_bq_project(monkeypatch, "test-prj")

    duckdb_style_sql = 'SELECT * FROM bq."fin"."ue" LIMIT 10'
    wrapped_sql, was_rewritten = rewrite(duckdb_style_sql, seeded_registry)

    assert was_rewritten is True
    assert "bigquery_query(" in wrapped_sql
    assert "`test-prj.fin.ue`" in wrapped_sql
    # The DuckDB-only spelling would parse-fail on the BigQuery side.
    assert 'bq."fin"."ue"' not in wrapped_sql
|
||
|
||
|
||
def test_cte_referencing_bq_table_rewrites_inside_cte(seeded_registry, monkeypatch):
    """A BQ table referenced inside a ``WITH`` body is rewritten in place.

    Wrapping happens once at the top level, so BigQuery receives a valid
    BQ-flavoured CTE.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.orders",
        name="orders",
        bucket="fin",
        source_table="orders",
    )
    _set_bq_project(monkeypatch, "test-prj")

    cte_sql = "WITH x AS (SELECT id FROM orders WHERE total > 0) SELECT count(*) FROM x"
    wrapped_sql, was_rewritten = rewrite(cte_sql, seeded_registry)

    assert was_rewritten is True
    # The reference inside the CTE body got the backtick path.
    assert "`test-prj.fin.orders`" in wrapped_sql
    # Exactly one wrapper around the whole statement.
    wrap_count = wrapped_sql.lower().count("bigquery_query(")
    assert wrap_count == 1
|
||
|
||
|
||
def test_subquery_referencing_bq_table_rewrites(seeded_registry, monkeypatch):
    """A FROM-position subquery is handled like a CTE.

    The inner table reference is rewritten and the statement as a whole is
    wrapped once.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _set_bq_project(monkeypatch, "test-prj")

    subquery_sql = "SELECT s.cnt FROM (SELECT count(*) AS cnt FROM ue) s"
    wrapped_sql, was_rewritten = rewrite(subquery_sql, seeded_registry)

    assert was_rewritten is True
    assert "`test-prj.fin.ue`" in wrapped_sql
    wrap_count = wrapped_sql.lower().count("bigquery_query(")
    assert wrap_count == 1
|
||
|
||
|
||
def test_multiple_bq_tables_one_project_combine(seeded_registry, monkeypatch):
    """Two registered BQ tables in one project share a single wrapper.

    Both references are rewritten inline inside one ``bigquery_query()``
    call; no separate parallel calls are produced.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    for table in ("orders", "users"):
        _register_bq_remote(
            seeded_registry,
            table_id=f"bq.fin.{table}",
            name=table,
            bucket="fin",
            source_table=table,
        )
    _set_bq_project(monkeypatch, "test-prj")

    join_sql = (
        "SELECT u.id, count(o.id) FROM users u "
        "JOIN orders o ON u.id = o.user_id GROUP BY u.id"
    )
    wrapped_sql, was_rewritten = rewrite(join_sql, seeded_registry)

    assert was_rewritten is True
    # Each table reference received its backtick path.
    assert "`test-prj.fin.users`" in wrapped_sql
    assert "`test-prj.fin.orders`" in wrapped_sql
    # One wrapper for the whole statement.
    wrap_count = wrapped_sql.lower().count("bigquery_query(")
    assert wrap_count == 1
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Conservative-skip cases
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
    """A BQ-to-local JOIN is cross-source and must not be wrapped.

    Wrapping in ``bigquery_query()`` would lose the local-mode
    (Keboola/Jira) table, so the rewriter has to return the SQL untouched
    and let the ATTACH-catalog path (slow but correct) handle it.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _register_local(
        seeded_registry, table_id="kbc.in.local_orders", name="local_orders",
    )
    _set_bq_project(monkeypatch, "test-prj")

    mixed_sql = (
        "SELECT u.id, lo.total FROM ue u "
        "JOIN local_orders lo ON u.id = lo.user_id"
    )
    result_sql, was_rewritten = rewrite(mixed_sql, seeded_registry)

    assert was_rewritten is False
    # The SQL must come back exactly as submitted.
    assert result_sql == mixed_sql
|
||
|
||
|
||
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
    seeded_registry, monkeypatch,
):
    """A local table name inside a backtick BQ path is not a local reference.

    Devin Review on PR #208 (issue #201 follow-up). A registered LOCAL-mode
    table name that only appears as a segment of a user-supplied full
    backtick BQ path must NOT trip the cross-source guard. Before the fix
    the bare-name regex ran against the unmasked lowered SQL, matched the
    local ``orders`` inside the backticks, and forced a bail-out to the
    ATTACH-catalog slow path (50-100x slower). After the fix the regex runs
    against the backtick-masked copy, so only BQ refs are seen and the wrap
    proceeds.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")

    bq_only_sql = (
        "SELECT u.id FROM ue u "
        "JOIN `test-prj.dataset.orders` o ON u.x = o.x WHERE o.value > 0"
    )
    wrapped_sql, was_rewritten = rewrite(bq_only_sql, seeded_registry)

    # Both refs are BQ, so the local `orders` registration is irrelevant
    # and the query must be wrapped.
    assert was_rewritten is True
    assert "bigquery_query(" in wrapped_sql
    # The user's own backtick path is carried through into the wrapped
    # inner SQL (Layer 1 split-on-backticks behaviour).
    assert "test-prj.dataset.orders" in wrapped_sql
|
||
|
||
|
||
def test_full_backtick_path_only_wraps_in_bigquery_query(seeded_registry, monkeypatch):
    """Issue #363: SQL whose only table reference is a full backtick path.

    The query uses ``<proj>.<dataset>.<table>`` in backticks with no bare
    catalog name and no ``bq.ds.tbl`` form. Before the fix both
    ``name_lookups`` and ``direct_paths`` were empty, Skip 1 fired, and the
    raw backtick SQL reached ``analytics.execute()`` where DuckDB's parser
    rejected it with a "syntax error at or near" the backtick. After the
    fix ``has_backtick_paths`` detects the pattern, Skip 1 does not fire,
    and the SQL is wrapped in ``bigquery_query()`` for BQ to run natively.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _set_bq_project(monkeypatch, "test-prj")

    backtick_sql = (
        "SELECT COUNT(*) FROM `test-prj.fin.ue` "
        "WHERE event_date = DATE '2026-05-15'"
    )
    wrapped_sql, was_rewritten = rewrite(backtick_sql, seeded_registry)

    assert was_rewritten is True
    assert "bigquery_query(" in wrapped_sql
    # Already BQ-native, so the path is kept verbatim inside the wrap.
    assert "`test-prj.fin.ue`" in wrapped_sql
    # The raw backtick SQL must not be what goes to DuckDB's local parser.
    assert wrapped_sql != backtick_sql
|
||
|
||
|
||
def test_full_backtick_path_only_unregistered_table_still_wraps(
    seeded_registry, monkeypatch,
):
    """Backtick-only SQL is wrapped even when its path is not registered.

    The upstream guardrail (``_bq_guardrail_inputs`` pass 3) is the RBAC
    gate; the rewriter only has to keep backtick syntax away from the local
    DuckDB parser. An unregistered path then surfaces as a clean error from
    BQ instead of a local "syntax error at or near" the backtick.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    # Deliberately no registry rows: the path below is unknown to the app.
    _set_bq_project(monkeypatch, "test-prj")

    unregistered_sql = "SELECT * FROM `test-prj.some_ds.some_tbl` LIMIT 5"
    wrapped_sql, was_rewritten = rewrite(unregistered_sql, seeded_registry)

    assert was_rewritten is True
    assert "bigquery_query(" in wrapped_sql
    assert "test-prj.some_ds.some_tbl" in wrapped_sql
|
||
|
||
|
||
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
    """SQL that only touches local-source tables comes back unchanged.

    No rewrite, no log spam: the original SQL is returned as-is.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")

    local_sql = "SELECT * FROM orders WHERE id = 1"
    result_sql, was_rewritten = rewrite(local_sql, seeded_registry)

    assert was_rewritten is False
    assert result_sql == local_sql
|
||
|
||
|
||
def test_already_contains_bigquery_query_passes_through(seeded_registry, monkeypatch):
    """SQL that already calls ``bigquery_query()`` is never double-wrapped.

    Note: the /api/query endpoint blocks ``bigquery_query`` in user SQL via
    the keyword denylist, so this cannot reach the rewriter in production
    today. It is a defensive guard for callers arriving by other paths.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _set_bq_project(monkeypatch, "test-prj")

    prewrapped_sql = (
        "SELECT * FROM bigquery_query("
        "'test-prj', 'SELECT * FROM `test-prj.fin.ue`')"
    )
    result_sql, was_rewritten = rewrite(prewrapped_sql, seeded_registry)

    assert was_rewritten is False
    assert result_sql == prewrapped_sql
|
||
|
||
|
||
def test_unconfigured_bq_project_skips(seeded_registry, monkeypatch):
    """No rewrite when ``get_bq_access()`` is the not-configured sentinel.

    With an empty data project there is nothing to fill into
    ``bigquery_query()``, so the SQL must be returned unchanged.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )

    from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access

    def _unconfigured_access():
        # Sentinel shape: both project ids are empty strings.
        return BqAccess(BqProjects(billing="", data=""))

    monkeypatch.setattr(
        "app.api.query.get_bq_access", _unconfigured_access, raising=False,
    )
    get_bq_access.cache_clear()

    plain_sql = "SELECT * FROM ue"
    result_sql, was_rewritten = rewrite(plain_sql, seeded_registry)

    assert was_rewritten is False
    assert result_sql == plain_sql
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Backwards-compat: dry-run helper still available + behaves the same
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def test_existing_dry_run_helper_still_callable():
    """``_rewrite_user_sql_for_bq_dry_run`` keeps its explicit-project API.

    The original helper is now a thin wrapper around the shared core
    rewriter (Pass 1 + Pass 2); callers passing an explicit ``project``
    argument must keep working unchanged.
    """
    from app.api.query import _rewrite_user_sql_for_bq_dry_run as dry_run_rewrite

    lookups = [("ue", "fin", "ue")]
    dry_run_sql = dry_run_rewrite(
        sql="SELECT * FROM ue", name_lookups=lookups, project="some-prj",
    )

    assert "`some-prj.fin.ue`" in dry_run_sql
    # Adding the bigquery_query() wrapper is only the job of the new
    # execution-path helper, never of the dry-run one.
    assert "bigquery_query(" not in dry_run_sql
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# End-to-end: the /api/query handler must invoke the rewriter and execute
# the rewritten SQL (not the original) when there's a BQ-remote table.
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _auth(token: str) -> dict:
|
||
return {"Authorization": f"Bearer {token}"}
|
||
|
||
|
||
def _register_bq_remote_row(name: str, bucket: str, source_table: str) -> None:
    """Register a remote-mode BigQuery row through the system DB.

    The row id is built as ``bq.<bucket>.<source_table>``. The connection
    obtained from ``get_system_db()`` is closed afterwards whether or not
    registration succeeds.
    """
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    row = {
        "id": f"bq.{bucket}.{source_table}",
        "name": name,
        "source_type": "bigquery",
        "bucket": bucket,
        "source_table": source_table,
        "query_mode": "remote",
    }
    system_conn = get_system_db()
    try:
        TableRegistryRepository(system_conn).register(**row)
    finally:
        system_conn.close()
|
||
|
||
|
||
@pytest.fixture
def stub_bq_for_endpoint(monkeypatch):
    """Patch the endpoint's BQ hooks so no real RPC is issued.

    ``_bq_dry_run_bytes`` is replaced by a stub returning 1024 (small
    enough to pass the cap guard) and ``get_bq_access`` by a factory for a
    fake access object exposing ``test-data-prj`` / ``test-billing-prj``.
    """

    def _tiny_dry_run(*a, **k):
        return 1024  # tiny — pass cap

    class _StubProjects:
        data = "test-data-prj"
        billing = "test-billing-prj"

    class _StubBqAccess:
        projects = _StubProjects()

    def _make_access():
        return _StubBqAccess()

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", _tiny_dry_run, raising=False,
    )
    monkeypatch.setattr(
        "app.api.query.get_bq_access", _make_access, raising=False,
    )
|
||
|
||
|
||
def test_endpoint_executes_rewritten_sql_against_analytics(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """/api/query must execute the rewritten SQL, not the user's original.

    When a BQ-remote table is referenced, whatever reaches
    ``analytics.execute()`` is captured and checked for the
    ``bigquery_query()`` wrap.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    # The handler does `analytics = get_analytics_db_readonly();
    # analytics.execute(sql)`, so the connection factory is patched to hand
    # out a stub that records the SQL it is given.
    seen = {"sql": None}

    class _NoRows:
        def fetchmany(self, _n):
            return []

    class _RecordingAnalytics:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            seen["sql"] = sql
            return _NoRows()

        def close(self):
            pass

    def _open_stub():
        return _RecordingAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {"sql": "SELECT count(*) FROM ue WHERE country = 'CZ'"}
    r = client.post("/api/query", json=payload, headers=headers)

    assert r.status_code == 200, r.json()
    sent = seen["sql"]
    assert sent is not None, "analytics.execute was never called"
    assert "bigquery_query(" in sent, (
        f"endpoint did not wrap user SQL in bigquery_query(); sent: {sent!r}"
    )
    assert "test-data-prj" in sent
    assert "`test-data-prj.fin.ue`" in sent
|
||
|
||
|
||
def test_endpoint_passes_original_sql_when_no_bq_table(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Queries touching no BQ-remote registered name pass through unchanged.

    The ATTACH-catalog path handles local-source tables natively, so any
    rewrite would be wasted work.
    """
    seen = {"sql": None}

    class _NoRows:
        def fetchmany(self, _n):
            return []

    class _RecordingAnalytics:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            seen["sql"] = sql
            return _NoRows()

        def close(self):
            pass

    def _open_stub():
        return _RecordingAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    plain_sql = "SELECT 1 AS x"
    r = client.post("/api/query", json={"sql": plain_sql}, headers=headers)

    assert r.status_code == 200, r.json()
    assert seen["sql"] == plain_sql
    assert "bigquery_query(" not in seen["sql"]
|
||
|
||
|
||
def test_endpoint_wraps_rewritten_sql_with_outer_limit(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Memory-safety regression (code-review #2a fix).

    When the rewriter fires, the handler MUST put an outer ``LIMIT N+1``
    around the ``bigquery_query()`` call so that a `SELECT *` against a
    billion-row remote table is not materialised in full inside the worker
    before ``fetchmany`` applies the cap.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    seen = {"sql": None}

    class _NoRows:
        def fetchmany(self, _n):
            return []

    class _RecordingAnalytics:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            seen["sql"] = sql
            return _NoRows()

        def close(self):
            pass

    def _open_stub():
        return _RecordingAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {"sql": "SELECT * FROM ue", "limit": 100}
    r = client.post("/api/query", json=payload, headers=headers)

    assert r.status_code == 200, r.json()
    sent = seen["sql"]
    # Expect the bigquery_query() wrap plus a second, outer wrap whose
    # LIMIT is the requested cap + 1 (the +1 is the existing
    # truncation-detection pattern).
    assert "bigquery_query(" in sent
    assert "_bqq_outer" in sent
    assert "LIMIT 101" in sent  # request.limit (100) + 1
|
||
|
||
|
||
def test_endpoint_falls_back_to_original_sql_on_bq_parse_error(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """A parse-level failure of the rewritten path triggers a retry.

    If the ``bigquery_query()`` path fails with a parse error (e.g. the
    user SQL contained DuckDB-only syntax BQ cannot parse), the handler
    MUST retry the original SQL via the ATTACH-catalog path so the request
    still succeeds. Code-review #4 fix.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    executed = {"sqls": []}

    class _OneRow:
        def fetchmany(self, _n):
            return [(1,)]

    class _ParseErrorOnWrapped:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            executed["sqls"].append(sql)
            # The wrapped attempt raises a BQ-style parse error; the
            # original-SQL fallback that follows returns rows.
            if "bigquery_query(" in sql:
                raise RuntimeError(
                    "BinderException: Query execution failed: "
                    "Syntax error: Unexpected token at [1:42]"
                )
            return _OneRow()

        def close(self):
            pass

    def _open_stub():
        return _ParseErrorOnWrapped()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    # DuckDB-only ::INT cast — survives identifier rewrite, BQ refuses.
    duckdb_only_sql = "SELECT (count(*))::INT FROM ue"
    r = client.post("/api/query", json={"sql": duckdb_only_sql}, headers=headers)

    assert r.status_code == 200, r.json()
    # Two execute calls: the wrapped one (raised), then the original.
    attempts = executed["sqls"]
    assert len(attempts) == 2
    assert "bigquery_query(" in attempts[0]
    assert attempts[1] == "SELECT (count(*))::INT FROM ue"
|
||
|
||
|
||
def test_rewriter_uses_billing_project_for_bigquery_query_first_arg(
    seeded_registry, monkeypatch,
):
    """Devin-review BUG #1: billing project first, data project in paths.

    The first argument of `bigquery_query()` is the **billing** project
    (where BQ jobs are billed and executed) while backtick paths use the
    **data** project. In cross-project deploys the SA has
    `serviceusage.services.use` only on the billing project, so passing
    the data project there yields 403 USER_PROJECT_DENIED.

    This matches the existing convention in v2_scan / v2_sample /
    v2_schema / extractor.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _set_bq_project(monkeypatch, project="data-prj", billing="billing-prj")

    wrapped_sql, was_rewritten = rewrite("SELECT count(*) FROM ue", seeded_registry)

    assert was_rewritten is True
    # Billing project is the wrapper's first argument ...
    assert "bigquery_query('billing-prj'" in wrapped_sql
    # ... the data project goes into the backtick path ...
    assert "`data-prj.fin.ue`" in wrapped_sql
    # ... and never into the wrapper's first argument.
    assert "bigquery_query('data-prj'" not in wrapped_sql
|
||
|
||
|
||
def test_rewriter_skips_when_bq_row_bucket_contains_dot(
    seeded_registry, monkeypatch,
):
    """Devil's-advocate R1 finding #5: dotted buckets disable the rewrite.

    A BQ row whose `bucket` contains `.` suggests the operator encoded a
    project prefix in the bucket name. Wrapping under the single-project
    assumption could silently target the wrong project, so the rewriter
    must skip and fall through to the ATTACH-catalog path, which respects
    the operator's `_remote_attach` configuration.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query as rewrite

    # Project-qualified bucket — the multi-project red flag.
    dotted_bucket = "other-prj.dataset"
    _register_bq_remote(
        seeded_registry,
        table_id="bq.other-prj.dataset.ue",
        name="ue",
        bucket=dotted_bucket,
        source_table="ue",
    )
    _set_bq_project(monkeypatch, "test-prj")

    result_sql, was_rewritten = rewrite("SELECT count(*) FROM ue", seeded_registry)

    # Skipped: the original SQL is returned and nothing is rewritten.
    assert was_rewritten is False
    assert result_sql == "SELECT count(*) FROM ue"
|
||
|
||
|
||
def test_fallback_does_not_trigger_on_user_column_typo(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Devil's-advocate R1 finding #2: a column typo must not cause a retry.

    The fallback heuristic used to match `Unrecognized name`, which BQ
    reports both for DuckDB-only names AND for plain user column typos.
    The typo case then re-ran the original SQL through the slow
    ATTACH-catalog path (90+ s), a 2x latency tax on every typo.

    After the fix the heuristic only matches `Syntax error`, so a BQ-side
    `Unrecognized name: bad_col` must propagate as-is without a fallback
    retry.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    executed = {"sqls": []}

    class _UnrecognizedNameAnalytics:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            executed["sqls"].append(sql)
            raise RuntimeError(
                "BinderException: Query execution failed: "
                "Unrecognized name: bad_col at [1:8]"
            )

        def close(self):
            pass

    def _open_stub():
        return _UnrecognizedNameAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    r = client.post(
        "/api/query", json={"sql": "SELECT bad_col FROM ue"}, headers=headers,
    )

    # The error propagates and only one execute call was made (no retry).
    assert r.status_code in (400, 500, 502)
    attempts = executed["sqls"]
    assert len(attempts) == 1, (
        "user column typo must NOT trigger fallback retry"
    )
    assert "bigquery_query(" in attempts[0]
|
||
|
||
|
||
def test_endpoint_does_not_fall_back_on_non_parse_errors(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Non-parse failures of the rewritten path propagate without a retry.

    Network, quota, forbidden and generic runtime errors must NOT be
    silently retried against the legacy path: it would only fail again and
    leave the user with a slow double failure.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    executed = {"sqls": []}

    class _NetworkErrorAnalytics:
        description = [("c0",)]

        def execute(self, sql, *args, **kwargs):
            executed["sqls"].append(sql)
            raise RuntimeError("Network unreachable: BQ endpoint timed out")

        def close(self):
            pass

    def _open_stub():
        return _NetworkErrorAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    r = client.post(
        "/api/query", json={"sql": "SELECT count(*) FROM ue"}, headers=headers,
    )

    # The handler's outer except turns this into an error response whose
    # body surfaces the runtime message; all that matters here is that no
    # fallback was attempted.
    assert r.status_code in (400, 500, 502)
    assert len(executed["sqls"]) == 1, "must not retry on non-parse error"
|
||
|
||
|
||
def test_endpoint_backtick_only_path_wraps_not_local_parse(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Issue #363 regression at the endpoint level.

    SQL that uses only a full backtick BQ path must be wrapped in
    ``bigquery_query()`` before it reaches ``analytics.execute()``. It must
    NOT be handed raw to the local DuckDB instance, which would reject the
    backtick syntax with a "syntax error at or near" the backtick.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    seen = {"sql": None}

    class _CountRow:
        def fetchmany(self, _n):
            return [(42,)]

    class _RecordingAnalytics:
        description = [("cnt",)]

        def execute(self, sql, *args, **kwargs):
            seen["sql"] = sql
            return _CountRow()

        def close(self):
            pass

    def _open_stub():
        return _RecordingAnalytics()

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly", _open_stub, raising=False,
    )

    client = seeded_app["client"]
    headers = _auth(seeded_app["admin_token"])
    payload = {"sql": "SELECT COUNT(*) FROM `test-data-prj.fin.ue` WHERE x = 1"}
    r = client.post("/api/query", json=payload, headers=headers)

    assert r.status_code == 200, r.json()
    sent = seen["sql"]
    assert sent is not None
    assert "bigquery_query(" in sent, (
        f"backtick-path SQL must be wrapped in bigquery_query(); got: {sent!r}"
    )
    # The user's path is still present inside the wrapper.
    assert "test-data-prj.fin.ue" in sent
|