Merge pull request #208 from keboola/zs/issue-201-rewriter-backtick
fix(query): rewriter respects backtick paths; tighten cap-guard fallback (#201)
This commit is contained in:
commit
e3494607bf
5 changed files with 586 additions and 87 deletions
34
CHANGELOG.md
34
CHANGELOG.md
|
|
@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.42.0] — 2026-05-06
|
||||
|
||||
### Fixed
|
||||
- `agnes query --remote`: full backtick BigQuery paths in user SQL are no
|
||||
longer corrupted by the registered-name rewriter. Previously a query
|
||||
like ``SELECT … FROM `<project>.<dataset>.<table>` WHERE …`` whose
|
||||
table name happened to be registered as a bare-name alias would have
|
||||
the alias re-substituted *inside* the backtick path, producing
|
||||
malformed SQL that BigQuery rejected with a parse error. The cap-guard
|
||||
then fell back to a filter-less `SELECT *` size estimate (often orders
|
||||
of magnitude larger than the real scan), blocking the query as
|
||||
`remote_scan_too_large`. Issue #201.
|
||||
|
||||
### Changed
|
||||
- `agnes query --remote`: cap-guard fallback no longer estimates from
|
||||
a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first
|
||||
retries the user's original SQL (handles BQ-native input cleanly), and
|
||||
only when *that* also fails returns a structured `remote_estimate_failed`
|
||||
HTTP 400 with a hint instead of silently over-estimating.
|
||||
- **BREAKING (clients matching error kinds)**: failure to estimate
|
||||
remote-query scan size now returns `kind="remote_estimate_failed"`
|
||||
instead of being masked as `remote_scan_too_large` caused by
|
||||
over-estimation. Operators that grep for the old kind in dashboards
|
||||
should update.
|
||||
|
||||
### Security
|
||||
- `agnes query --remote`: full backtick BigQuery paths are now
|
||||
registry-gated identically to `bq."<dataset>"."<table>"` syntax.
|
||||
Previously, full backtick paths bypassed Agnes RBAC entirely — only
|
||||
the configured service account scope limited what users could query.
|
||||
New `bq_path_cross_project` (when the project ≠ configured data
|
||||
project) and `bq_path_not_registered` (when path is unknown) error
|
||||
kinds. Issue #201.
|
||||
|
||||
## [0.41.0] — 2026-05-06
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
247
app/api/query.py
247
app/api/query.py
|
|
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
|
|||
)
|
||||
|
||||
|
||||
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
|
||||
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
|
||||
# bare-name regexes from firing inside backtick-quoted segments.
|
||||
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
|
||||
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
|
||||
|
||||
|
||||
def _mask_backticks(sql: str) -> str:
|
||||
"""Replace each `…`-quoted segment with spaces of equal length so
|
||||
word-boundary regexes find positions outside backticks but ignore
|
||||
everything inside. Preserves all character offsets so ``re.search``
|
||||
on the masked string returns matches at the same positions as on the
|
||||
original.
|
||||
|
||||
Issue #201: `\\b` matches inside backtick segments because both `.`
|
||||
and `` ` `` are non-word characters. A registered bare-name like
|
||||
``unit_economics`` would otherwise match inside a user-supplied full
|
||||
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
|
||||
falsely rewritten — corrupting the user's intended SQL.
|
||||
"""
|
||||
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
|
||||
|
||||
|
||||
def _default_remote_query_cap_bytes() -> int:
|
||||
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
||||
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
||||
|
|
@ -197,11 +220,18 @@ def execute_query(
|
|||
if r.get("name") and r.get("id") in allowed_ids
|
||||
}
|
||||
|
||||
# Check if query references any forbidden tables (word-boundary match)
|
||||
# Check if query references any forbidden tables (word-boundary
|
||||
# match). Issue #201: mask backtick segments so `\b` doesn't
|
||||
# falsely fire inside a user-supplied full backtick path like
|
||||
# `<project>.<dataset>.<table>` whose final segment happens to
|
||||
# collide with a forbidden master view name. The full-path
|
||||
# registry-gate downstream is the proper authorization check
|
||||
# for those.
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
forbidden = all_views - allowed_view_names
|
||||
for table in forbidden:
|
||||
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
||||
|
||||
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
||||
|
|
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
|
|||
name_lookups: list = []
|
||||
seen_paths: set = set()
|
||||
accessible_set = set(allowed) if allowed is not None else None
|
||||
# Issue #201: mask backtick segments so a registered bare name like
|
||||
# `unit_economics` doesn't false-positive on a user-supplied full
|
||||
# backtick path `<project>.<dataset>.unit_economics`. The full-path
|
||||
# pass below registry-gates those properly.
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
for r in repo.list_by_source("bigquery"):
|
||||
if (r.get("query_mode") or "") != "remote":
|
||||
continue
|
||||
|
|
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
|
|||
# before we get here. Defensive skip.
|
||||
continue
|
||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
|
|
@ -529,6 +564,66 @@ def _bq_guardrail_inputs(
|
|||
seen_paths.add(key)
|
||||
dry_run.append((bucket, source_table, 0))
|
||||
|
||||
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
|
||||
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
|
||||
# service account scope limited which tables a user could reach. Gate
|
||||
# them identically to the `bq.<ds>.<tbl>` pass: must match the
|
||||
# configured data project, must point at a registered row, and the
|
||||
# caller must hold a grant on that row's id (admin bypasses the grant
|
||||
# check but still requires registration + project match).
|
||||
#
|
||||
# Lazy `get_bq_access()` import via the module-level alias so tests
|
||||
# can monkeypatch a fake. When BQ isn't configured (no data project),
|
||||
# fall through silently — full backtick paths can't possibly resolve
|
||||
# against this instance, so leave them to BQ to reject if a query
|
||||
# somehow makes it through.
|
||||
try:
|
||||
bq = get_bq_access()
|
||||
data_project = (bq.projects.data or "").strip()
|
||||
except Exception:
|
||||
data_project = ""
|
||||
|
||||
if data_project:
|
||||
for m in _BACKTICK_FULL_PATH.finditer(sql):
|
||||
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
|
||||
if proj.lower() != data_project.lower():
|
||||
return [], [], {
|
||||
"reason": "bq_path_cross_project",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"expected_project": data_project,
|
||||
"hint": (
|
||||
"--remote queries can only reference tables in the "
|
||||
"configured BigQuery data project. Register "
|
||||
"cross-project tables via `agnes admin "
|
||||
"register-table` if needed."
|
||||
),
|
||||
}
|
||||
row = repo.find_by_bq_path(ds, tbl)
|
||||
if row is None:
|
||||
return [], [], {
|
||||
"reason": "bq_path_not_registered",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"hint": (
|
||||
"Direct BigQuery paths must point to a registered "
|
||||
"table. Register via `agnes admin register-table` "
|
||||
"or use the registered name from `agnes catalog`."
|
||||
),
|
||||
}
|
||||
if not is_admin:
|
||||
if accessible_set is None or row["id"] not in accessible_set:
|
||||
return [], [], {
|
||||
"reason": "bq_path_access_denied",
|
||||
"path": f"`{proj}.{ds}.{tbl}`",
|
||||
"registered_as": row["name"],
|
||||
}
|
||||
bucket = row["bucket"]
|
||||
source_table = row["source_table"]
|
||||
if bucket and source_table:
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
dry_run.append((bucket, source_table, 0))
|
||||
|
||||
return dry_run, name_lookups, None
|
||||
|
||||
|
||||
|
|
@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native(
|
|||
# name up in a case-insensitive dict. Single-pass means freshly
|
||||
# inserted backticked text isn't re-scanned, fixing the
|
||||
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
||||
#
|
||||
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
|
||||
# outside-backtick chunks. Without this, a user-supplied full backtick
|
||||
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
|
||||
# segment matches a registered bare name would have the bare-name
|
||||
# regex fire INSIDE the backticks (since `\\b` treats both `.` and
|
||||
# `` ` `` as non-word boundaries), producing malformed nested
|
||||
# backticks. Splitting confines the rewrite to user identifier
|
||||
# positions where bare-name resolution is the intended behaviour.
|
||||
if name_lookups:
|
||||
# Map name (lower-cased) → backticked target. Names are
|
||||
# case-insensitive on the input side per the existing helper
|
||||
|
|
@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native(
|
|||
def _name_repl(m: re.Match) -> str:
|
||||
return name_to_target[m.group(1).lower()]
|
||||
|
||||
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
|
||||
# `re.split` with a captured group returns: [outside, backtick,
|
||||
# outside, backtick, …]. Even indices are outside-backtick chunks
|
||||
# eligible for bare-name rewrite; odd indices are full backtick
|
||||
# segments preserved verbatim.
|
||||
parts = re.split(r'(`[^`]*`)', out)
|
||||
for i, part in enumerate(parts):
|
||||
if i % 2 == 0:
|
||||
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
|
||||
out = "".join(parts)
|
||||
|
||||
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
||||
def _bq_path_repl(m: re.Match) -> str:
|
||||
|
|
@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
return user_sql, False
|
||||
|
||||
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
|
||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
|
||||
# bare-name regex must run against a backtick-masked copy so a
|
||||
# registered name like ``orders`` doesn't false-positive when it
|
||||
# appears as the table segment of a user-supplied full backtick path
|
||||
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
|
||||
# cross-source check below would falsely conclude the SQL touches
|
||||
# both BQ-remote and local sources, dropping every backtick-path
|
||||
# query into the 50-100× slower ATTACH-catalog fallback. Devin
|
||||
# Review on PR #208.
|
||||
sql_lower = user_sql.lower()
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
name_lookups: list = []
|
||||
seen_paths: set = set()
|
||||
|
||||
|
|
@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
# mix rewritten and non-rewritten BQ paths in one query.
|
||||
return user_sql, False
|
||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
|
|
@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
# Same name registered both BQ-remote and local? Pathological;
|
||||
# skip as a safety measure.
|
||||
return user_sql, False
|
||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
|
||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
|
||||
logger.info(
|
||||
"rewrite_skip_cross_source: user SQL references both "
|
||||
"BQ-remote and local-mode tables; falling back to "
|
||||
|
|
@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard(
|
|||
partitioned/clustered tables and rejecting narrow queries that BQ
|
||||
itself would dry-run as a few MB.
|
||||
|
||||
Fallback: if BQ rejects the rewritten SQL with a parse-level
|
||||
``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
|
||||
don't translate to BQ), fall back to the pre-#171 per-table
|
||||
SELECT * approach so the cap-guard still functions — over-estimate
|
||||
is preferred over fail-open. Forbidden / upstream errors still
|
||||
propagate as HTTP 502.
|
||||
Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
|
||||
``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
|
||||
a rewriter bug that broke valid BQ-native input), retry with the
|
||||
user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
|
||||
original ALSO fails, return a structured `remote_estimate_failed`
|
||||
HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
|
||||
over-estimate. The synthetic fallback threw away user filters and
|
||||
routinely ballooned to "full table size", blocking legitimate narrow
|
||||
queries via `remote_scan_too_large`. Forbidden / upstream errors
|
||||
still propagate as HTTP 502.
|
||||
|
||||
Flow:
|
||||
1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
|
||||
2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
|
||||
3. Single dry-run of rewritten user SQL → `total_bytes`.
|
||||
On parse error, fall back to per-table SELECT * → sum.
|
||||
On parse error, retry with the user's original SQL.
|
||||
On second parse error, raise 400 `remote_estimate_failed`.
|
||||
4. If total > cap → 400 `remote_scan_too_large`.
|
||||
5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
|
||||
6. On exit, slot released.
|
||||
|
||||
Mutates `dry_run_set` in place: the third tuple element (bytes) is
|
||||
populated so the caller can sum and record bytes against the user's
|
||||
quota post-flight. Single-dry-run path puts `total_bytes` on the first
|
||||
entry and zero on the rest (BQ doesn't expose per-table bytes for a
|
||||
composite query); the caller's `sum(b for _, _, b in dry_run_set)`
|
||||
still equals `total_bytes`.
|
||||
quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
|
||||
— BQ doesn't expose per-table bytes for a composite query — so
|
||||
`sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
|
||||
"""
|
||||
quota = _build_quota_tracker()
|
||||
try:
|
||||
|
|
@ -901,55 +1026,65 @@ def _bq_quota_and_cap_guard(
|
|||
sql, name_lookups, project,
|
||||
)
|
||||
|
||||
# Try the single-dry-run path first (issue #171). Falls back
|
||||
# to the per-table SELECT * approach only on BQ parse errors
|
||||
# (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
|
||||
# translate). All other BQ errors propagate as 502 below.
|
||||
# Try the single-dry-run path first (issue #171). On BQ parse
|
||||
# errors (`bq_bad_request` — typically DuckDB-only syntax the
|
||||
# rewriter couldn't translate, OR — pre-#201 fix — a
|
||||
# rewriter-corrupted backtick path) retry the user's ORIGINAL
|
||||
# SQL: when the user submitted BQ-native SQL, the rewriter is
|
||||
# the only thing standing between them and a clean dry-run.
|
||||
# If the original ALSO fails, this is true DuckDB-only syntax
|
||||
# that BQ cannot estimate — fail fast with a structured
|
||||
# `remote_estimate_failed` instead of the pre-#201 synthetic
|
||||
# `SELECT *` over-estimate (which threw away user filters and
|
||||
# often ballooned to "full table size", blocking legitimate
|
||||
# narrow queries via `remote_scan_too_large`).
|
||||
#
|
||||
# All other BQ errors (forbidden, upstream) propagate as 502.
|
||||
total_bytes = 0
|
||||
used_fallback = False
|
||||
try:
|
||||
total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
|
||||
except BqAccessError as exc:
|
||||
if exc.kind == "bq_bad_request":
|
||||
if exc.kind != "bq_bad_request":
|
||||
raise HTTPException(status_code=502, detail={
|
||||
"kind": exc.kind,
|
||||
"message": exc.message,
|
||||
**(exc.details or {}),
|
||||
})
|
||||
logger.warning(
|
||||
"BQ dry-run rejected the rewritten SQL "
|
||||
"(kind=%s, message=%s). Falling back to per-table "
|
||||
"SELECT * estimate; the cap check will over-estimate "
|
||||
"scan bytes for this query. Consider rewriting to "
|
||||
"BQ-native syntax for a tight pre-check.",
|
||||
"(kind=%s, message=%s). Retrying with the user's "
|
||||
"original SQL.",
|
||||
exc.kind, exc.message,
|
||||
)
|
||||
used_fallback = True
|
||||
else:
|
||||
try:
|
||||
total_bytes = _bq_dry_run_bytes(bq, sql)
|
||||
except BqAccessError as exc2:
|
||||
if exc2.kind != "bq_bad_request":
|
||||
raise HTTPException(status_code=502, detail={
|
||||
"kind": exc.kind,
|
||||
"message": exc.message,
|
||||
**(exc.details or {}),
|
||||
"kind": exc2.kind,
|
||||
"message": exc2.message,
|
||||
**(exc2.details or {}),
|
||||
})
|
||||
raise HTTPException(status_code=400, detail={
|
||||
"kind": "remote_estimate_failed",
|
||||
"message": (
|
||||
"Could not estimate scan size for this query."
|
||||
),
|
||||
"hint": (
|
||||
"Use a registered table name from `agnes "
|
||||
"catalog`, or write BQ-native SQL with full "
|
||||
"backtick paths. Pure DuckDB-only syntax is "
|
||||
"not supported for --remote queries."
|
||||
),
|
||||
"underlying": exc2.message,
|
||||
})
|
||||
|
||||
if used_fallback:
|
||||
# Pre-#171 path: estimate per registered table from a
|
||||
# synthetic SELECT *. Over-estimates partitioned scans but
|
||||
# never under-estimates, so the cap still bounds risk.
|
||||
for i, (bucket, source_table, _) in enumerate(dry_run_set):
|
||||
fallback_sql = (
|
||||
f"SELECT * FROM `{project}.{bucket}.{source_table}`"
|
||||
)
|
||||
try:
|
||||
est = _bq_dry_run_bytes(bq, fallback_sql)
|
||||
except BqAccessError as exc:
|
||||
raise HTTPException(status_code=502, detail={
|
||||
"kind": exc.kind,
|
||||
"message": exc.message,
|
||||
**(exc.details or {}),
|
||||
})
|
||||
dry_run_set[i] = (bucket, source_table, est)
|
||||
total_bytes += est
|
||||
else:
|
||||
# Single-dry-run path. Distribute the total to dry_run_set
|
||||
# so the caller's `record_bytes(sum(...))` stays correct.
|
||||
# Per-table breakdown is unavailable from a composite
|
||||
# dry-run; pin total to entry 0, zero the rest.
|
||||
# Distribute the total to dry_run_set so the caller's
|
||||
# `record_bytes(sum(...))` stays correct. Per-table breakdown
|
||||
# is unavailable from a composite dry-run; pin total to entry
|
||||
# 0, zero the rest. (Same accounting symmetry whether the
|
||||
# bytes came from the rewritten SQL or the original-SQL
|
||||
# retry.)
|
||||
if dry_run_set:
|
||||
b0, t0, _ = dry_run_set[0]
|
||||
dry_run_set[0] = (b0, t0, total_bytes)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "agnes-the-ai-analyst"
|
||||
version = "0.41.0"
|
||||
version = "0.42.0"
|
||||
description = "Agnes — AI Data Analyst platform for AI analytical systems"
|
||||
requires-python = ">=3.11,<3.14"
|
||||
license = "MIT"
|
||||
|
|
|
|||
|
|
@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
|
|||
assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
|
||||
|
||||
|
||||
def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
||||
def test_fallback_tries_original_sql_first(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
|
||||
syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
|
||||
the cap-guard falls back to the pre-#171 per-table SELECT * approach
|
||||
so a non-portable query still gets a (loose) cap estimate instead of
|
||||
fail-opening.
|
||||
"""
|
||||
"""Issue #201 — when the rewriter produces SQL that BQ rejects with
|
||||
`bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
|
||||
cap-guard uses the original SQL's byte estimate. No more synthetic
|
||||
`SELECT *` over-estimate.
|
||||
|
||||
Bare-name reference populates `dry_run_set` so the cap-guard
|
||||
actually fires. Mock returns parse-error on the first call
|
||||
(rewritten SQL) and small bytes on the second (original)."""
|
||||
from connectors.bigquery.access import BqAccessError
|
||||
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
|
|
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
|||
|
||||
def fake_dry_run(_bq, sql):
|
||||
state["calls"].append(sql)
|
||||
# First call (rewritten user SQL) → BQ parse error.
|
||||
# First call (rewritten SQL) → BQ parse error.
|
||||
if len(state["calls"]) == 1:
|
||||
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
|
||||
# Second call (fallback per-table SELECT *) → small bytes, pass cap.
|
||||
raise BqAccessError("bq_bad_request", "Syntax error: simulated")
|
||||
# Second call (the user's original SQL) → small, passes cap.
|
||||
return 4096
|
||||
|
||||
monkeypatch.setattr(
|
||||
|
|
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
|||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
# SQL with DuckDB-only `::INT` cast that BQ would reject.
|
||||
user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
|
||||
r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
|
||||
|
||||
# Two dry-runs: rewritten then original. No third synthetic-SELECT-*
|
||||
# call.
|
||||
assert len(state["calls"]) == 2, (
|
||||
f"expected rewritten + original-SQL retry, got "
|
||||
f"{len(state['calls'])}: {state['calls']}"
|
||||
)
|
||||
assert state["calls"][1] == user_sql, (
|
||||
f"second call must be the user's ORIGINAL SQL, got "
|
||||
f"{state['calls'][1]!r}"
|
||||
)
|
||||
# The response must NOT be remote_scan_too_large from a synthetic
|
||||
# over-estimate — 4096 bytes is well under the 5 GiB cap.
|
||||
if r.status_code == 400:
|
||||
detail = r.json().get("detail", {})
|
||||
if isinstance(detail, dict):
|
||||
assert detail.get("reason") != "remote_scan_too_large", detail
|
||||
|
||||
|
||||
def test_fallback_fails_fast_on_pure_duckdb_syntax(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""When BOTH the rewritten and original SQL fail with `bq_bad_request`
|
||||
(true DuckDB-only syntax like `::INT`), return HTTP 400
|
||||
`remote_estimate_failed` — never silently over-estimate via a
|
||||
synthetic `SELECT *`."""
|
||||
from connectors.bigquery.access import BqAccessError
|
||||
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
|
||||
state = {"calls": []}
|
||||
|
||||
def always_parse_error(_bq, sql):
|
||||
state["calls"].append(sql)
|
||||
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
|
||||
headers=_auth(token),
|
||||
)
|
||||
|
||||
# Two dry-runs (rewritten + fallback per-table) before the (failed)
|
||||
# execute. Status will be a downstream error from analytics.execute()
|
||||
# since `::INT` doesn't work in DuckDB either against a remote view —
|
||||
# but the GUARDRAIL must have completed without 5xx-ing.
|
||||
# Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
|
||||
assert len(state["calls"]) == 2, (
|
||||
f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
|
||||
f"{state['calls']}"
|
||||
f"expected 1 rewritten + 1 original-retry, got "
|
||||
f"{len(state['calls'])}: {state['calls']}"
|
||||
)
|
||||
assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
|
||||
assert state["calls"][1].startswith("SELECT * FROM"), (
|
||||
"second call should be the per-table fallback"
|
||||
# No call should be a synthetic ``SELECT * FROM `<project>...```. The
|
||||
# original-SQL retry contains the user's SELECT clause.
|
||||
for c_sql in state["calls"]:
|
||||
# If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
|
||||
# the user's `WHERE country = 'CZ'` would be missing.
|
||||
if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
|
||||
raise AssertionError(
|
||||
f"synthetic SELECT * fallback was used: {c_sql!r}"
|
||||
)
|
||||
# Whatever HTTP status comes back must NOT be 502 from the guard's
|
||||
# transport-error path — fallback must absorb the bq_bad_request.
|
||||
assert r.status_code != 502, r.json()
|
||||
|
||||
assert r.status_code == 400, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert isinstance(detail, dict), detail
|
||||
assert detail.get("kind") == "remote_estimate_failed", detail
|
||||
assert "underlying" in detail, detail
|
||||
assert "agnes catalog" in detail.get("hint", "").lower() or \
|
||||
"backtick" in detail.get("hint", "").lower(), detail
|
||||
|
||||
|
||||
def test_guardrail_propagates_502_on_non_parse_bq_errors(
|
||||
|
|
@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
|
|||
)
|
||||
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
||||
rewritten.lower().count("`p.fin.ue`") == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
|
||||
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
|
||||
# table segment matches a registered bare name was being re-substituted
|
||||
# inside the backticks, producing malformed nested-backtick SQL that BQ
|
||||
# rejected with a parse error.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_rewrite_skips_inside_backtick_path():
|
||||
"""Full backtick BQ path is preserved byte-for-byte even when its
|
||||
final segment matches a registered bare-name alias."""
|
||||
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||
|
||||
sql = (
|
||||
"SELECT * FROM `my-prj.finance.unit_economics` "
|
||||
"WHERE country = 'CZ'"
|
||||
)
|
||||
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||
sql=sql,
|
||||
name_lookups=[("unit_economics", "finance", "unit_economics")],
|
||||
project="my-prj",
|
||||
)
|
||||
# No corruption — input is already BQ-native, rewriter is a no-op here.
|
||||
assert rewritten == sql, (
|
||||
f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}"
|
||||
)
|
||||
# Sanity: the malformed nested form must NOT appear.
|
||||
assert "`my-prj.finance.`my-prj" not in rewritten
|
||||
|
||||
|
||||
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
|
||||
"""Mixed SQL: a bare name outside backticks is rewritten as before,
|
||||
but an identically-named segment inside a backtick path is left
|
||||
alone."""
|
||||
from app.api.query import _rewrite_user_sql_for_bq_dry_run
|
||||
|
||||
sql = (
|
||||
"SELECT a.id, b.col FROM ue a "
|
||||
"JOIN `my-prj.finance.ue` b ON a.id = b.id"
|
||||
)
|
||||
rewritten = _rewrite_user_sql_for_bq_dry_run(
|
||||
sql=sql,
|
||||
name_lookups=[("ue", "fin_alias", "ue_alias")],
|
||||
project="my-prj",
|
||||
)
|
||||
# Outside-backtick `ue` rewrites to the registered alias path.
|
||||
assert "`my-prj.fin_alias.ue_alias`" in rewritten
|
||||
# The user-supplied backtick path is preserved verbatim.
|
||||
assert "`my-prj.finance.ue`" in rewritten
|
||||
# The malformed nested form must NOT appear.
|
||||
assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten
|
||||
|
||||
|
||||
def test_guardrail_skips_bare_name_match_inside_backticks(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""The `name_lookups` collection populated by `_bq_guardrail_inputs`
|
||||
must not include a registered name when the only place that name
|
||||
appears in the SQL is inside a `…` backtick segment.
|
||||
|
||||
Captures the rewritten SQL the guardrail forwards to the dry-run and
|
||||
asserts the bare-name was NOT substituted inside the user's backtick
|
||||
path.
|
||||
"""
|
||||
_register_bq_remote_row("unit_economics", "finance", "unit_economics")
|
||||
|
||||
captured = {"sql": None}
|
||||
|
||||
def capturing_fake(_bq, sql):
|
||||
captured["sql"] = sql
|
||||
return 1024
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
user_sql = (
|
||||
"SELECT * FROM `test-data-prj.finance.unit_economics` "
|
||||
"WHERE country = 'CZ'"
|
||||
)
|
||||
c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
|
||||
|
||||
sent = captured["sql"]
|
||||
if sent is None:
|
||||
# Guardrail decided no BQ tables were referenced — that's also
|
||||
# an acceptable "no false-positive" outcome (Layer 3 will cover
|
||||
# the explicit registry check for full backtick paths). We just
|
||||
# need to ensure the bare-name regex didn't fire.
|
||||
return
|
||||
# The user's exact backtick path must survive verbatim — no nested
|
||||
# backticks introduced by a stray bare-name rewrite.
|
||||
assert "`test-data-prj.finance.unit_economics`" in sent, (
|
||||
f"backtick path corrupted by guardrail:\n out: {sent!r}"
|
||||
)
|
||||
assert "`test-data-prj.finance.`test-data-prj" not in sent, (
|
||||
f"nested-backtick corruption signature present: {sent!r}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
|
||||
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
|
||||
# account scope limited which tables a user could reach. Post-fix, they're
|
||||
# treated identically to `bq."<dataset>"."<table>"` syntax.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
|
||||
"""Full backtick path to an unregistered `<dataset>.<table>` (project
|
||||
matches the configured data project) → HTTP 403 with
|
||||
`bq_path_not_registered`."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": (
|
||||
"SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
|
||||
"WHERE country = 'CZ'"
|
||||
),
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 403, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert isinstance(detail, dict), detail
|
||||
assert detail.get("reason") == "bq_path_not_registered", detail
|
||||
assert "secret_ds" in detail.get("path", ""), detail
|
||||
assert "secret_tbl" in detail.get("path", ""), detail
|
||||
|
||||
|
||||
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
|
||||
"""Full backtick path with project ≠ configured data project → HTTP
|
||||
403 with `bq_path_cross_project`. Even if the path happens to point
|
||||
at a registered (bucket, source_table), the project mismatch is the
|
||||
primary boundary."""
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 403, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert isinstance(detail, dict), detail
|
||||
assert detail.get("reason") == "bq_path_cross_project", detail
|
||||
assert detail.get("expected_project") == "test-data-prj", detail
|
||||
assert "other-project" in detail.get("path", ""), detail
|
||||
|
||||
|
||||
def test_full_backtick_path_registered_admin_passes(
|
||||
seeded_app, mock_dry_run, monkeypatch,
|
||||
):
|
||||
"""Admin caller + registered path + matching project → no RBAC
|
||||
rejection. The dry-run fires (we can capture the SQL the guardrail
|
||||
forwards) and no `bq_path_*` reason appears in any error response."""
|
||||
_register_bq_remote_row("ue", "finance", "ue")
|
||||
|
||||
captured = {"sql": None}
|
||||
|
||||
def capturing_fake(_bq, sql):
|
||||
captured["sql"] = sql
|
||||
return 1024 # tiny — pass cap
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
# If 403, must NOT be the issue-#201 bq_path_* reasons.
|
||||
if r.status_code == 403:
|
||||
detail = r.json().get("detail", {})
|
||||
if isinstance(detail, dict):
|
||||
assert detail.get("reason") not in (
|
||||
"bq_path_not_registered",
|
||||
"bq_path_access_denied",
|
||||
"bq_path_cross_project",
|
||||
), f"admin + registered path should pass RBAC: {detail}"
|
||||
# The dry-run was invoked, meaning Pass 3 added the path to dry_run_set
|
||||
# and the cap-guard fired. The user's WHERE clause must still be in
|
||||
# the dry-run SQL (validates Layer 1 — backtick-aware rewrite).
|
||||
assert captured["sql"] is not None, (
|
||||
"dry-run never fired — Pass 3 may not have registered the path"
|
||||
)
|
||||
assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"]
|
||||
assert "WHERE id = 1" in captured["sql"], captured["sql"]
|
||||
|
||||
|
||||
def test_full_backtick_path_inside_string_literal_not_gated(
|
||||
seeded_app, mock_dry_run,
|
||||
):
|
||||
"""Defensive case: a backtick path appearing inside a SQL string
|
||||
literal (rare but possible) should not trigger Pass 3. Practically
|
||||
this is unreachable because backticks aren't typically valid inside
|
||||
BQ string literals — but the regex doesn't know that. We document
|
||||
that the gate applies to ALL backtick triples to be safe; users who
|
||||
really need a literal can use single-quoted strings without
|
||||
backticks."""
|
||||
# No registration; the test confirms an unregistered path inside
|
||||
# what looks like a string is still gated. This is the conservative
|
||||
# boundary — false-positive on string literal beats false-negative
|
||||
# on a real RBAC bypass.
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/query",
|
||||
json={
|
||||
"sql": (
|
||||
"SELECT 'matches `test-data-prj.x.y`' AS lit"
|
||||
),
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
# Either gated (403) or 200 if the analytics DB happens to evaluate
|
||||
# the literal — both are acceptable. The point is no silent RBAC
|
||||
# bypass: if the response is 200, no BQ table was reached.
|
||||
if r.status_code == 403:
|
||||
detail = r.json().get("detail", {})
|
||||
if isinstance(detail, dict):
|
||||
assert detail.get("reason") in (
|
||||
"bq_path_not_registered",
|
||||
"bq_path_cross_project",
|
||||
), detail
|
||||
|
|
|
|||
|
|
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
|
|||
assert rewritten == user_sql # untouched
|
||||
|
||||
|
||||
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
|
||||
seeded_registry, monkeypatch,
|
||||
):
|
||||
"""Devin Review on PR #208 (issue #201 follow-up): a registered
|
||||
LOCAL-mode table name appearing as a segment of a user-supplied full
|
||||
backtick BQ path must NOT trip the cross-source guard. Pre-fix the
|
||||
bare-name regex at the cross-source check ran against unmasked
|
||||
sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
|
||||
local ``orders`` inside the backticks and force the wrapper to bail
|
||||
to the ATTACH-catalog slow path (50-100× slower). Post-fix the
|
||||
regex runs against the backtick-masked copy, the cross-source check
|
||||
correctly sees only BQ refs, and the wrap proceeds.
|
||||
"""
|
||||
from app.api.query import _rewrite_user_sql_for_bigquery_query
|
||||
_register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
|
||||
bucket="fin", source_table="ue")
|
||||
_register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
|
||||
_set_bq_project(monkeypatch, "test-prj")
|
||||
|
||||
user_sql = (
|
||||
"SELECT u.id "
|
||||
"FROM ue u "
|
||||
"JOIN `test-prj.dataset.orders` o ON u.x = o.x "
|
||||
"WHERE o.value > 0"
|
||||
)
|
||||
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
|
||||
user_sql, seeded_registry,
|
||||
)
|
||||
# Must wrap — both refs are BQ; the local `orders` registration is
|
||||
# irrelevant to a query that touches only BQ paths.
|
||||
assert did_rewrite is True
|
||||
assert "bigquery_query(" in rewritten
|
||||
# The user's backtick path is preserved verbatim inside the wrapped
|
||||
# inner SQL (Layer 1 split-on-backticks behaviour), so the original
|
||||
# `test-prj.dataset.orders` reference survives.
|
||||
assert "test-prj.dataset.orders" in rewritten
|
||||
|
||||
|
||||
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
||||
"""User SQL referencing only local-source tables → no rewrite,
|
||||
no log spam, original SQL returned."""
|
||||
|
|
|
|||
Loading…
Reference in a new issue