Merge pull request #208 from keboola/zs/issue-201-rewriter-backtick
fix(query): rewriter respects backtick paths; tighten cap-guard fallback (#201)
This commit is contained in:
commit
e3494607bf
5 changed files with 586 additions and 87 deletions
34
CHANGELOG.md
34
CHANGELOG.md
|
|
@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
## [0.42.0] — 2026-05-06
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- `agnes query --remote`: full backtick BigQuery paths in user SQL are no
|
||||||
|
longer corrupted by the registered-name rewriter. Previously a query
|
||||||
|
like ``SELECT … FROM `<project>.<dataset>.<table>` WHERE …`` whose
|
||||||
|
table name happened to be registered as a bare-name alias would have
|
||||||
|
the alias re-substituted *inside* the backtick path, producing
|
||||||
|
malformed SQL that BigQuery rejected with a parse error. The cap-guard
|
||||||
|
then fell back to a filter-less `SELECT *` size estimate (often orders
|
||||||
|
of magnitude larger than the real scan), blocking the query as
|
||||||
|
`remote_scan_too_large`. Issue #201.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- `agnes query --remote`: cap-guard fallback no longer estimates from
|
||||||
|
a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first
|
||||||
|
retries the user's original SQL (handles BQ-native input cleanly), and
|
||||||
|
only when *that* also fails returns a structured `remote_estimate_failed`
|
||||||
|
HTTP 400 with a hint instead of silently over-estimating.
|
||||||
|
- **BREAKING (clients matching error kinds)**: failure to estimate
|
||||||
|
remote-query scan size now returns `kind="remote_estimate_failed"`
|
||||||
|
instead of being masked as `remote_scan_too_large` caused by
|
||||||
|
over-estimation. Operators that grep for the old kind in dashboards
|
||||||
|
should update.
|
||||||
|
|
||||||
|
### Security
|
||||||
|
- `agnes query --remote`: full backtick BigQuery paths are now
|
||||||
|
registry-gated identically to `bq."<dataset>"."<table>"` syntax.
|
||||||
|
Previously, full backtick paths bypassed Agnes RBAC entirely — only
|
||||||
|
the configured service account scope limited what users could query.
|
||||||
|
New `bq_path_cross_project` (when the project ≠ configured data
|
||||||
|
project) and `bq_path_not_registered` (when path is unknown) error
|
||||||
|
kinds, plus `bq_path_access_denied` (path is registered but the caller holds no grant on it). Issue #201.
|
||||||
|
|
||||||
## [0.41.0] — 2026-05-06
|
## [0.41.0] — 2026-05-06
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
||||||
247
app/api/query.py
247
app/api/query.py
|
|
@ -90,6 +90,29 @@ BQ_PATH = re.compile(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Issue #201 — full backtick BQ path `<project>.<dataset>.<table>` in user
|
||||||
|
# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep
|
||||||
|
# bare-name regexes from firing inside backtick-quoted segments.
|
||||||
|
_BACKTICK_SEGMENT = re.compile(r'`[^`]*`')
|
||||||
|
_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`')
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_backticks(sql: str) -> str:
|
||||||
|
"""Replace each `…`-quoted segment with spaces of equal length so
|
||||||
|
word-boundary regexes find positions outside backticks but ignore
|
||||||
|
everything inside. Preserves all character offsets so ``re.search``
|
||||||
|
on the masked string returns matches at the same positions as on the
|
||||||
|
original.
|
||||||
|
|
||||||
|
Issue #201: `\\b` matches inside backtick segments because both `.`
|
||||||
|
and `` ` `` are non-word characters. A registered bare-name like
|
||||||
|
``unit_economics`` would otherwise match inside a user-supplied full
|
||||||
|
backtick path ``\\`<project>.<dataset>.unit_economics\\``` and get
|
||||||
|
falsely rewritten — corrupting the user's intended SQL.
|
||||||
|
"""
|
||||||
|
return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql)
|
||||||
|
|
||||||
|
|
||||||
def _default_remote_query_cap_bytes() -> int:
|
def _default_remote_query_cap_bytes() -> int:
|
||||||
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
"""5 GiB default cap on /api/query BQ-touching scans. Configurable via
|
||||||
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
`data_source.bigquery.bq_max_scan_bytes` in /admin/server-config —
|
||||||
|
|
@ -197,11 +220,18 @@ def execute_query(
|
||||||
if r.get("name") and r.get("id") in allowed_ids
|
if r.get("name") and r.get("id") in allowed_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check if query references any forbidden tables (word-boundary match)
|
# Check if query references any forbidden tables (word-boundary
|
||||||
|
# match). Issue #201: mask backtick segments so `\b` doesn't
|
||||||
|
# falsely fire inside a user-supplied full backtick path like
|
||||||
|
# `<project>.<dataset>.<table>` whose final segment happens to
|
||||||
|
# collide with a forbidden master view name. The full-path
|
||||||
|
# registry-gate downstream is the proper authorization check
|
||||||
|
# for those.
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
forbidden = all_views - allowed_view_names
|
forbidden = all_views - allowed_view_names
|
||||||
for table in forbidden:
|
for table in forbidden:
|
||||||
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
pattern = r'\b' + re.escape(table.lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
|
||||||
|
|
||||||
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
# ---- #160 BQ remote-row guardrail + RBAC patch -------------------
|
||||||
|
|
@ -467,6 +497,11 @@ def _bq_guardrail_inputs(
|
||||||
name_lookups: list = []
|
name_lookups: list = []
|
||||||
seen_paths: set = set()
|
seen_paths: set = set()
|
||||||
accessible_set = set(allowed) if allowed is not None else None
|
accessible_set = set(allowed) if allowed is not None else None
|
||||||
|
# Issue #201: mask backtick segments so a registered bare name like
|
||||||
|
# `unit_economics` doesn't false-positive on a user-supplied full
|
||||||
|
# backtick path `<project>.<dataset>.unit_economics`. The full-path
|
||||||
|
# pass below registry-gates those properly.
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
for r in repo.list_by_source("bigquery"):
|
for r in repo.list_by_source("bigquery"):
|
||||||
if (r.get("query_mode") or "") != "remote":
|
if (r.get("query_mode") or "") != "remote":
|
||||||
continue
|
continue
|
||||||
|
|
@ -481,7 +516,7 @@ def _bq_guardrail_inputs(
|
||||||
# before we get here. Defensive skip.
|
# before we get here. Defensive skip.
|
||||||
continue
|
continue
|
||||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
key = (bucket.lower(), source_table.lower())
|
key = (bucket.lower(), source_table.lower())
|
||||||
if key not in seen_paths:
|
if key not in seen_paths:
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
|
|
@ -529,6 +564,66 @@ def _bq_guardrail_inputs(
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
dry_run.append((bucket, source_table, 0))
|
dry_run.append((bucket, source_table, 0))
|
||||||
|
|
||||||
|
# 3. Full backtick path `<project>.<dataset>.<table>` pass (issue #201).
|
||||||
|
# Pre-#201 these bypassed Agnes RBAC entirely — only the configured
|
||||||
|
# service account scope limited which tables a user could reach. Gate
|
||||||
|
# them identically to the `bq.<ds>.<tbl>` pass: must match the
|
||||||
|
# configured data project, must point at a registered row, and the
|
||||||
|
# caller must hold a grant on that row's id (admin bypasses the grant
|
||||||
|
# check but still requires registration + project match).
|
||||||
|
#
|
||||||
|
# Lazy `get_bq_access()` import via the module-level alias so tests
|
||||||
|
# can monkeypatch a fake. When BQ isn't configured (no data project),
|
||||||
|
# fall through silently — full backtick paths can't possibly resolve
|
||||||
|
# against this instance, so leave them to BQ to reject if a query
|
||||||
|
# somehow makes it through.
|
||||||
|
try:
|
||||||
|
bq = get_bq_access()
|
||||||
|
data_project = (bq.projects.data or "").strip()
|
||||||
|
except Exception:
|
||||||
|
data_project = ""
|
||||||
|
|
||||||
|
if data_project:
|
||||||
|
for m in _BACKTICK_FULL_PATH.finditer(sql):
|
||||||
|
proj, ds, tbl = m.group(1), m.group(2), m.group(3)
|
||||||
|
if proj.lower() != data_project.lower():
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_cross_project",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"expected_project": data_project,
|
||||||
|
"hint": (
|
||||||
|
"--remote queries can only reference tables in the "
|
||||||
|
"configured BigQuery data project. Register "
|
||||||
|
"cross-project tables via `agnes admin "
|
||||||
|
"register-table` if needed."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
row = repo.find_by_bq_path(ds, tbl)
|
||||||
|
if row is None:
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_not_registered",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"hint": (
|
||||||
|
"Direct BigQuery paths must point to a registered "
|
||||||
|
"table. Register via `agnes admin register-table` "
|
||||||
|
"or use the registered name from `agnes catalog`."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
if not is_admin:
|
||||||
|
if accessible_set is None or row["id"] not in accessible_set:
|
||||||
|
return [], [], {
|
||||||
|
"reason": "bq_path_access_denied",
|
||||||
|
"path": f"`{proj}.{ds}.{tbl}`",
|
||||||
|
"registered_as": row["name"],
|
||||||
|
}
|
||||||
|
bucket = row["bucket"]
|
||||||
|
source_table = row["source_table"]
|
||||||
|
if bucket and source_table:
|
||||||
|
key = (bucket.lower(), source_table.lower())
|
||||||
|
if key not in seen_paths:
|
||||||
|
seen_paths.add(key)
|
||||||
|
dry_run.append((bucket, source_table, 0))
|
||||||
|
|
||||||
return dry_run, name_lookups, None
|
return dry_run, name_lookups, None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native(
|
||||||
# name up in a case-insensitive dict. Single-pass means freshly
|
# name up in a case-insensitive dict. Single-pass means freshly
|
||||||
# inserted backticked text isn't re-scanned, fixing the
|
# inserted backticked text isn't re-scanned, fixing the
|
||||||
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
# project-ID-contains-name corruption (Devin Review on query.py:464).
|
||||||
|
#
|
||||||
|
# Issue #201: split the SQL on `…` segments and rewrite ONLY in the
|
||||||
|
# outside-backtick chunks. Without this, a user-supplied full backtick
|
||||||
|
# path like ``\\`<project>.<dataset>.unit_economics\\``` whose final
|
||||||
|
# segment matches a registered bare name would have the bare-name
|
||||||
|
# regex fire INSIDE the backticks (since `\b` treats both `.` and
|
||||||
|
# `` ` `` as non-word characters), producing malformed nested
|
||||||
|
# backticks. Splitting confines the rewrite to user identifier
|
||||||
|
# positions where bare-name resolution is the intended behaviour.
|
||||||
if name_lookups:
|
if name_lookups:
|
||||||
# Map name (lower-cased) → backticked target. Names are
|
# Map name (lower-cased) → backticked target. Names are
|
||||||
# case-insensitive on the input side per the existing helper
|
# case-insensitive on the input side per the existing helper
|
||||||
|
|
@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native(
|
||||||
def _name_repl(m: re.Match) -> str:
|
def _name_repl(m: re.Match) -> str:
|
||||||
return name_to_target[m.group(1).lower()]
|
return name_to_target[m.group(1).lower()]
|
||||||
|
|
||||||
out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE)
|
# `re.split` with a captured group returns: [outside, backtick,
|
||||||
|
# outside, backtick, …]. Even indices are outside-backtick chunks
|
||||||
|
# eligible for bare-name rewrite; odd indices are full backtick
|
||||||
|
# segments preserved verbatim.
|
||||||
|
parts = re.split(r'(`[^`]*`)', out)
|
||||||
|
for i, part in enumerate(parts):
|
||||||
|
if i % 2 == 0:
|
||||||
|
parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE)
|
||||||
|
out = "".join(parts)
|
||||||
|
|
||||||
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
# Pass 2: bq."ds"."tbl" / bq.ds.tbl → `<project>.<ds>.<tbl>`.
|
||||||
def _bq_path_repl(m: re.Match) -> str:
|
def _bq_path_repl(m: re.Match) -> str:
|
||||||
|
|
@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
|
|
||||||
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
||||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
|
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
|
||||||
|
# bare-name regex must run against a backtick-masked copy so a
|
||||||
|
# registered name like ``orders`` doesn't false-positive when it
|
||||||
|
# appears as the table segment of a user-supplied full backtick path
|
||||||
|
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
|
||||||
|
# cross-source check below would falsely conclude the SQL touches
|
||||||
|
# both BQ-remote and local sources, dropping every backtick-path
|
||||||
|
# query into the 50-100× slower ATTACH-catalog fallback. Devin
|
||||||
|
# Review on PR #208.
|
||||||
sql_lower = user_sql.lower()
|
sql_lower = user_sql.lower()
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
name_lookups: list = []
|
name_lookups: list = []
|
||||||
seen_paths: set = set()
|
seen_paths: set = set()
|
||||||
|
|
||||||
|
|
@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
# mix rewritten and non-rewritten BQ paths in one query.
|
# mix rewritten and non-rewritten BQ paths in one query.
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
key = (bucket.lower(), source_table.lower())
|
key = (bucket.lower(), source_table.lower())
|
||||||
if key not in seen_paths:
|
if key not in seen_paths:
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
|
|
@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
# Same name registered both BQ-remote and local? Pathological;
|
# Same name registered both BQ-remote and local? Pathological;
|
||||||
# skip as a safety measure.
|
# skip as a safety measure.
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
|
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
|
||||||
logger.info(
|
logger.info(
|
||||||
"rewrite_skip_cross_source: user SQL references both "
|
"rewrite_skip_cross_source: user SQL references both "
|
||||||
"BQ-remote and local-mode tables; falling back to "
|
"BQ-remote and local-mode tables; falling back to "
|
||||||
|
|
@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard(
|
||||||
partitioned/clustered tables and rejecting narrow queries that BQ
|
partitioned/clustered tables and rejecting narrow queries that BQ
|
||||||
itself would dry-run as a few MB.
|
itself would dry-run as a few MB.
|
||||||
|
|
||||||
Fallback: if BQ rejects the rewritten SQL with a parse-level
|
Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level
|
||||||
``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that
|
``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or
|
||||||
don't translate to BQ), fall back to the pre-#171 per-table
|
a rewriter bug that broke valid BQ-native input), retry with the
|
||||||
SELECT * approach so the cap-guard still functions — over-estimate
|
user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the
|
||||||
is preferred over fail-open. Forbidden / upstream errors still
|
original ALSO fails, return a structured `remote_estimate_failed`
|
||||||
propagate as HTTP 502.
|
HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table
|
||||||
|
over-estimate. The synthetic fallback threw away user filters and
|
||||||
|
routinely ballooned to "full table size", blocking legitimate narrow
|
||||||
|
queries via `remote_scan_too_large`. Forbidden / upstream errors
|
||||||
|
still propagate as HTTP 502.
|
||||||
|
|
||||||
Flow:
|
Flow:
|
||||||
1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
|
1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work.
|
||||||
2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
|
2. `quota.acquire(user_id)` opened — concurrent-slot held throughout.
|
||||||
3. Single dry-run of rewritten user SQL → `total_bytes`.
|
3. Single dry-run of rewritten user SQL → `total_bytes`.
|
||||||
On parse error, fall back to per-table SELECT * → sum.
|
On parse error, retry with the user's original SQL.
|
||||||
|
On second parse error, raise 400 `remote_estimate_failed`.
|
||||||
4. If total > cap → 400 `remote_scan_too_large`.
|
4. If total > cap → 400 `remote_scan_too_large`.
|
||||||
5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
|
5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`.
|
||||||
6. On exit, slot released.
|
6. On exit, slot released.
|
||||||
|
|
||||||
Mutates `dry_run_set` in place: the third tuple element (bytes) is
|
Mutates `dry_run_set` in place: the third tuple element (bytes) is
|
||||||
populated so the caller can sum and record bytes against the user's
|
populated so the caller can sum and record bytes against the user's
|
||||||
quota post-flight. Single-dry-run path puts `total_bytes` on the first
|
quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest
|
||||||
entry and zero on the rest (BQ doesn't expose per-table bytes for a
|
— BQ doesn't expose per-table bytes for a composite query — so
|
||||||
composite query); the caller's `sum(b for _, _, b in dry_run_set)`
|
`sum(b for _, _, b in dry_run_set)` still equals `total_bytes`.
|
||||||
still equals `total_bytes`.
|
|
||||||
"""
|
"""
|
||||||
quota = _build_quota_tracker()
|
quota = _build_quota_tracker()
|
||||||
try:
|
try:
|
||||||
|
|
@ -901,55 +1026,65 @@ def _bq_quota_and_cap_guard(
|
||||||
sql, name_lookups, project,
|
sql, name_lookups, project,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Try the single-dry-run path first (issue #171). Falls back
|
# Try the single-dry-run path first (issue #171). On BQ parse
|
||||||
# to the per-table SELECT * approach only on BQ parse errors
|
# errors (`bq_bad_request` — typically DuckDB-only syntax the
|
||||||
# (kind="bq_bad_request" — DuckDB-only syntax that BQ can't
|
# rewriter couldn't translate, OR — pre-#201 fix — a
|
||||||
# translate). All other BQ errors propagate as 502 below.
|
# rewriter-corrupted backtick path) retry the user's ORIGINAL
|
||||||
|
# SQL: when the user submitted BQ-native SQL, the rewriter is
|
||||||
|
# the only thing standing between them and a clean dry-run.
|
||||||
|
# If the original ALSO fails, this is true DuckDB-only syntax
|
||||||
|
# that BQ cannot estimate — fail fast with a structured
|
||||||
|
# `remote_estimate_failed` instead of the pre-#201 synthetic
|
||||||
|
# `SELECT *` over-estimate (which threw away user filters and
|
||||||
|
# often ballooned to "full table size", blocking legitimate
|
||||||
|
# narrow queries via `remote_scan_too_large`).
|
||||||
|
#
|
||||||
|
# All other BQ errors (forbidden, upstream) propagate as 502.
|
||||||
total_bytes = 0
|
total_bytes = 0
|
||||||
used_fallback = False
|
|
||||||
try:
|
try:
|
||||||
total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
|
total_bytes = _bq_dry_run_bytes(bq, rewritten_sql)
|
||||||
except BqAccessError as exc:
|
except BqAccessError as exc:
|
||||||
if exc.kind == "bq_bad_request":
|
if exc.kind != "bq_bad_request":
|
||||||
|
raise HTTPException(status_code=502, detail={
|
||||||
|
"kind": exc.kind,
|
||||||
|
"message": exc.message,
|
||||||
|
**(exc.details or {}),
|
||||||
|
})
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"BQ dry-run rejected the rewritten SQL "
|
"BQ dry-run rejected the rewritten SQL "
|
||||||
"(kind=%s, message=%s). Falling back to per-table "
|
"(kind=%s, message=%s). Retrying with the user's "
|
||||||
"SELECT * estimate; the cap check will over-estimate "
|
"original SQL.",
|
||||||
"scan bytes for this query. Consider rewriting to "
|
|
||||||
"BQ-native syntax for a tight pre-check.",
|
|
||||||
exc.kind, exc.message,
|
exc.kind, exc.message,
|
||||||
)
|
)
|
||||||
used_fallback = True
|
try:
|
||||||
else:
|
total_bytes = _bq_dry_run_bytes(bq, sql)
|
||||||
|
except BqAccessError as exc2:
|
||||||
|
if exc2.kind != "bq_bad_request":
|
||||||
raise HTTPException(status_code=502, detail={
|
raise HTTPException(status_code=502, detail={
|
||||||
"kind": exc.kind,
|
"kind": exc2.kind,
|
||||||
"message": exc.message,
|
"message": exc2.message,
|
||||||
**(exc.details or {}),
|
**(exc2.details or {}),
|
||||||
|
})
|
||||||
|
raise HTTPException(status_code=400, detail={
|
||||||
|
"kind": "remote_estimate_failed",
|
||||||
|
"message": (
|
||||||
|
"Could not estimate scan size for this query."
|
||||||
|
),
|
||||||
|
"hint": (
|
||||||
|
"Use a registered table name from `agnes "
|
||||||
|
"catalog`, or write BQ-native SQL with full "
|
||||||
|
"backtick paths. Pure DuckDB-only syntax is "
|
||||||
|
"not supported for --remote queries."
|
||||||
|
),
|
||||||
|
"underlying": exc2.message,
|
||||||
})
|
})
|
||||||
|
|
||||||
if used_fallback:
|
# Distribute the total to dry_run_set so the caller's
|
||||||
# Pre-#171 path: estimate per registered table from a
|
# `record_bytes(sum(...))` stays correct. Per-table breakdown
|
||||||
# synthetic SELECT *. Over-estimates partitioned scans but
|
# is unavailable from a composite dry-run; pin total to entry
|
||||||
# never under-estimates, so the cap still bounds risk.
|
# 0, zero the rest. (Same accounting symmetry whether the
|
||||||
for i, (bucket, source_table, _) in enumerate(dry_run_set):
|
# bytes came from the rewritten SQL or the original-SQL
|
||||||
fallback_sql = (
|
# retry.)
|
||||||
f"SELECT * FROM `{project}.{bucket}.{source_table}`"
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
est = _bq_dry_run_bytes(bq, fallback_sql)
|
|
||||||
except BqAccessError as exc:
|
|
||||||
raise HTTPException(status_code=502, detail={
|
|
||||||
"kind": exc.kind,
|
|
||||||
"message": exc.message,
|
|
||||||
**(exc.details or {}),
|
|
||||||
})
|
|
||||||
dry_run_set[i] = (bucket, source_table, est)
|
|
||||||
total_bytes += est
|
|
||||||
else:
|
|
||||||
# Single-dry-run path. Distribute the total to dry_run_set
|
|
||||||
# so the caller's `record_bytes(sum(...))` stays correct.
|
|
||||||
# Per-table breakdown is unavailable from a composite
|
|
||||||
# dry-run; pin total to entry 0, zero the rest.
|
|
||||||
if dry_run_set:
|
if dry_run_set:
|
||||||
b0, t0, _ = dry_run_set[0]
|
b0, t0, _ = dry_run_set[0]
|
||||||
dry_run_set[0] = (b0, t0, total_bytes)
|
dry_run_set[0] = (b0, t0, total_bytes)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
[project]
|
[project]
|
||||||
name = "agnes-the-ai-analyst"
|
name = "agnes-the-ai-analyst"
|
||||||
version = "0.41.0"
|
version = "0.42.0"
|
||||||
description = "Agnes — AI Data Analyst platform for AI analytical systems"
|
description = "Agnes — AI Data Analyst platform for AI analytical systems"
|
||||||
requires-python = ">=3.11,<3.14"
|
requires-python = ">=3.11,<3.14"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|
|
||||||
|
|
@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request(
|
||||||
assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
|
assert "`test-data-prj.marketing.traffic`" in state["last_sql"]
|
||||||
|
|
||||||
|
|
||||||
def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
def test_fallback_tries_original_sql_first(
|
||||||
seeded_app, mock_dry_run, monkeypatch,
|
seeded_app, mock_dry_run, monkeypatch,
|
||||||
):
|
):
|
||||||
"""When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only
|
"""Issue #201 — when the rewriter produces SQL that BQ rejects with
|
||||||
syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …),
|
`bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the
|
||||||
the cap-guard falls back to the pre-#171 per-table SELECT * approach
|
cap-guard uses the original SQL's byte estimate. No more synthetic
|
||||||
so a non-portable query still gets a (loose) cap estimate instead of
|
`SELECT *` over-estimate.
|
||||||
fail-opening.
|
|
||||||
"""
|
Bare-name reference populates `dry_run_set` so the cap-guard
|
||||||
|
actually fires. Mock returns parse-error on the first call
|
||||||
|
(rewritten SQL) and small bytes on the second (original)."""
|
||||||
from connectors.bigquery.access import BqAccessError
|
from connectors.bigquery.access import BqAccessError
|
||||||
|
|
||||||
_register_bq_remote_row("ue", "finance", "ue")
|
_register_bq_remote_row("ue", "finance", "ue")
|
||||||
|
|
@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
||||||
|
|
||||||
def fake_dry_run(_bq, sql):
|
def fake_dry_run(_bq, sql):
|
||||||
state["calls"].append(sql)
|
state["calls"].append(sql)
|
||||||
# First call (rewritten user SQL) → BQ parse error.
|
# First call (rewritten SQL) → BQ parse error.
|
||||||
if len(state["calls"]) == 1:
|
if len(state["calls"]) == 1:
|
||||||
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
|
raise BqAccessError("bq_bad_request", "Syntax error: simulated")
|
||||||
# Second call (fallback per-table SELECT *) → small bytes, pass cap.
|
# Second call (the user's original SQL) → small, passes cap.
|
||||||
return 4096
|
return 4096
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|
@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error(
|
||||||
|
|
||||||
c = seeded_app["client"]
|
c = seeded_app["client"]
|
||||||
token = seeded_app["admin_token"]
|
token = seeded_app["admin_token"]
|
||||||
# SQL with DuckDB-only `::INT` cast that BQ would reject.
|
user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'"
|
||||||
|
r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
|
||||||
|
|
||||||
|
# Two dry-runs: rewritten then original. No third synthetic-SELECT-*
|
||||||
|
# call.
|
||||||
|
assert len(state["calls"]) == 2, (
|
||||||
|
f"expected rewritten + original-SQL retry, got "
|
||||||
|
f"{len(state['calls'])}: {state['calls']}"
|
||||||
|
)
|
||||||
|
assert state["calls"][1] == user_sql, (
|
||||||
|
f"second call must be the user's ORIGINAL SQL, got "
|
||||||
|
f"{state['calls'][1]!r}"
|
||||||
|
)
|
||||||
|
# The response must NOT be remote_scan_too_large from a synthetic
|
||||||
|
# over-estimate — 4096 bytes is well under the 5 GiB cap.
|
||||||
|
if r.status_code == 400:
|
||||||
|
detail = r.json().get("detail", {})
|
||||||
|
if isinstance(detail, dict):
|
||||||
|
assert detail.get("reason") != "remote_scan_too_large", detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_fallback_fails_fast_on_pure_duckdb_syntax(
|
||||||
|
seeded_app, mock_dry_run, monkeypatch,
|
||||||
|
):
|
||||||
|
"""When BOTH the rewritten and original SQL fail with `bq_bad_request`
|
||||||
|
(true DuckDB-only syntax like `::INT`), return HTTP 400
|
||||||
|
`remote_estimate_failed` — never silently over-estimate via a
|
||||||
|
synthetic `SELECT *`."""
|
||||||
|
from connectors.bigquery.access import BqAccessError
|
||||||
|
|
||||||
|
_register_bq_remote_row("ue", "finance", "ue")
|
||||||
|
|
||||||
|
state = {"calls": []}
|
||||||
|
|
||||||
|
def always_parse_error(_bq, sql):
|
||||||
|
state["calls"].append(sql)
|
||||||
|
raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'")
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.api.query._bq_dry_run_bytes", always_parse_error, raising=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
r = c.post(
|
r = c.post(
|
||||||
"/api/query",
|
"/api/query",
|
||||||
json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
|
json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"},
|
||||||
headers=_auth(token),
|
headers=_auth(token),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Two dry-runs (rewritten + fallback per-table) before the (failed)
|
# Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback.
|
||||||
# execute. Status will be a downstream error from analytics.execute()
|
|
||||||
# since `::INT` doesn't work in DuckDB either against a remote view —
|
|
||||||
# but the GUARDRAIL must have completed without 5xx-ing.
|
|
||||||
assert len(state["calls"]) == 2, (
|
assert len(state["calls"]) == 2, (
|
||||||
f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: "
|
f"expected 1 rewritten + 1 original-retry, got "
|
||||||
f"{state['calls']}"
|
f"{len(state['calls'])}: {state['calls']}"
|
||||||
)
|
)
|
||||||
assert "::" in state["calls"][0], "first call should be the rewritten user SQL"
|
# No call should be a synthetic ``SELECT * FROM `<project>...```. The
|
||||||
assert state["calls"][1].startswith("SELECT * FROM"), (
|
# original-SQL retry contains the user's SELECT clause.
|
||||||
"second call should be the per-table fallback"
|
for c_sql in state["calls"]:
|
||||||
|
# If a call is just a synthetic ``SELECT * FROM `<project>.<bucket>.<table>```
|
||||||
|
# the user's `WHERE country = 'CZ'` would be missing.
|
||||||
|
if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql:
|
||||||
|
raise AssertionError(
|
||||||
|
f"synthetic SELECT * fallback was used: {c_sql!r}"
|
||||||
)
|
)
|
||||||
# Whatever HTTP status comes back must NOT be 502 from the guard's
|
|
||||||
# transport-error path — fallback must absorb the bq_bad_request.
|
assert r.status_code == 400, r.json()
|
||||||
assert r.status_code != 502, r.json()
|
detail = r.json().get("detail", {})
|
||||||
|
assert isinstance(detail, dict), detail
|
||||||
|
assert detail.get("kind") == "remote_estimate_failed", detail
|
||||||
|
assert "underlying" in detail, detail
|
||||||
|
assert "agnes catalog" in detail.get("hint", "").lower() or \
|
||||||
|
"backtick" in detail.get("hint", "").lower(), detail
|
||||||
|
|
||||||
|
|
||||||
def test_guardrail_propagates_502_on_non_parse_bq_errors(
|
def test_guardrail_propagates_502_on_non_parse_bq_errors(
|
||||||
|
|
@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names():
|
||||||
)
|
)
|
||||||
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \
|
||||||
rewritten.lower().count("`p.fin.ue`") == 2
|
rewritten.lower().count("`p.fin.ue`") == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Issue #201: rewriter must NOT touch text inside `…` backtick segments.
|
||||||
|
# A user-supplied full BQ-native path `<project>.<dataset>.<table>` whose
|
||||||
|
# table segment matches a registered bare name was being re-substituted
|
||||||
|
# inside the backticks, producing malformed nested-backtick SQL that BQ
|
||||||
|
# rejected with a parse error.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_rewrite_skips_inside_backtick_path():
    """A fully-qualified backtick path must leave the dry-run rewriter
    unchanged, even though its last segment is also passed in as a
    registered bare name (issue #201)."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run

    registered = [("unit_economics", "finance", "unit_economics")]
    sql = (
        "SELECT * FROM `my-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )

    rewritten = _rewrite_user_sql_for_bq_dry_run(
        sql=sql, name_lookups=registered, project="my-prj",
    )

    # The input is already BQ-native, so the output must equal the input.
    failure_msg = (
        f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}"
    )
    assert rewritten == sql, failure_msg
    # Extra guard: the nested-backtick corruption signature must be absent.
    nested_signature = "`my-prj.finance.`my-prj"
    assert nested_signature not in rewritten
|
||||||
|
|
||||||
|
|
||||||
|
def test_rewrite_skips_inside_backtick_with_outside_bare_name():
    """One query uses the same name twice: once bare (outside backticks)
    and once as the last segment of a backtick path. Only the bare
    occurrence may be substituted; the backtick path stays verbatim."""
    from app.api.query import _rewrite_user_sql_for_bq_dry_run

    lookups = [("ue", "fin_alias", "ue_alias")]
    mixed_sql = (
        "SELECT a.id, b.col FROM ue a "
        "JOIN `my-prj.finance.ue` b ON a.id = b.id"
    )

    result = _rewrite_user_sql_for_bq_dry_run(
        sql=mixed_sql, name_lookups=lookups, project="my-prj",
    )

    # The bare `ue` resolves to the path built from the lookup entry.
    alias_path = "`my-prj.fin_alias.ue_alias`"
    assert alias_path in result
    # The path the user typed in backticks is still present as written.
    user_path = "`my-prj.finance.ue`"
    assert user_path in result
    # The nested-backtick corruption must not have been produced.
    corrupted = "`my-prj.finance.`my-prj.fin_alias.ue_alias`"
    assert corrupted not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_guardrail_skips_bare_name_match_inside_backticks(
    seeded_app, mock_dry_run, monkeypatch,
):
    """A registered name that occurs only inside a backtick segment must
    not be substituted by the guardrail.

    The dry-run helper is replaced with a recorder so the SQL the
    guardrail forwards can be inspected. If the recorder is never called
    the test passes without further checks; otherwise the user's backtick
    path must appear verbatim and the nested-backtick signature must not.
    """
    _register_bq_remote_row("unit_economics", "finance", "unit_economics")

    forwarded = []

    def capturing_fake(_bq, sql):
        # Record every SQL string handed to the dry-run; report a tiny scan.
        forwarded.append(sql)
        return 1024

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
    )

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    user_sql = (
        "SELECT * FROM `test-data-prj.finance.unit_economics` "
        "WHERE country = 'CZ'"
    )
    client.post(
        "/api/query", json={"sql": user_sql}, headers=_auth(admin_token),
    )

    # Only the most recent dry-run SQL matters (None when it never ran).
    sent = forwarded[-1] if forwarded else None
    if sent is not None:
        # The exact backtick path must survive — no nested backticks
        # introduced by a stray bare-name rewrite.
        assert "`test-data-prj.finance.unit_economics`" in sent, (
            f"backtick path corrupted by guardrail:\n out: {sent!r}"
        )
        assert "`test-data-prj.finance.`test-data-prj" not in sent, (
            f"nested-backtick corruption signature present: {sent!r}"
        )
    # else: no dry-run happened, which this test also accepts — it only
    # needs to show that the bare-name substitution did not fire.
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated.
|
||||||
|
# Pre-fix these bypassed Agnes RBAC entirely — only the configured service
|
||||||
|
# account scope limited which tables a user could reach. Post-fix, they're
|
||||||
|
# treated identically to `bq."<dataset>"."<table>"` syntax.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run):
    """Querying a backtick path whose dataset/table was never registered
    (project `test-data-prj`) must be answered with HTTP 403 and reason
    `bq_path_not_registered`, with the offending path echoed back."""
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    payload = {
        "sql": (
            "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` "
            "WHERE country = 'CZ'"
        ),
    }

    resp = client.post("/api/query", json=payload, headers=_auth(admin_token))

    assert resp.status_code == 403, resp.json()
    detail = resp.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_not_registered", detail
    # Both the dataset and the table segment must show up in the path.
    reported_path = detail.get("path", "")
    assert "secret_ds" in reported_path, detail
    assert "secret_tbl" in reported_path, detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run):
    """A backtick path naming a project other than `test-data-prj` must be
    rejected with HTTP 403 / `bq_path_cross_project`, even though the
    same dataset and table are registered before the request."""
    _register_bq_remote_row("ue", "finance", "ue")
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    payload = {
        "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1",
    }

    resp = client.post("/api/query", json=payload, headers=_auth(admin_token))

    assert resp.status_code == 403, resp.json()
    detail = resp.json().get("detail", {})
    assert isinstance(detail, dict), detail
    assert detail.get("reason") == "bq_path_cross_project", detail
    # The response names the expected project and echoes the foreign path.
    assert detail.get("expected_project") == "test-data-prj", detail
    reported_path = detail.get("path", "")
    assert "other-project" in reported_path, detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_registered_admin_passes(
    seeded_app, mock_dry_run, monkeypatch,
):
    """With an admin token, a registered path and the matching project, the
    request must not be rejected for any `bq_path_*` reason, and the
    dry-run must receive SQL that still contains the user's backtick
    path and WHERE clause."""
    _register_bq_remote_row("ue", "finance", "ue")

    dry_run_calls = []

    def capturing_fake(_bq, sql):
        # Record the forwarded SQL and report a scan small enough to pass.
        dry_run_calls.append(sql)
        return 1024

    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False,
    )

    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    payload = {
        "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1",
    }
    resp = client.post("/api/query", json=payload, headers=_auth(admin_token))

    # A 403 is tolerated only if it is not one of the issue-#201 reasons.
    path_gate_reasons = (
        "bq_path_not_registered",
        "bq_path_access_denied",
        "bq_path_cross_project",
    )
    detail = resp.json().get("detail", {}) if resp.status_code == 403 else None
    if isinstance(detail, dict):
        assert detail.get("reason") not in path_gate_reasons, (
            f"admin + registered path should pass RBAC: {detail}"
        )

    # The dry-run must have been invoked, and the SQL it last received
    # must still carry the user's backtick path and filter.
    dry_run_sql = dry_run_calls[-1] if dry_run_calls else None
    assert dry_run_sql is not None, (
        "dry-run never fired — Pass 3 may not have registered the path"
    )
    assert "`test-data-prj.finance.ue`" in dry_run_sql, dry_run_sql
    assert "WHERE id = 1" in dry_run_sql, dry_run_sql
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_backtick_path_inside_string_literal_not_gated(
    seeded_app, mock_dry_run,
):
    """Defensive case: the only backtick triple in the query sits inside a
    single-quoted SQL string literal, and nothing is registered.

    The test deliberately accepts more than one outcome. It only pins
    that, when the response is a 403 with a dict `detail`, the reason is
    one of the backtick-path gate reasons (`bq_path_not_registered` or
    `bq_path_cross_project`). Any other status is not checked here.
    """
    client = seeded_app["client"]
    admin_token = seeded_app["admin_token"]
    payload = {
        "sql": (
            "SELECT 'matches `test-data-prj.x.y`' AS lit"
        ),
    }

    resp = client.post("/api/query", json=payload, headers=_auth(admin_token))

    # Non-403 responses are accepted as-is; nothing further to verify.
    if resp.status_code != 403:
        return
    detail = resp.json().get("detail", {})
    if not isinstance(detail, dict):
        return
    # Gating a literal is the conservative outcome; it must be attributed
    # to the backtick-path gate and nothing else.
    expected_reasons = (
        "bq_path_not_registered",
        "bq_path_cross_project",
    )
    assert detail.get("reason") in expected_reasons, detail
|
||||||
|
|
|
||||||
|
|
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
|
||||||
assert rewritten == user_sql # untouched
|
assert rewritten == user_sql # untouched
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
    seeded_registry, monkeypatch,
):
    """Follow-up to issue #201 (review feedback on PR #208).

    `orders` is registered as a local table and also appears as the last
    segment of a user-written backtick BigQuery path. That occurrence
    must not make the query count as cross-source: the rewrite is
    expected to happen, the result must contain a `bigquery_query(`
    wrapper, and the user's path must still be present in the output.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query

    _register_bq_remote(
        seeded_registry,
        table_id="bq.fin.ue",
        name="ue",
        bucket="fin",
        source_table="ue",
    )
    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")

    user_sql = (
        "SELECT u.id "
        "FROM ue u "
        "JOIN `test-prj.dataset.orders` o ON u.x = o.x "
        "WHERE o.value > 0"
    )

    outcome = _rewrite_user_sql_for_bigquery_query(user_sql, seeded_registry)
    wrapped_sql, was_wrapped = outcome

    # The local `orders` registration must not block the wrap.
    assert was_wrapped is True
    assert "bigquery_query(" in wrapped_sql
    # The backtick path the user typed is still present after wrapping.
    assert "test-prj.dataset.orders" in wrapped_sql
|
||||||
|
|
||||||
|
|
||||||
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
||||||
"""User SQL referencing only local-source tables → no rewrite,
|
"""User SQL referencing only local-source tables → no rewrite,
|
||||||
no log spam, original SQL returned."""
|
no log spam, original SQL returned."""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue