diff --git a/CHANGELOG.md b/CHANGELOG.md index adc8edd..678f39c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,40 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.42.0] — 2026-05-06 + +### Fixed +- `agnes query --remote`: full backtick BigQuery paths in user SQL are no + longer corrupted by the registered-name rewriter. Previously a query + like ``SELECT … FROM `..` WHERE …`` whose + table name happened to be registered as a bare-name alias would have + the alias re-substituted *inside* the backtick path, producing + malformed SQL that BigQuery rejected with a parse error. The cap-guard + then fell back to a filter-less `SELECT *` size estimate (often orders + of magnitude larger than the real scan), blocking the query as + `remote_scan_too_large`. Issue #201. + +### Changed +- `agnes query --remote`: cap-guard fallback no longer estimates from + a synthetic `SELECT *` when the rewritten SQL fails dry-run. It first + retries the user's original SQL (handles BQ-native input cleanly), and + only when *that* also fails returns a structured `remote_estimate_failed` + HTTP 400 with a hint instead of silently over-estimating. +- **BREAKING (clients matching error kinds)**: failure to estimate + remote-query scan size now returns `kind="remote_estimate_failed"` + instead of being masked as `remote_scan_too_large` caused by + over-estimation. Operators that grep for the old kind in dashboards + should update. + +### Security +- `agnes query --remote`: full backtick BigQuery paths are now + registry-gated identically to `bq.""."
"` syntax. + Previously, full backtick paths bypassed Agnes RBAC entirely — only + the configured service account scope limited what users could query. + New `bq_path_cross_project` (when the project ≠ configured data + project) and `bq_path_not_registered` (when path is unknown) error + kinds. Issue #201. + ## [0.41.0] — 2026-05-06 ### Fixed diff --git a/app/api/query.py b/app/api/query.py index 6dbbadd..b9b8b9b 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -90,6 +90,29 @@ BQ_PATH = re.compile( ) +# Issue #201 — full backtick BQ path `..
` in user +# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep +# bare-name regexes from firing inside backtick-quoted segments. +_BACKTICK_SEGMENT = re.compile(r'`[^`]*`') +_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`') + + +def _mask_backticks(sql: str) -> str: + """Replace each `…`-quoted segment with spaces of equal length so + word-boundary regexes find positions outside backticks but ignore + everything inside. Preserves all character offsets so ``re.search`` + on the masked string returns matches at the same positions as on the + original. + + Issue #201: `\\b` matches inside backtick segments because both `.` + and `` ` `` are non-word characters. A registered bare-name like + ``unit_economics`` would otherwise match inside a user-supplied full + backtick path ``\\`..unit_economics\\``` and get + falsely rewritten — corrupting the user's intended SQL. + """ + return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql) + + def _default_remote_query_cap_bytes() -> int: """5 GiB default cap on /api/query BQ-touching scans. Configurable via `data_source.bigquery.bq_max_scan_bytes` in /admin/server-config — @@ -197,11 +220,18 @@ def execute_query( if r.get("name") and r.get("id") in allowed_ids } - # Check if query references any forbidden tables (word-boundary match) + # Check if query references any forbidden tables (word-boundary + # match). Issue #201: mask backtick segments so `\b` doesn't + # falsely fire inside a user-supplied full backtick path like + # `..
` whose final segment happens to + # collide with a forbidden master view name. The full-path + # registry-gate downstream is the proper authorization check + # for those. + sql_lower_masked = _mask_backticks(sql_lower) forbidden = all_views - allowed_view_names for table in forbidden: pattern = r'\b' + re.escape(table.lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'") # ---- #160 BQ remote-row guardrail + RBAC patch ------------------- @@ -467,6 +497,11 @@ def _bq_guardrail_inputs( name_lookups: list = [] seen_paths: set = set() accessible_set = set(allowed) if allowed is not None else None + # Issue #201: mask backtick segments so a registered bare name like + # `unit_economics` doesn't false-positive on a user-supplied full + # backtick path `..unit_economics`. The full-path + # pass below registry-gates those properly. + sql_lower_masked = _mask_backticks(sql_lower) for r in repo.list_by_source("bigquery"): if (r.get("query_mode") or "") != "remote": continue @@ -481,7 +516,7 @@ def _bq_guardrail_inputs( # before we get here. Defensive skip. continue pattern = r'\b' + re.escape(str(name).lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): key = (bucket.lower(), source_table.lower()) if key not in seen_paths: seen_paths.add(key) @@ -529,6 +564,66 @@ def _bq_guardrail_inputs( seen_paths.add(key) dry_run.append((bucket, source_table, 0)) + # 3. Full backtick path `..
` pass (issue #201). + # Pre-#201 these bypassed Agnes RBAC entirely — only the configured + # service account scope limited which tables a user could reach. Gate + # them identically to the `bq..` pass: must match the + # configured data project, must point at a registered row, and the + # caller must hold a grant on that row's id (admin bypasses the grant + # check but still requires registration + project match). + # + # Lazy `get_bq_access()` import via the module-level alias so tests + # can monkeypatch a fake. When BQ isn't configured (no data project), + # fall through silently — full backtick paths can't possibly resolve + # against this instance, so leave them to BQ to reject if a query + # somehow makes it through. + try: + bq = get_bq_access() + data_project = (bq.projects.data or "").strip() + except Exception: + data_project = "" + + if data_project: + for m in _BACKTICK_FULL_PATH.finditer(sql): + proj, ds, tbl = m.group(1), m.group(2), m.group(3) + if proj.lower() != data_project.lower(): + return [], [], { + "reason": "bq_path_cross_project", + "path": f"`{proj}.{ds}.{tbl}`", + "expected_project": data_project, + "hint": ( + "--remote queries can only reference tables in the " + "configured BigQuery data project. Register " + "cross-project tables via `agnes admin " + "register-table` if needed." + ), + } + row = repo.find_by_bq_path(ds, tbl) + if row is None: + return [], [], { + "reason": "bq_path_not_registered", + "path": f"`{proj}.{ds}.{tbl}`", + "hint": ( + "Direct BigQuery paths must point to a registered " + "table. Register via `agnes admin register-table` " + "or use the registered name from `agnes catalog`." + ), + } + if not is_admin: + if accessible_set is None or row["id"] not in accessible_set: + return [], [], { + "reason": "bq_path_access_denied", + "path": f"`{proj}.{ds}.{tbl}`", + "registered_as": row["name"], + } + bucket = row["bucket"] + source_table = row["source_table"] + if bucket and source_table: + key = (bucket.lower(), source_table.lower()) + if key not in seen_paths: + seen_paths.add(key) + dry_run.append((bucket, source_table, 0)) + return dry_run, name_lookups, None @@ -579,6 +674,15 @@ def _rewrite_bq_table_refs_to_native( # name up in a case-insensitive dict. Single-pass means freshly # inserted backticked text isn't re-scanned, fixing the # project-ID-contains-name corruption (Devin Review on query.py:464). + # + # Issue #201: split the SQL on `…` segments and rewrite ONLY in the + # outside-backtick chunks. Without this, a user-supplied full backtick + # path like ``\\`..unit_economics\\``` whose final + # segment matches a registered bare name would have the bare-name + # regex fire INSIDE the backticks (since `\\b` treats both `.` and + # `` ` `` as non-word boundaries), producing malformed nested + # backticks. Splitting confines the rewrite to user identifier + # positions where bare-name resolution is the intended behaviour. if name_lookups: # Map name (lower-cased) → backticked target. Names are # case-insensitive on the input side per the existing helper @@ -598,7 +702,15 @@ def _rewrite_bq_table_refs_to_native( def _name_repl(m: re.Match) -> str: return name_to_target[m.group(1).lower()] - out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE) + # `re.split` with a captured group returns: [outside, backtick, + # outside, backtick, …]. Even indices are outside-backtick chunks + # eligible for bare-name rewrite; odd indices are full backtick + # segments preserved verbatim. + parts = re.split(r'(`[^`]*`)', out) + for i, part in enumerate(parts): + if i % 2 == 0: + parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE) + out = "".join(parts) # Pass 2: bq."ds"."tbl" / bq.ds.tbl → `..`. def _bq_path_repl(m: re.Match) -> str: @@ -675,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query( return user_sql, False # Find all referenced BQ remote-mode rows (bare-name + direct bq.path). - # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. + # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201: + # bare-name regex must run against a backtick-masked copy so a + # registered name like ``orders`` doesn't false-positive when it + # appears as the table segment of a user-supplied full backtick path + # like ``\\`..orders\\```. Without masking, the + # cross-source check below would falsely conclude the SQL touches + # both BQ-remote and local sources, dropping every backtick-path + # query into the 50-100× slower ATTACH-catalog fallback. Devin + # Review on PR #208. sql_lower = user_sql.lower() + sql_lower_masked = _mask_backticks(sql_lower) name_lookups: list = [] seen_paths: set = set() @@ -715,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query( # mix rewritten and non-rewritten BQ paths in one query. return user_sql, False pattern = r'\b' + re.escape(str(name).lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): key = (bucket.lower(), source_table.lower()) if key not in seen_paths: seen_paths.add(key) @@ -752,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query( # Same name registered both BQ-remote and local? Pathological; # skip as a safety measure. return user_sql, False - if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower): + if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked): logger.info( "rewrite_skip_cross_source: user SQL references both " "BQ-remote and local-mode tables; falling back to " @@ -839,28 +960,32 @@ def _bq_quota_and_cap_guard( partitioned/clustered tables and rejecting narrow queries that BQ itself would dry-run as a few MB. - Fallback: if BQ rejects the rewritten SQL with a parse-level - ``client_error`` (e.g. DuckDB-only syntax like ``::INT`` casts that - don't translate to BQ), fall back to the pre-#171 per-table - SELECT * approach so the cap-guard still functions — over-estimate - is preferred over fail-open. Forbidden / upstream errors still - propagate as HTTP 502. + Issue #201 fix: when BQ rejects the rewritten SQL with a parse-level + ``bq_bad_request`` (e.g. DuckDB-only syntax like ``::INT`` casts, or + a rewriter bug that broke valid BQ-native input), retry with the + user's ORIGINAL SQL — BQ-native input dry-runs cleanly. If the + original ALSO fails, return a structured `remote_estimate_failed` + HTTP 400 instead of the pre-#201 synthetic ``SELECT *`` per-table + over-estimate. The synthetic fallback threw away user filters and + routinely ballooned to "full table size", blocking legitimate narrow + queries via `remote_scan_too_large`. Forbidden / upstream errors + still propagate as HTTP 502. Flow: 1. `check_daily_budget` — over-cap users get 429 BEFORE any BQ work. 2. `quota.acquire(user_id)` opened — concurrent-slot held throughout. 3. Single dry-run of rewritten user SQL → `total_bytes`. - On parse error, fall back to per-table SELECT * → sum. + On parse error, retry with the user's original SQL. + On second parse error, raise 400 `remote_estimate_failed`. 4. If total > cap → 400 `remote_scan_too_large`. 5. Yield. Caller runs `analytics.execute(...)` + `record_bytes(...)`. 6. On exit, slot released. Mutates `dry_run_set` in place: the third tuple element (bytes) is populated so the caller can sum and record bytes against the user's - quota post-flight. Single-dry-run path puts `total_bytes` on the first - entry and zero on the rest (BQ doesn't expose per-table bytes for a - composite query); the caller's `sum(b for _, _, b in dry_run_set)` - still equals `total_bytes`. + quota post-flight. Pin `total_bytes` on entry 0 and zero on the rest + — BQ doesn't expose per-table bytes for a composite query — so + `sum(b for _, _, b in dry_run_set)` still equals `total_bytes`. """ quota = _build_quota_tracker() try: @@ -901,61 +1026,71 @@ def _bq_quota_and_cap_guard( sql, name_lookups, project, ) - # Try the single-dry-run path first (issue #171). Falls back - # to the per-table SELECT * approach only on BQ parse errors - # (kind="bq_bad_request" — DuckDB-only syntax that BQ can't - # translate). All other BQ errors propagate as 502 below. + # Try the single-dry-run path first (issue #171). On BQ parse + # errors (`bq_bad_request` — typically DuckDB-only syntax the + # rewriter couldn't translate, OR — pre-#201 fix — a + # rewriter-corrupted backtick path) retry the user's ORIGINAL + # SQL: when the user submitted BQ-native SQL, the rewriter is + # the only thing standing between them and a clean dry-run. + # If the original ALSO fails, this is true DuckDB-only syntax + # that BQ cannot estimate — fail fast with a structured + # `remote_estimate_failed` instead of the pre-#201 synthetic + # `SELECT *` over-estimate (which threw away user filters and + # often ballooned to "full table size", blocking legitimate + # narrow queries via `remote_scan_too_large`). + # + # All other BQ errors (forbidden, upstream) propagate as 502. total_bytes = 0 - used_fallback = False try: total_bytes = _bq_dry_run_bytes(bq, rewritten_sql) except BqAccessError as exc: - if exc.kind == "bq_bad_request": - logger.warning( - "BQ dry-run rejected the rewritten SQL " - "(kind=%s, message=%s). Falling back to per-table " - "SELECT * estimate; the cap check will over-estimate " - "scan bytes for this query. Consider rewriting to " - "BQ-native syntax for a tight pre-check.", - exc.kind, exc.message, - ) - used_fallback = True - else: + if exc.kind != "bq_bad_request": raise HTTPException(status_code=502, detail={ "kind": exc.kind, "message": exc.message, **(exc.details or {}), }) - - if used_fallback: - # Pre-#171 path: estimate per registered table from a - # synthetic SELECT *. Over-estimates partitioned scans but - # never under-estimates, so the cap still bounds risk. - for i, (bucket, source_table, _) in enumerate(dry_run_set): - fallback_sql = ( - f"SELECT * FROM `{project}.{bucket}.{source_table}`" - ) - try: - est = _bq_dry_run_bytes(bq, fallback_sql) - except BqAccessError as exc: + logger.warning( + "BQ dry-run rejected the rewritten SQL " + "(kind=%s, message=%s). Retrying with the user's " + "original SQL.", + exc.kind, exc.message, + ) + try: + total_bytes = _bq_dry_run_bytes(bq, sql) + except BqAccessError as exc2: + if exc2.kind != "bq_bad_request": raise HTTPException(status_code=502, detail={ - "kind": exc.kind, - "message": exc.message, - **(exc.details or {}), + "kind": exc2.kind, + "message": exc2.message, + **(exc2.details or {}), }) - dry_run_set[i] = (bucket, source_table, est) - total_bytes += est - else: - # Single-dry-run path. Distribute the total to dry_run_set - # so the caller's `record_bytes(sum(...))` stays correct. - # Per-table breakdown is unavailable from a composite - # dry-run; pin total to entry 0, zero the rest. - if dry_run_set: - b0, t0, _ = dry_run_set[0] - dry_run_set[0] = (b0, t0, total_bytes) - for i in range(1, len(dry_run_set)): - bi, ti, _ = dry_run_set[i] - dry_run_set[i] = (bi, ti, 0) + raise HTTPException(status_code=400, detail={ + "kind": "remote_estimate_failed", + "message": ( + "Could not estimate scan size for this query." + ), + "hint": ( + "Use a registered table name from `agnes " + "catalog`, or write BQ-native SQL with full " + "backtick paths. Pure DuckDB-only syntax is " + "not supported for --remote queries." + ), + "underlying": exc2.message, + }) + + # Distribute the total to dry_run_set so the caller's + # `record_bytes(sum(...))` stays correct. Per-table breakdown + # is unavailable from a composite dry-run; pin total to entry + # 0, zero the rest. (Same accounting symmetry whether the + # bytes came from the rewritten SQL or the original-SQL + # retry.) + if dry_run_set: + b0, t0, _ = dry_run_set[0] + dry_run_set[0] = (b0, t0, total_bytes) + for i in range(1, len(dry_run_set)): + bi, ti, _ = dry_run_set[i] + dry_run_set[i] = (bi, ti, 0) if cap_bytes > 0 and total_bytes > cap_bytes: tables = [f"{b}.{t}" for b, t, _ in dry_run_set] diff --git a/pyproject.toml b/pyproject.toml index c98c2c7..d1c02d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.41.0" +version = "0.42.0" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT" diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py index f0df7b4..67e09dc 100644 --- a/tests/test_api_query_guardrail.py +++ b/tests/test_api_query_guardrail.py @@ -242,15 +242,17 @@ def test_guardrail_invokes_dry_run_exactly_once_per_request( assert "`test-data-prj.marketing.traffic`" in state["last_sql"] -def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( +def test_fallback_tries_original_sql_first( seeded_app, mock_dry_run, monkeypatch, ): - """When BQ rejects the rewritten SQL with ``bq_bad_request`` (DuckDB-only - syntax that doesn't translate — e.g. ``::INT`` casts, ``STRPOS``, …), - the cap-guard falls back to the pre-#171 per-table SELECT * approach - so a non-portable query still gets a (loose) cap estimate instead of - fail-opening. - """ + """Issue #201 — when the rewriter produces SQL that BQ rejects with + `bq_bad_request` but the user's ORIGINAL SQL dry-runs cleanly, the + cap-guard uses the original SQL's byte estimate. No more synthetic + `SELECT *` over-estimate. + + Bare-name reference populates `dry_run_set` so the cap-guard + actually fires. Mock returns parse-error on the first call + (rewritten SQL) and small bytes on the second (original).""" from connectors.bigquery.access import BqAccessError _register_bq_remote_row("ue", "finance", "ue") @@ -259,10 +261,10 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( def fake_dry_run(_bq, sql): state["calls"].append(sql) - # First call (rewritten user SQL) → BQ parse error. + # First call (rewritten SQL) → BQ parse error. if len(state["calls"]) == 1: - raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'") - # Second call (fallback per-table SELECT *) → small bytes, pass cap. + raise BqAccessError("bq_bad_request", "Syntax error: simulated") + # Second call (the user's original SQL) → small, passes cap. return 4096 monkeypatch.setattr( @@ -271,28 +273,78 @@ def test_guardrail_falls_back_to_per_table_estimate_on_bq_parse_error( c = seeded_app["client"] token = seeded_app["admin_token"] - # SQL with DuckDB-only `::INT` cast that BQ would reject. + user_sql = "SELECT order_id FROM ue WHERE country = 'CZ'" + r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token)) + + # Two dry-runs: rewritten then original. No third synthetic-SELECT-* + # call. + assert len(state["calls"]) == 2, ( + f"expected rewritten + original-SQL retry, got " + f"{len(state['calls'])}: {state['calls']}" + ) + assert state["calls"][1] == user_sql, ( + f"second call must be the user's ORIGINAL SQL, got " + f"{state['calls'][1]!r}" + ) + # The response must NOT be remote_scan_too_large from a synthetic + # over-estimate — 4096 bytes is well under the 5 GiB cap. + if r.status_code == 400: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") != "remote_scan_too_large", detail + + +def test_fallback_fails_fast_on_pure_duckdb_syntax( + seeded_app, mock_dry_run, monkeypatch, +): + """When BOTH the rewritten and original SQL fail with `bq_bad_request` + (true DuckDB-only syntax like `::INT`), return HTTP 400 + `remote_estimate_failed` — never silently over-estimate via a + synthetic `SELECT *`.""" + from connectors.bigquery.access import BqAccessError + + _register_bq_remote_row("ue", "finance", "ue") + + state = {"calls": []} + + def always_parse_error(_bq, sql): + state["calls"].append(sql) + raise BqAccessError("bq_bad_request", "Syntax error: unexpected '::'") + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", always_parse_error, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] r = c.post( "/api/query", json={"sql": "SELECT order_id::INT FROM ue WHERE country = 'CZ'"}, headers=_auth(token), ) - # Two dry-runs (rewritten + fallback per-table) before the (failed) - # execute. Status will be a downstream error from analytics.execute() - # since `::INT` doesn't work in DuckDB either against a remote view — - # but the GUARDRAIL must have completed without 5xx-ing. + # Two dry-runs (rewritten + original retry). NO synthetic SELECT * fallback. assert len(state["calls"]) == 2, ( - f"expected 1 rewritten + 1 fallback dry-run, got {len(state['calls'])}: " - f"{state['calls']}" + f"expected 1 rewritten + 1 original-retry, got " + f"{len(state['calls'])}: {state['calls']}" ) - assert "::" in state["calls"][0], "first call should be the rewritten user SQL" - assert state["calls"][1].startswith("SELECT * FROM"), ( - "second call should be the per-table fallback" - ) - # Whatever HTTP status comes back must NOT be 502 from the guard's - # transport-error path — fallback must absorb the bq_bad_request. - assert r.status_code != 502, r.json() + # No call should be a synthetic ``SELECT * FROM `...```. The + # original-SQL retry contains the user's SELECT clause. + for c_sql in state["calls"]: + # If a call is just a synthetic ``SELECT * FROM `..
``` + # the user's `WHERE country = 'CZ'` would be missing. + if c_sql.startswith("SELECT * FROM `") and "WHERE" not in c_sql: + raise AssertionError( + f"synthetic SELECT * fallback was used: {c_sql!r}" + ) + + assert r.status_code == 400, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("kind") == "remote_estimate_failed", detail + assert "underlying" in detail, detail + assert "agnes catalog" in detail.get("hint", "").lower() or \ + "backtick" in detail.get("hint", "").lower(), detail def test_guardrail_propagates_502_on_non_parse_bq_errors( @@ -432,3 +484,243 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names(): ) assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \ rewritten.lower().count("`p.fin.ue`") == 2 + + +# --------------------------------------------------------------------------- +# Issue #201: rewriter must NOT touch text inside `…` backtick segments. +# A user-supplied full BQ-native path `..
` whose +# table segment matches a registered bare name was being re-substituted +# inside the backticks, producing malformed nested-backtick SQL that BQ +# rejected with a parse error. +# --------------------------------------------------------------------------- + + +def test_rewrite_skips_inside_backtick_path(): + """Full backtick BQ path is preserved byte-for-byte even when its + final segment matches a registered bare-name alias.""" + from app.api.query import _rewrite_user_sql_for_bq_dry_run + + sql = ( + "SELECT * FROM `my-prj.finance.unit_economics` " + "WHERE country = 'CZ'" + ) + rewritten = _rewrite_user_sql_for_bq_dry_run( + sql=sql, + name_lookups=[("unit_economics", "finance", "unit_economics")], + project="my-prj", + ) + # No corruption — input is already BQ-native, rewriter is a no-op here. + assert rewritten == sql, ( + f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}" + ) + # Sanity: the malformed nested form must NOT appear. + assert "`my-prj.finance.`my-prj" not in rewritten + + +def test_rewrite_skips_inside_backtick_with_outside_bare_name(): + """Mixed SQL: a bare name outside backticks is rewritten as before, + but an identically-named segment inside a backtick path is left + alone.""" + from app.api.query import _rewrite_user_sql_for_bq_dry_run + + sql = ( + "SELECT a.id, b.col FROM ue a " + "JOIN `my-prj.finance.ue` b ON a.id = b.id" + ) + rewritten = _rewrite_user_sql_for_bq_dry_run( + sql=sql, + name_lookups=[("ue", "fin_alias", "ue_alias")], + project="my-prj", + ) + # Outside-backtick `ue` rewrites to the registered alias path. + assert "`my-prj.fin_alias.ue_alias`" in rewritten + # The user-supplied backtick path is preserved verbatim. + assert "`my-prj.finance.ue`" in rewritten + # The malformed nested form must NOT appear. + assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten + + +def test_guardrail_skips_bare_name_match_inside_backticks( + seeded_app, mock_dry_run, monkeypatch, +): + """The `name_lookups` collection populated by `_bq_guardrail_inputs` + must not include a registered name when the only place that name + appears in the SQL is inside a `…` backtick segment. + + Captures the rewritten SQL the guardrail forwards to the dry-run and + asserts the bare-name was NOT substituted inside the user's backtick + path. + """ + _register_bq_remote_row("unit_economics", "finance", "unit_economics") + + captured = {"sql": None} + + def capturing_fake(_bq, sql): + captured["sql"] = sql + return 1024 + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + user_sql = ( + "SELECT * FROM `test-data-prj.finance.unit_economics` " + "WHERE country = 'CZ'" + ) + c.post("/api/query", json={"sql": user_sql}, headers=_auth(token)) + + sent = captured["sql"] + if sent is None: + # Guardrail decided no BQ tables were referenced — that's also + # an acceptable "no false-positive" outcome (Layer 3 will cover + # the explicit registry check for full backtick paths). We just + # need to ensure the bare-name regex didn't fire. + return + # The user's exact backtick path must survive verbatim — no nested + # backticks introduced by a stray bare-name rewrite. + assert "`test-data-prj.finance.unit_economics`" in sent, ( + f"backtick path corrupted by guardrail:\n out: {sent!r}" + ) + assert "`test-data-prj.finance.`test-data-prj" not in sent, ( + f"nested-backtick corruption signature present: {sent!r}" + ) + + +# --------------------------------------------------------------------------- +# Issue #201 Layer 3: full backtick BigQuery paths are registry-gated. +# Pre-fix these bypassed Agnes RBAC entirely — only the configured service +# account scope limited which tables a user could reach. Post-fix, they're +# treated identically to `bq.""."
"` syntax. +# --------------------------------------------------------------------------- + + +def test_full_backtick_path_unregistered_denied(seeded_app, mock_dry_run): + """Full backtick path to an unregistered `.
` (project + matches the configured data project) → HTTP 403 with + `bq_path_not_registered`.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": ( + "SELECT * FROM `test-data-prj.secret_ds.secret_tbl` " + "WHERE country = 'CZ'" + ), + }, + headers=_auth(token), + ) + assert r.status_code == 403, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("reason") == "bq_path_not_registered", detail + assert "secret_ds" in detail.get("path", ""), detail + assert "secret_tbl" in detail.get("path", ""), detail + + +def test_full_backtick_path_cross_project_denied(seeded_app, mock_dry_run): + """Full backtick path with project ≠ configured data project → HTTP + 403 with `bq_path_cross_project`. Even if the path happens to point + at a registered (bucket, source_table), the project mismatch is the + primary boundary.""" + _register_bq_remote_row("ue", "finance", "ue") + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": "SELECT * FROM `other-project.finance.ue` WHERE id = 1", + }, + headers=_auth(token), + ) + assert r.status_code == 403, r.json() + detail = r.json().get("detail", {}) + assert isinstance(detail, dict), detail + assert detail.get("reason") == "bq_path_cross_project", detail + assert detail.get("expected_project") == "test-data-prj", detail + assert "other-project" in detail.get("path", ""), detail + + +def test_full_backtick_path_registered_admin_passes( + seeded_app, mock_dry_run, monkeypatch, +): + """Admin caller + registered path + matching project → no RBAC + rejection. The dry-run fires (we can capture the SQL the guardrail + forwards) and no `bq_path_*` reason appears in any error response.""" + _register_bq_remote_row("ue", "finance", "ue") + + captured = {"sql": None} + + def capturing_fake(_bq, sql): + captured["sql"] = sql + return 1024 # tiny — pass cap + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": "SELECT * FROM `test-data-prj.finance.ue` WHERE id = 1", + }, + headers=_auth(token), + ) + # If 403, must NOT be the issue-#201 bq_path_* reasons. + if r.status_code == 403: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") not in ( + "bq_path_not_registered", + "bq_path_access_denied", + "bq_path_cross_project", + ), f"admin + registered path should pass RBAC: {detail}" + # The dry-run was invoked, meaning Pass 3 added the path to dry_run_set + # and the cap-guard fired. The user's WHERE clause must still be in + # the dry-run SQL (validates Layer 1 — backtick-aware rewrite). + assert captured["sql"] is not None, ( + "dry-run never fired — Pass 3 may not have registered the path" + ) + assert "`test-data-prj.finance.ue`" in captured["sql"], captured["sql"] + assert "WHERE id = 1" in captured["sql"], captured["sql"] + + +def test_full_backtick_path_inside_string_literal_not_gated( + seeded_app, mock_dry_run, +): + """Defensive case: a backtick path appearing inside a SQL string + literal (rare but possible) should not trigger Pass 3. Practically + this is unreachable because backticks aren't typically valid inside + BQ string literals — but the regex doesn't know that. We document + that the gate applies to ALL backtick triples to be safe; users who + really need a literal can use single-quoted strings without + backticks.""" + # No registration; the test confirms an unregistered path inside + # what looks like a string is still gated. This is the conservative + # boundary — false-positive on string literal beats false-negative + # on a real RBAC bypass. + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={ + "sql": ( + "SELECT 'matches `test-data-prj.x.y`' AS lit" + ), + }, + headers=_auth(token), + ) + # Either gated (403) or 200 if the analytics DB happens to evaluate + # the literal — both are acceptable. The point is no silent RBAC + # bypass: if the response is 200, no BQ table was reached. + if r.status_code == 403: + detail = r.json().get("detail", {}) + if isinstance(detail, dict): + assert detail.get("reason") in ( + "bq_path_not_registered", + "bq_path_cross_project", + ), detail diff --git a/tests/test_query_remote_rewrite.py b/tests/test_query_remote_rewrite.py index bc84c51..4fa8819 100644 --- a/tests/test_query_remote_rewrite.py +++ b/tests/test_query_remote_rewrite.py @@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch): assert rewritten == user_sql # untouched +def test_local_name_inside_backtick_path_does_not_trip_cross_source( + seeded_registry, monkeypatch, +): + """Devin Review on PR #208 (issue #201 follow-up): a registered + LOCAL-mode table name appearing as a segment of a user-supplied full + backtick BQ path must NOT trip the cross-source guard. Pre-fix the + bare-name regex at the cross-source check ran against unmasked + sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered + local ``orders`` inside the backticks and force the wrapper to bail + to the ATTACH-catalog slow path (50-100× slower). Post-fix the + regex runs against the backtick-masked copy, the cross-source check + correctly sees only BQ refs, and the wrap proceeds. + """ + from app.api.query import _rewrite_user_sql_for_bigquery_query + _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue", + bucket="fin", source_table="ue") + _register_local(seeded_registry, table_id="kbc.in.orders", name="orders") + _set_bq_project(monkeypatch, "test-prj") + + user_sql = ( + "SELECT u.id " + "FROM ue u " + "JOIN `test-prj.dataset.orders` o ON u.x = o.x " + "WHERE o.value > 0" + ) + rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query( + user_sql, seeded_registry, + ) + # Must wrap — both refs are BQ; the local `orders` registration is + # irrelevant to a query that touches only BQ paths. + assert did_rewrite is True + assert "bigquery_query(" in rewritten + # The user's backtick path is preserved verbatim inside the wrapped + # inner SQL (Layer 1 split-on-backticks behaviour), so the original + # `test-prj.dataset.orders` reference survives. + assert "test-prj.dataset.orders" in rewritten + + def test_no_bq_tables_passes_through(seeded_registry, monkeypatch): """User SQL referencing only local-source tables → no rewrite, no log spam, original SQL returned."""