From 720a2180c0dd62bf09fa12249c8c281f8252d16a Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 17:20:33 +0200 Subject: [PATCH] fix(query): rewriter respects backtick segments (#201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `agnes query --remote` corrupted user SQL when the request contained a full BigQuery backtick path (`..`) whose table segment matched a registered bare-name alias. The bare-name rewriter used `\b` word-boundary matching against the lower-cased SQL; both `.` and `` ` `` are non-word characters, so the regex fired INSIDE the user's backtick path and produced malformed nested-backtick SQL that BigQuery rejected at parse time. Fix: - Add `_mask_backticks(sql)` helper: replace each `…` segment with spaces of equal length, preserving offsets so word-boundary searches find positions only outside backticks. - `_bq_guardrail_inputs` (bare-name pass + forbidden-table pass) searches against the masked SQL. - `_rewrite_bq_table_refs_to_native` Pass 1 splits the SQL on `(\`[^\`]*\`)` and rewrites only the outside-backtick chunks. Pass 2 (`bq."ds"."tbl"` → backtick form) is unchanged — its prefix can't appear inside backticks. Adds three regressions covering the rewrite + guardrail paths. --- app/api/query.py | 60 ++++++++++++++++-- tests/test_api_query_guardrail.py | 102 ++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 4 deletions(-) diff --git a/app/api/query.py b/app/api/query.py index 6dbbadd..5ab0d63 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -90,6 +90,29 @@ BQ_PATH = re.compile( ) +# Issue #201 — full backtick BQ path `..
` in user +# SQL. Used by the registry-gating pass and (via `_mask_backticks`) to keep +# bare-name regexes from firing inside backtick-quoted segments. +_BACKTICK_SEGMENT = re.compile(r'`[^`]*`') +_BACKTICK_FULL_PATH = re.compile(r'`([^.`]+)\.([^.`]+)\.([^.`]+)`') + + +def _mask_backticks(sql: str) -> str: + """Replace each `…`-quoted segment with spaces of equal length so + word-boundary regexes find positions outside backticks but ignore + everything inside. Preserves all character offsets so ``re.search`` + on the masked string returns matches at the same positions as on the + original. + + Issue #201: `\\b` matches inside backtick segments because both `.` + and `` ` `` are non-word characters. A registered bare-name like + ``unit_economics`` would otherwise match inside a user-supplied full + backtick path ``\\`..unit_economics\\``` and get + falsely rewritten — corrupting the user's intended SQL. + """ + return _BACKTICK_SEGMENT.sub(lambda m: ' ' * len(m.group(0)), sql) + + def _default_remote_query_cap_bytes() -> int: """5 GiB default cap on /api/query BQ-touching scans. Configurable via `data_source.bigquery.bq_max_scan_bytes` in /admin/server-config — @@ -197,11 +220,18 @@ def execute_query( if r.get("name") and r.get("id") in allowed_ids } - # Check if query references any forbidden tables (word-boundary match) + # Check if query references any forbidden tables (word-boundary + # match). Issue #201: mask backtick segments so `\b` doesn't + # falsely fire inside a user-supplied full backtick path like + # `..
` whose final segment happens to + # collide with a forbidden master view name. The full-path + # registry-gate downstream is the proper authorization check + # for those. + sql_lower_masked = _mask_backticks(sql_lower) forbidden = all_views - allowed_view_names for table in forbidden: pattern = r'\b' + re.escape(table.lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'") # ---- #160 BQ remote-row guardrail + RBAC patch ------------------- @@ -467,6 +497,11 @@ def _bq_guardrail_inputs( name_lookups: list = [] seen_paths: set = set() accessible_set = set(allowed) if allowed is not None else None + # Issue #201: mask backtick segments so a registered bare name like + # `unit_economics` doesn't false-positive on a user-supplied full + # backtick path `..unit_economics`. The full-path + # pass below registry-gates those properly. + sql_lower_masked = _mask_backticks(sql_lower) for r in repo.list_by_source("bigquery"): if (r.get("query_mode") or "") != "remote": continue @@ -481,7 +516,7 @@ def _bq_guardrail_inputs( # before we get here. Defensive skip. continue pattern = r'\b' + re.escape(str(name).lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): key = (bucket.lower(), source_table.lower()) if key not in seen_paths: seen_paths.add(key) @@ -579,6 +614,15 @@ def _rewrite_bq_table_refs_to_native( # name up in a case-insensitive dict. Single-pass means freshly # inserted backticked text isn't re-scanned, fixing the # project-ID-contains-name corruption (Devin Review on query.py:464). + # + # Issue #201: split the SQL on `…` segments and rewrite ONLY in the + # outside-backtick chunks. Without this, a user-supplied full backtick + # path like ``\\`..unit_economics\\``` whose final + # segment matches a registered bare name would have the bare-name + # regex fire INSIDE the backticks (since `\\b` treats both `.` and + # `` ` `` as non-word boundaries), producing malformed nested + # backticks. Splitting confines the rewrite to user identifier + # positions where bare-name resolution is the intended behaviour. if name_lookups: # Map name (lower-cased) → backticked target. Names are # case-insensitive on the input side per the existing helper @@ -598,7 +642,15 @@ def _rewrite_bq_table_refs_to_native( def _name_repl(m: re.Match) -> str: return name_to_target[m.group(1).lower()] - out = re.sub(pattern, _name_repl, out, flags=re.IGNORECASE) + # `re.split` with a captured group returns: [outside, backtick, + # outside, backtick, …]. Even indices are outside-backtick chunks + # eligible for bare-name rewrite; odd indices are full backtick + # segments preserved verbatim. + parts = re.split(r'(`[^`]*`)', out) + for i, part in enumerate(parts): + if i % 2 == 0: + parts[i] = re.sub(pattern, _name_repl, part, flags=re.IGNORECASE) + out = "".join(parts) # Pass 2: bq."ds"."tbl" / bq.ds.tbl → `..`. def _bq_path_repl(m: re.Match) -> str: diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py index f0df7b4..afd75c9 100644 --- a/tests/test_api_query_guardrail.py +++ b/tests/test_api_query_guardrail.py @@ -432,3 +432,105 @@ def test_rewrite_helper_is_case_insensitive_on_bare_names(): ) assert "`p.fin.ue` WHERE `p.fin.ue`.id" in rewritten or \ rewritten.lower().count("`p.fin.ue`") == 2 + + +# --------------------------------------------------------------------------- +# Issue #201: rewriter must NOT touch text inside `…` backtick segments. +# A user-supplied full BQ-native path `..
` whose +# table segment matches a registered bare name was being re-substituted +# inside the backticks, producing malformed nested-backtick SQL that BQ +# rejected with a parse error. +# --------------------------------------------------------------------------- + + +def test_rewrite_skips_inside_backtick_path(): + """Full backtick BQ path is preserved byte-for-byte even when its + final segment matches a registered bare-name alias.""" + from app.api.query import _rewrite_user_sql_for_bq_dry_run + + sql = ( + "SELECT * FROM `my-prj.finance.unit_economics` " + "WHERE country = 'CZ'" + ) + rewritten = _rewrite_user_sql_for_bq_dry_run( + sql=sql, + name_lookups=[("unit_economics", "finance", "unit_economics")], + project="my-prj", + ) + # No corruption — input is already BQ-native, rewriter is a no-op here. + assert rewritten == sql, ( + f"backtick path was rewritten:\n in : {sql!r}\n out: {rewritten!r}" + ) + # Sanity: the malformed nested form must NOT appear. + assert "`my-prj.finance.`my-prj" not in rewritten + + +def test_rewrite_skips_inside_backtick_with_outside_bare_name(): + """Mixed SQL: a bare name outside backticks is rewritten as before, + but an identically-named segment inside a backtick path is left + alone.""" + from app.api.query import _rewrite_user_sql_for_bq_dry_run + + sql = ( + "SELECT a.id, b.col FROM ue a " + "JOIN `my-prj.finance.ue` b ON a.id = b.id" + ) + rewritten = _rewrite_user_sql_for_bq_dry_run( + sql=sql, + name_lookups=[("ue", "fin_alias", "ue_alias")], + project="my-prj", + ) + # Outside-backtick `ue` rewrites to the registered alias path. + assert "`my-prj.fin_alias.ue_alias`" in rewritten + # The user-supplied backtick path is preserved verbatim. + assert "`my-prj.finance.ue`" in rewritten + # The malformed nested form must NOT appear. + assert "`my-prj.finance.`my-prj.fin_alias.ue_alias`" not in rewritten + + +def test_guardrail_skips_bare_name_match_inside_backticks( + seeded_app, mock_dry_run, monkeypatch, +): + """The `name_lookups` collection populated by `_bq_guardrail_inputs` + must not include a registered name when the only place that name + appears in the SQL is inside a `…` backtick segment. + + Captures the rewritten SQL the guardrail forwards to the dry-run and + asserts the bare-name was NOT substituted inside the user's backtick + path. + """ + _register_bq_remote_row("unit_economics", "finance", "unit_economics") + + captured = {"sql": None} + + def capturing_fake(_bq, sql): + captured["sql"] = sql + return 1024 + + monkeypatch.setattr( + "app.api.query._bq_dry_run_bytes", capturing_fake, raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + user_sql = ( + "SELECT * FROM `test-data-prj.finance.unit_economics` " + "WHERE country = 'CZ'" + ) + c.post("/api/query", json={"sql": user_sql}, headers=_auth(token)) + + sent = captured["sql"] + if sent is None: + # Guardrail decided no BQ tables were referenced — that's also + # an acceptable "no false-positive" outcome (Layer 3 will cover + # the explicit registry check for full backtick paths). We just + # need to ensure the bare-name regex didn't fire. + return + # The user's exact backtick path must survive verbatim — no nested + # backticks introduced by a stray bare-name rewrite. + assert "`test-data-prj.finance.unit_economics`" in sent, ( + f"backtick path corrupted by guardrail:\n out: {sent!r}" + ) + assert "`test-data-prj.finance.`test-data-prj" not in sent, ( + f"nested-backtick corruption signature present: {sent!r}" + )