From f4bc04958d7b0ed88e8e488f47d9b06c6f073e0b Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 21:06:21 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Devin=20Review=20#1=20=E2=80=94=20apply?= =?UTF-8?q?=20backtick=20mask=20to=20wrapping=20rewriter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_rewrite_user_sql_for_bigquery_query` does its own bare-name detection (mirroring the non-RBAC parts of `_bq_guardrail_inputs`). The backtick masking from #201 was applied to `_bq_guardrail_inputs` and the forbidden-table loop, but missed this third site — so a registered local-mode table name appearing as the table segment of a user-supplied full backtick path (e.g. ``\`prj.ds.orders\`` matching registered local ``orders``) tripped the cross-source guard and forced every backtick-path query into the 50-100× slower ATTACH-catalog fallback. Mask once at the top of the function, route both the BQ-name detection (line ~830) and the cross-source check (line ~867) through the masked copy. New regression test `test_local_name_inside_backtick_path_does_not_trip_cross_source` proves the wrapper now wraps when it should. --- app/api/query.py | 15 +++++++++--- tests/test_query_remote_rewrite.py | 38 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/app/api/query.py b/app/api/query.py index 3a751a8..b9b8b9b 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -787,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query( return user_sql, False # Find all referenced BQ remote-mode rows (bare-name + direct bq.path). - # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. + # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201: + # bare-name regex must run against a backtick-masked copy so a + # registered name like ``orders`` doesn't false-positive when it + # appears as the table segment of a user-supplied full backtick path + # like ``\\`..orders\\```. Without masking, the + # cross-source check below would falsely conclude the SQL touches + # both BQ-remote and local sources, dropping every backtick-path + # query into the 50-100× slower ATTACH-catalog fallback. Devin + # Review on PR #208. sql_lower = user_sql.lower() + sql_lower_masked = _mask_backticks(sql_lower) name_lookups: list = [] seen_paths: set = set() @@ -827,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query( # mix rewritten and non-rewritten BQ paths in one query. return user_sql, False pattern = r'\b' + re.escape(str(name).lower()) + r'\b' - if re.search(pattern, sql_lower): + if re.search(pattern, sql_lower_masked): key = (bucket.lower(), source_table.lower()) if key not in seen_paths: seen_paths.add(key) @@ -864,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query( # Same name registered both BQ-remote and local? Pathological; # skip as a safety measure. return user_sql, False - if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower): + if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked): logger.info( "rewrite_skip_cross_source: user SQL references both " "BQ-remote and local-mode tables; falling back to " diff --git a/tests/test_query_remote_rewrite.py b/tests/test_query_remote_rewrite.py index bc84c51..4fa8819 100644 --- a/tests/test_query_remote_rewrite.py +++ b/tests/test_query_remote_rewrite.py @@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch): assert rewritten == user_sql # untouched +def test_local_name_inside_backtick_path_does_not_trip_cross_source( + seeded_registry, monkeypatch, +): + """Devin Review on PR #208 (issue #201 follow-up): a registered + LOCAL-mode table name appearing as a segment of a user-supplied full + backtick BQ path must NOT trip the cross-source guard. Pre-fix the + bare-name regex at the cross-source check ran against unmasked + sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered + local ``orders`` inside the backticks and force the wrapper to bail + to the ATTACH-catalog slow path (50-100× slower). Post-fix the + regex runs against the backtick-masked copy, the cross-source check + correctly sees only BQ refs, and the wrap proceeds. + """ + from app.api.query import _rewrite_user_sql_for_bigquery_query + _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue", + bucket="fin", source_table="ue") + _register_local(seeded_registry, table_id="kbc.in.orders", name="orders") + _set_bq_project(monkeypatch, "test-prj") + + user_sql = ( + "SELECT u.id " + "FROM ue u " + "JOIN `test-prj.dataset.orders` o ON u.x = o.x " + "WHERE o.value > 0" + ) + rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query( + user_sql, seeded_registry, + ) + # Must wrap — both refs are BQ; the local `orders` registration is + # irrelevant to a query that touches only BQ paths. + assert did_rewrite is True + assert "bigquery_query(" in rewritten + # The user's backtick path is preserved verbatim inside the wrapped + # inner SQL (Layer 1 split-on-backticks behaviour), so the original + # `test-prj.dataset.orders` reference survives. + assert "test-prj.dataset.orders" in rewritten + + def test_no_bq_tables_passes_through(seeded_registry, monkeypatch): """User SQL referencing only local-source tables → no rewrite, no log spam, original SQL returned."""