fix: Devin Review #1 — apply backtick mask to wrapping rewriter

`_rewrite_user_sql_for_bigquery_query` does its own bare-name detection
(mirroring the non-RBAC parts of `_bq_guardrail_inputs`). The backtick
masking from #201 was applied to `_bq_guardrail_inputs` and the
forbidden-table loop, but missed this third site — so a registered
local-mode table name appearing as the table segment of a
user-supplied full backtick path (e.g. ``\`prj.ds.orders\`` matching
registered local ``orders``) tripped the cross-source guard and
forced every backtick-path query into the 50-100× slower
ATTACH-catalog fallback.

Mask once at the top of the function, route both the BQ-name
detection (line ~830) and the cross-source check (line ~867) through
the masked copy. New regression test
`test_local_name_inside_backtick_path_does_not_trip_cross_source`
proves the wrapper now wraps when it should.
This commit is contained in:
ZdenekSrotyr 2026-05-06 21:06:21 +02:00
parent 09958c9d87
commit f4bc04958d
2 changed files with 50 additions and 3 deletions

View file

@ -787,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
return user_sql, False return user_sql, False
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path). # Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. # Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
# bare-name regex must run against a backtick-masked copy so a
# registered name like ``orders`` doesn't false-positive when it
# appears as the table segment of a user-supplied full backtick path
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
# cross-source check below would falsely conclude the SQL touches
# both BQ-remote and local sources, dropping every backtick-path
# query into the 50-100× slower ATTACH-catalog fallback. Devin
# Review on PR #208.
sql_lower = user_sql.lower() sql_lower = user_sql.lower()
sql_lower_masked = _mask_backticks(sql_lower)
name_lookups: list = [] name_lookups: list = []
seen_paths: set = set() seen_paths: set = set()
@ -827,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
# mix rewritten and non-rewritten BQ paths in one query. # mix rewritten and non-rewritten BQ paths in one query.
return user_sql, False return user_sql, False
pattern = r'\b' + re.escape(str(name).lower()) + r'\b' pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
if re.search(pattern, sql_lower): if re.search(pattern, sql_lower_masked):
key = (bucket.lower(), source_table.lower()) key = (bucket.lower(), source_table.lower())
if key not in seen_paths: if key not in seen_paths:
seen_paths.add(key) seen_paths.add(key)
@ -864,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
# Same name registered both BQ-remote and local? Pathological; # Same name registered both BQ-remote and local? Pathological;
# skip as a safety measure. # skip as a safety measure.
return user_sql, False return user_sql, False
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower): if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
logger.info( logger.info(
"rewrite_skip_cross_source: user SQL references both " "rewrite_skip_cross_source: user SQL references both "
"BQ-remote and local-mode tables; falling back to " "BQ-remote and local-mode tables; falling back to "

View file

@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
assert rewritten == user_sql # untouched assert rewritten == user_sql # untouched
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
seeded_registry, monkeypatch,
):
"""Devin Review on PR #208 (issue #201 follow-up): a registered
LOCAL-mode table name appearing as a segment of a user-supplied full
backtick BQ path must NOT trip the cross-source guard. Pre-fix the
bare-name regex at the cross-source check ran against unmasked
sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
local ``orders`` inside the backticks and force the wrapper to bail
to the ATTACH-catalog slow path (50-100× slower). Post-fix the
regex runs against the backtick-masked copy, the cross-source check
correctly sees only BQ refs, and the wrap proceeds.
"""
from app.api.query import _rewrite_user_sql_for_bigquery_query
_register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
bucket="fin", source_table="ue")
_register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
_set_bq_project(monkeypatch, "test-prj")
user_sql = (
"SELECT u.id "
"FROM ue u "
"JOIN `test-prj.dataset.orders` o ON u.x = o.x "
"WHERE o.value > 0"
)
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
user_sql, seeded_registry,
)
# Must wrap — both refs are BQ; the local `orders` registration is
# irrelevant to a query that touches only BQ paths.
assert did_rewrite is True
assert "bigquery_query(" in rewritten
# The user's backtick path is preserved verbatim inside the wrapped
# inner SQL (Layer 1 split-on-backticks behaviour), so the original
# `test-prj.dataset.orders` reference survives.
assert "test-prj.dataset.orders" in rewritten
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch): def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
"""User SQL referencing only local-source tables → no rewrite, """User SQL referencing only local-source tables → no rewrite,
no log spam, original SQL returned.""" no log spam, original SQL returned."""