fix: Devin Review #1 — apply backtick mask to wrapping rewriter
`_rewrite_user_sql_for_bigquery_query` does its own bare-name detection (mirroring the non-RBAC parts of `_bq_guardrail_inputs`). The backtick masking from #201 was applied to `_bq_guardrail_inputs` and the forbidden-table loop, but missed this third site — so a registered local-mode table name appearing as the table segment of a user-supplied full backtick path (e.g. ``\`prj.ds.orders\`` matching registered local ``orders``) tripped the cross-source guard and forced every backtick-path query into the 50-100× slower ATTACH-catalog fallback. Mask once at the top of the function, route both the BQ-name detection (line ~830) and the cross-source check (line ~867) through the masked copy. New regression test `test_local_name_inside_backtick_path_does_not_trip_cross_source` proves the wrapper now wraps when it should.
This commit is contained in:
parent
09958c9d87
commit
f4bc04958d
2 changed files with 50 additions and 3 deletions
|
|
@ -787,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
return user_sql, False
|
||||
|
||||
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
|
||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
|
||||
# bare-name regex must run against a backtick-masked copy so a
|
||||
# registered name like ``orders`` doesn't false-positive when it
|
||||
# appears as the table segment of a user-supplied full backtick path
|
||||
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
|
||||
# cross-source check below would falsely conclude the SQL touches
|
||||
# both BQ-remote and local sources, dropping every backtick-path
|
||||
# query into the 50-100× slower ATTACH-catalog fallback. Devin
|
||||
# Review on PR #208.
|
||||
sql_lower = user_sql.lower()
|
||||
sql_lower_masked = _mask_backticks(sql_lower)
|
||||
name_lookups: list = []
|
||||
seen_paths: set = set()
|
||||
|
||||
|
|
@ -827,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
# mix rewritten and non-rewritten BQ paths in one query.
|
||||
return user_sql, False
|
||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||
if re.search(pattern, sql_lower):
|
||||
if re.search(pattern, sql_lower_masked):
|
||||
key = (bucket.lower(), source_table.lower())
|
||||
if key not in seen_paths:
|
||||
seen_paths.add(key)
|
||||
|
|
@ -864,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
|||
# Same name registered both BQ-remote and local? Pathological;
|
||||
# skip as a safety measure.
|
||||
return user_sql, False
|
||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
|
||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
|
||||
logger.info(
|
||||
"rewrite_skip_cross_source: user SQL references both "
|
||||
"BQ-remote and local-mode tables; falling back to "
|
||||
|
|
|
|||
|
|
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
|
|||
assert rewritten == user_sql # untouched
|
||||
|
||||
|
||||
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
|
||||
seeded_registry, monkeypatch,
|
||||
):
|
||||
"""Devin Review on PR #208 (issue #201 follow-up): a registered
|
||||
LOCAL-mode table name appearing as a segment of a user-supplied full
|
||||
backtick BQ path must NOT trip the cross-source guard. Pre-fix the
|
||||
bare-name regex at the cross-source check ran against unmasked
|
||||
sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
|
||||
local ``orders`` inside the backticks and force the wrapper to bail
|
||||
to the ATTACH-catalog slow path (50-100× slower). Post-fix the
|
||||
regex runs against the backtick-masked copy, the cross-source check
|
||||
correctly sees only BQ refs, and the wrap proceeds.
|
||||
"""
|
||||
from app.api.query import _rewrite_user_sql_for_bigquery_query
|
||||
_register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
|
||||
bucket="fin", source_table="ue")
|
||||
_register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
|
||||
_set_bq_project(monkeypatch, "test-prj")
|
||||
|
||||
user_sql = (
|
||||
"SELECT u.id "
|
||||
"FROM ue u "
|
||||
"JOIN `test-prj.dataset.orders` o ON u.x = o.x "
|
||||
"WHERE o.value > 0"
|
||||
)
|
||||
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
|
||||
user_sql, seeded_registry,
|
||||
)
|
||||
# Must wrap — both refs are BQ; the local `orders` registration is
|
||||
# irrelevant to a query that touches only BQ paths.
|
||||
assert did_rewrite is True
|
||||
assert "bigquery_query(" in rewritten
|
||||
# The user's backtick path is preserved verbatim inside the wrapped
|
||||
# inner SQL (Layer 1 split-on-backticks behaviour), so the original
|
||||
# `test-prj.dataset.orders` reference survives.
|
||||
assert "test-prj.dataset.orders" in rewritten
|
||||
|
||||
|
||||
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
||||
"""User SQL referencing only local-source tables → no rewrite,
|
||||
no log spam, original SQL returned."""
|
||||
|
|
|
|||
Loading…
Reference in a new issue