fix: Devin Review #1 — apply backtick mask to wrapping rewriter
`_rewrite_user_sql_for_bigquery_query` does its own bare-name detection (mirroring the non-RBAC parts of `_bq_guardrail_inputs`). The backtick masking from #201 was applied to `_bq_guardrail_inputs` and the forbidden-table loop, but missed this third site — so a registered local-mode table name appearing as the table segment of a user-supplied full backtick path (e.g. ``\`prj.ds.orders\`` matching registered local ``orders``) tripped the cross-source guard and forced every backtick-path query into the 50-100× slower ATTACH-catalog fallback. Mask once at the top of the function, route both the BQ-name detection (line ~830) and the cross-source check (line ~867) through the masked copy. New regression test `test_local_name_inside_backtick_path_does_not_trip_cross_source` proves the wrapper now wraps when it should.
This commit is contained in:
parent
09958c9d87
commit
f4bc04958d
2 changed files with 50 additions and 3 deletions
|
|
@ -787,8 +787,17 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
|
|
||||||
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
# Find all referenced BQ remote-mode rows (bare-name + direct bq.path).
|
||||||
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`.
|
# Mirrors the non-RBAC parts of `_bq_guardrail_inputs`. Issue #201:
|
||||||
|
# bare-name regex must run against a backtick-masked copy so a
|
||||||
|
# registered name like ``orders`` doesn't false-positive when it
|
||||||
|
# appears as the table segment of a user-supplied full backtick path
|
||||||
|
# like ``\\`<project>.<dataset>.orders\\```. Without masking, the
|
||||||
|
# cross-source check below would falsely conclude the SQL touches
|
||||||
|
# both BQ-remote and local sources, dropping every backtick-path
|
||||||
|
# query into the 50-100× slower ATTACH-catalog fallback. Devin
|
||||||
|
# Review on PR #208.
|
||||||
sql_lower = user_sql.lower()
|
sql_lower = user_sql.lower()
|
||||||
|
sql_lower_masked = _mask_backticks(sql_lower)
|
||||||
name_lookups: list = []
|
name_lookups: list = []
|
||||||
seen_paths: set = set()
|
seen_paths: set = set()
|
||||||
|
|
||||||
|
|
@ -827,7 +836,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
# mix rewritten and non-rewritten BQ paths in one query.
|
# mix rewritten and non-rewritten BQ paths in one query.
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
pattern = r'\b' + re.escape(str(name).lower()) + r'\b'
|
||||||
if re.search(pattern, sql_lower):
|
if re.search(pattern, sql_lower_masked):
|
||||||
key = (bucket.lower(), source_table.lower())
|
key = (bucket.lower(), source_table.lower())
|
||||||
if key not in seen_paths:
|
if key not in seen_paths:
|
||||||
seen_paths.add(key)
|
seen_paths.add(key)
|
||||||
|
|
@ -864,7 +873,7 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
# Same name registered both BQ-remote and local? Pathological;
|
# Same name registered both BQ-remote and local? Pathological;
|
||||||
# skip as a safety measure.
|
# skip as a safety measure.
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower):
|
if re.search(r'\b' + re.escape(name_lc) + r'\b', sql_lower_masked):
|
||||||
logger.info(
|
logger.info(
|
||||||
"rewrite_skip_cross_source: user SQL references both "
|
"rewrite_skip_cross_source: user SQL references both "
|
||||||
"BQ-remote and local-mode tables; falling back to "
|
"BQ-remote and local-mode tables; falling back to "
|
||||||
|
|
|
||||||
|
|
@ -232,6 +232,44 @@ def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
|
||||||
assert rewritten == user_sql # untouched
|
assert rewritten == user_sql # untouched
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_name_inside_backtick_path_does_not_trip_cross_source(
|
||||||
|
seeded_registry, monkeypatch,
|
||||||
|
):
|
||||||
|
"""Devin Review on PR #208 (issue #201 follow-up): a registered
|
||||||
|
LOCAL-mode table name appearing as a segment of a user-supplied full
|
||||||
|
backtick BQ path must NOT trip the cross-source guard. Pre-fix the
|
||||||
|
bare-name regex at the cross-source check ran against unmasked
|
||||||
|
sql_lower, so ``\\`test-prj.dataset.orders\\``` would match registered
|
||||||
|
local ``orders`` inside the backticks and force the wrapper to bail
|
||||||
|
to the ATTACH-catalog slow path (50-100× slower). Post-fix the
|
||||||
|
regex runs against the backtick-masked copy, the cross-source check
|
||||||
|
correctly sees only BQ refs, and the wrap proceeds.
|
||||||
|
"""
|
||||||
|
from app.api.query import _rewrite_user_sql_for_bigquery_query
|
||||||
|
_register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
|
||||||
|
bucket="fin", source_table="ue")
|
||||||
|
_register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
|
||||||
|
_set_bq_project(monkeypatch, "test-prj")
|
||||||
|
|
||||||
|
user_sql = (
|
||||||
|
"SELECT u.id "
|
||||||
|
"FROM ue u "
|
||||||
|
"JOIN `test-prj.dataset.orders` o ON u.x = o.x "
|
||||||
|
"WHERE o.value > 0"
|
||||||
|
)
|
||||||
|
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
|
||||||
|
user_sql, seeded_registry,
|
||||||
|
)
|
||||||
|
# Must wrap — both refs are BQ; the local `orders` registration is
|
||||||
|
# irrelevant to a query that touches only BQ paths.
|
||||||
|
assert did_rewrite is True
|
||||||
|
assert "bigquery_query(" in rewritten
|
||||||
|
# The user's backtick path is preserved verbatim inside the wrapped
|
||||||
|
# inner SQL (Layer 1 split-on-backticks behaviour), so the original
|
||||||
|
# `test-prj.dataset.orders` reference survives.
|
||||||
|
assert "test-prj.dataset.orders" in rewritten
|
||||||
|
|
||||||
|
|
||||||
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
|
||||||
"""User SQL referencing only local-source tables → no rewrite,
|
"""User SQL referencing only local-source tables → no rewrite,
|
||||||
no log spam, original SQL returned."""
|
no log spam, original SQL returned."""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue