sec(query): #160 BQ_PATH catches quoted "bq" catalog token (Phase 3 review)

Phase 3 review identified an RBAC + cost-cap bypass: `SELECT * FROM
"bq"."ds"."tbl"` (catalog token quoted as a DuckDB identifier) was NOT
matched by the BQ_PATH regex, so direct quoted-form references skipped
both the registry check and the cost-cap dry-run. DuckDB resolves
`"bq"` to the same ATTACHed BQ catalog, so the bypass is real.

Widen the catalog-token alternation: `(?:"bq"|bq)` matches both forms.
Negative lookbehind `(?<![\w.])` still rejects look-alike prefixes
(`other_bq`, `my_bq`); the new "my_bq".ds.tbl negative test locks that
in alongside `other_bq.ds.tbl`.

Tests:
- 2 new positive cases in tests/test_query_bq_regex.py for the quoted
  form (`"bq"."finance"."ue"` and uppercase `"BQ"."ds"."tbl"`).
- 1 new negative case rejecting `"my_bq".ds.tbl` so the quoted-form
  widening doesn't open a different evasion.
- 1 new RBAC test in tests/test_api_query_rbac_bq_path.py: admin
  hitting an unregistered quoted path returns the same
  bq_path_not_registered 403 as the unquoted form.

All 33 Phase 3 tests pass after the fix.
This commit is contained in:
ZdenekSrotyr 2026-05-03 19:35:14 +02:00
parent eddb0d2c58
commit 77cdb65f76
3 changed files with 30 additions and 4 deletions

View file

@ -31,12 +31,16 @@ logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/query", tags=["query"]) router = APIRouter(prefix="/api/query", tags=["query"])
# Issue #160 §4.3.1 — direct `bq.<dataset>.<source_table>` references in user # Issue #160 §4.3.1 — direct `bq.<dataset>.<source_table>` references in user
# SQL. Matches all 16 cases verified empirically (fully-quoted, unquoted, # SQL. Catalog token accepts both `bq` (the unquoted DuckDB-style name) and
# mixed quoting, case-insensitive, inside CTE bodies, multiple in one stmt). # `"bq"` (quoted identifier). DuckDB resolves both to the same ATTACHed
# catalog, so the security-boundary regex must accept both — Phase 3 review
# caught the quoted variant as an RBAC + cost-cap bypass.
# Lookahead `(?=\W|$)` works where `\b` doesn't (after a closing quote). # Lookahead `(?=\W|$)` works where `\b` doesn't (after a closing quote).
# Negative lookbehind `(?<![\w.])` rejects `other_bq.x.y` and `x.bq.y.z`. # Negative lookbehind `(?<![\w.])` rejects `other_bq.x.y`, `my_bq.ds.tbl`,
# and `x.bq.y.z` so the regex doesn't fire on column qualifiers or
# look-alike-prefixed identifiers.
BQ_PATH = re.compile( BQ_PATH = re.compile(
r'(?<![\w.])bq\s*\.\s*("[^"]+"|\w+)\s*\.\s*("[^"]+"|\w+)(?=\W|$)', r'(?<![\w.])(?:"bq"|bq)\s*\.\s*("[^"]+"|\w+)\s*\.\s*("[^"]+"|\w+)(?=\W|$)',
re.IGNORECASE, re.IGNORECASE,
) )

View file

@ -36,6 +36,24 @@ def _register_bq_remote_row(name: str, bucket: str, source_table: str) -> None:
sys_conn.close() sys_conn.close()
def test_quoted_bq_catalog_token_rejected_403(seeded_app):
"""Phase 3 review evasion: `SELECT * FROM "bq"."ds"."tbl"` (catalog
token quoted) must be caught by the same RBAC check as the unquoted
form. DuckDB resolves `"bq"` to the same ATTACHed BQ catalog, so the
quoted variant is a real bypass we have to close."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={"sql": 'SELECT * FROM "bq"."secret_ds"."secret_tbl"'},
headers=_auth(token),
)
assert r.status_code == 403, r.json()
detail = r.json().get("detail", {})
if isinstance(detail, dict):
assert detail.get("reason") == "bq_path_not_registered", detail
def test_unregistered_bq_path_rejected_403(seeded_app): def test_unregistered_bq_path_rejected_403(seeded_app):
"""Direct reference to a `bq.<ds>.<tbl>` that no registry row points at: """Direct reference to a `bq.<ds>.<tbl>` that no registry row points at:
403 with `bq_path_not_registered`. Caller has admin token (no per-name 403 with `bq_path_not_registered`. Caller has admin token (no per-name

View file

@ -25,6 +25,9 @@ def regex():
('SELECT * FROM bq.finance."ue"', ('finance', '"ue"')), ('SELECT * FROM bq.finance."ue"', ('finance', '"ue"')),
# Case-insensitive # Case-insensitive
('select * from BQ.Finance.UE', ('Finance', 'UE')), ('select * from BQ.Finance.UE', ('Finance', 'UE')),
# Quoted catalog token "bq" — Phase 3 review evasion: must be caught
('SELECT * FROM "bq"."finance"."ue"', ('"finance"', '"ue"')),
('SELECT * FROM "BQ"."ds"."tbl"', ('"ds"', '"tbl"')),
# With WHERE # With WHERE
('SELECT a FROM bq.ds.tbl WHERE x=1', ('ds', 'tbl')), ('SELECT a FROM bq.ds.tbl WHERE x=1', ('ds', 'tbl')),
# Inside CTE body # Inside CTE body
@ -55,6 +58,7 @@ def test_regex_finds_multiple_paths_in_one_statement(regex):
('SELECT bq.col FROM tbl', '2-part bq.col is column qualifier, not catalog'), ('SELECT bq.col FROM tbl', '2-part bq.col is column qualifier, not catalog'),
('SELECT count(*) FROM unit_economics', 'aggregate on bare name'), ('SELECT count(*) FROM unit_economics', 'aggregate on bare name'),
('SELECT * FROM other_bq.ds.tbl', 'prefix that contains bq'), ('SELECT * FROM other_bq.ds.tbl', 'prefix that contains bq'),
('SELECT * FROM "my_bq".ds.tbl', 'quoted prefix that contains bq'),
('SELECT * FROM x.bq.ds.tbl', 'bq is middle component, not catalog'), ('SELECT * FROM x.bq.ds.tbl', 'bq is middle component, not catalog'),
]) ])
def test_regex_rejects_non_bq_paths(regex, sql, reason): def test_regex_rejects_non_bq_paths(regex, sql, reason):