fix: Devin Review #1 — bigquery_query() first arg uses billing project, not data
In cross-project BQ setups (where billing != data), the SA typically has serviceusage.services.use on the billing project but not on the data project. The rewriter passed bq.projects.data as the first arg to bigquery_query(), which BQ uses as the execution + billing project → 403 USER_PROJECT_DENIED. Match the convention used everywhere else in the codebase (app/api/v2_scan.py, app/api/v2_sample.py, app/api/v2_schema.py, connectors/bigquery/extractor.py): backtick paths inside the inner SQL use the **data** project (resolves the actual table location), the bigquery_query() first arg uses the **billing** project (decides who pays + which project the job runs under). For single-project deploys the two are identical so the fix is a no-op there. Test pins the cross-project case: data-prj for backticks, billing-prj for the bigquery_query() first arg.
This commit is contained in:
parent
77d88014df
commit
81d065b1ea
2 changed files with 59 additions and 12 deletions
|
|
@ -763,17 +763,27 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
# Skip 4: BQ project not configured.
|
# Skip 4: BQ project not configured.
|
||||||
try:
|
try:
|
||||||
bq = get_bq_access()
|
bq = get_bq_access()
|
||||||
project = bq.projects.data
|
data_project = bq.projects.data
|
||||||
|
# The first arg to `bigquery_query()` is the **execution / billing**
|
||||||
|
# project — the project under which the BQ job runs and is billed.
|
||||||
|
# In cross-project deployments the SA may only have
|
||||||
|
# `serviceusage.services.use` on the billing project, so passing
|
||||||
|
# the data project there returns 403 USER_PROJECT_DENIED. Match
|
||||||
|
# the convention used everywhere else in the codebase (v2_scan /
|
||||||
|
# v2_sample / v2_schema / extractor): backtick paths use the
|
||||||
|
# **data** project, `bigquery_query()` first-arg uses the
|
||||||
|
# **billing** project. For single-project deploys the two are
|
||||||
|
# identical so the fix is a no-op there.
|
||||||
|
billing_project = bq.projects.billing or data_project
|
||||||
except Exception:
|
except Exception:
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
if not project:
|
if not data_project:
|
||||||
return user_sql, False
|
return user_sql, False
|
||||||
|
|
||||||
# Rewrite identifiers, then wrap the whole thing in bigquery_query().
|
# Rewrite identifiers using the DATA project — backtick paths
|
||||||
# The DuckDB BQ extension's UDF expects (<billing-project-string>,
|
# `<data-project>.<dataset>.<table>` resolve to the same logical
|
||||||
# <inner-sql-string>); we use the data project so the inner SQL's
|
# source no matter which project bills the query.
|
||||||
# backticked paths resolve to the same project.
|
inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, data_project)
|
||||||
inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, project)
|
|
||||||
|
|
||||||
# Embed the inner SQL using DuckDB's dollar-quoted string literal form
|
# Embed the inner SQL using DuckDB's dollar-quoted string literal form
|
||||||
# (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses
|
# (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses
|
||||||
|
|
@ -790,11 +800,11 @@ def _rewrite_user_sql_for_bigquery_query(
|
||||||
if DOLLAR_TAG in inner_sql:
|
if DOLLAR_TAG in inner_sql:
|
||||||
escaped_inner = inner_sql.replace("'", "''")
|
escaped_inner = inner_sql.replace("'", "''")
|
||||||
rewritten = (
|
rewritten = (
|
||||||
f"SELECT * FROM bigquery_query('{project}', '{escaped_inner}')"
|
f"SELECT * FROM bigquery_query('{billing_project}', '{escaped_inner}')"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
rewritten = (
|
rewritten = (
|
||||||
f"SELECT * FROM bigquery_query('{project}', "
|
f"SELECT * FROM bigquery_query('{billing_project}', "
|
||||||
f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})"
|
f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})"
|
||||||
)
|
)
|
||||||
return rewritten, True
|
return rewritten, True
|
||||||
|
|
|
||||||
|
|
@ -67,11 +67,16 @@ def _register_local(conn, *, table_id, name, source_type="keboola"):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _set_bq_project(monkeypatch, project="test-prj"):
|
def _set_bq_project(monkeypatch, project="test-prj", billing=None):
|
||||||
"""Stub get_bq_access so the rewriter sees a real-looking project ID."""
|
"""Stub get_bq_access so the rewriter sees a real-looking project ID.
|
||||||
|
|
||||||
|
`project` configures the data project (used in backtick paths).
|
||||||
|
`billing` (when provided) configures a different billing project so
|
||||||
|
cross-project deployments can be exercised; defaults to `project`
|
||||||
|
for the single-project case."""
|
||||||
from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access
|
from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access
|
||||||
bq = BqAccess(
|
bq = BqAccess(
|
||||||
BqProjects(billing=project, data=project),
|
BqProjects(billing=billing or project, data=project),
|
||||||
client_factory=lambda projects: object(),
|
client_factory=lambda projects: object(),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|
@ -546,6 +551,38 @@ def test_endpoint_falls_back_to_original_sql_on_bq_parse_error(
|
||||||
assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue"
|
assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue"
|
||||||
|
|
||||||
|
|
||||||
|
def test_rewriter_uses_billing_project_for_bigquery_query_first_arg(
|
||||||
|
seeded_registry, monkeypatch,
|
||||||
|
):
|
||||||
|
"""Devin-review BUG #1: `bigquery_query()` first arg is the
|
||||||
|
**billing** project (where BQ jobs are billed + executed), backtick
|
||||||
|
paths use the **data** project. In cross-project deploys the SA
|
||||||
|
has `serviceusage.services.use` only on the billing project, so
|
||||||
|
using the data project as billing → 403 USER_PROJECT_DENIED.
|
||||||
|
|
||||||
|
Match the existing convention in v2_scan / v2_sample / v2_schema /
|
||||||
|
extractor.
|
||||||
|
"""
|
||||||
|
from app.api.query import _rewrite_user_sql_for_bigquery_query
|
||||||
|
_register_bq_remote(
|
||||||
|
seeded_registry, table_id="bq.fin.ue", name="ue",
|
||||||
|
bucket="fin", source_table="ue",
|
||||||
|
)
|
||||||
|
_set_bq_project(monkeypatch, project="data-prj", billing="billing-prj")
|
||||||
|
|
||||||
|
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
|
||||||
|
"SELECT count(*) FROM ue",
|
||||||
|
seeded_registry,
|
||||||
|
)
|
||||||
|
assert did_rewrite is True
|
||||||
|
# First arg of bigquery_query must be the billing project.
|
||||||
|
assert "bigquery_query('billing-prj'" in rewritten
|
||||||
|
# Backtick path must use the data project.
|
||||||
|
assert "`data-prj.fin.ue`" in rewritten
|
||||||
|
# And the data project must NOT appear as the first arg.
|
||||||
|
assert "bigquery_query('data-prj'" not in rewritten
|
||||||
|
|
||||||
|
|
||||||
def test_rewriter_skips_when_bq_row_bucket_contains_dot(
|
def test_rewriter_skips_when_bq_row_bucket_contains_dot(
|
||||||
seeded_registry, monkeypatch,
|
seeded_registry, monkeypatch,
|
||||||
):
|
):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue