fix: Devin Review #1 — bigquery_query() first arg uses billing project, not data

In cross-project BQ setups (where billing != data), the SA typically has
serviceusage.services.use on the billing project but not on the data
project. The rewriter passed bq.projects.data as the first arg to
bigquery_query(), which BQ uses as the execution + billing project →
403 USER_PROJECT_DENIED.

Match the convention used everywhere else in the codebase
(app/api/v2_scan.py, app/api/v2_sample.py, app/api/v2_schema.py,
connectors/bigquery/extractor.py): backtick paths inside the inner SQL
use the **data** project (resolves the actual table location), the
bigquery_query() first arg uses the **billing** project (decides who
pays + which project the job runs under). For single-project deploys
the two are identical so the fix is a no-op there.

Test pins the cross-project case: data-prj for backticks, billing-prj
for the bigquery_query() first arg.
This commit is contained in:
ZdenekSrotyr 2026-05-06 14:07:38 +02:00
parent 77d88014df
commit 81d065b1ea
2 changed files with 59 additions and 12 deletions

View file

@ -763,17 +763,27 @@ def _rewrite_user_sql_for_bigquery_query(
# Skip 4: BQ project not configured. # Skip 4: BQ project not configured.
try: try:
bq = get_bq_access() bq = get_bq_access()
project = bq.projects.data data_project = bq.projects.data
# The first arg to `bigquery_query()` is the **execution / billing**
# project — the project under which the BQ job runs and is billed.
# In cross-project deployments the SA may only have
# `serviceusage.services.use` on the billing project, so passing
# the data project there returns 403 USER_PROJECT_DENIED. Match
# the convention used everywhere else in the codebase (v2_scan /
# v2_sample / v2_schema / extractor): backtick paths use the
# **data** project, `bigquery_query()` first-arg uses the
# **billing** project. For single-project deploys the two are
# identical so the fix is a no-op there.
billing_project = bq.projects.billing or data_project
except Exception: except Exception:
return user_sql, False return user_sql, False
if not project: if not data_project:
return user_sql, False return user_sql, False
# Rewrite identifiers, then wrap the whole thing in bigquery_query(). # Rewrite identifiers using the DATA project — backtick paths
# The DuckDB BQ extension's UDF expects (<billing-project-string>, # `<data-project>.<dataset>.<table>` resolve to the same logical
# <inner-sql-string>); we use the data project so the inner SQL's # source no matter which project bills the query.
# backticked paths resolve to the same project. inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, data_project)
inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, project)
# Embed the inner SQL using DuckDB's dollar-quoted string literal form # Embed the inner SQL using DuckDB's dollar-quoted string literal form
# (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses # (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses
@ -790,11 +800,11 @@ def _rewrite_user_sql_for_bigquery_query(
if DOLLAR_TAG in inner_sql: if DOLLAR_TAG in inner_sql:
escaped_inner = inner_sql.replace("'", "''") escaped_inner = inner_sql.replace("'", "''")
rewritten = ( rewritten = (
f"SELECT * FROM bigquery_query('{project}', '{escaped_inner}')" f"SELECT * FROM bigquery_query('{billing_project}', '{escaped_inner}')"
) )
else: else:
rewritten = ( rewritten = (
f"SELECT * FROM bigquery_query('{project}', " f"SELECT * FROM bigquery_query('{billing_project}', "
f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})" f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})"
) )
return rewritten, True return rewritten, True

View file

@ -67,11 +67,16 @@ def _register_local(conn, *, table_id, name, source_type="keboola"):
) )
def _set_bq_project(monkeypatch, project="test-prj"): def _set_bq_project(monkeypatch, project="test-prj", billing=None):
"""Stub get_bq_access so the rewriter sees a real-looking project ID.""" """Stub get_bq_access so the rewriter sees a real-looking project ID.
`project` configures the data project (used in backtick paths).
`billing` (when provided) configures a different billing project so
cross-project deployments can be exercised; defaults to `project`
for the single-project case."""
from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access
bq = BqAccess( bq = BqAccess(
BqProjects(billing=project, data=project), BqProjects(billing=billing or project, data=project),
client_factory=lambda projects: object(), client_factory=lambda projects: object(),
) )
monkeypatch.setattr( monkeypatch.setattr(
@ -546,6 +551,38 @@ def test_endpoint_falls_back_to_original_sql_on_bq_parse_error(
assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue" assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue"
def test_rewriter_uses_billing_project_for_bigquery_query_first_arg(
seeded_registry, monkeypatch,
):
"""Devin-review BUG #1: `bigquery_query()` first arg is the
**billing** project (where BQ jobs are billed + executed), backtick
paths use the **data** project. In cross-project deploys the SA
has `serviceusage.services.use` only on the billing project, so
using the data project as billing 403 USER_PROJECT_DENIED.
Match the existing convention in v2_scan / v2_sample / v2_schema /
extractor.
"""
from app.api.query import _rewrite_user_sql_for_bigquery_query
_register_bq_remote(
seeded_registry, table_id="bq.fin.ue", name="ue",
bucket="fin", source_table="ue",
)
_set_bq_project(monkeypatch, project="data-prj", billing="billing-prj")
rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
"SELECT count(*) FROM ue",
seeded_registry,
)
assert did_rewrite is True
# First arg of bigquery_query must be the billing project.
assert "bigquery_query('billing-prj'" in rewritten
# Backtick path must use the data project.
assert "`data-prj.fin.ue`" in rewritten
# And the data project must NOT appear as the first arg.
assert "bigquery_query('data-prj'" not in rewritten
def test_rewriter_skips_when_bq_row_bucket_contains_dot( def test_rewriter_skips_when_bq_row_bucket_contains_dot(
seeded_registry, monkeypatch, seeded_registry, monkeypatch,
): ):