From 81d065b1ea7a26d387ab79b357c08c5506b629dc Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Wed, 6 May 2026 14:07:38 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Devin=20Review=20#1=20=E2=80=94=20bigque?= =?UTF-8?q?ry=5Fquery()=20first=20arg=20uses=20billing=20project,=20not=20?= =?UTF-8?q?data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In cross-project BQ setups (where billing != data), the SA typically has serviceusage.services.use on the billing project but not on the data project. The rewriter passed bq.projects.data as the first arg to bigquery_query(), which BQ uses as the execution + billing project → 403 USER_PROJECT_DENIED. Match the convention used everywhere else in the codebase (app/api/v2_scan.py, app/api/v2_sample.py, app/api/v2_schema.py, connectors/bigquery/extractor.py): backtick paths inside the inner SQL use the **data** project (resolves the actual table location), the bigquery_query() first arg uses the **billing** project (decides who pays + which project the job runs under). For single-project deploys the two are identical so the fix is a no-op there. Test pins the cross-project case: data-prj for backticks, billing-prj for the bigquery_query() first arg. --- app/api/query.py | 28 ++++++++++++------- tests/test_query_remote_rewrite.py | 43 +++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/app/api/query.py b/app/api/query.py index 207c27f..6dbbadd 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -763,17 +763,27 @@ def _rewrite_user_sql_for_bigquery_query( # Skip 4: BQ project not configured. try: bq = get_bq_access() - project = bq.projects.data + data_project = bq.projects.data + # The first arg to `bigquery_query()` is the **execution / billing** + # project — the project under which the BQ job runs and is billed. + # In cross-project deployments the SA may only have + # `serviceusage.services.use` on the billing project, so passing + # the data project there returns 403 USER_PROJECT_DENIED. Match + # the convention used everywhere else in the codebase (v2_scan / + # v2_sample / v2_schema / extractor): backtick paths use the + # **data** project, `bigquery_query()` first-arg uses the + # **billing** project. For single-project deploys the two are + # identical so the fix is a no-op there. + billing_project = bq.projects.billing or data_project except Exception: return user_sql, False - if not project: + if not data_project: return user_sql, False - # Rewrite identifiers, then wrap the whole thing in bigquery_query(). - # The DuckDB BQ extension's UDF expects (, - # ); we use the data project so the inner SQL's - # backticked paths resolve to the same project. - inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, project) + # Rewrite identifiers using the DATA project — backtick paths + # `..` resolve to the same logical + # source no matter which project bills the query. + inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, data_project) # Embed the inner SQL using DuckDB's dollar-quoted string literal form # (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses @@ -790,11 +800,11 @@ def _rewrite_user_sql_for_bigquery_query( if DOLLAR_TAG in inner_sql: escaped_inner = inner_sql.replace("'", "''") rewritten = ( - f"SELECT * FROM bigquery_query('{project}', '{escaped_inner}')" + f"SELECT * FROM bigquery_query('{billing_project}', '{escaped_inner}')" ) else: rewritten = ( - f"SELECT * FROM bigquery_query('{project}', " + f"SELECT * FROM bigquery_query('{billing_project}', " f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})" ) return rewritten, True diff --git a/tests/test_query_remote_rewrite.py b/tests/test_query_remote_rewrite.py index 224b2e1..bc84c51 100644 --- a/tests/test_query_remote_rewrite.py +++ b/tests/test_query_remote_rewrite.py @@ -67,11 +67,16 @@ def _register_local(conn, *, table_id, name, source_type="keboola"): ) -def _set_bq_project(monkeypatch, project="test-prj"): - """Stub get_bq_access so the rewriter sees a real-looking project ID.""" +def _set_bq_project(monkeypatch, project="test-prj", billing=None): + """Stub get_bq_access so the rewriter sees a real-looking project ID. + + `project` configures the data project (used in backtick paths). + `billing` (when provided) configures a different billing project so + cross-project deployments can be exercised; defaults to `project` + for the single-project case.""" from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access bq = BqAccess( - BqProjects(billing=project, data=project), + BqProjects(billing=billing or project, data=project), client_factory=lambda projects: object(), ) monkeypatch.setattr( @@ -546,6 +551,38 @@ def test_endpoint_falls_back_to_original_sql_on_bq_parse_error( assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue" +def test_rewriter_uses_billing_project_for_bigquery_query_first_arg( + seeded_registry, monkeypatch, +): + """Devin-review BUG #1: `bigquery_query()` first arg is the + **billing** project (where BQ jobs are billed + executed), backtick + paths use the **data** project. In cross-project deploys the SA + has `serviceusage.services.use` only on the billing project, so + using the data project as billing → 403 USER_PROJECT_DENIED. + + Match the existing convention in v2_scan / v2_sample / v2_schema / + extractor. + """ + from app.api.query import _rewrite_user_sql_for_bigquery_query + _register_bq_remote( + seeded_registry, table_id="bq.fin.ue", name="ue", + bucket="fin", source_table="ue", + ) + _set_bq_project(monkeypatch, project="data-prj", billing="billing-prj") + + rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query( + "SELECT count(*) FROM ue", + seeded_registry, + ) + assert did_rewrite is True + # First arg of bigquery_query must be the billing project. + assert "bigquery_query('billing-prj'" in rewritten + # Backtick path must use the data project. + assert "`data-prj.fin.ue`" in rewritten + # And the data project must NOT appear as the first arg. + assert "bigquery_query('data-prj'" not in rewritten + + def test_rewriter_skips_when_bq_row_bucket_contains_dot( seeded_registry, monkeypatch, ):