diff --git a/app/api/query.py b/app/api/query.py index 207c27f..6dbbadd 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -763,17 +763,27 @@ def _rewrite_user_sql_for_bigquery_query( # Skip 4: BQ project not configured. try: bq = get_bq_access() - project = bq.projects.data + data_project = bq.projects.data + # The first arg to `bigquery_query()` is the **execution / billing** + # project — the project under which the BQ job runs and is billed. + # In cross-project deployments the SA may only have + # `serviceusage.services.use` on the billing project, so passing + # the data project there returns 403 USER_PROJECT_DENIED. Match + # the convention used everywhere else in the codebase (v2_scan / + # v2_sample / v2_schema / extractor): backtick paths use the + # **data** project, `bigquery_query()` first-arg uses the + # **billing** project. For single-project deploys the two are + # identical so the fix is a no-op there. + billing_project = bq.projects.billing or data_project except Exception: return user_sql, False - if not project: + if not data_project: return user_sql, False - # Rewrite identifiers, then wrap the whole thing in bigquery_query(). - # The DuckDB BQ extension's UDF expects (, - # ); we use the data project so the inner SQL's - # backticked paths resolve to the same project. - inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, project) + # Rewrite identifiers using the DATA project — backtick paths + # `..` resolve to the same logical + # source no matter which project bills the query. + inner_sql = _rewrite_bq_table_refs_to_native(user_sql, name_lookups, data_project) # Embed the inner SQL using DuckDB's dollar-quoted string literal form # (`$tag$ ... $tag$`). Naive `replace("'", "''")` doubling misses @@ -790,11 +800,11 @@ def _rewrite_user_sql_for_bigquery_query( if DOLLAR_TAG in inner_sql: escaped_inner = inner_sql.replace("'", "''") rewritten = ( - f"SELECT * FROM bigquery_query('{project}', '{escaped_inner}')" + f"SELECT * FROM bigquery_query('{billing_project}', '{escaped_inner}')" ) else: rewritten = ( - f"SELECT * FROM bigquery_query('{project}', " + f"SELECT * FROM bigquery_query('{billing_project}', " f"{DOLLAR_TAG}{inner_sql}{DOLLAR_TAG})" ) return rewritten, True diff --git a/tests/test_query_remote_rewrite.py b/tests/test_query_remote_rewrite.py index 224b2e1..bc84c51 100644 --- a/tests/test_query_remote_rewrite.py +++ b/tests/test_query_remote_rewrite.py @@ -67,11 +67,16 @@ def _register_local(conn, *, table_id, name, source_type="keboola"): ) -def _set_bq_project(monkeypatch, project="test-prj"): - """Stub get_bq_access so the rewriter sees a real-looking project ID.""" +def _set_bq_project(monkeypatch, project="test-prj", billing=None): + """Stub get_bq_access so the rewriter sees a real-looking project ID. + + `project` configures the data project (used in backtick paths). + `billing` (when provided) configures a different billing project so + cross-project deployments can be exercised; defaults to `project` + for the single-project case.""" from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access bq = BqAccess( - BqProjects(billing=project, data=project), + BqProjects(billing=billing or project, data=project), client_factory=lambda projects: object(), ) monkeypatch.setattr( @@ -546,6 +551,38 @@ def test_endpoint_falls_back_to_original_sql_on_bq_parse_error( assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue" +def test_rewriter_uses_billing_project_for_bigquery_query_first_arg( + seeded_registry, monkeypatch, +): + """Devin-review BUG #1: `bigquery_query()` first arg is the + **billing** project (where BQ jobs are billed + executed), backtick + paths use the **data** project. In cross-project deploys the SA + has `serviceusage.services.use` only on the billing project, so + using the data project as billing → 403 USER_PROJECT_DENIED. + + Match the existing convention in v2_scan / v2_sample / v2_schema / + extractor. + """ + from app.api.query import _rewrite_user_sql_for_bigquery_query + _register_bq_remote( + seeded_registry, table_id="bq.fin.ue", name="ue", + bucket="fin", source_table="ue", + ) + _set_bq_project(monkeypatch, project="data-prj", billing="billing-prj") + + rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query( + "SELECT count(*) FROM ue", + seeded_registry, + ) + assert did_rewrite is True + # First arg of bigquery_query must be the billing project. + assert "bigquery_query('billing-prj'" in rewritten + # Backtick path must use the data project. + assert "`data-prj.fin.ue`" in rewritten + # And the data project must NOT appear as the first arg. + assert "bigquery_query('data-prj'" not in rewritten + + def test_rewriter_skips_when_bq_row_bucket_contains_dot( seeded_registry, monkeypatch, ):