diff --git a/app/api/query.py b/app/api/query.py index c5ec7a2..121cceb 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -33,23 +33,29 @@ router = APIRouter(prefix="/api/query", tags=["query"]) # Heuristic: did the BQ-side execution of a `bigquery_query()`-rewritten -# query reject the inner SQL? Errors we want to fall back on are upstream -# parse / validation errors — DuckDB-only syntax that survives identifier -# rewrite (`::INT` casts, `STRPTIME`, COALESCE arity differences) is -# accepted by the wrapping DuckDB layer but BQ refuses with a parse -# message that DuckDB surfaces back as a BinderException / RuntimeError -# with the BQ message embedded. Forbidden / quota / network errors should -# NOT trigger fallback (they would just fail again on the legacy path). +# query reject the inner SQL because of a **DuckDB-vs-BQ dialect mismatch** +# specifically? We want to fall back ONLY on cases where the same SQL +# would have worked under the legacy DuckDB ATTACH-catalog path — +# DuckDB-only syntax (``::INT`` casts, ``STRPTIME``, COALESCE arity quirks) +# that BQ's parser rejects. +# +# We DO NOT want to fall back on user-data errors that BQ would reject in +# either path (unknown column name, wrong function signature, invalid cast +# of literal user input). For those, the legacy ATTACH path would issue +# the same query and fail the same way — just 50-100× slower. Triggering +# fallback there is a 2× latency tax on every typo (devil's-advocate R1 +# finding #2). +# +# Conservative pattern set: only ``Syntax error`` covers genuine +# parse-level dialect mismatch. ``Unrecognized name`` etc. surface for +# both bad-user-column AND DuckDB-only-name cases — the safe assumption +# is that user-column-typo is the more common case, so we don't fall +# back. If a deployment surfaces a real DuckDB-only-name regression, +# it's better caught as a BinderException with the original SQL in the +# logs than amplified via slow-path retry. _BQ_REWRITE_PARSE_ERROR_PATTERNS = ( "Syntax error", "syntax error", - "Unrecognized name", - "unrecognized name", - "Function not found", - "function not found", - "No matching signature", - "Invalid cast", - "invalid cast", ) @@ -677,6 +683,17 @@ def _rewrite_user_sql_for_bigquery_query( # surface anything user-visible. return user_sql, False + # Multi-project guard (devil's-advocate R1 finding #5): the rewriter + # assumes every BQ-remote table resolves under the single + # `bq.projects.data` project. The current registry schema doesn't + # store `source_project` per row, so `bucket` is the only place a + # cross-project leak could hide. A bucket containing `.` (e.g. + # `other_prj.dataset`) suggests the operator encoded a project + # prefix into the bucket name — wrapping that under our single + # project would silently target the wrong project. Conservative + # skip: any BQ row whose bucket contains `.` aborts the rewrite, + # falling through to the legacy ATTACH-catalog path which uses + # whatever resolution the operator's _remote_attach configured. for r in bq_rows: if (r.get("query_mode") or "") != "remote": continue @@ -685,6 +702,11 @@ def _rewrite_user_sql_for_bigquery_query( name = r.get("name") if not (bucket and source_table and name): continue + if "." in str(bucket): + # Project-qualified bucket — can't safely wrap under our + # single-project assumption. Bail out completely so we don't + # mix rewritten and non-rewritten BQ paths in one query. + return user_sql, False pattern = r'\b' + re.escape(str(name).lower()) + r'\b' if re.search(pattern, sql_lower): key = (bucket.lower(), source_table.lower()) diff --git a/cli/client.py b/cli/client.py index d8ff1af..d791e8c 100644 --- a/cli/client.py +++ b/cli/client.py @@ -322,14 +322,55 @@ def _probe_range_support(client: httpx.Client, path: str) -> tuple[int, bool]: Never raises; transport errors during the probe are treated as "no chunking, try the GET instead and let it surface the failure in the normal retry loop". + + Probe order: HEAD first (cheap, idempotent), then GET-with-tiny-range + fallback. The HEAD path covers Caddy's `file_server` (which advertises + HEAD) and Caddy's `reverse_proxy` (which forwards HEAD upstream). The + GET-fallback covers the dev `docker compose up` deployment where + requests go straight to FastAPI's GET-only `/api/data/{tid}/download` + route — FastAPI returns **405 Method Not Allowed** to a HEAD on a + GET-only route, which without this fallback would silently disable + chunked download for every dev / non-TLS install. The GET-with-Range + probe asks for 1 byte so the server response is bounded; we discard + the body and read only the headers + status code. """ try: resp = client.head(path) - if getattr(resp, "status_code", 200) >= 400: - return (0, False) - size = int(resp.headers.get("content-length", "0") or 0) - accepts = (resp.headers.get("accept-ranges", "").lower() == "bytes") - return (size, accepts) + status = getattr(resp, "status_code", 200) + if status < 400: + size = int(resp.headers.get("content-length", "0") or 0) + accepts = (resp.headers.get("accept-ranges", "").lower() == "bytes") + if size > 0: + return (size, accepts) + # HEAD failed (405 from GET-only route is the common case in + # non-Caddy deployments) or returned 0-length — fall through to + # the tiny-Range GET probe. + except Exception: + pass + try: + with client.stream("GET", path, headers={"Range": "bytes=0-0"}) as resp: + status = getattr(resp, "status_code", 0) + if status not in (200, 206): + return (0, False) + # Drain the 1-byte body so the connection is reusable. + for _ in resp.iter_bytes(): + pass + # Content-Range on a 206 response carries the total: `bytes 0-0/12345`. + # On a 200 response the server didn't honor Range — content-length is the total. + if status == 206: + cr = resp.headers.get("content-range", "") + if "/" in cr: + try: + total = int(cr.rsplit("/", 1)[1]) + return (total, True) + except ValueError: + return (0, False) + return (0, False) + # status == 200 → server ignored Range; we can read content-length but + # accept-ranges is False (or missing) so the caller will not chunk. + size = int(resp.headers.get("content-length", "0") or 0) + accepts = (resp.headers.get("accept-ranges", "").lower() == "bytes") + return (size, accepts) except Exception: return (0, False) @@ -552,6 +593,22 @@ def _stream_download_via( if parallelism > 1: total_size, accepts_ranges = _probe_range_support(client, path) + # Sanity bound on the advertised total size (devil's-advocate R1 + # finding #4): a misconfigured proxy or buggy server returning a + # wildly inflated `Content-Length` would make us split into huge + # `Range: bytes=N-M` requests; the server then clamps each to actual + # bytes available, and we end up with overlapping bytes from the + # start of the file in every part → corrupt assembled output (caught + # later by manifest hash check, but only after wasted bandwidth). + # 100 GiB is the operational ceiling for any single materialized + # parquet on a typical Agnes deployment; values above suggest a + # server / proxy bug rather than a legitimate huge file. Drop to + # single-stream (which can't be confused by overlapping chunks). + SANE_MAX_TOTAL = 100 * 1024**3 # 100 GiB + if total_size > SANE_MAX_TOTAL: + total_size = 0 + accepts_ranges = False + use_chunked = ( parallelism > 1 and accepts_ranges diff --git a/connectors/bigquery/access.py b/connectors/bigquery/access.py index 5dee173..b26a2d1 100644 --- a/connectors/bigquery/access.py +++ b/connectors/bigquery/access.py @@ -435,6 +435,22 @@ def _default_duckdb_session_factory(projects: BqProjects): except Exception: pass continue + # Re-apply session settings (`bq_query_timeout_ms`, …) on + # every reuse so an operator's `/admin/server-config` change + # propagates to pooled entries without requiring container + # restart. Without this, a long-lived pool entry keeps the + # value baked in at first build forever (devil's-advocate + # R1 finding #3). `apply_bq_session_settings` is idempotent + # and fail-soft — re-running on every acquire is cheap. + try: + apply_bq_session_settings(entry) + except Exception: + # apply_bq_session_settings is documented as never + # raising for legitimate "extension doesn't recognise + # setting" cases (it only logs). Defensive guard for + # any unforeseen failure mode — keep the entry, the + # caller's actual query may still succeed. + pass conn = entry break diff --git a/tests/test_query_remote_rewrite.py b/tests/test_query_remote_rewrite.py index 9147a32..224b2e1 100644 --- a/tests/test_query_remote_rewrite.py +++ b/tests/test_query_remote_rewrite.py @@ -546,6 +546,85 @@ def test_endpoint_falls_back_to_original_sql_on_bq_parse_error( assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue" +def test_rewriter_skips_when_bq_row_bucket_contains_dot( + seeded_registry, monkeypatch, +): + """Devil's-advocate R1 finding #5: a BQ row whose `bucket` contains + `.` suggests the operator encoded a project prefix in the bucket + name. Wrapping under our single-project assumption could silently + target the wrong project. Rewriter must skip in that case (fall + through to ATTACH-catalog path which respects the operator's + `_remote_attach` configuration). + """ + from app.api.query import _rewrite_user_sql_for_bigquery_query + _register_bq_remote( + seeded_registry, + table_id="bq.other-prj.dataset.ue", + name="ue", + # Project-qualified bucket — the multi-project red flag. + bucket="other-prj.dataset", + source_table="ue", + ) + _set_bq_project(monkeypatch, "test-prj") + + rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query( + "SELECT count(*) FROM ue", + seeded_registry, + ) + # Skip — original SQL returned, no rewrite. + assert did_rewrite is False + assert rewritten == "SELECT count(*) FROM ue" + + +def test_fallback_does_not_trigger_on_user_column_typo( + seeded_app, stub_bq_for_endpoint, monkeypatch, +): + """Devil's-advocate R1 finding #2: previously the fallback + heuristic matched `Unrecognized name`, which BQ surfaces for both + DuckDB-only-name AND user-column-typo cases. The user-typo case + triggered re-running the original SQL through the slow ATTACH- + catalog path (90+ s) → 2× latency tax on every typo. + + Post-fix: heuristic only matches `Syntax error`. A BQ-side + `Unrecognized name: bad_col` should propagate as-is, NOT trigger + a fallback retry. + """ + _register_bq_remote_row("ue", "fin", "ue") + + calls = {"sqls": []} + + class _StubAnalytics: + description = [("c0",)] + def execute(self, sql, *args, **kwargs): + calls["sqls"].append(sql) + raise RuntimeError( + "BinderException: Query execution failed: " + "Unrecognized name: bad_col at [1:8]" + ) + def close(self): + pass + + monkeypatch.setattr( + "app.api.query.get_analytics_db_readonly", + lambda: _StubAnalytics(), + raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={"sql": "SELECT bad_col FROM ue"}, + headers=_auth(token), + ) + # Error propagates; fallback NOT triggered (only one execute call). + assert r.status_code in (400, 500, 502) + assert len(calls["sqls"]) == 1, ( + "user column typo must NOT trigger fallback retry" + ) + assert "bigquery_query(" in calls["sqls"][0] + + def test_endpoint_does_not_fall_back_on_non_parse_errors( seeded_app, stub_bq_for_endpoint, monkeypatch, ):