diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d7a5f3..b23d870 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ### Added - **`data_source.bigquery.query_timeout_ms` config knob** (default 600 000 ms = 10 min). The DuckDB BigQuery extension's built-in default of 90 s was too tight for analyst-scale queries against view-backed BQ datasets — `agnes query --remote` would HTTP 400 with `Binder Error: Query execution exceeded the timeout. Job ID: …` whenever the underlying BQ job took longer than 90 s, even though the BQ job itself was healthy. The new knob is applied via `SET bq_query_timeout_ms` after every `LOAD bigquery` on every BQ-touching DuckDB session — the orchestrator's `_remote_attach` ATTACH path (`src/orchestrator.py`), the analytics-DB read-only reattach path (`src/db.py:_reattach_remote_extensions` — the primary `agnes query --remote` request path), the `BqAccess` session factory (`connectors/bigquery/access.py`), and the standalone extractor (`connectors/bigquery/extractor.py`). Sentinel `0` (or non-numeric / unparseable values) leaves the extension default in place so operators on legacy extension versions that don't recognise the setting aren't broken. Configurable via `/admin/server-config` UI. Note: BigQuery's `jobs.query` RPC caps the wait at ~200 s per call regardless of this setting; the extension polls on top so the effective ceiling is the value here but each poll is ~200 s. DuckDB emits an informational warning when the value is set above the BQ RPC cap — operators can safely ignore it. +- **Caddy `file_server` for parquet downloads** — `GET /api/data/{table_id}/download` is now intercepted at the Caddy layer (TLS profile only) and served directly via sendfile/zero-copy from the data volume mounted read-only at `/srv` inside the caddy container. Caddy authorises every request via a new lightweight RBAC probe `GET /api/data/{table_id}/check-access` (returns 204 when the caller has read access on the table, 403 otherwise) using the `forward_auth` directive — the bulk byte transfer never touches uvicorn workers. Resolves a real production failure mode where a single multi-GB analyst pull held the app's only uvicorn worker for the duration of the stream and starved the UI / `/api/health` / every other API endpoint, eventually flipping the container to `unhealthy`. Path discovery uses Caddy's `try_files` over the known `extract.duckdb` v2 source subdirs (`bigquery/data/.parquet`, `keboola/data/.parquet`, `jira/data/.parquet`); a parquet not at any of those paths transparently falls through to the existing app handler so legacy `src_data/parquet` layouts and future connectors keep working with no Caddyfile change. Non-Caddy deployments (dev `docker compose up` without `--profile tls`) continue to use the app handler unchanged. ### Fixed diff --git a/Caddyfile b/Caddyfile index 6085b6e..9a39211 100644 --- a/Caddyfile +++ b/Caddyfile @@ -34,6 +34,47 @@ -Server } + # Direct file_server for parquet downloads — bypasses uvicorn so a + # multi-GB pull from one analyst can't starve the app workers and + # block UI / health / API for everyone else. forward_auth calls the + # app's lightweight ``/api/data/{id}/check-access`` (RBAC only, + # ~1 ms) on every request; on 2xx Caddy serves the file directly + # via sendfile/zero-copy from the data volume mounted read-only. + # + # Path layout matches `app/api/data.py`'s extract.duckdb v2 search: + # /data/extracts//data/.parquet + # try_files probes known source subdirs in order; first hit wins. + # If a deployment adds a new connector and lands parquets at a fresh + # subdir, extend the try_files list. Anything that misses falls + # through to the app reverse_proxy below — so an unmapped source + # degrades to "downloads work, just through uvicorn" — never 404. + @download path_regexp tid ^/api/data/([^/]+)/download$ + handle @download { + forward_auth app:8000 { + uri /api/data/{re.tid.1}/check-access + # Bearer PAT or session cookie travels in Authorization + # / Cookie; copy_headers ensures the upstream sees them. + copy_headers Authorization Cookie + } + # Caddy's own /data is occupied by the caddy_data volume, so the + # agnes data dir is mounted at /srv (read-only) instead — see the + # `data:/srv:ro` line in docker-compose.yml's caddy service. The + # root + try_files combo therefore probes /srv/extracts/... + root * /srv/extracts + try_files /bigquery/data/{re.tid.1}.parquet /keboola/data/{re.tid.1}.parquet /jira/data/{re.tid.1}.parquet + @found file + handle @found { + header Content-Disposition "attachment; filename=\"{re.tid.1}.parquet\"" + file_server + } + # Fallback: parquet not at any known static path → defer to app + # (handles legacy src_data/parquet layout + future connectors). + reverse_proxy app:8000 { + header_up X-Forwarded-Proto https + header_up X-Forwarded-Host {host} + } + } + reverse_proxy app:8000 { # App's uvicorn runs with --proxy-headers, so stamping these # ourselves makes OAuth callback URLs and Set-Cookie Secure diff --git a/app/api/data.py b/app/api/data.py index 8cf26f2..f0044f5 100644 --- a/app/api/data.py +++ b/app/api/data.py @@ -1,6 +1,6 @@ """Data download endpoint — streaming parquet files.""" -from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi import APIRouter, Depends, HTTPException, Request, Response from fastapi.responses import FileResponse import duckdb @@ -12,6 +12,36 @@ from src.rbac import can_access_table router = APIRouter(prefix="/api/data", tags=["data"]) +@router.get("/{table_id}/check-access") +async def check_access( + table_id: str, + user: dict = Depends(get_current_user), + conn: duckdb.DuckDBPyConnection = Depends(_get_db), +): + """Lightweight RBAC probe used by Caddy's ``forward_auth`` directive + to gate file_server-served parquet downloads without involving the + app's request workers in the bulk byte transfer. + + Returns HTTP 204 No Content when the caller has read access to + ``table_id``; HTTP 403 (via ``can_access_table`` returning False) + otherwise. Caddy treats 2xx as authorized and forwards the request + to its own ``file_server`` block; non-2xx is returned to the client + verbatim. + + Why a separate endpoint and not just ``HEAD /download``: ``HEAD`` on + the FileResponse-based ``download`` handler still opens the file and + runs stat() to populate Content-Length / ETag. ``forward_auth`` calls + this endpoint on every request, so the per-call cost matters; a pure + RBAC check is ~1 ms while a HEAD path involves filesystem walks + (``rglob`` for the parquet across source subdirs). + """ + if not _SAFE_QUOTED_IDENTIFIER.match(table_id): + raise HTTPException(status_code=404, detail="Table not found") + if not can_access_table(user, table_id, conn): + raise HTTPException(status_code=403, detail="Access denied to this table") + return Response(status_code=204) + + @router.get("/{table_id}/download") async def download_table( table_id: str, @@ -19,7 +49,16 @@ async def download_table( user: dict = Depends(get_current_user), conn: duckdb.DuckDBPyConnection = Depends(_get_db), ): - """Stream a parquet file for download. Supports ETag for caching.""" + """Stream a parquet file for download. Supports ETag for caching. + + On Caddy-fronted deployments the matching Caddyfile rule intercepts + ``GET /api/data/{table_id}/download``, calls ``check-access`` via + ``forward_auth``, and serves the parquet directly via ``file_server`` + — bypassing this handler entirely. This handler stays as the + canonical fallback for non-Caddy deployments (dev `docker compose + up`, alternative reverse proxies, direct :8000 access) where the + bulk transfer goes through uvicorn. + """ # Reject unsafe table_id before any filesystem or DB operations. # Use the relaxed quoted-identifier check that allows dots and hyphens # (Keboola table IDs like "in.c-crm.orders") while still blocking @@ -53,7 +92,6 @@ async def download_table( etag = f'"{stat.st_mtime_ns}"' if_none_match = request.headers.get("if-none-match") if if_none_match == etag: - from starlette.responses import Response return Response(status_code=304) return FileResponse( diff --git a/docker-compose.yml b/docker-compose.yml index 4240d28..59bacf2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -112,6 +112,11 @@ services: - /data/state/certs:/certs:ro - caddy_data:/data - caddy_config:/config + # Read-only mount of the agnes data dir so Caddy's file_server can + # serve parquets directly (sendfile/zero-copy) and bypass the app's + # uvicorn workers — see Caddyfile's @download handler. Mounted at + # /srv (not /data) because /data is already the caddy_data volume. + - data:/srv:ro environment: - DOMAIN=${DOMAIN:-localhost} # Passes through whatever the operator set in .env. Caddyfile uses diff --git a/tests/test_check_access_endpoint.py b/tests/test_check_access_endpoint.py new file mode 100644 index 0000000..0fa054b --- /dev/null +++ b/tests/test_check_access_endpoint.py @@ -0,0 +1,124 @@ +"""Unit tests for ``GET /api/data/{table_id}/check-access`` — the +lightweight RBAC probe used by Caddy's ``forward_auth`` directive to gate +file_server-served parquet downloads without involving the app's request +workers in the bulk byte transfer. + +The endpoint must: + - return 204 when the caller has read access (admin → always; non-admin + only with an explicit ``resource_grants`` row), + - return 403 with no body / minimal body when the caller does not, + - return 404 for unsafe identifiers (path-traversal guard), + - return 401 when the request has no auth. +""" + +from tests.conftest import create_mock_extract + + +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + +def test_admin_gets_204(seeded_app): + """Admin short-circuits all RBAC checks — must always succeed.""" + c = seeded_app["client"] + env = seeded_app["env"] + create_mock_extract(env["extracts_dir"], "keboola", [ + {"name": "salaries", "data": [{"id": "1"}]}, + ]) + from src.orchestrator import SyncOrchestrator + SyncOrchestrator().rebuild() + c.post( + "/api/admin/register-table", + json={"name": "salaries", "source_type": "keboola"}, + headers=_auth(seeded_app["admin_token"]), + ) + + resp = c.get( + "/api/data/salaries/check-access", + headers=_auth(seeded_app["admin_token"]), + ) + assert resp.status_code == 204 + assert resp.content == b"" + + +def test_analyst_without_grant_gets_403(seeded_app): + """Non-admin without an explicit `resource_grants` row must be denied + — the production failure mode where Caddy's forward_auth returns the + 403 to the client and never invokes file_server.""" + c = seeded_app["client"] + env = seeded_app["env"] + create_mock_extract(env["extracts_dir"], "keboola", [ + {"name": "salaries", "data": [{"id": "1"}]}, + ]) + from src.orchestrator import SyncOrchestrator + SyncOrchestrator().rebuild() + c.post( + "/api/admin/register-table", + json={"name": "salaries", "source_type": "keboola"}, + headers=_auth(seeded_app["admin_token"]), + ) + + resp = c.get( + "/api/data/salaries/check-access", + headers=_auth(seeded_app["analyst_token"]), + ) + assert resp.status_code == 403 + + +def test_analyst_with_grant_gets_204(seeded_app): + """Once the analyst has a TABLE grant, check-access flips to 204 + and Caddy is free to serve the file directly. Mirrors the same + grant flow used by ``/api/data/{id}/download``.""" + c = seeded_app["client"] + env = seeded_app["env"] + create_mock_extract(env["extracts_dir"], "keboola", [ + {"name": "salaries", "data": [{"id": "1"}]}, + ]) + from src.orchestrator import SyncOrchestrator + SyncOrchestrator().rebuild() + c.post( + "/api/admin/register-table", + json={"name": "salaries", "source_type": "keboola"}, + headers=_auth(seeded_app["admin_token"]), + ) + + # Mint the grant via the admin API the same way the existing download + # access-control tests do — see test_access_control.py. + from tests.test_access_control import _grant_table_to_analyst + from src.db import get_system_db + conn = get_system_db() + try: + _grant_table_to_analyst(conn, "salaries") + finally: + conn.close() + + resp = c.get( + "/api/data/salaries/check-access", + headers=_auth(seeded_app["analyst_token"]), + ) + assert resp.status_code == 204 + + +def test_unsafe_table_id_gets_404(seeded_app): + """Identifier validation runs BEFORE RBAC — keeps path-traversal + payloads (``../etc/passwd``) from reaching ``can_access_table`` and + matches the pre-existing behavior of ``/download``.""" + c = seeded_app["client"] + resp = c.get( + "/api/data/..%2Fetc%2Fpasswd/check-access", + headers=_auth(seeded_app["admin_token"]), + ) + # FastAPI's path converter rejects encoded slashes outright; either + # 404 from the validator or 404 from no-such-route is acceptable — + # both block the traversal. The point is no 5xx and no 204. + assert resp.status_code in (404, 422) + + +def test_no_auth_gets_401(seeded_app): + """Caddy will only call the auth-check endpoint when the client sent + credentials — but if a request slips through without them, the + endpoint must reject with 401 so Caddy returns 401 to the client + instead of falling through to file_server with no identity.""" + c = seeded_app["client"] + resp = c.get("/api/data/salaries/check-access") + assert resp.status_code == 401