agnes-the-ai-analyst/app/api/data.py

"""Data download endpoint — streaming parquet files."""

from fastapi import APIRouter, Depends, HTTPException, Request, Response
from fastapi.responses import FileResponse
import duckdb

from app.auth.dependencies import get_current_user, _get_db
from app.utils import get_data_dir as _get_data_dir
from src.identifier_validation import _SAFE_QUOTED_IDENTIFIER
from src.rbac import can_access_table

router = APIRouter(prefix="/api/data", tags=["data"])


@router.get("/{table_id}/check-access")
async def check_access(
    table_id: str,
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Lightweight RBAC probe used by Caddy's ``forward_auth`` directive
    to gate file_server-served parquet downloads without involving the
    app's request workers in the bulk byte transfer.

    Returns HTTP 204 No Content when the caller has read access to
    ``table_id``; HTTP 403 (via ``can_access_table`` returning False)
    otherwise. Caddy treats 2xx as authorized and forwards the request
    to its own ``file_server`` block; non-2xx is returned to the client
    verbatim.

    Why a separate endpoint and not just ``HEAD /download``: ``HEAD`` on
    the FileResponse-based ``download`` handler still opens the file and
    runs stat() to populate Content-Length / ETag. ``forward_auth`` calls
    this endpoint on every request, so the per-call cost matters; a pure
    RBAC check is ~1 ms while a HEAD path involves filesystem walks
    (``rglob`` for the parquet across source subdirs).
    """
    if not _SAFE_QUOTED_IDENTIFIER.match(table_id):
        raise HTTPException(status_code=404, detail="Table not found")
    if not can_access_table(user, table_id, conn):
        raise HTTPException(status_code=403, detail="Access denied to this table")
    return Response(status_code=204)


@router.get("/{table_id}/download")
async def download_table(
    table_id: str,
    request: Request,
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Stream a parquet file for download. Supports ETag for caching.

    On Caddy-fronted deployments the matching Caddyfile rule intercepts
    ``GET /api/data/{table_id}/download``, calls ``check-access`` via
    ``forward_auth``, and serves the parquet directly via ``file_server``
    — bypassing this handler entirely. This handler stays as the
    canonical fallback for non-Caddy deployments (dev `docker compose
    up`, alternative reverse proxies, direct :8000 access) where the
    bulk transfer goes through uvicorn.
    """
    # Reject unsafe table_id before any filesystem or DB operations.
    # Use the relaxed quoted-identifier check that allows dots and hyphens
    # (Keboola table IDs like "in.c-crm.orders") while still blocking
    # path-traversal characters (/, .., \) and quote/control chars.
    if not _SAFE_QUOTED_IDENTIFIER.match(table_id):
        raise HTTPException(status_code=404, detail="Table not found")
    # Check access FIRST
    if not can_access_table(user, table_id, conn):
        raise HTTPException(status_code=403, detail="Access denied to this table")

    data_dir = _get_data_dir()

    # Search in extracts directory (v2 extract.duckdb architecture)
    extracts_dir = data_dir / "extracts"
    candidates = list(extracts_dir.rglob(f"data/{table_id}.parquet")) if extracts_dir.exists() else []

    # Fallback to legacy path for backward compatibility
    if not candidates:
        parquet_dir = data_dir / "src_data" / "parquet"
        candidates = list(parquet_dir.rglob(f"{table_id}.parquet"))
        if not candidates:
            candidates = list(parquet_dir.rglob(f"*/{table_id}.parquet"))

    if not candidates:
        raise HTTPException(status_code=404, detail=f"Table '{table_id}' not found")

    file_path = candidates[0]

    # ETag support
    stat = file_path.stat()
    etag = f'"{stat.st_mtime_ns}"'
    if_none_match = request.headers.get("if-none-match")
    if if_none_match == etag:
        return Response(status_code=304)

    return FileResponse(
        path=file_path,
        filename=f"{table_id}.parquet",
        media_type="application/octet-stream",
        headers={"ETag": etag},
    )