From 9ebe991b5512da5fd14e01fbdd5fa3267f988ca0 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 09:15:12 +0200 Subject: [PATCH] feat(profile): per-session jsonl download from /profile/sessions User feedback during e2e of #179: the listing page is nice but I want to grab the raw jsonl and look at what's inside. Adds GET /profile/sessions/: - Auth via get_current_user (owner-only). - Path safety: rejects "/", "\", "..", leading ".", and any non-".jsonl" filename. The served path resolves under ${DATA_DIR}/user_sessions//; if resolution escapes that base directory, returns 404 (never 403, so existence of other users' files isn't leaked). - FileResponse with Content-Disposition: attachment. UI: Download button per row in profile_sessions.html. Tests in test_web_ui.py: path-traversal / nested / dotfile / non-jsonl all 404 for owner; unauthenticated 302/401/403; authenticated owner gets 200 + correct Content-Disposition. --- CHANGELOG.md | 3 +- app/web/router.py | 42 ++++++++++++++++++++++++- app/web/templates/profile_sessions.html | 19 +++++++++++ tests/test_web_ui.py | 29 +++++++++++++++++ 4 files changed, 91 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89c5e78..6649f6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,7 +52,8 @@ Five-defect fix for the silently-broken session pipeline on default Compose depl ### Added - `/admin/scheduler-runs` — read-only admin page showing the last 200 audit-log entries from scheduler-driven actions (`run_session_collector`, `run_verification_detector`, `run_corporate_memory`, `marketplace.sync_all`). New `AuditRepository.query_actions(actions, limit)` query helper, new admin nav entry under the Admin dropdown. `data-refresh` (`POST /api/sync/trigger`) and `script-runner` (`POST /api/scripts/run-due`) are scheduler jobs but don't write to `audit_log` today, so they can't appear here yet. Failed scheduler ticks (HTTP 401, network errors) don't reach the audit_log either — those still live only in `docker logs agnes-scheduler-1`; the page calls that out with a hint to set `SCHEDULER_API_TOKEN` if no rows show up. -- `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, and `items_extracted`. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened. +- `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, `items_extracted`, and a per-row Download button. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened. +- `GET /profile/sessions/` — owner-only download of a single jsonl. Auth via `get_current_user`; path safety locks the served file under `${DATA_DIR}/user_sessions//` and rejects path-traversal / nested-component / non-`.jsonl` / dotfile filenames with 404 (never 403, so existence of files belonging to other users is not leaked). `Content-Disposition: attachment` returns the file as a download. ### Internal diff --git a/app/web/router.py b/app/web/router.py index 207fbeb..8d58430 100644 --- a/app/web/router.py +++ b/app/web/router.py @@ -11,7 +11,7 @@ from typing import Optional from urllib.parse import quote from fastapi import APIRouter, Depends, Request, HTTPException -from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse from fastapi.templating import Jinja2Templates import duckdb @@ -1183,6 +1183,46 @@ async def profile_sessions_page( return templates.TemplateResponse(request, "profile_sessions.html", ctx) +@router.get("/profile/sessions/{filename}") +async def profile_session_download( + filename: str, + user: dict = Depends(get_current_user), +): + """Download a single jsonl session file owned by the caller. + + Path safety: filename is single-component (no separators, no `..`, + must end in `.jsonl`); the served path is built under + `${DATA_DIR}/user_sessions//` and must resolve into + that directory. Any deviation yields 404 — never 403, so we don't + leak the existence of files belonging to other users. + """ + import pathlib + + if "/" in filename or "\\" in filename or filename.startswith(".") or ".." in filename: + raise HTTPException(status_code=404, detail="Not found") + if not filename.endswith(".jsonl"): + raise HTTPException(status_code=404, detail="Not found") + + user_id = user["id"] + data_dir = pathlib.Path(os.environ.get("DATA_DIR", "/data")).resolve() + user_dir = (data_dir / "user_sessions" / user_id).resolve() + target = (user_dir / filename).resolve() + + try: + target.relative_to(user_dir) + except ValueError: + raise HTTPException(status_code=404, detail="Not found") + if not target.is_file(): + raise HTTPException(status_code=404, detail="Not found") + + return FileResponse( + path=str(target), + filename=filename, + media_type="application/x-ndjson", + headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + ) + + @router.get("/_debug/throw/http/{code:int}", response_class=HTMLResponse, include_in_schema=False) async def _debug_throw_http(request: Request, code: int): """Dev helper — raise an HTTPException with the given status code. diff --git a/app/web/templates/profile_sessions.html b/app/web/templates/profile_sessions.html index d0d1cd6..4165d84 100644 --- a/app/web/templates/profile_sessions.html +++ b/app/web/templates/profile_sessions.html @@ -40,6 +40,21 @@ .badge-pending { background: #fef3c7; color: #92400e; } .badge-processed { background: #dbeafe; color: #1e40af; } .badge-extracted { background: #d1fae5; color: #065f46; } + .dl-link { + display: inline-block; + padding: 4px 10px; + border: 1px solid var(--border, #e5e7eb); + border-radius: 6px; + color: inherit; text-decoration: none; + font-size: 12px; font-weight: 500; + background: var(--surface, #fff); + transition: background 0.15s, border-color 0.15s; + } + .dl-link:hover { + background: var(--border-light, #f9fafb); + border-color: var(--primary, #6366f1); + color: var(--primary, #4338ca); + } .empty { padding: 40px 16px; text-align: center; color: var(--text-secondary, #6b7280); font-size: 13px; @@ -71,6 +86,7 @@ Status Processed Items extracted + @@ -94,6 +110,9 @@ {% if s.items_extracted is not none %}{{ s.items_extracted }}{% endif %} + + Download + {% endfor %} diff --git a/tests/test_web_ui.py b/tests/test_web_ui.py index 36450ac..6609010 100644 --- a/tests/test_web_ui.py +++ b/tests/test_web_ui.py @@ -353,6 +353,35 @@ class TestAdminRoleGuards: r = web_client.get("/profile/sessions", cookies=admin_cookie, follow_redirects=False) assert r.status_code == 200 + def test_profile_session_download_path_safety(self, web_client, analyst_cookie): + """Per-session download endpoint must reject any filename that could + escape the user's own session directory.""" + # NB: bare ".." is excluded — httpx normalises the URL to + # /profile/sessions before sending, so it never reaches the + # download handler. The %2F-encoded variant exercises the real + # path-component value that does reach the handler. + for bad in ["../etc/passwd", "subdir/file.jsonl", ".env", + "session.jsonl.bak", "..%2Fetc%2Fpasswd"]: + r = web_client.get(f"/profile/sessions/{bad}", cookies=analyst_cookie, follow_redirects=False) + assert r.status_code == 404, f"Expected 404 for {bad!r}, got {r.status_code}" + # Unauthenticated → never the file + r = web_client.get("/profile/sessions/anything.jsonl", follow_redirects=False) + assert r.status_code in (302, 401, 403) + + def test_profile_session_download_returns_file_for_owner(self, web_client, analyst_cookie, tmp_path, monkeypatch): + """Authenticated owner can fetch their own jsonl with proper Content-Disposition.""" + # The seeded analyst is "analyst1" (per conftest.seeded_app). + user_sessions = tmp_path / "user_sessions" / "analyst1" + user_sessions.mkdir(parents=True) + sample = user_sessions / "abc-123.jsonl" + sample.write_text('{"event": "test"}\n') + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + + r = web_client.get("/profile/sessions/abc-123.jsonl", cookies=analyst_cookie, follow_redirects=False) + assert r.status_code == 200 + assert r.headers.get("content-disposition", "").endswith('filename="abc-123.jsonl"') + assert b'"event": "test"' in r.content + class TestUnauthenticatedHtmlRedirects: def test_dashboard_unauthenticated_redirects_to_login(self, web_client):