feat(profile): per-session jsonl download from /profile/sessions

User feedback during e2e of #179: the listing page is nice but I want
to grab the raw jsonl and look at what's inside.

Adds GET /profile/sessions/<filename>:
- Auth via get_current_user (owner-only).
- Path safety: rejects "/", "\", "..", leading ".", and any non-".jsonl"
  filename. The served path resolves under
  ${DATA_DIR}/user_sessions/<caller.id>/; if resolution escapes that
  base directory, returns 404 (never 403, so existence of other users'
  files isn't leaked).
- FileResponse with Content-Disposition: attachment.

UI: Download button per row in profile_sessions.html.

Tests in test_web_ui.py: path-traversal / nested / dotfile / non-jsonl
all 404 for owner; unauthenticated 302/401/403; authenticated owner
gets 200 + correct Content-Disposition.
This commit is contained in:
ZdenekSrotyr 2026-05-05 09:15:12 +02:00
parent e86da72997
commit 9ebe991b55
4 changed files with 91 additions and 2 deletions

View file

@ -52,7 +52,8 @@ Five-defect fix for the silently-broken session pipeline on default Compose depl
### Added ### Added
- `/admin/scheduler-runs` — read-only admin page showing the last 200 audit-log entries from scheduler-driven actions (`run_session_collector`, `run_verification_detector`, `run_corporate_memory`, `marketplace.sync_all`). New `AuditRepository.query_actions(actions, limit)` query helper, new admin nav entry under the Admin dropdown. `data-refresh` (`POST /api/sync/trigger`) and `script-runner` (`POST /api/scripts/run-due`) are scheduler jobs but don't write to `audit_log` today, so they can't appear here yet. Failed scheduler ticks (HTTP 401, network errors) don't reach the audit_log either — those still live only in `docker logs agnes-scheduler-1`; the page calls that out with a hint to set `SCHEDULER_API_TOKEN` if no rows show up. - `/admin/scheduler-runs` — read-only admin page showing the last 200 audit-log entries from scheduler-driven actions (`run_session_collector`, `run_verification_detector`, `run_corporate_memory`, `marketplace.sync_all`). New `AuditRepository.query_actions(actions, limit)` query helper, new admin nav entry under the Admin dropdown. `data-refresh` (`POST /api/sync/trigger`) and `script-runner` (`POST /api/scripts/run-due`) are scheduler jobs but don't write to `audit_log` today, so they can't appear here yet. Failed scheduler ticks (HTTP 401, network errors) don't reach the audit_log either — those still live only in `docker logs agnes-scheduler-1`; the page calls that out with a hint to set `SCHEDULER_API_TOKEN` if no rows show up.
- `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, and `items_extracted`. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened. - `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, `items_extracted`, and a per-row Download button. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened.
- `GET /profile/sessions/<filename>` — owner-only download of a single jsonl. Auth via `get_current_user`; path safety locks the served file under `${DATA_DIR}/user_sessions/<caller.id>/` and rejects path-traversal / nested-component / non-`.jsonl` / dotfile filenames with 404 (never 403, so existence of files belonging to other users is not leaked). `Content-Disposition: attachment` returns the file as a download.
### Internal ### Internal

View file

@ -11,7 +11,7 @@ from typing import Optional
from urllib.parse import quote from urllib.parse import quote
from fastapi import APIRouter, Depends, Request, HTTPException from fastapi import APIRouter, Depends, Request, HTTPException
from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
import duckdb import duckdb
@ -1183,6 +1183,46 @@ async def profile_sessions_page(
return templates.TemplateResponse(request, "profile_sessions.html", ctx) return templates.TemplateResponse(request, "profile_sessions.html", ctx)
@router.get("/profile/sessions/{filename}")
async def profile_session_download(
filename: str,
user: dict = Depends(get_current_user),
):
"""Download a single jsonl session file owned by the caller.
Path safety: filename is single-component (no separators, no `..`,
must end in `.jsonl`); the served path is built under
`${DATA_DIR}/user_sessions/<current_user.id>/` and must resolve into
that directory. Any deviation yields 404 never 403, so we don't
leak the existence of files belonging to other users.
"""
import pathlib
if "/" in filename or "\\" in filename or filename.startswith(".") or ".." in filename:
raise HTTPException(status_code=404, detail="Not found")
if not filename.endswith(".jsonl"):
raise HTTPException(status_code=404, detail="Not found")
user_id = user["id"]
data_dir = pathlib.Path(os.environ.get("DATA_DIR", "/data")).resolve()
user_dir = (data_dir / "user_sessions" / user_id).resolve()
target = (user_dir / filename).resolve()
try:
target.relative_to(user_dir)
except ValueError:
raise HTTPException(status_code=404, detail="Not found")
if not target.is_file():
raise HTTPException(status_code=404, detail="Not found")
return FileResponse(
path=str(target),
filename=filename,
media_type="application/x-ndjson",
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
)
@router.get("/_debug/throw/http/{code:int}", response_class=HTMLResponse, include_in_schema=False) @router.get("/_debug/throw/http/{code:int}", response_class=HTMLResponse, include_in_schema=False)
async def _debug_throw_http(request: Request, code: int): async def _debug_throw_http(request: Request, code: int):
"""Dev helper — raise an HTTPException with the given status code. """Dev helper — raise an HTTPException with the given status code.

View file

@ -40,6 +40,21 @@
.badge-pending { background: #fef3c7; color: #92400e; } .badge-pending { background: #fef3c7; color: #92400e; }
.badge-processed { background: #dbeafe; color: #1e40af; } .badge-processed { background: #dbeafe; color: #1e40af; }
.badge-extracted { background: #d1fae5; color: #065f46; } .badge-extracted { background: #d1fae5; color: #065f46; }
.dl-link {
display: inline-block;
padding: 4px 10px;
border: 1px solid var(--border, #e5e7eb);
border-radius: 6px;
color: inherit; text-decoration: none;
font-size: 12px; font-weight: 500;
background: var(--surface, #fff);
transition: background 0.15s, border-color 0.15s;
}
.dl-link:hover {
background: var(--border-light, #f9fafb);
border-color: var(--primary, #6366f1);
color: var(--primary, #4338ca);
}
.empty { .empty {
padding: 40px 16px; text-align: center; padding: 40px 16px; text-align: center;
color: var(--text-secondary, #6b7280); font-size: 13px; color: var(--text-secondary, #6b7280); font-size: 13px;
@ -71,6 +86,7 @@
<th>Status</th> <th>Status</th>
<th>Processed</th> <th>Processed</th>
<th class="num">Items extracted</th> <th class="num">Items extracted</th>
<th></th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@ -94,6 +110,9 @@
<td class="num"> <td class="num">
{% if s.items_extracted is not none %}{{ s.items_extracted }}{% endif %} {% if s.items_extracted is not none %}{{ s.items_extracted }}{% endif %}
</td> </td>
<td>
<a class="dl-link" href="/profile/sessions/{{ s.name }}" download>Download</a>
</td>
</tr> </tr>
{% endfor %} {% endfor %}
</tbody> </tbody>

View file

@ -353,6 +353,35 @@ class TestAdminRoleGuards:
r = web_client.get("/profile/sessions", cookies=admin_cookie, follow_redirects=False) r = web_client.get("/profile/sessions", cookies=admin_cookie, follow_redirects=False)
assert r.status_code == 200 assert r.status_code == 200
def test_profile_session_download_path_safety(self, web_client, analyst_cookie):
"""Per-session download endpoint must reject any filename that could
escape the user's own session directory."""
# NB: bare ".." is excluded — httpx normalises the URL to
# /profile/sessions before sending, so it never reaches the
# download handler. The %2F-encoded variant exercises the real
# path-component value that does reach the handler.
for bad in ["../etc/passwd", "subdir/file.jsonl", ".env",
"session.jsonl.bak", "..%2Fetc%2Fpasswd"]:
r = web_client.get(f"/profile/sessions/{bad}", cookies=analyst_cookie, follow_redirects=False)
assert r.status_code == 404, f"Expected 404 for {bad!r}, got {r.status_code}"
# Unauthenticated → never the file
r = web_client.get("/profile/sessions/anything.jsonl", follow_redirects=False)
assert r.status_code in (302, 401, 403)
def test_profile_session_download_returns_file_for_owner(self, web_client, analyst_cookie, tmp_path, monkeypatch):
"""Authenticated owner can fetch their own jsonl with proper Content-Disposition."""
# The seeded analyst is "analyst1" (per conftest.seeded_app).
user_sessions = tmp_path / "user_sessions" / "analyst1"
user_sessions.mkdir(parents=True)
sample = user_sessions / "abc-123.jsonl"
sample.write_text('{"event": "test"}\n')
monkeypatch.setenv("DATA_DIR", str(tmp_path))
r = web_client.get("/profile/sessions/abc-123.jsonl", cookies=analyst_cookie, follow_redirects=False)
assert r.status_code == 200
assert r.headers.get("content-disposition", "").endswith('filename="abc-123.jsonl"')
assert b'"event": "test"' in r.content
class TestUnauthenticatedHtmlRedirects: class TestUnauthenticatedHtmlRedirects:
def test_dashboard_unauthenticated_redirects_to_login(self, web_client): def test_dashboard_unauthenticated_redirects_to_login(self, web_client):