feat(profile): per-session jsonl download from /profile/sessions
User feedback during e2e of #179: the listing page is nice but I want to grab the raw jsonl and look at what's inside. Adds GET /profile/sessions/<filename>: - Auth via get_current_user (owner-only). - Path safety: rejects "/", "\", "..", leading ".", and any non-".jsonl" filename. The served path resolves under ${DATA_DIR}/user_sessions/<caller.id>/; if resolution escapes that base directory, returns 404 (never 403, so existence of other users' files isn't leaked). - FileResponse with Content-Disposition: attachment. UI: Download button per row in profile_sessions.html. Tests in test_web_ui.py: path-traversal / nested / dotfile / non-jsonl all 404 for owner; unauthenticated 302/401/403; authenticated owner gets 200 + correct Content-Disposition.
This commit is contained in:
parent
e86da72997
commit
9ebe991b55
4 changed files with 91 additions and 2 deletions
|
|
@ -52,7 +52,8 @@ Five-defect fix for the silently-broken session pipeline on default Compose depl
|
|||
### Added
|
||||
|
||||
- `/admin/scheduler-runs` — read-only admin page showing the last 200 audit-log entries from scheduler-driven actions (`run_session_collector`, `run_verification_detector`, `run_corporate_memory`, `marketplace.sync_all`). New `AuditRepository.query_actions(actions, limit)` query helper, new admin nav entry under the Admin dropdown. `data-refresh` (`POST /api/sync/trigger`) and `script-runner` (`POST /api/scripts/run-due`) are scheduler jobs but don't write to `audit_log` today, so they can't appear here yet. Failed scheduler ticks (HTTP 401, network errors) don't reach the audit_log either — those still live only in `docker logs agnes-scheduler-1`; the page calls that out with a hint to set `SCHEDULER_API_TOKEN` if no rows show up.
|
||||
- `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, and `items_extracted`. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened.
|
||||
- `/profile/sessions` — self-service user page in the user menu, showing all session jsonls the caller uploaded via `agnes push` joined against `session_extraction_state`. Each row shows uploaded_at, file size, status badge (`pending` / `processed` / `extracted`), processed_at, `items_extracted`, and a per-row Download button. The page docstring explicitly calls out that `items_extracted = 0` means the verification detector ran successfully but the LLM found no claims worth tracking — that's the documented "no items" outcome, not a broken pipeline. Closes the gap surfaced during the e2e test of #176 where a user could see their sessions on disk and process them through the LLM but had no UI to inspect what happened.
|
||||
- `GET /profile/sessions/<filename>` — owner-only download of a single jsonl. Auth via `get_current_user`; path safety locks the served file under `${DATA_DIR}/user_sessions/<caller.id>/` and rejects path-traversal / nested-component / non-`.jsonl` / dotfile filenames with 404 (never 403, so existence of files belonging to other users is not leaked). `Content-Disposition: attachment` returns the file as a download.
|
||||
|
||||
### Internal
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from typing import Optional
|
|||
from urllib.parse import quote
|
||||
|
||||
from fastapi import APIRouter, Depends, Request, HTTPException
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
import duckdb
|
||||
|
||||
|
|
@ -1183,6 +1183,46 @@ async def profile_sessions_page(
|
|||
return templates.TemplateResponse(request, "profile_sessions.html", ctx)
|
||||
|
||||
|
||||
@router.get("/profile/sessions/{filename}")
|
||||
async def profile_session_download(
|
||||
filename: str,
|
||||
user: dict = Depends(get_current_user),
|
||||
):
|
||||
"""Download a single jsonl session file owned by the caller.
|
||||
|
||||
Path safety: filename is single-component (no separators, no `..`,
|
||||
must end in `.jsonl`); the served path is built under
|
||||
`${DATA_DIR}/user_sessions/<current_user.id>/` and must resolve into
|
||||
that directory. Any deviation yields 404 — never 403, so we don't
|
||||
leak the existence of files belonging to other users.
|
||||
"""
|
||||
import pathlib
|
||||
|
||||
if "/" in filename or "\\" in filename or filename.startswith(".") or ".." in filename:
|
||||
raise HTTPException(status_code=404, detail="Not found")
|
||||
if not filename.endswith(".jsonl"):
|
||||
raise HTTPException(status_code=404, detail="Not found")
|
||||
|
||||
user_id = user["id"]
|
||||
data_dir = pathlib.Path(os.environ.get("DATA_DIR", "/data")).resolve()
|
||||
user_dir = (data_dir / "user_sessions" / user_id).resolve()
|
||||
target = (user_dir / filename).resolve()
|
||||
|
||||
try:
|
||||
target.relative_to(user_dir)
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=404, detail="Not found")
|
||||
if not target.is_file():
|
||||
raise HTTPException(status_code=404, detail="Not found")
|
||||
|
||||
return FileResponse(
|
||||
path=str(target),
|
||||
filename=filename,
|
||||
media_type="application/x-ndjson",
|
||||
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
|
||||
)
|
||||
|
||||
|
||||
@router.get("/_debug/throw/http/{code:int}", response_class=HTMLResponse, include_in_schema=False)
|
||||
async def _debug_throw_http(request: Request, code: int):
|
||||
"""Dev helper — raise an HTTPException with the given status code.
|
||||
|
|
|
|||
|
|
@ -40,6 +40,21 @@
|
|||
.badge-pending { background: #fef3c7; color: #92400e; }
|
||||
.badge-processed { background: #dbeafe; color: #1e40af; }
|
||||
.badge-extracted { background: #d1fae5; color: #065f46; }
|
||||
.dl-link {
|
||||
display: inline-block;
|
||||
padding: 4px 10px;
|
||||
border: 1px solid var(--border, #e5e7eb);
|
||||
border-radius: 6px;
|
||||
color: inherit; text-decoration: none;
|
||||
font-size: 12px; font-weight: 500;
|
||||
background: var(--surface, #fff);
|
||||
transition: background 0.15s, border-color 0.15s;
|
||||
}
|
||||
.dl-link:hover {
|
||||
background: var(--border-light, #f9fafb);
|
||||
border-color: var(--primary, #6366f1);
|
||||
color: var(--primary, #4338ca);
|
||||
}
|
||||
.empty {
|
||||
padding: 40px 16px; text-align: center;
|
||||
color: var(--text-secondary, #6b7280); font-size: 13px;
|
||||
|
|
@ -71,6 +86,7 @@
|
|||
<th>Status</th>
|
||||
<th>Processed</th>
|
||||
<th class="num">Items extracted</th>
|
||||
<th></th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
|
@ -94,6 +110,9 @@
|
|||
<td class="num">
|
||||
{% if s.items_extracted is not none %}{{ s.items_extracted }}{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<a class="dl-link" href="/profile/sessions/{{ s.name }}" download>Download</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
|
|
|
|||
|
|
@ -353,6 +353,35 @@ class TestAdminRoleGuards:
|
|||
r = web_client.get("/profile/sessions", cookies=admin_cookie, follow_redirects=False)
|
||||
assert r.status_code == 200
|
||||
|
||||
def test_profile_session_download_path_safety(self, web_client, analyst_cookie):
|
||||
"""Per-session download endpoint must reject any filename that could
|
||||
escape the user's own session directory."""
|
||||
# NB: bare ".." is excluded — httpx normalises the URL to
|
||||
# /profile/sessions before sending, so it never reaches the
|
||||
# download handler. The %2F-encoded variant exercises the real
|
||||
# path-component value that does reach the handler.
|
||||
for bad in ["../etc/passwd", "subdir/file.jsonl", ".env",
|
||||
"session.jsonl.bak", "..%2Fetc%2Fpasswd"]:
|
||||
r = web_client.get(f"/profile/sessions/{bad}", cookies=analyst_cookie, follow_redirects=False)
|
||||
assert r.status_code == 404, f"Expected 404 for {bad!r}, got {r.status_code}"
|
||||
# Unauthenticated → never the file
|
||||
r = web_client.get("/profile/sessions/anything.jsonl", follow_redirects=False)
|
||||
assert r.status_code in (302, 401, 403)
|
||||
|
||||
def test_profile_session_download_returns_file_for_owner(self, web_client, analyst_cookie, tmp_path, monkeypatch):
|
||||
"""Authenticated owner can fetch their own jsonl with proper Content-Disposition."""
|
||||
# The seeded analyst is "analyst1" (per conftest.seeded_app).
|
||||
user_sessions = tmp_path / "user_sessions" / "analyst1"
|
||||
user_sessions.mkdir(parents=True)
|
||||
sample = user_sessions / "abc-123.jsonl"
|
||||
sample.write_text('{"event": "test"}\n')
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path))
|
||||
|
||||
r = web_client.get("/profile/sessions/abc-123.jsonl", cookies=analyst_cookie, follow_redirects=False)
|
||||
assert r.status_code == 200
|
||||
assert r.headers.get("content-disposition", "").endswith('filename="abc-123.jsonl"')
|
||||
assert b'"event": "test"' in r.content
|
||||
|
||||
|
||||
class TestUnauthenticatedHtmlRedirects:
|
||||
def test_dashboard_unauthenticated_redirects_to_login(self, web_client):
|
||||
|
|
|
|||
Loading…
Reference in a new issue