From 952dc9e74d1ba60f71c91c9c7fbb068a553cb202 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 09:53:06 +0200 Subject: [PATCH] fix(profile-sessions): tolerate stat() failures on individual jsonl (Devin Review on #179) The previous gather used `sorted(glob, key=lambda p: p.stat().st_mtime)`. A transient OSError (race with delete, permission flicker, EBADF on a weird filesystem) on any single file raised through the lambda and 500-ed the whole page. Reworked: stat each path under try/except into a (path, stat) list, sort the already-statted entries. Bad files drop silently from the listing. Regression test test_profile_sessions_page_tolerates_stat_failures patches Path.stat to raise on one of two files, asserts the page returns 200 with the good row rendered and the bad row dropped. --- CHANGELOG.md | 1 + app/web/router.py | 10 +++++++++- tests/test_web_ui.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 777722e..97fc44a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ Five-defect fix for the silently-broken session pipeline on default Compose depl - **#179 review — `SCHEDULER_AUDIT_ACTIONS` listed action strings that don't actually appear in `audit_log`.** The list at `app/web/router.py:952` had `"marketplaces_sync_all"` (wrong — actual is `"marketplace.sync_all"`) plus `"data_refresh"` and `"scripts_run_due"` (which `app/api/sync.py` and `app/api/scripts.py` don't write). Corrected to the four actually-logged strings, with a comment pointing at the missing audit calls in sync/scripts as a follow-up. - **#179 review — `/api/admin/run-corporate-memory` skipped audit on unhandled exceptions** (same gap as `run_verification_detector` from the previous round). Mirrored the same try/except + `unhandled_error` audit pattern, so a DuckDB lock or unexpected SDK error from `collect_all()` now produces an audit row with the error type+message before re-raising as 500. Regression test in `tests/test_admin_run_endpoints.py::TestRunCorporateMemory::test_unhandled_exception_still_audits`. - **#179 review — `/api/admin/run-session-collector` skipped audit on unhandled exceptions** (third occurrence of the same pattern, completes the trilogy of LLM-pipeline endpoints). Mirrored the same try/except + `unhandled_error` audit pattern from the other two endpoints, so a `PermissionError` walking `/home`, an `OSError` on `/data/user_sessions` mkdir, or any other unhandled exception from `collector.run()` now produces an audit row before re-raising as 500. Regression test in `tests/test_admin_run_endpoints.py::TestRunSessionCollector::test_unhandled_exception_still_audits`. +- **#179 review — `/profile/sessions` 500-ed on transient `stat()` failure.** The previous implementation used `sorted(glob, key=lambda p: p.stat().st_mtime)`; if any single jsonl file's stat call raised (race with delete, EACCES from a remount, etc.), the whole sort raised and the page returned 500 instead of just dropping that one row. Reworked the gather: stat each path under try/except into a `(path, stat)` list, then sort the already-statted entries. Bad files are silently dropped from the listing. Regression test in `tests/test_web_ui.py::TestAdminRoleGuards::test_profile_sessions_page_tolerates_stat_failures`. ### Added diff --git a/app/web/router.py b/app/web/router.py index 8d58430..2b62f97 100644 --- a/app/web/router.py +++ b/app/web/router.py @@ -1135,11 +1135,19 @@ async def profile_sessions_page( files = [] if user_sessions_dir.is_dir(): - for jsonl in sorted(user_sessions_dir.glob("*.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True): + # Stat once per file with OSError tolerance, THEN sort. The previous + # `sorted(..., key=lambda p: p.stat().st_mtime)` raised on any + # transient stat failure (race with delete, permission flicker) and + # 500-ed the whole page (Devin Review on #179). + statted = [] + for jsonl in user_sessions_dir.glob("*.jsonl"): try: stat = jsonl.stat() except OSError: continue + statted.append((jsonl, stat)) + statted.sort(key=lambda pair: pair[1].st_mtime, reverse=True) + for jsonl, stat in statted: files.append({ "name": jsonl.name, "size_bytes": stat.st_size, diff --git a/tests/test_web_ui.py b/tests/test_web_ui.py index 6609010..e1521f9 100644 --- a/tests/test_web_ui.py +++ b/tests/test_web_ui.py @@ -368,6 +368,34 @@ class TestAdminRoleGuards: r = web_client.get("/profile/sessions/anything.jsonl", follow_redirects=False) assert r.status_code in (302, 401, 403) + def test_profile_sessions_page_tolerates_stat_failures(self, web_client, analyst_cookie, tmp_path, monkeypatch): + """Devin Review on d878764a: a transient stat() failure on one file + must not 500 the whole page. Skip the bad row, render the rest.""" + import pathlib + user_sessions = tmp_path / "user_sessions" / "analyst1" + user_sessions.mkdir(parents=True) + good = user_sessions / "good.jsonl" + good.write_text('{"event": "ok"}\n') + bad = user_sessions / "bad.jsonl" + bad.write_text('{"event": "stat-explodes"}\n') + + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + + # Make `bad.jsonl`.stat() raise; `good.jsonl`.stat() works. + real_stat = pathlib.Path.stat + + def selective_stat(self, *args, **kwargs): + if self.name == "bad.jsonl": + raise PermissionError("simulated stat failure") + return real_stat(self, *args, **kwargs) + + monkeypatch.setattr(pathlib.Path, "stat", selective_stat) + + r = web_client.get("/profile/sessions", cookies=analyst_cookie, follow_redirects=False) + assert r.status_code == 200 + assert b"good.jsonl" in r.content + assert b"bad.jsonl" not in r.content + def test_profile_session_download_returns_file_for_owner(self, web_client, analyst_cookie, tmp_path, monkeypatch): """Authenticated owner can fetch their own jsonl with proper Content-Disposition.""" # The seeded analyst is "analyst1" (per conftest.seeded_app).