From 952dc9e74d1ba60f71c91c9c7fbb068a553cb202 Mon Sep 17 00:00:00 2001
From: ZdenekSrotyr <zdenek.srotyr@keboola.com>
Date: Tue, 5 May 2026 09:53:06 +0200
Subject: [PATCH] fix(profile-sessions): tolerate stat() failures on individual
 jsonl (Devin Review on #179)

The previous gather used `sorted(glob, key=lambda p: p.stat().st_mtime)`.
A transient OSError (race with delete, permission flicker, EBADF on a
weird filesystem) on any single file raised through the lambda and 500-ed
the whole page.

Reworked: stat each path under try/except into a (path, stat) list, sort
the already-statted entries. Bad files drop silently from the listing.

Regression test test_profile_sessions_page_tolerates_stat_failures
patches Path.stat to raise on one of two files, asserts the page returns
200 with the good row rendered and the bad row dropped.
---
 CHANGELOG.md         |  1 +
 app/web/router.py    | 10 +++++++++-
 tests/test_web_ui.py | 28 ++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 777722e..97fc44a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,7 @@ Five-defect fix for the silently-broken session pipeline on default Compose depl
 - **#179 review — `SCHEDULER_AUDIT_ACTIONS` listed action strings that don't actually appear in `audit_log`.** The list at `app/web/router.py:952` had `"marketplaces_sync_all"` (wrong — actual is `"marketplace.sync_all"`) plus `"data_refresh"` and `"scripts_run_due"` (which `app/api/sync.py` and `app/api/scripts.py` don't write). Corrected to the four actually-logged strings, with a comment pointing at the missing audit calls in sync/scripts as a follow-up.
 - **#179 review — `/api/admin/run-corporate-memory` skipped audit on unhandled exceptions** (same gap as `run_verification_detector` from the previous round). Mirrored the same try/except + `unhandled_error` audit pattern, so a DuckDB lock or unexpected SDK error from `collect_all()` now produces an audit row with the error type+message before re-raising as 500. Regression test in `tests/test_admin_run_endpoints.py::TestRunCorporateMemory::test_unhandled_exception_still_audits`.
 - **#179 review — `/api/admin/run-session-collector` skipped audit on unhandled exceptions** (third occurrence of the same pattern, completes the trilogy of LLM-pipeline endpoints). Mirrored the same try/except + `unhandled_error` audit pattern from the other two endpoints, so a `PermissionError` walking `/home`, an `OSError` on `/data/user_sessions` mkdir, or any other unhandled exception from `collector.run()` now produces an audit row before re-raising as 500. Regression test in `tests/test_admin_run_endpoints.py::TestRunSessionCollector::test_unhandled_exception_still_audits`.
+- **#179 review — `/profile/sessions` 500-ed on transient `stat()` failure.** The previous implementation used `sorted(glob, key=lambda p: p.stat().st_mtime)`; if any single jsonl file's stat call raised (race with delete, EACCES from a remount, etc.), the whole sort raised and the page returned 500 instead of just dropping that one row. Reworked the gather: stat each path under try/except into a `(path, stat)` list, then sort the already-statted entries. Bad files are silently dropped from the listing. Regression test in `tests/test_web_ui.py::TestAdminRoleGuards::test_profile_sessions_page_tolerates_stat_failures`.
 
 ### Added
 
diff --git a/app/web/router.py b/app/web/router.py
index 8d58430..2b62f97 100644
--- a/app/web/router.py
+++ b/app/web/router.py
@@ -1135,11 +1135,19 @@ async def profile_sessions_page(
 
     files = []
     if user_sessions_dir.is_dir():
-        for jsonl in sorted(user_sessions_dir.glob("*.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True):
+        # Stat once per file with OSError tolerance, THEN sort. The previous
+        # `sorted(..., key=lambda p: p.stat().st_mtime)` raised on any
+        # transient stat failure (race with delete, permission flicker) and
+        # 500-ed the whole page (Devin Review on #179).
+        statted = []
+        for jsonl in user_sessions_dir.glob("*.jsonl"):
             try:
                 stat = jsonl.stat()
             except OSError:
                 continue
+            statted.append((jsonl, stat))
+        statted.sort(key=lambda pair: pair[1].st_mtime, reverse=True)
+        for jsonl, stat in statted:
             files.append({
                 "name": jsonl.name,
                 "size_bytes": stat.st_size,
diff --git a/tests/test_web_ui.py b/tests/test_web_ui.py
index 6609010..e1521f9 100644
--- a/tests/test_web_ui.py
+++ b/tests/test_web_ui.py
@@ -368,6 +368,34 @@ class TestAdminRoleGuards:
         r = web_client.get("/profile/sessions/anything.jsonl", follow_redirects=False)
         assert r.status_code in (302, 401, 403)
 
+    def test_profile_sessions_page_tolerates_stat_failures(self, web_client, analyst_cookie, tmp_path, monkeypatch):
+        """Devin Review on d878764a: a transient stat() failure on one file
+        must not 500 the whole page. Skip the bad row, render the rest."""
+        import pathlib
+        user_sessions = tmp_path / "user_sessions" / "analyst1"
+        user_sessions.mkdir(parents=True)
+        good = user_sessions / "good.jsonl"
+        good.write_text('{"event": "ok"}\n')
+        bad = user_sessions / "bad.jsonl"
+        bad.write_text('{"event": "stat-explodes"}\n')
+
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+
+        # Make `bad.jsonl`.stat() raise; `good.jsonl`.stat() works.
+        real_stat = pathlib.Path.stat
+
+        def selective_stat(self, *args, **kwargs):
+            if self.name == "bad.jsonl":
+                raise PermissionError("simulated stat failure")
+            return real_stat(self, *args, **kwargs)
+
+        monkeypatch.setattr(pathlib.Path, "stat", selective_stat)
+
+        r = web_client.get("/profile/sessions", cookies=analyst_cookie, follow_redirects=False)
+        assert r.status_code == 200
+        assert b"good.jsonl" in r.content
+        assert b"bad.jsonl" not in r.content
+
     def test_profile_session_download_returns_file_for_owner(self, web_client, analyst_cookie, tmp_path, monkeypatch):
         """Authenticated owner can fetch their own jsonl with proper Content-Disposition."""
         # The seeded analyst is "analyst1" (per conftest.seeded_app).