From df2c33147c0da28102a75c632dbdc07a7f7ae88b Mon Sep 17 00:00:00 2001
From: ZdenekSrotyr <zdenek.srotyr@keboola.com>
Date: Tue, 5 May 2026 20:02:50 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Devin=20Review=20on=20#194=20round=202?=
 =?UTF-8?q?=20=E2=80=94=203=20BUG-class=20findings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. instance.yaml overlay path now matches read site under STATE_DIR.
   Three sites updated:
     - app/api/admin.py:1005 (server-config endpoint writer)
     - app/api/admin.py:2610 (configure endpoint writer)
     - app/instance_config.py:106 (overlay reader)
   All three now go through _state_dir() so under flat-mount layout
   (STATE_DIR=/data-state) the irreplaceable instance.yaml overlay
   lands on the state disk (sdc) instead of the regenerable data
   disk (sdb). Without this fix, .env_overlay correctly went to the
   state disk while instance.yaml went to the data disk — config
   would be lost if an operator wiped sdb.

2. Strip customer-specific tokens from OSS repo per CLAUDE.md
   vendor-agnostic rule:
     - docker-compose.host-mount.yml: 'a deployer (Groupon FoundryAI)'
       → 'a deployer in production'
     - docker-compose.flat-mount.yml: 'caused 2026-05-05 in the
       Groupon FoundryAI deployment' → generic 'production failure
       mode'
     - docs/state-dir.md: rewrote the incident reference to describe
       the failure mode abstractly without naming the deployment;
       updated the recommendation table to say 'shadow-mount class'
       instead of dating the specific incident.

3. Updated docs/state-dir.md 'What reads STATE_DIR' to list all
   read/write sites including the three migrated in this round
   (admin.py, instance_config.py, marketplaces.py).

ANALYSIS finding (tls-rotate.sh hardcoded host-mount.yml) deferred
— same operator-side class as auto-upgrade.sh hardcoded host-mount,
documented limitation per the PR body.
---
 app/api/admin.py              | 8 ++++----
 app/instance_config.py        | 9 +++++++--
 docker-compose.flat-mount.yml | 4 ++--
 docker-compose.host-mount.yml | 2 +-
 docs/state-dir.md             | 7 +++++--
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/app/api/admin.py b/app/api/admin.py
index 7fd1f82..4469299 100644
--- a/app/api/admin.py
+++ b/app/api/admin.py
@@ -1001,8 +1001,8 @@ async def update_server_config(
     # atomic-write sequence; the audit log sits outside since it operates on
     # local snapshots.
     from app.instance_config import reset_cache
-    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
-    config_path = data_dir / "state" / "instance.yaml"
+    from app.secrets import _state_dir
+    config_path = _state_dir() / "instance.yaml"
     config_path.parent.mkdir(parents=True, exist_ok=True)
 
     with _overlay_write_lock:
@@ -2606,8 +2606,8 @@ async def configure_instance(
     #    — they don't belong in the overlay at all.
     # 2. Patch only the sections this endpoint touches.
     # 3. Write the narrow overlay back atomically (tmp + os.replace).
-    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
-    config_path = data_dir / "state" / "instance.yaml"
+    from app.secrets import _state_dir
+    config_path = _state_dir() / "instance.yaml"
 
     # Same serialization + corrupt-overlay handling as POST /server-config.
     with _overlay_write_lock:
diff --git a/app/instance_config.py b/app/instance_config.py
index 6155608..6901993 100644
--- a/app/instance_config.py
+++ b/app/instance_config.py
@@ -102,8 +102,13 @@ def load_instance_config() -> dict:
     # mirror the resolver here before the deep-merge — without it, the
     # LLM factory receives the literal placeholder and rejects it as an
     # invalid api key (#179 review fix).
-    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
-    overlay_path = data_dir / "state" / "instance.yaml"
+    # Resolve via _state_dir() so the path matches the writer in
+    # app/api/admin.py — under the flat-mount layout (STATE_DIR=/data-state)
+    # both the configure-endpoint and the server-config-endpoint write
+    # ``/data-state/instance.yaml``; reading from ``/data/state/...`` here
+    # would silently load stale config from the regenerable data disk.
+    from app.secrets import _state_dir
+    overlay_path = _state_dir() / "instance.yaml"
     if overlay_path.exists():
         try:
             overlay = yaml.safe_load(overlay_path.read_text()) or {}
diff --git a/docker-compose.flat-mount.yml b/docker-compose.flat-mount.yml
index 0aa3bfc..63125b0 100644
--- a/docker-compose.flat-mount.yml
+++ b/docker-compose.flat-mount.yml
@@ -7,8 +7,8 @@
 # the data disk). That layout works but has known fragility:
 #
 #   - Bind-mount propagation matters. A non-recursive bind hides the
-#     nested mount, leading to silent shadow writes (the failure mode
-#     that caused 2026-05-05 in the Groupon FoundryAI deployment).
+#     nested mount, leading to silent shadow writes — the production
+#     failure mode that motivated this overlay.
 #
 #   - Two writers, one tree. Host-side timers (tls-rotate.timer)
 #     write to /data/state/certs as root, while the container app
diff --git a/docker-compose.host-mount.yml b/docker-compose.host-mount.yml
index fa1bb27..3758a82 100644
--- a/docker-compose.host-mount.yml
+++ b/docker-compose.host-mount.yml
@@ -11,7 +11,7 @@
 # new options to existing volumes — they keep whatever options were
 # in effect at create time.
 #
-# This bit a deployer (Groupon FoundryAI) on 2026-05-05: the volume
+# This bit a deployer in production: the volume
 # was created before this overlay had `bind,rbind`, kept the old
 # `bind` (non-recursive) propagation, and containers wrote to a
 # shadowed subdirectory of the parent disk instead of the nested
diff --git a/docs/state-dir.md b/docs/state-dir.md
index 7e40749..f89ef4d 100644
--- a/docs/state-dir.md
+++ b/docs/state-dir.md
@@ -25,7 +25,7 @@ sdc at /data/state    (nested inside the data mount)
 - Two writers (host's `tls-rotate.timer` running as root; container app running as uid 999) share `/data/state` with different mount-namespace views → ownership conflicts.
 - Resizing sdb requires unmounting sdc first.
 
-The 2026-05-05 incident in the Groupon FoundryAI deployment was a manifestation of the propagation gotcha. See PRs in this repo and the deployer's infra repo for full root-cause notes.
+A production deployment hit this propagation gotcha: a volume was created with non-recursive `bind`, the file was later edited to `bind,rbind`, but Docker named-volume options are immutable after creation, so containers kept writing to a shadowed subdirectory of the parent disk. DuckDB went FATAL on a root-owned WAL during a routine container recreate; sign-in broke. Recovery required `docker volume rm` + per-VM data migration on every affected host.
 
 ## Layout B — flat
 
@@ -52,7 +52,7 @@ sdc at /data-state   (state, irreplaceable — parallel to /data, not nested)
 |---|---|
 | Existing deployment, no plans to expand | stay on layout A |
 | New deployment | layout B (cleaner, no shadow class) |
-| Existing deployment hit by 2026-05-05 shadow class | migrate to layout B |
+| Existing deployment hit by the shadow-mount class above | migrate to layout B |
 | CI / local dev | neither (use ephemeral compose volumes) |
 
 ## Migration A → B
@@ -99,6 +99,9 @@ App code:
 - `src/db.py::_get_state_dir()` — the canonical helper. Used by `get_system_db()` and the schema migration snapshot.
 - `app/secrets.py::_state_dir()` — for `.session_secret`, `.jwt_secret`. Mirrors the helper since `app/` shouldn't import from `src/`.
 - `app/main.py` — for the `.env_overlay` startup file (loaded at process start).
+- `app/instance_config.py` — for the writable `instance.yaml` overlay (read at every config-load).
+- `app/api/admin.py` — for the writable `instance.yaml` overlay (write site of `POST /api/admin/server-config` and `POST /api/admin/configure`) and for `.env_overlay` (write site of `POST /api/admin/configure`).
+- `app/api/marketplaces.py` — for `.env_overlay` (write site of marketplace PAT persistence).
 
 Host scripts:
 - `scripts/ops/agnes-auto-upgrade.sh` — mount-sanity check + cert detection.