From df2c33147c0da28102a75c632dbdc07a7f7ae88b Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 5 May 2026 20:02:50 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Devin=20Review=20on=20#194=20round=202?= =?UTF-8?q?=20=E2=80=94=203=20BUG-class=20findings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. instance.yaml overlay path now matches read site under STATE_DIR. Three sites updated: - app/api/admin.py:1005 (server-config endpoint writer) - app/api/admin.py:2610 (configure endpoint writer) - app/instance_config.py:106 (overlay reader) All three now go through _state_dir() so under flat-mount layout (STATE_DIR=/data-state) the irreplaceable instance.yaml overlay lands on the state disk (sdc) instead of the regenerable data disk (sdb). Without this fix, .env_overlay correctly went to the state disk while instance.yaml went to the data disk — config would be lost if an operator wiped sdb. 2. Strip customer-specific tokens from OSS repo per CLAUDE.md vendor-agnostic rule: - docker-compose.host-mount.yml: 'a deployer (Groupon FoundryAI)' → 'a deployer in production' - docker-compose.flat-mount.yml: 'caused 2026-05-05 in the Groupon FoundryAI deployment' → generic 'production failure mode' - docs/state-dir.md: rewrote the incident reference to describe the failure mode abstractly without naming the deployment; updated the recommendation table to say 'shadow-mount class' instead of dating the specific incident. 3. Updated docs/state-dir.md 'What reads STATE_DIR' to list all read/write sites including the three migrated in this round (admin.py, instance_config.py, marketplaces.py). ANALYSIS finding (tls-rotate.sh hardcoded host-mount.yml) deferred — same operator-side class as auto-upgrade.sh hardcoded host-mount, documented limitation per the PR body. --- app/api/admin.py | 8 ++++---- app/instance_config.py | 9 +++++++-- docker-compose.flat-mount.yml | 4 ++-- docker-compose.host-mount.yml | 2 +- docs/state-dir.md | 7 +++++-- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/app/api/admin.py b/app/api/admin.py index 7fd1f82..4469299 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -1001,8 +1001,8 @@ async def update_server_config( # atomic-write sequence; the audit log sits outside since it operates on # local snapshots. from app.instance_config import reset_cache - data_dir = Path(os.environ.get("DATA_DIR", "./data")) - config_path = data_dir / "state" / "instance.yaml" + from app.secrets import _state_dir + config_path = _state_dir() / "instance.yaml" config_path.parent.mkdir(parents=True, exist_ok=True) with _overlay_write_lock: @@ -2606,8 +2606,8 @@ async def configure_instance( # — they don't belong in the overlay at all. # 2. Patch only the sections this endpoint touches. # 3. Write the narrow overlay back atomically (tmp + os.replace). - data_dir = Path(os.environ.get("DATA_DIR", "./data")) - config_path = data_dir / "state" / "instance.yaml" + from app.secrets import _state_dir + config_path = _state_dir() / "instance.yaml" # Same serialization + corrupt-overlay handling as POST /server-config. with _overlay_write_lock: diff --git a/app/instance_config.py b/app/instance_config.py index 6155608..6901993 100644 --- a/app/instance_config.py +++ b/app/instance_config.py @@ -102,8 +102,13 @@ def load_instance_config() -> dict: # mirror the resolver here before the deep-merge — without it, the # LLM factory receives the literal placeholder and rejects it as an # invalid api key (#179 review fix). - data_dir = Path(os.environ.get("DATA_DIR", "./data")) - overlay_path = data_dir / "state" / "instance.yaml" + # Resolve via _state_dir() so the path matches the writer in + # app/api/admin.py — under the flat-mount layout (STATE_DIR=/data-state) + # both the configure-endpoint and the server-config-endpoint write + # ``/data-state/instance.yaml``; reading from ``/data/state/...`` here + # would silently load stale config from the regenerable data disk. + from app.secrets import _state_dir + overlay_path = _state_dir() / "instance.yaml" if overlay_path.exists(): try: overlay = yaml.safe_load(overlay_path.read_text()) or {} diff --git a/docker-compose.flat-mount.yml b/docker-compose.flat-mount.yml index 0aa3bfc..63125b0 100644 --- a/docker-compose.flat-mount.yml +++ b/docker-compose.flat-mount.yml @@ -7,8 +7,8 @@ # the data disk). That layout works but has known fragility: # # - Bind-mount propagation matters. A non-recursive bind hides the -# nested mount, leading to silent shadow writes (the failure mode -# that caused 2026-05-05 in the Groupon FoundryAI deployment). +# nested mount, leading to silent shadow writes — the production +# failure mode that motivated this overlay. # # - Two writers, one tree. Host-side timers (tls-rotate.timer) # write to /data/state/certs as root, while the container app diff --git a/docker-compose.host-mount.yml b/docker-compose.host-mount.yml index fa1bb27..3758a82 100644 --- a/docker-compose.host-mount.yml +++ b/docker-compose.host-mount.yml @@ -11,7 +11,7 @@ # new options to existing volumes — they keep whatever options were # in effect at create time. # -# This bit a deployer (Groupon FoundryAI) on 2026-05-05: the volume +# This bit a deployer in production: the volume # was created before this overlay had `bind,rbind`, kept the old # `bind` (non-recursive) propagation, and containers wrote to a # shadowed subdirectory of the parent disk instead of the nested diff --git a/docs/state-dir.md b/docs/state-dir.md index 7e40749..f89ef4d 100644 --- a/docs/state-dir.md +++ b/docs/state-dir.md @@ -25,7 +25,7 @@ sdc at /data/state (nested inside the data mount) - Two writers (host's `tls-rotate.timer` running as root; container app running as uid 999) share `/data/state` with different mount-namespace views → ownership conflicts. - Resizing sdb requires unmounting sdc first. -The 2026-05-05 incident in the Groupon FoundryAI deployment was a manifestation of the propagation gotcha. See PRs in this repo and the deployer's infra repo for full root-cause notes. +A production deployment hit this propagation gotcha: a volume was created with non-recursive `bind`, the file was later edited to `bind,rbind`, but Docker named-volume options are immutable after creation, so containers kept writing to a shadowed subdirectory of the parent disk. DuckDB went FATAL on a root-owned WAL during a routine container recreate; sign-in broke. Recovery required `docker volume rm` + per-VM data migration on every affected host. ## Layout B — flat @@ -52,7 +52,7 @@ sdc at /data-state (state, irreplaceable — parallel to /data, not nested) |---|---| | Existing deployment, no plans to expand | stay on layout A | | New deployment | layout B (cleaner, no shadow class) | -| Existing deployment hit by 2026-05-05 shadow class | migrate to layout B | +| Existing deployment hit by the shadow-mount class above | migrate to layout B | | CI / local dev | neither (use ephemeral compose volumes) | ## Migration A → B @@ -99,6 +99,9 @@ App code: - `src/db.py::_get_state_dir()` — the canonical helper. Used by `get_system_db()` and the schema migration snapshot. - `app/secrets.py::_state_dir()` — for `.session_secret`, `.jwt_secret`. Mirrors the helper since `app/` shouldn't import from `src/`. - `app/main.py` — for the `.env_overlay` startup file (loaded at process start). +- `app/instance_config.py` — for the writable `instance.yaml` overlay (read at every config-load). +- `app/api/admin.py` — for the writable `instance.yaml` overlay (write site of `POST /api/admin/server-config` and `POST /api/admin/configure`) and for `.env_overlay` (write site of `POST /api/admin/configure`). +- `app/api/marketplaces.py` — for `.env_overlay` (write site of marketplace PAT persistence). Host scripts: - `scripts/ops/agnes-auto-upgrade.sh` — mount-sanity check + cert detection.