diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b651ed..5a5b848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,35 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.25.0] — 2026-04-30 + +### Fixed +- `scripts/ops/agnes-auto-upgrade.sh`: fail-fast guard before any `docker + compose` action — when the VM has a config disk attached + (`/dev/disk/by-id/google-config-disk` exists), `/data/state` MUST be backed + by it. Three retry attempts with backoff, then exit non-zero. Prevents the + silent regression where docker host-mount propagation unmounts the config + disk and the app writes user state (DuckDB, marketplaces, session secret) + onto `/data` (sdb) — wiped on the next container recreate. Re-applies + `mount --make-rprivate` to both `/data` and `/data/state` (two separate calls) + on every run with a config disk attached, to defend against propagation regressions. +- `infra/modules/customer-instance/startup-script.sh.tpl`: replaced the + inline heredoc copy of the auto-upgrade script with a `curl` from + `raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/scripts/ops/agnes-auto-upgrade.sh` + — single source of truth eliminates drift (the inline copy had fallen + behind on TLS overlay detection, array-form compose files, and the new + config-disk guard). VMs re-fetch on every boot, so script-only fixes + propagate without an infra recreate. Also: `docker-compose.tls.yml` is + now fetched unconditionally (not only when `tls_mode=caddy`), because + the canonical auto-upgrade script detects TLS at runtime via cert files + on disk — certs can appear after boot via `agnes-tls-rotate.sh` or + manual provisioning, and the cron job would otherwise fail every 5 min + until the file was placed. 
Same reasoning extends to `Caddyfile`: + fetched unconditionally now, plus `agnes-auto-upgrade.sh` skips the + tls overlay when `Caddyfile` is missing/empty (defensive — without + it the caddy service crash-loops while the overlay closes `:8000`, + net effect "app unreachable"). + ## [0.24.0] — 2026-04-30 ### Changed diff --git a/infra/modules/customer-instance/startup-script.sh.tpl b/infra/modules/customer-instance/startup-script.sh.tpl index 1986a68..949cede 100644 --- a/infra/modules/customer-instance/startup-script.sh.tpl +++ b/infra/modules/customer-instance/startup-script.sh.tpl @@ -60,11 +60,18 @@ curl -fsSL "$${RAW_BASE}/docker-compose.yml" -o docker-compose.yml curl -fsSL "$${RAW_BASE}/docker-compose.prod.yml" -o docker-compose.prod.yml # Overlay which binds `data` volume to host /data (persistent disk mounted above) curl -fsSL "$${RAW_BASE}/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml - -# TLS overlay (Caddy + Let's Encrypt) — fetch only when actually needed; surface failures -if [ "$TLS_MODE" = "caddy" ] && [ -n "$DOMAIN" ]; then - curl -fsSL "$${RAW_BASE}/Caddyfile" -o Caddyfile -fi +# TLS overlay + Caddyfile — fetched unconditionally because agnes-auto-upgrade.sh +# (curled from main below) detects TLS at runtime via cert files on disk, +# regardless of TLS_MODE. Certs can appear after boot via agnes-tls-rotate.sh +# or manual provisioning, and: +# - the cron job would fail under `set -euo pipefail` every 5 min if +# docker-compose.tls.yml were missing, and +# - the caddy service in docker-compose.yml bind-mounts ./Caddyfile:ro, +# so without it on disk Docker auto-creates an empty directory there +# and Caddy crash-loops while the overlay has already closed :8000. +# Cheap to keep on disk either way. +curl -fsSL "$${RAW_BASE}/docker-compose.tls.yml" -o docker-compose.tls.yml +curl -fsSL "$${RAW_BASE}/Caddyfile" -o Caddyfile # --- 4. 
Fetch secrets from Secret Manager — fail loudly if missing --- KEBOOLA_TOKEN="" @@ -161,28 +168,23 @@ docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG up -d # --- 6. Auto-upgrade via cron (pulls new image digest every 5 min) --- if [ "$UPGRADE_MODE" = "auto" ]; then - # Cron script sources /opt/agnes/.env for AGNES_TAG — so if operator edits .env - # (e.g. to pin a specific stable-YYYY.MM.N), cron picks it up immediately. No - # drift between what compose up reads and what the digest-check inspects. - cat > /usr/local/bin/agnes-auto-upgrade.sh <<'SCRIPTEOF' -#!/bin/bash -# Runs from cron — pulls new image if one is available, restarts containers. -set -euo pipefail -cd /opt/agnes -# Source .env so AGNES_TAG reflects any operator edits since boot. -# shellcheck disable=SC1091 -set -a; . /opt/agnes/.env; set +a -IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable}" -COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml" -BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) -docker compose $COMPOSE_FILES pull >/dev/null 2>&1 -AFTER=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) -if [ "$BEFORE" != "$AFTER" ]; then - echo "$(date): new image digest for $IMAGE — recreating containers" - docker compose $COMPOSE_FILES up -d - docker image prune -f >/dev/null 2>&1 -fi -SCRIPTEOF + # Single-source the cron script from the OSS repo's main branch instead + # of inlining a copy here. Two reasons: + # 1. Drift prevention — earlier inline copy missed several iterations + # of the canonical script (TLS overlay detection, array-form compose + # files, config-disk fail-fast guard). + # 2. Re-fetched on every VM boot, so script-only fixes propagate + # without an infra recreate. For immediate rollout to running VMs, + # operators can also re-run this fetch by hand. + # + # Coupling note: this URL is pinned to `main` while compose files above + # honor $COMPOSE_REF. 
If a future canonical script references a NEW + # compose file, the fetch list above MUST be updated to match — pinned- + # ref VMs would otherwise break on the next cron tick. Treat the docker- + # compose.* fetch list as the contract that agnes-auto-upgrade.sh relies + # on; new compose files referenced from main need a corresponding fetch. + SCRIPT_URL="https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/scripts/ops/agnes-auto-upgrade.sh" + curl -fsSL --retry 3 --retry-delay 2 "$SCRIPT_URL" -o /usr/local/bin/agnes-auto-upgrade.sh chmod +x /usr/local/bin/agnes-auto-upgrade.sh # Install cron entry idempotently: remove any prior agnes-auto-upgrade line, then append ours. diff --git a/pyproject.toml b/pyproject.toml index 280e24b..7b29d22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.24.0" +version = "0.25.0" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT" diff --git a/scripts/ops/agnes-auto-upgrade.sh b/scripts/ops/agnes-auto-upgrade.sh index 674d638..537f2cf 100755 --- a/scripts/ops/agnes-auto-upgrade.sh +++ b/scripts/ops/agnes-auto-upgrade.sh @@ -10,6 +10,45 @@ set -euo pipefail cd /opt/agnes # shellcheck disable=SC1091 set -a; . /opt/agnes/.env; set +a + +# Fail-fast guard: if the VM has a config disk attached, it MUST be +# mounted at /data/state before any container action. Otherwise the +# app would write state onto /data (sdb) and lose it on the next +# container recreate — the regression that motivated this guard. +# Three retries (mount may race with udev on cold boot) then hard exit. 
+CONFIG_DEVICE=/dev/disk/by-id/google-config-disk +if [ -e "$CONFIG_DEVICE" ]; then + attempt=0 + while [ $attempt -lt 3 ]; do + attempt=$((attempt + 1)) + if mountpoint -q /data/state; then + expected_dev=$(readlink -f "$CONFIG_DEVICE") + actual_dev=$(findmnt -n -o SOURCE /data/state) + if [ "$expected_dev" = "$actual_dev" ]; then + break + fi + logger -t agnes-auto-upgrade "WARN: /data/state on $actual_dev, expected $expected_dev — attempting remount" + umount /data/state 2>/dev/null || true + fi + mount "$CONFIG_DEVICE" /data/state 2>/dev/null || true + sleep $((attempt * 2)) + done + + if ! mountpoint -q /data/state || \ + [ "$(readlink -f "$CONFIG_DEVICE")" != "$(findmnt -n -o SOURCE /data/state)" ]; then + logger -t agnes-auto-upgrade "FATAL: config disk not mounted at /data/state — refusing to start containers" + echo "FATAL: /data/state is not backed by the config disk." >&2 + echo " Refusing to run docker compose — app state must NEVER land on /data (sdb)." >&2 + echo " Inspect: mount | grep /data/state ; ls /dev/disk/by-id/google-config-disk" >&2 + exit 1 + fi + + # Re-apply propagation in case a prior container teardown reset it. + # Idempotent — safe to call when already private. + mount --make-rprivate /data 2>/dev/null || true + mount --make-rprivate /data/state 2>/dev/null || true +fi + IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}" # Array form (vs. word-split string) — quoted expansion survives paths # with spaces and is the modern bash idiom. Functionally identical here @@ -20,10 +59,15 @@ PROFILE_ARGS=() # rotate.sh wrote a 0-byte cert and exited (or got SIGKILLed mid-write). # Bringing up the tls profile against an empty cert would just crash # Caddy on start; better to fall back to plain :8000 until rotate -# regenerates real bytes. -if [ -s /data/state/certs/fullchain.pem ] && [ -s /data/state/certs/privkey.pem ]; then +# regenerates real bytes. 
Same `-s` rule for Caddyfile: without it (or +# with an empty one) the caddy service crash-loops while the tls overlay +# has already closed :8000 — net effect is "app unreachable". Skipping +# the overlay keeps the app on plain :8000 until config lands. +if [ -s /data/state/certs/fullchain.pem ] && [ -s /data/state/certs/privkey.pem ] && [ -s Caddyfile ]; then COMPOSE_FILES+=( -f docker-compose.tls.yml ) PROFILE_ARGS=( --profile tls ) +elif [ -s /data/state/certs/fullchain.pem ] && [ -s /data/state/certs/privkey.pem ]; then + logger -t agnes-auto-upgrade "WARN: certs present but Caddyfile missing/empty — skipping tls overlay" fi BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) docker compose "${COMPOSE_FILES[@]}" pull >/dev/null 2>&1