agnes-the-ai-analyst/scripts/ops/agnes-tls-rotate.sh
ZdenekSrotyr f3d252f17d
fix(tls-rotate): chown CERT_DIR to UID 999 so the app container can read its own certs (#143)
The script's `mkdir -p` left ownership of `/data/state/certs/` to whichever
process won the create race — root when systemd's timer fired before the
app container's first volume init, UID 999 when the container ran first.
With mode 700, a root-owned dir blocks the UID-999 agnes container from
reading its own fullchain.pem; `_read_agnes_ca_pem()` returns None, and
the cross-platform TLS trust block (Step 0 from PR #137) silently
disappears from the /install setup prompt. Operators on the unlucky-race
VMs got a setup prompt that couldn't bootstrap client trust against the
self-signed host. Existing VMs self-heal on next timer tick.
2026-04-30 13:21:59 +02:00

196 lines
8.9 KiB
Bash
Executable file

#!/bin/bash
# Deployed to /usr/local/bin/agnes-tls-rotate.sh on the VM by the infra
# repo startup.sh. A systemd timer fires it daily.
#
# Corp security rotates certs at stable URLs (TLS_FULLCHAIN_URL,
# TLS_PRIVKEY_URL in /opt/agnes/.env). This script refetches, compares
# the fetched bytes against the on-disk copy via cmp, atomically replaces
# changed files, and sends SIGUSR1 to caddy for a zero-downtime reload.
# No-op when cert has not moved.
#
# TLS_PRIVKEY_URL is optional — leave empty when the key is provisioned
# once per VM (e.g. from Secret Manager at boot) and reused across
# cert rotations.
#
# Self-signed fallback: when TLS_FULLCHAIN_URL returns no data (security
# dept hasn't published the real cert yet) AND no fullchain.pem exists
# on disk, generate a 30-day self-signed cert against the same privkey.
# Because Security signs the eventual real cert against the CSR
# produced from this same key, the key never changes — the rotate tick
# after publication just swaps the fullchain file, SIGUSR1-reloads
# Caddy, and clients start seeing the real chain with zero downtime.
# Browsers see a self-signed warning in the meantime — acceptable for
# the bring-up window, and the only way to get Caddy up before the
# real cert exists without splitting into two code paths.
set -euo pipefail

# No core dumps from this script: openssl holds the unencrypted private key
# in process memory, and a crash dump would expose it to anyone able to read
# the coredump directory. The script is short-lived and never needs one.
ulimit -c 0

cd /opt/agnes

# Load deployment settings. allexport is switched on only while .env is
# sourced, so every variable assigned there is exported to child processes.
set -a
# shellcheck disable=SC1091
. /opt/agnes/.env
set +a

# Without a fullchain URL there is nothing to rotate; exit successfully.
if [ -z "${TLS_FULLCHAIN_URL:-}" ]; then
  echo "TLS_FULLCHAIN_URL empty — nothing to rotate"
  exit 0
fi

CERT_DIR=/data/state/certs
mkdir -p "$CERT_DIR"

# Pin ownership to the agnes UID baked into the app image (Dockerfile:
# useradd --uid 999). Otherwise ownership depends on who created the
# directory first — this script (root) or the app container (999) — and a
# root-owned mode-700 directory locks the container out of its own certs.
# The chown is best-effort (`|| true`): if it fails we carry on and let a
# later step surface the resulting permission error.
chown 999:999 "$CERT_DIR" || true
chmod 700 "$CERT_DIR"

# CHANGED flips to 1 once any cert/key file is replaced; TMP is the scratch
# file that refetch downloads into, removed on every exit path.
CHANGED=0
TMP=$(mktemp)
trap 'rm -f "$TMP"' EXIT
#######################################
# Fetch a PEM file and publish it at its destination if it changed.
# Globals:
#   TMP     (read)    scratch file that tls-fetch.sh downloads into
#   CHANGED (written) set to 1 when the destination was replaced
# Arguments:
#   $1 - source URL handed to tls-fetch.sh
#   $2 - destination path
#   $3 - permission mode for the destination (also passed to tls-fetch.sh)
#   $4 - kind label passed through to tls-fetch.sh (callers use key / cert)
# Outputs:
#   One "rotated <file>" line on stdout when the destination is replaced.
# Returns:
#   0 if the destination is up to date (replaced or already identical),
#   1 if the fetch or the install failed; the destination is left untouched.
#######################################
refetch() {
  local url="$1" dest="$2" mode="$3" kind="$4"
  local staged

  # IMPORTANT: tls-fetch.sh may fail (404, empty body, auth error, invalid
  # PEM, redirect attempt). Callers invoke us as `if ! refetch ...`, and bash
  # disables `set -e` for everything inside a condition, so every step here
  # must check its own exit status. Without the explicit check we would fall
  # through and overwrite $dest with whatever stale bytes the PREVIOUS
  # refetch call left in $TMP (historically: a fullchain file filled with
  # privkey bytes). Return 1 so the caller's fallback branch fires cleanly.
  if ! /usr/local/bin/tls-fetch.sh "$url" "$TMP" "$mode" "$kind"; then
    return 1
  fi

  # Nothing to do when the destination already holds identical bytes.
  if [ -f "$dest" ] && cmp -s "$TMP" "$dest"; then
    return 0
  fi

  # Stage next to the destination (same filesystem) and rename into place so
  # a concurrent reader never observes a missing or half-written file.
  staged=$(mktemp "$dest.XXXXXX") || return 1
  if ! install -m "$mode" "$TMP" "$staged" || ! mv -f "$staged" "$dest"; then
    rm -f "$staged"
    return 1
  fi

  echo "$(date -Is) rotated $(basename "$dest")"
  CHANGED=1
}
# Private key handling.
#
# Three modes (decided per-VM in the infra repo's local.vm_tls):
#
# 1. TLS_PRIVKEY_URL set (sm://, gs://, https://, file://) — fetch it
# every rotate tick. Used by VMs that keep the key in Secret
# Manager or similar for VM-replace resilience (legacy pattern).
#
# 2. TLS_PRIVKEY_URL empty AND $CERT_DIR/privkey.pem already on disk
# — reuse the on-disk key, never fetch. The file survives the VM
# for the lifetime of /data's persistence.
#
# 3. TLS_PRIVKEY_URL empty AND no on-disk key — generate an RSA-2048
# key + a CSR against $DOMAIN in place. This is the "fresh VM"
# bring-up path: the key never leaves the VM, and the CSR is
# written to $CERT_DIR/cert.csr for the operator to grab via
# `gcloud compute ssh … sudo cat /data/state/certs/cert.csr` and
# attach to the SECURITY Jira that requests public-cert signing.
# Until Security publishes the real fullchain, the self-signed
# fallback below keeps Caddy serving HTTPS against this same key.
if [ -n "${TLS_PRIVKEY_URL:-}" ]; then
  # Key is fetched from TLS_PRIVKEY_URL on every tick. A failed fetch is
  # tolerated only when a non-empty cached key is already on disk.
  if ! refetch "$TLS_PRIVKEY_URL" "$CERT_DIR/privkey.pem" 600 key; then
    if [ ! -s "$CERT_DIR/privkey.pem" ]; then
      echo "ERROR: privkey fetch failed and no cached copy exists — aborting" >&2
      exit 1
    fi
    echo "$(date -Is) privkey fetch failed; keeping cached $CERT_DIR/privkey.pem"
  fi
elif [ ! -s "$CERT_DIR/privkey.pem" ]; then
  # No URL and no key on disk: generate a key + CSR locally.
  CN="${DOMAIN:-localhost}"
  # Site-specific CSR subject (C/ST/L/O fields) comes from
  # TLS_CSR_SUBJECT in /opt/agnes/.env — the deployer's infra layer
  # writes it with its PKI conventions. This script stays generic;
  # default to a minimal /CN=<hostname> when the var is unset so the
  # CSR is still syntactically valid but carries no org metadata the
  # deployer didn't choose.
  SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
  echo "$(date -Is) no privkey — generating RSA-2048 key + CSR (subject: $SUBJECT)"
  umask 077
  # Generate into a private staging directory inside $CERT_DIR and publish
  # only on success. openssl writes the key before it builds the request, so
  # generating in place could leave a key without a CSR if the request step
  # fails (e.g. a malformed TLS_CSR_SUBJECT); every later run would then see
  # a key on disk, skip this branch, and never produce the CSR.
  rm -rf -- "${CERT_DIR:?}"/.keygen.*
  KEYGEN_DIR=$(mktemp -d "$CERT_DIR/.keygen.XXXXXX")
  CSR_CONF="$KEYGEN_DIR/req.cnf"
  cat > "$CSR_CONF" <<CFG
[ req ]
prompt = no
distinguished_name = req_distinguished_name
req_extensions = ext
[ req_distinguished_name ]
CN = $CN
[ ext ]
keyUsage = digitalSignature, keyEncipherment
extendedKeyUsage = serverAuth
subjectAltName = @subject_alt_names
[ subject_alt_names ]
DNS.1 = $CN
CFG
  # stderr is kept out of the log on success (key-generation progress noise)
  # but shown on failure so the operator can see why openssl refused.
  if ! openssl req -newkey rsa:2048 \
    -keyout "$KEYGEN_DIR/privkey.pem" \
    -out "$KEYGEN_DIR/cert.csr" \
    -subj "$SUBJECT" \
    -config "$CSR_CONF" -extensions ext -nodes 2>"$KEYGEN_DIR/openssl.err"; then
    cat "$KEYGEN_DIR/openssl.err" >&2
    rm -rf "$KEYGEN_DIR"
    echo "ERROR: openssl failed to generate privkey + CSR (subject: $SUBJECT) — aborting" >&2
    exit 1
  fi
  chmod 600 "$KEYGEN_DIR/privkey.pem"
  chmod 644 "$KEYGEN_DIR/cert.csr"
  # Publish the CSR first and the key last: a key on disk then always implies
  # its CSR is there too.
  mv -f "$KEYGEN_DIR/cert.csr" "$CERT_DIR/cert.csr"
  mv -f "$KEYGEN_DIR/privkey.pem" "$CERT_DIR/privkey.pem"
  rm -rf "$KEYGEN_DIR"
  echo "$(date -Is) privkey.pem + cert.csr written to $CERT_DIR"
  echo "$(date -Is) ACTION: send $CERT_DIR/cert.csr to your certificate authority for signing — the CSR is public and safe to transit; the key never leaves this VM."
fi
# Real cert fetch. On failure, fall back to self-signed IFF no
# fullchain exists yet. If one exists (prior real OR prior self-signed)
# keep it — a transient fetch failure should not churn certs.
if ! refetch "$TLS_FULLCHAIN_URL" "$CERT_DIR/fullchain.pem" 644 cert; then
  if [ ! -s "$CERT_DIR/fullchain.pem" ]; then
    echo "$(date -Is) real cert unavailable at $TLS_FULLCHAIN_URL — generating 30-day self-signed"
    if [ ! -s "$CERT_DIR/privkey.pem" ]; then
      echo "ERROR: no privkey available — cannot self-sign" >&2
      exit 1
    fi
    CN="${DOMAIN:-localhost}"
    # Same parametrisation as the CSR branch above — site-specific PKI
    # fields belong in the deployer's .env, not in this script. Keeps
    # the self-signed bring-up cert consistent with whatever the eventual
    # CA-signed cert will say.
    SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
    # Capture openssl's diagnostics instead of discarding them: they stay out
    # of the log on success, but on failure the operator gets the reason
    # rather than a silent non-zero exit.
    if ! SELFSIGN_ERR=$(openssl req -x509 -new -key "$CERT_DIR/privkey.pem" \
      -out "$CERT_DIR/fullchain.pem" -days 30 \
      -subj "$SUBJECT" \
      -addext "subjectAltName=DNS:$CN" \
      -addext "keyUsage=digitalSignature,keyEncipherment" \
      -addext "extendedKeyUsage=serverAuth" 2>&1); then
      printf '%s\n' "$SELFSIGN_ERR" >&2
      # Only reached when no usable fullchain existed, so anything at this
      # path is partial output from the failed run — remove it so the next
      # tick does not mistake it for a cached cert.
      rm -f "$CERT_DIR/fullchain.pem"
      echo "ERROR: openssl failed to generate self-signed cert (subject: $SUBJECT) — aborting" >&2
      exit 1
    fi
    chmod 644 "$CERT_DIR/fullchain.pem"
    echo "$(date -Is) self-signed fullchain.pem installed (CN=$CN)"
    CHANGED=1
  else
    echo "$(date -Is) fetch failed but cached fullchain.pem exists — keeping it"
  fi
fi
# Apply the new cert/key: signal a running caddy, or bring the tls profile up
# when caddy is not running yet. Skipped entirely when nothing changed.
if [ "$CHANGED" -eq 1 ]; then
  # Array form (vs. word-split string) — quoted expansion is the
  # modern bash idiom for arg lists, defensive against future filename
  # weirdness. ps --status flag requires Compose v2.6.1+; if your VMs
  # are older, replace with `ps --format '{{.Service}} {{.State}}'`
  # and filter on the State column.
  COMPOSE_FILES=( -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml -f docker-compose.tls.yml )
  # Capture the service list before matching. Piping straight into
  # `grep -q` under `pipefail` is racy: grep exits on the first match, the
  # producer can then die on SIGPIPE, and the pipeline reports failure even
  # though caddy IS running.
  if RUNNING_SERVICES=$(docker compose "${COMPOSE_FILES[@]}" --profile tls ps --status=running --format '{{.Service}}' 2>/dev/null) \
    && grep -q '^caddy$' <<<"$RUNNING_SERVICES"; then
    # Caddy running — graceful reload via SIGUSR1 picks up the new
    # cert without dropping connections.
    # NOTE(review): upstream Caddy v2 documents SIGUSR1 as ignored (reloads
    # go through `caddy reload` or the admin API). Confirm the caddy image /
    # entrypoint used by docker-compose.tls.yml really reloads on this signal.
    if docker compose "${COMPOSE_FILES[@]}" --profile tls kill -s SIGUSR1 caddy >/dev/null 2>&1; then
      echo "$(date -Is) caddy reloaded"
    else
      echo "$(date -Is) caddy reload signal failed"
    fi
  else
    # Caddy not running yet — first time certs land on this VM, or
    # operator hasn't brought up the tls profile yet. Flip the stack
    # in place so this script is self-sufficient: no separate manual
    # `docker compose up` step after seeding certs.
    echo "$(date -Is) caddy not running — bringing tls profile up"
    docker compose "${COMPOSE_FILES[@]}" --profile tls up -d 2>&1 | tail -5
  fi
fi