#!/bin/bash
# Deployed to /usr/local/bin/agnes-tls-rotate.sh on the VM by the infra
# repo startup.sh. A systemd timer fires it daily.
#
# Corp security rotates certs at stable URLs (TLS_FULLCHAIN_URL,
# TLS_PRIVKEY_URL in /opt/agnes/.env). This script refetches, compares
# the fetched bytes against the on-disk file via cmp, atomically replaces
# changed files (stage in the same directory, then rename), and sends
# SIGUSR1 to caddy for a zero-downtime reload. No-op when cert has not
# moved.
#
# TLS_PRIVKEY_URL is optional — leave empty when the key is provisioned
# once per VM (e.g. from Secret Manager at boot) and reused across
# cert rotations.
#
# Self-signed fallback: when TLS_FULLCHAIN_URL returns no data (security
# dept hasn't published the real cert yet) AND no fullchain.pem exists
# on disk, generate a 30-day self-signed cert against the same privkey.
# Because Security signs the eventual real cert against the CSR
# produced from this same key, the key never changes — the rotate tick
# after publication just swaps the fullchain file, SIGUSR1-reloads
# Caddy, and clients start seeing the real chain with zero downtime.
# Browsers see a self-signed warning in the meantime — acceptable for
# the bring-up window, and the only way to get Caddy up before the
# real cert exists without splitting into two code paths.
#
# Environment (read from /opt/agnes/.env):
#   TLS_FULLCHAIN_URL  required; the script exits 0 when it is empty
#   TLS_PRIVKEY_URL    optional; see "Private key handling" below
#   DOMAIN             optional; CN / SAN for generated CSR and self-signed
#                      cert (defaults to "localhost")
#   TLS_CSR_SUBJECT    optional; full openssl -subj string (defaults to /CN=$DOMAIN)

set -euo pipefail

# Disable core dumps for this script. openssl runs with the unencrypted
# privkey in process memory; a SIGSEGV core file would leak it to whoever
# can read /var/lib/systemd/coredump (typically root + adm group). Cheap
# defence in depth — this script is short-lived and has no debug needs.
ulimit -c 0

cd /opt/agnes
# shellcheck disable=SC1091
set -a; . /opt/agnes/.env; set +a

[ -n "${TLS_FULLCHAIN_URL:-}" ] || { echo "TLS_FULLCHAIN_URL empty — nothing to rotate"; exit 0; }

CERT_DIR=/data/state/certs
mkdir -p "$CERT_DIR"
# Match the agnes UID baked into the app image (Dockerfile: useradd --uid 999).
# Without this, whoever happens to win the create race (this script as root
# vs. the app container's first volume-init touch as 999) decides ownership;
# when root wins, mode 700 leaves the container unable to read its own certs
# and `_read_agnes_ca_pem()` silently returns None, suppressing the trust-
# bootstrap block in the /install setup prompt. `|| true` keeps the script
# resilient on hosts where the GID is reserved (chgrp on a non-existent
# numeric GID is fine on Linux but pedantically fails on some BSD-derived
# tooling); if the chown itself fails we keep going and surface the
# resulting permission error from the next refetch step instead.
chown 999:999 "$CERT_DIR" || true
chmod 700 "$CERT_DIR"

CHANGED=0
TMP=$(mktemp)

# Every temp/staging file created by this script is appended here so one
# EXIT trap removes them on any exit path (including set -e aborts).
# Staging files that were already renamed into place no longer exist
# under their temp name, so `rm -f` on them is a harmless no-op.
CLEANUP_FILES=("$TMP")
cleanup() { rm -f -- "${CLEANUP_FILES[@]}"; }
trap cleanup EXIT

#######################################
# Atomically install a file: copy it to a staging file next to the
# destination (same filesystem), then rename over the destination, so
# readers see either the old or the new content and never a missing or
# partial file.
# Globals:   CLEANUP_FILES (appended)
# Arguments: $1 - source file
#            $2 - destination path
#            $3 - octal mode for the destination
# Returns:   0 on success, 1 if staging, copy, or rename failed
#######################################
install_atomic() {
    local src="$1" dest="$2" mode="$3" staged
    # Explicit status checks: this runs beneath `if ! refetch`, where
    # bash ignores `set -e`.
    staged=$(mktemp "$dest.XXXXXX") || return 1
    CLEANUP_FILES+=("$staged")
    install -m "$mode" "$src" "$staged" || return 1
    mv -f -- "$staged" "$dest" || return 1
}

#######################################
# Fetch a PEM from a URL and install it when it differs from the copy
# on disk.
# Globals:   TMP (scratch file, overwritten), CHANGED (set to 1 on rotate)
# Arguments: $1 - source URL, passed to tls-fetch.sh
#            $2 - destination path
#            $3 - octal file mode for the destination
#            $4 - kind, passed through to tls-fetch.sh ("key" or "cert")
# Outputs:   a "rotated <file>" line on stdout when the file was replaced
# Returns:   0 when the destination is up to date or was replaced,
#            1 when the fetch or the install failed
#######################################
refetch() {
    local url="$1" dest="$2" mode="$3" kind="$4"
    # IMPORTANT: tls-fetch.sh may fail (404, empty body, auth error,
    # invalid PEM, redirect attempt). When the caller sits behind
    # `if ! refetch`, bash disables `set -e` for everything inside the
    # condition — so without an explicit exit-code check we would fall
    # through to the install step and overwrite $dest with whatever stale
    # bytes the PREVIOUS refetch call left in $TMP. That turned the
    # "fullchain unavailable → fall back to self-signed" branch into a
    # "fullchain file filled with privkey bytes" bug. Check explicitly and
    # return 1 on any fetch failure so the caller's fallback branch fires
    # cleanly.
    if ! /usr/local/bin/tls-fetch.sh "$url" "$TMP" "$mode" "$kind"; then
        return 1
    fi
    if [ ! -f "$dest" ] || ! cmp -s "$TMP" "$dest"; then
        # Same errexit caveat applies to the install: a failed write must
        # not be reported as a rotation or trigger a Caddy reload.
        if ! install_atomic "$TMP" "$dest" "$mode"; then
            echo "ERROR: could not install $dest" >&2
            return 1
        fi
        echo "$(date -Is) rotated $(basename "$dest")"
        CHANGED=1
    fi
}

# Private key handling.
#
# Three modes (decided per-VM in the infra repo's local.vm_tls):
#
# 1. TLS_PRIVKEY_URL set (sm://, gs://, https://, file://) — fetch it
#    every rotate tick. Used by VMs that keep the key in Secret
#    Manager or similar for VM-replace resilience (legacy pattern).
#
# 2. TLS_PRIVKEY_URL empty AND $CERT_DIR/privkey.pem already on disk
#    — reuse the on-disk key, never fetch. The file survives the VM
#    for the lifetime of /data's persistence.
#
# 3. TLS_PRIVKEY_URL empty AND no on-disk key — generate an RSA-2048
#    key + a CSR against $DOMAIN in place. This is the "fresh VM"
#    bring-up path: the key never leaves the VM, and the CSR is
#    written to $CERT_DIR/cert.csr for the operator to grab via
#    `gcloud compute ssh … sudo cat /data/state/certs/cert.csr` and
#    attach to the SECURITY Jira that requests public-cert signing.
#    Until Security publishes the real fullchain, the self-signed
#    fallback below keeps Caddy serving HTTPS against this same key.
if [ -n "${TLS_PRIVKEY_URL:-}" ]; then
    if ! refetch "$TLS_PRIVKEY_URL" "$CERT_DIR/privkey.pem" 600 key; then
        if [ ! -s "$CERT_DIR/privkey.pem" ]; then
            echo "ERROR: privkey fetch failed and no cached copy exists — aborting" >&2
            exit 1
        fi
        echo "$(date -Is) privkey fetch failed; keeping cached $CERT_DIR/privkey.pem"
    fi
elif [ ! -s "$CERT_DIR/privkey.pem" ]; then
    CN="${DOMAIN:-localhost}"
    # Site-specific CSR subject (C/ST/L/O fields) comes from
    # TLS_CSR_SUBJECT in /opt/agnes/.env — the deployer's infra layer
    # writes it with its PKI conventions. This script stays generic;
    # default to a minimal /CN=<domain> when the var is unset so the
    # CSR is still syntactically valid but carries no org metadata the
    # deployer didn't choose.
    SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
    echo "$(date -Is) no privkey — generating RSA-2048 key + CSR (subject: $SUBJECT)"
    CSR_CONF=$(mktemp)
    CLEANUP_FILES+=("$CSR_CONF")
    # NOTE(review): the openssl config body and the `openssl req`
    # invocation below were unreadable in the reviewed copy of this file
    # and are reconstructed from the surrounding comments (RSA-2048,
    # unencrypted key, SAN = DNS:$CN, subject from $SUBJECT). Confirm
    # against the deployed script before merging.
    cat > "$CSR_CONF" <<EOF
[req]
distinguished_name = req_dn
req_extensions = v3_req
prompt = no

[req_dn]
CN = $CN

[v3_req]
subjectAltName = DNS:$CN
EOF
    # Stage both outputs and publish the key LAST: if CSR creation fails
    # (e.g. malformed TLS_CSR_SUBJECT) no privkey.pem is left behind, so
    # the next tick retries generation instead of taking the "reuse
    # on-disk key" path with no CSR ever written.
    KEY_STAGED=$(mktemp "$CERT_DIR/privkey.pem.XXXXXX")
    CSR_STAGED=$(mktemp "$CERT_DIR/cert.csr.XXXXXX")
    CLEANUP_FILES+=("$KEY_STAGED" "$CSR_STAGED")
    # stderr is discarded because openssl prints key-generation progress
    # there; the explicit check keeps a failure from being silent.
    if ! openssl req -new -newkey rsa:2048 -nodes \
            -keyout "$KEY_STAGED" \
            -out "$CSR_STAGED" \
            -subj "$SUBJECT" \
            -config "$CSR_CONF" 2>/dev/null; then
        echo "ERROR: openssl failed to generate privkey + CSR (subject: $SUBJECT)" >&2
        exit 1
    fi
    chmod 600 "$KEY_STAGED"
    chmod 644 "$CSR_STAGED"
    mv -f -- "$CSR_STAGED" "$CERT_DIR/cert.csr"
    mv -f -- "$KEY_STAGED" "$CERT_DIR/privkey.pem"
    rm -f "$CSR_CONF"
    echo "$(date -Is) privkey.pem + cert.csr written to $CERT_DIR"
    echo "$(date -Is) ACTION: send $CERT_DIR/cert.csr to your certificate authority for signing — the CSR is public and safe to transit; the key never leaves this VM."
fi

# Real cert fetch. On failure, fall back to self-signed IFF no
# fullchain exists yet. If one exists (prior real OR prior self-signed)
# keep it — a transient fetch failure should not churn certs.
if ! refetch "$TLS_FULLCHAIN_URL" "$CERT_DIR/fullchain.pem" 644 cert; then
    if [ ! -s "$CERT_DIR/fullchain.pem" ]; then
        echo "$(date -Is) real cert unavailable at $TLS_FULLCHAIN_URL — generating 30-day self-signed"
        if [ ! -s "$CERT_DIR/privkey.pem" ]; then
            echo "ERROR: no privkey available — cannot self-sign" >&2
            exit 1
        fi
        CN="${DOMAIN:-localhost}"
        # Same parametrisation as the CSR branch above — site-specific PKI
        # fields belong in the deployer's .env, not in this script. Keeps
        # the self-signed bring-up cert consistent with whatever the eventual
        # CA-signed cert will say.
        SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
        # Write to a staging file and rename: a failed openssl run must not
        # leave a partial, non-empty fullchain.pem that later ticks would
        # treat as a valid cached cert and keep forever.
        SELF_SIGNED_STAGED=$(mktemp "$CERT_DIR/fullchain.pem.XXXXXX")
        CLEANUP_FILES+=("$SELF_SIGNED_STAGED")
        if ! openssl req -x509 -new -key "$CERT_DIR/privkey.pem" \
                -out "$SELF_SIGNED_STAGED" -days 30 \
                -subj "$SUBJECT" \
                -addext "subjectAltName=DNS:$CN" \
                -addext "basicConstraints=critical,CA:FALSE" \
                -addext "keyUsage=digitalSignature,keyEncipherment" \
                -addext "extendedKeyUsage=serverAuth" 2>/dev/null; then
            echo "ERROR: openssl failed to generate self-signed cert (subject: $SUBJECT)" >&2
            exit 1
        fi
        chmod 644 "$SELF_SIGNED_STAGED"
        mv -f -- "$SELF_SIGNED_STAGED" "$CERT_DIR/fullchain.pem"
        echo "$(date -Is) self-signed fullchain.pem installed (CN=$CN)"
        CHANGED=1
    else
        echo "$(date -Is) fetch failed but cached fullchain.pem exists — keeping it"
    fi
fi

if [ "$CHANGED" -eq 1 ]; then
    # Array form (vs. word-split string) — quoted expansion is the
    # modern bash idiom for arg lists, defensive against future filename
    # weirdness. ps --status flag requires Compose v2.6.1+; if your VMs
    # are older, replace with `ps --format '{{.Service}} {{.State}}'`
    # and filter on the State column.
    COMPOSE_FILES=(
        -f docker-compose.yml
        -f docker-compose.prod.yml
        -f docker-compose.host-mount.yml
        -f docker-compose.tls.yml
    )
    # Capture the service list before matching. Piping straight into
    # `grep -q` under `pipefail` can SIGPIPE docker compose once grep
    # exits on its first match, making the pipeline "fail" and wrongly
    # selecting the "caddy not running" branch (which skips the reload).
    # A failing `ps` yields an empty list, i.e. the same "not running"
    # outcome the pipeline form produced.
    RUNNING_SERVICES=$(docker compose "${COMPOSE_FILES[@]}" --profile tls ps --status=running --format '{{.Service}}' 2>/dev/null) || RUNNING_SERVICES=""
    if grep -q '^caddy$' <<<"$RUNNING_SERVICES"; then
        # Caddy running — graceful reload via SIGUSR1 picks up the new
        # cert without dropping connections.
        if docker compose "${COMPOSE_FILES[@]}" --profile tls kill -s SIGUSR1 caddy >/dev/null 2>&1; then
            echo "$(date -Is) caddy reloaded"
        else
            echo "$(date -Is) caddy reload signal failed"
        fi
    else
        # Caddy not running yet — first time certs land on this VM, or
        # operator hasn't brought up the tls profile yet. Flip the stack
        # in place so this script is self-sufficient: no separate manual
        # `docker compose up` step after seeding certs.
        echo "$(date -Is) caddy not running — bringing tls profile up"
        docker compose "${COMPOSE_FILES[@]}" --profile tls up -d 2>&1 | tail -5
    fi
fi