From 0bbbf3e40bd110f87be6e21008a15a79fedf6a50 Mon Sep 17 00:00:00 2001 From: Vojtech <119944107+cvrysanek@users.noreply.github.com> Date: Sat, 25 Apr 2026 21:51:25 +0200 Subject: [PATCH] feat(tls): corporate-CA HTTPS with URL-driven rotation, on-VM CSR gen, self-signed fallback (#51) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the implicit Let's Encrypt flow with a general corporate-CA HTTPS path: - Caddy switches to cert-file mode (`tls /certs/fullchain.pem /certs/privkey.pem`) with HSTS + TLS 1.2/1.3 floor - New `docker-compose.tls.yml` overlay closes host `:8000` when Caddy fronts (no TLS bypass) - New `scripts/tls-fetch.sh` — generic URL fetcher for `sm://`, `gs://`, `https://`, `file://` with redirect refusal + PEM validation - New `scripts/grpn/agnes-tls-rotate.sh` — daily rotation, self-signed fallback against same key (zero key churn), on-VM RSA-2048 + CSR auto-gen, atomic swap, SIGUSR1 reload - `scripts/grpn/agnes-auto-upgrade.sh` becomes cert-aware (auto-enables tls overlay when certs present) - Compose profile `production` renamed to `tls` (aligns with DEPLOYMENT.md and infra startup) Pairs with FoundryAI/agnes-the-ai-analyst-infra#27 (merged) which wires per-VM `local.vm_tls`, writes `TLS_*` env vars into `.env`, auto-creates Secret Manager containers for `sm://` privkey URLs, and installs `agnes-tls-rotate.{service,timer}` for daily polling. 
Includes hardening + docs follow-ups from code review: - `TLS_CSR_SUBJECT` env-var parametrisation applied to both CSR and self-signed cert paths - curl `--max-redirs 0 --proto '=https'` + post-fetch PEM validation in `tls-fetch.sh` - `ulimit -c 0` + array-form `COMPOSE_FILES` (POSIX-safe, bash 3.2 compatible) - TLS section added to `config/.env.template` - Historical-note headers in `docs/superpowers/{plans,specs}/2026-04-09-*.md` flagging the profile rename --- CLAUDE.md | 6 + Caddyfile | 25 ++- README.md | 6 +- config/.env.template | 25 +++ docker-compose.tls.yml | 16 ++ docker-compose.yml | 7 +- docs/DEPLOYMENT.md | 54 ++++- .../plans/2026-04-09-deployment-readiness.md | 2 + ...-04-09-multi-instance-deployment-design.md | 2 + scripts/grpn/agnes-auto-upgrade.sh | 25 ++- scripts/grpn/agnes-tls-rotate.sh | 185 ++++++++++++++++++ scripts/tls-fetch.sh | 90 +++++++++ 12 files changed, 430 insertions(+), 13 deletions(-) create mode 100644 docker-compose.tls.yml create mode 100755 scripts/grpn/agnes-tls-rotate.sh create mode 100755 scripts/tls-fetch.sh diff --git a/CLAUDE.md b/CLAUDE.md index 04ce775..d2c661f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,8 +27,14 @@ Ask the user for: ```bash docker compose up # Start app + scheduler docker compose --profile full up # Include telegram bot + +# HTTPS mode — Caddy + corporate-CA certs at /data/state/certs +docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.tls.yml \ + --profile tls up -d ``` +See `docs/DEPLOYMENT.md` → **TLS** for cert provisioning + `scripts/grpn/agnes-tls-rotate.sh` (daily refetch from `TLS_FULLCHAIN_URL`, `SIGUSR1` reload on diff, no-op when unchanged). The infra repo's `startup.sh` installs this as a systemd timer automatically. + ## Project Structure ``` diff --git a/Caddyfile b/Caddyfile index ca4a5a7..5caa4e4 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,26 @@ {$DOMAIN:localhost} { - reverse_proxy app:8000 + # Cert-file mode (corporate CA path). 
For Let's Encrypt, drop the + # `tls` directive entirely so Caddy auto-issues. See docs/DEPLOYMENT.md. + tls /certs/fullchain.pem /certs/privkey.pem { + # Modern TLS only. Caddy default already excludes 1.0/1.1 in + # most builds, but pin explicitly so a future Caddy default + # change can't silently weaken our posture. + protocols tls1.2 tls1.3 + } + + # HSTS: tell compliant browsers to refuse plain-HTTP for this host + # for a year. Skipping `preload` so we keep an escape hatch (preload + # submission is hard-bound and blocks rollback). Skipping + # `includeSubDomains` because we don't control subdomains. + header Strict-Transport-Security "max-age=31536000" + + reverse_proxy app:8000 { + # App's uvicorn runs with --proxy-headers, so stamping these + # ourselves makes OAuth callback URLs and Set-Cookie Secure + # flags resolve to https consistently. X-Forwarded-Host is + # also Caddy's default, but pinning it explicitly insures + # against future default changes. + header_up X-Forwarded-Proto https + header_up X-Forwarded-Host {host} + } } diff --git a/README.md b/README.md index 4a57889..a389cba 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,13 @@ docker compose up # Start with all optional services (Telegram bot, etc.) docker compose --profile full up + +# Start with TLS (Caddy on :443 with corporate-CA certs from /data/state/certs) +docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.tls.yml \ + --profile tls up -d ``` -Once running, the FastAPI app is available at `http://localhost:8000`. Trigger a manual sync: +Once running, the FastAPI app is available at `http://localhost:8000` (or `https://$DOMAIN` in TLS mode). See [`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) for cert provisioning + auto-rotation via `scripts/grpn/agnes-tls-rotate.sh`. 
Trigger a manual sync: ```bash curl -X POST http://localhost:8000/api/sync/trigger diff --git a/config/.env.template b/config/.env.template index 7f0f52d..d45cb34 100644 --- a/config/.env.template +++ b/config/.env.template @@ -63,3 +63,28 @@ SESSION_SECRET= # python -c "import secrets; print(secrets.token_he # # Uvicorn is started with `--proxy-headers --forwarded-allow-ips='*'` so it # trusts X-Forwarded-Proto / X-Forwarded-For from the reverse proxy. + +# ── TLS TERMINATION (Caddy in cert-file mode) ─────── +# When TLS_FULLCHAIN_URL is set, scripts/grpn/agnes-tls-rotate.sh fetches +# the cert daily from this URL and reloads Caddy on diff (zero downtime). +# Empty -> no TLS, app serves plain HTTP on :8000. See docs/DEPLOYMENT.md +# -> TLS for the full bring-up flow. +# +# Supported URL schemes (all four are resolved by scripts/tls-fetch.sh): +# sm://SECRET_NAME Google Secret Manager (latest version) +# gs://BUCKET/OBJECT GCS object +# https://HOST/PATH Plain HTTPS download (no redirects allowed) +# file://PATH Local file (dev/testing only) +# +# TLS_FULLCHAIN_URL= +# +# TLS_PRIVKEY_URL: optional. Empty -> on-VM RSA-2048 key + CSR auto- +# generated on first rotate tick (key never leaves the host; CSR at +# /data/state/certs/cert.csr to submit to your CA). Set to a URL when +# you want VM-replace resilience (e.g. sm://SECRET_NAME). +# TLS_PRIVKEY_URL= +# +# TLS_CSR_SUBJECT: stamped on auto-generated CSRs and on the self-signed +# bring-up cert that Caddy serves until your CA publishes the real chain. +# Defaults to /CN=$DOMAIN when unset. Quote values that contain spaces: the rotate and upgrade scripts source this file with bash. +# TLS_CSR_SUBJECT="/C=US/ST=California/L=San Francisco/O=Your Org/CN=data.yourcompany.com" diff --git a/docker-compose.tls.yml b/docker-compose.tls.yml new file mode 100644 index 0000000..e78bab3 --- /dev/null +++ b/docker-compose.tls.yml @@ -0,0 +1,16 @@ +# TLS overlay — closes direct :8000 on host when Caddy fronts the app. +# +# Use alongside docker-compose.yml (+ docker-compose.prod.yml + +# docker-compose.host-mount.yml on VMs). 
Requires --profile tls to +# actually start the caddy service. +# +# Usage (VM): +# docker compose \ +# -f docker-compose.yml \ +# -f docker-compose.prod.yml \ +# -f docker-compose.host-mount.yml \ +# -f docker-compose.tls.yml \ +# --profile tls up -d +services: + app: + ports: !reset [] diff --git a/docker-compose.yml b/docker-compose.yml index 01a38f4..5f1317c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -111,6 +111,10 @@ services: - full restart: unless-stopped + # TLS reverse proxy. Corporate-CA certs mounted from /data/state/certs + # (managed by scripts/grpn/agnes-tls-rotate.sh on the VM). For local + # development without certs, run without --profile tls and hit :8000 + # directly. caddy: image: caddy:2-alpine ports: @@ -118,6 +122,7 @@ services: - "443:443" volumes: - ./Caddyfile:/etc/caddy/Caddyfile:ro + - /data/state/certs:/certs:ro - caddy_data:/data - caddy_config:/config environment: @@ -127,7 +132,7 @@ services: condition: service_healthy restart: unless-stopped profiles: - - production + - tls volumes: data: diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 029dfd9..e5f9f8c 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -16,7 +16,7 @@ Highlights: - Secret Manager for tokens (no plaintext in VM metadata) - OS Login for SSH, dedicated VM service account with scoped `secretAccessor` - Cron-based auto-upgrade (pulls `:stable` image digest every 5 min) -- Caddy + Let's Encrypt TLS (opt-in with domain) +- Caddy TLS with corporate-CA or self-managed certs mounted from `/data/state/certs`; daily auto-rotation from a URL (`TLS_FULLCHAIN_URL`) with zero-downtime `SIGUSR1` reload - Uptime check + alert policy per VM (wire a notification channel to be paged) - CI/CD in the private repo: PR → `terraform plan`, merge to main → `apply-dev` auto, `apply-prod` gated by reviewer - First-boot bootstrap via `POST /auth/bootstrap` @@ -88,12 +88,52 @@ For running Agnes on your own VM / bare metal without Terraform. 
You're responsi ### TLS (optional) -Set `DOMAIN` in `.env` + point your DNS A-record at the host, then start with the `tls` profile: +Caddy runs as the TLS terminator. It reads certs from `/data/state/certs/{fullchain,privkey}.pem` bind-mounted into the container. Two provisioning modes: -```bash -AGNES_DOMAIN=agnes.example.com ACME_EMAIL=admin@example.com \ - docker compose -f docker-compose.yml -f docker-compose.prod.yml --profile tls up -d -``` +**A. Public internet (Let's Encrypt)** — for this path, override the `Caddyfile` to drop the `tls` directive (so Caddy auto-issues) and skip steps below. Not covered here anymore; see git history prior to the `feat(tls)` change if you need the ACME flow. + +**B. Corporate CA / self-managed certs** (recommended, and what the infra repo ships): + +Two bring-up flows, picked by whether `TLS_PRIVKEY_URL` is set in `.env`: + +- **On-VM gen** (preferred for new deployments): leave `TLS_PRIVKEY_URL` empty. On first run, `agnes-tls-rotate.sh` generates an RSA-2048 key + CSR directly into `/data/state/certs/` using the subject string from `TLS_CSR_SUBJECT`. The key never leaves the host; the CSR (`/data/state/certs/cert.csr`) is what you submit to your corporate PKI. Until the CA signs and publishes, rotate falls back to a 30-day self-signed cert against the same key so Caddy can serve :443. +- **Pre-provisioned key** (legacy / VM-replace-resilient): set `TLS_PRIVKEY_URL=sm://` (or any supported scheme). Seed the key out-of-band before first rotate. Same real-cert fetch + self-signed fallback applies. + +Both modes converge: once the CA publishes the signed chain at `TLS_FULLCHAIN_URL`, the daily rotate tick atomically swaps the fullchain in place and `SIGUSR1`-reloads Caddy. Zero key churn, zero downtime, no reload when the URL content hasn't moved. + +1. 
Set the required env vars in `.env`: + ``` + DOMAIN=agnes.example.com + TLS_FULLCHAIN_URL=https://your-ca.example.com/agnes/fullchain.pem + TLS_PRIVKEY_URL= # empty → on-VM gen; or sm:// + TLS_CSR_SUBJECT=/C=…/ST=…/L=…/O=…/CN=agnes.example.com + ``` +2. Start with the `tls` profile + overlay (`docker-compose.tls.yml` closes host `:8000` so all traffic enters via `:443`): + ```bash + docker compose \ + -f docker-compose.yml \ + -f docker-compose.prod.yml \ + -f docker-compose.tls.yml \ + --profile tls up -d + ``` +3. Grab the CSR if you used on-VM gen: + ```bash + sudo cat /data/state/certs/cert.csr + ``` + Submit to your corporate PKI. While waiting, Caddy is already up on :443 with the self-signed fallback. + +#### Automatic rotation + +`scripts/grpn/agnes-tls-rotate.sh` is the single entry point — it handles fetch, self-signed fallback, auto-generation on missing key, atomic cert swap, and Caddy reload. Env vars it reads: + +| Var | Required | Schemes | Notes | +|---|---|---|---| +| `DOMAIN` | yes | — | The hostname Caddy serves + the CN in auto-generated CSRs. | +| `TLS_FULLCHAIN_URL` | yes | `https://`, `sm://`, `gs://`, `file://` | Polled daily; rotate only reloads Caddy when the bytes change. | +| `TLS_PRIVKEY_URL` | optional | same | Empty activates on-VM gen. Set to pre-provisioned scheme (e.g. `sm://`) for VM-replace resilience. | +| `TLS_CSR_SUBJECT` | optional | — | Stamped on auto-generated CSRs. Defaults to `/CN=` if unset. Example: `/C=US/ST=Illinois/L=Chicago/O=Your Org/CN=agnes.example.com`. | + +`scripts/tls-fetch.sh` at `/usr/local/bin/tls-fetch.sh` is required (generic URL fetcher used by rotate). On infra-repo-managed VMs, both scripts are installed by `startup.sh` and fired via a daily systemd timer; for manual compose deployments, copy them under `/usr/local/bin/` and wire a systemd timer (`OnBootSec=10min`, `OnUnitActiveSec=24h`, `Persistent=true`). 
### Upgrades (manual) @@ -116,7 +156,7 @@ Or set up a cron job — see `infra/modules/customer-instance/startup-script.sh. | Upgrades | Auto via cron, gated prod apply | Manual `docker compose pull` | | Backups | Daily GCP snapshots, 30-day retention | You set up yourself | | Monitoring / alerts | GCP Uptime Checks + alert policy | You set up yourself | -| TLS | Auto Caddy + LE | Auto Caddy + LE (same) | +| TLS | Caddy + corp cert, auto-rotated from URL | Caddy + corp cert, manual or user-scripted rotation | | Best for | Multi-tenant SaaS, production | Single-instance self-host, learning | ## Related documentation diff --git a/docs/superpowers/plans/2026-04-09-deployment-readiness.md b/docs/superpowers/plans/2026-04-09-deployment-readiness.md index d03451d..6727e2a 100644 --- a/docs/superpowers/plans/2026-04-09-deployment-readiness.md +++ b/docs/superpowers/plans/2026-04-09-deployment-readiness.md @@ -1,5 +1,7 @@ # Deployment & Multi-Instance Readiness Plan +> **Historical note (2026-04-24):** This plan is a snapshot from 2026-04-09. Some details have evolved — most notably, the Caddy `production` profile referenced throughout this document was renamed to `tls` (see PR #51). For the current deployment guide, follow `docs/DEPLOYMENT.md`. This file is preserved as design history. + > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. **Goal:** Make the platform deployable to N customer instances with minimal manual effort. 
diff --git a/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md b/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md index 9bfa44a..b8dfa25 100644 --- a/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md +++ b/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md @@ -1,5 +1,7 @@ # Multi-Instance Deployment & Versioning — Design Spec +> **Historical note (2026-04-24):** This spec is a snapshot from 2026-04-09. Some operational details have evolved since — most notably, the Caddy `production` profile referenced in command examples below was renamed to `tls` (see PR #51). For the current deployment commands, follow `docs/DEPLOYMENT.md`. This file is preserved as design history. + ## Goal Make Agnes deployable to 20+ independent customer instances via self-service, with safe versioning that prevents one customer's PR from breaking another's deployment. diff --git a/scripts/grpn/agnes-auto-upgrade.sh b/scripts/grpn/agnes-auto-upgrade.sh index 04009ff..674d638 100755 --- a/scripts/grpn/agnes-auto-upgrade.sh +++ b/scripts/grpn/agnes-auto-upgrade.sh @@ -2,17 +2,36 @@ # Deployed to /usr/local/bin/agnes-auto-upgrade.sh on the VM. # Cron fires it every 5 min; pulls latest image for the pinned AGNES_TAG # and recreates containers only if the digest moved. +# +# Cert-aware: if /data/state/certs/{fullchain,privkey}.pem both exist +# (populated by agnes-tls-rotate.sh), enables the tls overlay so Caddy +# fronts :443. Absence → plain HTTP on :8000. set -euo pipefail cd /opt/agnes # shellcheck disable=SC1091 set -a; . /opt/agnes/.env; set +a IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}" -COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml" +# Array form (vs. word-split string) — quoted expansion survives paths +# with spaces and is the modern bash idiom. 
Functionally identical here +# since /opt/agnes paths are tame, but it's a cheap habit to keep. +COMPOSE_FILES=( -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml ) +PROFILE_ARGS=() +# `-s` (size > 0) instead of `-f` — guards against the corner case where +# rotate.sh wrote a 0-byte cert and exited (or got SIGKILLed mid-write). +# Bringing up the tls profile against an empty cert would just crash +# Caddy on start; better to fall back to plain :8000 until rotate +# regenerates real bytes. +if [ -s /data/state/certs/fullchain.pem ] && [ -s /data/state/certs/privkey.pem ]; then + COMPOSE_FILES+=( -f docker-compose.tls.yml ) + PROFILE_ARGS=( --profile tls ) +fi BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) -docker compose $COMPOSE_FILES pull >/dev/null 2>&1 +docker compose "${COMPOSE_FILES[@]}" pull >/dev/null 2>&1 AFTER=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) if [ "$BEFORE" != "$AFTER" ]; then echo "$(date): new digest for $IMAGE — recreating containers" - docker compose $COMPOSE_FILES up -d + # ${arr[@]+"${arr[@]}"} pattern: expands to nothing when array is + # empty (vs. plain "${arr[@]}" which trips `set -u` on bash <4.4). + docker compose "${COMPOSE_FILES[@]}" ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"} up -d docker image prune -f >/dev/null 2>&1 fi diff --git a/scripts/grpn/agnes-tls-rotate.sh b/scripts/grpn/agnes-tls-rotate.sh new file mode 100755 index 0000000..4592d3c --- /dev/null +++ b/scripts/grpn/agnes-tls-rotate.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# Deployed to /usr/local/bin/agnes-tls-rotate.sh on the VM by the infra +# repo startup.sh. A systemd timer fires it daily. +# +# Corp security rotates certs at stable URLs (TLS_FULLCHAIN_URL, +# TLS_PRIVKEY_URL in /opt/agnes/.env). This script refetches, compares +# sha via cmp, atomically replaces changed files, and sends SIGUSR1 to +# caddy for a zero-downtime reload. No-op when cert has not moved. 
#######################################
# Fetch a TLS artifact and install it at its final path only when the
# fetched bytes differ from what is already there.
# Globals:
#   TMP            (read)  scratch file the fetcher writes into
#   CHANGED        (write) set to 1 when $dest was created or replaced
#   TLS_FETCH_BIN  (read)  optional override for the fetcher command;
#                          defaults to /usr/local/bin/tls-fetch.sh
# Arguments:
#   $1 url   source URL, passed through to the fetcher
#   $2 dest  final path of the artifact
#   $3 mode  file mode for $dest (e.g. 600 for keys, 644 for certs)
#   $4 kind  cert | key, passed through to the fetcher
# Outputs:
#   One "rotated <file>" line on stdout when $dest was replaced.
# Returns:
#   0 when $dest is up to date (replaced or already identical),
#   1 when the fetch or the install failed; $dest is left untouched.
#######################################
refetch() {
  local url="$1" dest="$2" mode="$3" kind="$4"
  local fetcher="${TLS_FETCH_BIN:-/usr/local/bin/tls-fetch.sh}"
  local staged

  # Callers invoke this as `if ! refetch …`, which switches `set -e` off
  # for the whole function body. Without this explicit check a failed
  # fetch would fall through and install whatever bytes an EARLIER call
  # left in $TMP (e.g. the private key) over $dest.
  if ! "$fetcher" "$url" "$TMP" "$mode" "$kind"; then
    return 1
  fi

  # Identical content: nothing to do, and CHANGED stays as it was so the
  # caller does not reload the proxy needlessly.
  if [ -f "$dest" ] && cmp -s "$TMP" "$dest"; then
    return 0
  fi

  # Stage next to $dest and rename into place. install(1) on its own
  # copies into the target, so a reader could observe a partial file;
  # a rename within one directory swaps the complete file in one step.
  staged="$dest.new.$$"
  if ! install -m "$mode" "$TMP" "$staged"; then
    rm -f "$staged"
    return 1
  fi
  if ! mv -f "$staged" "$dest"; then
    rm -f "$staged"
    return 1
  fi

  echo "$(date -Is) rotated $(basename "$dest")"
  CHANGED=1
}
# ── Private key ──────────────────────────────────────────────────────
# Three modes, selected by TLS_PRIVKEY_URL and what is already on disk:
#   1. TLS_PRIVKEY_URL set      — refetch the key on every run. When the
#      fetch fails, keep the cached key if there is one, else abort.
#   2. URL empty, key on disk   — reuse the on-disk key, never fetch.
#   3. URL empty, no key        — generate an RSA-2048 key plus a CSR in
#      $CERT_DIR. The CSR ($CERT_DIR/cert.csr) is what the operator
#      submits to the CA; the key is only ever written under $CERT_DIR.
if [ -n "${TLS_PRIVKEY_URL:-}" ]; then
  if ! refetch "$TLS_PRIVKEY_URL" "$CERT_DIR/privkey.pem" 600 key; then
    if [ ! -s "$CERT_DIR/privkey.pem" ]; then
      echo "ERROR: privkey fetch failed and no cached copy exists — aborting" >&2
      exit 1
    fi
    echo "$(date -Is) privkey fetch failed; keeping cached $CERT_DIR/privkey.pem"
  fi
elif [ ! -s "$CERT_DIR/privkey.pem" ]; then
  CN="${DOMAIN:-localhost}"
  # Subject comes from TLS_CSR_SUBJECT in .env; fall back to a minimal
  # /CN=$CN so the CSR carries no org metadata the deployer did not choose.
  SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
  echo "$(date -Is) no privkey — generating RSA-2048 key + CSR (subject: $SUBJECT)"

  # Generate into staging names and rename afterwards, so an interrupted
  # run never leaves a partial privkey.pem that the `-s` checks in this
  # script would accept as a usable key.
  KEY_NEW="$CERT_DIR/privkey.pem.new.$$"
  CSR_NEW="$CERT_DIR/cert.csr.new.$$"
  # The command substitution is a subshell, so the tight umask applies
  # only to the files openssl creates here. `-addext` needs
  # OpenSSL 1.1.1+, which the self-signed path below relies on as well.
  # stderr is captured (not discarded) so a malformed subject is
  # reported instead of failing silently.
  if ! GEN_ERR=$(
    umask 077
    openssl req -new -newkey rsa:2048 -nodes \
      -keyout "$KEY_NEW" \
      -out "$CSR_NEW" \
      -subj "$SUBJECT" \
      -addext "subjectAltName=DNS:$CN" 2>&1
  ); then
    rm -f "$KEY_NEW" "$CSR_NEW"
    echo "ERROR: key/CSR generation failed: $GEN_ERR" >&2
    exit 1
  fi
  chmod 600 "$KEY_NEW"
  chmod 644 "$CSR_NEW"
  # CSR first: if the key rename is what gets interrupted, the next run
  # still sees "no key" and regenerates both as a matching pair.
  mv -f "$CSR_NEW" "$CERT_DIR/cert.csr"
  mv -f "$KEY_NEW" "$CERT_DIR/privkey.pem"
  echo "$(date -Is) privkey.pem + cert.csr written to $CERT_DIR"
  echo "$(date -Is) ACTION: send $CERT_DIR/cert.csr to your certificate authority for signing — the CSR is public and safe to transit; the key never leaves this VM."
fi

# ── Certificate chain ────────────────────────────────────────────────
# Fetch the real chain. On failure, self-sign ONLY when no fullchain
# exists yet; an existing one (real or self-signed) is kept so a
# transient fetch failure does not churn certs.
# NOTE(review): an existing self-signed fallback is never renewed, so it
# will expire after 30 days if the real chain still cannot be fetched —
# confirm whether that window is acceptable.
if ! refetch "$TLS_FULLCHAIN_URL" "$CERT_DIR/fullchain.pem" 644 cert; then
  if [ -s "$CERT_DIR/fullchain.pem" ]; then
    echo "$(date -Is) fetch failed but cached fullchain.pem exists — keeping it"
  else
    echo "$(date -Is) real cert unavailable at $TLS_FULLCHAIN_URL — generating 30-day self-signed"
    if [ ! -s "$CERT_DIR/privkey.pem" ]; then
      echo "ERROR: no privkey available — cannot self-sign" >&2
      exit 1
    fi
    CN="${DOMAIN:-localhost}"
    # Same subject rule as the CSR branch, so the bring-up cert matches
    # what the CSR asked for.
    SUBJECT="${TLS_CSR_SUBJECT:-/CN=$CN}"
    # Stage + rename: a failed openssl run must not leave an empty or
    # truncated fullchain.pem behind.
    CERT_NEW="$CERT_DIR/fullchain.pem.new.$$"
    if ! GEN_ERR=$(
      openssl req -x509 -new -key "$CERT_DIR/privkey.pem" \
        -out "$CERT_NEW" -days 30 \
        -subj "$SUBJECT" \
        -addext "subjectAltName=DNS:$CN" \
        -addext "keyUsage=digitalSignature,keyEncipherment" \
        -addext "extendedKeyUsage=serverAuth" 2>&1
    ); then
      rm -f "$CERT_NEW"
      echo "ERROR: self-signed cert generation failed: $GEN_ERR" >&2
      exit 1
    fi
    chmod 644 "$CERT_NEW"
    mv -f "$CERT_NEW" "$CERT_DIR/fullchain.pem"
    echo "$(date -Is) self-signed fullchain.pem installed (CN=$CN)"
    CHANGED=1
  fi
fi

# ── Reload / bring-up ────────────────────────────────────────────────
if [ "$CHANGED" -eq 1 ]; then
  # `ps --status` needs Compose v2.6.1+; on older Compose the command
  # fails, RUNNING stays empty and the bring-up branch below runs.
  COMPOSE_FILES=( -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml -f docker-compose.tls.yml )
  # Capture first, match second: piping straight into `grep -q` under
  # `pipefail` can report failure when grep exits before the producer
  # has finished writing.
  RUNNING=$(docker compose "${COMPOSE_FILES[@]}" --profile tls ps --status=running --format '{{.Service}}' 2>/dev/null || true)
  if grep -qx 'caddy' <<<"$RUNNING"; then
    # Caddy is running — signal it to pick up the new cert files.
    # NOTE(review): confirm the deployed Caddy 2 image actually reloads
    # on SIGUSR1; if it does not, switch to
    # `docker compose exec caddy caddy reload --config /etc/caddy/Caddyfile --force`.
    if docker compose "${COMPOSE_FILES[@]}" --profile tls kill -s SIGUSR1 caddy >/dev/null 2>&1; then
      echo "$(date -Is) caddy reloaded"
    else
      echo "$(date -Is) caddy reload signal failed"
    fi
  else
    # Caddy not running yet (first certs on this VM, or the tls profile
    # was never started) — bring the stack up so no manual step is needed.
    echo "$(date -Is) caddy not running — bringing tls profile up"
    docker compose "${COMPOSE_FILES[@]}" --profile tls up -d 2>&1 | tail -5
  fi
fi
#!/bin/bash
# Fetch a TLS artifact (cert chain or private key) from a URL to a local
# path with the requested file mode. Supported URL schemes:
#
#   sm://SECRET_NAME      Google Secret Manager, latest version
#   gs://BUCKET/OBJECT    GCS object
#   https://HOST/PATH     plain HTTPS download (no redirects, https only)
#   file://PATH           local file copy (dev/testing only)
#
# Usage: tls-fetch.sh URL DEST [mode] [kind]
#
#   mode  file mode for DEST (default 644)
#   kind  cert (default) | key — selects the post-fetch PEM validation:
#         "cert" must parse with `openssl x509`, "key" with
#         `openssl pkey`. Anything else (HTML error page, truncated
#         body, unrelated file) is rejected before DEST is touched.
#
# DEST is replaced atomically: the validated bytes are staged next to
# DEST and renamed into place, so a reader never sees a partial file.
#
# Exit codes:
#   2 — unsupported URL scheme or unsupported kind
#   3 — fetched file is empty
#   4 — fetched content is not a valid PEM of the requested kind
#   other non-zero — the underlying fetch/copy command failed
set -euo pipefail

URL="${1:?usage: tls-fetch.sh URL DEST [mode] [kind]}"
DEST="${2:?usage: tls-fetch.sh URL DEST [mode] [kind]}"
MODE="${3:-644}"
KIND="${4:-cert}"

# Reject a bad kind up front instead of after a network round-trip.
case "$KIND" in
  cert | key) ;;
  *)
    echo "tls-fetch: unsupported kind: $KIND (expected cert|key)" >&2
    exit 2
    ;;
esac

TMP=$(mktemp)
STAGED="$DEST.new.$$"
trap 'rm -f "$TMP" "$STAGED"' EXIT

case "$URL" in
  sm://*)
    SECRET="${URL#sm://}"
    gcloud secrets versions access latest --secret="$SECRET" > "$TMP"
    ;;
  gs://*)
    gsutil -q cp "$URL" "$TMP"
    ;;
  https://*)
    # --max-redirs 0     refuse redirects: a redirect on a TLS-artifact
    #                    URL could hand us someone else's cert/key.
    # --proto '=https'   never fall back to another scheme.
    # --retry 2          ride out a single transient failure.
    # --connect-timeout / --max-time
    #                    bound the run so a stalled endpoint cannot hang
    #                    the caller indefinitely.
    curl -fsS --max-redirs 0 --proto '=https' --retry 2 \
      --connect-timeout 10 --max-time 60 \
      "$URL" -o "$TMP"
    ;;
  file://*)
    cp "${URL#file://}" "$TMP"
    ;;
  *)
    echo "tls-fetch: unsupported URL scheme: $URL" >&2
    exit 2
    ;;
esac

if [ ! -s "$TMP" ]; then
  echo "tls-fetch: fetched empty file from $URL" >&2
  exit 3
fi

# PEM sanity check: the fetched bytes must parse as the requested kind.
if [ "$KIND" = "cert" ]; then
  if ! openssl x509 -in "$TMP" -noout 2>/dev/null; then
    echo "tls-fetch: $URL did not return a valid PEM certificate" >&2
    exit 4
  fi
else
  if ! openssl pkey -in "$TMP" -noout 2>/dev/null; then
    echo "tls-fetch: $URL did not return a valid PEM private key" >&2
    exit 4
  fi
fi

# install(1) sets the mode on the staged copy; the rename within DEST's
# directory is what makes the replacement atomic.
install -m "$MODE" "$TMP" "$STAGED"
mv -f "$STAGED" "$DEST"