feat(infra): address code review — scoped SA, fail-fast secrets, firewall split, cron reads .env, merge fix

Critical fixes:
- C1: VM SA now gets secretmanager.secretAccessor only on specific secrets
  (JWT + each entry in runtime_secrets). Previously project-wide.
- C3: chmod 640 on /var/log/agnes-startup.log (defense in depth)
- C4: Remove '|| echo ""' fallback on keboola-storage-token — boot now fails
  loudly if the secret is missing instead of starting a broken app.
- C5: Cron auto-upgrade script sources /opt/agnes/.env for AGNES_TAG. If an
  operator edits .env to pin a specific stable-YYYY.MM.N, cron picks it up
  immediately with no drift. Removed AGNES_TAG from crontab entry.
- C7: explicit depends_on = [IAM bindings, secret_version] on VM — prevents
  race where VM boots before IAM propagates.

Important fixes:
- I1: Split firewall into web (80/443 + conditional 8000) and ssh (port 22 with
  configurable source_ranges, default IAP range only).
- I4: Fetch docker-compose files from compose_ref (default 'main'), so customers
  can pin a specific tag for reproducibility.
- I5+I6: Merge order fixed — user-supplied dev_instances values now override
  defaults (was the other way around). Dev tls_mode default flipped to 'none'.
- I7: Remove '|| true' on Caddyfile fetch; surface failures loudly.
- New acme_email variable (falls back to seed_admin_email if empty).

Out-of-module:
- Comments translated from Czech to English where applicable (M1).
This commit is contained in:
ZdenekSrotyr 2026-04-21 19:39:53 +02:00
parent 6470e23df3
commit 921094ae40
3 changed files with 123 additions and 33 deletions

View file

@ -14,16 +14,19 @@ terraform {
locals {
# Normalize all instances into a single list so for_each is uniform across prod + dev.
all_instances = concat(
[merge(var.prod_instance, { role = "prod" })],
[for d in var.dev_instances : merge(d, {
# Note: with merge(defaults, d), values in d override the defaults (fix for the
# v1.3.0 bug where defaults overrode user-supplied values).
dev_defaults = {
role = "dev"
disk_size_gb = 30
data_disk_gb = 20
upgrade_mode = "auto"
tls_mode = "caddy"
tls_mode = "none" # dev VMs default to plain HTTP; TLS requires domain
domain = ""
})]
}
all_instances = concat(
[merge(var.prod_instance, { role = "prod" })],
[for d in var.dev_instances : merge(local.dev_defaults, d)]
)
}
@ -47,7 +50,7 @@ resource "google_secret_manager_secret_version" "jwt" {
secret_data = random_password.jwt.result
}
# --- VM service account (dedikovaný, jen read Secret Manageru) ---
# --- VM service account (dedicated, read-only on specific secrets only) ---
resource "google_service_account" "vm" {
account_id = "agnes-${var.customer_name}-vm"
@ -55,14 +58,37 @@ resource "google_service_account" "vm" {
project = var.gcp_project_id
}
resource "google_project_iam_member" "vm_secrets" {
# Grant read access only to the JWT secret this module owns.
# Deliberately not project-wide: if the customer adds unrelated secrets (e.g. a Stripe key)
# to the same project, the Agnes VM must NOT be able to read them.
resource "google_secret_manager_secret_iam_member" "vm_jwt" {
  # Who gets access: the dedicated VM service account.
  member = "serviceAccount:${google_service_account.vm.email}"
  role   = "roles/secretmanager.secretAccessor"

  # What it may read: exactly one secret, the JWT secret created by this module.
  secret_id = google_secret_manager_secret.jwt.secret_id
  project   = var.gcp_project_id
}
# Grant read access to additional secrets the app needs (e.g. keboola-storage-token).
# Caller specifies these via var.runtime_secrets. Each secret must already exist.
resource "google_secret_manager_secret_iam_member" "vm_runtime" {
  # One binding per distinct secret name; toset() collapses duplicate entries.
  for_each = toset(var.runtime_secrets)

  # Who gets access: the dedicated VM service account.
  member = "serviceAccount:${google_service_account.vm.email}"
  role   = "roles/secretmanager.secretAccessor"

  # For a set of strings each.key and each.value are the same element (the secret name).
  secret_id = each.key
  project   = var.gcp_project_id
}
# --- Network ---
# Web firewall: 80/443 for Caddy (TLS); 8000 only when TLS is disabled (direct HTTP).
# SSH (port 22) gets a separate rule, restricted by default to the IAP tunnel range.
locals {
  # Expose raw :8000 whenever at least one instance is not fronted by Caddy.
  # If Caddy handles TLS, customers should hit 80/443, not bypass to 8000.
  #
  # The startup script fetches the Caddyfile only when tls_mode == "caddy" AND the
  # domain is non-empty, so an instance with tls_mode = "caddy" but no domain is
  # counted as plain HTTP here as well. Keying on tls_mode alone would leave such an
  # instance with neither Caddy nor an open :8000.
  # try(length(...), 0) also treats a null or absent domain as empty.
  #
  # NOTE(review): the web firewall rule targets the shared customer tag, so a single
  # plain-HTTP instance opens :8000 on every VM of this customer — confirm that is
  # acceptable, or split the rule per instance.
  expose_raw_http_port = anytrue([
    for inst in local.all_instances :
    inst.tls_mode != "caddy" || try(length(inst.domain), 0) == 0
  ])

  web_ports = local.expose_raw_http_port ? ["80", "443", "8000"] : ["80", "443"]
}
resource "google_compute_firewall" "web" {
name = "agnes-${var.customer_name}-allow-web"
project = var.gcp_project_id
@ -70,13 +96,27 @@ resource "google_compute_firewall" "web" {
allow {
protocol = "tcp"
ports = ["22", "80", "443", "8000"]
ports = local.web_ports
}
source_ranges = ["0.0.0.0/0"]
target_tags = ["agnes-${var.customer_name}"]
}
resource "google_compute_firewall" "ssh" {
  project = var.gcp_project_id
  name    = "agnes-${var.customer_name}-allow-ssh"
  network = "default"

  # Applies to VMs carrying this customer's network tag.
  target_tags = ["agnes-${var.customer_name}"]

  # Callers choose who may reach SSH via var.firewall_ssh_source_ranges.
  source_ranges = var.firewall_ssh_source_ranges

  allow {
    protocol = "tcp"
    ports    = ["22"]
  }
}
# --- Backup policy: daily snapshot with 30-day retention ---
resource "google_compute_resource_policy" "daily_backup" {
@ -175,10 +215,12 @@ resource "google_compute_instance" "vm" {
upgrade_mode = each.value.upgrade_mode
tls_mode = each.value.tls_mode
domain = each.value.domain
acme_email = var.acme_email != "" ? var.acme_email : var.seed_admin_email
data_source = var.data_source
keboola_stack_url = var.keboola_stack_url
seed_admin_email = var.seed_admin_email
role = each.value.role
compose_ref = var.compose_ref
})
service_account {
@ -193,11 +235,20 @@ resource "google_compute_instance" "vm" {
managed = "terraform"
}
# Změna startup scriptu nemění běžící VM (script běží jen na boot).
# Pro aplikaci změn je potřeba VM restartovat nebo recreate.
# Startup script changes do not modify running VMs (script only runs on boot).
# To propagate module changes, use:
# terraform apply -replace='module.agnes.google_compute_instance.vm["agnes-prod"]'
lifecycle {
ignore_changes = [metadata_startup_script]
}
# Ensure the VM SA has read access to the required secrets BEFORE the VM boots; otherwise
# the startup script's `gcloud secrets versions access` can 403 due to IAM lag.
depends_on = [
google_secret_manager_secret_iam_member.vm_jwt,
google_secret_manager_secret_iam_member.vm_runtime,
google_secret_manager_secret_version.jwt,
]
}
# --- Monitoring: uptime check on each VM's /api/health endpoint ---

View file

@ -1,8 +1,9 @@
#!/bin/bash
# Agnes VM startup script — templated by Terraform.
# Idempotent — spustí se při každém boot.
# Idempotent — runs on every boot.
set -euo pipefail
exec > /var/log/agnes-startup.log 2>&1
chmod 640 /var/log/agnes-startup.log # defense in depth — not readable by non-root
CUSTOMER_NAME="${customer_name}"
IMAGE_REPO="${image_repo}"
@ -10,10 +11,12 @@ IMAGE_TAG="${image_tag}"
UPGRADE_MODE="${upgrade_mode}"
TLS_MODE="${tls_mode}"
DOMAIN="${domain}"
ACME_EMAIL="${acme_email}"
DATA_SOURCE="${data_source}"
KEBOOLA_STACK_URL="${keboola_stack_url}"
SEED_ADMIN_EMAIL="${seed_admin_email}"
ROLE="${role}"
COMPOSE_REF="${compose_ref}"
echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup at $(date) ==="
@ -43,21 +46,25 @@ APP_DIR="/opt/agnes"
mkdir -p "$APP_DIR"
cd "$APP_DIR"
# Fetch minimal docker-compose from public repo (main branch — stable)
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.yml" -o docker-compose.yml
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.prod.yml" -o docker-compose.prod.yml
# Fetch docker-compose files pinned to $COMPOSE_REF (defaults to `main`; pin to a
# stable-YYYY.MM.N tag for reproducibility across VM rebuilds).
RAW_BASE="https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/$${COMPOSE_REF}"
curl -fsSL "$${RAW_BASE}/docker-compose.yml" -o docker-compose.yml
curl -fsSL "$${RAW_BASE}/docker-compose.prod.yml" -o docker-compose.prod.yml
# Overlay which binds `data` volume to host /data (persistent disk mounted above)
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml
curl -fsSL "$${RAW_BASE}/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml
# TLS overlay (Caddy + Let's Encrypt) — jen pokud potřeba
# TLS overlay (Caddy + Let's Encrypt) — fetch only when actually needed; surface failures
if [ "$TLS_MODE" = "caddy" ] && [ -n "$DOMAIN" ]; then
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/Caddyfile" -o Caddyfile 2>/dev/null || true
curl -fsSL "$${RAW_BASE}/Caddyfile" -o Caddyfile
fi
# --- 4. Fetch secrets from Secret Manager ---
# --- 4. Fetch secrets from Secret Manager — fail loudly if missing ---
KEBOOLA_TOKEN=""
if [ "$DATA_SOURCE" = "keboola" ]; then
KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token 2>/dev/null || echo "")
# No `|| echo ""` fallback — if the token secret is missing, boot should fail
# loudly rather than silently start an app that will fail sync cryptically later.
KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token)
fi
JWT_KEY=$(gcloud secrets versions access latest --secret=agnes-$${CUSTOMER_NAME}-jwt-secret)
@ -71,7 +78,7 @@ SEED_ADMIN_EMAIL=$SEED_ADMIN_EMAIL
LOG_LEVEL=info
DOMAIN=$DOMAIN
AGNES_TAG=$IMAGE_TAG
ACME_EMAIL=admin@$${DOMAIN#*.}
ACME_EMAIL=$ACME_EMAIL
ENVEOF
chmod 600 "$APP_DIR/.env"
@ -86,27 +93,35 @@ COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compos
docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG pull
docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG up -d
# --- 6. Auto-upgrade via cron (pullne nový tag každých 5 min) ---
# --- 6. Auto-upgrade via cron (pulls new image digest every 5 min) ---
if [ "$UPGRADE_MODE" = "auto" ]; then
# Cron script sources /opt/agnes/.env for AGNES_TAG — so if operator edits .env
# (e.g. to pin a specific stable-YYYY.MM.N), cron picks it up immediately. No
# drift between what compose up reads and what the digest-check inspects.
cat > /usr/local/bin/agnes-auto-upgrade.sh <<'SCRIPTEOF'
#!/bin/bash
# Spouští se z cronu — pullne nový image, pokud je, a restartne containers.
# Runs from cron — pulls new image if one is available, restarts containers.
set -euo pipefail
cd /opt/agnes
# Read only AGNES_TAG from .env so the tag reflects any operator edits since boot.
# The file is deliberately NOT sourced as shell: it is written as unquoted KEY=value
# lines, so executing it would mis-parse (or, under `set -u`, abort on) any value that
# contains shell metacharacters such as a dollar sign, parentheses or spaces.
# docker compose loads /opt/agnes/.env by itself from the project directory, so
# nothing has to be exported here.
# If AGNES_TAG is assigned more than once the last line wins; surrounding quotes and
# whitespace are stripped. A missing .env still aborts the run (set -e + pipefail).
AGNES_TAG="$(sed -n 's/^AGNES_TAG=//p' /opt/agnes/.env | tail -n 1 | tr -d "\"'[:space:]")"
IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable}"
COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml"
BEFORE=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1)
BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1)
docker compose $COMPOSE_FILES pull >/dev/null 2>&1
AFTER=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1)
AFTER=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1)
if [ "$BEFORE" != "$AFTER" ]; then
echo "$(date): new image digest — recreating containers"
echo "$(date): new image digest for $IMAGE — recreating containers"
docker compose $COMPOSE_FILES up -d
docker image prune -f >/dev/null 2>&1
fi
SCRIPTEOF
chmod +x /usr/local/bin/agnes-auto-upgrade.sh
# Přidat do crontab (idempotentně — `sort -u` vyhodí duplikáty)
(crontab -l 2>/dev/null; echo "*/5 * * * * AGNES_TAG=$IMAGE_TAG /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1") | sort -u | crontab -
# Install cron entry idempotently: remove any prior agnes-auto-upgrade line, then append ours.
CRON_LINE="*/5 * * * * /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1"
(crontab -l 2>/dev/null | grep -v agnes-auto-upgrade || true; echo "$CRON_LINE") | crontab -
fi
echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup complete at $(date) ==="

View file

@ -71,6 +71,12 @@ variable "image_repo" {
default = "ghcr.io/keboola/agnes-the-ai-analyst"
}
variable "compose_ref" {
  description = "Git ref to fetch docker-compose.yml and overlays from (in keboola/agnes-the-ai-analyst). Use `main` for latest, or a tag like `stable-2026.04.47` for reproducibility."
  type        = string
  default     = "main"

  # The value is interpolated verbatim into the startup script (a double-quoted bash
  # assignment) and into a raw.githubusercontent.com URL path. Restrict it to characters
  # that are safe in both places, so an empty or malformed ref is rejected at plan time
  # instead of producing a broken or injectable boot script.
  validation {
    condition     = can(regex("^[A-Za-z0-9][A-Za-z0-9._/-]*$", var.compose_ref))
    error_message = "The compose_ref value must be a non-empty git branch, tag or commit SHA made of letters, digits, '.', '_', '/' and '-'."
  }
}
variable "enable_monitoring" {
description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful."
type = bool
@ -82,3 +88,21 @@ variable "notification_channel_ids" {
type = list(string)
default = []
}
# NOTE(review): every listed secret gets an IAM binding unconditionally and must already
# exist. The startup script reads keboola-storage-token only when data_source is
# "keboola" — confirm the default is appropriate for non-Keboola deployments.
variable "runtime_secrets" {
  type        = list(string)
  default     = ["keboola-storage-token"]
  description = "Names of existing Secret Manager secrets the VM needs to read at runtime (e.g. Keboola Storage token). VM SA gets scoped secretAccessor on each."
}
variable "firewall_ssh_source_ranges" {
  description = "CIDR ranges allowed to reach SSH (port 22). Default is IAP tunnel range only (use `gcloud compute ssh --tunnel-through-iap`). Override to `[\"0.0.0.0/0\"]` for unrestricted (not recommended)."
  type        = list(string)
  default     = ["35.235.240.0/20"]

  # Catch typos at plan time: every entry must parse as a CIDR block (cidrhost() fails
  # on anything that is not valid CIDR notation).
  # NOTE(review): an empty list is rejected as well, because the SSH rule would then
  # have no explicit source filter — confirm how the provider treats that case before
  # relaxing this check.
  validation {
    condition = (
      length(var.firewall_ssh_source_ranges) > 0 &&
      alltrue([for r in var.firewall_ssh_source_ranges : can(cidrhost(r, 0))])
    )
    error_message = "The firewall_ssh_source_ranges value must be a non-empty list of valid CIDR ranges, for example [\"35.235.240.0/20\"]."
  }
}
# Optional override for the Let's Encrypt contact address. The empty-string default
# is the sentinel described in the variable's own description text.
variable "acme_email" {
  type        = string
  default     = ""
  description = "Email for Let's Encrypt account (used when tls_mode=caddy). Defaults to seed_admin_email if empty."
}