From 921094ae40c28f6a1eb16420056e6d1ca956daeb Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 21 Apr 2026 19:39:53 +0200 Subject: [PATCH] =?UTF-8?q?feat(infra):=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20scoped=20SA,=20fail-fast=20secrets,=20firewall=20sp?= =?UTF-8?q?lit,=20cron=20reads=20.env,=20merge=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes: - C1: VM SA now gets secretmanager.secretAccessor only on specific secrets (JWT + each entry in runtime_secrets). Previously project-wide. - C3: chmod 640 on /var/log/agnes-startup.log (defense in depth) - C4: Remove '|| echo ""' fallback on keboola-storage-token — boot now fails loudly if the secret is missing instead of starting a broken app. - C5: Cron auto-upgrade script sources /opt/agnes/.env for AGNES_TAG. If an operator edits .env to pin a specific stable-YYYY.MM.N, cron picks it up immediately with no drift. Removed AGNES_TAG from crontab entry. - C7: explicit depends_on = [IAM bindings, secret_version] on VM — prevents race where VM boots before IAM propagates. Important fixes: - I1: Split firewall into web (80/443 + conditional 8000) and ssh (port 22 with configurable source_ranges, default IAP range only). - I4: Fetch docker-compose files from compose_ref (default 'main'), so customers can pin a specific tag for reproducibility. - I5+I6: Merge order fixed — user-supplied dev_instances values now override defaults (was the other way around). Dev tls_mode default flipped to 'none'. - I7: Remove '|| true' on Caddyfile fetch; surface failures loudly. - New acme_email variable (falls back to seed_admin_email if empty). Out-of-module: - Comments translated from Czech to English where applicable (M1). --- infra/modules/customer-instance/main.tf | 83 +++++++++++++++---- .../customer-instance/startup-script.sh.tpl | 49 +++++++---- infra/modules/customer-instance/variables.tf | 24 ++++++ 3 files changed, 123 insertions(+), 33 deletions(-) diff --git a/infra/modules/customer-instance/main.tf b/infra/modules/customer-instance/main.tf index b91a819..c5051d6 100644 --- a/infra/modules/customer-instance/main.tf +++ b/infra/modules/customer-instance/main.tf @@ -14,16 +14,19 @@ terraform { locals { # Normalize all instances into a single list so for_each is uniform across prod + dev. + # Note: merge({defaults}, d) — d overrides defaults (fix for v1.3.0 bug where + # defaults overrode user-supplied values). + dev_defaults = { + role = "dev" + disk_size_gb = 30 + data_disk_gb = 20 + upgrade_mode = "auto" + tls_mode = "none" # dev VMs default to plain HTTP; TLS requires domain + domain = "" + } all_instances = concat( [merge(var.prod_instance, { role = "prod" })], - [for d in var.dev_instances : merge(d, { - role = "dev" - disk_size_gb = 30 - data_disk_gb = 20 - upgrade_mode = "auto" - tls_mode = "caddy" - domain = "" - })] + [for d in var.dev_instances : merge(local.dev_defaults, d)] ) } @@ -47,7 +50,7 @@ resource "google_secret_manager_secret_version" "jwt" { secret_data = random_password.jwt.result } -# --- VM service account (dedikovaný, jen read Secret Manageru) --- +# --- VM service account (dedicated, read-only on specific secrets only) --- resource "google_service_account" "vm" { account_id = "agnes-${var.customer_name}-vm" @@ -55,14 +58,37 @@ resource "google_service_account" "vm" { project = var.gcp_project_id } -resource "google_project_iam_member" "vm_secrets" { - project = var.gcp_project_id - role = "roles/secretmanager.secretAccessor" - member = "serviceAccount:${google_service_account.vm.email}" +# Grant read access only to the JWT secret this module owns. +# Not project-wide — if the customer adds unrelated secrets (e.g. Stripe key) +# to the same project, Agnes VM must NOT be able to read them. +resource "google_secret_manager_secret_iam_member" "vm_jwt" { + project = var.gcp_project_id + secret_id = google_secret_manager_secret.jwt.secret_id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.vm.email}" +} + +# Grant read access to additional secrets the app needs (e.g. keboola-storage-token). +# Caller specifies these via var.runtime_secrets. Each secret must already exist. +resource "google_secret_manager_secret_iam_member" "vm_runtime" { + for_each = toset(var.runtime_secrets) + project = var.gcp_project_id + secret_id = each.value + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.vm.email}" } # --- Network --- +# Web firewall: 80/443 for Caddy (TLS), 8000 only when TLS is disabled (direct HTTP). +# Separate rule for SSH (port 22) — default restricted to IAP tunnel range. +locals { + # Expose raw :8000 only when any instance has tls_mode != "caddy". + # If Caddy handles TLS, customers should hit 80/443, not bypass to 8000. + expose_raw_http_port = anytrue([for inst in local.all_instances : inst.tls_mode != "caddy"]) + web_ports = local.expose_raw_http_port ? ["80", "443", "8000"] : ["80", "443"] +} + resource "google_compute_firewall" "web" { name = "agnes-${var.customer_name}-allow-web" project = var.gcp_project_id @@ -70,13 +96,27 @@ resource "google_compute_firewall" "web" { allow { protocol = "tcp" - ports = ["22", "80", "443", "8000"] + ports = local.web_ports } source_ranges = ["0.0.0.0/0"] target_tags = ["agnes-${var.customer_name}"] } +resource "google_compute_firewall" "ssh" { + name = "agnes-${var.customer_name}-allow-ssh" + project = var.gcp_project_id + network = "default" + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = var.firewall_ssh_source_ranges + target_tags = ["agnes-${var.customer_name}"] +} + # --- Backup policy: daily snapshot with 30-day retention --- resource "google_compute_resource_policy" "daily_backup" { @@ -175,10 +215,12 @@ resource "google_compute_instance" "vm" { upgrade_mode = each.value.upgrade_mode tls_mode = each.value.tls_mode domain = each.value.domain + acme_email = var.acme_email != "" ? var.acme_email : var.seed_admin_email data_source = var.data_source keboola_stack_url = var.keboola_stack_url seed_admin_email = var.seed_admin_email role = each.value.role + compose_ref = var.compose_ref }) service_account { @@ -193,11 +235,20 @@ resource "google_compute_instance" "vm" { managed = "terraform" } - # Změna startup scriptu nemění běžící VM (script běží jen na boot). - # Pro aplikaci změn je potřeba VM restartovat nebo recreate. + # Startup script changes do not modify running VMs (script only runs on boot). + # To propagate module changes, use: + # terraform apply -replace='module.agnes.google_compute_instance.vm["agnes-prod"]' lifecycle { ignore_changes = [metadata_startup_script] } + + # Ensure VM SA has read access to required secrets BEFORE the VM boots — otherwise + # the startup script's `gcloud secrets versions access` can 403 due to IAM lag. + depends_on = [ + google_secret_manager_secret_iam_member.vm_jwt, + google_secret_manager_secret_iam_member.vm_runtime, + google_secret_manager_secret_version.jwt, + ] } # --- Monitoring: uptime check on each VM's /api/health endpoint --- diff --git a/infra/modules/customer-instance/startup-script.sh.tpl b/infra/modules/customer-instance/startup-script.sh.tpl index 3ffd2fb..ef4b1ef 100644 --- a/infra/modules/customer-instance/startup-script.sh.tpl +++ b/infra/modules/customer-instance/startup-script.sh.tpl @@ -1,8 +1,9 @@ #!/bin/bash # Agnes VM startup script — templated by Terraform. -# Idempotent — spustí se při každém boot. +# Idempotent — runs on every boot. set -euo pipefail exec > /var/log/agnes-startup.log 2>&1 +chmod 640 /var/log/agnes-startup.log # defense in depth — not readable by non-root CUSTOMER_NAME="${customer_name}" IMAGE_REPO="${image_repo}" @@ -10,10 +11,12 @@ IMAGE_TAG="${image_tag}" UPGRADE_MODE="${upgrade_mode}" TLS_MODE="${tls_mode}" DOMAIN="${domain}" +ACME_EMAIL="${acme_email}" DATA_SOURCE="${data_source}" KEBOOLA_STACK_URL="${keboola_stack_url}" SEED_ADMIN_EMAIL="${seed_admin_email}" ROLE="${role}" +COMPOSE_REF="${compose_ref}" echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup at $(date) ===" @@ -43,21 +46,25 @@ APP_DIR="/opt/agnes" mkdir -p "$APP_DIR" cd "$APP_DIR" -# Fetch minimal docker-compose from public repo (main branch — stable) -curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.yml" -o docker-compose.yml -curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.prod.yml" -o docker-compose.prod.yml +# Fetch docker-compose files pinned to $COMPOSE_REF (defaults to `main`; pin to a +# stable-YYYY.MM.N tag for reproducibility across VM rebuilds). +RAW_BASE="https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/$${COMPOSE_REF}" +curl -fsSL "$${RAW_BASE}/docker-compose.yml" -o docker-compose.yml +curl -fsSL "$${RAW_BASE}/docker-compose.prod.yml" -o docker-compose.prod.yml # Overlay which binds `data` volume to host /data (persistent disk mounted above) -curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml +curl -fsSL "$${RAW_BASE}/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml -# TLS overlay (Caddy + Let's Encrypt) — jen pokud potřeba +# TLS overlay (Caddy + Let's Encrypt) — fetch only when actually needed; surface failures if [ "$TLS_MODE" = "caddy" ] && [ -n "$DOMAIN" ]; then - curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/Caddyfile" -o Caddyfile 2>/dev/null || true + curl -fsSL "$${RAW_BASE}/Caddyfile" -o Caddyfile fi -# --- 4. Fetch secrets from Secret Manager --- +# --- 4. Fetch secrets from Secret Manager — fail loudly if missing --- KEBOOLA_TOKEN="" if [ "$DATA_SOURCE" = "keboola" ]; then - KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token 2>/dev/null || echo "") + # No `|| echo ""` fallback — if the token secret is missing, boot should fail + # loudly rather than silently start an app that will fail sync cryptically later. + KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token) fi JWT_KEY=$(gcloud secrets versions access latest --secret=agnes-$${CUSTOMER_NAME}-jwt-secret) @@ -71,7 +78,7 @@ SEED_ADMIN_EMAIL=$SEED_ADMIN_EMAIL LOG_LEVEL=info DOMAIN=$DOMAIN AGNES_TAG=$IMAGE_TAG -ACME_EMAIL=admin@$${DOMAIN#*.} +ACME_EMAIL=$ACME_EMAIL ENVEOF chmod 600 "$APP_DIR/.env" @@ -86,27 +93,35 @@ COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compos docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG pull docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG up -d -# --- 6. Auto-upgrade via cron (pullne nový tag každých 5 min) --- +# --- 6. Auto-upgrade via cron (pulls new image digest every 5 min) --- if [ "$UPGRADE_MODE" = "auto" ]; then + # Cron script sources /opt/agnes/.env for AGNES_TAG — so if operator edits .env + # (e.g. to pin a specific stable-YYYY.MM.N), cron picks it up immediately. No + # drift between what compose up reads and what the digest-check inspects. cat > /usr/local/bin/agnes-auto-upgrade.sh <<'SCRIPTEOF' #!/bin/bash -# Spouští se z cronu — pullne nový image, pokud je, a restartne containers. +# Runs from cron — pulls new image if one is available, restarts containers. set -euo pipefail cd /opt/agnes +# Source .env so AGNES_TAG reflects any operator edits since boot. +# shellcheck disable=SC1091 +set -a; . /opt/agnes/.env; set +a +IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable}" COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml" -BEFORE=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1) +BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) docker compose $COMPOSE_FILES pull >/dev/null 2>&1 -AFTER=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1) +AFTER=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1) if [ "$BEFORE" != "$AFTER" ]; then - echo "$(date): new image digest — recreating containers" + echo "$(date): new image digest for $IMAGE — recreating containers" docker compose $COMPOSE_FILES up -d docker image prune -f >/dev/null 2>&1 fi SCRIPTEOF chmod +x /usr/local/bin/agnes-auto-upgrade.sh - # Přidat do crontab (idempotentně — `sort -u` vyhodí duplikáty) - (crontab -l 2>/dev/null; echo "*/5 * * * * AGNES_TAG=$IMAGE_TAG /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1") | sort -u | crontab - + # Install cron entry idempotently: remove any prior agnes-auto-upgrade line, then append ours. + CRON_LINE="*/5 * * * * /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1" + (crontab -l 2>/dev/null | grep -v agnes-auto-upgrade || true; echo "$CRON_LINE") | crontab - fi echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup complete at $(date) ===" diff --git a/infra/modules/customer-instance/variables.tf b/infra/modules/customer-instance/variables.tf index 8ce886e..a4f9194 100644 --- a/infra/modules/customer-instance/variables.tf +++ b/infra/modules/customer-instance/variables.tf @@ -71,6 +71,12 @@ variable "image_repo" { default = "ghcr.io/keboola/agnes-the-ai-analyst" } +variable "compose_ref" { + description = "Git ref to fetch docker-compose.yml and overlays from (in keboola/agnes-the-ai-analyst). Use `main` for latest, or a tag like `stable-2026.04.47` for reproducibility." + type = string + default = "main" +} + variable "enable_monitoring" { description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful." type = bool @@ -82,3 +88,21 @@ variable "notification_channel_ids" { type = list(string) default = [] } + +variable "runtime_secrets" { + description = "Names of existing Secret Manager secrets the VM needs to read at runtime (e.g. Keboola Storage token). VM SA gets scoped secretAccessor on each." + type = list(string) + default = ["keboola-storage-token"] +} + +variable "firewall_ssh_source_ranges" { + description = "CIDR ranges allowed to reach SSH (port 22). Default is IAP tunnel range only (use `gcloud compute ssh --tunnel-through-iap`). Override to `[\"0.0.0.0/0\"]` for unrestricted (not recommended)." + type = list(string) + default = ["35.235.240.0/20"] +} + +variable "acme_email" { + description = "Email for Let's Encrypt account (used when tls_mode=caddy). Defaults to seed_admin_email if empty." + type = string + default = "" +}