feat(infra): address code review — scoped SA, fail-fast secrets, firewall split, cron reads .env, merge fix
Critical fixes: - C1: VM SA now gets secretmanager.secretAccessor only on specific secrets (JWT + each entry in runtime_secrets). Previously project-wide. - C3: chmod 640 on /var/log/agnes-startup.log (defense in depth) - C4: Remove '|| echo ""' fallback on keboola-storage-token — boot now fails loudly if the secret is missing instead of starting a broken app. - C5: Cron auto-upgrade script sources /opt/agnes/.env for AGNES_TAG. If an operator edits .env to pin a specific stable-YYYY.MM.N, cron picks it up immediately with no drift. Removed AGNES_TAG from crontab entry. - C7: explicit depends_on = [IAM bindings, secret_version] on VM — prevents race where VM boots before IAM propagates. Important fixes: - I1: Split firewall into web (80/443 + conditional 8000) and ssh (port 22 with configurable source_ranges, default IAP range only). - I4: Fetch docker-compose files from compose_ref (default 'main'), so customers can pin a specific tag for reproducibility. - I5+I6: Merge order fixed — user-supplied dev_instances values now override defaults (was the other way around). Dev tls_mode default flipped to 'none'. - I7: Remove '|| true' on Caddyfile fetch; surface failures loudly. - New acme_email variable (falls back to seed_admin_email if empty). Out-of-module: - Comments translated from Czech to English where applicable (M1).
This commit is contained in:
parent
6470e23df3
commit
921094ae40
3 changed files with 123 additions and 33 deletions
|
|
@ -14,16 +14,19 @@ terraform {
|
|||
|
||||
locals {
|
||||
# Normalize all instances into a single list so for_each is uniform across prod + dev.
|
||||
all_instances = concat(
|
||||
[merge(var.prod_instance, { role = "prod" })],
|
||||
[for d in var.dev_instances : merge(d, {
|
||||
# Note: merge({defaults}, d) — d overrides defaults (fix for v1.3.0 bug where
|
||||
# defaults overrode user-supplied values).
|
||||
dev_defaults = {
|
||||
role = "dev"
|
||||
disk_size_gb = 30
|
||||
data_disk_gb = 20
|
||||
upgrade_mode = "auto"
|
||||
tls_mode = "caddy"
|
||||
tls_mode = "none" # dev VMs default to plain HTTP; TLS requires domain
|
||||
domain = ""
|
||||
})]
|
||||
}
|
||||
all_instances = concat(
|
||||
[merge(var.prod_instance, { role = "prod" })],
|
||||
[for d in var.dev_instances : merge(local.dev_defaults, d)]
|
||||
)
|
||||
}
|
||||
|
||||
|
|
@ -47,7 +50,7 @@ resource "google_secret_manager_secret_version" "jwt" {
|
|||
secret_data = random_password.jwt.result
|
||||
}
|
||||
|
||||
# --- VM service account (dedikovaný, jen read Secret Manageru) ---
|
||||
# --- VM service account (dedicated, read-only on specific secrets only) ---
|
||||
|
||||
resource "google_service_account" "vm" {
|
||||
account_id = "agnes-${var.customer_name}-vm"
|
||||
|
|
@ -55,14 +58,37 @@ resource "google_service_account" "vm" {
|
|||
project = var.gcp_project_id
|
||||
}
|
||||
|
||||
resource "google_project_iam_member" "vm_secrets" {
|
||||
# Grant read access only to the JWT secret this module owns.
|
||||
# Not project-wide — if the customer adds unrelated secrets (e.g. Stripe key)
|
||||
# to the same project, Agnes VM must NOT be able to read them.
|
||||
resource "google_secret_manager_secret_iam_member" "vm_jwt" {
|
||||
project = var.gcp_project_id
|
||||
secret_id = google_secret_manager_secret.jwt.secret_id
|
||||
role = "roles/secretmanager.secretAccessor"
|
||||
member = "serviceAccount:${google_service_account.vm.email}"
|
||||
}
|
||||
|
||||
# Grant read access to additional secrets the app needs (e.g. keboola-storage-token).
|
||||
# Caller specifies these via var.runtime_secrets. Each secret must already exist.
|
||||
resource "google_secret_manager_secret_iam_member" "vm_runtime" {
|
||||
for_each = toset(var.runtime_secrets)
|
||||
project = var.gcp_project_id
|
||||
secret_id = each.value
|
||||
role = "roles/secretmanager.secretAccessor"
|
||||
member = "serviceAccount:${google_service_account.vm.email}"
|
||||
}
|
||||
|
||||
# --- Network ---
|
||||
|
||||
# Web firewall: 80/443 for Caddy (TLS), 8000 only when TLS is disabled (direct HTTP).
|
||||
# Separate rule for SSH (port 22) — default restricted to IAP tunnel range.
|
||||
locals {
|
||||
# Expose raw :8000 only when any instance has tls_mode != "caddy".
|
||||
# If Caddy handles TLS, customers should hit 80/443, not bypass to 8000.
|
||||
expose_raw_http_port = anytrue([for inst in local.all_instances : inst.tls_mode != "caddy"])
|
||||
web_ports = local.expose_raw_http_port ? ["80", "443", "8000"] : ["80", "443"]
|
||||
}
|
||||
|
||||
resource "google_compute_firewall" "web" {
|
||||
name = "agnes-${var.customer_name}-allow-web"
|
||||
project = var.gcp_project_id
|
||||
|
|
@ -70,13 +96,27 @@ resource "google_compute_firewall" "web" {
|
|||
|
||||
allow {
|
||||
protocol = "tcp"
|
||||
ports = ["22", "80", "443", "8000"]
|
||||
ports = local.web_ports
|
||||
}
|
||||
|
||||
source_ranges = ["0.0.0.0/0"]
|
||||
target_tags = ["agnes-${var.customer_name}"]
|
||||
}
|
||||
|
||||
resource "google_compute_firewall" "ssh" {
|
||||
name = "agnes-${var.customer_name}-allow-ssh"
|
||||
project = var.gcp_project_id
|
||||
network = "default"
|
||||
|
||||
allow {
|
||||
protocol = "tcp"
|
||||
ports = ["22"]
|
||||
}
|
||||
|
||||
source_ranges = var.firewall_ssh_source_ranges
|
||||
target_tags = ["agnes-${var.customer_name}"]
|
||||
}
|
||||
|
||||
# --- Backup policy: daily snapshot with 30-day retention ---
|
||||
|
||||
resource "google_compute_resource_policy" "daily_backup" {
|
||||
|
|
@ -175,10 +215,12 @@ resource "google_compute_instance" "vm" {
|
|||
upgrade_mode = each.value.upgrade_mode
|
||||
tls_mode = each.value.tls_mode
|
||||
domain = each.value.domain
|
||||
acme_email = var.acme_email != "" ? var.acme_email : var.seed_admin_email
|
||||
data_source = var.data_source
|
||||
keboola_stack_url = var.keboola_stack_url
|
||||
seed_admin_email = var.seed_admin_email
|
||||
role = each.value.role
|
||||
compose_ref = var.compose_ref
|
||||
})
|
||||
|
||||
service_account {
|
||||
|
|
@ -193,11 +235,20 @@ resource "google_compute_instance" "vm" {
|
|||
managed = "terraform"
|
||||
}
|
||||
|
||||
# Změna startup scriptu nemění běžící VM (script běží jen na boot).
|
||||
# Pro aplikaci změn je potřeba VM restartovat nebo recreate.
|
||||
# Startup script changes do not modify running VMs (script only runs on boot).
|
||||
# To propagate module changes, use:
|
||||
# terraform apply -replace='module.agnes.google_compute_instance.vm["agnes-prod"]'
|
||||
lifecycle {
|
||||
ignore_changes = [metadata_startup_script]
|
||||
}
|
||||
|
||||
# Ensure VM SA has read access to required secrets BEFORE the VM boots — otherwise
|
||||
# the startup script's `gcloud secrets versions access` can 403 due to IAM lag.
|
||||
depends_on = [
|
||||
google_secret_manager_secret_iam_member.vm_jwt,
|
||||
google_secret_manager_secret_iam_member.vm_runtime,
|
||||
google_secret_manager_secret_version.jwt,
|
||||
]
|
||||
}
|
||||
|
||||
# --- Monitoring: uptime check on each VM's /api/health endpoint ---
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
#!/bin/bash
|
||||
# Agnes VM startup script — templated by Terraform.
|
||||
# Idempotent — spustí se při každém boot.
|
||||
# Idempotent — runs on every boot.
|
||||
set -euo pipefail
|
||||
exec > /var/log/agnes-startup.log 2>&1
|
||||
chmod 640 /var/log/agnes-startup.log # defense in depth — not readable by non-root
|
||||
|
||||
CUSTOMER_NAME="${customer_name}"
|
||||
IMAGE_REPO="${image_repo}"
|
||||
|
|
@ -10,10 +11,12 @@ IMAGE_TAG="${image_tag}"
|
|||
UPGRADE_MODE="${upgrade_mode}"
|
||||
TLS_MODE="${tls_mode}"
|
||||
DOMAIN="${domain}"
|
||||
ACME_EMAIL="${acme_email}"
|
||||
DATA_SOURCE="${data_source}"
|
||||
KEBOOLA_STACK_URL="${keboola_stack_url}"
|
||||
SEED_ADMIN_EMAIL="${seed_admin_email}"
|
||||
ROLE="${role}"
|
||||
COMPOSE_REF="${compose_ref}"
|
||||
|
||||
echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup at $(date) ==="
|
||||
|
||||
|
|
@ -43,21 +46,25 @@ APP_DIR="/opt/agnes"
|
|||
mkdir -p "$APP_DIR"
|
||||
cd "$APP_DIR"
|
||||
|
||||
# Fetch minimal docker-compose from public repo (main branch — stable)
|
||||
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.yml" -o docker-compose.yml
|
||||
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.prod.yml" -o docker-compose.prod.yml
|
||||
# Fetch docker-compose files pinned to $COMPOSE_REF (defaults to `main`; pin to a
|
||||
# stable-YYYY.MM.N tag for reproducibility across VM rebuilds).
|
||||
RAW_BASE="https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/$${COMPOSE_REF}"
|
||||
curl -fsSL "$${RAW_BASE}/docker-compose.yml" -o docker-compose.yml
|
||||
curl -fsSL "$${RAW_BASE}/docker-compose.prod.yml" -o docker-compose.prod.yml
|
||||
# Overlay which binds `data` volume to host /data (persistent disk mounted above)
|
||||
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml
|
||||
curl -fsSL "$${RAW_BASE}/docker-compose.host-mount.yml" -o docker-compose.host-mount.yml
|
||||
|
||||
# TLS overlay (Caddy + Let's Encrypt) — jen pokud potřeba
|
||||
# TLS overlay (Caddy + Let's Encrypt) — fetch only when actually needed; surface failures
|
||||
if [ "$TLS_MODE" = "caddy" ] && [ -n "$DOMAIN" ]; then
|
||||
curl -fsSL "https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/Caddyfile" -o Caddyfile 2>/dev/null || true
|
||||
curl -fsSL "$${RAW_BASE}/Caddyfile" -o Caddyfile
|
||||
fi
|
||||
|
||||
# --- 4. Fetch secrets from Secret Manager ---
|
||||
# --- 4. Fetch secrets from Secret Manager — fail loudly if missing ---
|
||||
KEBOOLA_TOKEN=""
|
||||
if [ "$DATA_SOURCE" = "keboola" ]; then
|
||||
KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token 2>/dev/null || echo "")
|
||||
# No `|| echo ""` fallback — if the token secret is missing, boot should fail
|
||||
# loudly rather than silently start an app that will fail sync cryptically later.
|
||||
KEBOOLA_TOKEN=$(gcloud secrets versions access latest --secret=keboola-storage-token)
|
||||
fi
|
||||
JWT_KEY=$(gcloud secrets versions access latest --secret=agnes-$${CUSTOMER_NAME}-jwt-secret)
|
||||
|
||||
|
|
@ -71,7 +78,7 @@ SEED_ADMIN_EMAIL=$SEED_ADMIN_EMAIL
|
|||
LOG_LEVEL=info
|
||||
DOMAIN=$DOMAIN
|
||||
AGNES_TAG=$IMAGE_TAG
|
||||
ACME_EMAIL=admin@$${DOMAIN#*.}
|
||||
ACME_EMAIL=$ACME_EMAIL
|
||||
ENVEOF
|
||||
chmod 600 "$APP_DIR/.env"
|
||||
|
||||
|
|
@ -86,27 +93,35 @@ COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compos
|
|||
docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG pull
|
||||
docker compose $COMPOSE_FILES $COMPOSE_PROFILES_ARG up -d
|
||||
|
||||
# --- 6. Auto-upgrade via cron (pullne nový tag každých 5 min) ---
|
||||
# --- 6. Auto-upgrade via cron (pulls new image digest every 5 min) ---
|
||||
if [ "$UPGRADE_MODE" = "auto" ]; then
|
||||
# Cron script sources /opt/agnes/.env for AGNES_TAG — so if operator edits .env
|
||||
# (e.g. to pin a specific stable-YYYY.MM.N), cron picks it up immediately. No
|
||||
# drift between what compose up reads and what the digest-check inspects.
|
||||
cat > /usr/local/bin/agnes-auto-upgrade.sh <<'SCRIPTEOF'
|
||||
#!/bin/bash
|
||||
# Spouští se z cronu — pullne nový image, pokud je, a restartne containers.
|
||||
# Runs from cron — pulls new image if one is available, restarts containers.
|
||||
set -euo pipefail
|
||||
cd /opt/agnes
|
||||
# Source .env so AGNES_TAG reflects any operator edits since boot.
|
||||
# shellcheck disable=SC1091
|
||||
set -a; . /opt/agnes/.env; set +a
|
||||
IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable}"
|
||||
COMPOSE_FILES="-f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml"
|
||||
BEFORE=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1)
|
||||
BEFORE=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1)
|
||||
docker compose $COMPOSE_FILES pull >/dev/null 2>&1
|
||||
AFTER=$(docker images --no-trunc --format '{{.Digest}}' ghcr.io/keboola/agnes-the-ai-analyst:$${AGNES_TAG:-stable} | head -1)
|
||||
AFTER=$(docker images --no-trunc --format '{{.Digest}}' "$IMAGE" | head -1)
|
||||
if [ "$BEFORE" != "$AFTER" ]; then
|
||||
echo "$(date): new image digest — recreating containers"
|
||||
echo "$(date): new image digest for $IMAGE — recreating containers"
|
||||
docker compose $COMPOSE_FILES up -d
|
||||
docker image prune -f >/dev/null 2>&1
|
||||
fi
|
||||
SCRIPTEOF
|
||||
chmod +x /usr/local/bin/agnes-auto-upgrade.sh
|
||||
|
||||
# Přidat do crontab (idempotentně — `sort -u` vyhodí duplikáty)
|
||||
(crontab -l 2>/dev/null; echo "*/5 * * * * AGNES_TAG=$IMAGE_TAG /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1") | sort -u | crontab -
|
||||
# Install cron entry idempotently: remove any prior agnes-auto-upgrade line, then append ours.
|
||||
CRON_LINE="*/5 * * * * /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1"
|
||||
(crontab -l 2>/dev/null | grep -v agnes-auto-upgrade || true; echo "$CRON_LINE") | crontab -
|
||||
fi
|
||||
|
||||
echo "=== [Agnes $CUSTOMER_NAME $ROLE] Startup complete at $(date) ==="
|
||||
|
|
|
|||
|
|
@ -71,6 +71,12 @@ variable "image_repo" {
|
|||
default = "ghcr.io/keboola/agnes-the-ai-analyst"
|
||||
}
|
||||
|
||||
variable "compose_ref" {
|
||||
description = "Git ref to fetch docker-compose.yml and overlays from (in keboola/agnes-the-ai-analyst). Use `main` for latest, or a tag like `stable-2026.04.47` for reproducibility."
|
||||
type = string
|
||||
default = "main"
|
||||
}
|
||||
|
||||
variable "enable_monitoring" {
|
||||
description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful."
|
||||
type = bool
|
||||
|
|
@ -82,3 +88,21 @@ variable "notification_channel_ids" {
|
|||
type = list(string)
|
||||
default = []
|
||||
}
|
||||
|
||||
variable "runtime_secrets" {
|
||||
description = "Names of existing Secret Manager secrets the VM needs to read at runtime (e.g. Keboola Storage token). VM SA gets scoped secretAccessor on each."
|
||||
type = list(string)
|
||||
default = ["keboola-storage-token"]
|
||||
}
|
||||
|
||||
variable "firewall_ssh_source_ranges" {
|
||||
description = "CIDR ranges allowed to reach SSH (port 22). Default is IAP tunnel range only (use `gcloud compute ssh --tunnel-through-iap`). Override to `[\"0.0.0.0/0\"]` for unrestricted (not recommended)."
|
||||
type = list(string)
|
||||
default = ["35.235.240.0/20"]
|
||||
}
|
||||
|
||||
variable "acme_email" {
|
||||
description = "Email for Let's Encrypt account (used when tls_mode=caddy). Defaults to seed_admin_email if empty."
|
||||
type = string
|
||||
default = ""
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue