agnes-the-ai-analyst/infra/modules/customer-instance/main.tf
ZdenekSrotyr e2eb51f657
ci(release): build image for all branches, not just feature/** (#19)
* dryrun: intentional failing test (will be reverted)

* feat(auth): optional SEED_ADMIN_PASSWORD to pre-hash seed admin (dev helper)

Terraform gains enable_seed_password + seed_admin_password (sensitive) vars
on the customer-instance module; when enabled the password is piped via
startup-script into /opt/agnes/.env as SEED_ADMIN_PASSWORD. On first boot
app/main.py argon2-hashes it onto the seed user so the admin can log in
immediately without going through /auth/bootstrap. Never overwrites an
existing password_hash — safe against accidental reset on terraform apply.

* ci(release): build :dev-<slug> on any branch, not just feature/**

Before: only 'feature/**' branches triggered release.yml, so pushing
'zs/my-edit' or 'fix/bug' did not publish an image. dev_instances entry
pinning image_tag = 'dev-zs-my-edit' then crashed VM startup with
'image not found'.

Now: any branch push (except main, which produces :stable) publishes
:dev-<slug>. Slug strips a leading 'feature/' and replaces non-[a-z0-9-]
with '-', keeping existing feature/** behavior identical.

* Revert "dryrun: intentional failing test (will be reverted)"

This reverts commit cf9cc06a7884bb401ff29fc5cb6d8baf84dc3daa.
2026-04-21 21:33:57 +02:00

313 lines
9.1 KiB
HCL

# Toolchain and provider version constraints for this module.
terraform {
  required_version = ">= 1.5"

  required_providers {
    # Google Cloud resources (compute, secret manager, IAM, monitoring).
    google = {
      version = "~> 5.0"
      source  = "hashicorp/google"
    }

    # Supplies random_password, used for the generated JWT secret.
    random = {
      version = "~> 3.0"
      source  = "hashicorp/random"
    }
  }
}
locals {
  # Baseline attributes for dev instances. They are the FIRST argument to
  # merge() below, so any key supplied in a var.dev_instances entry wins
  # (v1.3.0 had the order reversed and defaults clobbered user values).
  dev_defaults = {
    role         = "dev"
    upgrade_mode = "auto"
    tls_mode     = "none" # dev VMs default to plain HTTP; TLS requires domain
    domain       = ""
    disk_size_gb = 30
    data_disk_gb = 20
  }

  # Single flat list (prod first, then every dev instance) so that all
  # for_each expressions in this module iterate over the same shape.
  # The prod entry always has role forced to "prod".
  all_instances = concat(
    [
      merge(var.prod_instance, { role = "prod" })
    ],
    [
      for dev in var.dev_instances :
      merge(local.dev_defaults, dev)
    ]
  )
}
# --- Secrets ---
# Secret Manager container for this customer's JWT secret. The value itself
# is stored by google_secret_manager_secret_version.jwt.
resource "google_secret_manager_secret" "jwt" {
  project   = var.gcp_project_id
  secret_id = "agnes-${var.customer_name}-jwt-secret"

  # Let Google choose replica locations automatically.
  replication {
    auto {}
  }
}
# Generated value for the JWT secret: 48 characters, no special characters.
resource "random_password" "jwt" {
  special = false
  length  = 48
}
# Stores the generated password as a version of the JWT secret.
resource "google_secret_manager_secret_version" "jwt" {
  secret_data = random_password.jwt.result
  secret      = google_secret_manager_secret.jwt.id
}
# --- VM service account (dedicated, read-only on specific secrets only) ---
# Identity the VMs run as; secret access is granted per-secret by the
# IAM member resources in this module.
resource "google_service_account" "vm" {
  project      = var.gcp_project_id
  account_id   = "agnes-${var.customer_name}-vm"
  display_name = "Agnes VM runtime SA (${var.customer_name})"
}
# Secret-level (not project-level) accessor binding on the JWT secret owned
# by this module. Scoping it to the single secret means unrelated secrets
# the customer keeps in the same project (e.g. a Stripe key) stay
# unreadable to the Agnes VM.
resource "google_secret_manager_secret_iam_member" "vm_jwt" {
  project   = var.gcp_project_id
  secret_id = google_secret_manager_secret.jwt.secret_id

  member = "serviceAccount:${google_service_account.vm.email}"
  role   = "roles/secretmanager.secretAccessor"
}
# Accessor bindings for extra secrets the app needs at runtime
# (e.g. keboola-storage-token). The caller lists them in var.runtime_secrets;
# this module does not create them, so each one must already exist.
resource "google_secret_manager_secret_iam_member" "vm_runtime" {
  # One binding per distinct secret id (for a set, each.key == each.value).
  for_each = toset(var.runtime_secrets)

  project   = var.gcp_project_id
  secret_id = each.key

  member = "serviceAccount:${google_service_account.vm.email}"
  role   = "roles/secretmanager.secretAccessor"
}
# --- Network ---
# Web firewall: 80/443 for Caddy (TLS), 8000 only when TLS is disabled (direct HTTP).
# SSH (port 22) gets its own rule — default restricted to IAP tunnel range.
locals {
  # True when at least one instance is NOT fronted by Caddy and therefore
  # has to be reached on the raw application port. Instances that do use
  # Caddy are meant to be reached on 80/443 only.
  # NOTE(review): the web firewall rule targets the tag shared by every VM
  # in this module, so one non-Caddy instance opens :8000 on the Caddy
  # instances as well — confirm that is acceptable.
  expose_raw_http_port = anytrue([
    for inst in local.all_instances : inst.tls_mode != "caddy"
  ])

  # 80 and 443 are always open; 8000 is appended only when required.
  web_ports = concat(
    ["80", "443"],
    local.expose_raw_http_port ? ["8000"] : []
  )
}
# Public ingress for web traffic on the ports computed in local.web_ports.
resource "google_compute_firewall" "web" {
  project = var.gcp_project_id
  network = "default"
  name    = "agnes-${var.customer_name}-allow-web"

  # Applies to every VM carrying this customer's network tag, from anywhere.
  target_tags   = ["agnes-${var.customer_name}"]
  source_ranges = ["0.0.0.0/0"]

  allow {
    protocol = "tcp"
    ports    = local.web_ports
  }
}
# SSH ingress (tcp/22), kept separate from the web rule so its allowed
# source ranges can be controlled independently by the caller.
resource "google_compute_firewall" "ssh" {
  project = var.gcp_project_id
  network = "default"
  name    = "agnes-${var.customer_name}-allow-ssh"

  target_tags   = ["agnes-${var.customer_name}"]
  source_ranges = var.firewall_ssh_source_ranges

  allow {
    protocol = "tcp"
    ports    = ["22"]
  }
}
# --- Backup policy: daily snapshot with 30-day retention ---
resource "google_compute_resource_policy" "daily_backup" {
  project = var.gcp_project_id
  region  = var.region
  name    = "agnes-${var.customer_name}-daily-backup"

  snapshot_schedule_policy {
    # One snapshot every day, starting at 02:00.
    schedule {
      daily_schedule {
        start_time    = "02:00"
        days_in_cycle = 1
      }
    }

    # Snapshots expire after 30 days, but existing ones are kept if the
    # source disk is deleted.
    retention_policy {
      on_source_disk_delete = "KEEP_AUTO_SNAPSHOTS"
      max_retention_days    = 30
    }

    # Labels stamped on each snapshot.
    snapshot_properties {
      labels = {
        customer = var.customer_name
        app      = "agnes"
      }
    }
  }
}
# --- Persistent data disks + VMs (prod + dev) ---
# One SSD data disk per instance, keyed by instance name and sized from the
# instance's data_disk_gb attribute.
resource "google_compute_disk" "data" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  project = var.gcp_project_id
  zone    = var.zone
  name    = "${each.value.name}-data"

  type = "pd-ssd"
  size = each.value.data_disk_gb
}
# Binds the daily snapshot policy to every data disk. Boot disks are left
# out on purpose: they are ephemeral and the app code lives in the image,
# so there is nothing on them worth snapshotting.
resource "google_compute_disk_resource_policy_attachment" "data_backup" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  project = var.gcp_project_id
  zone    = var.zone

  # `name` is the policy being attached; `disk` is the disk it attaches to.
  name = google_compute_resource_policy.daily_backup.name
  disk = google_compute_disk.data[each.key].name
}
# One reserved regional IP address per instance, keyed by instance name.
# It is used as the VM's NAT IP and as the uptime-check host.
resource "google_compute_address" "ip" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  project = var.gcp_project_id
  region  = var.region
  name    = "${each.value.name}-ip"
}
# One Compute Engine VM per entry in local.all_instances (prod + dev),
# keyed by instance name.
resource "google_compute_instance" "vm" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  project      = var.gcp_project_id
  zone         = var.zone
  name         = each.value.name
  machine_type = each.value.machine_type

  # Network tag matched by the web and SSH firewall rules of this module.
  tags = ["agnes-${var.customer_name}"]

  labels = {
    app      = "agnes"
    customer = var.customer_name
    role     = each.value.role
    managed  = "terraform"
  }

  # Boot disk: Ubuntu 24.04 LTS on SSD, sized per instance.
  boot_disk {
    initialize_params {
      image = "ubuntu-os-cloud/ubuntu-2404-lts-amd64"
      type  = "pd-ssd"
      size  = each.value.disk_size_gb
    }
  }

  # Persistent data disk created for this instance, attached under the
  # device name "data".
  attached_disk {
    device_name = "data"
    source      = google_compute_disk.data[each.key].self_link
  }

  # Default network, with the reserved address as the external (NAT) IP.
  network_interface {
    network = "default"

    access_config {
      nat_ip = google_compute_address.ip[each.key].address
    }
  }

  # Run as the dedicated VM service account.
  service_account {
    email  = google_service_account.vm.email
    scopes = ["cloud-platform"]
  }

  metadata = {
    enable-oslogin = "TRUE"
  }

  # Boot-time script rendered from the module's template with the
  # per-instance and module-wide settings below.
  metadata_startup_script = templatefile("${path.module}/startup-script.sh.tpl", {
    # Module-wide settings
    customer_name     = var.customer_name
    image_repo        = var.image_repo
    compose_ref       = var.compose_ref
    data_source       = var.data_source
    keboola_stack_url = var.keboola_stack_url
    seed_admin_email  = var.seed_admin_email
    # ACME contact falls back to the seed admin's email when not set.
    acme_email = var.acme_email != "" ? var.acme_email : var.seed_admin_email
    # The password is handed to the script only when explicitly enabled;
    # otherwise an empty string is passed.
    seed_admin_password = var.enable_seed_password ? var.seed_admin_password : ""

    # Per-instance settings
    role         = each.value.role
    image_tag    = each.value.image_tag
    upgrade_mode = each.value.upgrade_mode
    tls_mode     = each.value.tls_mode
    domain       = each.value.domain
  })

  # Startup script changes do not modify running VMs (the script only runs
  # on boot), so drift in it is ignored. To propagate module changes, use:
  #   terraform apply -replace='module.agnes.google_compute_instance.vm["agnes-prod"]'
  lifecycle {
    ignore_changes = [metadata_startup_script]
  }

  # The VM SA must have read access to its secrets BEFORE the VM boots —
  # otherwise the startup script's `gcloud secrets versions access` can 403
  # due to IAM lag.
  depends_on = [
    google_secret_manager_secret_iam_member.vm_jwt,
    google_secret_manager_secret_iam_member.vm_runtime,
    google_secret_manager_secret_version.jwt,
  ]
}
# --- Monitoring: uptime check on each VM's /api/health endpoint ---
# One check per instance, created only when var.enable_monitoring is true.
#
# The probe target follows the instance's tls_mode so that it always goes
# through a port the web firewall actually opens:
#   * tls_mode == "caddy" -> HTTPS on 443. Raw :8000 is closed by the web
#     firewall once every instance uses Caddy, so probing it would fail
#     permanently and keep the alert policy firing.
#   * anything else       -> plain HTTP on 8000 (direct to the app).
resource "google_monitoring_uptime_check_config" "health" {
  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}

  project      = var.gcp_project_id
  display_name = "agnes-${var.customer_name}-${each.value.name}-health"
  timeout      = "10s"
  period       = "60s"

  http_check {
    path    = "/api/health"
    port    = each.value.tls_mode == "caddy" ? 443 : 8000
    use_ssl = each.value.tls_mode == "caddy"
    # Certificate validation stays off, as before: the check reports
    # reachability of the health endpoint, not certificate validity.
    validate_ssl = false
  }

  monitored_resource {
    type = "uptime_url"
    labels = {
      project_id = var.gcp_project_id
      # For Caddy instances probe the configured domain so the TLS handshake
      # uses the right server name; fall back to the reserved IP when no
      # domain is set or when the instance is served over plain HTTP.
      host = (
        each.value.tls_mode == "caddy" && each.value.domain != ""
        ? each.value.domain
        : google_compute_address.ip[each.key].address
      )
    }
  }
}
# --- Monitoring: alert when health fails for > 5 min ---
# One policy per instance, tied to that instance's uptime check. Created
# only when var.enable_monitoring is true.
resource "google_monitoring_alert_policy" "health_failure" {
  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}

  project               = var.gcp_project_id
  display_name          = "agnes-${var.customer_name}-${each.value.name}-health-failure"
  enabled               = true
  notification_channels = var.notification_channel_ids
  combiner              = "OR"

  conditions {
    display_name = "Uptime check failed > 5 min"

    condition_threshold {
      # Select the check_passed metric of this instance's uptime check only.
      filter = join(" AND ", [
        "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\"",
        "metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\"",
        "resource.type=\"uptime_url\"",
      ])

      # ALIGN_FRACTION_TRUE yields, per 60s window, the fraction of checks
      # that returned true.
      aggregations {
        per_series_aligner = "ALIGN_FRACTION_TRUE"
        alignment_period   = "60s"
      }

      # Fire when that fraction stays below 1 (i.e. any probe failed)
      # for 5 minutes.
      comparison      = "COMPARISON_LT"
      threshold_value = 1
      duration        = "300s"

      trigger {
        count = 1
      }
    }
  }
}