# Toolchain and provider version pins for this module.
terraform {
  required_version = ">= 1.5"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }

    random = {
      source  = "hashicorp/random"
      version = "~> 3.0"
    }
  }
}

locals {
  # Baseline attributes for dev instances. These are the *first* argument to
  # merge() below, so any key present in a var.dev_instances entry wins over
  # the value here (fix for the v1.3.0 bug where defaults overrode
  # user-supplied values).
  dev_defaults = {
    role         = "dev"
    disk_size_gb = 30
    data_disk_gb = 20
    upgrade_mode = "auto"
    tls_mode     = "none" # dev VMs default to plain HTTP; TLS requires domain
    domain       = ""
  }

  # Prod and dev flattened into one list so every for_each in the module can
  # iterate a uniform shape. The prod entry always gets role = "prod".
  all_instances = concat(
    [
      merge(var.prod_instance, { role = "prod" }),
    ],
    [
      for d in var.dev_instances : merge(local.dev_defaults, d)
    ],
  )
}

# --- Secrets ---

# Secret Manager container for the JWT signing secret (automatic replication).
resource "google_secret_manager_secret" "jwt" {
  project   = var.gcp_project_id
  secret_id = "agnes-${var.customer_name}-jwt-secret"

  replication {
    auto {}
  }
}

# 48-character generated value with special characters disabled.
resource "random_password" "jwt" {
  length  = 48
  special = false
}

# Stores the generated password as a version of the JWT secret.
resource "google_secret_manager_secret_version" "jwt" {
  secret      = google_secret_manager_secret.jwt.id
  secret_data = random_password.jwt.result
}

# --- VM service account (dedicated, read-only on specific secrets only) ---

resource "google_service_account" "vm" {
  project      = var.gcp_project_id
  account_id   = "agnes-${var.customer_name}-vm"
  display_name = "Agnes VM runtime SA (${var.customer_name})"
}

# Grant read access only to the JWT secret this module owns.
# Not project-wide — if the customer adds unrelated secrets (e.g. Stripe key)
# to the same project, Agnes VM must NOT be able to read them.
resource "google_secret_manager_secret_iam_member" "vm_jwt" {
  project   = var.gcp_project_id
  secret_id = google_secret_manager_secret.jwt.secret_id
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.vm.email}"
}

# Grant read access to additional secrets the app needs (e.g. keboola-storage-token).
# Caller specifies these via var.runtime_secrets. Each secret must already exist.
resource "google_secret_manager_secret_iam_member" "vm_runtime" {
  # One accessor binding per secret ID listed in var.runtime_secrets, granted
  # to the VM service account.
  for_each = toset(var.runtime_secrets)

  project   = var.gcp_project_id
  secret_id = each.value
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.vm.email}"
}

# --- Network ---
# Web firewall: 80/443 for Caddy (TLS), 8000 only when TLS is disabled (direct HTTP).
# Separate rule for SSH (port 22) — default restricted to IAP tunnel range.

locals {
  # Expose raw :8000 only when any instance has tls_mode != "caddy".
  # If Caddy handles TLS, customers should hit 80/443, not bypass to 8000.
  expose_raw_http_port = anytrue([
    for inst in local.all_instances : inst.tls_mode != "caddy"
  ])

  web_ports = local.expose_raw_http_port ? ["80", "443", "8000"] : ["80", "443"]
}

# Public web ingress for every VM carrying this customer's network tag.
resource "google_compute_firewall" "web" {
  name    = "agnes-${var.customer_name}-allow-web"
  project = var.gcp_project_id
  network = "default"

  allow {
    protocol = "tcp"
    ports    = local.web_ports
  }

  source_ranges = ["0.0.0.0/0"]
  target_tags   = ["agnes-${var.customer_name}"]
}

# SSH ingress, limited to the caller-supplied source ranges.
resource "google_compute_firewall" "ssh" {
  name    = "agnes-${var.customer_name}-allow-ssh"
  project = var.gcp_project_id
  network = "default"

  allow {
    protocol = "tcp"
    ports    = ["22"]
  }

  source_ranges = var.firewall_ssh_source_ranges
  target_tags   = ["agnes-${var.customer_name}"]
}

# --- Backup policy: daily snapshot with 30-day retention ---

resource "google_compute_resource_policy" "daily_backup" {
  name    = "agnes-${var.customer_name}-daily-backup"
  project = var.gcp_project_id
  region  = var.region

  snapshot_schedule_policy {
    schedule {
      daily_schedule {
        days_in_cycle = 1
        start_time    = "02:00"
      }
    }

    retention_policy {
      max_retention_days = 30
      # Snapshots already taken are kept even if the source disk is deleted.
      on_source_disk_delete = "KEEP_AUTO_SNAPSHOTS"
    }

    snapshot_properties {
      labels = {
        app      = "agnes"
        customer = var.customer_name
      }
    }
  }
}

# --- Persistent data disks + VMs (prod + dev) ---

# One SSD data disk per instance, keyed by instance name.
resource "google_compute_disk" "data" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  name    = "${each.value.name}-data"
  project = var.gcp_project_id
  zone    = var.zone
  size    = each.value.data_disk_gb
  type    = "pd-ssd"
}

# Attach daily backup policy to data disks (boot disks are ephemeral,
# app code lives in the image so no need to snapshot them)
resource "google_compute_disk_resource_policy_attachment" "data_backup" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  project = var.gcp_project_id
  zone    = var.zone
  disk    = google_compute_disk.data[each.key].name
  name    = google_compute_resource_policy.daily_backup.name
}

# One reserved regional address per instance, used as the VM's NAT IP below.
resource "google_compute_address" "ip" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  name    = "${each.value.name}-ip"
  project = var.gcp_project_id
  region  = var.region
}

resource "google_compute_instance" "vm" {
  for_each = { for inst in local.all_instances : inst.name => inst }

  name         = each.value.name
  project      = var.gcp_project_id
  machine_type = each.value.machine_type
  zone         = var.zone
  tags         = ["agnes-${var.customer_name}"]

  # Without this, a `machine_type` change in TF triggers a full
  # ForceNew (destroy + recreate) of the VM. The data disk would
  # survive (it's a separate `attached_disk`), but VM-local state
  # — fingerprints, journald, ephemeral caches — would not. With
  # `true`, the provider stops the VM, mutates the field, and
  # restarts it in place, which is what an operator resizing a
  # running deployment actually wants.
  allow_stopping_for_update = true

  boot_disk {
    initialize_params {
      image = "ubuntu-os-cloud/ubuntu-2404-lts-amd64"
      size  = each.value.disk_size_gb
      type  = "pd-ssd"
    }
  }

  attached_disk {
    source      = google_compute_disk.data[each.key].self_link
    device_name = "data"
  }

  network_interface {
    network = "default"

    access_config {
      nat_ip = google_compute_address.ip[each.key].address
    }
  }

  metadata = {
    enable-oslogin = "TRUE"
  }

  # NOTE(review): every value below, including seed_admin_password when
  # var.enable_seed_password is true, is rendered into the startup script
  # text, which is stored as instance metadata. Confirm that exposure is
  # acceptable for the seed password.
  metadata_startup_script = templatefile("${path.module}/startup-script.sh.tpl", {
    customer_name = var.customer_name
    image_repo    = var.image_repo
    image_tag     = each.value.image_tag
    upgrade_mode  = each.value.upgrade_mode
    tls_mode      = each.value.tls_mode
    domain        = each.value.domain
    # Falls back to the seed admin address when no ACME email is given.
    acme_email           = var.acme_email != "" ? var.acme_email : var.seed_admin_email
    data_source          = var.data_source
    keboola_stack_url    = var.keboola_stack_url
    seed_admin_email     = var.seed_admin_email
    seed_admin_password  = var.enable_seed_password ? var.seed_admin_password : ""
    role                 = each.value.role
    compose_ref          = var.compose_ref
  })

  service_account {
    email  = google_service_account.vm.email
    scopes = ["cloud-platform"]
  }

  labels = {
    app      = "agnes"
    customer = var.customer_name
    role     = each.value.role
    managed  = "terraform"
  }

  # Startup script changes do not modify running VMs (script only runs on boot).
  # To propagate module changes, use:
  #   terraform apply -replace='module.agnes.google_compute_instance.vm["agnes-prod"]'
  lifecycle {
    ignore_changes = [metadata_startup_script]
  }

  # Ensure VM SA has read access to required secrets BEFORE the VM boots — otherwise
  # the startup script's `gcloud secrets versions access` can 403 due to IAM lag.
  depends_on = [
    google_secret_manager_secret_iam_member.vm_jwt,
    google_secret_manager_secret_iam_member.vm_runtime,
    google_secret_manager_secret_version.jwt,
  ]
}

# --- Monitoring: uptime check on each VM's /api/health endpoint ---
#
# The probe target follows the instance's tls_mode so it matches what the web
# firewall actually exposes:
#   * tls_mode == "caddy": HTTPS on 443. The firewall opens :8000 only when
#     some instance is NOT behind Caddy, so probing :8000 here would be
#     blocked in an all-Caddy deployment and the alert would fire forever.
#     When a domain is set, the probe targets it and validates the
#     certificate; with no domain it falls back to the IP without validation.
#   * any other tls_mode: plain HTTP on :8000 against the reserved IP
#     (unchanged from the previous behaviour).
resource "google_monitoring_uptime_check_config" "health" {
  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}

  project      = var.gcp_project_id
  display_name = "agnes-${var.customer_name}-${each.value.name}-health"
  timeout      = "10s"
  period       = "60s"

  http_check {
    path         = "/api/health"
    port         = each.value.tls_mode == "caddy" ? 443 : 8000
    use_ssl      = each.value.tls_mode == "caddy"
    validate_ssl = each.value.tls_mode == "caddy" && each.value.domain != ""
  }

  monitored_resource {
    type = "uptime_url"

    labels = {
      project_id = var.gcp_project_id
      host = (
        each.value.tls_mode == "caddy" && each.value.domain != ""
        ? each.value.domain
        : google_compute_address.ip[each.key].address
      )
    }
  }
}

# --- Monitoring: alert when health fails for > 5 min ---

resource "google_monitoring_alert_policy" "health_failure" {
  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}

  project      = var.gcp_project_id
  display_name = "agnes-${var.customer_name}-${each.value.name}-health-failure"
  combiner     = "OR"

  conditions {
    display_name = "Uptime check failed > 5 min"

    condition_threshold {
      # Scoped to this instance's own uptime check via its check_id.
      filter   = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
      duration = "300s"

      # ALIGN_FRACTION_TRUE yields fraction of checks that returned true.
      # If the fraction stays < 1 (i.e. any probe failed) for 5 min → alert.
      comparison      = "COMPARISON_LT"
      threshold_value = 1

      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_FRACTION_TRUE"
      }

      trigger {
        count = 1
      }
    }
  }

  notification_channels = var.notification_channel_ids
  enabled               = true
}