feat(infra): add daily backup snapshot + monitoring alerts

- google_compute_resource_policy.daily_backup: daily snapshot at 02:00,
  30-day retention, labels (app=agnes, customer=<name>)
- google_compute_disk_resource_policy_attachment.data_backup: attach the
  policy to each data disk (prod + dev)
- google_monitoring_uptime_check_config.health: per-VM /api/health uptime
  check every 60s, 10s timeout
- google_monitoring_alert_policy.health_failure: alert when the uptime check
  fails for > 5 min

New opt-out: enable_monitoring = false (default true)
New opt-in: notification_channel_ids = [...] to wire alerts to email/Slack

Existing module inputs and outputs are unchanged (the two new variables have
defaults, so the change is backward-compatible); existing customers pick up
backups + monitoring on next module upgrade. TF provider requirement unchanged.
2026-04-21 19:01:56 +02:00 · 2026-04-21 19:01:56 +02:00 · 0842debf8a
commit 0842debf8a
parent 5188bd9127
3 changed files with 120 additions and 0 deletions
--- a/infra/modules/customer-instance/main.tf
+++ b/infra/modules/customer-instance/main.tf
@ -77,6 +77,33 @@ resource "google_compute_firewall" "web" {
  target_tags   = ["agnes-${var.customer_name}"]
 }

+# --- Backup policy: one snapshot per day, each retained for 30 days ---
+# Auto-snapshots are kept even if the source disk is deleted (see retention_policy).
+resource "google_compute_resource_policy" "daily_backup" {
+  project = var.gcp_project_id
+  region  = var.region
+  name    = "agnes-${var.customer_name}-daily-backup"
+
+  snapshot_schedule_policy {
+    snapshot_properties {
+      labels = {
+        customer = var.customer_name
+        app      = "agnes"
+      }
+    }
+    retention_policy {
+      on_source_disk_delete = "KEEP_AUTO_SNAPSHOTS"
+      max_retention_days    = 30
+    }
+    schedule {
+      daily_schedule {
+        start_time    = "02:00" # presumably interpreted as UTC by GCP — confirm
+        days_in_cycle = 1
+      }
+    }
+  }
+}
+
 # --- Persistent data disks + VMs (prod + dev) ---

 resource "google_compute_disk" "data" {
@ -89,6 +116,17 @@ resource "google_compute_disk" "data" {
  type    = "pd-ssd"
 }

+# Only the persistent data disks get the snapshot schedule: boot disks are
+# ephemeral and the application code ships in the image.
+resource "google_compute_disk_resource_policy_attachment" "data_backup" {
+  for_each = { for vm in local.all_instances : vm.name => vm }
+
+  name    = google_compute_resource_policy.daily_backup.name
+  disk    = google_compute_disk.data[each.key].name
+  zone    = var.zone
+  project = var.gcp_project_id
+}
+
 resource "google_compute_address" "ip" {
  for_each = { for inst in local.all_instances : inst.name => inst }

@ -161,3 +199,63 @@ resource "google_compute_instance" "vm" {
    ignore_changes = [metadata_startup_script]
  }
 }
+
+# --- Monitoring: plain-HTTP probe of /api/health on each VM's address ---
+# The for_each map is empty when var.enable_monitoring is false, so no checks exist.
+resource "google_monitoring_uptime_check_config" "health" {
+  for_each = { for vm in local.all_instances : vm.name => vm if var.enable_monitoring }
+
+  display_name = "agnes-${var.customer_name}-${each.value.name}-health"
+  project      = var.gcp_project_id
+  period       = "60s"
+  timeout      = "10s"
+
+  monitored_resource {
+    type = "uptime_url"
+    labels = {
+      host       = google_compute_address.ip[each.key].address
+      project_id = var.gcp_project_id
+    }
+  }
+
+  http_check {
+    use_ssl      = false
+    validate_ssl = false
+    port         = "8000"
+    path         = "/api/health"
+  }
+}
+
+# --- Monitoring: alert when health fails for > 5 min ---
+# One policy per VM; the filter pins each policy to that VM's uptime check by check_id.
+resource "google_monitoring_alert_policy" "health_failure" {
+  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}
+
+  project      = var.gcp_project_id
+  display_name = "agnes-${var.customer_name}-${each.value.name}-health-failure"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Uptime check failed > 5 min"
+    condition_threshold {
+      filter          = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
+      duration        = "300s"
+      comparison      = "COMPARISON_GT" # breach = MORE than threshold_value failing checkers
+      threshold_value = 1
+      # check_passed is a BOOL metric: keep it BOOL per series, then count the false ones per host.
+      aggregations {
+        alignment_period     = "60s"
+        per_series_aligner   = "ALIGN_NEXT_OLDER" # REDUCE_COUNT_FALSE needs BOOL input, not a fraction
+        cross_series_reducer = "REDUCE_COUNT_FALSE"
+        group_by_fields      = ["resource.label.host"]
+      }
+      # Requiring >1 failing checker avoids paging on a single checker's network blip.
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = var.notification_channel_ids
+  enabled               = true
+}
--- a/infra/modules/customer-instance/outputs.tf
+++ b/infra/modules/customer-instance/outputs.tf
@ -17,3 +17,13 @@ output "jwt_secret_name" {
  description = "Plný název JWT secretu v Secret Manageru"
  value       = google_secret_manager_secret.jwt.name
 }
+
+output "backup_policy_id" {
+  # Exposes the .id attribute of the single module-wide snapshot schedule policy.
+  value       = google_compute_resource_policy.daily_backup.id
+  description = "ID daily backup resource policy attached to data disks"
+}
+
+output "uptime_check_ids" {
+  value       = { for vm_name, check in google_monitoring_uptime_check_config.health : vm_name => check.uptime_check_id }
+  description = "Map of instance name → uptime check ID (empty when enable_monitoring = false)"
+}
--- a/infra/modules/customer-instance/variables.tf
+++ b/infra/modules/customer-instance/variables.tf
@ -70,3 +70,15 @@ variable "image_repo" {
  type        = string
  default     = "ghcr.io/keboola/agnes-the-ai-analyst"
 }
+
+variable "enable_monitoring" {
+  type        = bool
+  default     = true # opt-out: checks and alert policies are created unless this is set to false
+  description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful."
+}
+
+variable "notification_channel_ids" {
+  type        = list(string)
+  default     = [] # opt-in: alert policies are still created, just with no channels attached
+  description = "Full resource IDs of GCP Monitoring notification channels (create in customer project via gcloud alpha monitoring channels create). Empty list = alerts fire but nothing is notified."
+}