feat(infra): add daily backup snapshot + monitoring alerts

- google_compute_resource_policy.daily_backup: daily snapshot at 02:00, 30-day retention, labels (app=agnes, customer=<name>)
- google_compute_disk_resource_policy_attachment.data_backup: attach policy to each data disk (prod + dev)
- google_monitoring_uptime_check_config.health: per-VM /api/health uptime check every 60s, 10s timeout
- google_monitoring_alert_policy.health_failure: alert when uptime check fails for > 5 min

New opt-out: enable_monitoring = false (default true).
New opt-in: notification_channel_ids = [...] to wire alerts to email/Slack.

Module API is backward-compatible (two new optional variables, two new outputs; nothing existing changed); existing customers pick up backups + monitoring on the next module upgrade. TF provider requirement unchanged.
2026-04-21 19:01:56 +02:00 · 2026-04-21 19:01:56 +02:00 · 0842debf8a
commit 0842debf8a
parent 5188bd9127
3 changed files with 120 additions and 0 deletions
--- a/infra/modules/customer-instance/main.tf
+++ b/infra/modules/customer-instance/main.tf
@ -77,6 +77,33 @@ resource "google_compute_firewall" "web" {
  target_tags   = ["agnes-${var.customer_name}"]
 }
 # --- Backup policy: one snapshot per day, kept for 30 days ---
 # Regional snapshot schedule, one per customer. It only takes effect on disks
 # it is attached to via google_compute_disk_resource_policy_attachment.
 resource "google_compute_resource_policy" "daily_backup" {
  project = var.gcp_project_id
  region  = var.region
  name    = "agnes-${var.customer_name}-daily-backup"
  snapshot_schedule_policy {
    # Labels applied to every snapshot this schedule produces.
    snapshot_properties {
      labels = {
        customer = var.customer_name
        app      = "agnes"
      }
    }
    # Keep each snapshot for 30 days; snapshots already taken survive the
    # deletion of their source disk.
    retention_policy {
      on_source_disk_delete = "KEEP_AUTO_SNAPSHOTS"
      max_retention_days    = 30
    }
    # Run once every day (days_in_cycle = 1), starting at 02:00.
    # NOTE(review): start_time is presumably interpreted as UTC — confirm
    # against the provider documentation.
    schedule {
      daily_schedule {
        start_time    = "02:00"
        days_in_cycle = 1
      }
    }
  }
 }
 # --- Persistent data disks + VMs (prod + dev) ---
 resource "google_compute_disk" "data" {
@ -89,6 +116,17 @@ resource "google_compute_disk" "data" {
  type    = "pd-ssd"
 }
 # Snapshot only the persistent data disks: boot disks are ephemeral and the
 # application code ships in the image, so there is nothing on them to back up.
 # One attachment is created per entry of local.all_instances, keyed by name.
 resource "google_compute_disk_resource_policy_attachment" "data_backup" {
  for_each = { for instance in local.all_instances : instance.name => instance }
  # `name` is the resource policy being attached; `disk` is the disk that
  # receives it (looked up with the same key as this attachment).
  name    = google_compute_resource_policy.daily_backup.name
  disk    = google_compute_disk.data[each.key].name
  zone    = var.zone
  project = var.gcp_project_id
 }
 resource "google_compute_address" "ip" {
  for_each = { for inst in local.all_instances : inst.name => inst }
@ -161,3 +199,63 @@ resource "google_compute_instance" "vm" {
    ignore_changes = [metadata_startup_script]
  }
 }
 # --- Monitoring: HTTP uptime probe against /api/health on every VM ---
 # One check per entry of local.all_instances while var.enable_monitoring is
 # true; no checks at all when it is false.
 resource "google_monitoring_uptime_check_config" "health" {
  for_each = var.enable_monitoring ? { for instance in local.all_instances : instance.name => instance } : {}
  display_name = "agnes-${var.customer_name}-${each.value.name}-health"
  project      = var.gcp_project_id
  # Probe every 60 seconds, with a 10 second timeout per probe.
  period       = "60s"
  timeout      = "10s"
  # Target: the address held by google_compute_address.ip for this instance,
  # monitored as a plain URL.
  monitored_resource {
    type = "uptime_url"
    labels = {
      host       = google_compute_address.ip[each.key].address
      project_id = var.gcp_project_id
    }
  }
  # Plain HTTP on port 8000: SSL and certificate validation are both off.
  http_check {
    port         = "8000"
    path         = "/api/health"
    validate_ssl = false
    use_ssl      = false
  }
 }
 # --- Monitoring: alert when health fails for > 5 min ---
 # One alert policy per uptime check, so none exist when
 # var.enable_monitoring is false. Incidents are routed to
 # var.notification_channel_ids; with an empty list the policy still opens
 # incidents but nobody is notified.
 resource "google_monitoring_alert_policy" "health_failure" {
  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}
  project      = var.gcp_project_id
  display_name = "agnes-${var.customer_name}-${each.value.name}-health-failure"
  combiner     = "OR"
  conditions {
    display_name = "Uptime check failed > 5 min"
    condition_threshold {
      # Select only the check_passed series produced by this instance's uptime check.
      filter = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
      # The condition has to hold continuously for 5 minutes before an incident opens.
      duration        = "300s"
      # check_passed is one boolean series per checker location. The
      # aggregation below reduces it to "number of locations whose latest probe
      # failed", so the alert must fire when that count is HIGH. The previous
      # COMPARISON_LT 1 was true when nothing was failing. Requiring more than
      # one failing location keeps a single flaky checker from paging.
      comparison      = "COMPARISON_GT"
      threshold_value = 1
      aggregations {
        # REDUCE_COUNT_FALSE only accepts boolean series, so the aligner must
        # preserve the boolean value type: ALIGN_NEXT_OLDER does, whereas
        # ALIGN_FRACTION_TRUE emits doubles and makes the combination invalid.
        # The 1200s window only bounds how far back the aligner may look for
        # the most recent probe result, so a late probe does not create a gap
        # that resets the 300s duration.
        alignment_period     = "1200s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        group_by_fields      = ["resource.label.host"]
      }
      # A single violating (grouped) series is enough to trigger.
      trigger {
        count = 1
      }
    }
  }
  notification_channels = var.notification_channel_ids
  enabled               = true
 }
--- a/infra/modules/customer-instance/outputs.tf
+++ b/infra/modules/customer-instance/outputs.tf
@ -17,3 +17,13 @@ output "jwt_secret_name" {
  description = "Plný název JWT secretu v Secret Manageru"
  value       = google_secret_manager_secret.jwt.name
 }
 # Identifier of the snapshot schedule declared as
 # google_compute_resource_policy.daily_backup.
 output "backup_policy_id" {
  value       = google_compute_resource_policy.daily_backup.id
  description = "ID daily backup resource policy attached to data disks"
 }
 # Uptime check IDs keyed the same way as the checks themselves (instance
 # name). The check resource has no instances when monitoring is disabled, so
 # the map is then empty.
 output "uptime_check_ids" {
  value       = { for instance_name, check in google_monitoring_uptime_check_config.health : instance_name => check.uptime_check_id }
  description = "Map of instance name → uptime check ID (empty when enable_monitoring = false)"
 }
--- a/infra/modules/customer-instance/variables.tf
+++ b/infra/modules/customer-instance/variables.tf
@ -70,3 +70,15 @@ variable "image_repo" {
  type        = string
  default     = "ghcr.io/keboola/agnes-the-ai-analyst"
 }
 # Master switch for the monitoring resources: the uptime checks and alert
 # policies use it to decide whether to create anything. On by default, so
 # callers opt out explicitly with `enable_monitoring = false`.
 variable "enable_monitoring" {
  type        = bool
  default     = true
  description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful."
 }
 # Notification channels handed to every health alert policy. Defaults to an
 # empty list, so alerting is wired up only when the caller supplies channels.
 variable "notification_channel_ids" {
  type        = list(string)
  default     = []
  description = "Full resource IDs of GCP Monitoring notification channels (create in customer project via gcloud alpha monitoring channels create). Empty list = alerts fire but nothing is notified."
 }