From 0842debf8ae223e5f693378a04425df6ccdb61bd Mon Sep 17 00:00:00 2001
From: ZdenekSrotyr
Date: Tue, 21 Apr 2026 19:01:56 +0200
Subject: [PATCH] feat(infra): add daily backup snapshot + monitoring alerts

- google_compute_resource_policy.daily_backup: daily snapshot at 02:00 UTC,
  30-day retention, labels (app=agnes, customer=var.customer_name)
- google_compute_disk_resource_policy_attachment.data_backup: attach
  policy to each data disk (prod + dev)
- google_monitoring_uptime_check_config.health: per-VM /api/health
  uptime check every 60s, 10s timeout
- google_monitoring_alert_policy.health_failure: alert when the uptime
  check fails from more than one checker region for > 5 min
  (ALIGN_NEXT_OLDER + REDUCE_COUNT_FALSE, COMPARISON_GT 1)

New opt-out: enable_monitoring = false (default true)
New opt-in: notification_channel_ids = [...] to wire alerts to email/Slack

Module API unchanged; existing customers pick up backups + monitoring
on next module upgrade. TF provider requirement unchanged.
---
 infra/modules/customer-instance/main.tf      | 98 ++++++++++++++++++++
 infra/modules/customer-instance/outputs.tf   | 10 ++
 infra/modules/customer-instance/variables.tf | 12 +++
 3 files changed, 120 insertions(+)

diff --git a/infra/modules/customer-instance/main.tf b/infra/modules/customer-instance/main.tf
index c32334e..b91a819 100644
--- a/infra/modules/customer-instance/main.tf
+++ b/infra/modules/customer-instance/main.tf
@@ -77,6 +77,33 @@ resource "google_compute_firewall" "web" {
   target_tags = ["agnes-${var.customer_name}"]
 }
 
+# --- Backup policy: daily snapshot with 30-day retention ---
+
+resource "google_compute_resource_policy" "daily_backup" {
+  name    = "agnes-${var.customer_name}-daily-backup"
+  project = var.gcp_project_id
+  region  = var.region
+
+  snapshot_schedule_policy {
+    schedule {
+      daily_schedule {
+        days_in_cycle = 1
+        start_time    = "02:00" # interpreted as UTC
+      }
+    }
+    retention_policy {
+      max_retention_days    = 30
+      on_source_disk_delete = "KEEP_AUTO_SNAPSHOTS" # snapshots outlive the disk
+    }
+    snapshot_properties {
+      labels = {
+        app      = "agnes"
+        customer = var.customer_name
+      }
+    }
+  }
+}
+
 # --- Persistent data disks + VMs (prod + dev) ---
 
 resource "google_compute_disk" "data" {
@@ -89,6 +116,17 @@ resource "google_compute_disk" "data" {
   type = "pd-ssd"
 }
 
+# Attach daily backup policy to data disks (boot disks are ephemeral,
+# app code lives in the image so no need to snapshot them)
+resource "google_compute_disk_resource_policy_attachment" "data_backup" {
+  for_each = { for inst in local.all_instances : inst.name => inst }
+
+  project = var.gcp_project_id
+  zone    = var.zone
+  disk    = google_compute_disk.data[each.key].name
+  name    = google_compute_resource_policy.daily_backup.name
+}
+
 resource "google_compute_address" "ip" {
   for_each = { for inst in local.all_instances : inst.name => inst }
 
@@ -161,3 +199,63 @@ resource "google_compute_instance" "vm" {
     ignore_changes = [metadata_startup_script]
   }
 }
+
+# --- Monitoring: uptime check on each VM's /api/health endpoint ---
+
+resource "google_monitoring_uptime_check_config" "health" {
+  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}
+
+  project      = var.gcp_project_id
+  display_name = "agnes-${var.customer_name}-${each.value.name}-health"
+  timeout      = "10s"
+  period       = "60s"
+
+  http_check {
+    path         = "/api/health"
+    port         = 8000 # numeric attribute; was the string "8000"
+    use_ssl      = false
+    validate_ssl = false
+  }
+
+  monitored_resource {
+    type = "uptime_url"
+    labels = {
+      project_id = var.gcp_project_id
+      host       = google_compute_address.ip[each.key].address
+    }
+  }
+}
+
+# --- Monitoring: alert when health fails for > 5 min ---
+
+resource "google_monitoring_alert_policy" "health_failure" {
+  for_each = var.enable_monitoring ? { for inst in local.all_instances : inst.name => inst } : {}
+
+  project      = var.gcp_project_id
+  display_name = "agnes-${var.customer_name}-${each.value.name}-health-failure"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Uptime check failed > 5 min"
+    condition_threshold {
+      filter          = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
+      duration        = "300s"
+      comparison      = "COMPARISON_GT" # fire when MORE than threshold_value checker regions fail (was LT, which is true while healthy)
+      threshold_value = 1
+
+      aggregations {
+        alignment_period     = "60s"
+        per_series_aligner   = "ALIGN_NEXT_OLDER"   # keep the boolean check_passed value; REDUCE_COUNT_FALSE expects BOOL input
+        cross_series_reducer = "REDUCE_COUNT_FALSE" # number of checker regions whose latest probe failed
+        group_by_fields      = ["resource.label.host"]
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = var.notification_channel_ids
+  enabled               = true
+}
diff --git a/infra/modules/customer-instance/outputs.tf b/infra/modules/customer-instance/outputs.tf
index 5fe605c..cb8e353 100644
--- a/infra/modules/customer-instance/outputs.tf
+++ b/infra/modules/customer-instance/outputs.tf
@@ -17,3 +17,13 @@ output "jwt_secret_name" {
   description = "Plný název JWT secretu v Secret Manageru"
   value = google_secret_manager_secret.jwt.name
 }
+
+output "backup_policy_id" {
+  description = "ID daily backup resource policy attached to data disks"
+  value       = google_compute_resource_policy.daily_backup.id
+}
+
+output "uptime_check_ids" {
+  description = "Map of instance name → uptime check ID (empty when enable_monitoring = false)"
+  value       = { for k, v in google_monitoring_uptime_check_config.health : k => v.uptime_check_id }
+}
diff --git a/infra/modules/customer-instance/variables.tf b/infra/modules/customer-instance/variables.tf
index cc94ee0..8ce886e 100644
--- a/infra/modules/customer-instance/variables.tf
+++ b/infra/modules/customer-instance/variables.tf
@@ -70,3 +70,15 @@ variable "image_repo" {
   type = string
   default = "ghcr.io/keboola/agnes-the-ai-analyst"
 }
+
+variable "enable_monitoring" {
+  description = "Create uptime checks + alert policies for each VM. Requires notification_channel_ids to be useful."
+  type        = bool
+  default     = true
+}
+
+variable "notification_channel_ids" {
+  description = "Full resource IDs of GCP Monitoring notification channels (create in customer project via gcloud alpha monitoring channels create). Empty list = alerts fire but nothing is notified."
+  type        = list(string)
+  default     = []
+}