fix(infra): alert policy aggregation — drop cross_series_reducer

GCP rejected the policy with 'REDUCE_COUNT_FALSE cannot be applied to
metrics with value type DOUBLE'. ALIGN_FRACTION_TRUE already converts each
boolean series into a fraction 0..1 (a DOUBLE), so a boolean-only reducer
such as REDUCE_COUNT_FALSE cannot follow it, and no additional cross-series
reducer is needed. Simplified: alert when the per-series fraction stays < 1
for 5 min.

Review M4 predicted this — uptime check filters needed double-checking
against live GCP.
This commit is contained in:
ZdenekSrotyr 2026-04-21 20:36:09 +02:00
parent 4ab0838ba2
commit 9a99a82e92

View file

@ -289,16 +289,16 @@ resource "google_monitoring_alert_policy" "health_failure" {
conditions {
display_name = "Uptime check failed > 5 min"
condition_threshold {
filter = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
filter = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id=\"${google_monitoring_uptime_check_config.health[each.key].uptime_check_id}\" AND resource.type=\"uptime_url\""
duration = "300s"
# ALIGN_FRACTION_TRUE yields fraction of checks that returned true.
# If the fraction stays < 1 (i.e. any probe failed) for 5 min, alert.
comparison = "COMPARISON_LT"
threshold_value = 1
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_FRACTION_TRUE"
cross_series_reducer = "REDUCE_COUNT_FALSE"
group_by_fields = ["resource.label.host"]
alignment_period = "60s"
per_series_aligner = "ALIGN_FRACTION_TRUE"
}
trigger {