agnes-the-ai-analyst/webapp/health_service.py
Petr c56905d34f Initial commit: OSS data distribution platform
Open-source AI data analyst platform extracted from internal repo.
Includes data sync engine, Keboola adapter, Flask web portal,
server deployment scripts, and configuration templates.
2026-03-08 23:31:28 +01:00

164 lines
4.7 KiB
Python

"""
Health check service for monitoring.
Returns detailed system status including:
- Systemd services (webapp, telegram-bot, timers)
- Disk space
- System load
- Last Jira webhook timestamp
"""
import logging
import os
import subprocess
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
# Services to monitor
CRITICAL_SERVICES = [
"webapp.service",
"notify-bot.service",
]
TIMERS_TO_MONITOR = [
"jira-consistency.timer",
"corporate-memory.timer",
"jira-sla-poll.timer",
]
def get_service_status(service_name: str) -> dict:
"""Get systemd service status."""
try:
result = subprocess.run(
["/usr/bin/systemctl", "is-active", service_name],
capture_output=True,
text=True,
timeout=5,
)
return {
"name": service_name,
"status": result.stdout.strip(),
"healthy": result.returncode == 0,
}
except Exception as e:
logger.warning(f"Failed to check {service_name}: {e}")
return {
"name": service_name,
"status": "unknown",
"healthy": False,
"error": str(e),
}
def get_disk_usage() -> list:
"""Get disk usage for all important partitions."""
partitions = ["/", "/data", "/home", "/tmp"]
results = []
for partition in partitions:
try:
stat = os.statvfs(partition)
total = stat.f_blocks * stat.f_frsize
free = stat.f_bavail * stat.f_frsize
used = total - free
percent = (used / total) * 100 if total > 0 else 0
results.append({
"partition": partition,
"used_percent": round(percent, 1),
"free_gb": round(free / (1024**3), 2),
"total_gb": round(total / (1024**3), 2),
"healthy": percent < 90,
})
except Exception as e:
logger.warning(f"Failed to get disk usage for {partition}: {e}")
results.append({
"partition": partition,
"healthy": False,
"error": str(e)
})
return results
def get_load_average() -> dict:
"""Get system load average."""
try:
load1, load5, load15 = os.getloadavg()
# e2-medium has 2 CPUs, so load > 4 is concerning
return {
"load_1min": round(load1, 2),
"load_5min": round(load5, 2),
"load_15min": round(load15, 2),
"healthy": load1 < 4 and load5 < 4,
}
except Exception as e:
logger.warning(f"Failed to get load average: {e}")
return {"healthy": False, "error": str(e)}
def get_last_jira_webhook() -> dict:
"""Get timestamp of last Jira webhook received."""
try:
# Check recent raw Jira files
jira_dir = Path("/data/src_data/raw/jira/issues")
if not jira_dir.exists():
return {"healthy": True, "message": "Jira directory not found"}
# Get most recently modified file
files = list(jira_dir.glob("*.json"))
if not files:
return {"healthy": True, "message": "No Jira files yet"}
latest_file = max(files, key=lambda p: p.stat().st_mtime)
mtime = latest_file.stat().st_mtime
age_seconds = datetime.now().timestamp() - mtime
age_hours = age_seconds / 3600
return {
"last_webhook_file": latest_file.name,
"last_webhook_hours_ago": round(age_hours, 1),
"healthy": age_hours < 48, # Alert if no webhook in 48h
}
except Exception as e:
logger.warning(f"Failed to check Jira webhooks: {e}")
return {"healthy": True, "error": str(e)} # Non-critical
def health_check() -> tuple[dict, int]:
"""
Perform comprehensive health check.
Returns:
(response_dict, http_status_code)
"""
services = [get_service_status(s) for s in CRITICAL_SERVICES]
timers = [get_service_status(t) for t in TIMERS_TO_MONITOR]
disk = get_disk_usage()
load = get_load_average()
jira = get_last_jira_webhook()
# Overall health: all critical checks must pass
all_healthy = (
all(s["healthy"] for s in services)
and all(d["healthy"] for d in disk)
and load["healthy"]
)
response = {
"status": "healthy" if all_healthy else "degraded",
"timestamp": datetime.utcnow().isoformat() + "Z",
"services": services,
"timers": timers,
"disk": disk,
"load": load,
"jira_webhook": jira,
}
# Return 200 if healthy, 503 if degraded
status_code = 200 if all_healthy else 503
return response, status_code