Open-source AI data analyst platform extracted from internal repo. Includes data sync engine, Keboola adapter, Flask web portal, server deployment scripts, and configuration templates.
164 lines
4.7 KiB
Python
164 lines
4.7 KiB
Python
"""
|
|
Health check service for monitoring.
|
|
|
|
Returns detailed system status including:
|
|
- Systemd services (webapp, telegram-bot, timers)
|
|
- Disk space
|
|
- System load
|
|
- Last Jira webhook timestamp
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Services to monitor
|
|
CRITICAL_SERVICES = [
|
|
"webapp.service",
|
|
"notify-bot.service",
|
|
]
|
|
|
|
TIMERS_TO_MONITOR = [
|
|
"jira-consistency.timer",
|
|
"corporate-memory.timer",
|
|
"jira-sla-poll.timer",
|
|
]
|
|
|
|
|
|
def get_service_status(service_name: str) -> dict:
|
|
"""Get systemd service status."""
|
|
try:
|
|
result = subprocess.run(
|
|
["/usr/bin/systemctl", "is-active", service_name],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5,
|
|
)
|
|
return {
|
|
"name": service_name,
|
|
"status": result.stdout.strip(),
|
|
"healthy": result.returncode == 0,
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Failed to check {service_name}: {e}")
|
|
return {
|
|
"name": service_name,
|
|
"status": "unknown",
|
|
"healthy": False,
|
|
"error": str(e),
|
|
}
|
|
|
|
|
|
def get_disk_usage() -> list:
|
|
"""Get disk usage for all important partitions."""
|
|
partitions = ["/", "/data", "/home", "/tmp"]
|
|
results = []
|
|
|
|
for partition in partitions:
|
|
try:
|
|
stat = os.statvfs(partition)
|
|
total = stat.f_blocks * stat.f_frsize
|
|
free = stat.f_bavail * stat.f_frsize
|
|
used = total - free
|
|
percent = (used / total) * 100 if total > 0 else 0
|
|
|
|
results.append({
|
|
"partition": partition,
|
|
"used_percent": round(percent, 1),
|
|
"free_gb": round(free / (1024**3), 2),
|
|
"total_gb": round(total / (1024**3), 2),
|
|
"healthy": percent < 90,
|
|
})
|
|
except Exception as e:
|
|
logger.warning(f"Failed to get disk usage for {partition}: {e}")
|
|
results.append({
|
|
"partition": partition,
|
|
"healthy": False,
|
|
"error": str(e)
|
|
})
|
|
|
|
return results
|
|
|
|
|
|
def get_load_average() -> dict:
|
|
"""Get system load average."""
|
|
try:
|
|
load1, load5, load15 = os.getloadavg()
|
|
# e2-medium has 2 CPUs, so load > 4 is concerning
|
|
return {
|
|
"load_1min": round(load1, 2),
|
|
"load_5min": round(load5, 2),
|
|
"load_15min": round(load15, 2),
|
|
"healthy": load1 < 4 and load5 < 4,
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Failed to get load average: {e}")
|
|
return {"healthy": False, "error": str(e)}
|
|
|
|
|
|
def get_last_jira_webhook() -> dict:
|
|
"""Get timestamp of last Jira webhook received."""
|
|
try:
|
|
# Check recent raw Jira files
|
|
jira_dir = Path("/data/src_data/raw/jira/issues")
|
|
if not jira_dir.exists():
|
|
return {"healthy": True, "message": "Jira directory not found"}
|
|
|
|
# Get most recently modified file
|
|
files = list(jira_dir.glob("*.json"))
|
|
if not files:
|
|
return {"healthy": True, "message": "No Jira files yet"}
|
|
|
|
latest_file = max(files, key=lambda p: p.stat().st_mtime)
|
|
mtime = latest_file.stat().st_mtime
|
|
age_seconds = datetime.now().timestamp() - mtime
|
|
age_hours = age_seconds / 3600
|
|
|
|
return {
|
|
"last_webhook_file": latest_file.name,
|
|
"last_webhook_hours_ago": round(age_hours, 1),
|
|
"healthy": age_hours < 48, # Alert if no webhook in 48h
|
|
}
|
|
except Exception as e:
|
|
logger.warning(f"Failed to check Jira webhooks: {e}")
|
|
return {"healthy": True, "error": str(e)} # Non-critical
|
|
|
|
|
|
def health_check() -> tuple[dict, int]:
|
|
"""
|
|
Perform comprehensive health check.
|
|
|
|
Returns:
|
|
(response_dict, http_status_code)
|
|
"""
|
|
services = [get_service_status(s) for s in CRITICAL_SERVICES]
|
|
timers = [get_service_status(t) for t in TIMERS_TO_MONITOR]
|
|
disk = get_disk_usage()
|
|
load = get_load_average()
|
|
jira = get_last_jira_webhook()
|
|
|
|
# Overall health: all critical checks must pass
|
|
all_healthy = (
|
|
all(s["healthy"] for s in services)
|
|
and all(d["healthy"] for d in disk)
|
|
and load["healthy"]
|
|
)
|
|
|
|
response = {
|
|
"status": "healthy" if all_healthy else "degraded",
|
|
"timestamp": datetime.utcnow().isoformat() + "Z",
|
|
"services": services,
|
|
"timers": timers,
|
|
"disk": disk,
|
|
"load": load,
|
|
"jira_webhook": jira,
|
|
}
|
|
|
|
# Return 200 if healthy, 503 if degraded
|
|
status_code = 200 if all_healthy else 503
|
|
|
|
return response, status_code
|