agnes-the-ai-analyst/webapp/health_service.py
Petr 86edd27655 Extract Jira into connectors/jira module
Move all Jira-specific code into a self-contained connector module:
- 22 files moved via git mv (transform, service, webhook, scripts,
  systemd units, tests, docs, bin helper)
- All imports updated to use connectors.jira.* paths
- Jira is now conditional: auto-detected via JIRA_DOMAIN env var
- Webapp registers Jira blueprint only when available
- Health service monitors Jira timers only when enabled
- Profiler loads Jira tables dynamically from filesystem
- Sync settings uses config-driven dependency validation
- Renamed keboola_platform_url -> custom_url in transform
- Updated deploy.sh, sudoers-deploy, backfill_gap.sh paths
- Fixed pytest.ini to skip live tests by default
2026-03-09 11:17:50 +01:00

175 lines
5 KiB
Python

"""
Health check service for monitoring.
Returns detailed system status including:
- Systemd services (webapp, telegram-bot, timers)
- Disk space
- System load
- Optional: Jira webhook timestamp (if Jira connector enabled)
"""
import logging
import os
import subprocess
from datetime import datetime
from pathlib import Path
from .config import Config
logger = logging.getLogger(__name__)
# Services to monitor
CRITICAL_SERVICES = [
"webapp.service",
"notify-bot.service",
]
# Base timers (always monitored)
_BASE_TIMERS = [
"corporate-memory.timer",
]
# Jira timers (only if Jira connector is enabled)
_JIRA_TIMERS = [
"jira-consistency.timer",
"jira-sla-poll.timer",
]
TIMERS_TO_MONITOR = _BASE_TIMERS + (_JIRA_TIMERS if Config.JIRA_ENABLED else [])
def get_service_status(service_name: str) -> dict:
"""Get systemd service status."""
try:
result = subprocess.run(
["/usr/bin/systemctl", "is-active", service_name],
capture_output=True,
text=True,
timeout=5,
)
return {
"name": service_name,
"status": result.stdout.strip(),
"healthy": result.returncode == 0,
}
except Exception as e:
logger.warning(f"Failed to check {service_name}: {e}")
return {
"name": service_name,
"status": "unknown",
"healthy": False,
"error": str(e),
}
def get_disk_usage() -> list:
"""Get disk usage for all important partitions."""
partitions = ["/", "/data", "/home", "/tmp"]
results = []
for partition in partitions:
try:
stat = os.statvfs(partition)
total = stat.f_blocks * stat.f_frsize
free = stat.f_bavail * stat.f_frsize
used = total - free
percent = (used / total) * 100 if total > 0 else 0
results.append({
"partition": partition,
"used_percent": round(percent, 1),
"free_gb": round(free / (1024**3), 2),
"total_gb": round(total / (1024**3), 2),
"healthy": percent < 90,
})
except Exception as e:
logger.warning(f"Failed to get disk usage for {partition}: {e}")
results.append({
"partition": partition,
"healthy": False,
"error": str(e)
})
return results
def get_load_average() -> dict:
"""Get system load average."""
try:
load1, load5, load15 = os.getloadavg()
# e2-medium has 2 CPUs, so load > 4 is concerning
return {
"load_1min": round(load1, 2),
"load_5min": round(load5, 2),
"load_15min": round(load15, 2),
"healthy": load1 < 4 and load5 < 4,
}
except Exception as e:
logger.warning(f"Failed to get load average: {e}")
return {"healthy": False, "error": str(e)}
def get_last_jira_webhook() -> dict:
"""Get timestamp of last Jira webhook received."""
try:
# Check recent raw Jira files
jira_dir = Path("/data/src_data/raw/jira/issues")
if not jira_dir.exists():
return {"healthy": True, "message": "Jira directory not found"}
# Get most recently modified file
files = list(jira_dir.glob("*.json"))
if not files:
return {"healthy": True, "message": "No Jira files yet"}
latest_file = max(files, key=lambda p: p.stat().st_mtime)
mtime = latest_file.stat().st_mtime
age_seconds = datetime.now().timestamp() - mtime
age_hours = age_seconds / 3600
return {
"last_webhook_file": latest_file.name,
"last_webhook_hours_ago": round(age_hours, 1),
"healthy": age_hours < 48, # Alert if no webhook in 48h
}
except Exception as e:
logger.warning(f"Failed to check Jira webhooks: {e}")
return {"healthy": True, "error": str(e)} # Non-critical
def health_check() -> tuple[dict, int]:
"""
Perform comprehensive health check.
Returns:
(response_dict, http_status_code)
"""
services = [get_service_status(s) for s in CRITICAL_SERVICES]
timers = [get_service_status(t) for t in TIMERS_TO_MONITOR]
disk = get_disk_usage()
load = get_load_average()
# Overall health: all critical checks must pass
all_healthy = (
all(s["healthy"] for s in services)
and all(d["healthy"] for d in disk)
and load["healthy"]
)
response = {
"status": "healthy" if all_healthy else "degraded",
"timestamp": datetime.utcnow().isoformat() + "Z",
"services": services,
"timers": timers,
"disk": disk,
"load": load,
}
# Include Jira webhook status only if connector is enabled
if Config.JIRA_ENABLED:
response["jira_webhook"] = get_last_jira_webhook()
# Return 200 if healthy, 503 if degraded
status_code = 200 if all_healthy else 503
return response, status_code