agnes-the-ai-analyst/services/scheduler/__main__.py
Vojtech 38f6b639d2
feat(observability): request_id end-to-end + dev debug toolbar + centralized logging (#136)
Cuts release 0.20.0.

## Highlights
- X-Request-ID header on every response + sanitized to [A-Za-z0-9_-] (CRLF log-forging mitigation)
- Error pages (HTML + JSON 500) surface request_id for support tickets
- Dev debug toolbar gated by DEBUG=1 — fastapi-debug-toolbar with custom DuckDBPanel
- Centralized app.logging_config.setup_logging() replaces 23 scattered basicConfig calls
- Telegram bot drops bot.log file — stdout only (BREAKING)

## Devin findings addressed
- BUG_0001: .env.template no longer claims FastAPI debug=True
- BUG_0002: subprocess extractor logs INFO to stderr again
- ANALYSIS_0003: _wants_html no longer matches Accept: */* (curl gets JSON as before)
- BUG on b1c6ee9: HTML 500 page no longer leaks str(exc) in production
- BUG on b13d2fe: 2 CLAUDE.md compliance flags (transform.py + ws_gateway) accepted as scope-limited logging refactor — follow-up to update CLAUDE.md if needed

See CHANGELOG [0.20.0] for full notes.
2026-04-29 22:54:21 +02:00

207 lines
7.5 KiB
Python

"""Scheduler service — replaces systemd timers.
Lightweight sidecar that fires scheduled jobs over HTTP against the main
app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic
admin — see ``app.auth.scheduler_token``); falls back to no-auth in
LOCAL_DEV_MODE.
Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts
"every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00".
Why every job is HTTP and nothing runs in-process: the scheduler container
shares ``/data/state/system.duckdb`` with the app container, but DuckDB
permits only one writer per file across processes. An in-process call
from the scheduler raced the app's long-lived handle and 500-ed on
``Could not set lock on file``. Going through HTTP makes the app the sole
writer; the scheduler is reduced to a pure cron clock.
Usage: python -m services.scheduler
"""
import logging
import os
import signal
import time
from datetime import datetime, timezone
import httpx
from app.logging_config import setup_logging
from src.scheduler import is_table_due
# Configure logging through the shared app.logging_config helper before the
# module-level logger is created.
setup_logging(__name__)
logger = logging.getLogger(__name__)
# Base URL of the main app that job requests are sent to; falls back to a
# local instance on port 8000 when API_URL is not set.
API_URL = os.environ.get("API_URL", "http://localhost:8000")
# Shared-secret bearer token read once at import time. The empty-string
# default means requests are sent without an Authorization header.
SCHEDULER_API_TOKEN = os.environ.get("SCHEDULER_API_TOKEN", "")
# Flipped to True after the "token is not set" warning has been logged, so the
# warning is emitted at most once per process.
_token_warning_emitted = False
def _get_auth_token() -> str:
    """Resolve the bearer token to attach to scheduler API calls.

    When ``SCHEDULER_API_TOKEN`` is configured (production) its value is
    returned unchanged. It is the shared secret that the Terraform startup
    script writes to ``/opt/agnes/.env``; the ``app`` and ``scheduler``
    containers both load that file through Docker Compose ``env_file:``.
    The app compares incoming Bearer tokens against the same variable
    (constant-time compare in ``app.auth.scheduler_token``) and maps a match
    to the synthetic ``scheduler@system.local`` user in the Admin group.

    When the variable is unset (dev / LOCAL_DEV_MODE) the empty string is
    returned, so callers send no ``Authorization`` header and the API's
    dev-bypass authenticates the request as the dev user. A warning is
    logged the first time this happens and suppressed afterwards.

    Returns:
        The configured token, or ``""`` when none is configured.
    """
    global _token_warning_emitted

    if not SCHEDULER_API_TOKEN:
        # Warn only once; this function is called for every outgoing request.
        if not _token_warning_emitted:
            logger.warning(
                "SCHEDULER_API_TOKEN is not set — calling the API without "
                "Authorization. Required in production; in LOCAL_DEV_MODE "
                "the dev-bypass auto-authenticates and this is fine."
            )
            _token_warning_emitted = True
        return ""

    return SCHEDULER_API_TOKEN
# --- Env parsing ------------------------------------------------------------
# Fallback values (in seconds) used when the corresponding env var is unset.
_DEFAULTS = {
    "SCHEDULER_DATA_REFRESH_INTERVAL": 15 * 60,  # seconds
    "SCHEDULER_HEALTH_CHECK_INTERVAL": 5 * 60,
    "SCHEDULER_SCRIPT_RUN_INTERVAL": 1 * 60,
    "SCHEDULER_TICK_SECONDS": 30,
}


def _read_positive_int(name: str) -> int:
    """Read an env var as a positive integer or fall back to the default.

    Treats unset env (``None``) as "use default". Treats explicitly empty
    string (``""``) as an operator typo and raises — silently defaulting
    on a literal ``FOO=`` in the env_file would mask configuration bugs.

    Args:
        name: Environment variable name; must be a key of ``_DEFAULTS``
            whenever the variable is unset.

    Returns:
        The parsed value, or the ``_DEFAULTS`` entry when the variable is
        unset.

    Raises:
        ValueError: If the variable is unset and has no default, is set to
            the empty string, is not parseable as an integer, or is <= 0.
    """
    raw = os.environ.get(name)
    if raw is None:
        if name not in _DEFAULTS:
            raise ValueError(f"Unknown scheduler env var: {name}")
        return _DEFAULTS[name]
    if raw == "":
        raise ValueError(f"{name}='' must be a positive integer (seconds)")
    try:
        value = int(raw)
    except (TypeError, ValueError) as exc:
        # Chain explicitly so the traceback reads "The above exception was the
        # direct cause of ..." instead of looking like a failure inside the
        # error handler itself.
        raise ValueError(
            f"{name}={raw!r} must be a positive integer (seconds)"
        ) from exc
    if value <= 0:
        raise ValueError(f"{name}={value} must be > 0 (seconds)")
    return value
def _seconds_to_schedule(seconds: int) -> str:
    """Render an interval in seconds as an 'every Nh' / 'every Nm' schedule.

    Whole multiples of an hour (at least one hour) are expressed in hours.
    Everything else is expressed in minutes, rounded UP, so the resulting
    schedule never fires more often than the configured interval (90s
    becomes 'every 2m', not 'every 1m'). Values below one minute clamp to
    'every 1m' because the schedule grammar has minute-level resolution.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    if leftover == 0 and whole_hours >= 1:
        return f"every {whole_hours}h"

    # Integer ceiling of seconds / 60: add (divisor - 1), then floor-divide.
    minutes = (seconds + 59) // 60
    if minutes < 1:
        minutes = 1
    return f"every {minutes}m"
def resolved_tick_seconds() -> int:
    """Return the validated ``SCHEDULER_TICK_SECONDS`` value on its own.

    Test helper: reads and validates only the tick setting, without building
    the job list.
    """
    tick_seconds = _read_positive_int("SCHEDULER_TICK_SECONDS")
    return tick_seconds
def build_jobs() -> list[tuple[str, str, str, str, int]]:
    """Assemble the job table from environment settings.

    Each entry is ``(name, schedule_string, endpoint, method,
    http_timeout_sec)``. The three interval-driven jobs take their cadence
    from env vars (with defaults); the marketplaces job stays hardcoded —
    promoting it to env is out of #77 scope.

    Raises:
        ValueError: If any interval is invalid, or if the tick is longer than
            the shortest job interval.
    """
    # Read in a fixed order so the first invalid variable is the one reported.
    interval_by_env = {
        env_name: _read_positive_int(env_name)
        for env_name in (
            "SCHEDULER_DATA_REFRESH_INTERVAL",
            "SCHEDULER_HEALTH_CHECK_INTERVAL",
            "SCHEDULER_SCRIPT_RUN_INTERVAL",
        )
    }
    tick = _read_positive_int("SCHEDULER_TICK_SECONDS")

    smallest = min(interval_by_env.values())
    if tick > smallest:
        raise ValueError(
            f"SCHEDULER_TICK_SECONDS={tick} must be <= the smallest job "
            f"interval ({smallest}s) so jobs don't consistently miss their "
            f"cadence by up to one tick"
        )

    # (job name, env var holding its interval, endpoint, HTTP method, timeout)
    env_driven = (
        ("data-refresh", "SCHEDULER_DATA_REFRESH_INTERVAL", "/api/sync/trigger", "POST", 120),
        ("health-check", "SCHEDULER_HEALTH_CHECK_INTERVAL", "/api/health", "GET", 30),
        ("script-runner", "SCHEDULER_SCRIPT_RUN_INTERVAL", "/api/scripts/run-due", "POST", 600),
    )
    jobs = [
        (job_name, _seconds_to_schedule(interval_by_env[env_name]), path, verb, timeout)
        for job_name, env_name, path, verb, timeout in env_driven
    ]
    jobs.append(("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900))
    return jobs
# Main-loop flag: stays True while the scheduler should keep ticking and is
# cleared by the signal handler to request a stop.
_running = True


def _signal_handler(sig, frame):
    """Log the received signal and clear the loop flag.

    Args:
        sig: Number of the signal that was delivered.
        frame: Interrupted stack frame; unused.
    """
    global _running
    logger.info(f"Received signal {sig}, shutting down...")
    _running = False
def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool:
    """Send one job request to the main app and report whether it succeeded.

    Args:
        endpoint: Path appended to ``API_URL``.
        method: ``"POST"`` issues a POST; any other value issues a GET.
        timeout_sec: Timeout passed to the HTTP call.

    Returns:
        True when the response status is below 400; False on an HTTP error
        status or when the call raises. Failures are logged, never raised.
    """
    url = f"{API_URL}{endpoint}"
    token = _get_auth_token()
    # No Authorization header at all when there is no token.
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        send = httpx.post if method == "POST" else httpx.get
        resp = send(url, headers=headers, timeout=timeout_sec)
        if resp.status_code >= 400:
            # Only the first 200 characters of the body go into the log line.
            logger.warning(f"Job {endpoint}: HTTP {resp.status_code} - {resp.text[:200]}")
            return False
        logger.info(f"Job {endpoint}: {resp.status_code}")
        return True
    except Exception as e:
        # Best-effort boundary: a failing job must not take down the loop.
        logger.error(f"Job {endpoint} failed: {e}")
        return False
def run():
    """Run the scheduler loop until SIGTERM or SIGINT is received.

    Installs the signal handlers, builds the job table, then on every tick
    fires each job whose schedule is due according to ``is_table_due``. A
    job's last-run timestamp is only advanced when its API call succeeds, so
    a failed job is attempted again on the next tick.

    Shutdown is kept responsive: the pause between ticks is taken in
    one-second slices and the stop flag is re-checked before each job, so a
    signal is acted on within about a second (plus any in-flight HTTP call)
    rather than after a full tick.

    Raises:
        ValueError: Propagated from ``build_jobs`` / ``resolved_tick_seconds``
            when the scheduler env configuration is invalid.
    """
    signal.signal(signal.SIGTERM, _signal_handler)
    signal.signal(signal.SIGINT, _signal_handler)

    jobs = build_jobs()
    tick = resolved_tick_seconds()
    logger.info(
        "Scheduler started. API_URL=%s, %d jobs, tick=%ds. Schedules: %s",
        API_URL, len(jobs), tick,
        {name: schedule for name, schedule, *_ in jobs},
    )

    # None means "never run in this process".
    last_run: dict[str, str | None] = {name: None for name, *_ in jobs}

    while _running:
        # One timestamp per tick, taken before any job starts.
        now_iso = datetime.now(timezone.utc).isoformat()
        for name, schedule, endpoint, method, timeout_sec in jobs:
            if not _running:
                # A stop was requested while an earlier job was in flight;
                # don't start further (potentially long) requests.
                break
            if not is_table_due(schedule, last_run[name]):
                continue
            logger.info("Running job: %s (%s)", name, schedule)
            if _call_api(endpoint, method, timeout_sec):
                last_run[name] = now_iso

        # time.sleep() resumes after a signal handler returns, so a single
        # sleep(tick) would delay shutdown by up to a whole tick. Sleeping in
        # 1-second slices lets the loop notice the cleared flag promptly.
        for _ in range(tick):
            if not _running:
                break
            time.sleep(1)

    logger.info("Scheduler stopped.")
# Entry point for ``python -m services.scheduler``.
if __name__ == "__main__":
    run()