Cuts release 0.20.0. ## Highlights - X-Request-ID header on every response + sanitized to [A-Za-z0-9_-] (CRLF log-forging mitigation) - Error pages (HTML + JSON 500) surface request_id for support tickets - Dev debug toolbar gated by DEBUG=1 — fastapi-debug-toolbar with custom DuckDBPanel - Centralized app.logging_config.setup_logging() replaces 23 scattered basicConfig calls - Telegram bot drops bot.log file — stdout only (BREAKING) ## Devin findings addressed - BUG_0001: .env.template no longer claims FastAPI debug=True - BUG_0002: subprocess extractor logs INFO to stderr again - ANALYSIS_0003: _wants_html no longer matches Accept: */* (curl gets JSON as before) - BUG on b1c6ee9: HTML 500 page no longer leaks str(exc) in production - BUG on b13d2fe: 2 CLAUDE.md compliance flags (transform.py + ws_gateway) accepted as scope-limited logging refactor — follow-up to update CLAUDE.md if needed See CHANGELOG [0.20.0] for full notes.
207 lines
7.5 KiB
Python
207 lines
7.5 KiB
Python
"""Scheduler service — replaces systemd timers.
|
|
|
|
Lightweight sidecar that fires scheduled jobs over HTTP against the main
|
|
app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic
|
|
admin — see ``app.auth.scheduler_token``); falls back to no-auth in
|
|
LOCAL_DEV_MODE.
|
|
|
|
Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts
|
|
"every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00".
|
|
|
|
Why every job is HTTP and nothing runs in-process: the scheduler container
|
|
shares ``/data/state/system.duckdb`` with the app container, but DuckDB
|
|
permits only one writer per file across processes. An in-process call
|
|
from the scheduler raced the app's long-lived handle and 500-ed on
|
|
``Could not set lock on file``. Going through HTTP makes the app the sole
|
|
writer; the scheduler is reduced to a pure cron clock.
|
|
|
|
Usage: python -m services.scheduler
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import signal
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
|
|
from app.logging_config import setup_logging
|
|
from src.scheduler import is_table_due
|
|
|
|
# Centralized logging setup (app.logging_config) must run before the first
# log record is emitted by this module.
setup_logging(__name__)
logger = logging.getLogger(__name__)

# Base URL of the main app; every job endpoint is appended to it.
API_URL = os.getenv("API_URL", "http://localhost:8000")
# Shared-secret bearer token; an empty string means "send no Authorization".
SCHEDULER_API_TOKEN = os.getenv("SCHEDULER_API_TOKEN", "")

# Flipped to True after the missing-token warning has been logged once, so
# the warning is not repeated on every API call.
_token_warning_emitted = False
|
|
|
|
|
|
def _get_auth_token() -> str:
    """Return the bearer token to send with API calls, or ``""`` for none.

    Production: ``SCHEDULER_API_TOKEN`` is a shared secret generated by the
    Terraform startup script and written to ``/opt/agnes/.env``. The ``app``
    and ``scheduler`` containers both load that file through Docker Compose
    ``env_file:``, so each side holds the same value. The app compares
    incoming Bearer tokens against it (constant-time compare in
    ``app.auth.scheduler_token``) and maps a match to the synthetic
    ``scheduler@system.local`` user, a member of the Admin group.

    Dev / LOCAL_DEV_MODE: leave the variable unset. This function then
    returns the empty string, the caller omits the ``Authorization``
    header, and the API's dev-bypass authenticates the request as the dev
    user.

    A warning about the missing token is logged the first time only.
    """
    global _token_warning_emitted

    if not SCHEDULER_API_TOKEN:
        # Warn once per process rather than once per job invocation.
        if not _token_warning_emitted:
            logger.warning(
                "SCHEDULER_API_TOKEN is not set — calling the API without "
                "Authorization. Required in production; in LOCAL_DEV_MODE "
                "the dev-bypass auto-authenticates and this is fine."
            )
            _token_warning_emitted = True
        return ""

    return SCHEDULER_API_TOKEN
|
|
|
|
|
|
# --- Env parsing ------------------------------------------------------------
|
|
|
|
# Fallback values, all in seconds, used when the matching env var is unset.
_DEFAULTS: dict[str, int] = {
    "SCHEDULER_DATA_REFRESH_INTERVAL": 900,  # 15 minutes
    "SCHEDULER_HEALTH_CHECK_INTERVAL": 300,  # 5 minutes
    "SCHEDULER_SCRIPT_RUN_INTERVAL": 60,  # 1 minute
    "SCHEDULER_TICK_SECONDS": 30,
}
|
|
|
|
|
|
def _read_positive_int(name: str) -> int:
    """Read an env var as a positive integer or fall back to the default.

    An unset variable (``None``) means "use the default" from ``_DEFAULTS``.
    An explicitly empty string (``""``) is treated as an operator typo and
    rejected: silently defaulting on a literal ``FOO=`` in the env_file
    would mask configuration bugs.

    Args:
        name: Environment variable name to read.

    Returns:
        The parsed value, guaranteed to be > 0.

    Raises:
        ValueError: If the variable is unset and has no entry in
            ``_DEFAULTS``, is empty, is not an integer, or is <= 0.
    """
    raw = os.environ.get(name)
    if raw is None:
        if name not in _DEFAULTS:
            raise ValueError(f"Unknown scheduler env var: {name}")
        return _DEFAULTS[name]
    if raw == "":
        raise ValueError(f"{name}='' must be a positive integer (seconds)")
    try:
        value = int(raw)
    except ValueError:
        # The message already carries the offending value; suppress the
        # int() parsing error so the traceback shows a single, clear cause.
        raise ValueError(
            f"{name}={raw!r} must be a positive integer (seconds)"
        ) from None
    if value <= 0:
        raise ValueError(f"{name}={value} must be > 0 (seconds)")
    return value
|
|
|
|
|
|
def _seconds_to_schedule(seconds: int) -> str:
    """Render a seconds interval as an 'every Nh' or 'every Nm' schedule.

    Exact multiples of an hour (3600s and up) become 'every Nh'. Anything
    else is expressed in minutes, rounded UP so the resulting schedule never
    fires more often than the operator asked for (90s -> 'every 2m', not
    'every 1m'). Inputs below one minute clamp to 'every 1m' because the
    schedule grammar has minute-level resolution.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    if leftover == 0 and whole_hours >= 1:
        return f"every {whole_hours}h"

    # Ceiling division spelled out: bump the quotient when there is a
    # remainder, then enforce the one-minute floor.
    whole_minutes, partial = divmod(seconds, 60)
    if partial:
        whole_minutes += 1
    return f"every {max(1, whole_minutes)}m"
|
|
|
|
|
|
def resolved_tick_seconds() -> int:
    """Return the validated SCHEDULER_TICK_SECONDS value on its own.

    Reads only the tick variable, without the cross-checks against job
    intervals that ``build_jobs`` performs. Called by ``run`` and usable as
    a test helper.
    """
    tick_seconds = _read_positive_int("SCHEDULER_TICK_SECONDS")
    return tick_seconds
|
|
|
|
|
|
def build_jobs() -> list[tuple[str, str, str, str, int]]:
    """Assemble the job table from the environment, with validation.

    Each entry is ``(name, schedule_string, endpoint, method,
    http_timeout_sec)``. The three interval-driven jobs take their cadence
    from env vars (falling back to defaults); the marketplaces job stays
    hardcoded — promoting it to env is out of #77 scope.

    Raises:
        ValueError: If any interval is invalid, or if the tick is longer
            than the shortest job interval.
    """
    refresh_every = _read_positive_int("SCHEDULER_DATA_REFRESH_INTERVAL")
    health_every = _read_positive_int("SCHEDULER_HEALTH_CHECK_INTERVAL")
    scripts_every = _read_positive_int("SCHEDULER_SCRIPT_RUN_INTERVAL")
    tick = _read_positive_int("SCHEDULER_TICK_SECONDS")

    # The loop only looks at jobs once per tick, so a tick longer than the
    # shortest interval would make that job run late every time.
    smallest = min(refresh_every, health_every, scripts_every)
    if tick > smallest:
        raise ValueError(
            f"SCHEDULER_TICK_SECONDS={tick} must be <= the smallest job "
            f"interval ({smallest}s) so jobs don't consistently miss their "
            f"cadence by up to one tick"
        )

    interval_jobs = (
        ("data-refresh", refresh_every, "/api/sync/trigger", "POST", 120),
        ("health-check", health_every, "/api/health", "GET", 30),
        ("script-runner", scripts_every, "/api/scripts/run-due", "POST", 600),
    )
    jobs = [
        (job_name, _seconds_to_schedule(every), path, verb, timeout)
        for job_name, every, path, verb, timeout in interval_jobs
    ]
    jobs.append(
        ("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900)
    )
    return jobs
|
|
|
|
# Main-loop run flag. The signal handler below flips it to False; ``run``
# reads it to decide when to stop.
_running = True


def _signal_handler(sig, frame):
    """Request a graceful shutdown of the scheduler loop.

    Installed by ``run`` for SIGTERM and SIGINT. It only logs and clears the
    module-level ``_running`` flag; the loop exits on its next check.

    Args:
        sig: Signal number delivered to the process.
        frame: Current stack frame (unused; required by the handler
            signature).
    """
    global _running
    # Lazy %-style args, matching the logging calls in run().
    logger.info("Received signal %s, shutting down...", sig)
    _running = False
|
|
|
|
|
|
def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool:
    """Call one endpoint on the main app and report whether it succeeded.

    Args:
        endpoint: Path appended to ``API_URL`` (e.g. ``/api/health``).
        method: ``"POST"`` issues a POST; any other value issues a GET.
        timeout_sec: HTTP timeout passed to httpx, in seconds.

    Returns:
        True when a response arrives with status < 400. False on an HTTP
        error status or on any exception (connection failure, timeout, ...).
        Failures are logged, never raised, so one bad job cannot take the
        scheduler loop down.
    """
    # Tolerate a trailing slash on API_URL ("http://app:8000/"); without the
    # strip the request would go to "//api/..." and miss the route.
    url = f"{API_URL.rstrip('/')}{endpoint}"

    headers = {}
    token = _get_auth_token()
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        if method == "POST":
            resp = httpx.post(url, headers=headers, timeout=timeout_sec)
        else:
            resp = httpx.get(url, headers=headers, timeout=timeout_sec)

        if resp.status_code < 400:
            logger.info("Job %s: %s", endpoint, resp.status_code)
            return True

        # Cap the echoed body so an HTML error page can't flood the log.
        logger.warning(
            "Job %s: HTTP %s - %s", endpoint, resp.status_code, resp.text[:200]
        )
        return False
    except Exception as exc:
        # Deliberately broad: this is the scheduler's best-effort boundary.
        logger.error("Job %s failed: %s", endpoint, exc)
        return False
|
|
|
|
|
|
def run():
    """Run the scheduler loop until SIGTERM or SIGINT is received.

    Installs the signal handlers, builds the job table from the environment
    (a ``ValueError`` from invalid configuration propagates and aborts
    startup), then once per tick fires every job that ``is_table_due``
    reports as due. A job's last-run time is only advanced when the API
    call succeeded, so failed jobs are retried on the next tick.
    """
    signal.signal(signal.SIGTERM, _signal_handler)
    signal.signal(signal.SIGINT, _signal_handler)

    jobs = build_jobs()
    tick = resolved_tick_seconds()
    logger.info(
        "Scheduler started. API_URL=%s, %d jobs, tick=%ds. Schedules: %s",
        API_URL, len(jobs), tick,
        {name: schedule for name, schedule, *_ in jobs},
    )

    last_run: dict[str, str | None] = {name: None for name, *_ in jobs}

    while _running:
        for name, schedule, endpoint, method, timeout_sec in jobs:
            # Don't start new work once shutdown has been requested.
            if not _running:
                break
            if not is_table_due(schedule, last_run[name]):
                continue
            # Stamp each job with its own start time. Jobs run sequentially,
            # so a single per-tick timestamp would back-date a job that had
            # to wait behind a slow predecessor; a back-dated stamp that
            # falls before the job's scheduled time could make it look due
            # again on the next tick (NOTE(review): depends on
            # is_table_due's semantics — confirm).
            started_iso = datetime.now(timezone.utc).isoformat()
            logger.info("Running job: %s (%s)", name, schedule)
            if _call_api(endpoint, method, timeout_sec):
                last_run[name] = started_iso

        # Sleep in short slices: time.sleep() resumes after a signal handler
        # returns, so one long sleep would delay shutdown by up to a full
        # tick after SIGTERM.
        deadline = time.monotonic() + tick
        while _running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(1.0, remaining))

    logger.info("Scheduler stopped.")


if __name__ == "__main__":
    run()
|