Devin NOTABLE: SCHEDULER_VERIFICATION_DETECTOR_INTERVAL was already read by app/api/health.py to compute the staleness grace window, but the actual scheduler cadence was hardcoded to 'every 15m'. The env var name implied it controlled the cadence — it didn't. An operator throttling the detector via the env var was silently ignored by the scheduler while the health grace silently widened. Wired the env var into both ends.

Same pattern applied to the other two LLM-pipeline jobs:
- SCHEDULER_SESSION_COLLECTOR_INTERVAL (default 600s = 10m)
- SCHEDULER_VERIFICATION_DETECTOR_INTERVAL (default 900s = 15m)
- SCHEDULER_CORPORATE_MEMORY_INTERVAL (default 1020s = 17m)

Defaults preserve the existing 10m / 15m / 17m coprime offset so the three jobs don't fire on the same tick.

build_jobs() now reads all three through _read_positive_int (matching the existing pattern for data-refresh / health-check / script-runner) and feeds them to _seconds_to_schedule. The smallest-interval check includes the new variables so an operator can't accidentally set a tick larger than any LLM cadence.

New tests in tests/test_scheduler.py:
- TestLLMPipelineCadenceEnvVars: env override changes the schedule string at scheduler-init time, with parametrized invalid-value rejection.
- TestVerificationDetectorGraceFollowsCadence: pinning the single-source-of-truth contract — same env var moves both the scheduler cadence and the health-check grace.
229 lines
9.3 KiB
Python
229 lines
9.3 KiB
Python
"""Scheduler service — replaces systemd timers.

Lightweight sidecar that fires scheduled jobs over HTTP against the main
app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic
admin — see ``app.auth.scheduler_token``); falls back to no-auth in
LOCAL_DEV_MODE.

Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts
"every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00".

Why every job is HTTP and nothing runs in-process: the scheduler container
shares ``/data/state/system.duckdb`` with the app container, but DuckDB
permits only one writer per file across processes. An in-process call
from the scheduler raced the app's long-lived handle and 500-ed on
``Could not set lock on file``. Going through HTTP makes the app the sole
writer; the scheduler is reduced to a pure cron clock.

Usage: python -m services.scheduler
"""
|
|
|
|
import logging
|
|
import os
|
|
import signal
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
|
|
from app.logging_config import setup_logging
|
|
from src.scheduler import is_table_due
|
|
|
|
# Project logging setup (see ``app.logging_config``) runs before this module
# obtains its own logger.
setup_logging(__name__)
logger = logging.getLogger(__name__)

# Base URL of the main app. Every job endpoint in this module starts with
# "/", and the request URL is built by plain concatenation, so a trailing
# slash on the configured value is stripped to avoid "//api/..." URLs.
API_URL = os.environ.get("API_URL", "http://localhost:8000").rstrip("/")

# Shared-secret bearer token. Empty string means "send no Authorization
# header" (see ``_get_auth_token``).
SCHEDULER_API_TOKEN = os.environ.get("SCHEDULER_API_TOKEN", "")

# Set once the "token not set" warning has been logged, so it is emitted at
# most once per process.
_token_warning_emitted = False
|
|
|
|
|
|
def _get_auth_token() -> str:
    """Resolve the bearer token to attach to outgoing API requests.

    Returns ``SCHEDULER_API_TOKEN`` when it is non-empty. Otherwise returns
    the empty string, which callers treat as "send no ``Authorization``
    header"; the first such call also logs a one-time warning.

    Production: the token is a shared secret generated by the Terraform
    startup script and written to ``/opt/agnes/.env``. The ``app`` and
    ``scheduler`` containers both load that file through Docker Compose
    ``env_file:``, so each side holds the same value. The app compares
    incoming Bearer tokens against it (constant-time compare in
    ``app.auth.scheduler_token``) and maps a match to the synthetic
    ``scheduler@system.local`` user, a member of the Admin group.

    Dev / LOCAL_DEV_MODE: leave the variable unset. The API's dev-bypass
    then auto-authenticates the header-less request as the dev user.
    """
    global _token_warning_emitted

    if not SCHEDULER_API_TOKEN:
        # Warn only once; this function is called for every job request.
        if not _token_warning_emitted:
            logger.warning(
                "SCHEDULER_API_TOKEN is not set — calling the API without "
                "Authorization. Required in production; in LOCAL_DEV_MODE "
                "the dev-bypass auto-authenticates and this is fine."
            )
            _token_warning_emitted = True
        return ""

    return SCHEDULER_API_TOKEN
|
|
|
|
|
|
# --- Env parsing ------------------------------------------------------------
|
|
|
|
# Fallback values, in seconds, used by ``_read_positive_int`` when the
# corresponding environment variable is unset.
_DEFAULTS = {
    "SCHEDULER_DATA_REFRESH_INTERVAL": 900,  # 15 min
    "SCHEDULER_HEALTH_CHECK_INTERVAL": 300,  # 5 min
    "SCHEDULER_SCRIPT_RUN_INTERVAL": 60,  # 1 min
    "SCHEDULER_TICK_SECONDS": 30,
    # LLM pipeline cadences (#176, #179 review). The 10m / 15m / 17m
    # defaults are staggered to limit how often the jobs land on the same
    # tick and stack their API + DB load. NOTE(review): 10m and 15m are not
    # coprime (they nominally align every 30m); all three align every 510m.
    # Per the #179 review notes, the verification-detector value is also
    # what app/api/health.py uses for its staleness grace window, so one
    # env var moves both — confirm against that module.
    "SCHEDULER_SESSION_COLLECTOR_INTERVAL": 600,  # 10 min
    "SCHEDULER_VERIFICATION_DETECTOR_INTERVAL": 900,  # 15 min
    "SCHEDULER_CORPORATE_MEMORY_INTERVAL": 1020,  # 17 min
}
|
|
|
|
|
|
def _read_positive_int(name: str) -> int:
    """Read env var *name* as a positive integer number of seconds.

    Resolution rules:

    * Unset (``None``): fall back to ``_DEFAULTS[name]``.
    * Explicitly empty (``""``): treated as an operator typo and rejected —
      silently defaulting on a literal ``FOO=`` in the env_file would mask
      configuration bugs.
    * Anything ``int()`` cannot parse, or a value ``<= 0``: rejected.

    Args:
        name: Environment variable to read.

    Returns:
        The parsed value, or the default when the variable is unset.

    Raises:
        ValueError: If the variable is unset and has no entry in
            ``_DEFAULTS``, is empty, is not an integer, or is not positive.
    """
    raw = os.environ.get(name)
    if raw is None:
        if name not in _DEFAULTS:
            raise ValueError(f"Unknown scheduler env var: {name}")
        return _DEFAULTS[name]
    if raw == "":
        raise ValueError(f"{name}='' must be a positive integer (seconds)")
    try:
        value = int(raw)
    except (TypeError, ValueError):
        # ``from None`` drops the implicit "During handling of the above
        # exception..." context: the message below already carries the
        # offending value, and the int() traceback is only noise for an
        # operator reading a startup failure.
        raise ValueError(
            f"{name}={raw!r} must be a positive integer (seconds)"
        ) from None
    if value <= 0:
        raise ValueError(f"{name}={value} must be > 0 (seconds)")
    return value
|
|
|
|
|
|
def _seconds_to_schedule(seconds: int) -> str:
    """Render an interval in seconds as an 'every Nh' / 'every Nm' string.

    An exact multiple of one hour (at least 3600s) becomes 'every Nh'.
    Everything else is expressed in minutes, rounded UP, so a value that is
    not a multiple of 60 never fires more often than the operator asked for
    (90s -> 'every 2m', not 'every 1m'). Sub-minute inputs clamp to
    'every 1m' because the schedule grammar has minute-level resolution.
    """
    whole_hours, hour_remainder = divmod(seconds, 3600)
    if hour_remainder == 0 and whole_hours >= 1:
        return f"every {whole_hours}h"

    # Ceiling of seconds / 60: bump the quotient when anything is left over.
    full_minutes, leftover = divmod(seconds, 60)
    rounded_up = full_minutes + 1 if leftover else full_minutes
    return f"every {max(1, rounded_up)}m"
|
|
|
|
|
|
def resolved_tick_seconds() -> int:
    """Return the validated SCHEDULER_TICK_SECONDS value on its own.

    Reads just the tick variable (test helper), without building the job
    list. Raises ``ValueError`` when the configured value is invalid.
    """
    tick_seconds = _read_positive_int("SCHEDULER_TICK_SECONDS")
    return tick_seconds
|
|
|
|
|
|
def build_jobs() -> list[tuple[str, str, str, str, int]]:
    """Assemble the scheduler's job table from environment configuration.

    Each entry is ``(name, schedule_string, endpoint, method,
    http_timeout_sec)``. Intervals come from the environment (with
    defaults) and are converted to schedule strings. The marketplaces job
    keeps its hardcoded 'daily 03:00' schedule — promoting it to env is out
    of #77 scope.

    Raises:
        ValueError: If any interval variable is invalid, or if
            SCHEDULER_TICK_SECONDS exceeds the smallest configured interval.
    """
    # Read in a fixed order so the first invalid variable reported is stable.
    interval_seconds = {
        job_name: _read_positive_int(env_name)
        for job_name, env_name in (
            ("data-refresh", "SCHEDULER_DATA_REFRESH_INTERVAL"),
            ("health-check", "SCHEDULER_HEALTH_CHECK_INTERVAL"),
            ("script-runner", "SCHEDULER_SCRIPT_RUN_INTERVAL"),
            ("session-collector", "SCHEDULER_SESSION_COLLECTOR_INTERVAL"),
            ("verification-detector", "SCHEDULER_VERIFICATION_DETECTOR_INTERVAL"),
            ("corporate-memory", "SCHEDULER_CORPORATE_MEMORY_INTERVAL"),
        )
    }
    tick_seconds = _read_positive_int("SCHEDULER_TICK_SECONDS")

    # A tick longer than the shortest interval would make that job run late
    # on every cycle, so reject the configuration outright.
    shortest = min(interval_seconds.values())
    if tick_seconds > shortest:
        raise ValueError(
            f"SCHEDULER_TICK_SECONDS={tick_seconds} must be <= the smallest job "
            f"interval ({shortest}s) so jobs don't consistently miss their "
            "cadence by up to one tick"
        )

    every = {
        job_name: _seconds_to_schedule(secs)
        for job_name, secs in interval_seconds.items()
    }

    return [
        ("data-refresh", every["data-refresh"], "/api/sync/trigger", "POST", 120),
        ("health-check", every["health-check"], "/api/health", "GET", 30),
        ("script-runner", every["script-runner"], "/api/scripts/run-due", "POST", 600),
        ("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900),
        # LLM pipeline (#176, #179 review). Default cadences are staggered
        # (10m / 15m / 17m) to limit how often the three LLM-driven jobs
        # land on the same tick and stack their API + DB load. They are
        # env-driven so an operator can throttle without a code change.
        # Per the #179 review notes, the verification-detector cadence is
        # also the basis for the health-check staleness grace window in
        # app/api/health.py (2x the cadence) — confirm against that module.
        (
            "session-collector",
            every["session-collector"],
            "/api/admin/run-session-collector",
            "POST",
            300,
        ),
        (
            "verification-detector",
            every["verification-detector"],
            "/api/admin/run-verification-detector",
            "POST",
            900,
        ),
        (
            "corporate-memory",
            every["corporate-memory"],
            "/api/admin/run-corporate-memory",
            "POST",
            900,
        ),
    ]
|
|
|
|
# Loop flag checked by ``run()``; cleared by the signal handler to request a
# graceful stop.
_running = True


def _signal_handler(sig, frame):
    """Request a graceful shutdown of the scheduler loop.

    Registered by ``run()`` for SIGTERM and SIGINT.

    Args:
        sig: Signal number that was delivered.
        frame: Stack frame at delivery time (unused).
    """
    global _running
    # Clear the flag before logging so the stop request is recorded even if
    # the logging call itself fails.
    _running = False
    logger.info("Received signal %s, shutting down...", sig)
|
|
|
|
|
|
def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool:
    """Fire one job request against the main app.

    Args:
        endpoint: Path appended verbatim to ``API_URL``.
        method: ``"POST"`` issues a POST; any other value issues a GET.
        timeout_sec: Timeout passed to httpx for this request.

    Returns:
        True only for a 2xx response. False for any other status and for
        any exception raised while making the request — this function never
        raises, so one failing job cannot take down the scheduler loop.
    """
    url = f"{API_URL}{endpoint}"
    headers = {}
    token = _get_auth_token()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    try:
        if method == "POST":
            resp = httpx.post(url, headers=headers, timeout=timeout_sec)
        else:
            resp = httpx.get(url, headers=headers, timeout=timeout_sec)
        # Only 2xx counts as "the job ran". httpx does not follow redirects
        # by default, so a 3xx (e.g. an http->https or login redirect) means
        # the endpoint was never executed and must not be recorded as done.
        if 200 <= resp.status_code < 300:
            logger.info("Job %s: %s", endpoint, resp.status_code)
            return True
        logger.warning(
            "Job %s: HTTP %s - %s", endpoint, resp.status_code, resp.text[:200]
        )
        return False
    except Exception as e:
        # Deliberately broad: this is the boundary of a long-running loop,
        # and connection/timeout errors are expected while the app restarts.
        logger.error("Job %s failed: %s", endpoint, e)
        return False
|
|
|
|
|
|
def run():
    """Run the scheduler loop until SIGTERM or SIGINT is received.

    Installs the signal handlers, builds the job table once, then on every
    tick fires each job whose schedule is due (per ``is_table_due``). A
    job's last-run timestamp advances only when its request succeeded, so a
    failed job is retried on the next tick.

    Raises:
        ValueError: At startup, if the scheduler environment configuration
            is invalid (propagated from ``build_jobs``).
    """
    signal.signal(signal.SIGTERM, _signal_handler)
    signal.signal(signal.SIGINT, _signal_handler)

    jobs = build_jobs()
    tick = resolved_tick_seconds()
    logger.info(
        "Scheduler started. API_URL=%s, %d jobs, tick=%ds. Schedules: %s",
        API_URL, len(jobs), tick,
        {name: schedule for name, schedule, *_ in jobs},
    )

    last_run: dict[str, str | None] = {name: None for name, *_ in jobs}

    while _running:
        # One timestamp per tick: every job fired in this pass records the
        # same start-of-tick time.
        now_iso = datetime.now(timezone.utc).isoformat()
        for name, schedule, endpoint, method, timeout_sec in jobs:
            if not _running:
                # Shutdown was requested while an earlier job was running;
                # do not start further (possibly long) requests.
                break
            if not is_table_due(schedule, last_run[name]):
                continue
            logger.info("Running job: %s (%s)", name, schedule)
            if _call_api(endpoint, method, timeout_sec):
                last_run[name] = now_iso

        # Sleep the tick in slices of at most one second. time.sleep()
        # resumes after a handled signal (PEP 475), so a single
        # time.sleep(tick) would delay shutdown by up to a whole tick.
        slept = 0
        while _running and slept < tick:
            step = min(1, tick - slept)
            time.sleep(step)
            slept += step

    logger.info("Scheduler stopped.")
|
|
|
|
|
|
# Script entry point: ``python -m services.scheduler`` starts the loop.
if __name__ == "__main__":
    run()
|