"""Scheduler service — replaces systemd timers. Lightweight sidecar that fires scheduled jobs over HTTP against the main app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic admin — see ``app.auth.scheduler_token``); falls back to no-auth in LOCAL_DEV_MODE. Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts "every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00". Why every job is HTTP and nothing runs in-process: the scheduler container shares ``/data/state/system.duckdb`` with the app container, but DuckDB permits only one writer per file across processes. An in-process call from the scheduler raced the app's long-lived handle and 500-ed on ``Could not set lock on file``. Going through HTTP makes the app the sole writer; the scheduler is reduced to a pure cron clock. Usage: python -m services.scheduler """ import logging import os import signal import threading import time from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timezone import httpx from app.logging_config import setup_logging from src.scheduler import is_table_due setup_logging(__name__) logger = logging.getLogger(__name__) API_URL = os.environ.get("API_URL", "http://localhost:8000") SCHEDULER_API_TOKEN = os.environ.get("SCHEDULER_API_TOKEN", "") _token_warning_emitted = False def _get_auth_token() -> str: """Return the bearer token for API calls. Production: ``SCHEDULER_API_TOKEN`` is a shared secret generated by the Terraform startup script and written to ``/opt/agnes/.env``. Both the ``app`` and ``scheduler`` containers source the same .env via Docker Compose ``env_file:``, so the secret is symmetric. The app validates incoming Bearer tokens against this env var (constant-time compare in ``app.auth.scheduler_token``) and resolves matches to a synthetic ``scheduler@system.local`` user that is a member of the Admin group. Dev / LOCAL_DEV_MODE: leave it unset. The scheduler returns the empty string and calls the API without an ``Authorization`` header — the API's dev-bypass auto-authenticates the request as the dev user. """ global _token_warning_emitted if SCHEDULER_API_TOKEN: return SCHEDULER_API_TOKEN if not _token_warning_emitted: logger.warning( "SCHEDULER_API_TOKEN is not set — calling the API without " "Authorization. Required in production; in LOCAL_DEV_MODE " "the dev-bypass auto-authenticates and this is fine." ) _token_warning_emitted = True return "" # --- Env parsing ------------------------------------------------------------ _DEFAULTS = { "SCHEDULER_DATA_REFRESH_INTERVAL": 15 * 60, # seconds "SCHEDULER_HEALTH_CHECK_INTERVAL": 5 * 60, "SCHEDULER_SCRIPT_RUN_INTERVAL": 1 * 60, "SCHEDULER_TICK_SECONDS": 30, # LLM pipeline cadences (#176, #179 review). Defaults preserve the # 10m / 15m / 17m coprime offset so the three jobs don't fire on the # same tick and stack their API + DB load. The verification-detector # default (900s) is also the source of truth for the health-check # staleness grace window in app/api/health.py — single env var drives # both, so an operator changing the cadence moves both. "SCHEDULER_SESSION_COLLECTOR_INTERVAL": 10 * 60, # Drives the verification session-processor cadence AND the # health-check staleness grace window in app/api/health.py # (single env var → both, so an operator changing the cadence moves # both). Name retained post session-pipeline refactor for operator # compatibility — existing docker-compose env files keep working. "SCHEDULER_VERIFICATION_DETECTOR_INTERVAL": 15 * 60, "SCHEDULER_USAGE_PROCESSOR_INTERVAL": 10 * 60, "SCHEDULER_CORPORATE_MEMORY_INTERVAL": 17 * 60, # BigQuery metadata refresh: walks remote registry rows and updates the # persistent ``bq_metadata_cache``. Default 4 h — long enough that the # cumulative BQ jobs API cost stays negligible on a typical 10–50-table # registry, short enough that operator-edited tables show real numbers # within an analyst's working day. Hot reads of ``/api/v2/catalog`` go # to DuckDB, never to BQ, so this can be tuned freely without touching # request-path latency. "SCHEDULER_BQ_METADATA_REFRESH_INTERVAL": 4 * 60 * 60, # Pause between scheduler startup and the first tick. Keeps the # scheduler from synchronising its "Table never synced, marking as # due" burst with the app's own startup cache_warmup (which writes # heavily to the system DB for several seconds). Set to 0 to disable # — useful in tests that need deterministic-fast first-tick. "SCHEDULER_STARTUP_GRACE_SECONDS": 60, } def _read_positive_int(name: str) -> int: """Read an env var as a positive integer or fall back to the default. Treats unset env (``None``) as "use default". Treats explicitly empty string (``""``) as an operator typo and raises — silently defaulting on a literal ``FOO=`` in the env_file would mask configuration bugs. """ raw = os.environ.get(name) if raw is None: if name not in _DEFAULTS: raise ValueError(f"Unknown scheduler env var: {name}") return _DEFAULTS[name] if raw == "": raise ValueError(f"{name}='' must be a positive integer (seconds)") try: value = int(raw) except (TypeError, ValueError): raise ValueError(f"{name}={raw!r} must be a positive integer (seconds)") if value <= 0: raise ValueError(f"{name}={value} must be > 0 (seconds)") return value def _read_non_negative_int(name: str) -> int: """Like ``_read_positive_int`` but also accepts ``0`` as a valid value. Used for knobs where ``0`` means "disable" rather than "operator typo" — e.g. ``SCHEDULER_STARTUP_GRACE_SECONDS=0`` legitimately skips the startup pause in unit tests / fast-iteration dev setups. """ raw = os.environ.get(name) if raw is None: if name not in _DEFAULTS: raise ValueError(f"Unknown scheduler env var: {name}") return _DEFAULTS[name] if raw == "": raise ValueError(f"{name}='' must be a non-negative integer (seconds)") try: value = int(raw) except (TypeError, ValueError): raise ValueError(f"{name}={raw!r} must be a non-negative integer (seconds)") if value < 0: raise ValueError(f"{name}={value} must be >= 0 (seconds)") return value def _seconds_to_schedule(seconds: int) -> str: """Convert a seconds value to the closest 'every Nm' / 'every Nh' string. Uses ceiling division so a non-multiple-of-60 input never produces a schedule that fires MORE often than the operator configured (90s → 'every 2m', not 'every 1m'). Sub-minute inputs clamp to 'every 1m' because the schedule grammar has minute-level resolution. """ if seconds % 3600 == 0 and seconds >= 3600: return f"every {seconds // 3600}h" # Ceiling division: -(-x // y) is the standard trick. minutes = max(1, -(-seconds // 60)) return f"every {minutes}m" def resolved_tick_seconds() -> int: """Read + validate SCHEDULER_TICK_SECONDS in isolation (test helper).""" return _read_positive_int("SCHEDULER_TICK_SECONDS") def resolved_startup_grace_seconds() -> int: """Read SCHEDULER_STARTUP_GRACE_SECONDS (default 60). Sleep duration between scheduler startup and the first tick. Mitigates the "post-deploy contention burst" where the scheduler's "everything is due" first tick (5+ paralle HTTP POSTs against the just-restarted app) overlaps the app's own startup ``cache_warmup`` job, doubling disk I/O on the host's boot disk and dropping concurrent parquet downloads from ~3 MB/s to ~1 MB/s for the duration of the burst. """ return _read_non_negative_int("SCHEDULER_STARTUP_GRACE_SECONDS") def resolved_bq_metadata_initial_offset_seconds(rng=None) -> int: """Random startup-jitter for ``bq-metadata-refresh``. Returns a value in ``[0, BQ_METADATA_INITIAL_OFFSET_MAX_SECONDS]`` that the run loop uses to fake a recent ``last_run`` for the ``bq-metadata-refresh`` job at startup. With ``last_run = now - jitter`` and the default 4 h interval, the first refresh fires ``interval - jitter`` seconds after the startup grace finishes (≈ 3 h 45 m to 4 h). This intentionally suppresses an immediate refresh on every container start — the app's own ``cache_warmup`` already populates the persistent cache at startup, so a duplicate refresh from the scheduler would just compete for disk I/O while adding nothing. ``rng`` injectable for deterministic tests. """ import random as _random cap = int(os.environ.get( "SCHEDULER_BQ_METADATA_INITIAL_OFFSET_MAX_SECONDS", "900", )) if cap <= 0: return 0 r = rng or _random.Random() return r.randint(0, cap) def build_jobs() -> list[tuple[str, str, str, str, int]]: """Build the JOBS list from env, applying defaults and validation. Tuple shape: (name, schedule_string, endpoint, method, http_timeout_sec). Marketplaces stays hardcoded — promoting it to env is out of #77 scope. """ refresh = _read_positive_int("SCHEDULER_DATA_REFRESH_INTERVAL") health = _read_positive_int("SCHEDULER_HEALTH_CHECK_INTERVAL") scripts = _read_positive_int("SCHEDULER_SCRIPT_RUN_INTERVAL") sess = _read_positive_int("SCHEDULER_SESSION_COLLECTOR_INTERVAL") verify = _read_positive_int("SCHEDULER_VERIFICATION_DETECTOR_INTERVAL") usage = _read_positive_int("SCHEDULER_USAGE_PROCESSOR_INTERVAL") corpmem = _read_positive_int("SCHEDULER_CORPORATE_MEMORY_INTERVAL") bqmeta = _read_positive_int("SCHEDULER_BQ_METADATA_REFRESH_INTERVAL") tick = _read_positive_int("SCHEDULER_TICK_SECONDS") smallest = min(refresh, health, scripts, sess, verify, usage, corpmem, bqmeta) if tick > smallest: raise ValueError( f"SCHEDULER_TICK_SECONDS={tick} must be <= the smallest job " f"interval ({smallest}s) so jobs don't consistently miss their " f"cadence by up to one tick" ) return [ ("data-refresh", _seconds_to_schedule(refresh), "/api/sync/trigger", "POST", 120), ("health-check", _seconds_to_schedule(health), "/api/health", "GET", 30), ("script-runner", _seconds_to_schedule(scripts), "/api/scripts/run-due", "POST", 600), ("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900), # LLM pipeline (#176, #179 review). Cadences are deliberately offset # (10m / 15m / 17m by default — all coprime modulo the 30s tick) so # the three LLM-driven jobs don't fire on the same tick and stack # their API + DB load. Driven by env so an operator can throttle # without a code change; the verification-detector cadence is the # single source of truth for the health-check staleness grace # window in app/api/health.py (which uses 2x the cadence). ("session-collector", _seconds_to_schedule(sess), "/api/admin/run-session-collector", "POST", 300), # session-pipeline processors — independent loops, each invoked on # its own cadence via the parametrized run-session-processor endpoint. # Adding a third processor in the future is one line here + one entry # in services/session_processors/__init__.py registry. ("session-processor:verification", _seconds_to_schedule(verify), "/api/admin/run-session-processor?processor=verification", "POST", 900), ("session-processor:usage", _seconds_to_schedule(usage), "/api/admin/run-session-processor?processor=usage", "POST", 300), ("corporate-memory", _seconds_to_schedule(corpmem), "/api/admin/run-corporate-memory", "POST", 900), # v30: TTL purge of blocked-bundle bytes. Cheap (just rmtree # + UPDATE), runs once daily at 04:00 UTC so the spike is # visible in audit_log without competing with the marketplaces # job at 03:00. Endpoint reads guardrails.blocked_bundle_ttl_days # from instance.yaml and short-circuits when set to 0. ("store-blocked-purge", "daily 04:00", "/api/admin/run-blocked-purge", "POST", 600), # Stuck-review reaper (#7). A submission stays at # status='pending_llm' until the BackgroundTasks worker writes # a verdict. If the worker crashes, the row sits forever. Run # every 15 minutes; reap_stuck_llm_reviews flips rows older # than guardrails.stuck_review_grace_seconds (default 1800) # to review_error so admin can retry. Cheap (one indexed # SELECT + N small UPDATEs); short timeout sufficient. ("store-reap-stuck-reviews", "every 15m", "/api/admin/run-reap-stuck-reviews", "POST", 60), # BigQuery metadata refresh — keeps ``bq_metadata_cache`` warm so # ``GET /api/v2/catalog`` never has to call BQ at request time. # 30-min timeout is generous; on a 10-table dev registry the # observed full refresh ran in ~7 min when two view-backed rows # took 7 min each. Bounded concurrency # (``AGNES_BQ_METADATA_REFRESH_CONCURRENCY``, default 4) caps the # tail. ("bq-metadata-refresh", _seconds_to_schedule(bqmeta), "/api/admin/run-bq-metadata-refresh", "POST", 1800), ] _running = True def _signal_handler(sig, frame): global _running logger.info(f"Received signal {sig}, shutting down...") _running = False def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool: """Call the main app API. Returns True on success.""" url = f"{API_URL}{endpoint}" headers = {} token = _get_auth_token() if token: headers["Authorization"] = f"Bearer {token}" try: if method == "POST": resp = httpx.post(url, headers=headers, timeout=timeout_sec) else: resp = httpx.get(url, headers=headers, timeout=timeout_sec) if resp.status_code < 400: logger.info(f"Job {endpoint}: {resp.status_code}") return True else: logger.warning(f"Job {endpoint}: HTTP {resp.status_code} - {resp.text[:200]}") return False except Exception as e: logger.error(f"Job {endpoint} failed: {e}") try: from src.observability import get_posthog get_posthog().capture_exception( e, distinct_id="system", properties={"job": endpoint, "method": method, "component": "scheduler"}, ) except Exception: logger.exception("PostHog capture_exception failed in scheduler") return False def run(): signal.signal(signal.SIGTERM, _signal_handler) signal.signal(signal.SIGINT, _signal_handler) jobs = build_jobs() tick = resolved_tick_seconds() grace = resolved_startup_grace_seconds() bqmeta_offset = resolved_bq_metadata_initial_offset_seconds() logger.info( "Scheduler started. API_URL=%s, %d jobs, tick=%ds, " "startup_grace=%ds, bq_metadata_initial_offset=%ds. Schedules: %s", API_URL, len(jobs), tick, grace, bqmeta_offset, {name: schedule for name, schedule, *_ in jobs}, ) # Startup grace — see ``resolved_startup_grace_seconds`` for why. # Honors SIGTERM by polling _running in short slices, so an operator # `docker compose stop` during grace doesn't hang for ~60s. grace_remaining = grace while grace_remaining > 0 and _running: time.sleep(min(grace_remaining, 5)) grace_remaining -= 5 if not _running: logger.info("Scheduler shutdown during startup grace; exiting.") return last_run: dict[str, str | None] = {name: None for name, *_ in jobs} # Suppress the first ``bq-metadata-refresh`` fire by pretending the # job ran ``bqmeta_offset`` seconds ago at startup. ``is_table_due`` # will then wait the remainder of the configured interval before # firing for the first time. Two scheduler containers that came up # within seconds of each other will pick different offsets and stop # synchronising their refresh ticks against one another. if bqmeta_offset > 0: from datetime import timedelta as _td offset_ago = ( datetime.now(timezone.utc) - _td(seconds=bqmeta_offset) ).isoformat() last_run["bq-metadata-refresh"] = offset_ago # Per-tick concurrency: one thread per job slot, so a 900s verification # run can't block the 60s health-check or the 30s data-refresh from # firing on their own cadences (PR #232 review fix). Pure I/O workload # (httpx) — GIL is irrelevant. `in_flight` prevents the same job being # re-launched on a subsequent tick while the previous invocation is # still running; otherwise a 10-min run during which 20 ticks fire # would queue 20 duplicate POSTs against the same processor (the # admin endpoint's per-processor lock would 409 most of them, but # they'd still be wasted requests + audit-log noise). in_flight: set[str] = set() in_flight_lock = threading.Lock() executor = ThreadPoolExecutor(max_workers=max(4, len(jobs))) while _running: now_iso = datetime.now(timezone.utc).isoformat() for name, schedule, endpoint, method, timeout_sec in jobs: if not is_table_due(schedule, last_run[name]): continue with in_flight_lock: if name in in_flight: # Previous tick's invocation hasn't returned yet; skip. continue in_flight.add(name) logger.info("Running job: %s (%s)", name, schedule) executor.submit( _run_job, name, endpoint, method, timeout_sec, now_iso, last_run, in_flight, in_flight_lock, ) time.sleep(tick) logger.info("Scheduler stopping; waiting for in-flight jobs.") executor.shutdown(wait=True) logger.info("Scheduler stopped.") def _run_job( name: str, endpoint: str, method: str, timeout: int, now_iso: str, last_run: dict[str, "str | None"], in_flight: set[str], in_flight_lock: threading.Lock, ) -> None: """Execute one scheduled job + bookkeeping. Lifted out of run() so it's unit-testable. Advances last_run on terminal state (success OR failure) so a permanently failing job retries on its cadence (e.g. 15 min), not on every scheduler tick (default 30s). Pre-fix behavior caused a hot-loop on persistent 5xx — 30× more requests + LLM tokens than the operator configured. Errors still surface via _call_api's logging + audit_log on the receiving side. """ try: _call_api(endpoint, method, timeout) finally: last_run[name] = now_iso with in_flight_lock: in_flight.discard(name) if __name__ == "__main__": run()