"""Scheduler service — replaces systemd timers.

Lightweight sidecar that fires scheduled jobs over HTTP against the main
app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic
admin — see ``app.auth.scheduler_token``); falls back to no-auth in
LOCAL_DEV_MODE.

Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts
"every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00".

Why every job is HTTP and nothing runs in-process: the scheduler container
shares ``/data/state/system.duckdb`` with the app container, but DuckDB
permits only one writer per file across processes. An in-process call from
the scheduler raced the app's long-lived handle and 500-ed on
``Could not set lock on file``. Going through HTTP makes the app the sole
writer; the scheduler is reduced to a pure cron clock.

Usage:
    python -m services.scheduler
"""

import logging
import os
import signal
import time
from datetime import datetime, timezone

import httpx

from src.scheduler import is_table_due

logging.basicConfig(
    level=os.environ.get("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s %(levelname)s [scheduler] %(message)s",
)
logger = logging.getLogger(__name__)

API_URL = os.environ.get("API_URL", "http://localhost:8000")
SCHEDULER_API_TOKEN = os.environ.get("SCHEDULER_API_TOKEN", "")

# Seconds between scheduler ticks. 30s is plenty: interval jobs have
# minute-level resolution, daily jobs have a ~24 h retry window.
TICK_SEC = 30

# Granularity at which the idle wait re-checks the shutdown flag.
_SHUTDOWN_POLL_SEC = 1

_token_warning_emitted = False


def _get_auth_token() -> str:
    """Return the bearer token for API calls.

    Production: ``SCHEDULER_API_TOKEN`` is a shared secret generated by the
    Terraform startup script and written to ``/opt/agnes/.env``. Both the
    ``app`` and ``scheduler`` containers source the same .env via Docker
    Compose ``env_file:``, so the secret is symmetric. The app validates
    incoming Bearer tokens against this env var (constant-time compare in
    ``app.auth.scheduler_token``) and resolves matches to a synthetic
    ``scheduler@system.local`` user that is a member of the Admin group.

    Dev / LOCAL_DEV_MODE: leave it unset. The scheduler returns the empty
    string and calls the API without an ``Authorization`` header — the
    API's dev-bypass auto-authenticates the request as the dev user.

    The "token missing" warning is logged at most once per process.
    """
    global _token_warning_emitted
    if SCHEDULER_API_TOKEN:
        return SCHEDULER_API_TOKEN
    if not _token_warning_emitted:
        logger.warning(
            "SCHEDULER_API_TOKEN is not set — calling the API without "
            "Authorization. Required in production; in LOCAL_DEV_MODE "
            "the dev-bypass auto-authenticates and this is fine."
        )
        _token_warning_emitted = True
    return ""


# Schedule definitions: (name, schedule_string, endpoint, method, timeout_sec).
# All jobs are HTTP — see the module docstring for why nothing runs
# in-process anymore. ``daily 03:00`` for marketplaces matches the cadence
# the previous in-process job used; the endpoint is admin-only and
# idempotent (it iterates the registry and per-marketplace errors do not
# abort the run).
#
# timeout_sec: per-job override for the httpx call. Marketplaces gets a
# generous 15 min because the app handler iterates every registered
# marketplace under a single lock with up to 300s of git timeout per
# entry — at 120s (the default that data-refresh uses) a real-world
# registry of more than 2-3 slow repos times out the scheduler call,
# which then re-fires on the next 30s tick and queues a redundant sync.
JOBS: list[tuple[str, str, str, str, int]] = [
    ("data-refresh", "every 15m", "/api/sync/trigger", "POST", 120),
    ("health-check", "every 5m", "/api/health", "GET", 30),
    ("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900),
]

# Cleared by the signal handler; polled by the main loop and the idle wait.
_running = True


def _signal_handler(sig, frame):
    """Request a graceful shutdown by clearing the ``_running`` flag.

    Installed for SIGTERM and SIGINT by ``run``. Only flips the flag; the
    main loop notices it at its next check.
    """
    global _running
    logger.info("Received signal %s, shutting down...", sig)
    _running = False


def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool:
    """Call the main app API. Returns True on success.

    Args:
        endpoint: Path appended to ``API_URL`` (e.g. ``/api/health``).
        method: ``"POST"`` issues a POST; any other value issues a GET.
        timeout_sec: Timeout handed to httpx for this request.

    Returns:
        True when the response status is below 400. False on an HTTP
        status >= 400 or on any exception raised while making the request;
        failures are logged, never raised.
    """
    # Tolerate a trailing slash on API_URL so the path never becomes "//api/...".
    url = f"{API_URL.rstrip('/')}{endpoint}"
    headers = {}
    token = _get_auth_token()
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        if method == "POST":
            resp = httpx.post(url, headers=headers, timeout=timeout_sec)
        else:
            resp = httpx.get(url, headers=headers, timeout=timeout_sec)
    except Exception as e:
        # Deliberately broad: a failing job must never take the clock down.
        logger.error("Job %s failed: %s", endpoint, e)
        return False

    if resp.status_code < 400:
        logger.info("Job %s: %s", endpoint, resp.status_code)
        return True
    logger.warning(
        "Job %s: HTTP %s - %s", endpoint, resp.status_code, resp.text[:200]
    )
    return False


def _sleep_until_next_tick() -> None:
    """Wait ``TICK_SEC`` seconds, returning early once shutdown is requested.

    ``time.sleep`` resumes for the remaining duration after a signal handler
    returns normally (Python 3.5+, PEP 475), so one long sleep would delay
    shutdown by up to a full tick — longer than the stop grace period
    container runtimes commonly allow (Docker's default is 10s). Sleeping in
    short slices keeps the reaction time to about ``_SHUTDOWN_POLL_SEC``.
    """
    for _ in range(TICK_SEC // _SHUTDOWN_POLL_SEC):
        if not _running:
            return
        time.sleep(_SHUTDOWN_POLL_SEC)


def run() -> None:
    """Run the scheduler loop until SIGTERM or SIGINT is received.

    Every tick, each entry of ``JOBS`` whose schedule ``is_table_due``
    reports as due is fired through ``_call_api``. A job's last-run
    timestamp only advances when the call succeeds, so a failed job is
    attempted again on the following tick.
    """
    signal.signal(signal.SIGTERM, _signal_handler)
    signal.signal(signal.SIGINT, _signal_handler)

    logger.info(
        "Scheduler started. API_URL=%s, %d jobs configured.", API_URL, len(JOBS)
    )

    # Track last successful run per job as ISO string — matches what
    # src.scheduler.is_table_due expects.
    last_run: dict[str, str | None] = {name: None for name, *_ in JOBS}

    while _running:
        now_iso = datetime.now(timezone.utc).isoformat()

        for name, schedule, endpoint, method, timeout_sec in JOBS:
            if not _running:
                # Shutdown arrived during an earlier job in this tick; do
                # not start another (possibly long-running) call.
                break
            if not is_table_due(schedule, last_run[name]):
                continue

            logger.info("Running job: %s (%s)", name, schedule)
            ok = _call_api(endpoint, method, timeout_sec)
            if ok:
                last_run[name] = now_iso

        _sleep_until_next_tick()

    logger.info("Scheduler stopped.")


if __name__ == "__main__":
    run()