agnes-the-ai-analyst/services/scheduler/__main__.py

"""Scheduler service — replaces systemd timers.

Lightweight sidecar that fires scheduled jobs over HTTP against the main
app. Authenticates with ``SCHEDULER_API_TOKEN`` (shared-secret synthetic
admin — see ``app.auth.scheduler_token``). When the token is unset the
scheduler sends no ``Authorization`` header at all, which only works
against an app running in LOCAL_DEV_MODE.

Schedules are strings parsed by ``src.scheduler.is_table_due`` — accepts
"every 15m", "every 1h", "daily 03:00", "daily 07:00,13:00".

Why every job is HTTP and nothing runs in-process: the scheduler container
shares ``/data/state/system.duckdb`` with the app container, but DuckDB
permits only one writer per file across processes. An in-process call
from the scheduler raced the app's long-lived handle and 500-ed on
``Could not set lock on file``. Going through HTTP makes the app the sole
writer; the scheduler is reduced to a pure cron clock.

Usage: python -m services.scheduler
"""

import logging
import os
import signal
import time
from datetime import datetime, timezone

import httpx

from src.scheduler import is_table_due

# Configure root logging once at import time. LOG_LEVEL is upper-cased before
# use, so "debug" and "DEBUG" are both accepted.
logging.basicConfig(
    level=os.environ.get("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s %(levelname)s [scheduler] %(message)s",
)
logger = logging.getLogger(__name__)

# Base URL of the main app. Trailing slashes are dropped because endpoints
# are appended verbatim and already start with "/"; otherwise
# API_URL="http://app:8000/" would produce "http://app:8000//api/...".
API_URL = os.environ.get("API_URL", "http://localhost:8000").rstrip("/")
# Shared secret sent as a Bearer token. Empty string means "not configured".
SCHEDULER_API_TOKEN = os.environ.get("SCHEDULER_API_TOKEN", "")

# Set once the "token is not set" warning has been logged, so it is emitted
# a single time instead of on every API call.
_token_warning_emitted = False


def _get_auth_token() -> str:
    """Return the bearer token to send with API calls, or ``""`` for none.

    Production: ``SCHEDULER_API_TOKEN`` is a shared secret generated by the
    Terraform startup script and written to ``/opt/agnes/.env``. The ``app``
    and ``scheduler`` containers both load that file through Docker Compose
    ``env_file:``, so each side holds the same value. The app compares the
    incoming Bearer token against it (constant-time, in
    ``app.auth.scheduler_token``) and maps a match to the synthetic
    ``scheduler@system.local`` user, a member of the Admin group.

    Dev / LOCAL_DEV_MODE: leave the variable unset. This function then
    returns the empty string, the caller omits the ``Authorization``
    header, and the API's dev-bypass authenticates the request as the dev
    user.

    A warning about the missing token is logged the first time only; the
    module-level ``_token_warning_emitted`` flag suppresses repeats.
    """
    global _token_warning_emitted

    # Unconfigured path first: warn once, then signal "no token".
    if not SCHEDULER_API_TOKEN:
        already_warned = _token_warning_emitted
        if not already_warned:
            logger.warning(
                "SCHEDULER_API_TOKEN is not set — calling the API without "
                "Authorization. Required in production; in LOCAL_DEV_MODE "
                "the dev-bypass auto-authenticates and this is fine."
            )
            _token_warning_emitted = True
        return ""

    return SCHEDULER_API_TOKEN


# Job table. Each entry is
#     (name, schedule_string, endpoint, method, timeout_sec)
#
# Every job is an HTTP call — the module docstring explains why nothing
# runs in-process anymore. The marketplaces job keeps the ``daily 03:00``
# cadence of the in-process job it replaced; its endpoint is admin-only
# and idempotent (it walks the registry, and an error on one marketplace
# does not abort the rest of the run).
#
# timeout_sec is the per-job timeout handed to the httpx call.
# Marketplaces is given a generous 15 minutes: the app handler processes
# every registered marketplace under a single lock, with up to 300s of
# git timeout per entry. With the 120s that data-refresh uses, a
# real-world registry holding more than 2-3 slow repos would time out the
# scheduler's request, the job would fire again on the next 30s tick, and
# a redundant sync would be queued.
JOBS = [
    ("data-refresh", "every 15m", "/api/sync/trigger", "POST", 120),
    ("health-check", "every 5m", "/api/health", "GET", 30),
    ("marketplaces", "daily 03:00", "/api/marketplaces/sync-all", "POST", 900),
]

# Loop-control flag: True while the scheduler should keep ticking.
# Cleared by _signal_handler and polled by run().
_running = True


def _signal_handler(sig, frame):
    """Ask the main loop to stop after a termination signal.

    Installed by ``run()`` for SIGTERM and SIGINT. It does not exit the
    process itself; it only clears the module-level ``_running`` flag and
    logs which signal arrived.

    Args:
        sig: Signal number passed in by the ``signal`` module.
        frame: Interrupted stack frame; unused.
    """
    global _running
    _running = False
    logger.info(f"Received signal {sig}, shutting down...")


def _call_api(endpoint: str, method: str, timeout_sec: int) -> bool:
    """Send one scheduled request to the main app and report the outcome.

    The ``Authorization: Bearer`` header is attached only when
    ``_get_auth_token()`` returns a non-empty token.

    Args:
        endpoint: Path appended verbatim to ``API_URL``, e.g. ``/api/health``.
        method: HTTP verb for the request. It is passed straight to httpx,
            so a verb other than ``"POST"`` is sent as given instead of
            being silently downgraded to a GET.
        timeout_sec: Timeout for the httpx call, in seconds.

    Returns:
        True when the app answered with a status code below 400. False on
        a 4xx/5xx answer or when the request raised (connection refused,
        timeout, ...). This function never raises: one failing job must not
        take down the scheduler loop.
    """
    url = f"{API_URL}{endpoint}"
    token = _get_auth_token()
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        resp = httpx.request(method, url, headers=headers, timeout=timeout_sec)
        if resp.status_code < 400:
            logger.info("Job %s: %s", endpoint, resp.status_code)
            return True
        # Only the first 200 characters of the body are logged, to keep
        # large error pages out of the log.
        logger.warning(
            "Job %s: HTTP %s - %s", endpoint, resp.status_code, resp.text[:200]
        )
        return False
    except Exception as e:
        # Deliberately broad: this is the boundary that keeps the cron loop
        # alive. The exception class is logged as well because transport
        # errors can carry a terse or empty message.
        logger.error("Job %s failed: %s: %s", endpoint, type(e).__name__, e)
        return False


def run() -> None:
    """Run the scheduler loop until SIGTERM or SIGINT is received.

    Installs ``_signal_handler`` for both signals, then ticks every 30
    seconds. On each tick, every entry in ``JOBS`` is checked with
    ``is_table_due`` and, if due, dispatched through ``_call_api``. A job's
    last-run timestamp advances only when the call succeeds, so a failed
    job is offered again on the following tick for as long as
    ``is_table_due`` still reports it due.

    Shutdown is cooperative. The handler clears ``_running``; the loop
    stops dispatching further jobs and leaves the inter-tick wait within
    about a second. A request that is already in flight is not cancelled.
    """
    signal.signal(signal.SIGTERM, _signal_handler)
    signal.signal(signal.SIGINT, _signal_handler)

    logger.info(
        "Scheduler started. API_URL=%s, %d jobs configured.", API_URL, len(JOBS)
    )

    # Track last successful run per job as ISO string — matches what
    # src.scheduler.is_table_due expects. None means "never ran".
    last_run: dict[str, str | None] = {name: None for name, *_ in JOBS}

    # 30s tick is plenty: interval jobs have minute-level resolution,
    # daily jobs have a ~24 h retry window.
    tick_seconds = 30
    # Upper bound on how long a stop request waits for the sleep to end.
    poll_seconds = 1

    while _running:
        for name, schedule, endpoint, method, timeout_sec in JOBS:
            if not _running:
                # A signal arrived while an earlier job was in flight; do
                # not start new work during shutdown.
                break
            if not is_table_due(schedule, last_run[name]):
                continue
            # Stamp at dispatch time rather than at tick start: an earlier
            # job in the same tick may have blocked for its whole timeout,
            # and a stale stamp would make this run look older than it is.
            dispatched_iso = datetime.now(timezone.utc).isoformat()
            logger.info("Running job: %s (%s)", name, schedule)
            if _call_api(endpoint, method, timeout_sec):
                last_run[name] = dispatched_iso

        # Sleep in short slices. time.sleep() resumes after a signal handler
        # returns, so a single 30s sleep would delay shutdown by up to 30s,
        # longer than Docker's default 10s stop grace period.
        deadline = time.monotonic() + tick_seconds
        while _running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_seconds, remaining))

    logger.info("Scheduler stopped.")


# Entry point for ``python -m services.scheduler``: start the blocking
# scheduler loop. Nothing runs when the module is merely imported.
if __name__ == "__main__":
    run()