# agnes-the-ai-analyst/docker-compose.yml

services:
  app:
    build: .
    # Reverse proxies (Caddy, nginx, Cloudflare Tunnel) report the original
    # scheme and host via X-Forwarded-Proto / X-Forwarded-Host.
    # --proxy-headers plus --forwarded-allow-ips makes uvicorn honor those
    # headers. When they are ignored, request.url_for() emits
    # http://localhost:8000/... even for users on https://, and OAuth
    # callbacks break with redirect_uri_mismatch. FORWARDED_ALLOW_IPS=* in
    # .env achieves the same through the environment; the flags are kept
    # here as a deliberate second safeguard.
    command:
      - "uvicorn"
      - "app.main:app"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--proxy-headers"
      - "--forwarded-allow-ips=*"
    restart: unless-stopped
    # Compose's default 10s stop timeout is too short for a graceful uvicorn
    # shutdown: under load, in-flight requests plus the DuckDB CHECKPOINT
    # (see lifespan in app/main.py) need more headroom. A SIGKILL in the
    # middle of a WAL write leaves a corrupt system.duckdb.wal that the next
    # image's DuckDB version cannot replay ("Failure while replaying WAL ...
    # GetDefaultDatabase with no default database set"), and every authed
    # request returns 500 until the WAL is removed. The risk is highest
    # during an image upgrade where the new image ships a different DuckDB
    # version than the old container, because cross-version WAL replay
    # trips on internal assertions.
    stop_grace_period: 60s
    ports:
      - "8000:8000"
    volumes:
      - data:/data
      - ./config:/app/config:ro
      # - ./custom-connectors:/app/connectors/custom:ro  # Tier A: AI-generated connectors
    env_file: .env
    environment:
      - DATA_DIR=/data
      # Put per-call tempdirs (Snowflake UNLOAD slice staging, CSV→parquet
      # intermediates) on the data volume. The container's default ``/tmp``
      # sits on overlayfs (boot disk) and fills up under multi-GiB sliced
      # exports — see connectors/keboola/storage_api.py:get_temp_root.
      # Operators can point this elsewhere per deployment via .env. Because
      # of the ``:-`` default, leaving it unset or empty there resolves to
      # /data/tmp; the fallback to the system /tmp is only reached when the
      # variable is absent from the container environment altogether.
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
      # The Docker layout has no /home/*/user/sessions/, so the legacy
      # session-collector is skipped; otherwise it logs "0 users, 0 files"
      # and "Group 'data-ops' not found" every 10 minutes. The bare-VM
      # deployment leaves this unset and keeps scanning + logging normally.
      - AGNES_SKIP_LEGACY_COLLECTOR=1
    # Probe the app's health endpoint from inside the container.
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-sf"
        - "http://localhost:8000/api/health"
      interval: 30s
      timeout: 5s
      retries: 3
    # Resource limits for the app container.
    mem_limit: 4g
    mem_reservation: 1g
    cpus: 2.0

  # One-shot job, enabled only with the `extract` profile: runs the
  # extractor and then rebuilds the orchestrator views.
  extract:
    build: .
    profiles:
      - extract
    # `&&` chains the two steps, so the rebuild runs only after the
    # extractor exits successfully.
    command:
      - "sh"
      - "-c"
      - >-
        python -m connectors.keboola.extractor &&
        python -c 'from src.orchestrator import SyncOrchestrator; print(SyncOrchestrator().rebuild())'
    env_file: .env
    environment:
      - DATA_DIR=/data
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
    volumes:
      - data:/data
      - ./config:/app/config:ro

  scheduler:
    build: .
    command: ["python", "-m", "services.scheduler"]
    restart: unless-stopped
    # Same grace period as the app service: the scheduler holds DuckDB
    # connections too, so a SIGKILL during recreate carries the same
    # WAL-corruption risk.
    stop_grace_period: 60s
    # Start only once the app's healthcheck passes.
    depends_on:
      app:
        condition: service_healthy
    volumes:
      - data:/data
      - ./config:/app/config:ro
    env_file: .env
    environment:
      - DATA_DIR=/data
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
      # Reach the app over the Compose network by service name.
      - API_URL=http://app:8000
      - SEED_ADMIN_EMAIL=${SEED_ADMIN_EMAIL:-}
      # Kept in sync with the app service. The scheduler triggers the
      # collector through /api/admin/run-session-collector over HTTP rather
      # than running it in-process, but should anything ever invoke the
      # collector module directly from this container, it must skip the
      # same way. The bare-VM path leaves this unset.
      - AGNES_SKIP_LEGACY_COLLECTOR=1
    mem_limit: 2g
    cpus: 1.0

  telegram-bot:
    build: .
    command: ["python", "-m", "services.telegram_bot"]
    restart: unless-stopped
    # Optional service: only started when the `full` profile is enabled.
    profiles:
      - full
    depends_on:
      app:
        # Start-order only: waits for the app container to be started, not
        # for its healthcheck to pass.
        # NOTE(review): scheduler and caddy wait for service_healthy —
        # confirm whether this service should do the same.
        condition: service_started
    env_file: .env
    environment:
      - DATA_DIR=/data
    volumes:
      - data:/data

  ws-gateway:
    build: .
    command: ["python", "-m", "services.ws_gateway"]
    restart: unless-stopped
    # Optional service: only started when the `full` profile is enabled.
    profiles:
      - full
    depends_on:
      app:
        # Start-order only: waits for the app container to be started, not
        # for its healthcheck to pass.
        # NOTE(review): scheduler and caddy wait for service_healthy —
        # confirm whether this service should do the same.
        condition: service_started
    env_file: .env
    environment:
      - DATA_DIR=/data
    volumes:
      - data:/data

  # NOTE: corporate-memory + session-collector previously ran here as
  # tight `restart: unless-stopped` boot loops behind `profiles: [full]`.
  # As of #176 the scheduler container drives both through admin HTTP
  # endpoints (/api/admin/run-corporate-memory,
  # /api/admin/run-session-collector). The verification-detector job
  # was never in compose; it now ships the same way. The app remains
  # the sole writer to system.duckdb. Operators previously running
  # COMPOSE_PROFILES=full need to drop those service stanzas from any
  # custom Compose overrides.

  # TLS-terminating reverse proxy, enabled with the `tls` profile.
  # Corporate-CA certificates are mounted from /data/state/certs on the
  # host (managed by scripts/ops/agnes-tls-rotate.sh on the VM). For local
  # development without certs, omit --profile tls and talk to :8000
  # directly.
  caddy:
    image: caddy:2-alpine
    restart: unless-stopped
    profiles:
      - tls
    # Start only once the app's healthcheck passes.
    depends_on:
      app:
        condition: service_healthy
    ports:
      - "80:80"
      - "443:443"
    environment:
      - DOMAIN=${DOMAIN:-localhost}
      # Passed through with whatever value the operator set in .env.
      # The Caddyfile reads it as
      # {$CADDY_TLS:tls /certs/fullchain.pem /certs/privkey.pem}, so:
      # - unset            → cert-file mode (corp PKI rotated by tls-rotate.sh)
      # - "tls <email>"    → Let's Encrypt auto-issue
      # - "tls internal"   → Caddy-managed self-signed
      - CADDY_TLS
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - /data/state/certs:/certs:ro
      - caddy_data:/data
      - caddy_config:/config
      # The agnes data dir, read-only, so Caddy's file_server can serve
      # parquets directly (sendfile/zero-copy) without going through the
      # app's uvicorn workers — see the @download handler in the Caddyfile.
      # It is mounted at /srv rather than /data because /data is already
      # taken by the caddy_data volume.
      - data:/srv:ro

volumes:
  # Named volumes with default settings (explicit empty mappings).
  # data: mounted at /data by app, extract, scheduler, telegram-bot and
  # ws-gateway, and read-only at /srv by caddy.
  data: {}
  # Mounted by the caddy service at /data and /config respectively.
  caddy_data: {}
  caddy_config: {}