## Summary

Two minimum-viable fixes after today's 0.44.0 → 0.47.3 release train and the production 30-user launch. A devil's-advocate review of a 3-PR / 7-item plan cut the scope to these two — the rest is deferred to a separate "operate-first, instrument-second" backlog item.

### B2 — Docker session_collector log skip

`services/session_collector` was logging `Collection complete: 0 users, 0 files copied` + `WARNING: Group 'data-ops' not found, using default group` every 10 minutes in the Docker layout (where `/home/*/user/sessions/` doesn't exist). A new env var, `AGNES_SKIP_LEGACY_COLLECTOR=1`, set by default in `docker-compose.yml`, short-circuits the collector pass. The bare-VM deployment path (where `/home/*` IS populated by Claude Code) leaves the env var unset and continues to scan normally — including the data-ops warning, which is load-bearing for catching missing-group mis-deploys.

### O2 — FIFO check in `_check_session_pipeline`

The existing check compares `MAX(processed_at)` to the newest jsonl mtime — it catches "detector hasn't run lately" but is blind to "old file was skipped while newer ones were processed". The new code finds the oldest FS jsonl that's NOT in `session_extraction_state.session_file` and flags it if its mtime is older than `SESSION_PIPELINE_STUCK_FILE_GRACE_SECONDS` (default 4× the existing grace = 2h). Severity intentionally starts at `info` so we can collect prod data on the false-positive rate before tightening to `warning`. The aggregator already treats `info` as non-promoting (see the severity vocabulary docstring at the top of `app/api/health.py`), so the headline `status` stays at `healthy` even when this fires — the operator sees the entry in the per-check breakdown but no spurious `degraded` overall.

## Test plan

- [x] `pytest tests/test_session_collector.py` — 17 tests pass (existing 9 + new 8 covering env-set/unset, truthy variants, falsy non-skip).
- [x] `pytest tests/test_health_session_pipeline.py` — 8 tests pass (existing 4 + new 4 FIFO tests covering stuck-file, under-threshold, all-processed, env-override).

<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/keboola/agnes-the-ai-analyst/pull/229" target="_blank">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
    <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open in Devin Review">
  </picture>
</a>

<!-- devin-review-badge-end -->
157 lines
5.4 KiB
YAML
157 lines
5.4 KiB
YAML
services:
  # Main web/API process (uvicorn). Other services wait on its healthcheck.
  app:
    build: .
    # --proxy-headers + --forwarded-allow-ips make uvicorn honor the
    # X-Forwarded-Proto / X-Forwarded-Host headers any reverse proxy (Caddy,
    # nginx, Cloudflare Tunnel) sets. Without it, request.url_for() emits
    # http://localhost:8000/... even when the user is on https://, which
    # breaks OAuth callbacks (redirect_uri_mismatch). Belt-and-suspenders —
    # FORWARDED_ALLOW_IPS=* in .env does the same via env var.
    command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --proxy-headers --forwarded-allow-ips='*'
    ports:
      - "8000:8000"
    volumes:
      - data:/data
      - ./config:/app/config:ro
      # - ./custom-connectors:/app/connectors/custom:ro  # Tier A: AI-generated connectors
    env_file: .env
    environment:
      - DATA_DIR=/data
      # Steer per-call tempdirs (Snowflake UNLOAD slice staging,
      # CSV→parquet intermediates) onto the data volume. The container
      # default ``/tmp`` lives on overlayfs (boot disk), which fills
      # under multi-GiB sliced exports — see connectors/keboola/
      # storage_api.py:get_temp_root. Operators can override per
      # deployment via .env (or unset to fall back to system /tmp).
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
      # /home/*/user/sessions/ doesn't exist in the Docker layout — skip
      # the legacy session-collector to silence per-10-min "0 users, 0 files"
      # + "Group 'data-ops' not found" log noise. The bare-VM deployment
      # path leaves this unset and continues to scan + log normally.
      # Defaults to 1 here. Entries under `environment:` win over `env_file:`,
      # so the value is interpolated (same pattern as AGNES_TEMP_DIR) to let
      # an operator override it from .env or the shell instead of editing
      # this file.
      - AGNES_SKIP_LEGACY_COLLECTOR=${AGNES_SKIP_LEGACY_COLLECTOR:-1}
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8000/api/health"]
      interval: 30s
      timeout: 5s
      retries: 3
    restart: unless-stopped
    mem_limit: 4g
    mem_reservation: 1g
    cpus: 2.0

  # One-shot: run extractor then rebuild orchestrator views
  extract:
    build: .
    # Folded scalar: both lines sit at the same indent so they join into a
    # single `sh -c "..."` command line.
    command: >
      sh -c "python -m connectors.keboola.extractor &&
      python -c 'from src.orchestrator import SyncOrchestrator; print(SyncOrchestrator().rebuild())'"
    volumes:
      - data:/data
      - ./config:/app/config:ro
    env_file: .env
    environment:
      - DATA_DIR=/data
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
    profiles:
      - extract

  # Periodic job driver; starts only once `app` reports healthy.
  scheduler:
    build: .
    command: python -m services.scheduler
    volumes:
      - data:/data
      - ./config:/app/config:ro
    env_file: .env
    environment:
      - DATA_DIR=/data
      - AGNES_TEMP_DIR=${AGNES_TEMP_DIR:-/data/tmp}
      - API_URL=http://app:8000
      - SEED_ADMIN_EMAIL=${SEED_ADMIN_EMAIL:-}
      # Mirror the app service: the scheduler calls /api/admin/run-session-collector
      # over HTTP rather than running the collector in-process, but if anything
      # ever invokes the collector module from this container directly, we want
      # the same skip behavior. Bare-VM path leaves this unset.
      # Interpolated with a default of 1 so an override in .env or the shell
      # reaches this container and `app` alike.
      - AGNES_SKIP_LEGACY_COLLECTOR=${AGNES_SKIP_LEGACY_COLLECTOR:-1}
    depends_on:
      app:
        condition: service_healthy
    restart: unless-stopped
    mem_limit: 2g
    cpus: 1.0

  # Optional service, enabled with the `full` profile.
  telegram-bot:
    build: .
    command: python -m services.telegram_bot
    volumes:
      - data:/data
    env_file: .env
    environment:
      - DATA_DIR=/data
    depends_on:
      - app
    profiles:
      - full
    restart: unless-stopped

  # Optional service, enabled with the `full` profile.
  ws-gateway:
    build: .
    command: python -m services.ws_gateway
    volumes:
      - data:/data
    env_file: .env
    environment:
      - DATA_DIR=/data
    depends_on:
      - app
    profiles:
      - full
    restart: unless-stopped

  # NOTE: corporate-memory + session-collector previously ran here as
  # tight `restart: unless-stopped` boot loops behind `profiles: [full]`.
  # As of #176 the scheduler container drives both through admin HTTP
  # endpoints (/api/admin/run-corporate-memory,
  # /api/admin/run-session-collector). The verification-detector job
  # was never in compose; it now ships the same way. The app remains
  # the sole writer to system.duckdb. Operators previously running
  # COMPOSE_PROFILES=full need to drop those service stanzas from any
  # custom Compose overrides.

  # TLS reverse proxy. Corporate-CA certs mounted from /data/state/certs
  # (managed by scripts/ops/agnes-tls-rotate.sh on the VM). For local
  # development without certs, run without --profile tls and hit :8000
  # directly.
  caddy:
    image: caddy:2-alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - /data/state/certs:/certs:ro
      - caddy_data:/data
      - caddy_config:/config
      # Read-only mount of the agnes data dir so Caddy's file_server can
      # serve parquets directly (sendfile/zero-copy) and bypass the app's
      # uvicorn workers — see Caddyfile's @download handler. Mounted at
      # /srv (not /data) because /data is already the caddy_data volume.
      - data:/srv:ro
    environment:
      - DOMAIN=${DOMAIN:-localhost}
      # Passes through whatever the operator set in .env. Caddyfile uses
      # {$CADDY_TLS:tls /certs/fullchain.pem /certs/privkey.pem} so:
      #   - unset           → cert-file mode (corp PKI rotated by tls-rotate.sh)
      #   - "tls <email>"   → Let's Encrypt auto-issue
      #   - "tls internal"  → Caddy-managed self-signed
      - CADDY_TLS
    depends_on:
      app:
        condition: service_healthy
    restart: unless-stopped
    profiles:
      - tls

# Named volumes. `data` is shared by all application services and mounted
# read-only into caddy at /srv.
volumes:
  data:
  caddy_data:
  caddy_config: