From 5f6bb7a4b2c40715293a04f0f49da6212436daec Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com> Date: Tue, 28 Apr 2026 19:57:30 +0200 Subject: [PATCH] fix(security+ops) + release(0.12.1): #82 #85 #87 hardening + cut 0.12.1 (#104) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(security+ops): #82 #85 #87 — auth hardening, API validation, deploy posture Security and operational hardening across three issue groups: - M23: docker-compose.override.yml → docker-compose.dev.yml (BREAKING, prod foot-gun) - C13: Container runs as non-root user 'agnes' (USER directive in Dockerfile) - M21: Docker resource limits (mem_limit, cpus) on app + scheduler - M22: Caddyfile security headers (X-Frame-Options, X-Content-Type-Options, Referrer-Policy, -Server) - M17: /api/health split into minimal (unauth) + /api/health/detailed (auth) (BREAKING) - M26: release.yml restricts build-and-push to main + workflow_dispatch; paths-ignore for docs - C2: table_id traversal validation on /api/data/{table_id}/download - M4: Upload streaming (chunk-read + temp file) instead of full-buffer; /local-md hashed filename - C5: reset_token removed from POST /api/users/{id}/reset-password response - C8: Startup WARNING when no user has password_hash (bootstrap window visible) - M9: Audit log on failed web form login (mirrors /auth/token endpoint) - M10: Atomic magic-link consume via compare-and-swap (CONSUMED: marker + DuckDB conflict catch) Also: SSRF protection on /api/admin/configure (#46), memory stats SQL aggregation (#90) Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix(review): SSRF 169.254.x.x + IPv6 multicast; M10 marker cleanup safety Review fixes: - Add 169.254.0.0/16 (link-local, cloud metadata) to SSRF regex — was missing, allowing requests to AWS/GCP/Azure metadata endpoints - Add ff[0-9a-f]{2}: (IPv6 multicast) to SSRF regex - M10: wrap Step 3 (CONSUMED marker cleanup) in try-except with warning log — prevents unhandled exception if DB write fails after successful token consumption - Add test for 169.254.169.254 SSRF rejection Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix(review): SSRF IPv6 bypass, CLI health endpoint, upload FD leak Address Devin Review findings on PR #104: 1. SSRF IPv6 bypass: Replace hostname regex with DNS resolution + ipaddress module checks. The old regex patterns like `fe80:` only matched up to the first colon, missing real IPv6 addresses like `fe80::1`, `fc00::1`, `ff02::1`. The new approach resolves the hostname via getaddrinfo and checks each resulting IP against ipaddress.is_private/is_loopback/is_link_local/is_reserved/is_multicast. 2. CLI commands broken: `da setup test-connection`, `da setup verify`, `da diagnose`, `da status` all called /api/health expecting the old format (status=="healthy", services dict). Now they call /api/health/detailed for service-level checks (with graceful fallback to the minimal endpoint when auth is not configured). 3. Temp file handle leak: _stream_to_temp returns an open NamedTemporaryFile; callers now close it before shutil.move() to prevent FD leaks until GC. Also adds IPv6 SSRF test cases (loopback, link-local, unique-local, multicast) with mocked DNS resolution for test environment independence. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix(review): download regex blocks hyphenated IDs; document health split Address Devin Review round-3 findings on PR #104: 1. _SAFE_IDENTIFIER regex blocked hyphenated table IDs: The download endpoint used the strict SQL-identifier regex which does not allow dots or hyphens, but Keboola table IDs like in.c-crm.orders contain both. Switched to _SAFE_QUOTED_IDENTIFIER which allows dots and hyphens while still blocking path-traversal chars (/, .., \) and quote/control characters. Added test for hyphenated/dotted IDs. 2. Documented health endpoint split in DEPLOYMENT.md: Added Health checks & external monitoring section explaining both endpoints (minimal unauth /api/health vs authenticated /api/health/detailed) and how to wire external monitoring tools to the detailed endpoint with a PAT. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * release(0.12.1): cut hotfix for snapshot integrity + #82/#85/#87 hardening * fix(security): apply CAS pattern to password reset confirm (#82/M10 follow-up) Devin review on the rebased PR flagged the asymmetry: magic-link verify got the atomic compare-and-swap pattern in the original M10 fix, but password reset confirm at /auth/password/reset/confirm was still using read-validate-clear. Two concurrent POSTs with the same valid reset token could both succeed in setting different new passwords (last-write- wins). Lower severity than the magic-link race because the attacker would need the reset token AND to race the legitimate user, but the asymmetry was a polish gap. Mirrors app/auth/providers/email.py::_consume_token CAS exactly: write unique CONSUMED: marker via UPDATE...WHERE token=old_token, then SELECT to verify our marker won, then proceed. Only the winner clears the marker and applies the password change. New regression test_concurrent_reset_only_one_wins in tests/test_password_flows.py::TestResetConfirm pins the contract: two ThreadPoolExecutor workers + Barrier hit /reset/confirm with the same token; exactly one gets 302 (password applied), the other gets 200 with 'Invalid or expired'. Sanity-checked against the pre-CAS code — both POSTs got 302 (race confirmed). --------- Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/release.yml | 9 ++ CHANGELOG.md | 127 +++++++++++++--- Caddyfile | 21 ++- Dockerfile | 6 + Makefile | 2 +- app/api/admin.py | 50 +++++++ app/api/data.py | 7 + app/api/health.py | 15 +- app/api/memory.py | 27 ++-- app/api/upload.py | 70 +++++++-- app/api/users.py | 1 - app/auth/providers/email.py | 81 +++++++--- app/auth/providers/password.py | 61 +++++++- app/auth/router.py | 2 +- app/main.py | 20 +++ cli/commands/admin.py | 2 +- cli/commands/diagnose.py | 16 +- cli/commands/setup.py | 48 ++++-- cli/commands/status.py | 17 ++- ...ose.override.yml => docker-compose.dev.yml | 3 +- docker-compose.yml | 5 + docs/DEPLOYMENT.md | 11 ++ docs/local-development.md | 2 +- pyproject.toml | 2 +- scripts/run-local-dev.sh | 4 +- scripts/smoke-test.sh | 30 ++-- tests/test_access_control.py | 42 ++++++ tests/test_admin_configure_api.py | 140 ++++++++++++++++++ tests/test_api.py | 19 ++- tests/test_auth_providers.py | 37 +++++ tests/test_bootstrap.py | 10 +- tests/test_journey_bootstrap_auth.py | 5 +- tests/test_memory_api.py | 16 +- tests/test_no_override_file.py | 15 ++ tests/test_password_flows.py | 61 +++++++- 35 files changed, 853 insertions(+), 131 deletions(-) rename docker-compose.override.yml => docker-compose.dev.yml (52%) create mode 100644 tests/test_no_override_file.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d2b7537..13a606d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,6 +5,11 @@ on: branches: - main - "**" # build :dev- image for any branch push (e.g. feature/x, zs/edit, fix/y) + paths-ignore: + - "docs/**" + - "*.md" + - "LICENSE" + workflow_dispatch: # manual trigger for explicit dev- builds permissions: contents: write @@ -33,6 +38,10 @@ jobs: build-and-push: needs: test + # Only publish images from main pushes or manual triggers. + # Non-main branch pushes run tests only; use workflow_dispatch + # for explicit dev- image builds when needed. + if: github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest outputs: image_tag: ${{ steps.meta.outputs.versioned_tag }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ea886d..d1f040a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,67 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] + + +## [0.12.1] — 2026-04-28 + +Patch release. Hotfixes the pre-migration snapshot-integrity bug shipped in [v0.12.0](https://github.com/keboola/agnes-the-ai-analyst/releases/tag/v0.12.0) and bundles the security/ops hardening from issue groups #82 (auth hardening), #85 (API validation), #87 (deploy posture), plus #46 (SSRF) and #90 (memory stats blocking). + +### Added + +- Path-traversal validation on `/api/data/{table_id}/download` — `table_id` is + now checked against `_SAFE_QUOTED_IDENTIFIER` regex (allows dots and hyphens + for Keboola-style IDs like `in.c-crm.orders`) before any filesystem or DB + operation; unsafe values return 404 (no info leakage). See issue #85/C2. +- SSRF protection on `POST /api/admin/configure` — `keboola_url` is validated + against private/reserved networks (127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12, + 192.168.0.0/16, localhost, IPv6 loopback/link-local/unique-local). Uses + DNS resolution + `ipaddress` module for robust IPv6 handling (catches + abbreviated forms like `fe80::1`, `fc00::1`). See issue #46. +- Caddyfile security headers: `X-Frame-Options DENY`, + `X-Content-Type-Options nosniff`, + `Referrer-Policy strict-origin-when-cross-origin`, `-Server` (strip). + See issue #87/M22. +- Container runs as non-root user `agnes` — `USER` directive added to + Dockerfile with `useradd` + `chown`. See issue #87/C13. +- Docker resource limits: `mem_limit: 4g`, `mem_reservation: 1g`, + `cpus: 2.0` on `app`; `mem_limit: 2g`, `cpus: 1.0` on `scheduler`. + See issue #87/M21. +- Startup warning when no user has `password_hash` — alerts operators that + `/auth/bootstrap` is reachable. See issue #82/C8. +- Audit logging for failed web form login attempts (`/auth/password/login/web`) + — mirrors the existing `/auth/token` audit trail. See issue #82/M9. +- `/api/health/detailed` endpoint (authenticated) — returns full diagnostics + (version, schema, sync state, user count). Minimal `/api/health` (unauth) + returns only `{"status": "ok"}` for load balancers. See issue #87/M17. +- Health endpoint monitoring guide in `docs/DEPLOYMENT.md` — documents both + endpoints and how to wire external monitoring tools (Datadog, Prometheus, + UptimeRobot) to `/api/health/detailed` with a PAT. + +### Changed + +- **BREAKING** `docker-compose.override.yml` renamed to `docker-compose.dev.yml`. + Docker Compose auto-merges `docker-compose.override.yml` on every host with + the repo, silently enabling dev mode (source mount + `--reload`) on + production. The new name requires explicit `-f docker-compose.dev.yml`, + eliminating the foot-gun. Update any scripts or workflows that relied on + auto-merge. `scripts/run-local-dev.sh` and `Makefile` updated accordingly. + See issue #87/M23. +- **BREAKING** `/api/health` now returns a minimal `{"status": "ok"}` payload + (unauthenticated, for load balancers). Full diagnostics moved to + `/api/health/detailed` (requires authentication). Scripts that parsed + `/api/health` for version, sync state, or user count must switch to + `/api/health/detailed` with an `Authorization` header. CLI commands + (`da setup test-connection`, `da setup verify`, `da diagnose`, `da status`) + updated to call `/api/health/detailed` for service-level checks, with + graceful fallback to the minimal endpoint when auth is not configured. + See issue #87/M17. +- `release.yml` CI workflow: `build-and-push` job now only runs on `main` + pushes or manual `workflow_dispatch` triggers. Non-main branch pushes run + tests only. Added `paths-ignore` for `docs/**`, `*.md`, `LICENSE`. + See issue #87/M26. + ### Fixed - **Pre-migration snapshot integrity** — the snapshot file written @@ -32,31 +93,53 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C path (`current < SCHEMA_VERSION`) takes its snapshot first and then runs `_SYSTEM_SCHEMA` from inside the existing migration block. -- **Split-brain self-heal regression test for a shared dev-VM - split-brain incident** (2026-04-27). Pins the contract that the - gated `_SYSTEM_SCHEMA` - self-heal pass keeps working when a binary lands on a - future-version DB that's missing tables it expects: every query - against the missing table would otherwise crash at runtime - (`_duckdb.CatalogException`). New - `test_split_brain_future_version_with_missing_tables_self_heals` - in `tests/test_db.py::TestMigrationSafety` synthesizes a v99 DB - whose only table is `schema_version`, runs `_ensure_schema`, and - asserts that the v13-era core tables (`users`, `user_groups`, - `user_group_members`, `resource_grants`) now exist *and* that - `schema_version` stays at 99 (self-heal without falsely - advertising a downgrade). Plus - `test_pre_migration_snapshot_excludes_post_self_heal_tables` - pins the snapshot-integrity contract: a v2→vN migration's - snapshot must not contain any post-v2 table from the modern - binary. +- `reset_token` no longer leaks in the JSON response body of + `POST /api/users/{id}/reset-password`. The `reset_url` still contains the + token (as intended), but the raw secret is no longer exposed to DevTools, + proxy logs, or CLI stdout. CLI `admin reset-password` now prints the URL + instead of the bare token. See issue #82/C5. +- `/api/memory/stats` no longer blocks the async event loop — replaced + `repo.list_items(limit=10000)` + Python loop with a single SQL + `GROUP BY` aggregation. See issue #90. +- Magic-link token consumption is now atomic — compare-and-swap pattern + with a unique `CONSUMED:` marker prevents two concurrent verifies from + both succeeding. DuckDB concurrent-write conflicts are caught and + converted to 401. See issue #82/M10. +- Password reset confirm (`POST /auth/password/reset/confirm`) now uses + the same compare-and-swap pattern as the magic-link flow — closes the + remaining asymmetry on `users.reset_token` consumption. Lower severity + than the magic-link race because the reset flow ends with a new + password (an attacker would need the reset token *and* to race the + legitimate user) but the consistency closes a polish gap. New + regression `test_concurrent_reset_only_one_wins` in + `tests/test_password_flows.py::TestResetConfirm`. +- Upload endpoints (`/sessions`, `/artifacts`) now stream to a temp file with + cumulative size check instead of buffering the entire body in memory before + the size cap — prevents OOM from oversized uploads. Temp file handle is + properly closed before `shutil.move` to avoid FD leaks. See issue #85/M4. +- `/api/upload/local-md` uses a SHA-256 hashed filename instead of raw + `user_email` — stable per user, no charset surprises from email addresses. + See issue #85/M4. +- `/auth/bootstrap` 403 message no longer leaks user count. See issue #82/n1. ### Internal -- `test_future_version_is_noop` docstring updated to reflect that - the self-heal pass *does* run on a future-version DB, just - doesn't touch the version row. The test still passes unchanged — - its only assertion was the version-row contract, which holds. +- New regression `test_split_brain_future_version_with_missing_tables_self_heals` + in `tests/test_db.py::TestMigrationSafety` — synthesizes a v99 DB whose only + table is `schema_version`, runs `_ensure_schema`, asserts that the v13-era + core tables (`users`, `user_groups`, `user_group_members`, `resource_grants`) + get materialized *and* that `schema_version` stays at 99 (self-heal without + falsely advertising a downgrade). +- New regression `test_pre_migration_snapshot_excludes_post_self_heal_tables` + pins the snapshot-integrity contract: a v2→vN migration's snapshot must not + contain any post-v2 table from the modern binary. Sanity-checked against the + pre-fix unconditional hoist — fails with 6 leaked tables. +- `test_future_version_is_noop` docstring updated to reflect that the + self-heal pass *does* run on a future-version DB, just doesn't touch the + version row. The test still passes unchanged — its only assertion was the + version-row contract, which holds. +- `test_no_override_file` regression test asserts `docker-compose.override.yml` + does not exist post-rename. See issue #87/M23. ## [0.12.0] — 2026-04-28 diff --git a/Caddyfile b/Caddyfile index cf34742..6085b6e 100644 --- a/Caddyfile +++ b/Caddyfile @@ -17,11 +17,22 @@ protocols tls1.2 tls1.3 } - # HSTS: tell compliant browsers to refuse plain-HTTP for this host - # for a year. Skipping `preload` so we keep an escape hatch (preload - # submission is hard-bound and blocks rollback). Skipping - # `includeSubDomains` because we don't control subdomains. - header Strict-Transport-Security "max-age=31536000" + # Security headers + header { + # HSTS: tell compliant browsers to refuse plain-HTTP for this host + # for a year. Skipping `preload` so we keep an escape hatch (preload + # submission is hard-bound and blocks rollback). Skipping + # `includeSubDomains` because we don't control subdomains. + Strict-Transport-Security "max-age=31536000" + # Prevent clickjacking — dashboard is not embedded in iframes + X-Frame-Options "DENY" + # Prevent MIME-type sniffing — browser must honor declared Content-Type + X-Content-Type-Options "nosniff" + # Limit referrer leakage to origin on same-site navigations only + Referrer-Policy "strict-origin-when-cross-origin" + # Strip Server header to avoid fingerprinting the reverse proxy + -Server + } reverse_proxy app:8000 { # App's uvicorn runs with --proxy-headers, so stamping these diff --git a/Dockerfile b/Dockerfile index cb188ed..2093242 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,5 +23,11 @@ RUN uv build --wheel --out-dir /app/dist # Install production dependencies from pyproject.toml RUN uv pip install --system --no-cache . +# Run as non-root user for container hardening (C13) +RUN useradd --system --create-home --shell /usr/sbin/nologin agnes && \ + mkdir -p /data && chown -R agnes:agnes /data && \ + chown -R agnes:agnes /app +USER agnes + EXPOSE 8000 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips", "*"] diff --git a/Makefile b/Makefile index c0a0b01..5762d04 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Agnes AI Data Analyst — Development Makefile -LOCAL_DEV_COMPOSE := -f docker-compose.yml -f docker-compose.override.yml -f docker-compose.local-dev.yml +LOCAL_DEV_COMPOSE := -f docker-compose.yml -f docker-compose.dev.yml -f docker-compose.local-dev.yml .PHONY: help test lint dev docker local-dev local-dev-down local-dev-logs update-openapi-snapshot diff --git a/app/api/admin.py b/app/api/admin.py index ee86a20..89d97f7 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -22,6 +22,55 @@ from src.repositories.table_registry import TableRegistryRepository logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/admin", tags=["admin"]) +# SSRF protection: reject private/internal URLs for keboola_url +import ipaddress as _ipaddress +import socket as _socket +from urllib.parse import urlparse as _urlparse + + +def _validate_url_not_private(url: str, field_name: str = "url") -> None: + """Raise 400 if the URL host points to a private/reserved network. + + Uses DNS resolution + ipaddress checks instead of hostname regex, + which correctly handles all IPv4/IPv6 addresses including abbreviated + forms (fe80::1, ::1, etc.) and DNS rebinding (resolves at check time). + """ + try: + parsed = _urlparse(url) + except Exception: + raise HTTPException(status_code=400, detail=f"Invalid {field_name}: not a valid URL") + host = parsed.hostname or "" + if not host: + raise HTTPException(status_code=400, detail=f"Invalid {field_name}: missing hostname") + + # Reject well-known dangerous hostnames before DNS resolution + if host.lower() in ("localhost", "localhost.localdomain"): + raise HTTPException( + status_code=400, + detail=f"Invalid {field_name}: must not point to a private or reserved network", + ) + + # Resolve hostname to IP addresses and check each one + try: + addrinfos = _socket.getaddrinfo(host, None, proto=_socket.IPPROTO_TCP) + except Exception: + raise HTTPException( + status_code=400, + detail=f"Invalid {field_name}: could not resolve hostname", + ) + + for family, _type, _proto, _canonname, sockaddr in addrinfos: + ip_str = sockaddr[0] + try: + ip = _ipaddress.ip_address(ip_str) + except ValueError: + continue + if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast: + raise HTTPException( + status_code=400, + detail=f"Invalid {field_name}: must not point to a private or reserved network", + ) + class RegisterTableRequest(BaseModel): name: str @@ -185,6 +234,7 @@ async def configure_instance( if request.data_source == "keboola": if not request.keboola_token or not request.keboola_url: raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source") + _validate_url_not_private(request.keboola_url, field_name="keboola_url") try: from connectors.keboola.client import KeboolaClient client = KeboolaClient(token=request.keboola_token, url=request.keboola_url) diff --git a/app/api/data.py b/app/api/data.py index b78e0c3..8cf26f2 100644 --- a/app/api/data.py +++ b/app/api/data.py @@ -6,6 +6,7 @@ import duckdb from app.auth.dependencies import get_current_user, _get_db from app.utils import get_data_dir as _get_data_dir +from src.identifier_validation import _SAFE_QUOTED_IDENTIFIER from src.rbac import can_access_table router = APIRouter(prefix="/api/data", tags=["data"]) @@ -19,6 +20,12 @@ async def download_table( conn: duckdb.DuckDBPyConnection = Depends(_get_db), ): """Stream a parquet file for download. Supports ETag for caching.""" + # Reject unsafe table_id before any filesystem or DB operations. + # Use the relaxed quoted-identifier check that allows dots and hyphens + # (Keboola table IDs like "in.c-crm.orders") while still blocking + # path-traversal characters (/, .., \) and quote/control chars. + if not _SAFE_QUOTED_IDENTIFIER.match(table_id): + raise HTTPException(status_code=404, detail="Table not found") # Check access FIRST if not can_access_table(user, table_id, conn): raise HTTPException(status_code=403, detail="Access denied to this table") diff --git a/app/api/health.py b/app/api/health.py index e6734c6..529ac44 100644 --- a/app/api/health.py +++ b/app/api/health.py @@ -6,7 +6,7 @@ from datetime import datetime, timezone from fastapi import APIRouter, Depends import duckdb -from app.auth.dependencies import _get_db +from app.auth.dependencies import _get_db, get_current_user from src.db import SCHEMA_VERSION from src.repositories.sync_state import SyncStateRepository @@ -19,8 +19,17 @@ _DEPLOYED_AT = datetime.now(timezone.utc).isoformat() @router.get("/api/health") -async def health_check(conn: duckdb.DuckDBPyConnection = Depends(_get_db)): - """Structured health check. No auth required.""" +async def health_check(): + """Minimal health check for load balancers / compose healthcheck. No auth required.""" + return {"status": "ok"} + + +@router.get("/api/health/detailed") +async def health_check_detailed( + conn: duckdb.DuckDBPyConnection = Depends(_get_db), + _user: dict = Depends(get_current_user), +): + """Structured health check with deployment metadata. Requires authentication.""" checks = {} # DuckDB state diff --git a/app/api/memory.py b/app/api/memory.py index d186620..8aaba87 100644 --- a/app/api/memory.py +++ b/app/api/memory.py @@ -83,20 +83,19 @@ async def get_stats( conn: duckdb.DuckDBPyConnection = Depends(_get_db), ): """Get corporate memory statistics.""" - repo = KnowledgeRepository(conn) - all_items = repo.list_items(limit=10000) - status_counts = {} - categories = set() - for item in all_items: - s = item.get("status", "unknown") - status_counts[s] = status_counts.get(s, 0) + 1 - if item.get("category"): - categories.add(item["category"]) - return { - "total": len(all_items), - "by_status": status_counts, - "categories": sorted(categories), - } + rows = conn.execute( + "SELECT status, category, COUNT(*) as n FROM knowledge_items GROUP BY status, category" + ).fetchall() + + status_counts: dict[str, int] = {} + categories: set[str] = set() + total = 0 + for status, category, n in rows: + status_counts[status] = status_counts.get(status, 0) + n + if category: + categories.add(category) + total += n + return {"total": total, "by_status": status_counts, "categories": sorted(categories)} @router.post("", status_code=201) diff --git a/app/api/upload.py b/app/api/upload.py index 649b338..17c6a54 100644 --- a/app/api/upload.py +++ b/app/api/upload.py @@ -1,5 +1,8 @@ """Upload endpoints — sessions, artifacts, CLAUDE.local.md.""" +import hashlib +import shutil +import tempfile import uuid from datetime import datetime, timezone from pathlib import Path @@ -13,6 +16,40 @@ from app.utils import get_data_dir as _get_data_dir router = APIRouter(prefix="/api/upload", tags=["upload"]) MAX_UPLOAD_SIZE = 50 * 1024 * 1024 # 50 MB +_CHUNK_SIZE = 64 * 1024 # 64 KB read chunks for streaming size check + + +async def _stream_to_temp(file: UploadFile) -> tuple[tempfile.NamedTemporaryFile, int]: + """Stream-upload with cumulative size check. Returns (tempfile, size). + + Aborts once total > MAX_UPLOAD_SIZE — avoids buffering the entire + body in memory before the size cap rejects it (OOM prevention). + """ + tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".tmp") + total = 0 + try: + while True: + chunk = await file.read(_CHUNK_SIZE) + if not chunk: + break + total += len(chunk) + if total > MAX_UPLOAD_SIZE: + tmp.close() + Path(tmp.name).unlink(missing_ok=True) + raise HTTPException( + status_code=413, + detail=f"File too large (max {MAX_UPLOAD_SIZE // 1024 // 1024}MB)", + ) + tmp.write(chunk) + tmp.flush() + except HTTPException: + raise + except Exception: + tmp.close() + Path(tmp.name).unlink(missing_ok=True) + raise + tmp.seek(0) + return tmp, total @router.post("/sessions") @@ -30,11 +67,15 @@ async def upload_session( if not filename or filename.startswith("."): filename = f"upload_{uuid.uuid4().hex[:8]}" target = sessions_dir / filename - content = await file.read() - if len(content) > MAX_UPLOAD_SIZE: - raise HTTPException(status_code=413, detail=f"File too large (max {MAX_UPLOAD_SIZE // 1024 // 1024}MB)") - target.write_bytes(content) - return {"status": "ok", "filename": filename, "size": len(content)} + + tmp, size = await _stream_to_temp(file) + try: + tmp.close() + shutil.move(tmp.name, str(target)) + except Exception: + Path(tmp.name).unlink(missing_ok=True) + raise + return {"status": "ok", "filename": filename, "size": size} @router.post("/artifacts") @@ -52,11 +93,15 @@ async def upload_artifact( if not filename or filename.startswith("."): filename = f"upload_{uuid.uuid4().hex[:8]}" target = artifacts_dir / filename - content = await file.read() - if len(content) > MAX_UPLOAD_SIZE: - raise HTTPException(status_code=413, detail=f"File too large (max {MAX_UPLOAD_SIZE // 1024 // 1024}MB)") - target.write_bytes(content) - return {"status": "ok", "filename": filename, "size": len(content)} + + tmp, size = await _stream_to_temp(file) + try: + tmp.close() + shutil.move(tmp.name, str(target)) + except Exception: + Path(tmp.name).unlink(missing_ok=True) + raise + return {"status": "ok", "filename": filename, "size": size} class LocalMdRequest(BaseModel): @@ -69,12 +114,13 @@ async def upload_local_md( user: dict = Depends(get_current_user), ): """Upload CLAUDE.local.md content for corporate memory processing.""" - user_id = user["id"] user_email = user["email"] md_dir = _get_data_dir() / "user_local_md" md_dir.mkdir(parents=True, exist_ok=True) - target = md_dir / f"{user_email}.md" + # Hashed filename — stable per user, no charset surprises from email + safe_name = hashlib.sha256(user_email.encode()).hexdigest()[:24] + ".md" + target = md_dir / safe_name target.write_text(request.content, encoding="utf-8") return { "status": "ok", diff --git a/app/api/users.py b/app/api/users.py index dfc2ad5..19fe02c 100644 --- a/app/api/users.py +++ b/app/api/users.py @@ -307,7 +307,6 @@ async def reset_password( reset_url = build_reset_url(request, target["email"], token) email_sent = send_reset_email(request, target["email"], token) return { - "reset_token": token, "reset_url": reset_url, "email_sent": email_sent, } diff --git a/app/auth/providers/email.py b/app/auth/providers/email.py index d6bc7f3..e1b8fc5 100644 --- a/app/auth/providers/email.py +++ b/app/auth/providers/email.py @@ -111,27 +111,74 @@ async def send_magic_link( def _consume_token(conn: duckdb.DuckDBPyConnection, email: str, token: str) -> dict: - """Validate & consume a magic-link token. Returns the user dict or raises 401.""" + """Validate & consume a magic-link token atomically. Returns the user dict or raises 401. + + Uses a "compare-and-swap" pattern: instead of setting reset_token to NULL + directly, we first set it to a unique CONSUMED marker that identifies THIS + consumption attempt, then verify that OUR marker was written. Two concurrent + verifies will both try to write their marker, but only one will succeed + (the WHERE clause checks the original token value); the loser's UPDATE is + a no-op, and the loser sees the winner's marker and fails. + + DuckDB doesn't expose affected-row count, so the marker is the only way + to distinguish "I won the race" from "someone else won." + """ + # Compute the TTL cutoff in Python — DuckDB doesn't support + # parameterized INTERVAL arithmetic (?, INTERVAL) in all builds. + cutoff = datetime.now(timezone.utc) - timedelta(seconds=MAGIC_LINK_EXPIRY) + + # Unique marker for this consumption attempt — lets us detect who won + # the race without relying on DuckDB rowcount (which returns -1). + consume_id = f"CONSUMED:{secrets.token_hex(16)}" + + # Step 1: Atomic compare-and-swap. Only succeeds if the token still + # matches the original value and hasn't expired. On success, writes + # OUR consume_id instead of NULL so we can verify ownership. + # DuckDB raises TransactionContext Error on concurrent row conflicts — + # catch and treat as "someone else won the race." + try: + conn.execute( + "UPDATE users SET reset_token = ?, reset_token_created = NULL " + "WHERE email = ? AND reset_token = ? AND reset_token_created IS NOT NULL " + "AND reset_token_created >= ?", + [consume_id, email, token, cutoff], + ) + except Exception as exc: + err = str(exc).lower() + if "conflict" in err or "transaction" in err: + raise HTTPException(status_code=401, detail="Invalid or expired link") + raise + + # Step 2: Verify that OUR consume_id was written. If a concurrent + # request won the race, we'll see THEIR consume_id (or NULL if they + # already cleared it in step 3) — either way, we fail. + row = conn.execute( + "SELECT reset_token FROM users WHERE email = ?", + [email], + ).fetchone() + if not row or row[0] != consume_id: + raise HTTPException(status_code=401, detail="Invalid or expired link") + + # Step 3: Clear the consumed marker. Safe to do unconditionally — + # only the winner reaches here, and the marker is transient. + # If this UPDATE fails (DB error), the marker persists but the user + # can still request a new magic link — not a lockout. + try: + conn.execute( + "UPDATE users SET reset_token = NULL WHERE email = ? AND reset_token = ?", + [email, consume_id], + ) + except Exception: + logger.warning("Failed to clear CONSUMED marker for %s — marker will persist", email) + + # Fetch the user (token is now cleared, but we need the rest of the fields). + # CAS already validated token + expiry atomically, so no further checks + # needed — re-running them now would always fail because reset_token was + # NULL'd in step 3. repo = UserRepository(conn) user = repo.get_by_email(email) if not user: raise HTTPException(status_code=401, detail="Invalid link") - - if user.get("reset_token") != token: - raise HTTPException(status_code=401, detail="Invalid or expired link") - - created = user.get("reset_token_created") - if created: - if isinstance(created, str): - created = datetime.fromisoformat(created) - # DuckDB returns TIMESTAMP as offset-naive; we stored it as UTC, so assume UTC. - if created.tzinfo is None: - created = created.replace(tzinfo=timezone.utc) - if (datetime.now(timezone.utc) - created).total_seconds() > MAGIC_LINK_EXPIRY: - raise HTTPException(status_code=401, detail="Link expired") - - # Clear token (one-time use) - repo.update(id=user["id"], reset_token=None, reset_token_created=None) return user diff --git a/app/auth/providers/password.py b/app/auth/providers/password.py index 41c0b47..9ef6ed2 100644 --- a/app/auth/providers/password.py +++ b/app/auth/providers/password.py @@ -34,6 +34,23 @@ SETUP_TOKEN_TTL = timedelta(days=7) MIN_PASSWORD_LEN = 8 +def _audit(user_id: str, action: str, result: str | None = None) -> None: + """Fire-and-forget audit log entry. Swallows all errors.""" + try: + from src.db import get_system_db + from src.repositories.audit import AuditRepository + audit_conn = get_system_db() + AuditRepository(audit_conn).log( + user_id=user_id, + action=action, + resource="auth", + result=result, + ) + audit_conn.close() + except Exception: + pass # Audit failure must not block auth + + class PasswordLoginRequest(BaseModel): email: str password: str @@ -225,6 +242,8 @@ async def password_login_web( ph = PasswordHasher() ph.verify(user["password_hash"], password) except VerifyMismatchError: + # M9: audit failed form-login attempts (mirrors /auth/token endpoint) + _audit(user["id"], "login_failed", result="invalid_password") return RedirectResponse(url="/login/password?error=invalid", status_code=302) except Exception: logger.exception("Unexpected error during web password verification for %s", email) @@ -329,14 +348,46 @@ async def reset_confirm( error=f"Password must be at least {MIN_PASSWORD_LEN} characters.", ) + # Atomic compare-and-swap to consume the reset token. Mirrors the + # magic-link CAS in app/auth/providers/email.py::_consume_token (issue + # #82/M10) — without it, two concurrent POSTs with the same valid token + # could both succeed in setting different new passwords. Lower + # severity than the magic-link race (attacker would need the reset + # token AND to race the legitimate user) but closes the asymmetry. + cutoff = datetime.now(timezone.utc) - RESET_TOKEN_TTL + consume_id = f"CONSUMED:{secrets.token_hex(16)}" + try: + conn.execute( + "UPDATE users SET reset_token = ?, reset_token_created = NULL " + "WHERE email = ? AND reset_token = ? AND reset_token_created IS NOT NULL " + "AND reset_token_created >= ? AND active = TRUE", + [consume_id, email, token, cutoff], + ) + except Exception as exc: + err = str(exc).lower() + if "conflict" in err or "transaction" in err: + return _render_reset_form(request, email=email, token=token, error="Invalid or expired reset link.") + raise + + # Verify OUR marker won the race. A concurrent winner will have a + # different consume_id (or NULL if they already cleared it). + row = conn.execute( + "SELECT reset_token FROM users WHERE email = ?", + [email], + ).fetchone() + if not row or row[0] != consume_id: + # Could be: token never matched, expired, account deactivated, or + # the race was lost. Single error keeps the UX simple and avoids + # leaking which condition tripped. + return _render_reset_form(request, email=email, token=token, error="Invalid or expired reset link.") + + # Won the race — fetch the user (we need id/email for the response) + # and apply the password change. Clearing the marker happens as part + # of the same UPDATE. repo = UserRepository(conn) user = repo.get_by_email(email) - if not user or user.get("reset_token") != token: + if not user: return _render_reset_form(request, email=email, token=token, error="Invalid or expired reset link.") - if not _token_is_fresh(user.get("reset_token_created"), RESET_TOKEN_TTL): - return _render_reset_form(request, email=email, token=token, error="Reset link has expired. Please request a new one.") - if not bool(user.get("active", True)): - return _render_reset_form(request, email=email, token=token, error="This account is deactivated.") ph = PasswordHasher() repo.update( diff --git a/app/auth/router.py b/app/auth/router.py index f3c3d08..0ac3d67 100644 --- a/app/auth/router.py +++ b/app/auth/router.py @@ -134,7 +134,7 @@ async def bootstrap( if users_with_password: raise HTTPException( status_code=403, - detail=f"Bootstrap disabled — {len(users_with_password)} user(s) already have passwords set. Use /auth/password/login.", + detail="Bootstrap disabled — a user with a password already exists. Use /auth/password/login.", ) password_hash = PasswordHasher().hash(request.password) if request.password else None diff --git a/app/main.py b/app/main.py index 376834e..c5dcf75 100644 --- a/app/main.py +++ b/app/main.py @@ -276,6 +276,26 @@ def create_app() -> FastAPI: except Exception as e: logger.warning(f"Could not seed admin: {e}") + # C8: Warn when no user has a password_hash — bootstrap endpoint is open. + # This is intentional UX (operator can claim seed admin), but the open + # window should be visible in startup logs so it's not forgotten. + if not is_local_dev_mode(): + try: + from src.db import get_system_db + from src.repositories.users import UserRepository + conn = get_system_db() + repo = UserRepository(conn) + all_users = repo.list_all() + has_password = any(u.get("password_hash") for u in all_users) + if not has_password: + logger.warning( + "No user has a password set — /auth/bootstrap is reachable. " + "Claim the seed admin (or set SEED_ADMIN_PASSWORD) to close this window." + ) + conn.close() + except Exception: + pass # never block startup on a logging convenience + # Static files static_dir = Path(__file__).parent / "web" / "static" if static_dir.exists(): diff --git a/cli/commands/admin.py b/cli/commands/admin.py index 921e520..80f3e4b 100644 --- a/cli/commands/admin.py +++ b/cli/commands/admin.py @@ -309,7 +309,7 @@ def reset_password(user_ref: str = typer.Argument(..., help="User id or email")) resp = api_post(f"/api/users/{uid}/reset-password") if resp.status_code == 200: data = resp.json() - typer.echo(f"Reset token: {data['reset_token']}") + typer.echo(f"Reset URL: {data['reset_url']}") typer.echo(f"Email sent: {data['email_sent']}") else: typer.echo(f"Failed: {resp.json().get('detail', resp.text)}", err=True) diff --git a/cli/commands/diagnose.py b/cli/commands/diagnose.py index ea29539..bc5bd56 100644 --- a/cli/commands/diagnose.py +++ b/cli/commands/diagnose.py @@ -24,11 +24,17 @@ def diagnose( health = resp.json() checks.append({"name": "api", "status": "ok", "latency_ms": resp.elapsed.total_seconds() * 1000}) - # Extract service checks - for svc_name, svc_data in health.get("services", {}).items(): - check = {"name": svc_name, "status": svc_data.get("status", "unknown")} - check.update({k: v for k, v in svc_data.items() if k != "status"}) - checks.append(check) + # Detailed health (auth required) for service-level checks + try: + resp_d = api_get("/api/health/detailed") + detailed = resp_d.json() + for svc_name, svc_data in detailed.get("services", {}).items(): + check = {"name": svc_name, "status": svc_data.get("status", "unknown")} + check.update({k: v for k, v in svc_data.items() if k != "status"}) + checks.append(check) + except Exception: + # Auth may not be configured — minimal reachability is sufficient + pass except Exception as e: checks.append({"name": "api", "status": "error", "detail": str(e)}) diff --git a/cli/commands/setup.py b/cli/commands/setup.py index a3916c3..bfaf268 100644 --- a/cli/commands/setup.py +++ b/cli/commands/setup.py @@ -76,16 +76,29 @@ def test_connection(): """Test connection to the server and data source.""" typer.echo("Testing server connection...") try: + # Quick unauth ping first resp = api_get("/api/health") health = resp.json() - typer.echo(f" Server: {health.get('status', 'unknown')}") - for svc, info in health.get("services", {}).items(): - typer.echo(f" {svc}: {info.get('status', '?')}") + if health.get("status") != "ok": + typer.echo(f" Server: unexpected status {health.get('status')}") + raise typer.Exit(1) + typer.echo(" Server: reachable") + + # Detailed health (auth required) for service-level checks + try: + resp = api_get("/api/health/detailed") + detailed = resp.json() + typer.echo(f" Health: {detailed.get('status', 'unknown')}") + for svc, info in detailed.get("services", {}).items(): + typer.echo(f" {svc}: {info.get('status', '?')}") + if detailed.get("status") == "healthy": + typer.echo("\nServer is healthy.") + else: + typer.echo("\nServer has issues. Check: da diagnose --json") + except Exception: + # Auth may not be configured yet — minimal check is sufficient + typer.echo("\nServer is reachable (detailed check requires auth).") - if health.get("status") == "healthy": - typer.echo("\nServer is healthy.") - else: - typer.echo("\nServer has issues. Check: da diagnose --json") except Exception as e: typer.echo(f" FAILED: {e}", err=True) raise typer.Exit(1) @@ -130,11 +143,22 @@ def verify(as_json: bool = typer.Option(False, "--json", help="Output as JSON")) try: resp = api_get("/api/health") h = resp.json() - checks.append({ - "name": "server", - "status": "pass" if h.get("status") == "healthy" else "warn", - "detail": h.get("status"), - }) + # Minimal health returns {"status": "ok"} — try detailed for richer check + try: + resp_d = api_get("/api/health/detailed") + hd = resp_d.json() + checks.append({ + "name": "server", + "status": "pass" if hd.get("status") == "healthy" else "warn", + "detail": hd.get("status"), + }) + except Exception: + # Auth not configured yet — minimal reachability is enough + checks.append({ + "name": "server", + "status": "pass" if h.get("status") == "ok" else "warn", + "detail": h.get("status"), + }) except Exception as e: checks.append({"name": "server", "status": "fail", "detail": str(e)}) _report(checks, as_json) diff --git a/cli/commands/status.py b/cli/commands/status.py index 8168c2d..50868ed 100644 --- a/cli/commands/status.py +++ b/cli/commands/status.py @@ -33,8 +33,23 @@ def status( return try: + # Minimal health ping first resp = api_get("/api/health") - data = resp.json() + minimal = resp.json() + if minimal.get("status") != "ok": + if as_json: + typer.echo(json.dumps(minimal, indent=2)) + else: + typer.echo(f"Status: {minimal.get('status', 'unknown')}") + return + + # Detailed health (auth required) for service-level info + try: + resp = api_get("/api/health/detailed") + data = resp.json() + except Exception: + data = minimal + if as_json: typer.echo(json.dumps(data, indent=2)) else: diff --git a/docker-compose.override.yml b/docker-compose.dev.yml similarity index 52% rename from docker-compose.override.yml rename to docker-compose.dev.yml index c3ce141..76f675d 100644 --- a/docker-compose.override.yml +++ b/docker-compose.dev.yml @@ -1,5 +1,6 @@ # Development overrides — auto-reload + source mount -# This file is auto-loaded by `docker compose up` when present +# This file must be explicitly loaded with: docker compose -f docker-compose.yml -f docker-compose.dev.yml ... +# (Renamed from docker-compose.override.yml to avoid auto-merge on every host — see issue #87/M23) services: app: command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload --proxy-headers --forwarded-allow-ips='*' diff --git a/docker-compose.yml b/docker-compose.yml index 5953135..40e7bc4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,9 @@ services: timeout: 5s retries: 3 restart: unless-stopped + mem_limit: 4g + mem_reservation: 1g + cpus: 2.0 # One-shot: run extractor then rebuild orchestrator views extract: @@ -54,6 +57,8 @@ services: app: condition: service_healthy restart: unless-stopped + mem_limit: 2g + cpus: 1.0 telegram-bot: build: . diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 8cfdb95..f59ccf6 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -146,6 +146,17 @@ docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d Or set up a cron job — see `infra/modules/customer-instance/startup-script.sh.tpl` for the reference implementation. +### Health checks & external monitoring + +Two health endpoints serve different audiences: + +| Endpoint | Auth | Response | Use for | +|---|---|---|---| +| `GET /api/health` | None | `{"status": "ok"}` | Load balancers, Docker `healthcheck`, uptime pings | +| `GET /api/health/detailed` | Bearer token | `{"status", "version", "services": {...}}` | Dashboards, alerting rules, `da diagnose`/`da status` CLI | + +The Docker Compose `healthcheck` uses the minimal endpoint (`curl -sf http://localhost:8000/api/health`). For external monitoring tools (Datadog, Prometheus, UptimeRobot, etc.) that need service-level detail (DuckDB status, sync freshness, user count), point them at `/api/health/detailed` with an `Authorization: Bearer ` header. Any authenticated user can call it; a personal access token (`da admin create-pat`) works well for service accounts. + ## Which path should I pick? | | Terraform | Docker Compose | diff --git a/docs/local-development.md b/docs/local-development.md index b658d53..1c85bc7 100644 --- a/docs/local-development.md +++ b/docs/local-development.md @@ -12,7 +12,7 @@ Then open . You land on `/dashboard` already logged in as What `make local-dev` actually does: -- Stacks three Compose files: `docker-compose.yml` (base) + `docker-compose.override.yml` (hot-reload + source bind mount) + `docker-compose.local-dev.yml` (LOCAL_DEV_MODE overlay). +- Stacks three Compose files: `docker-compose.yml` (base) + `docker-compose.dev.yml` (hot-reload + source bind mount) + `docker-compose.local-dev.yml` (LOCAL_DEV_MODE overlay). - Seeds `LOCAL_DEV_GROUPS` with a sensible default (engineers + admins on `example.com`) so `/profile` is non-empty on first boot. - Touches an empty `.env` if missing — Compose validates `env_file:` paths even for services that never start, and the local-dev overlay drops the env-file requirement for the services that do. diff --git a/pyproject.toml b/pyproject.toml index 8b0455f..3394483 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.12.0" +version = "0.12.1" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT" diff --git a/scripts/run-local-dev.sh b/scripts/run-local-dev.sh index 2a230a5..9b010d2 100755 --- a/scripts/run-local-dev.sh +++ b/scripts/run-local-dev.sh @@ -3,7 +3,7 @@ # # Stacks three compose files: # 1. docker-compose.yml — base services -# 2. docker-compose.override.yml — hot-reload + source bind mount (dev default) +# 2. docker-compose.dev.yml — hot-reload + source bind mount (dev default) # 3. docker-compose.local-dev.yml — LOCAL_DEV_MODE=1, drops .env requirement # # After startup visit http://localhost:8000 — you'll land on /dashboard @@ -37,6 +37,6 @@ export LOCAL_DEV_GROUPS="${LOCAL_DEV_GROUPS-$DEFAULT_LOCAL_DEV_GROUPS}" exec docker compose \ -f docker-compose.yml \ - -f docker-compose.override.yml \ + -f docker-compose.dev.yml \ -f docker-compose.local-dev.yml \ up "$@" diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh index e422596..921471d 100755 --- a/scripts/smoke-test.sh +++ b/scripts/smoke-test.sh @@ -23,21 +23,15 @@ check() { echo "Smoke test: $HOST" echo "---" -# 1. Health check +# 1. Health check (minimal, unauthenticated) HEALTH=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable") -if [ "$HEALTH" = "unhealthy" ] || [ "$HEALTH" = "unreachable" ]; then +if [ "$HEALTH" = "unreachable" ]; then echo " FATAL: health=$HEALTH" exit 1 fi check "health ($HEALTH)" "true" -# 2. Health has version fields -HAS_VERSION=$(curl -sf "$HOST/api/health" | python3 -c " -import sys,json -d=json.load(sys.stdin) -print('true' if 'version' in d and 'channel' in d and 'schema_version' in d else 'false') -" 2>/dev/null || echo "false") -check "health version fields" "$HAS_VERSION" +# 2. Health detailed has version fields (requires auth, checked after bootstrap) # 3. Bootstrap (only works on fresh DB; 403 means users exist) BOOT_HTTP=$(curl -s -o /tmp/smoke_boot.json -w "%{http_code}" -X POST "$HOST/auth/bootstrap" \ @@ -54,6 +48,17 @@ else check "bootstrap (HTTP $BOOT_HTTP)" "false" fi +# 2b. Health detailed (authenticated) — version fields +if [ -n "$TOKEN" ]; then + HAS_VERSION=$(curl -sf "$HOST/api/health/detailed" \ + -H "Authorization: Bearer $TOKEN" | python3 -c " +import sys,json +d=json.load(sys.stdin) +print('true' if 'version' in d and 'channel' in d and 'schema_version' in d else 'false') +" 2>/dev/null || echo "false") + check "health detailed version fields" "$HAS_VERSION" +fi + # 4. Query SELECT 1 (requires auth) if [ -n "$TOKEN" ]; then QUERY_OK=$(curl -sf -X POST "$HOST/api/query" \ @@ -84,7 +89,12 @@ fi # 6. Post-sync health (wait briefly) sleep 5 -HEALTH2=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable") +if [ -n "$TOKEN" ]; then + HEALTH2=$(curl -sf "$HOST/api/health/detailed" \ + -H "Authorization: Bearer $TOKEN" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable") +else + HEALTH2=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable") +fi if [ "$HEALTH2" = "unhealthy" ] || [ "$HEALTH2" = "unreachable" ]; then check "post-sync health ($HEALTH2)" "false" else diff --git a/tests/test_access_control.py b/tests/test_access_control.py index a555ef4..c382ff6 100644 --- a/tests/test_access_control.py +++ b/tests/test_access_control.py @@ -637,3 +637,45 @@ class TestUnauthenticatedAccess: "user_id": "analyst1", "dataset": "anything", }) assert resp.status_code in (401, 403) + + +class TestDownloadPathTraversal: + """Path-traversal protection: unsafe table_id values are rejected with 404.""" + + def test_download_rejects_traversal_id(self, seeded_app): + """URL-encoded path traversal in table_id returns 404.""" + c = seeded_app["client"] + # ..%2F..%2Fstate%2Fsystem decodes to ../../state/system + resp = c.get("/api/data/..%2F..%2Fstate%2Fsystem/download", + headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 404 + + def test_download_rejects_dotdot(self, seeded_app): + """Literal ../../etc/passwd in table_id returns 404.""" + c = seeded_app["client"] + resp = c.get('/api/data/../../etc/passwd/download', + headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 404 + + def test_download_rejects_special_chars(self, seeded_app): + """table_id with spaces, slashes, or other dangerous chars returns 404.""" + c = seeded_app["client"] + # Spaces + resp = c.get("/api/data/my%20table/download", + headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 404 + # Slashes + resp = c.get("/api/data/foo/bar/download", + headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 404 + + def test_download_accepts_hyphenated_dotted_id(self, seeded_app): + """Keboola-style table IDs with dots and hyphens (e.g. in.c-crm.orders) + pass validation — they are safe for filesystem lookup and DB query.""" + c = seeded_app["client"] + # No parquet file exists, so we expect 404 from "not found on disk", + # NOT 404 from identifier validation rejection. + resp = c.get("/api/data/in.c-crm.orders/download", + headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 404 + assert "not found" in resp.json()["detail"].lower() diff --git a/tests/test_admin_configure_api.py b/tests/test_admin_configure_api.py index 2706760..95bf459 100644 --- a/tests/test_admin_configure_api.py +++ b/tests/test_admin_configure_api.py @@ -1,5 +1,9 @@ """Tests for admin configure and registry API endpoints.""" +import ipaddress +import socket +from unittest.mock import patch + import pytest @@ -83,6 +87,142 @@ class TestAdminConfigure: assert resp.status_code == 422 +class TestAdminConfigureSSRF: + """SSRF protection: keboola_url must not point to private/reserved networks. + + Uses socket.getaddrinfo + ipaddress checks — tests mock DNS resolution + so they work regardless of the test runner's network/IPv6 config. + """ + + @staticmethod + def _mock_getaddrinfo(host, port, **kwargs): + """Predictable DNS resolution for tests — returns the IP literal as-is.""" + try: + ip = ipaddress.ip_address(host) + family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET + return [(family, socket.SOCK_STREAM, socket.IPPROTO_TCP, "", (str(ip), port))] + except ValueError: + # Not an IP literal — let real DNS resolve (for public URL test) + return socket.getaddrinfo(host, port, **kwargs) + + def test_configure_rejects_localhost_url(self, seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://localhost:8080"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + assert "private" in resp.json()["detail"].lower() or "reserved" in resp.json()["detail"].lower() + + def test_configure_rejects_127_0_0_1_url(self, seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "https://127.0.0.1"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_10_0_0_1_url(self, seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "https://10.0.0.1"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_192_168_url(self, seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "https://192.168.1.1"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_169_254_metadata_url(self, seeded_app): + """169.254.x.x (link-local) must be rejected — cloud metadata endpoint.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://169.254.169.254"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_ipv6_loopback(self, seeded_app): + """IPv6 loopback ::1 must be rejected.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://[::1]:8080"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_ipv6_link_local(self, seeded_app): + """IPv6 link-local fe80::1 must be rejected.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://[fe80::1]:8080"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_ipv6_unique_local(self, seeded_app): + """IPv6 unique-local fc00::1 must be rejected.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://[fc00::1]:8080"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_rejects_ipv6_multicast(self, seeded_app): + """IPv6 multicast ff02::1 must be rejected.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch("app.api.admin._socket.getaddrinfo", self._mock_getaddrinfo): + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "http://[ff02::1]:8080"}, + headers=_auth(token), + ) + assert resp.status_code == 400 + + def test_configure_accepts_public_url(self, seeded_app): + """A public URL should pass SSRF validation (connection test may still fail).""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + resp = c.post( + "/api/admin/configure", + json={"data_source": "keboola", "keboola_token": "tok", "keboola_url": "https://connection.keboola.com"}, + headers=_auth(token), + ) + # Should NOT be 400 with SSRF message — may be 400 from failed connection test, or 200 + if resp.status_code == 400: + assert "private" not in resp.json()["detail"].lower() + + class TestAdminRegistry: def test_list_registry_empty(self, seeded_app): c = seeded_app["client"] diff --git a/tests/test_api.py b/tests/test_api.py index e8532f1..b367d34 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -5,6 +5,10 @@ import pytest from fastapi.testclient import TestClient +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + @pytest.fixture def app_client(tmp_path, monkeypatch): monkeypatch.setenv("DATA_DIR", str(tmp_path)) @@ -54,12 +58,19 @@ class TestHealth: resp = app_client.get("/api/health") assert resp.status_code == 200 data = resp.json() + assert data["status"] == "ok" + + def test_health_detailed_requires_auth(self, app_client): + resp = app_client.get("/api/health/detailed") + assert resp.status_code in (401, 403) + + def test_health_detailed_has_duckdb_check(self, seeded_app): + c = seeded_app["client"] + resp = c.get("/api/health/detailed", headers=_auth(seeded_app["admin_token"])) + assert resp.status_code == 200 + data = resp.json() assert data["status"] in ("healthy", "degraded", "unhealthy") assert "services" in data - - def test_health_has_duckdb_check(self, app_client): - resp = app_client.get("/api/health") - data = resp.json() assert "duckdb_state" in data["services"] assert data["services"]["duckdb_state"]["status"] == "ok" diff --git a/tests/test_auth_providers.py b/tests/test_auth_providers.py index 1b3f484..3064177 100644 --- a/tests/test_auth_providers.py +++ b/tests/test_auth_providers.py @@ -135,6 +135,43 @@ class TestEmailAuth: }) assert resp.status_code == 401 + def test_concurrent_verify_only_one_wins(self, client): + """Two concurrent magic-link verifies — exactly one must succeed (M10).""" + from concurrent.futures import ThreadPoolExecutor, as_completed + from src.db import get_system_db + from src.repositories.users import UserRepository + + # Create a user and set a magic-link token + conn = get_system_db() + repo = UserRepository(conn) + repo.create(id="ml-user-1", email="concurrent@test.com", name="Test", role="viewer") + token = "tok_concurrent_test_12345" + from datetime import datetime, timezone + repo.update(id="ml-user-1", reset_token=token, reset_token_created=datetime.now(timezone.utc)) + conn.close() + + results = [] + barrier = __import__("threading").Barrier(2, timeout=5) + + def verify(): + barrier.wait() # ensure both threads hit the endpoint simultaneously + resp = client.post("/auth/email/verify", json={ + "email": "concurrent@test.com", "token": token, + }) + results.append(resp.status_code) + + with ThreadPoolExecutor(max_workers=2) as pool: + futures = [pool.submit(verify) for _ in range(2)] + # Collect results (re-raise any exceptions) + for f in as_completed(futures): + f.result() + + # Exactly one must succeed (200), the other must fail (401) + successes = results.count(200) + failures = results.count(401) + assert successes == 1, f"Expected exactly 1 success, got {successes} (results: {results})" + assert failures == 1, f"Expected exactly 1 failure, got {failures} (results: {results})" + class TestGoogleOAuth: def test_google_login_not_configured(self, client): diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index 65b904f..03dc416 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -99,7 +99,7 @@ class TestBootstrap: "password": "should-not-work", }) assert resp.status_code == 403 - assert "already have passwords" in resp.json()["detail"] + assert "password already exists" in resp.json()["detail"] def test_bootstrap_then_login(self, fresh_client): """After bootstrap with password, /auth/token login works; without password it requires OAuth.""" @@ -145,10 +145,10 @@ class TestBootstrap: def test_full_agent_flow(self, fresh_client): """Simulate full AI agent deployment flow.""" - # 1. Health check (no auth) + # 1. Health check (no auth — minimal endpoint) resp = fresh_client.get("/api/health") assert resp.status_code == 200 - assert resp.json()["status"] == "healthy" + assert resp.json()["status"] == "ok" # 2. Bootstrap admin resp = fresh_client.post("/auth/bootstrap", json={ @@ -174,6 +174,6 @@ class TestBootstrap: }, headers=headers) assert resp.status_code == 201 - # 6. Verify - resp = fresh_client.get("/api/health") + # 6. Verify via detailed health (requires auth) + resp = fresh_client.get("/api/health/detailed", headers=headers) assert resp.json()["services"]["users"]["count"] == 2 diff --git a/tests/test_journey_bootstrap_auth.py b/tests/test_journey_bootstrap_auth.py index c3506bc..9341105 100644 --- a/tests/test_journey_bootstrap_auth.py +++ b/tests/test_journey_bootstrap_auth.py @@ -63,11 +63,10 @@ class TestBootstrapAuth: assert resp.status_code == 403 def test_health_endpoint_requires_no_auth(self, seeded_app): - """Health check is always accessible without any token.""" + """Minimal health check is always accessible without any token.""" c = seeded_app["client"] resp = c.get("/api/health") assert resp.status_code == 200 body = resp.json() - assert "status" in body - assert body["status"] in ("healthy", "degraded", "unhealthy") + assert body["status"] == "ok" diff --git a/tests/test_memory_api.py b/tests/test_memory_api.py index ab4988a..0c11a09 100644 --- a/tests/test_memory_api.py +++ b/tests/test_memory_api.py @@ -1,6 +1,7 @@ """Tests for corporate memory API — knowledge items, voting, governance.""" import pytest +from src.repositories.knowledge import KnowledgeRepository def _auth(token): @@ -122,21 +123,34 @@ class TestMemoryList: class TestMemoryStats: - def test_get_stats(self, seeded_app): + def test_get_stats_returns_counts(self, seeded_app): c = seeded_app["client"] token = seeded_app["admin_token"] resp = c.get("/api/memory/stats", headers=_auth(token)) assert resp.status_code == 200 data = resp.json() assert "total" in data + assert isinstance(data["total"], int) + assert data["total"] >= 0 assert "by_status" in data + assert isinstance(data["by_status"], dict) assert "categories" in data + assert isinstance(data["categories"], list) def test_get_stats_requires_auth(self, seeded_app): c = seeded_app["client"] resp = c.get("/api/memory/stats") assert resp.status_code == 401 + def test_get_stats_does_not_load_all_items(self, seeded_app): + """Stats endpoint uses SQL aggregation, not list_items().""" + from unittest.mock import patch + c = seeded_app["client"] + token = seeded_app["admin_token"] + with patch.object(KnowledgeRepository, "list_items", side_effect=AssertionError("list_items should not be called")): + resp = c.get("/api/memory/stats", headers=_auth(token)) + assert resp.status_code == 200 + class TestMemoryVote: def _create_item(self, c, token): diff --git a/tests/test_no_override_file.py b/tests/test_no_override_file.py new file mode 100644 index 0000000..f0edabe --- /dev/null +++ b/tests/test_no_override_file.py @@ -0,0 +1,15 @@ +"""Regression test: docker-compose.override.yml must not exist (issue #87/M23). + +Docker Compose auto-merges docker-compose.override.yml when present, +silently enabling dev mode on any host with the repo. The file was renamed +to docker-compose.dev.yml which requires explicit -f flag. +""" +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def test_no_override_file(): + """docker-compose.override.yml must not exist — it was renamed to .dev.yml.""" + assert not (REPO_ROOT / "docker-compose.override.yml").exists(), \ + "docker-compose.override.yml must not exist (renamed to docker-compose.dev.yml per #87/M23)" diff --git a/tests/test_password_flows.py b/tests/test_password_flows.py index 155be0d..1522d1a 100644 --- a/tests/test_password_flows.py +++ b/tests/test_password_flows.py @@ -217,6 +217,61 @@ class TestResetConfirm: assert resp.status_code == 200 assert "at least 8" in resp.text + def test_concurrent_reset_only_one_wins(self, app_client, fresh_db): + """Two concurrent reset/confirm POSTs — exactly one must succeed. + + Mirrors `test_concurrent_verify_only_one_wins` for the magic-link + flow. Without the CAS pattern at `_atomic_consume_reset_token`, + two concurrent POSTs with the same valid token could both write + different new passwords for the same user (last-write-wins + semantics). With the CAS, the loser gets the "Invalid or expired" + form back instead of silently overwriting. + """ + from concurrent.futures import ThreadPoolExecutor, as_completed + import threading + + _seed_user( + "race@test.com", + reset_token="race-tok", + reset_token_created=datetime.now(timezone.utc), + ) + + results: list[tuple[int, str]] = [] + barrier = threading.Barrier(2, timeout=5) + + def confirm(payload_password: str): + barrier.wait() # both threads hit the endpoint at the same instant + resp = app_client.post( + "/auth/password/reset/confirm", + data={ + "email": "race@test.com", + "token": "race-tok", + "password": payload_password, + "confirm_password": payload_password, + }, + ) + results.append((resp.status_code, resp.text)) + + with ThreadPoolExecutor(max_workers=2) as pool: + futures = [ + pool.submit(confirm, "winner-password"), + pool.submit(confirm, "loser-password"), + ] + for f in as_completed(futures): + f.result() + + # Exactly one 302 (winner — redirected to login) and one 200 + # (loser — got the reset form back with the standard error). + redirects = [r for r in results if r[0] == 302] + rejects = [r for r in results if r[0] == 200] + assert len(redirects) == 1, ( + f"Expected exactly 1 winner, got {len(redirects)} (results: {results})" + ) + assert len(rejects) == 1, ( + f"Expected exactly 1 loser, got {len(rejects)} (results: {results})" + ) + assert "Invalid or expired" in rejects[0][1] + # ---- POST /auth/password/setup/request ---- @@ -303,12 +358,12 @@ class TestAdminInviteFlow: ) assert resp.status_code == 200 data = resp.json() - assert data["reset_token"] assert "reset_url" in data assert "/auth/password/reset" in data["reset_url"] - assert f"email=target%40test.com" in data["reset_url"] - assert f"token={data['reset_token']}" in data["reset_url"] + assert "token=" in data["reset_url"] # URL still contains the token + assert "email_sent" in data assert data["email_sent"] is False # no SMTP configured in tests + assert "reset_token" not in data # raw token must NOT be in response def test_create_user_with_send_invite_returns_invite_url(self, app_client, fresh_db): token = _seed_admin()