agnes-the-ai-analyst/tests/test_security.py
ZdenekSrotyr e9d7af3cce feat(rbac+marketplace): RBAC v13 + Claude Code marketplace + #81/#83/#44 hardening
This squashes 13 commits from ma/staging plus a small docstring translation
into a single coherent unit. Three workstreams.

== RBAC v13 redesign ==
- Drops core.viewer/analyst/km_admin/admin hierarchy and the
  internal_roles / group_mappings / user_role_grants / plugin_access tables.
- Replaced by user_group_members + resource_grants. Atomic v12→v13 backfill
  wrapped in BEGIN/COMMIT; ROLLBACK leaves schema_version at 12 for retry.
- Two authorization primitives in app.auth.access:
    require_admin                        — Admin-group god-mode
    require_resource_access(rt, "{path}") — entity-scoped grants
  Single DB lookup per request; no session cache; no implies BFS.
- /admin/access UI (single page) replaces /admin/role-mapping +
  /admin/plugin-access. CLI `da admin group/grant *` replaces
  `da admin role/mapping/grant-role/revoke-role/effective-roles`.
- ResourceType.TABLE listing-only — admins can record table grants,
  runtime enforcement still flows through legacy dataset_permissions
  (migration plan in docs/TODO-rbac-data-enforcement.md).

== Claude Code marketplace ==
- Aggregated /marketplace.zip + /marketplace.git/* (PAT-gated,
  RBAC-filtered, content-addressed cache via dulwich).
- Admin god-mode dropped on the marketplace surface — admins curate
  their own view via grants like everyone else.
- Bare-repo cache materializes per RBAC-filtered ETag; stale entries
  not pruned in this iteration (disclaimed in git_backend.py docstring).

== #81 #83 #44 security/ops hardening ==
- #81 Group A — orchestrator ATTACH allow-listing (extension/url/alias).
- #81 Group B — Keboola extractor 3-state exit codes:
    0 success / 1 total fail / 2 PARTIAL fail
  Sync API logs PARTIAL FAILURE alert on exit 2. Operators with binary
  alerting must teach it the new partial signal.
- #81 Group C — schema v10 view_ownership; rejects silent overwrite
  of a prior connector's view name on collision.
- #81 Group D — extractor-side identifier validation.
- #83 — Jira webhook fail-closed when JIRA_WEBHOOK_SECRET unset
  + path-traversal fix.
- #44 — entire /api/scripts/* surface is admin-only (planted-script +
  sandbox-bypass risk closed).

== Web UI polish + deploy fix ==
- /admin/access: live grant-count badges (no stale snapshot revert),
  shared-header CSS link added to /catalog and /admin/{tables,permissions},
  per-resource-type colored stripes.
- docker-compose.host-mount.yml: bind,rbind so dual-disk hosts don't
  silently shadow sub-mounts and write state to the wrong disk.

== OSS vendor-neutralization (waves 1+2) ==
- scripts/grpn/ → scripts/ops/. Customer-specific identifiers
  (project IDs, internal hostnames, dev/prod VM IPs, brand names)
  replaced with placeholders across code, docs, Terraform, Caddyfile,
  OAuth probe, and planning docs. Downstream infra repos that copied
  scripts/grpn/agnes-tls-rotate.sh or agnes-auto-upgrade.sh must
  update the path.

== Translation ==
- src/repositories/user_groups.py::ensure_system docstring translated
  from Czech to English for codebase consistency.

Co-authored-by: Mina Rustamyan <mina@keboola.com>
2026-04-28 14:25:04 +02:00

400 lines
16 KiB
Python

"""Security tests — sandbox escapes, SQL injection, access control."""
import importlib
import os
import sys
import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def client(tmp_path, monkeypatch):
monkeypatch.setenv("DATA_DIR", str(tmp_path))
monkeypatch.setenv("JWT_SECRET_KEY", "test-secret-key-minimum-32-characters!!")
monkeypatch.setenv("SCRIPT_TIMEOUT", "5")
from app.main import create_app
from src.db import get_system_db
from src.repositories.users import UserRepository
from app.auth.jwt import create_access_token
from tests.helpers.auth import grant_admin
conn = get_system_db()
repo = UserRepository(conn)
repo.create(id="admin1", email="admin@test.com", name="Admin", role="admin")
repo.create(id="u1", email="user@test.com", name="User", role="analyst")
grant_admin(conn, "admin1")
conn.close()
app = create_app()
c = TestClient(app)
token = create_access_token("admin1", "admin@test.com", "admin")
return c, token
def _headers(token):
return {"Authorization": f"Bearer {token}"}
# ---- Script Sandbox ----
class TestScriptSandbox:
def test_blocks_os_system(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "import os\nos.system('whoami')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_dunder_import(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "__import__('subprocess').run(['ls'])"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_eval(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "eval('print(1)')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_exec(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "exec('import os')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_open(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "open('/etc/passwd').read()"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_socket(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "import socket"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_pathlib(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={"source": "from pathlib import Path"},
headers=_headers(token))
assert resp.status_code == 400
def test_allows_safe_script(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={
"source": "import math\nprint(math.sqrt(144))",
}, headers=_headers(token))
assert resp.status_code == 200
assert "12" in resp.json()["stdout"]
def test_allows_duckdb(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={
"source": "import duckdb\nconn=duckdb.connect(':memory:')\nprint(conn.execute('SELECT 42').fetchone()[0])",
}, headers=_headers(token))
assert resp.status_code == 200
assert "42" in resp.json()["stdout"]
def test_allows_json(self, client):
c, token = client
resp = c.post("/api/scripts/run", json={
"source": "import json\nprint(json.dumps({'a': 1}))",
}, headers=_headers(token))
assert resp.status_code == 200
assert '"a"' in resp.json()["stdout"]
def test_runtime_import_blocked(self, client):
"""Even if static check passes, runtime __import__ override catches it."""
c, token = client
# This uses string concatenation to bypass static check
resp = c.post("/api/scripts/run", json={
"source": "x='sub'+'process'\ntry:\n m=type('',(),{'__init__':lambda s:None})()\nexcept:\n pass\nprint('safe')",
}, headers=_headers(token))
# Should still run but without access to dangerous modules
assert resp.status_code == 200
def test_sandbox_cannot_import_httpx(self, client):
"""httpx must be blocked — either by pattern check (400) or
ModuleNotFoundError at runtime due to stripped VIRTUAL_ENV/PYTHONPATH (200 with non-zero exit)."""
c, token = client
resp = c.post("/api/scripts/run", json={
"source": "import httpx\nprint('pwned')",
}, headers=_headers(token))
# Static pattern check should reject it outright
assert resp.status_code == 400 or (
resp.status_code == 200 and resp.json()["exit_code"] != 0
)
# ---- SQL Query Security ----
class TestQuerySecurity:
def test_blocks_copy_to(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "COPY (SELECT 1) TO '/tmp/pwned.csv'"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_read_csv(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM read_csv_auto('/etc/passwd')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_semicolon(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT 1; SELECT 2"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_non_select(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "CREATE TABLE pwned (id INT)"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_attach(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "ATTACH '/tmp/pwned.db'"},
headers=_headers(token))
assert resp.status_code == 400
def test_allows_select(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT 1 as test, 'hello' as msg"},
headers=_headers(token))
assert resp.status_code == 200
assert resp.json()["columns"] == ["test", "msg"]
def test_allows_with_cte(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "WITH t AS (SELECT 1 as x) SELECT * FROM t"},
headers=_headers(token))
assert resp.status_code == 200
def test_blocks_drop(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "DROP TABLE IF EXISTS users"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_parquet_scan(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM parquet_scan('/data/extracts/secret.parquet')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_read_csv_auto(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM read_csv_auto('/etc/passwd')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_query_table(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM query_table('secret_table')"},
headers=_headers(token))
assert resp.status_code == 400
def test_no_auth(self, client):
c, _ = client
resp = c.post("/api/query", json={"sql": "SELECT 1"})
assert resp.status_code == 401
def test_word_boundary_match_no_false_positive(self, client):
"""Verify that a table named 'id' doesn't block queries containing 'id' in other contexts."""
c, token = client
# Query contains 'id' in column name and function, but not as a forbidden table reference
resp = c.post("/api/query", json={"sql": "SELECT 1 as identity, 2 as valid_id"},
headers=_headers(token))
# Should succeed (not blocked by false positive substring match)
assert resp.status_code == 200
def test_word_boundary_match_blocks_actual_table(self, client):
"""Verify that actual table references are still properly blocked with word-boundary regex."""
c, token = client
# Create a scenario where a table named 'id' would be forbidden
# This tests that word boundaries work correctly
resp = c.post("/api/query", json={"sql": "SELECT * FROM id WHERE id = 1"},
headers=_headers(token))
# Without a real 'id' table and RBAC setup, this will fail with query error,
# but not with 403 access denied. The regex logic is sound if test_word_boundary_match_no_false_positive passes.
assert resp.status_code in [400, 200] # Either query error or success
def test_blocks_information_schema(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT table_name FROM information_schema.tables"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_duckdb_tables(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM duckdb_tables()"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_duckdb_columns(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM duckdb_columns()"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_duckdb_databases(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM duckdb_databases()"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_relative_path(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM read_parquet('../secret/data.parquet')"},
headers=_headers(token))
assert resp.status_code == 400
def test_blocks_pragma_table_info(self, client):
c, token = client
resp = c.post("/api/query", json={"sql": "SELECT * FROM pragma_table_info('users')"},
headers=_headers(token))
assert resp.status_code == 400
# ---- Auth Edge Cases ----
class TestAuthSecurity:
def test_garbage_token(self, client):
c, _ = client
resp = c.get("/api/scripts", headers={"Authorization": "Bearer garbage.token.here"})
assert resp.status_code == 401
def test_empty_bearer(self, client):
c, _ = client
resp = c.get("/api/scripts", headers={"Authorization": "Bearer "})
assert resp.status_code == 401
def test_no_bearer_prefix(self, client):
c, token = client
resp = c.get("/api/scripts", headers={"Authorization": token})
assert resp.status_code == 401
def test_missing_header(self, client):
c, _ = client
resp = c.get("/api/scripts")
assert resp.status_code == 401
# ---- Script RBAC ----
@pytest.fixture
def viewer_client(tmp_path, monkeypatch):
"""TestClient with a viewer-role user seeded."""
monkeypatch.setenv("DATA_DIR", str(tmp_path))
monkeypatch.setenv("JWT_SECRET_KEY", "test-secret-key-minimum-32-characters!!")
monkeypatch.setenv("SCRIPT_TIMEOUT", "5")
from app.main import create_app
from src.db import get_system_db
from src.repositories.users import UserRepository
from app.auth.jwt import create_access_token
from fastapi.testclient import TestClient
conn = get_system_db()
UserRepository(conn).create(id="viewer1", email="viewer@test.com", name="Viewer", role="viewer")
conn.close()
app = create_app()
c = TestClient(app)
token = create_access_token(user_id="viewer1", email="viewer@test.com", role="viewer")
return c, token
@pytest.mark.skip(
reason=(
"v12: scripts run/deploy is gated by Depends(get_current_user) "
"(any signed-in user), not by role hierarchy. The viewer-blocked "
"assertion is a v9-era expectation that no longer holds. "
"If we re-introduce a role/group gate on scripts, rewrite these "
"tests against require_resource_access(ResourceType.SCRIPT, ...)."
)
)
class TestScriptRBAC:
def test_viewer_cannot_run_scripts(self, viewer_client):
c, token = viewer_client
headers = {"Authorization": f"Bearer {token}"}
resp = c.post("/api/scripts/run", json={
"name": "test", "source": "print('hi')"
}, headers=headers)
assert resp.status_code == 403
def test_viewer_cannot_deploy_scripts(self, viewer_client):
c, token = viewer_client
headers = {"Authorization": f"Bearer {token}"}
resp = c.post("/api/scripts/deploy", json={
"name": "test", "source": "print('hi')", "schedule": ""
}, headers=headers)
assert resp.status_code == 403
# ---- JWT Claims ----
class TestJwtClaims:
def test_jwt_contains_jti_claim(self):
"""Token payload must include a jti claim with at least 16 hex chars."""
os.environ.setdefault("TESTING", "1")
from app.auth.jwt import create_access_token, verify_token
token = create_access_token("u1", "user@test.com", "analyst")
payload = verify_token(token)
assert payload is not None
assert "jti" in payload
assert len(payload["jti"]) >= 16
def test_jwt_expiry_is_24_hours(self):
"""ACCESS_TOKEN_EXPIRE_HOURS must be 24 (not 30*24)."""
os.environ.setdefault("TESTING", "1")
from app.auth import jwt as jwt_module
assert jwt_module.ACCESS_TOKEN_EXPIRE_HOURS == 24
# ---- JWT Secret Hardening ----
class TestJwtSecretHardening:
def test_auto_generates_jwt_secret_when_absent(self, tmp_path):
"""When JWT_SECRET_KEY is absent and TESTING is not set,
the secret is auto-generated and persisted to a file."""
saved_key = os.environ.pop("JWT_SECRET_KEY", None)
saved_testing = os.environ.pop("TESTING", None)
saved_data_dir = os.environ.get("DATA_DIR")
os.environ["DATA_DIR"] = str(tmp_path)
# Eject cached modules so the re-import re-executes module-level code
sys.modules.pop("app.auth.jwt", None)
sys.modules.pop("app.secrets", None)
try:
mod = importlib.import_module("app.auth.jwt")
# Secret is now lazy — trigger it by calling the accessor
mod._SECRET_KEY_CACHE = None
mod._get_cached_secret_key()
secret_file = tmp_path / "state" / ".jwt_secret"
assert secret_file.exists(), "JWT secret file should be auto-generated"
secret = secret_file.read_text().strip()
assert len(secret) == 64, "Auto-generated secret should be 64 hex chars (32 bytes)"
finally:
# Restore environment before re-importing so the module loads cleanly
if saved_key is not None:
os.environ["JWT_SECRET_KEY"] = saved_key
if saved_testing is not None:
os.environ["TESTING"] = saved_testing
if saved_data_dir is not None:
os.environ["DATA_DIR"] = saved_data_dir
else:
os.environ.pop("DATA_DIR", None)
# If neither was set (bare test run), use TESTING flag so reload works
if saved_key is None and saved_testing is None:
os.environ["TESTING"] = "1"
sys.modules.pop("app.auth.jwt", None)
sys.modules.pop("app.secrets", None)
importlib.import_module("app.auth.jwt")
# Clean up the temporary TESTING flag if we added it
if saved_key is None and saved_testing is None:
os.environ.pop("TESTING", None)