agnes-the-ai-analyst/tests/test_db_remote_attach_security.py
ZdenekSrotyr 23be8ad46f
fix(security): #81 Group A — orchestrator attach hardening (squashed) (#95)
Closes the C1 findings from issue #81 plus the round-3/4 follow-ups
on the read-only query path.

Both _attach_remote_extensions (rebuild path) and
_reattach_remote_extensions (query path) now apply the same hard
allowlists for extensions and token-env names, single-quote-escape
the URL, and split built-in vs community install. The CHANGELOG bullet
documents the full scope including the table_schema → table_catalog
fix that made the rebuild path a silent no-op for every connector.

New module src/orchestrator_security.py centralises the policy. Tests
in tests/test_orchestrator_remote_attach_security.py — 28/28 pass.

Refs #81.
2026-04-27 21:34:04 +02:00

142 lines
5.7 KiB
Python

"""Issue #81 Group A — query-path (db.py) trust-boundary tests.
Mirror of `tests/test_orchestrator_remote_attach_security.py` for
`src/db.py:_reattach_remote_extensions`. The query path runs on every
`/api/query` request via `get_analytics_db_readonly()`; it must enforce
the same allowlists as the rebuild path or the security guarantee is
hollow.
Setup is real-DuckDB (not mock-conn) because db.py introspects via
`information_schema.tables`/`duckdb_databases()` rather than just
executing whatever SQL we hand it. We feed it a real extract.duckdb
with a programmable `_remote_attach` row, ATTACH it, then call the
function and assert which `LOAD/ATTACH` SQL fired (or didn't) by
sniffing connection state.
"""
from __future__ import annotations
import logging
from pathlib import Path
import duckdb
import pytest
from src.db import _reattach_remote_extensions
def _make_extract_with_remote_attach(
path: Path, alias: str, extension: str, url: str, token_env: str
) -> None:
"""Create a tiny extract.duckdb whose _remote_attach table has one row."""
path.parent.mkdir(parents=True, exist_ok=True)
if path.exists():
path.unlink()
wal = path.with_suffix(".duckdb.wal")
if wal.exists():
wal.unlink()
c = duckdb.connect(str(path))
c.execute(
"CREATE TABLE _remote_attach ("
"alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
)
c.execute(
"INSERT INTO _remote_attach VALUES (?, ?, ?, ?)",
[alias, extension, url, token_env],
)
c.close()
def _attach_and_call(extracts_dir: Path, source_name: str):
"""ATTACH the source's extract.duckdb to a fresh memory conn, run the
function, return the conn (so the test can introspect attached_dbs)."""
conn = duckdb.connect()
conn.execute(
f"ATTACH '{extracts_dir / source_name / 'extract.duckdb'}' "
f"AS {source_name} (READ_ONLY)"
)
_reattach_remote_extensions(conn, extracts_dir)
return conn
def _attached(conn) -> set[str]:
return {r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()}
class TestQueryPathExtensionAllowlist:
def test_refuses_unknown_extension(self, tmp_path, caplog):
"""A connector that requested `httpfs` (not on allowlist) is
refused — `httpfs` does not appear among attached databases."""
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="ext_alias", extension="httpfs",
url="https://x", token_env="",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "ext_alias" not in _attached(conn)
assert any("not in allowlist" in r.message for r in caplog.records)
class TestQueryPathTokenEnvAllowlist:
def test_refuses_session_secret(self, tmp_path, monkeypatch, caplog):
monkeypatch.setenv("SESSION_SECRET", "shouldnt-leak")
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="SESSION_SECRET",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)
assert any(
"token_env" in r.message and "not in allowlist" in r.message
for r in caplog.records
)
def test_refuses_jwt_secret_key(self, tmp_path, monkeypatch, caplog):
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="JWT_SECRET_KEY",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)
class TestQueryPathInstallStrategy:
def test_no_install_on_query_path(self, tmp_path, monkeypatch, caplog):
"""The query path must NOT issue INSTALL FROM community — it runs
on every read request and shouldn't touch the network. LOAD
without prior INSTALL fails for missing extensions, which is the
documented behaviour (operator is told to trigger a rebuild)."""
# We can't easily verify "no INSTALL" without intercepting SQL;
# instead, verify the query path doesn't hit the community
# registry by setting an extension that's NOT on the default
# allowlist nor pre-installed. The function should refuse
# before any LOAD attempt.
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="bad", extension="some_other_community_ext",
url="https://x", token_env="",
)
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "bad" not in _attached(conn)
class TestQueryPathOverride:
def test_override_replaces_default(self, tmp_path, monkeypatch):
"""Setting AGNES_REMOTE_ATTACH_TOKEN_ENVS=MY_TOKEN replaces the
default — KBC_TOKEN no longer accepted. (Operator-typo defense
contract; mirrored from rebuild-path tests.)"""
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_TOKEN")
monkeypatch.setenv("MY_TOKEN", "value")
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="KBC_TOKEN", # NOT in the override set
)
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)