agnes-the-ai-analyst/tests/test_orchestrator_remote_attach_security.py
ZdenekSrotyr 23be8ad46f
fix(security): #81 Group A — orchestrator attach hardening (squashed) (#95)
Closes the C1 findings from issue #81 plus the round-3/4 follow-ups
on the read-only query path.

Both _attach_remote_extensions (rebuild path) and
_reattach_remote_extensions (query path) now apply the same hard
allowlists for extensions and token-env names, single-quote-escape
the URL, and split built-in vs community install. The CHANGELOG bullet
documents the full scope including the table_schema → table_catalog
fix that made the rebuild path a silent no-op for every connector.

New module src/orchestrator_security.py centralises the policy. Tests
in tests/test_orchestrator_remote_attach_security.py — 28/28 pass.

Refs #81.
2026-04-27 21:34:04 +02:00

219 lines
10 KiB
Python

"""Issue #81 Group A — connector → orchestrator trust-boundary tests.
The orchestrator reads `_remote_attach` rows that connectors write into their
extract.duckdb, then runs `INSTALL`/`LOAD`/`ATTACH` SQL. This file exercises
each of the C1 hardening fixes:
- A.1 extension allowlist (refuse non-allowlisted extension)
- A.2 token-env hard allowlist (refuse well-known runtime secrets)
- A.3 URL single-quote escape (no injection through the URL literal)
- Built-in vs community install path split
Each test writes a malicious `_remote_attach` row into a fixture
extract.duckdb and asserts that the orchestrator either refuses (no ATTACH
call) or issues a safely-escaped one. We capture SQL strings via a fake
DuckDB connection so the assertions don't depend on any real extension being
installed.
"""
import logging
from unittest.mock import MagicMock
import duckdb
import pytest
from src.orchestrator import SyncOrchestrator
from src.orchestrator_security import escape_sql_string_literal
@pytest.fixture
def captured_conn(monkeypatch, tmp_path):
"""A duckdb.Connection-like mock that records every execute() string."""
sql_calls: list[str] = []
conn = MagicMock()
# information_schema.tables → return _remote_attach exists
# _remote_attach rows are programmed per-test via attach_rows()
rows_buffer = {"attach": []}
def execute_side_effect(sql, *args, **kwargs):
sql_calls.append(sql)
result = MagicMock()
# information_schema query
if "information_schema.tables" in sql and "_remote_attach" in sql:
result.fetchall.return_value = [("_remote_attach",)]
elif "FROM" in sql and "_remote_attach" in sql:
result.fetchall.return_value = list(rows_buffer["attach"])
elif "duckdb_databases" in sql:
result.fetchall.return_value = [] # nothing attached yet
else:
result.fetchall.return_value = []
return result
conn.execute.side_effect = execute_side_effect
def set_attach_rows(rows):
rows_buffer["attach"] = rows
return conn, sql_calls, set_attach_rows
def _attach_call_count(sql_calls: list[str]) -> int:
"""Number of ATTACH statements actually issued against the conn."""
return sum(1 for s in sql_calls if s.lstrip().upper().startswith("ATTACH "))
class TestExtensionAllowlist:
def test_refuses_unknown_extension(self, captured_conn, caplog):
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "httpfs", "https://x", "")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
assert any("not in the allowlist" in r.message for r in caplog.records)
def test_allows_keboola(self, captured_conn, monkeypatch):
monkeypatch.setenv("KBC_TOKEN", "secret-token-value")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://example.keboola.com", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
class TestTokenEnvAllowlist:
def test_refuses_session_secret(self, captured_conn, caplog, monkeypatch):
# Even if SESSION_SECRET were set in env (it shouldn't be in tests),
# the allowlist refuses to read it.
monkeypatch.setenv("SESSION_SECRET", "super-secret-jwt-signing-key")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "SESSION_SECRET")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
assert any("token_env" in r.message and "not in the allowlist" in r.message
for r in caplog.records)
def test_refuses_jwt_secret_key(self, captured_conn, monkeypatch, caplog):
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "JWT_SECRET_KEY")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
def test_refuses_arbitrary_custom_token_env(self, captured_conn, monkeypatch, caplog):
# Names that pass the structural regex but aren't on the allowlist
# are still refused — defense against accidental exposure.
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
def test_operator_override_replaces_default(
self, captured_conn, monkeypatch
):
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_RANDOM_TOKEN")
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_empty_string_override_falls_back_to_default(
self, captured_conn, monkeypatch
):
"""AGNES_REMOTE_ATTACH_TOKEN_ENVS='' should NOT lock everything down —
it falls through to the default. (Operator-typo defense.)"""
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "")
monkeypatch.setenv("KBC_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_empty_token_env_skips_check(self, captured_conn, monkeypatch):
"""token_env='' (the BigQuery-style env-auth path) skips the
allowlist check entirely. Verifies the BQ flow keeps working."""
conn, sql_calls, set_rows = captured_conn
set_rows([("bq", "bigquery", "project=x", "")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_structurally_invalid_token_env_refused(
self, captured_conn, monkeypatch, caplog
):
"""Even names on the allowlist via override must pass the structural
regex `^[A-Z][A-Z0-9_]{0,63}$`. A name with a space or lowercase
letter is refused regardless of allowlist contents."""
# Try to add a structurally-invalid name to the allowlist via
# override; the regex inside is_token_env_allowed must still refuse.
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "kbc_token,KBC TOKEN,LEGIT_TOKEN")
monkeypatch.setenv("LEGIT_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([
("a1", "keboola", "https://x", "kbc_token"), # lowercase
("a2", "keboola", "https://x", "KBC TOKEN"), # space
("a3", "keboola", "https://x", "LEGIT_TOKEN"), # OK
])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
# Only a3 should attach; a1 and a2 fail the structural regex even
# though the operator listed them in the override.
assert _attach_call_count(sql_calls) == 1
class TestUrlEscape:
def test_single_quote_in_url_is_escaped(self, captured_conn, monkeypatch):
monkeypatch.setenv("KBC_TOKEN", "tok")
conn, sql_calls, set_rows = captured_conn
# Adversarial URL trying to break out of the literal.
adversarial_url = "https://example.com'); DROP DATABASE x; --"
set_rows([("kbc", "keboola", adversarial_url, "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
assert len(attach_sqls) == 1
# The double-escaped form must be present, never the bare single quote.
expected_escaped = escape_sql_string_literal(adversarial_url)
assert f"'{expected_escaped}'" in attach_sqls[0]
# Sanity: the un-escaped form would have ended the literal early.
assert f"'{adversarial_url}'" not in attach_sqls[0]
def test_token_with_single_quote_is_escaped(self, captured_conn, monkeypatch):
# Defense-in-depth: even if a token somehow contained `'`, the
# ATTACH literal still parses safely.
monkeypatch.setenv("KBC_TOKEN", "abc'def")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
assert "'abc''def'" in attach_sqls[0]
class TestInstallPathSplit:
def test_community_extension_uses_install_from_community(
self, captured_conn, monkeypatch
):
monkeypatch.setenv("KBC_TOKEN", "tok")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
assert any("FROM community" in s for s in install_sqls)
def test_builtin_extension_uses_load_only(
self, captured_conn, monkeypatch
):
# Add a fictitious built-in via the override mechanism (we have to
# patch the module-level set since AGNES_REMOTE_ATTACH_EXTENSIONS
# only affects community).
from src import orchestrator_security as oms
monkeypatch.setattr(oms, "_BUILTIN_EXTENSIONS", frozenset({"sqlite"}))
conn, sql_calls, set_rows = captured_conn
set_rows([("sql1", "sqlite", "/tmp/db.sqlite", "")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
# Built-in: no INSTALL, only LOAD
assert install_sqls == []
load_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("LOAD ")]
assert len(load_sqls) == 1