Closes the C1 findings from issue #81 plus the round-3/4 follow-ups on the read-only query path. Both _attach_remote_extensions (rebuild path) and _reattach_remote_extensions (query path) now apply the same hard allowlists for extensions and token-env names, single-quote-escape the URL, and split built-in vs community install. The CHANGELOG bullet documents the full scope including the table_schema → table_catalog fix that made the rebuild path a silent no-op for every connector. New module src/orchestrator_security.py centralises the policy. Tests in tests/test_orchestrator_remote_attach_security.py — 28/28 pass. Refs #81.
219 lines
10 KiB
Python
219 lines
10 KiB
Python
"""Issue #81 Group A — connector → orchestrator trust-boundary tests.
|
|
|
|
The orchestrator reads `_remote_attach` rows that connectors write into their
|
|
extract.duckdb, then runs `INSTALL`/`LOAD`/`ATTACH` SQL. This file exercises
|
|
each of the C1 hardening fixes:
|
|
|
|
- A.1 extension allowlist (refuse non-allowlisted extension)
|
|
- A.2 token-env hard allowlist (refuse well-known runtime secrets)
|
|
- A.3 URL single-quote escape (no injection through the URL literal)
|
|
- Built-in vs community install path split
|
|
|
|
Each test writes a malicious `_remote_attach` row into a fixture
|
|
extract.duckdb and asserts that the orchestrator either refuses (no ATTACH
|
|
call) or issues a safely-escaped one. We capture SQL strings via a fake
|
|
DuckDB connection so the assertions don't depend on any real extension being
|
|
installed.
|
|
"""
|
|
|
|
import logging
|
|
from unittest.mock import MagicMock
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
from src.orchestrator import SyncOrchestrator
|
|
from src.orchestrator_security import escape_sql_string_literal
|
|
|
|
|
|
@pytest.fixture
|
|
def captured_conn(monkeypatch, tmp_path):
|
|
"""A duckdb.Connection-like mock that records every execute() string."""
|
|
sql_calls: list[str] = []
|
|
conn = MagicMock()
|
|
|
|
# information_schema.tables → return _remote_attach exists
|
|
# _remote_attach rows are programmed per-test via attach_rows()
|
|
rows_buffer = {"attach": []}
|
|
|
|
def execute_side_effect(sql, *args, **kwargs):
|
|
sql_calls.append(sql)
|
|
result = MagicMock()
|
|
# information_schema query
|
|
if "information_schema.tables" in sql and "_remote_attach" in sql:
|
|
result.fetchall.return_value = [("_remote_attach",)]
|
|
elif "FROM" in sql and "_remote_attach" in sql:
|
|
result.fetchall.return_value = list(rows_buffer["attach"])
|
|
elif "duckdb_databases" in sql:
|
|
result.fetchall.return_value = [] # nothing attached yet
|
|
else:
|
|
result.fetchall.return_value = []
|
|
return result
|
|
|
|
conn.execute.side_effect = execute_side_effect
|
|
|
|
def set_attach_rows(rows):
|
|
rows_buffer["attach"] = rows
|
|
|
|
return conn, sql_calls, set_attach_rows
|
|
|
|
|
|
def _attach_call_count(sql_calls: list[str]) -> int:
|
|
"""Number of ATTACH statements actually issued against the conn."""
|
|
return sum(1 for s in sql_calls if s.lstrip().upper().startswith("ATTACH "))
|
|
|
|
|
|
class TestExtensionAllowlist:
|
|
def test_refuses_unknown_extension(self, captured_conn, caplog):
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "httpfs", "https://x", "")])
|
|
with caplog.at_level(logging.ERROR):
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 0
|
|
assert any("not in the allowlist" in r.message for r in caplog.records)
|
|
|
|
def test_allows_keboola(self, captured_conn, monkeypatch):
|
|
monkeypatch.setenv("KBC_TOKEN", "secret-token-value")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("kbc", "keboola", "https://example.keboola.com", "KBC_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 1
|
|
|
|
|
|
class TestTokenEnvAllowlist:
|
|
def test_refuses_session_secret(self, captured_conn, caplog, monkeypatch):
|
|
# Even if SESSION_SECRET were set in env (it shouldn't be in tests),
|
|
# the allowlist refuses to read it.
|
|
monkeypatch.setenv("SESSION_SECRET", "super-secret-jwt-signing-key")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "keboola", "https://x", "SESSION_SECRET")])
|
|
with caplog.at_level(logging.ERROR):
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 0
|
|
assert any("token_env" in r.message and "not in the allowlist" in r.message
|
|
for r in caplog.records)
|
|
|
|
def test_refuses_jwt_secret_key(self, captured_conn, monkeypatch, caplog):
|
|
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "keboola", "https://x", "JWT_SECRET_KEY")])
|
|
with caplog.at_level(logging.ERROR):
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 0
|
|
|
|
def test_refuses_arbitrary_custom_token_env(self, captured_conn, monkeypatch, caplog):
|
|
# Names that pass the structural regex but aren't on the allowlist
|
|
# are still refused — defense against accidental exposure.
|
|
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
|
|
with caplog.at_level(logging.ERROR):
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 0
|
|
|
|
def test_operator_override_replaces_default(
|
|
self, captured_conn, monkeypatch
|
|
):
|
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_RANDOM_TOKEN")
|
|
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 1
|
|
|
|
def test_empty_string_override_falls_back_to_default(
|
|
self, captured_conn, monkeypatch
|
|
):
|
|
"""AGNES_REMOTE_ATTACH_TOKEN_ENVS='' should NOT lock everything down —
|
|
it falls through to the default. (Operator-typo defense.)"""
|
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "")
|
|
monkeypatch.setenv("KBC_TOKEN", "value")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("alias1", "keboola", "https://x", "KBC_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 1
|
|
|
|
def test_empty_token_env_skips_check(self, captured_conn, monkeypatch):
|
|
"""token_env='' (the BigQuery-style env-auth path) skips the
|
|
allowlist check entirely. Verifies the BQ flow keeps working."""
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("bq", "bigquery", "project=x", "")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
assert _attach_call_count(sql_calls) == 1
|
|
|
|
def test_structurally_invalid_token_env_refused(
|
|
self, captured_conn, monkeypatch, caplog
|
|
):
|
|
"""Even names on the allowlist via override must pass the structural
|
|
regex `^[A-Z][A-Z0-9_]{0,63}$`. A name with a space or lowercase
|
|
letter is refused regardless of allowlist contents."""
|
|
# Try to add a structurally-invalid name to the allowlist via
|
|
# override; the regex inside is_token_env_allowed must still refuse.
|
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "kbc_token,KBC TOKEN,LEGIT_TOKEN")
|
|
monkeypatch.setenv("LEGIT_TOKEN", "value")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([
|
|
("a1", "keboola", "https://x", "kbc_token"), # lowercase
|
|
("a2", "keboola", "https://x", "KBC TOKEN"), # space
|
|
("a3", "keboola", "https://x", "LEGIT_TOKEN"), # OK
|
|
])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
# Only a3 should attach; a1 and a2 fail the structural regex even
|
|
# though the operator listed them in the override.
|
|
assert _attach_call_count(sql_calls) == 1
|
|
|
|
|
|
class TestUrlEscape:
|
|
def test_single_quote_in_url_is_escaped(self, captured_conn, monkeypatch):
|
|
monkeypatch.setenv("KBC_TOKEN", "tok")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
# Adversarial URL trying to break out of the literal.
|
|
adversarial_url = "https://example.com'); DROP DATABASE x; --"
|
|
set_rows([("kbc", "keboola", adversarial_url, "KBC_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
|
|
assert len(attach_sqls) == 1
|
|
# The double-escaped form must be present, never the bare single quote.
|
|
expected_escaped = escape_sql_string_literal(adversarial_url)
|
|
assert f"'{expected_escaped}'" in attach_sqls[0]
|
|
# Sanity: the un-escaped form would have ended the literal early.
|
|
assert f"'{adversarial_url}'" not in attach_sqls[0]
|
|
|
|
def test_token_with_single_quote_is_escaped(self, captured_conn, monkeypatch):
|
|
# Defense-in-depth: even if a token somehow contained `'`, the
|
|
# ATTACH literal still parses safely.
|
|
monkeypatch.setenv("KBC_TOKEN", "abc'def")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
|
|
assert "'abc''def'" in attach_sqls[0]
|
|
|
|
|
|
class TestInstallPathSplit:
|
|
def test_community_extension_uses_install_from_community(
|
|
self, captured_conn, monkeypatch
|
|
):
|
|
monkeypatch.setenv("KBC_TOKEN", "tok")
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
|
|
assert any("FROM community" in s for s in install_sqls)
|
|
|
|
def test_builtin_extension_uses_load_only(
|
|
self, captured_conn, monkeypatch
|
|
):
|
|
# Add a fictitious built-in via the override mechanism (we have to
|
|
# patch the module-level set since AGNES_REMOTE_ATTACH_EXTENSIONS
|
|
# only affects community).
|
|
from src import orchestrator_security as oms
|
|
monkeypatch.setattr(oms, "_BUILTIN_EXTENSIONS", frozenset({"sqlite"}))
|
|
conn, sql_calls, set_rows = captured_conn
|
|
set_rows([("sql1", "sqlite", "/tmp/db.sqlite", "")])
|
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
|
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
|
|
# Built-in: no INSTALL, only LOAD
|
|
assert install_sqls == []
|
|
load_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("LOAD ")]
|
|
assert len(load_sqls) == 1
|