Closes the C1 findings from issue #81 plus the round-3/4 follow-ups on the read-only query path. Both _attach_remote_extensions (rebuild path) and _reattach_remote_extensions (query path) now apply the same hard allowlists for extensions and token-env names, single-quote-escape the URL, and split built-in vs community install. The CHANGELOG bullet documents the full scope including the table_schema → table_catalog fix that made the rebuild path a silent no-op for every connector. New module src/orchestrator_security.py centralises the policy. Tests in tests/test_orchestrator_remote_attach_security.py — 28/28 pass. Refs #81.
142 lines
5.7 KiB
Python
142 lines
5.7 KiB
Python
"""Issue #81 Group A — query-path (db.py) trust-boundary tests.
|
|
|
|
Mirror of `tests/test_orchestrator_remote_attach_security.py` for
|
|
`src/db.py:_reattach_remote_extensions`. The query path runs on every
|
|
`/api/query` request via `get_analytics_db_readonly()`; it must enforce
|
|
the same allowlists as the rebuild path or the security guarantee is
|
|
hollow.
|
|
|
|
Setup is real-DuckDB (not mock-conn) because db.py introspects via
|
|
`information_schema.tables`/`duckdb_databases()` rather than just
|
|
executing whatever SQL we hand it. We feed it a real extract.duckdb
|
|
with a programmable `_remote_attach` row, ATTACH it, then call the
|
|
function and assert which `LOAD/ATTACH` SQL fired (or didn't) by
|
|
sniffing connection state.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
from src.db import _reattach_remote_extensions
|
|
|
|
|
|
def _make_extract_with_remote_attach(
|
|
path: Path, alias: str, extension: str, url: str, token_env: str
|
|
) -> None:
|
|
"""Create a tiny extract.duckdb whose _remote_attach table has one row."""
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
if path.exists():
|
|
path.unlink()
|
|
wal = path.with_suffix(".duckdb.wal")
|
|
if wal.exists():
|
|
wal.unlink()
|
|
c = duckdb.connect(str(path))
|
|
c.execute(
|
|
"CREATE TABLE _remote_attach ("
|
|
"alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
|
|
)
|
|
c.execute(
|
|
"INSERT INTO _remote_attach VALUES (?, ?, ?, ?)",
|
|
[alias, extension, url, token_env],
|
|
)
|
|
c.close()
|
|
|
|
|
|
def _attach_and_call(extracts_dir: Path, source_name: str):
|
|
"""ATTACH the source's extract.duckdb to a fresh memory conn, run the
|
|
function, return the conn (so the test can introspect attached_dbs)."""
|
|
conn = duckdb.connect()
|
|
conn.execute(
|
|
f"ATTACH '{extracts_dir / source_name / 'extract.duckdb'}' "
|
|
f"AS {source_name} (READ_ONLY)"
|
|
)
|
|
_reattach_remote_extensions(conn, extracts_dir)
|
|
return conn
|
|
|
|
|
|
def _attached(conn) -> set[str]:
|
|
return {r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()}
|
|
|
|
|
|
class TestQueryPathExtensionAllowlist:
|
|
def test_refuses_unknown_extension(self, tmp_path, caplog):
|
|
"""A connector that requested `httpfs` (not on allowlist) is
|
|
refused — `httpfs` does not appear among attached databases."""
|
|
_make_extract_with_remote_attach(
|
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
|
alias="ext_alias", extension="httpfs",
|
|
url="https://x", token_env="",
|
|
)
|
|
with caplog.at_level(logging.ERROR):
|
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
|
assert "ext_alias" not in _attached(conn)
|
|
assert any("not in allowlist" in r.message for r in caplog.records)
|
|
|
|
|
|
class TestQueryPathTokenEnvAllowlist:
|
|
def test_refuses_session_secret(self, tmp_path, monkeypatch, caplog):
|
|
monkeypatch.setenv("SESSION_SECRET", "shouldnt-leak")
|
|
_make_extract_with_remote_attach(
|
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
|
alias="kbc", extension="keboola",
|
|
url="https://x", token_env="SESSION_SECRET",
|
|
)
|
|
with caplog.at_level(logging.ERROR):
|
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
|
assert "kbc" not in _attached(conn)
|
|
assert any(
|
|
"token_env" in r.message and "not in allowlist" in r.message
|
|
for r in caplog.records
|
|
)
|
|
|
|
def test_refuses_jwt_secret_key(self, tmp_path, monkeypatch, caplog):
|
|
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
|
|
_make_extract_with_remote_attach(
|
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
|
alias="kbc", extension="keboola",
|
|
url="https://x", token_env="JWT_SECRET_KEY",
|
|
)
|
|
with caplog.at_level(logging.ERROR):
|
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
|
assert "kbc" not in _attached(conn)
|
|
|
|
|
|
class TestQueryPathInstallStrategy:
|
|
def test_no_install_on_query_path(self, tmp_path, monkeypatch, caplog):
|
|
"""The query path must NOT issue INSTALL FROM community — it runs
|
|
on every read request and shouldn't touch the network. LOAD
|
|
without prior INSTALL fails for missing extensions, which is the
|
|
documented behaviour (operator is told to trigger a rebuild)."""
|
|
# We can't easily verify "no INSTALL" without intercepting SQL;
|
|
# instead, verify the query path doesn't hit the community
|
|
# registry by setting an extension that's NOT on the default
|
|
# allowlist nor pre-installed. The function should refuse
|
|
# before any LOAD attempt.
|
|
_make_extract_with_remote_attach(
|
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
|
alias="bad", extension="some_other_community_ext",
|
|
url="https://x", token_env="",
|
|
)
|
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
|
assert "bad" not in _attached(conn)
|
|
|
|
|
|
class TestQueryPathOverride:
|
|
def test_override_replaces_default(self, tmp_path, monkeypatch):
|
|
"""Setting AGNES_REMOTE_ATTACH_TOKEN_ENVS=MY_TOKEN replaces the
|
|
default — KBC_TOKEN no longer accepted. (Operator-typo defense
|
|
contract; mirrored from rebuild-path tests.)"""
|
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_TOKEN")
|
|
monkeypatch.setenv("MY_TOKEN", "value")
|
|
_make_extract_with_remote_attach(
|
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
|
alias="kbc", extension="keboola",
|
|
url="https://x", token_env="KBC_TOKEN", # NOT in the override set
|
|
)
|
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
|
assert "kbc" not in _attached(conn)
|