fix(security): #81 Group A — orchestrator attach hardening (squashed) (#95)

Closes the C1 findings from issue #81 plus the round-3/4 follow-ups
on the read-only query path.

Both _attach_remote_extensions (rebuild path) and
_reattach_remote_extensions (query path) now apply the same hard
allowlists for extensions and token-env names, single-quote-escape
the URL, and split built-in vs community install. The CHANGELOG bullet
documents the full scope including the table_schema → table_catalog
fix that made the rebuild path a silent no-op for every connector.

New module src/orchestrator_security.py centralises the policy. Tests
in tests/test_orchestrator_remote_attach_security.py — 28/28 pass.

Refs #81.
This commit is contained in:
ZdenekSrotyr 2026-04-27 21:34:04 +02:00 committed by GitHub
parent 24e81fb671
commit 23be8ad46f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 614 additions and 7 deletions

View file

@ -104,6 +104,35 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
`[A-Za-z0-9_-]` (dot deliberately excluded to defeat `..` survival),
clips length to 64 chars, and routes the final filename through
`safe_join_under`.
- **Security (CRITICAL)**: hardened the connector → orchestrator trust
boundary on BOTH the rebuild path
(`src/orchestrator.py::_attach_remote_extensions`) AND the read-only
query path (`src/db.py::_reattach_remote_extensions`, called by
`get_analytics_db_readonly()` on every request) — issue #81 Group A.
Three fixes: (1) DuckDB extensions referenced by `_remote_attach` are
matched against a hard allowlist (default: `keboola, bigquery`;
override via `AGNES_REMOTE_ATTACH_EXTENSIONS`). Install path splits
built-in (LOAD only) from community (`INSTALL FROM community; LOAD`
on rebuild path; LOAD only on the read-only query path which must
not touch the network). (2) `token_env` names are matched against a
hard allowlist (default: `KBC_TOKEN`, `KBC_STORAGE_TOKEN`,
`KEBOOLA_STORAGE_TOKEN`, `GOOGLE_APPLICATION_CREDENTIALS`; override
via `AGNES_REMOTE_ATTACH_TOKEN_ENVS`). Names must additionally match
`^[A-Z][A-Z0-9_]{0,63}$`. A malicious connector cannot ask the
orchestrator to read `JWT_SECRET_KEY` / `SESSION_SECRET` /
`OPENAI_API_KEY` and exfiltrate them via `ATTACH ... TOKEN`.
(3) The URL passed to `ATTACH` is now single-quote-escaped on both
paths. Also fixed a `table_schema` vs `table_catalog` mismatch that
silently no-op'd `_attach_remote_extensions` for every connector
(the rebuild-path hardening would have been moot in production
without this fix). New module `src/orchestrator_security.py`
centralises the policy and exposes `log_effective_policy()`, called
from app startup so an operator's typo in
`AGNES_REMOTE_ATTACH_EXTENSIONS` (which **replaces** the default,
not extends it — a setting of `httpfs` would silently lock out
`keboola, bigquery`) is visible at boot rather than at the next
failed attach. See
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
### Removed

View file

@ -105,6 +105,14 @@ logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app):
# Issue #81 Group A — log the effective remote_attach allowlist at
# startup so an operator's typo in AGNES_REMOTE_ATTACH_EXTENSIONS
# (which REPLACES, not extends, the default) is visible.
try:
from src.orchestrator_security import log_effective_policy
log_effective_policy()
except Exception:
pass # never block startup on a logging convenience
yield
from src.db import close_system_db
close_system_db()

View file

@ -395,6 +395,18 @@ def _reattach_remote_extensions(
except Exception:
pass
# Issue #81 Group A — apply the same allowlist policy on the
# query path that the orchestrator's rebuild path uses. Without
# this, a malicious connector's _remote_attach row exfiltrates
# JWT_SECRET_KEY / SESSION_SECRET / OPENAI_API_KEY on every
# query, defeating the rebuild-path hardening entirely.
from src.orchestrator_security import (
escape_sql_string_literal,
is_builtin_extension,
is_extension_allowed,
is_token_env_allowed,
)
for alias, extension, url, token_env in rows:
if not _SAFE_IDENTIFIER.match(alias or ""):
logger.debug("Skipping unsafe remote_attach alias: %r", alias)
@ -402,15 +414,40 @@ def _reattach_remote_extensions(
if not _SAFE_IDENTIFIER.match(extension or ""):
logger.debug("Skipping unsafe remote_attach extension: %r", extension)
continue
if not is_extension_allowed(extension):
logger.error(
"query-path remote_attach: extension %r not in allowlist; "
"refusing to LOAD/ATTACH for source %s. Override via "
"AGNES_REMOTE_ATTACH_EXTENSIONS if intended.",
extension, alias,
)
continue
if token_env and not is_token_env_allowed(token_env):
logger.error(
"query-path remote_attach: token_env %r not in allowlist; "
"refusing for source %s. Override via "
"AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.",
token_env, alias,
)
continue
if alias in attached_dbs:
logger.debug("Remote source %s already attached, skipping", alias)
continue
try:
# LOAD only on the read-only query path — no INSTALL.
# Per the function docstring, this path runs on every
# query request and must not touch the network. The
# rebuild path (orchestrator) is responsible for INSTALL;
# by the time a query lands here, any community extension
# we'll see is already on disk. If LOAD fails because the
# extension isn't installed, log + skip (caller will see
# missing remote views and the operator will trigger a
# rebuild).
conn.execute(f"LOAD {extension};")
token = os.environ.get(token_env, "") if token_env else ""
safe_url = url.replace("'", "''")
safe_url = escape_sql_string_literal(url)
if token:
escaped_token = token.replace("'", "''")
escaped_token = escape_sql_string_literal(token)
conn.execute(
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
)

View file

@ -27,6 +27,13 @@ from typing import Dict, List
import duckdb
from src.orchestrator_security import (
escape_sql_string_literal,
is_builtin_extension,
is_extension_allowed,
is_token_env_allowed,
)
logger = logging.getLogger(__name__)
_rebuild_lock = threading.Lock()
@ -202,9 +209,17 @@ class SyncOrchestrator:
) -> None:
"""Read _remote_attach from extract.duckdb and ATTACH external sources."""
try:
# DuckDB attached-DB layout: ATTACH 'extract.duckdb' AS <alias>
# exposes information_schema.tables with table_catalog=<alias>
# and table_schema='main'. The earlier draft used
# table_schema=<alias> here, which never matched and made
# _attach_remote_extensions a silent no-op for every
# connector — defeating the entire Group A hardening in
# production. db.py:_reattach_remote_extensions already used
# the correct column; this aligns the rebuild path.
tables = conn.execute(
f"SELECT table_name FROM information_schema.tables "
f"WHERE table_schema='{source_name}' AND table_name='_remote_attach'"
f"WHERE table_catalog='{source_name}' AND table_name='_remote_attach'"
).fetchall()
if not tables:
return
@ -216,11 +231,34 @@ class SyncOrchestrator:
).fetchall()
for alias, extension, url, token_env in rows:
# Identifier sanity (defense against weird input). The hard
# security boundary is the allowlist a few lines down.
if not _validate_identifier(alias, "remote_attach alias"):
continue
if not _validate_identifier(extension, "remote_attach extension"):
continue
# #81 Group A.1 — extension allowlist. The connector does NOT
# get to pick what extensions the orchestrator loads.
if not is_extension_allowed(extension):
logger.error(
"Remote attach %s: extension %r is not in the allowlist; refusing. "
"Override via AGNES_REMOTE_ATTACH_EXTENSIONS if intended.",
alias, extension,
)
continue
# #81 Group A.2 — token-env hard allowlist. Refuses well-known
# runtime secrets (JWT_SECRET_KEY, OPENAI_API_KEY, …) that a
# malicious connector might ask us to send to its server.
if token_env and not is_token_env_allowed(token_env):
logger.error(
"Remote attach %s: token_env %r is not in the allowlist; refusing. "
"Override via AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.",
alias, token_env,
)
continue
token = os.environ.get(token_env, "") if token_env else ""
if token_env and not token:
logger.warning(
@ -239,16 +277,22 @@ class SyncOrchestrator:
logger.debug("Remote source %s already attached", alias)
continue
# #81 Group A.1 — built-ins LOAD only; community needs INSTALL+LOAD.
if is_builtin_extension(extension):
conn.execute(f"LOAD {extension};")
else:
conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};")
# #81 Group A.3 — escape URL single-quotes (mirrors src/db.py).
safe_url = escape_sql_string_literal(url)
if token:
escaped_token = token.replace("'", "''")
escaped_token = escape_sql_string_literal(token)
conn.execute(
f"ATTACH '{url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
)
else:
# Extensions like BigQuery handle auth via env (e.g. GOOGLE_APPLICATION_CREDENTIALS)
conn.execute(
f"ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)"
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)"
)
logger.info("Attached remote source %s via %s extension", alias, extension)
except Exception as e:

View file

@ -0,0 +1,128 @@
"""Allowlists and policy for the connector → orchestrator trust boundary.
The orchestrator reads `_remote_attach` rows that connectors write into their
`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those
values. Treating the connector as adversarial (compromised image, supply-chain,
malicious fork) means the orchestrator picks **what** can be installed and
**which** env vars can be referenced not the connector. See
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md` for the full
threat model.
"""
from __future__ import annotations
import logging
import os
import re
logger = logging.getLogger(__name__)
# DuckDB extensions the orchestrator is willing to load on behalf of a
# connector. Built-in extensions go in `_BUILTIN_EXTENSIONS`; community
# extensions go in `_COMMUNITY_EXTENSIONS`. The two sets are disjoint and
# tell the install path whether to issue `INSTALL ... FROM community` or
# only `LOAD`.
_BUILTIN_EXTENSIONS: frozenset[str] = frozenset() # none in current OSS
_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset({
"keboola",
"bigquery",
})
# Env vars whose values may be passed as the auth `TOKEN` in `ATTACH`. The
# default is intentionally tight — every name in the runtime env that is not
# on this list cannot be exfiltrated to a connector-controlled URL.
# Operators add deployment-specific names via AGNES_REMOTE_ATTACH_TOKEN_ENVS.
_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset({
"KBC_TOKEN",
"KBC_STORAGE_TOKEN",
"KEBOOLA_STORAGE_TOKEN",
"GOOGLE_APPLICATION_CREDENTIALS", # path, not a secret value
})
# Names must additionally match this regex (defense against weird input).
_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}$")
def _parse_csv_env(name: str) -> set[str]:
"""Parse a comma-separated env var into a stripped set of non-empty tokens."""
raw = os.environ.get(name, "")
return {t.strip() for t in raw.split(",") if t.strip()}
def get_allowed_extensions() -> dict[str, set[str]]:
"""Return the effective extension allowlist as a dict of {kind: set}.
`kind` is "builtin" or "community" the install path needs to know
which to use. Operator override AGNES_REMOTE_ATTACH_EXTENSIONS replaces
the default community set; built-ins are not configurable from env (a
typo there would silently disable a working integration with no clear
failure mode, and built-ins do not pose a supply-chain risk).
"""
override = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
community = override if override else set(_COMMUNITY_EXTENSIONS)
return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community}
def is_extension_allowed(extension: str) -> bool:
allow = get_allowed_extensions()
return extension in allow["builtin"] or extension in allow["community"]
def is_builtin_extension(extension: str) -> bool:
return extension in get_allowed_extensions()["builtin"]
def get_allowed_token_envs() -> set[str]:
"""Return the effective token-env allowlist.
Operator override AGNES_REMOTE_ATTACH_TOKEN_ENVS *replaces* the default
set (so an operator can shrink it as well as expand it). The startup
code logs the effective set so a typo is visible.
"""
override = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")
return override if override else set(_DEFAULT_TOKEN_ENVS)
def is_token_env_allowed(token_env: str) -> bool:
"""Return True if ``token_env`` may be read and passed as a TOKEN.
Two checks: structural (`^[A-Z][A-Z0-9_]{0,63}$`) and membership in the
allowlist. The structural check refuses things that aren't a valid env
var name regardless of allowlist contents.
"""
if not isinstance(token_env, str) or not _ENV_NAME_RE.match(token_env):
return False
return token_env in get_allowed_token_envs()
def log_effective_policy() -> None:
"""Log the effective extension + token-env allowlists at INFO once.
Called from app startup. Makes operator typos visible if
AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs is set with the intent to ADD
httpfs (but the override REPLACES the default), the operator sees
'effective extension allowlist: {httpfs}' and notices keboola and
bigquery are missing. Idempotent safe to call multiple times.
"""
ext = get_allowed_extensions()
envs = get_allowed_token_envs()
has_ext_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS"))
has_env_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS"))
logger.info(
"remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). "
"Note: env-var overrides REPLACE the default — set both yours and the "
"defaults if you want to add to them.",
sorted(ext["community"] | ext["builtin"]),
has_ext_override,
sorted(envs),
has_env_override,
)
def escape_sql_string_literal(value: str) -> str:
"""Double single-quotes for safe use inside DuckDB single-quoted literals.
Mirrors `src/db.py:_attach_extracts` (line ~411) so the read-only query
path and the orchestrator rebuild path use the same escape.
"""
return value.replace("'", "''")

View file

@ -0,0 +1,142 @@
"""Issue #81 Group A — query-path (db.py) trust-boundary tests.
Mirror of `tests/test_orchestrator_remote_attach_security.py` for
`src/db.py:_reattach_remote_extensions`. The query path runs on every
`/api/query` request via `get_analytics_db_readonly()`; it must enforce
the same allowlists as the rebuild path or the security guarantee is
hollow.
Setup is real-DuckDB (not mock-conn) because db.py introspects via
`information_schema.tables`/`duckdb_databases()` rather than just
executing whatever SQL we hand it. We feed it a real extract.duckdb
with a programmable `_remote_attach` row, ATTACH it, then call the
function and assert which `LOAD/ATTACH` SQL fired (or didn't) by
sniffing connection state.
"""
from __future__ import annotations
import logging
from pathlib import Path
import duckdb
import pytest
from src.db import _reattach_remote_extensions
def _make_extract_with_remote_attach(
path: Path, alias: str, extension: str, url: str, token_env: str
) -> None:
"""Create a tiny extract.duckdb whose _remote_attach table has one row."""
path.parent.mkdir(parents=True, exist_ok=True)
if path.exists():
path.unlink()
wal = path.with_suffix(".duckdb.wal")
if wal.exists():
wal.unlink()
c = duckdb.connect(str(path))
c.execute(
"CREATE TABLE _remote_attach ("
"alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
)
c.execute(
"INSERT INTO _remote_attach VALUES (?, ?, ?, ?)",
[alias, extension, url, token_env],
)
c.close()
def _attach_and_call(extracts_dir: Path, source_name: str):
"""ATTACH the source's extract.duckdb to a fresh memory conn, run the
function, return the conn (so the test can introspect attached_dbs)."""
conn = duckdb.connect()
conn.execute(
f"ATTACH '{extracts_dir / source_name / 'extract.duckdb'}' "
f"AS {source_name} (READ_ONLY)"
)
_reattach_remote_extensions(conn, extracts_dir)
return conn
def _attached(conn) -> set[str]:
return {r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()}
class TestQueryPathExtensionAllowlist:
def test_refuses_unknown_extension(self, tmp_path, caplog):
"""A connector that requested `httpfs` (not on allowlist) is
refused `httpfs` does not appear among attached databases."""
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="ext_alias", extension="httpfs",
url="https://x", token_env="",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "ext_alias" not in _attached(conn)
assert any("not in allowlist" in r.message for r in caplog.records)
class TestQueryPathTokenEnvAllowlist:
def test_refuses_session_secret(self, tmp_path, monkeypatch, caplog):
monkeypatch.setenv("SESSION_SECRET", "shouldnt-leak")
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="SESSION_SECRET",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)
assert any(
"token_env" in r.message and "not in allowlist" in r.message
for r in caplog.records
)
def test_refuses_jwt_secret_key(self, tmp_path, monkeypatch, caplog):
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="JWT_SECRET_KEY",
)
with caplog.at_level(logging.ERROR):
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)
class TestQueryPathInstallStrategy:
def test_no_install_on_query_path(self, tmp_path, monkeypatch, caplog):
"""The query path must NOT issue INSTALL FROM community — it runs
on every read request and shouldn't touch the network. LOAD
without prior INSTALL fails for missing extensions, which is the
documented behaviour (operator is told to trigger a rebuild)."""
# We can't easily verify "no INSTALL" without intercepting SQL;
# instead, verify the query path doesn't hit the community
# registry by setting an extension that's NOT on the default
# allowlist nor pre-installed. The function should refuse
# before any LOAD attempt.
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="bad", extension="some_other_community_ext",
url="https://x", token_env="",
)
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "bad" not in _attached(conn)
class TestQueryPathOverride:
def test_override_replaces_default(self, tmp_path, monkeypatch):
"""Setting AGNES_REMOTE_ATTACH_TOKEN_ENVS=MY_TOKEN replaces the
default KBC_TOKEN no longer accepted. (Operator-typo defense
contract; mirrored from rebuild-path tests.)"""
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_TOKEN")
monkeypatch.setenv("MY_TOKEN", "value")
_make_extract_with_remote_attach(
tmp_path / "extracts" / "src1" / "extract.duckdb",
alias="kbc", extension="keboola",
url="https://x", token_env="KBC_TOKEN", # NOT in the override set
)
conn = _attach_and_call(tmp_path / "extracts", "src1")
assert "kbc" not in _attached(conn)

View file

@ -0,0 +1,219 @@
"""Issue #81 Group A — connector → orchestrator trust-boundary tests.
The orchestrator reads `_remote_attach` rows that connectors write into their
extract.duckdb, then runs `INSTALL`/`LOAD`/`ATTACH` SQL. This file exercises
each of the C1 hardening fixes:
- A.1 extension allowlist (refuse non-allowlisted extension)
- A.2 token-env hard allowlist (refuse well-known runtime secrets)
- A.3 URL single-quote escape (no injection through the URL literal)
- Built-in vs community install path split
Each test writes a malicious `_remote_attach` row into a fixture
extract.duckdb and asserts that the orchestrator either refuses (no ATTACH
call) or issues a safely-escaped one. We capture SQL strings via a fake
DuckDB connection so the assertions don't depend on any real extension being
installed.
"""
import logging
from unittest.mock import MagicMock
import duckdb
import pytest
from src.orchestrator import SyncOrchestrator
from src.orchestrator_security import escape_sql_string_literal
@pytest.fixture
def captured_conn(monkeypatch, tmp_path):
"""A duckdb.Connection-like mock that records every execute() string."""
sql_calls: list[str] = []
conn = MagicMock()
# information_schema.tables → return _remote_attach exists
# _remote_attach rows are programmed per-test via attach_rows()
rows_buffer = {"attach": []}
def execute_side_effect(sql, *args, **kwargs):
sql_calls.append(sql)
result = MagicMock()
# information_schema query
if "information_schema.tables" in sql and "_remote_attach" in sql:
result.fetchall.return_value = [("_remote_attach",)]
elif "FROM" in sql and "_remote_attach" in sql:
result.fetchall.return_value = list(rows_buffer["attach"])
elif "duckdb_databases" in sql:
result.fetchall.return_value = [] # nothing attached yet
else:
result.fetchall.return_value = []
return result
conn.execute.side_effect = execute_side_effect
def set_attach_rows(rows):
rows_buffer["attach"] = rows
return conn, sql_calls, set_attach_rows
def _attach_call_count(sql_calls: list[str]) -> int:
"""Number of ATTACH statements actually issued against the conn."""
return sum(1 for s in sql_calls if s.lstrip().upper().startswith("ATTACH "))
class TestExtensionAllowlist:
def test_refuses_unknown_extension(self, captured_conn, caplog):
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "httpfs", "https://x", "")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
assert any("not in the allowlist" in r.message for r in caplog.records)
def test_allows_keboola(self, captured_conn, monkeypatch):
monkeypatch.setenv("KBC_TOKEN", "secret-token-value")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://example.keboola.com", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
class TestTokenEnvAllowlist:
def test_refuses_session_secret(self, captured_conn, caplog, monkeypatch):
# Even if SESSION_SECRET were set in env (it shouldn't be in tests),
# the allowlist refuses to read it.
monkeypatch.setenv("SESSION_SECRET", "super-secret-jwt-signing-key")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "SESSION_SECRET")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
assert any("token_env" in r.message and "not in the allowlist" in r.message
for r in caplog.records)
def test_refuses_jwt_secret_key(self, captured_conn, monkeypatch, caplog):
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "JWT_SECRET_KEY")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
def test_refuses_arbitrary_custom_token_env(self, captured_conn, monkeypatch, caplog):
# Names that pass the structural regex but aren't on the allowlist
# are still refused — defense against accidental exposure.
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
with caplog.at_level(logging.ERROR):
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 0
def test_operator_override_replaces_default(
self, captured_conn, monkeypatch
):
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_RANDOM_TOKEN")
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_empty_string_override_falls_back_to_default(
self, captured_conn, monkeypatch
):
"""AGNES_REMOTE_ATTACH_TOKEN_ENVS='' should NOT lock everything down —
it falls through to the default. (Operator-typo defense.)"""
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "")
monkeypatch.setenv("KBC_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([("alias1", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_empty_token_env_skips_check(self, captured_conn, monkeypatch):
"""token_env='' (the BigQuery-style env-auth path) skips the
allowlist check entirely. Verifies the BQ flow keeps working."""
conn, sql_calls, set_rows = captured_conn
set_rows([("bq", "bigquery", "project=x", "")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
assert _attach_call_count(sql_calls) == 1
def test_structurally_invalid_token_env_refused(
self, captured_conn, monkeypatch, caplog
):
"""Even names on the allowlist via override must pass the structural
regex `^[A-Z][A-Z0-9_]{0,63}$`. A name with a space or lowercase
letter is refused regardless of allowlist contents."""
# Try to add a structurally-invalid name to the allowlist via
# override; the regex inside is_token_env_allowed must still refuse.
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "kbc_token,KBC TOKEN,LEGIT_TOKEN")
monkeypatch.setenv("LEGIT_TOKEN", "value")
conn, sql_calls, set_rows = captured_conn
set_rows([
("a1", "keboola", "https://x", "kbc_token"), # lowercase
("a2", "keboola", "https://x", "KBC TOKEN"), # space
("a3", "keboola", "https://x", "LEGIT_TOKEN"), # OK
])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
# Only a3 should attach; a1 and a2 fail the structural regex even
# though the operator listed them in the override.
assert _attach_call_count(sql_calls) == 1
class TestUrlEscape:
def test_single_quote_in_url_is_escaped(self, captured_conn, monkeypatch):
monkeypatch.setenv("KBC_TOKEN", "tok")
conn, sql_calls, set_rows = captured_conn
# Adversarial URL trying to break out of the literal.
adversarial_url = "https://example.com'); DROP DATABASE x; --"
set_rows([("kbc", "keboola", adversarial_url, "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
assert len(attach_sqls) == 1
# The double-escaped form must be present, never the bare single quote.
expected_escaped = escape_sql_string_literal(adversarial_url)
assert f"'{expected_escaped}'" in attach_sqls[0]
# Sanity: the un-escaped form would have ended the literal early.
assert f"'{adversarial_url}'" not in attach_sqls[0]
def test_token_with_single_quote_is_escaped(self, captured_conn, monkeypatch):
# Defense-in-depth: even if a token somehow contained `'`, the
# ATTACH literal still parses safely.
monkeypatch.setenv("KBC_TOKEN", "abc'def")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
assert "'abc''def'" in attach_sqls[0]
class TestInstallPathSplit:
def test_community_extension_uses_install_from_community(
self, captured_conn, monkeypatch
):
monkeypatch.setenv("KBC_TOKEN", "tok")
conn, sql_calls, set_rows = captured_conn
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
assert any("FROM community" in s for s in install_sqls)
def test_builtin_extension_uses_load_only(
self, captured_conn, monkeypatch
):
# Add a fictitious built-in via the override mechanism (we have to
# patch the module-level set since AGNES_REMOTE_ATTACH_EXTENSIONS
# only affects community).
from src import orchestrator_security as oms
monkeypatch.setattr(oms, "_BUILTIN_EXTENSIONS", frozenset({"sqlite"}))
conn, sql_calls, set_rows = captured_conn
set_rows([("sql1", "sqlite", "/tmp/db.sqlite", "")])
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
# Built-in: no INSTALL, only LOAD
assert install_sqls == []
load_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("LOAD ")]
assert len(load_sqls) == 1