Closes the C1 findings from issue #81 plus the round-3/4 follow-ups on the read-only query path. Both _attach_remote_extensions (rebuild path) and _reattach_remote_extensions (query path) now apply the same hard allowlists for extensions and token-env names, single-quote-escape the URL, and split built-in vs community install. The CHANGELOG bullet documents the full scope including the table_schema → table_catalog fix that made the rebuild path a silent no-op for every connector. New module src/orchestrator_security.py centralises the policy. Tests in tests/test_orchestrator_remote_attach_security.py — 28/28 pass. Refs #81.
This commit is contained in:
parent
24e81fb671
commit
23be8ad46f
7 changed files with 614 additions and 7 deletions
29
CHANGELOG.md
29
CHANGELOG.md
|
|
@ -104,6 +104,35 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
||||||
`[A-Za-z0-9_-]` (dot deliberately excluded to defeat `..` survival),
|
`[A-Za-z0-9_-]` (dot deliberately excluded to defeat `..` survival),
|
||||||
clips length to 64 chars, and routes the final filename through
|
clips length to 64 chars, and routes the final filename through
|
||||||
`safe_join_under`.
|
`safe_join_under`.
|
||||||
|
- **Security (CRITICAL)**: hardened the connector → orchestrator trust
|
||||||
|
boundary on BOTH the rebuild path
|
||||||
|
(`src/orchestrator.py::_attach_remote_extensions`) AND the read-only
|
||||||
|
query path (`src/db.py::_reattach_remote_extensions`, called by
|
||||||
|
`get_analytics_db_readonly()` on every request) — issue #81 Group A.
|
||||||
|
Three fixes: (1) DuckDB extensions referenced by `_remote_attach` are
|
||||||
|
matched against a hard allowlist (default: `keboola, bigquery`;
|
||||||
|
override via `AGNES_REMOTE_ATTACH_EXTENSIONS`). Install path splits
|
||||||
|
built-in (LOAD only) from community (`INSTALL FROM community; LOAD`
|
||||||
|
on rebuild path; LOAD only on the read-only query path which must
|
||||||
|
not touch the network). (2) `token_env` names are matched against a
|
||||||
|
hard allowlist (default: `KBC_TOKEN`, `KBC_STORAGE_TOKEN`,
|
||||||
|
`KEBOOLA_STORAGE_TOKEN`, `GOOGLE_APPLICATION_CREDENTIALS`; override
|
||||||
|
via `AGNES_REMOTE_ATTACH_TOKEN_ENVS`). Names must additionally match
|
||||||
|
`^[A-Z][A-Z0-9_]{0,63}$`. A malicious connector cannot ask the
|
||||||
|
orchestrator to read `JWT_SECRET_KEY` / `SESSION_SECRET` /
|
||||||
|
`OPENAI_API_KEY` and exfiltrate them via `ATTACH ... TOKEN`.
|
||||||
|
(3) The URL passed to `ATTACH` is now single-quote-escaped on both
|
||||||
|
paths. Also fixed a `table_schema` vs `table_catalog` mismatch that
|
||||||
|
silently no-op'd `_attach_remote_extensions` for every connector
|
||||||
|
(the rebuild-path hardening would have been moot in production
|
||||||
|
without this fix). New module `src/orchestrator_security.py`
|
||||||
|
centralises the policy and exposes `log_effective_policy()`, called
|
||||||
|
from app startup so an operator's typo in
|
||||||
|
`AGNES_REMOTE_ATTACH_EXTENSIONS` (which **replaces** the default,
|
||||||
|
not extends it — a setting of `httpfs` would silently lock out
|
||||||
|
`keboola, bigquery`) is visible at boot rather than at the next
|
||||||
|
failed attach. See
|
||||||
|
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,14 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app):
|
async def lifespan(app):
|
||||||
|
# Issue #81 Group A — log the effective remote_attach allowlist at
|
||||||
|
# startup so an operator's typo in AGNES_REMOTE_ATTACH_EXTENSIONS
|
||||||
|
# (which REPLACES, not extends, the default) is visible.
|
||||||
|
try:
|
||||||
|
from src.orchestrator_security import log_effective_policy
|
||||||
|
log_effective_policy()
|
||||||
|
except Exception:
|
||||||
|
pass # never block startup on a logging convenience
|
||||||
yield
|
yield
|
||||||
from src.db import close_system_db
|
from src.db import close_system_db
|
||||||
close_system_db()
|
close_system_db()
|
||||||
|
|
|
||||||
41
src/db.py
41
src/db.py
|
|
@ -395,6 +395,18 @@ def _reattach_remote_extensions(
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Issue #81 Group A — apply the same allowlist policy on the
|
||||||
|
# query path that the orchestrator's rebuild path uses. Without
|
||||||
|
# this, a malicious connector's _remote_attach row exfiltrates
|
||||||
|
# JWT_SECRET_KEY / SESSION_SECRET / OPENAI_API_KEY on every
|
||||||
|
# query, defeating the rebuild-path hardening entirely.
|
||||||
|
from src.orchestrator_security import (
|
||||||
|
escape_sql_string_literal,
|
||||||
|
is_builtin_extension,
|
||||||
|
is_extension_allowed,
|
||||||
|
is_token_env_allowed,
|
||||||
|
)
|
||||||
|
|
||||||
for alias, extension, url, token_env in rows:
|
for alias, extension, url, token_env in rows:
|
||||||
if not _SAFE_IDENTIFIER.match(alias or ""):
|
if not _SAFE_IDENTIFIER.match(alias or ""):
|
||||||
logger.debug("Skipping unsafe remote_attach alias: %r", alias)
|
logger.debug("Skipping unsafe remote_attach alias: %r", alias)
|
||||||
|
|
@ -402,15 +414,40 @@ def _reattach_remote_extensions(
|
||||||
if not _SAFE_IDENTIFIER.match(extension or ""):
|
if not _SAFE_IDENTIFIER.match(extension or ""):
|
||||||
logger.debug("Skipping unsafe remote_attach extension: %r", extension)
|
logger.debug("Skipping unsafe remote_attach extension: %r", extension)
|
||||||
continue
|
continue
|
||||||
|
if not is_extension_allowed(extension):
|
||||||
|
logger.error(
|
||||||
|
"query-path remote_attach: extension %r not in allowlist; "
|
||||||
|
"refusing to LOAD/ATTACH for source %s. Override via "
|
||||||
|
"AGNES_REMOTE_ATTACH_EXTENSIONS if intended.",
|
||||||
|
extension, alias,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
if token_env and not is_token_env_allowed(token_env):
|
||||||
|
logger.error(
|
||||||
|
"query-path remote_attach: token_env %r not in allowlist; "
|
||||||
|
"refusing for source %s. Override via "
|
||||||
|
"AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.",
|
||||||
|
token_env, alias,
|
||||||
|
)
|
||||||
|
continue
|
||||||
if alias in attached_dbs:
|
if alias in attached_dbs:
|
||||||
logger.debug("Remote source %s already attached, skipping", alias)
|
logger.debug("Remote source %s already attached, skipping", alias)
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
|
# LOAD only on the read-only query path — no INSTALL.
|
||||||
|
# Per the function docstring, this path runs on every
|
||||||
|
# query request and must not touch the network. The
|
||||||
|
# rebuild path (orchestrator) is responsible for INSTALL;
|
||||||
|
# by the time a query lands here, any community extension
|
||||||
|
# we'll see is already on disk. If LOAD fails because the
|
||||||
|
# extension isn't installed, log + skip (caller will see
|
||||||
|
# missing remote views and the operator will trigger a
|
||||||
|
# rebuild).
|
||||||
conn.execute(f"LOAD {extension};")
|
conn.execute(f"LOAD {extension};")
|
||||||
token = os.environ.get(token_env, "") if token_env else ""
|
token = os.environ.get(token_env, "") if token_env else ""
|
||||||
safe_url = url.replace("'", "''")
|
safe_url = escape_sql_string_literal(url)
|
||||||
if token:
|
if token:
|
||||||
escaped_token = token.replace("'", "''")
|
escaped_token = escape_sql_string_literal(token)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
|
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,13 @@ from typing import Dict, List
|
||||||
|
|
||||||
import duckdb
|
import duckdb
|
||||||
|
|
||||||
|
from src.orchestrator_security import (
|
||||||
|
escape_sql_string_literal,
|
||||||
|
is_builtin_extension,
|
||||||
|
is_extension_allowed,
|
||||||
|
is_token_env_allowed,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_rebuild_lock = threading.Lock()
|
_rebuild_lock = threading.Lock()
|
||||||
|
|
@ -202,9 +209,17 @@ class SyncOrchestrator:
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Read _remote_attach from extract.duckdb and ATTACH external sources."""
|
"""Read _remote_attach from extract.duckdb and ATTACH external sources."""
|
||||||
try:
|
try:
|
||||||
|
# DuckDB attached-DB layout: ATTACH 'extract.duckdb' AS <alias>
|
||||||
|
# exposes information_schema.tables with table_catalog=<alias>
|
||||||
|
# and table_schema='main'. The earlier draft used
|
||||||
|
# table_schema=<alias> here, which never matched and made
|
||||||
|
# _attach_remote_extensions a silent no-op for every
|
||||||
|
# connector — defeating the entire Group A hardening in
|
||||||
|
# production. db.py:_reattach_remote_extensions already used
|
||||||
|
# the correct column; this aligns the rebuild path.
|
||||||
tables = conn.execute(
|
tables = conn.execute(
|
||||||
f"SELECT table_name FROM information_schema.tables "
|
f"SELECT table_name FROM information_schema.tables "
|
||||||
f"WHERE table_schema='{source_name}' AND table_name='_remote_attach'"
|
f"WHERE table_catalog='{source_name}' AND table_name='_remote_attach'"
|
||||||
).fetchall()
|
).fetchall()
|
||||||
if not tables:
|
if not tables:
|
||||||
return
|
return
|
||||||
|
|
@ -216,11 +231,34 @@ class SyncOrchestrator:
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
for alias, extension, url, token_env in rows:
|
for alias, extension, url, token_env in rows:
|
||||||
|
# Identifier sanity (defense against weird input). The hard
|
||||||
|
# security boundary is the allowlist a few lines down.
|
||||||
if not _validate_identifier(alias, "remote_attach alias"):
|
if not _validate_identifier(alias, "remote_attach alias"):
|
||||||
continue
|
continue
|
||||||
if not _validate_identifier(extension, "remote_attach extension"):
|
if not _validate_identifier(extension, "remote_attach extension"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# #81 Group A.1 — extension allowlist. The connector does NOT
|
||||||
|
# get to pick what extensions the orchestrator loads.
|
||||||
|
if not is_extension_allowed(extension):
|
||||||
|
logger.error(
|
||||||
|
"Remote attach %s: extension %r is not in the allowlist; refusing. "
|
||||||
|
"Override via AGNES_REMOTE_ATTACH_EXTENSIONS if intended.",
|
||||||
|
alias, extension,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# #81 Group A.2 — token-env hard allowlist. Refuses well-known
|
||||||
|
# runtime secrets (JWT_SECRET_KEY, OPENAI_API_KEY, …) that a
|
||||||
|
# malicious connector might ask us to send to its server.
|
||||||
|
if token_env and not is_token_env_allowed(token_env):
|
||||||
|
logger.error(
|
||||||
|
"Remote attach %s: token_env %r is not in the allowlist; refusing. "
|
||||||
|
"Override via AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.",
|
||||||
|
alias, token_env,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
token = os.environ.get(token_env, "") if token_env else ""
|
token = os.environ.get(token_env, "") if token_env else ""
|
||||||
if token_env and not token:
|
if token_env and not token:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
@ -239,16 +277,22 @@ class SyncOrchestrator:
|
||||||
logger.debug("Remote source %s already attached", alias)
|
logger.debug("Remote source %s already attached", alias)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};")
|
# #81 Group A.1 — built-ins LOAD only; community needs INSTALL+LOAD.
|
||||||
|
if is_builtin_extension(extension):
|
||||||
|
conn.execute(f"LOAD {extension};")
|
||||||
|
else:
|
||||||
|
conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};")
|
||||||
|
# #81 Group A.3 — escape URL single-quotes (mirrors src/db.py).
|
||||||
|
safe_url = escape_sql_string_literal(url)
|
||||||
if token:
|
if token:
|
||||||
escaped_token = token.replace("'", "''")
|
escaped_token = escape_sql_string_literal(token)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
f"ATTACH '{url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
|
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Extensions like BigQuery handle auth via env (e.g. GOOGLE_APPLICATION_CREDENTIALS)
|
# Extensions like BigQuery handle auth via env (e.g. GOOGLE_APPLICATION_CREDENTIALS)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
f"ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)"
|
f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)"
|
||||||
)
|
)
|
||||||
logger.info("Attached remote source %s via %s extension", alias, extension)
|
logger.info("Attached remote source %s via %s extension", alias, extension)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
128
src/orchestrator_security.py
Normal file
128
src/orchestrator_security.py
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
"""Allowlists and policy for the connector → orchestrator trust boundary.
|
||||||
|
|
||||||
|
The orchestrator reads `_remote_attach` rows that connectors write into their
|
||||||
|
`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those
|
||||||
|
values. Treating the connector as adversarial (compromised image, supply-chain,
|
||||||
|
malicious fork) means the orchestrator picks **what** can be installed and
|
||||||
|
**which** env vars can be referenced — not the connector. See
|
||||||
|
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md` for the full
|
||||||
|
threat model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# DuckDB extensions the orchestrator is willing to load on behalf of a
|
||||||
|
# connector. Built-in extensions go in `_BUILTIN_EXTENSIONS`; community
|
||||||
|
# extensions go in `_COMMUNITY_EXTENSIONS`. The two sets are disjoint and
|
||||||
|
# tell the install path whether to issue `INSTALL ... FROM community` or
|
||||||
|
# only `LOAD`.
|
||||||
|
_BUILTIN_EXTENSIONS: frozenset[str] = frozenset() # none in current OSS
|
||||||
|
_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset({
|
||||||
|
"keboola",
|
||||||
|
"bigquery",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Env vars whose values may be passed as the auth `TOKEN` in `ATTACH`. The
|
||||||
|
# default is intentionally tight — every name in the runtime env that is not
|
||||||
|
# on this list cannot be exfiltrated to a connector-controlled URL.
|
||||||
|
# Operators add deployment-specific names via AGNES_REMOTE_ATTACH_TOKEN_ENVS.
|
||||||
|
_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset({
|
||||||
|
"KBC_TOKEN",
|
||||||
|
"KBC_STORAGE_TOKEN",
|
||||||
|
"KEBOOLA_STORAGE_TOKEN",
|
||||||
|
"GOOGLE_APPLICATION_CREDENTIALS", # path, not a secret value
|
||||||
|
})
|
||||||
|
|
||||||
|
# Names must additionally match this regex (defense against weird input).
|
||||||
|
_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}$")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_csv_env(name: str) -> set[str]:
|
||||||
|
"""Parse a comma-separated env var into a stripped set of non-empty tokens."""
|
||||||
|
raw = os.environ.get(name, "")
|
||||||
|
return {t.strip() for t in raw.split(",") if t.strip()}
|
||||||
|
|
||||||
|
|
||||||
|
def get_allowed_extensions() -> dict[str, set[str]]:
|
||||||
|
"""Return the effective extension allowlist as a dict of {kind: set}.
|
||||||
|
|
||||||
|
`kind` is "builtin" or "community" — the install path needs to know
|
||||||
|
which to use. Operator override AGNES_REMOTE_ATTACH_EXTENSIONS replaces
|
||||||
|
the default community set; built-ins are not configurable from env (a
|
||||||
|
typo there would silently disable a working integration with no clear
|
||||||
|
failure mode, and built-ins do not pose a supply-chain risk).
|
||||||
|
"""
|
||||||
|
override = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
|
||||||
|
community = override if override else set(_COMMUNITY_EXTENSIONS)
|
||||||
|
return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community}
|
||||||
|
|
||||||
|
|
||||||
|
def is_extension_allowed(extension: str) -> bool:
|
||||||
|
allow = get_allowed_extensions()
|
||||||
|
return extension in allow["builtin"] or extension in allow["community"]
|
||||||
|
|
||||||
|
|
||||||
|
def is_builtin_extension(extension: str) -> bool:
|
||||||
|
return extension in get_allowed_extensions()["builtin"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_allowed_token_envs() -> set[str]:
|
||||||
|
"""Return the effective token-env allowlist.
|
||||||
|
|
||||||
|
Operator override AGNES_REMOTE_ATTACH_TOKEN_ENVS *replaces* the default
|
||||||
|
set (so an operator can shrink it as well as expand it). The startup
|
||||||
|
code logs the effective set so a typo is visible.
|
||||||
|
"""
|
||||||
|
override = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")
|
||||||
|
return override if override else set(_DEFAULT_TOKEN_ENVS)
|
||||||
|
|
||||||
|
|
||||||
|
def is_token_env_allowed(token_env: str) -> bool:
|
||||||
|
"""Return True if ``token_env`` may be read and passed as a TOKEN.
|
||||||
|
|
||||||
|
Two checks: structural (`^[A-Z][A-Z0-9_]{0,63}$`) and membership in the
|
||||||
|
allowlist. The structural check refuses things that aren't a valid env
|
||||||
|
var name regardless of allowlist contents.
|
||||||
|
"""
|
||||||
|
if not isinstance(token_env, str) or not _ENV_NAME_RE.match(token_env):
|
||||||
|
return False
|
||||||
|
return token_env in get_allowed_token_envs()
|
||||||
|
|
||||||
|
|
||||||
|
def log_effective_policy() -> None:
|
||||||
|
"""Log the effective extension + token-env allowlists at INFO once.
|
||||||
|
|
||||||
|
Called from app startup. Makes operator typos visible — if
|
||||||
|
AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs is set with the intent to ADD
|
||||||
|
httpfs (but the override REPLACES the default), the operator sees
|
||||||
|
'effective extension allowlist: {httpfs}' and notices keboola and
|
||||||
|
bigquery are missing. Idempotent — safe to call multiple times.
|
||||||
|
"""
|
||||||
|
ext = get_allowed_extensions()
|
||||||
|
envs = get_allowed_token_envs()
|
||||||
|
has_ext_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS"))
|
||||||
|
has_env_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS"))
|
||||||
|
logger.info(
|
||||||
|
"remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). "
|
||||||
|
"Note: env-var overrides REPLACE the default — set both yours and the "
|
||||||
|
"defaults if you want to add to them.",
|
||||||
|
sorted(ext["community"] | ext["builtin"]),
|
||||||
|
has_ext_override,
|
||||||
|
sorted(envs),
|
||||||
|
has_env_override,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def escape_sql_string_literal(value: str) -> str:
|
||||||
|
"""Double single-quotes for safe use inside DuckDB single-quoted literals.
|
||||||
|
|
||||||
|
Mirrors `src/db.py:_attach_extracts` (line ~411) so the read-only query
|
||||||
|
path and the orchestrator rebuild path use the same escape.
|
||||||
|
"""
|
||||||
|
return value.replace("'", "''")
|
||||||
142
tests/test_db_remote_attach_security.py
Normal file
142
tests/test_db_remote_attach_security.py
Normal file
|
|
@ -0,0 +1,142 @@
|
||||||
|
"""Issue #81 Group A — query-path (db.py) trust-boundary tests.
|
||||||
|
|
||||||
|
Mirror of `tests/test_orchestrator_remote_attach_security.py` for
|
||||||
|
`src/db.py:_reattach_remote_extensions`. The query path runs on every
|
||||||
|
`/api/query` request via `get_analytics_db_readonly()`; it must enforce
|
||||||
|
the same allowlists as the rebuild path or the security guarantee is
|
||||||
|
hollow.
|
||||||
|
|
||||||
|
Setup is real-DuckDB (not mock-conn) because db.py introspects via
|
||||||
|
`information_schema.tables`/`duckdb_databases()` rather than just
|
||||||
|
executing whatever SQL we hand it. We feed it a real extract.duckdb
|
||||||
|
with a programmable `_remote_attach` row, ATTACH it, then call the
|
||||||
|
function and assert which `LOAD/ATTACH` SQL fired (or didn't) by
|
||||||
|
sniffing connection state.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import duckdb
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.db import _reattach_remote_extensions
|
||||||
|
|
||||||
|
|
||||||
|
def _make_extract_with_remote_attach(
|
||||||
|
path: Path, alias: str, extension: str, url: str, token_env: str
|
||||||
|
) -> None:
|
||||||
|
"""Create a tiny extract.duckdb whose _remote_attach table has one row."""
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
if path.exists():
|
||||||
|
path.unlink()
|
||||||
|
wal = path.with_suffix(".duckdb.wal")
|
||||||
|
if wal.exists():
|
||||||
|
wal.unlink()
|
||||||
|
c = duckdb.connect(str(path))
|
||||||
|
c.execute(
|
||||||
|
"CREATE TABLE _remote_attach ("
|
||||||
|
"alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
|
||||||
|
)
|
||||||
|
c.execute(
|
||||||
|
"INSERT INTO _remote_attach VALUES (?, ?, ?, ?)",
|
||||||
|
[alias, extension, url, token_env],
|
||||||
|
)
|
||||||
|
c.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _attach_and_call(extracts_dir: Path, source_name: str):
|
||||||
|
"""ATTACH the source's extract.duckdb to a fresh memory conn, run the
|
||||||
|
function, return the conn (so the test can introspect attached_dbs)."""
|
||||||
|
conn = duckdb.connect()
|
||||||
|
conn.execute(
|
||||||
|
f"ATTACH '{extracts_dir / source_name / 'extract.duckdb'}' "
|
||||||
|
f"AS {source_name} (READ_ONLY)"
|
||||||
|
)
|
||||||
|
_reattach_remote_extensions(conn, extracts_dir)
|
||||||
|
return conn
|
||||||
|
|
||||||
|
|
||||||
|
def _attached(conn) -> set[str]:
|
||||||
|
return {r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()}
|
||||||
|
|
||||||
|
|
||||||
|
class TestQueryPathExtensionAllowlist:
|
||||||
|
def test_refuses_unknown_extension(self, tmp_path, caplog):
|
||||||
|
"""A connector that requested `httpfs` (not on allowlist) is
|
||||||
|
refused — `httpfs` does not appear among attached databases."""
|
||||||
|
_make_extract_with_remote_attach(
|
||||||
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
||||||
|
alias="ext_alias", extension="httpfs",
|
||||||
|
url="https://x", token_env="",
|
||||||
|
)
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
||||||
|
assert "ext_alias" not in _attached(conn)
|
||||||
|
assert any("not in allowlist" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
|
class TestQueryPathTokenEnvAllowlist:
|
||||||
|
def test_refuses_session_secret(self, tmp_path, monkeypatch, caplog):
|
||||||
|
monkeypatch.setenv("SESSION_SECRET", "shouldnt-leak")
|
||||||
|
_make_extract_with_remote_attach(
|
||||||
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
||||||
|
alias="kbc", extension="keboola",
|
||||||
|
url="https://x", token_env="SESSION_SECRET",
|
||||||
|
)
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
||||||
|
assert "kbc" not in _attached(conn)
|
||||||
|
assert any(
|
||||||
|
"token_env" in r.message and "not in allowlist" in r.message
|
||||||
|
for r in caplog.records
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_refuses_jwt_secret_key(self, tmp_path, monkeypatch, caplog):
|
||||||
|
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
|
||||||
|
_make_extract_with_remote_attach(
|
||||||
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
||||||
|
alias="kbc", extension="keboola",
|
||||||
|
url="https://x", token_env="JWT_SECRET_KEY",
|
||||||
|
)
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
||||||
|
assert "kbc" not in _attached(conn)
|
||||||
|
|
||||||
|
|
||||||
|
class TestQueryPathInstallStrategy:
|
||||||
|
def test_no_install_on_query_path(self, tmp_path, monkeypatch, caplog):
|
||||||
|
"""The query path must NOT issue INSTALL FROM community — it runs
|
||||||
|
on every read request and shouldn't touch the network. LOAD
|
||||||
|
without prior INSTALL fails for missing extensions, which is the
|
||||||
|
documented behaviour (operator is told to trigger a rebuild)."""
|
||||||
|
# We can't easily verify "no INSTALL" without intercepting SQL;
|
||||||
|
# instead, verify the query path doesn't hit the community
|
||||||
|
# registry by setting an extension that's NOT on the default
|
||||||
|
# allowlist nor pre-installed. The function should refuse
|
||||||
|
# before any LOAD attempt.
|
||||||
|
_make_extract_with_remote_attach(
|
||||||
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
||||||
|
alias="bad", extension="some_other_community_ext",
|
||||||
|
url="https://x", token_env="",
|
||||||
|
)
|
||||||
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
||||||
|
assert "bad" not in _attached(conn)
|
||||||
|
|
||||||
|
|
||||||
|
class TestQueryPathOverride:
|
||||||
|
def test_override_replaces_default(self, tmp_path, monkeypatch):
|
||||||
|
"""Setting AGNES_REMOTE_ATTACH_TOKEN_ENVS=MY_TOKEN replaces the
|
||||||
|
default — KBC_TOKEN no longer accepted. (Operator-typo defense
|
||||||
|
contract; mirrored from rebuild-path tests.)"""
|
||||||
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_TOKEN")
|
||||||
|
monkeypatch.setenv("MY_TOKEN", "value")
|
||||||
|
_make_extract_with_remote_attach(
|
||||||
|
tmp_path / "extracts" / "src1" / "extract.duckdb",
|
||||||
|
alias="kbc", extension="keboola",
|
||||||
|
url="https://x", token_env="KBC_TOKEN", # NOT in the override set
|
||||||
|
)
|
||||||
|
conn = _attach_and_call(tmp_path / "extracts", "src1")
|
||||||
|
assert "kbc" not in _attached(conn)
|
||||||
219
tests/test_orchestrator_remote_attach_security.py
Normal file
219
tests/test_orchestrator_remote_attach_security.py
Normal file
|
|
@ -0,0 +1,219 @@
|
||||||
|
"""Issue #81 Group A — connector → orchestrator trust-boundary tests.
|
||||||
|
|
||||||
|
The orchestrator reads `_remote_attach` rows that connectors write into their
|
||||||
|
extract.duckdb, then runs `INSTALL`/`LOAD`/`ATTACH` SQL. This file exercises
|
||||||
|
each of the C1 hardening fixes:
|
||||||
|
|
||||||
|
- A.1 extension allowlist (refuse non-allowlisted extension)
|
||||||
|
- A.2 token-env hard allowlist (refuse well-known runtime secrets)
|
||||||
|
- A.3 URL single-quote escape (no injection through the URL literal)
|
||||||
|
- Built-in vs community install path split
|
||||||
|
|
||||||
|
Each test writes a malicious `_remote_attach` row into a fixture
|
||||||
|
extract.duckdb and asserts that the orchestrator either refuses (no ATTACH
|
||||||
|
call) or issues a safely-escaped one. We capture SQL strings via a fake
|
||||||
|
DuckDB connection so the assertions don't depend on any real extension being
|
||||||
|
installed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import duckdb
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.orchestrator import SyncOrchestrator
|
||||||
|
from src.orchestrator_security import escape_sql_string_literal
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def captured_conn(monkeypatch, tmp_path):
|
||||||
|
"""A duckdb.Connection-like mock that records every execute() string."""
|
||||||
|
sql_calls: list[str] = []
|
||||||
|
conn = MagicMock()
|
||||||
|
|
||||||
|
# information_schema.tables → return _remote_attach exists
|
||||||
|
# _remote_attach rows are programmed per-test via attach_rows()
|
||||||
|
rows_buffer = {"attach": []}
|
||||||
|
|
||||||
|
def execute_side_effect(sql, *args, **kwargs):
|
||||||
|
sql_calls.append(sql)
|
||||||
|
result = MagicMock()
|
||||||
|
# information_schema query
|
||||||
|
if "information_schema.tables" in sql and "_remote_attach" in sql:
|
||||||
|
result.fetchall.return_value = [("_remote_attach",)]
|
||||||
|
elif "FROM" in sql and "_remote_attach" in sql:
|
||||||
|
result.fetchall.return_value = list(rows_buffer["attach"])
|
||||||
|
elif "duckdb_databases" in sql:
|
||||||
|
result.fetchall.return_value = [] # nothing attached yet
|
||||||
|
else:
|
||||||
|
result.fetchall.return_value = []
|
||||||
|
return result
|
||||||
|
|
||||||
|
conn.execute.side_effect = execute_side_effect
|
||||||
|
|
||||||
|
def set_attach_rows(rows):
|
||||||
|
rows_buffer["attach"] = rows
|
||||||
|
|
||||||
|
return conn, sql_calls, set_attach_rows
|
||||||
|
|
||||||
|
|
||||||
|
def _attach_call_count(sql_calls: list[str]) -> int:
|
||||||
|
"""Number of ATTACH statements actually issued against the conn."""
|
||||||
|
return sum(1 for s in sql_calls if s.lstrip().upper().startswith("ATTACH "))
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtensionAllowlist:
|
||||||
|
def test_refuses_unknown_extension(self, captured_conn, caplog):
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "httpfs", "https://x", "")])
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 0
|
||||||
|
assert any("not in the allowlist" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
def test_allows_keboola(self, captured_conn, monkeypatch):
|
||||||
|
monkeypatch.setenv("KBC_TOKEN", "secret-token-value")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("kbc", "keboola", "https://example.keboola.com", "KBC_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestTokenEnvAllowlist:
|
||||||
|
def test_refuses_session_secret(self, captured_conn, caplog, monkeypatch):
|
||||||
|
# Even if SESSION_SECRET were set in env (it shouldn't be in tests),
|
||||||
|
# the allowlist refuses to read it.
|
||||||
|
monkeypatch.setenv("SESSION_SECRET", "super-secret-jwt-signing-key")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "keboola", "https://x", "SESSION_SECRET")])
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 0
|
||||||
|
assert any("token_env" in r.message and "not in the allowlist" in r.message
|
||||||
|
for r in caplog.records)
|
||||||
|
|
||||||
|
def test_refuses_jwt_secret_key(self, captured_conn, monkeypatch, caplog):
|
||||||
|
monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64)
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "keboola", "https://x", "JWT_SECRET_KEY")])
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 0
|
||||||
|
|
||||||
|
def test_refuses_arbitrary_custom_token_env(self, captured_conn, monkeypatch, caplog):
|
||||||
|
# Names that pass the structural regex but aren't on the allowlist
|
||||||
|
# are still refused — defense against accidental exposure.
|
||||||
|
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 0
|
||||||
|
|
||||||
|
def test_operator_override_replaces_default(
|
||||||
|
self, captured_conn, monkeypatch
|
||||||
|
):
|
||||||
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_RANDOM_TOKEN")
|
||||||
|
monkeypatch.setenv("MY_RANDOM_TOKEN", "value")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 1
|
||||||
|
|
||||||
|
def test_empty_string_override_falls_back_to_default(
|
||||||
|
self, captured_conn, monkeypatch
|
||||||
|
):
|
||||||
|
"""AGNES_REMOTE_ATTACH_TOKEN_ENVS='' should NOT lock everything down —
|
||||||
|
it falls through to the default. (Operator-typo defense.)"""
|
||||||
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "")
|
||||||
|
monkeypatch.setenv("KBC_TOKEN", "value")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("alias1", "keboola", "https://x", "KBC_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 1
|
||||||
|
|
||||||
|
def test_empty_token_env_skips_check(self, captured_conn, monkeypatch):
|
||||||
|
"""token_env='' (the BigQuery-style env-auth path) skips the
|
||||||
|
allowlist check entirely. Verifies the BQ flow keeps working."""
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("bq", "bigquery", "project=x", "")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
assert _attach_call_count(sql_calls) == 1
|
||||||
|
|
||||||
|
def test_structurally_invalid_token_env_refused(
|
||||||
|
self, captured_conn, monkeypatch, caplog
|
||||||
|
):
|
||||||
|
"""Even names on the allowlist via override must pass the structural
|
||||||
|
regex `^[A-Z][A-Z0-9_]{0,63}$`. A name with a space or lowercase
|
||||||
|
letter is refused regardless of allowlist contents."""
|
||||||
|
# Try to add a structurally-invalid name to the allowlist via
|
||||||
|
# override; the regex inside is_token_env_allowed must still refuse.
|
||||||
|
monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "kbc_token,KBC TOKEN,LEGIT_TOKEN")
|
||||||
|
monkeypatch.setenv("LEGIT_TOKEN", "value")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([
|
||||||
|
("a1", "keboola", "https://x", "kbc_token"), # lowercase
|
||||||
|
("a2", "keboola", "https://x", "KBC TOKEN"), # space
|
||||||
|
("a3", "keboola", "https://x", "LEGIT_TOKEN"), # OK
|
||||||
|
])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
# Only a3 should attach; a1 and a2 fail the structural regex even
|
||||||
|
# though the operator listed them in the override.
|
||||||
|
assert _attach_call_count(sql_calls) == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestUrlEscape:
|
||||||
|
def test_single_quote_in_url_is_escaped(self, captured_conn, monkeypatch):
|
||||||
|
monkeypatch.setenv("KBC_TOKEN", "tok")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
# Adversarial URL trying to break out of the literal.
|
||||||
|
adversarial_url = "https://example.com'); DROP DATABASE x; --"
|
||||||
|
set_rows([("kbc", "keboola", adversarial_url, "KBC_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
|
||||||
|
assert len(attach_sqls) == 1
|
||||||
|
# The double-escaped form must be present, never the bare single quote.
|
||||||
|
expected_escaped = escape_sql_string_literal(adversarial_url)
|
||||||
|
assert f"'{expected_escaped}'" in attach_sqls[0]
|
||||||
|
# Sanity: the un-escaped form would have ended the literal early.
|
||||||
|
assert f"'{adversarial_url}'" not in attach_sqls[0]
|
||||||
|
|
||||||
|
def test_token_with_single_quote_is_escaped(self, captured_conn, monkeypatch):
|
||||||
|
# Defense-in-depth: even if a token somehow contained `'`, the
|
||||||
|
# ATTACH literal still parses safely.
|
||||||
|
monkeypatch.setenv("KBC_TOKEN", "abc'def")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")]
|
||||||
|
assert "'abc''def'" in attach_sqls[0]
|
||||||
|
|
||||||
|
|
||||||
|
class TestInstallPathSplit:
|
||||||
|
def test_community_extension_uses_install_from_community(
|
||||||
|
self, captured_conn, monkeypatch
|
||||||
|
):
|
||||||
|
monkeypatch.setenv("KBC_TOKEN", "tok")
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
|
||||||
|
assert any("FROM community" in s for s in install_sqls)
|
||||||
|
|
||||||
|
def test_builtin_extension_uses_load_only(
|
||||||
|
self, captured_conn, monkeypatch
|
||||||
|
):
|
||||||
|
# Add a fictitious built-in via the override mechanism (we have to
|
||||||
|
# patch the module-level set since AGNES_REMOTE_ATTACH_EXTENSIONS
|
||||||
|
# only affects community).
|
||||||
|
from src import orchestrator_security as oms
|
||||||
|
monkeypatch.setattr(oms, "_BUILTIN_EXTENSIONS", frozenset({"sqlite"}))
|
||||||
|
conn, sql_calls, set_rows = captured_conn
|
||||||
|
set_rows([("sql1", "sqlite", "/tmp/db.sqlite", "")])
|
||||||
|
SyncOrchestrator()._attach_remote_extensions(conn, "src1")
|
||||||
|
install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()]
|
||||||
|
# Built-in: no INSTALL, only LOAD
|
||||||
|
assert install_sqls == []
|
||||||
|
load_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("LOAD ")]
|
||||||
|
assert len(load_sqls) == 1
|
||||||
Loading…
Reference in a new issue