diff --git a/CHANGELOG.md b/CHANGELOG.md index 946b8e5..611d016 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,35 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C `[A-Za-z0-9_-]` (dot deliberately excluded to defeat `..` survival), clips length to 64 chars, and routes the final filename through `safe_join_under`. +- **Security (CRITICAL)**: hardened the connector → orchestrator trust + boundary on BOTH the rebuild path + (`src/orchestrator.py::_attach_remote_extensions`) AND the read-only + query path (`src/db.py::_reattach_remote_extensions`, called by + `get_analytics_db_readonly()` on every request) — issue #81 Group A. + Three fixes: (1) DuckDB extensions referenced by `_remote_attach` are + matched against a hard allowlist (default: `keboola, bigquery`; + override via `AGNES_REMOTE_ATTACH_EXTENSIONS`). Install path splits + built-in (LOAD only) from community (`INSTALL FROM community; LOAD` + on rebuild path; LOAD only on the read-only query path which must + not touch the network). (2) `token_env` names are matched against a + hard allowlist (default: `KBC_TOKEN`, `KBC_STORAGE_TOKEN`, + `KEBOOLA_STORAGE_TOKEN`, `GOOGLE_APPLICATION_CREDENTIALS`; override + via `AGNES_REMOTE_ATTACH_TOKEN_ENVS`). Names must additionally match + `^[A-Z][A-Z0-9_]{0,63}$`. A malicious connector cannot ask the + orchestrator to read `JWT_SECRET_KEY` / `SESSION_SECRET` / + `OPENAI_API_KEY` and exfiltrate them via `ATTACH ... TOKEN`. + (3) The URL passed to `ATTACH` is now single-quote-escaped on both + paths. Also fixed a `table_schema` vs `table_catalog` mismatch that + silently no-op'd `_attach_remote_extensions` for every connector + (the rebuild-path hardening would have been moot in production + without this fix). New module `src/orchestrator_security.py` + centralises the policy and exposes `log_effective_policy()`, called + from app startup so an operator's typo in + `AGNES_REMOTE_ATTACH_EXTENSIONS` (which **replaces** the default, + not extends it — a setting of `httpfs` would silently lock out + `keboola, bigquery`) is visible at boot rather than at the next + failed attach. See + `docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`. ### Removed diff --git a/app/main.py b/app/main.py index da64a54..9c0eb87 100644 --- a/app/main.py +++ b/app/main.py @@ -105,6 +105,14 @@ logger = logging.getLogger(__name__) @asynccontextmanager async def lifespan(app): + # Issue #81 Group A — log the effective remote_attach allowlist at + # startup so an operator's typo in AGNES_REMOTE_ATTACH_EXTENSIONS + # (which REPLACES, not extends, the default) is visible. + try: + from src.orchestrator_security import log_effective_policy + log_effective_policy() + except Exception: + pass # never block startup on a logging convenience yield from src.db import close_system_db close_system_db() diff --git a/src/db.py b/src/db.py index f89e0de..1210cbb 100644 --- a/src/db.py +++ b/src/db.py @@ -395,6 +395,18 @@ def _reattach_remote_extensions( except Exception: pass + # Issue #81 Group A — apply the same allowlist policy on the + # query path that the orchestrator's rebuild path uses. Without + # this, a malicious connector's _remote_attach row exfiltrates + # JWT_SECRET_KEY / SESSION_SECRET / OPENAI_API_KEY on every + # query, defeating the rebuild-path hardening entirely. + from src.orchestrator_security import ( + escape_sql_string_literal, + is_builtin_extension, + is_extension_allowed, + is_token_env_allowed, + ) + for alias, extension, url, token_env in rows: if not _SAFE_IDENTIFIER.match(alias or ""): logger.debug("Skipping unsafe remote_attach alias: %r", alias) @@ -402,15 +414,40 @@ def _reattach_remote_extensions( if not _SAFE_IDENTIFIER.match(extension or ""): logger.debug("Skipping unsafe remote_attach extension: %r", extension) continue + if not is_extension_allowed(extension): + logger.error( + "query-path remote_attach: extension %r not in allowlist; " + "refusing to LOAD/ATTACH for source %s. Override via " + "AGNES_REMOTE_ATTACH_EXTENSIONS if intended.", + extension, alias, + ) + continue + if token_env and not is_token_env_allowed(token_env): + logger.error( + "query-path remote_attach: token_env %r not in allowlist; " + "refusing for source %s. Override via " + "AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.", + token_env, alias, + ) + continue if alias in attached_dbs: logger.debug("Remote source %s already attached, skipping", alias) continue try: + # LOAD only on the read-only query path — no INSTALL. + # Per the function docstring, this path runs on every + # query request and must not touch the network. The + # rebuild path (orchestrator) is responsible for INSTALL; + # by the time a query lands here, any community extension + # we'll see is already on disk. If LOAD fails because the + # extension isn't installed, log + skip (caller will see + # missing remote views and the operator will trigger a + # rebuild). conn.execute(f"LOAD {extension};") token = os.environ.get(token_env, "") if token_env else "" - safe_url = url.replace("'", "''") + safe_url = escape_sql_string_literal(url) if token: - escaped_token = token.replace("'", "''") + escaped_token = escape_sql_string_literal(token) conn.execute( f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')" ) diff --git a/src/orchestrator.py b/src/orchestrator.py index 37b0d75..833145a 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -27,6 +27,13 @@ from typing import Dict, List import duckdb +from src.orchestrator_security import ( + escape_sql_string_literal, + is_builtin_extension, + is_extension_allowed, + is_token_env_allowed, +) + logger = logging.getLogger(__name__) _rebuild_lock = threading.Lock() @@ -202,9 +209,17 @@ class SyncOrchestrator: ) -> None: """Read _remote_attach from extract.duckdb and ATTACH external sources.""" try: + # DuckDB attached-DB layout: ATTACH 'extract.duckdb' AS + # exposes information_schema.tables with table_catalog= + # and table_schema='main'. The earlier draft used + # table_schema= here, which never matched and made + # _attach_remote_extensions a silent no-op for every + # connector — defeating the entire Group A hardening in + # production. db.py:_reattach_remote_extensions already used + # the correct column; this aligns the rebuild path. tables = conn.execute( f"SELECT table_name FROM information_schema.tables " - f"WHERE table_schema='{source_name}' AND table_name='_remote_attach'" + f"WHERE table_catalog='{source_name}' AND table_name='_remote_attach'" ).fetchall() if not tables: return @@ -216,11 +231,34 @@ class SyncOrchestrator: ).fetchall() for alias, extension, url, token_env in rows: + # Identifier sanity (defense against weird input). The hard + # security boundary is the allowlist a few lines down. if not _validate_identifier(alias, "remote_attach alias"): continue if not _validate_identifier(extension, "remote_attach extension"): continue + # #81 Group A.1 — extension allowlist. The connector does NOT + # get to pick what extensions the orchestrator loads. + if not is_extension_allowed(extension): + logger.error( + "Remote attach %s: extension %r is not in the allowlist; refusing. " + "Override via AGNES_REMOTE_ATTACH_EXTENSIONS if intended.", + alias, extension, + ) + continue + + # #81 Group A.2 — token-env hard allowlist. Refuses well-known + # runtime secrets (JWT_SECRET_KEY, OPENAI_API_KEY, …) that a + # malicious connector might ask us to send to its server. + if token_env and not is_token_env_allowed(token_env): + logger.error( + "Remote attach %s: token_env %r is not in the allowlist; refusing. " + "Override via AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.", + alias, token_env, + ) + continue + token = os.environ.get(token_env, "") if token_env else "" if token_env and not token: logger.warning( @@ -239,16 +277,22 @@ class SyncOrchestrator: logger.debug("Remote source %s already attached", alias) continue - conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};") + # #81 Group A.1 — built-ins LOAD only; community needs INSTALL+LOAD. + if is_builtin_extension(extension): + conn.execute(f"LOAD {extension};") + else: + conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};") + # #81 Group A.3 — escape URL single-quotes (mirrors src/db.py). + safe_url = escape_sql_string_literal(url) if token: - escaped_token = token.replace("'", "''") + escaped_token = escape_sql_string_literal(token) conn.execute( - f"ATTACH '{url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')" + f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')" ) else: # Extensions like BigQuery handle auth via env (e.g. GOOGLE_APPLICATION_CREDENTIALS) conn.execute( - f"ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)" + f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)" ) logger.info("Attached remote source %s via %s extension", alias, extension) except Exception as e: diff --git a/src/orchestrator_security.py b/src/orchestrator_security.py new file mode 100644 index 0000000..66369ba --- /dev/null +++ b/src/orchestrator_security.py @@ -0,0 +1,128 @@ +"""Allowlists and policy for the connector → orchestrator trust boundary. + +The orchestrator reads `_remote_attach` rows that connectors write into their +`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those +values. Treating the connector as adversarial (compromised image, supply-chain, +malicious fork) means the orchestrator picks **what** can be installed and +**which** env vars can be referenced — not the connector. See +`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md` for the full +threat model. +""" + +from __future__ import annotations + +import logging +import os +import re + +logger = logging.getLogger(__name__) + +# DuckDB extensions the orchestrator is willing to load on behalf of a +# connector. Built-in extensions go in `_BUILTIN_EXTENSIONS`; community +# extensions go in `_COMMUNITY_EXTENSIONS`. The two sets are disjoint and +# tell the install path whether to issue `INSTALL ... FROM community` or +# only `LOAD`. +_BUILTIN_EXTENSIONS: frozenset[str] = frozenset() # none in current OSS +_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset({ + "keboola", + "bigquery", +}) + +# Env vars whose values may be passed as the auth `TOKEN` in `ATTACH`. The +# default is intentionally tight — every name in the runtime env that is not +# on this list cannot be exfiltrated to a connector-controlled URL. +# Operators add deployment-specific names via AGNES_REMOTE_ATTACH_TOKEN_ENVS. +_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset({ + "KBC_TOKEN", + "KBC_STORAGE_TOKEN", + "KEBOOLA_STORAGE_TOKEN", + "GOOGLE_APPLICATION_CREDENTIALS", # path, not a secret value +}) + +# Names must additionally match this regex (defense against weird input). +_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}$") + + +def _parse_csv_env(name: str) -> set[str]: + """Parse a comma-separated env var into a stripped set of non-empty tokens.""" + raw = os.environ.get(name, "") + return {t.strip() for t in raw.split(",") if t.strip()} + + +def get_allowed_extensions() -> dict[str, set[str]]: + """Return the effective extension allowlist as a dict of {kind: set}. + + `kind` is "builtin" or "community" — the install path needs to know + which to use. Operator override AGNES_REMOTE_ATTACH_EXTENSIONS replaces + the default community set; built-ins are not configurable from env (a + typo there would silently disable a working integration with no clear + failure mode, and built-ins do not pose a supply-chain risk). + """ + override = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS") + community = override if override else set(_COMMUNITY_EXTENSIONS) + return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community} + + +def is_extension_allowed(extension: str) -> bool: + allow = get_allowed_extensions() + return extension in allow["builtin"] or extension in allow["community"] + + +def is_builtin_extension(extension: str) -> bool: + return extension in get_allowed_extensions()["builtin"] + + +def get_allowed_token_envs() -> set[str]: + """Return the effective token-env allowlist. + + Operator override AGNES_REMOTE_ATTACH_TOKEN_ENVS *replaces* the default + set (so an operator can shrink it as well as expand it). The startup + code logs the effective set so a typo is visible. + """ + override = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS") + return override if override else set(_DEFAULT_TOKEN_ENVS) + + +def is_token_env_allowed(token_env: str) -> bool: + """Return True if ``token_env`` may be read and passed as a TOKEN. + + Two checks: structural (`^[A-Z][A-Z0-9_]{0,63}$`) and membership in the + allowlist. The structural check refuses things that aren't a valid env + var name regardless of allowlist contents. + """ + if not isinstance(token_env, str) or not _ENV_NAME_RE.match(token_env): + return False + return token_env in get_allowed_token_envs() + + +def log_effective_policy() -> None: + """Log the effective extension + token-env allowlists at INFO once. + + Called from app startup. Makes operator typos visible — if + AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs is set with the intent to ADD + httpfs (but the override REPLACES the default), the operator sees + 'effective extension allowlist: {httpfs}' and notices keboola and + bigquery are missing. Idempotent — safe to call multiple times. + """ + ext = get_allowed_extensions() + envs = get_allowed_token_envs() + has_ext_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")) + has_env_override = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")) + logger.info( + "remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). " + "Note: env-var overrides REPLACE the default — set both yours and the " + "defaults if you want to add to them.", + sorted(ext["community"] | ext["builtin"]), + has_ext_override, + sorted(envs), + has_env_override, + ) + + +def escape_sql_string_literal(value: str) -> str: + """Double single-quotes for safe use inside DuckDB single-quoted literals. + + Mirrors `src/db.py:_attach_extracts` (line ~411) so the read-only query + path and the orchestrator rebuild path use the same escape. + """ + return value.replace("'", "''") diff --git a/tests/test_db_remote_attach_security.py b/tests/test_db_remote_attach_security.py new file mode 100644 index 0000000..7f29d30 --- /dev/null +++ b/tests/test_db_remote_attach_security.py @@ -0,0 +1,142 @@ +"""Issue #81 Group A — query-path (db.py) trust-boundary tests. + +Mirror of `tests/test_orchestrator_remote_attach_security.py` for +`src/db.py:_reattach_remote_extensions`. The query path runs on every +`/api/query` request via `get_analytics_db_readonly()`; it must enforce +the same allowlists as the rebuild path or the security guarantee is +hollow. + +Setup is real-DuckDB (not mock-conn) because db.py introspects via +`information_schema.tables`/`duckdb_databases()` rather than just +executing whatever SQL we hand it. We feed it a real extract.duckdb +with a programmable `_remote_attach` row, ATTACH it, then call the +function and assert which `LOAD/ATTACH` SQL fired (or didn't) by +sniffing connection state. +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import duckdb +import pytest + +from src.db import _reattach_remote_extensions + + +def _make_extract_with_remote_attach( + path: Path, alias: str, extension: str, url: str, token_env: str +) -> None: + """Create a tiny extract.duckdb whose _remote_attach table has one row.""" + path.parent.mkdir(parents=True, exist_ok=True) + if path.exists(): + path.unlink() + wal = path.with_suffix(".duckdb.wal") + if wal.exists(): + wal.unlink() + c = duckdb.connect(str(path)) + c.execute( + "CREATE TABLE _remote_attach (" + "alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)" + ) + c.execute( + "INSERT INTO _remote_attach VALUES (?, ?, ?, ?)", + [alias, extension, url, token_env], + ) + c.close() + + +def _attach_and_call(extracts_dir: Path, source_name: str): + """ATTACH the source's extract.duckdb to a fresh memory conn, run the + function, return the conn (so the test can introspect attached_dbs).""" + conn = duckdb.connect() + conn.execute( + f"ATTACH '{extracts_dir / source_name / 'extract.duckdb'}' " + f"AS {source_name} (READ_ONLY)" + ) + _reattach_remote_extensions(conn, extracts_dir) + return conn + + +def _attached(conn) -> set[str]: + return {r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()} + + +class TestQueryPathExtensionAllowlist: + def test_refuses_unknown_extension(self, tmp_path, caplog): + """A connector that requested `httpfs` (not on allowlist) is + refused — `httpfs` does not appear among attached databases.""" + _make_extract_with_remote_attach( + tmp_path / "extracts" / "src1" / "extract.duckdb", + alias="ext_alias", extension="httpfs", + url="https://x", token_env="", + ) + with caplog.at_level(logging.ERROR): + conn = _attach_and_call(tmp_path / "extracts", "src1") + assert "ext_alias" not in _attached(conn) + assert any("not in allowlist" in r.message for r in caplog.records) + + +class TestQueryPathTokenEnvAllowlist: + def test_refuses_session_secret(self, tmp_path, monkeypatch, caplog): + monkeypatch.setenv("SESSION_SECRET", "shouldnt-leak") + _make_extract_with_remote_attach( + tmp_path / "extracts" / "src1" / "extract.duckdb", + alias="kbc", extension="keboola", + url="https://x", token_env="SESSION_SECRET", + ) + with caplog.at_level(logging.ERROR): + conn = _attach_and_call(tmp_path / "extracts", "src1") + assert "kbc" not in _attached(conn) + assert any( + "token_env" in r.message and "not in allowlist" in r.message + for r in caplog.records + ) + + def test_refuses_jwt_secret_key(self, tmp_path, monkeypatch, caplog): + monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64) + _make_extract_with_remote_attach( + tmp_path / "extracts" / "src1" / "extract.duckdb", + alias="kbc", extension="keboola", + url="https://x", token_env="JWT_SECRET_KEY", + ) + with caplog.at_level(logging.ERROR): + conn = _attach_and_call(tmp_path / "extracts", "src1") + assert "kbc" not in _attached(conn) + + +class TestQueryPathInstallStrategy: + def test_no_install_on_query_path(self, tmp_path, monkeypatch, caplog): + """The query path must NOT issue INSTALL FROM community — it runs + on every read request and shouldn't touch the network. LOAD + without prior INSTALL fails for missing extensions, which is the + documented behaviour (operator is told to trigger a rebuild).""" + # We can't easily verify "no INSTALL" without intercepting SQL; + # instead, verify the query path doesn't hit the community + # registry by setting an extension that's NOT on the default + # allowlist nor pre-installed. The function should refuse + # before any LOAD attempt. + _make_extract_with_remote_attach( + tmp_path / "extracts" / "src1" / "extract.duckdb", + alias="bad", extension="some_other_community_ext", + url="https://x", token_env="", + ) + conn = _attach_and_call(tmp_path / "extracts", "src1") + assert "bad" not in _attached(conn) + + +class TestQueryPathOverride: + def test_override_replaces_default(self, tmp_path, monkeypatch): + """Setting AGNES_REMOTE_ATTACH_TOKEN_ENVS=MY_TOKEN replaces the + default — KBC_TOKEN no longer accepted. (Operator-typo defense + contract; mirrored from rebuild-path tests.)""" + monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_TOKEN") + monkeypatch.setenv("MY_TOKEN", "value") + _make_extract_with_remote_attach( + tmp_path / "extracts" / "src1" / "extract.duckdb", + alias="kbc", extension="keboola", + url="https://x", token_env="KBC_TOKEN", # NOT in the override set + ) + conn = _attach_and_call(tmp_path / "extracts", "src1") + assert "kbc" not in _attached(conn) diff --git a/tests/test_orchestrator_remote_attach_security.py b/tests/test_orchestrator_remote_attach_security.py new file mode 100644 index 0000000..9415d38 --- /dev/null +++ b/tests/test_orchestrator_remote_attach_security.py @@ -0,0 +1,219 @@ +"""Issue #81 Group A — connector → orchestrator trust-boundary tests. + +The orchestrator reads `_remote_attach` rows that connectors write into their +extract.duckdb, then runs `INSTALL`/`LOAD`/`ATTACH` SQL. This file exercises +each of the C1 hardening fixes: + +- A.1 extension allowlist (refuse non-allowlisted extension) +- A.2 token-env hard allowlist (refuse well-known runtime secrets) +- A.3 URL single-quote escape (no injection through the URL literal) +- Built-in vs community install path split + +Each test writes a malicious `_remote_attach` row into a fixture +extract.duckdb and asserts that the orchestrator either refuses (no ATTACH +call) or issues a safely-escaped one. We capture SQL strings via a fake +DuckDB connection so the assertions don't depend on any real extension being +installed. +""" + +import logging +from unittest.mock import MagicMock + +import duckdb +import pytest + +from src.orchestrator import SyncOrchestrator +from src.orchestrator_security import escape_sql_string_literal + + +@pytest.fixture +def captured_conn(monkeypatch, tmp_path): + """A duckdb.Connection-like mock that records every execute() string.""" + sql_calls: list[str] = [] + conn = MagicMock() + + # information_schema.tables → return _remote_attach exists + # _remote_attach rows are programmed per-test via attach_rows() + rows_buffer = {"attach": []} + + def execute_side_effect(sql, *args, **kwargs): + sql_calls.append(sql) + result = MagicMock() + # information_schema query + if "information_schema.tables" in sql and "_remote_attach" in sql: + result.fetchall.return_value = [("_remote_attach",)] + elif "FROM" in sql and "_remote_attach" in sql: + result.fetchall.return_value = list(rows_buffer["attach"]) + elif "duckdb_databases" in sql: + result.fetchall.return_value = [] # nothing attached yet + else: + result.fetchall.return_value = [] + return result + + conn.execute.side_effect = execute_side_effect + + def set_attach_rows(rows): + rows_buffer["attach"] = rows + + return conn, sql_calls, set_attach_rows + + +def _attach_call_count(sql_calls: list[str]) -> int: + """Number of ATTACH statements actually issued against the conn.""" + return sum(1 for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")) + + +class TestExtensionAllowlist: + def test_refuses_unknown_extension(self, captured_conn, caplog): + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "httpfs", "https://x", "")]) + with caplog.at_level(logging.ERROR): + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 0 + assert any("not in the allowlist" in r.message for r in caplog.records) + + def test_allows_keboola(self, captured_conn, monkeypatch): + monkeypatch.setenv("KBC_TOKEN", "secret-token-value") + conn, sql_calls, set_rows = captured_conn + set_rows([("kbc", "keboola", "https://example.keboola.com", "KBC_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 1 + + +class TestTokenEnvAllowlist: + def test_refuses_session_secret(self, captured_conn, caplog, monkeypatch): + # Even if SESSION_SECRET were set in env (it shouldn't be in tests), + # the allowlist refuses to read it. + monkeypatch.setenv("SESSION_SECRET", "super-secret-jwt-signing-key") + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "keboola", "https://x", "SESSION_SECRET")]) + with caplog.at_level(logging.ERROR): + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 0 + assert any("token_env" in r.message and "not in the allowlist" in r.message + for r in caplog.records) + + def test_refuses_jwt_secret_key(self, captured_conn, monkeypatch, caplog): + monkeypatch.setenv("JWT_SECRET_KEY", "x" * 64) + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "keboola", "https://x", "JWT_SECRET_KEY")]) + with caplog.at_level(logging.ERROR): + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 0 + + def test_refuses_arbitrary_custom_token_env(self, captured_conn, monkeypatch, caplog): + # Names that pass the structural regex but aren't on the allowlist + # are still refused — defense against accidental exposure. + monkeypatch.setenv("MY_RANDOM_TOKEN", "value") + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")]) + with caplog.at_level(logging.ERROR): + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 0 + + def test_operator_override_replaces_default( + self, captured_conn, monkeypatch + ): + monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "MY_RANDOM_TOKEN") + monkeypatch.setenv("MY_RANDOM_TOKEN", "value") + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "keboola", "https://x", "MY_RANDOM_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 1 + + def test_empty_string_override_falls_back_to_default( + self, captured_conn, monkeypatch + ): + """AGNES_REMOTE_ATTACH_TOKEN_ENVS='' should NOT lock everything down — + it falls through to the default. (Operator-typo defense.)""" + monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "") + monkeypatch.setenv("KBC_TOKEN", "value") + conn, sql_calls, set_rows = captured_conn + set_rows([("alias1", "keboola", "https://x", "KBC_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 1 + + def test_empty_token_env_skips_check(self, captured_conn, monkeypatch): + """token_env='' (the BigQuery-style env-auth path) skips the + allowlist check entirely. Verifies the BQ flow keeps working.""" + conn, sql_calls, set_rows = captured_conn + set_rows([("bq", "bigquery", "project=x", "")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + assert _attach_call_count(sql_calls) == 1 + + def test_structurally_invalid_token_env_refused( + self, captured_conn, monkeypatch, caplog + ): + """Even names on the allowlist via override must pass the structural + regex `^[A-Z][A-Z0-9_]{0,63}$`. A name with a space or lowercase + letter is refused regardless of allowlist contents.""" + # Try to add a structurally-invalid name to the allowlist via + # override; the regex inside is_token_env_allowed must still refuse. + monkeypatch.setenv("AGNES_REMOTE_ATTACH_TOKEN_ENVS", "kbc_token,KBC TOKEN,LEGIT_TOKEN") + monkeypatch.setenv("LEGIT_TOKEN", "value") + conn, sql_calls, set_rows = captured_conn + set_rows([ + ("a1", "keboola", "https://x", "kbc_token"), # lowercase + ("a2", "keboola", "https://x", "KBC TOKEN"), # space + ("a3", "keboola", "https://x", "LEGIT_TOKEN"), # OK + ]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + # Only a3 should attach; a1 and a2 fail the structural regex even + # though the operator listed them in the override. + assert _attach_call_count(sql_calls) == 1 + + +class TestUrlEscape: + def test_single_quote_in_url_is_escaped(self, captured_conn, monkeypatch): + monkeypatch.setenv("KBC_TOKEN", "tok") + conn, sql_calls, set_rows = captured_conn + # Adversarial URL trying to break out of the literal. + adversarial_url = "https://example.com'); DROP DATABASE x; --" + set_rows([("kbc", "keboola", adversarial_url, "KBC_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")] + assert len(attach_sqls) == 1 + # The double-escaped form must be present, never the bare single quote. + expected_escaped = escape_sql_string_literal(adversarial_url) + assert f"'{expected_escaped}'" in attach_sqls[0] + # Sanity: the un-escaped form would have ended the literal early. + assert f"'{adversarial_url}'" not in attach_sqls[0] + + def test_token_with_single_quote_is_escaped(self, captured_conn, monkeypatch): + # Defense-in-depth: even if a token somehow contained `'`, the + # ATTACH literal still parses safely. + monkeypatch.setenv("KBC_TOKEN", "abc'def") + conn, sql_calls, set_rows = captured_conn + set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + attach_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("ATTACH ")] + assert "'abc''def'" in attach_sqls[0] + + +class TestInstallPathSplit: + def test_community_extension_uses_install_from_community( + self, captured_conn, monkeypatch + ): + monkeypatch.setenv("KBC_TOKEN", "tok") + conn, sql_calls, set_rows = captured_conn + set_rows([("kbc", "keboola", "https://x", "KBC_TOKEN")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()] + assert any("FROM community" in s for s in install_sqls) + + def test_builtin_extension_uses_load_only( + self, captured_conn, monkeypatch + ): + # Add a fictitious built-in via the override mechanism (we have to + # patch the module-level set since AGNES_REMOTE_ATTACH_EXTENSIONS + # only affects community). + from src import orchestrator_security as oms + monkeypatch.setattr(oms, "_BUILTIN_EXTENSIONS", frozenset({"sqlite"})) + conn, sql_calls, set_rows = captured_conn + set_rows([("sql1", "sqlite", "/tmp/db.sqlite", "")]) + SyncOrchestrator()._attach_remote_extensions(conn, "src1") + install_sqls = [s for s in sql_calls if "INSTALL" in s.upper()] + # Built-in: no INSTALL, only LOAD + assert install_sqls == [] + load_sqls = [s for s in sql_calls if s.lstrip().upper().startswith("LOAD ")] + assert len(load_sqls) == 1