# agnes-the-ai-analyst/src/orchestrator_security.py

"""Allowlists and policy for the connector → orchestrator trust boundary.

The orchestrator reads `_remote_attach` rows that connectors write into their
`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those
values. Treating the connector as adversarial (compromised image, supply-chain,
malicious fork) means the orchestrator picks **what** can be installed and
**which** env vars can be referenced — not the connector.
"""

from __future__ import annotations

import logging
import os
import re

# Module-level logger named after this module; `log_effective_policy` writes
# the effective allowlists through it at INFO level.
logger = logging.getLogger(__name__)

# Extension allowlist: the only DuckDB extensions the orchestrator will load
# when a connector asks for one. The list is split in two disjoint sets so the
# install path can tell the cases apart:
#   * `_BUILTIN_EXTENSIONS`   -> only `LOAD` is issued.
#   * `_COMMUNITY_EXTENSIONS` -> `INSTALL ... FROM community` is issued.
_BUILTIN_EXTENSIONS: frozenset[str] = frozenset(())  # none in current OSS
_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset(("keboola", "bigquery"))

# Default allowlist of env var names whose values may be handed to `ATTACH`
# as the auth `TOKEN`. Kept deliberately small: any runtime env var that is
# not named here cannot be exfiltrated to a connector-controlled URL.
# Deployment-specific names are supplied by the operator through
# AGNES_REMOTE_ATTACH_TOKEN_ENVS.
_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset((
    "KBC_TOKEN",
    "KBC_STORAGE_TOKEN",
    "KEBOOLA_STORAGE_TOKEN",
    "GOOGLE_APPLICATION_CREDENTIALS",  # path, not a secret value
))

# Names must additionally match this regex (defense against weird input):
# one uppercase ASCII letter followed by at most 63 uppercase letters, digits
# or underscores. The pattern is anchored with `\Z` rather than `$` because
# `$` also matches just before a trailing newline, which would let a value
# such as "KBC_TOKEN\n" pass the structural check.
_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}\Z")


def _parse_csv_env(name: str) -> set[str]:
    """Read env var ``name`` and split it on commas into a set of tokens.

    Each token has surrounding whitespace stripped; tokens that are empty
    after stripping are dropped. An unset variable yields an empty set.
    """
    tokens: set[str] = set()
    for piece in os.environ.get(name, "").split(","):
        cleaned = piece.strip()
        if cleaned:
            tokens.add(cleaned)
    return tokens


def get_allowed_extensions() -> dict[str, set[str]]:
    """Build the effective extension allowlist, keyed by extension kind.

    The result has exactly two keys, "builtin" and "community", because the
    install path has to know which of the two procedures applies.

    When the operator sets AGNES_REMOTE_ATTACH_EXTENSIONS to a non-empty
    list, that list *replaces* the default community set. The built-in set
    cannot be changed from the environment: a typo there would silently
    disable a working integration with no clear failure mode, and built-ins
    do not pose a supply-chain risk.

    Returns:
        A fresh dict of fresh sets; mutating it does not affect the
        module-level defaults.
    """
    community = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
    if not community:
        # No (or empty) override: fall back to a copy of the defaults.
        community = set(_COMMUNITY_EXTENSIONS)
    return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community}


def is_extension_allowed(extension: str) -> bool:
    """Return True if ``extension`` is on the effective extension allowlist.

    The name is accepted when it appears in either the built-in or the
    community set returned by `get_allowed_extensions`.

    The value comes from connector-written data, so anything that is not a
    string is denied up front. Without this guard an unhashable value (for
    example a list) would raise ``TypeError`` from the set membership test
    instead of being rejected; `is_token_env_allowed` applies the same guard.
    """
    if not isinstance(extension, str):
        return False
    allow = get_allowed_extensions()
    return extension in allow["builtin"] or extension in allow["community"]


def is_builtin_extension(extension: str) -> bool:
    """Return True if ``extension`` is in the built-in extension set.

    Non-string values are reported as not built-in rather than being passed
    to the set membership test, where an unhashable value would raise
    ``TypeError``.
    """
    if not isinstance(extension, str):
        return False
    return extension in get_allowed_extensions()["builtin"]


def get_allowed_token_envs() -> set[str]:
    """Build the effective allowlist of token env var names.

    A non-empty AGNES_REMOTE_ATTACH_TOKEN_ENVS *replaces* the default set
    instead of extending it, so an operator can shrink the list as well as
    grow it. The startup code logs the effective set so a typo is visible.

    Returns:
        The operator-supplied names, or a fresh copy of the defaults when
        no override is configured.
    """
    configured = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")
    if configured:
        return configured
    return set(_DEFAULT_TOKEN_ENVS)


def is_token_env_allowed(token_env: str) -> bool:
    """Decide whether env var ``token_env`` may be read and passed as a TOKEN.

    The name has to clear two gates, in order:

    1. Structural: it is a string and matches `_ENV_NAME_RE`. This gate is
       applied before the allowlist is consulted, so it holds regardless of
       what the allowlist contains.
    2. Membership: it is in the set from `get_allowed_token_envs`.
    """
    if not isinstance(token_env, str):
        return False
    if _ENV_NAME_RE.match(token_env) is None:
        return False
    return token_env in get_allowed_token_envs()


def log_effective_policy() -> None:
    """Emit one INFO record describing the effective remote-attach policy.

    Called from app startup so that operator mistakes are visible in the log.
    Example: AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs set with the intent to ADD
    httpfs actually REPLACES the default, and the logged extension list then
    shows only httpfs, with keboola and bigquery missing.

    The record lists the combined (built-in + community) extensions and the
    token env names, each sorted, together with a flag saying whether the
    corresponding override env var is set to a non-empty value. The function
    only reads the environment and logs, so calling it repeatedly is safe;
    each call emits one record.
    """
    extensions_by_kind = get_allowed_extensions()
    all_extensions = extensions_by_kind["builtin"] | extensions_by_kind["community"]
    token_envs = get_allowed_token_envs()

    # An override only counts when it parses to at least one non-empty token.
    ext_overridden = len(_parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")) > 0
    env_overridden = len(_parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")) > 0

    message = (
        "remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). "
        "Note: env-var overrides REPLACE the default — set both yours and the "
        "defaults if you want to add to them."
    )
    logger.info(
        message,
        sorted(all_extensions),
        ext_overridden,
        sorted(token_envs),
        env_overridden,
    )


def escape_sql_string_literal(value: str) -> str:
    """Escape ``value`` for embedding in a DuckDB single-quoted literal.

    Every single quote is doubled. The caller still has to add the
    surrounding quotes. This mirrors `src/db.py:_attach_extracts`
    (line ~411) so the read-only query path and the orchestrator rebuild
    path use the same escape.
    """
    # Splitting on the quote and re-joining with a doubled quote replaces
    # each occurrence exactly once.
    segments = value.split("'")
    return "''".join(segments)