agnes-the-ai-analyst/src/orchestrator_security.py
ZdenekSrotyr a48524509a
docs: consolidate and de-clutter the documentation tree (#306)
CLAUDE.md rewritten (708 -> ~320 lines): four overlapping release
sections collapsed to one, stale v1->v35 schema history dropped (it
lives in CHANGELOG), marketplace endpoint internals and verbose
process sections moved out or tightened.

New focused docs:
- docs/RELEASING.md - release process, deploy workflows, CI quirks
  (RELEASE_TEMPLATE.md folded in as an appendix)
- docs/marketplace.md - marketplace ingestion + re-serving internals
- docs/README.md - documentation index by audience, linked from
  README.md and CLAUDE.md

Archived under docs/archive/: docs/superpowers/ (52 historical
planning artifacts), HACKATHON.md, pd-ps-comments.md,
security-audit-2026-04.md, future/NOTIFICATIONS.md.

Removed the docs/auto-install.md stub. Fixed dangling links in
connectors/jira/README.md and dev_docs/README.md, repointed
code/doc references to archived paths.
2026-05-14 18:54:22 +00:00

126 lines
5 KiB
Python

"""Allowlists and policy for the connector → orchestrator trust boundary.
The orchestrator reads `_remote_attach` rows that connectors write into their
`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those
values. Treating the connector as adversarial (compromised image, supply-chain,
malicious fork) means the orchestrator picks **what** can be installed and
**which** env vars can be referenced — not the connector.
"""
from __future__ import annotations
import logging
import os
import re
logger = logging.getLogger(__name__)
# Allowlist of DuckDB extensions the orchestrator will load when a connector
# asks for one. The names are split across two disjoint sets because the
# install path treats them differently:
#   * `_BUILTIN_EXTENSIONS`   -> only `LOAD`
#   * `_COMMUNITY_EXTENSIONS` -> `INSTALL ... FROM community`
_BUILTIN_EXTENSIONS: frozenset[str] = frozenset()  # none in current OSS
_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset(("keboola", "bigquery"))
# Names of environment variables whose values may be handed to `ATTACH` as the
# auth `TOKEN`. Kept deliberately small: any runtime env var that is NOT named
# here cannot be exfiltrated to a connector-controlled URL.
# Deployment-specific names are supplied by operators through
# AGNES_REMOTE_ATTACH_TOKEN_ENVS.
_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset(
    (
        "KBC_TOKEN",
        "KBC_STORAGE_TOKEN",
        "KEBOOLA_STORAGE_TOKEN",
        "GOOGLE_APPLICATION_CREDENTIALS",  # path, not a secret value
    )
)
# Names must additionally match this regex (defense against weird input):
# an uppercase letter followed by up to 63 uppercase letters, digits or
# underscores. The pattern ends with `\Z` rather than `$` because `$` also
# matches just before a trailing newline, which would let "KBC_TOKEN\n" pass
# the structural check.
_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}\Z")
def _parse_csv_env(name: str) -> set[str]:
    """Read env var ``name`` and split it on commas into a set of tokens.

    Each piece has surrounding whitespace stripped; pieces that are empty
    after stripping are dropped. An unset variable yields an empty set.
    """
    tokens: set[str] = set()
    for piece in os.environ.get(name, "").split(","):
        cleaned = piece.strip()
        if cleaned:
            tokens.add(cleaned)
    return tokens
def get_allowed_extensions() -> dict[str, set[str]]:
    """Build the effective extension allowlist, keyed by extension kind.

    The result has two keys, "builtin" and "community", so the install path
    can tell which statement to use for a given extension.

    A non-empty AGNES_REMOTE_ATTACH_EXTENSIONS value *replaces* the default
    community set. Built-ins cannot be configured from the environment: a
    typo there would silently disable a working integration with no clear
    failure mode, and built-ins do not pose a supply-chain risk.
    """
    operator_set = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
    if operator_set:
        community = operator_set
    else:
        # No (or empty) override: fall back to a mutable copy of the default.
        community = set(_COMMUNITY_EXTENSIONS)
    return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community}
def is_extension_allowed(extension: str) -> bool:
    """Return True if ``extension`` is in the built-in or community allowlist."""
    allowed = get_allowed_extensions()
    return any(extension in allowed[kind] for kind in ("builtin", "community"))
def is_builtin_extension(extension: str) -> bool:
    """Return True if ``extension`` is in the built-in extension allowlist."""
    builtin_names = get_allowed_extensions()["builtin"]
    return extension in builtin_names
def get_allowed_token_envs() -> set[str]:
    """Build the effective allowlist of token env-var names.

    A non-empty AGNES_REMOTE_ATTACH_TOKEN_ENVS value *replaces* the default
    set rather than extending it, so an operator can shrink the list as well
    as grow it. When the override is unset or empty, a mutable copy of the
    defaults is returned.
    """
    operator_set = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")
    if not operator_set:
        return set(_DEFAULT_TOKEN_ENVS)
    return operator_set
def is_token_env_allowed(token_env: str) -> bool:
    """Return True if ``token_env`` may be read and passed as a TOKEN.

    Two checks are applied in order:

    1. Structural: the value must be a ``str`` that matches ``_ENV_NAME_RE``
       in full. Anything that is not a well-formed env var name is refused
       regardless of allowlist contents.
    2. Membership: the name must be in the effective token-env allowlist.
    """
    if not isinstance(token_env, str):
        return False
    # fullmatch, not match: a `$`-anchored pattern used with match() also
    # accepts a trailing newline ("KBC_TOKEN\n"), which is not a valid name.
    if _ENV_NAME_RE.fullmatch(token_env) is None:
        return False
    return token_env in get_allowed_token_envs()
def log_effective_policy() -> None:
    """Emit one INFO record describing the effective allowlists.

    Meant to be called during app startup so operator typos are visible.
    Example: AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs set with the intent to ADD
    httpfs actually REPLACES the default, and the logged extension list then
    shows only httpfs, with keboola and bigquery missing.

    Calling it again is safe; it writes another log record each time and
    changes no state.
    """
    ext_overridden = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS"))
    env_overridden = bool(_parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS"))

    allowed_ext = get_allowed_extensions()
    # Both kinds are merged into a single sorted list for the log line.
    extension_names = sorted(allowed_ext["community"] | allowed_ext["builtin"])
    token_env_names = sorted(get_allowed_token_envs())

    logger.info(
        "remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). "
        "Note: env-var overrides REPLACE the default — set both yours and the "
        "defaults if you want to add to them.",
        extension_names,
        ext_overridden,
        token_env_names,
        env_overridden,
    )
def escape_sql_string_literal(value: str) -> str:
    """Escape ``value`` for embedding in a DuckDB single-quoted literal.

    Every single quote is doubled; no surrounding quotes are added. This is
    the same escape noted for `src/db.py:_attach_extracts` (line ~411), so
    the read-only query path and the orchestrator rebuild path agree.
    """
    # Splitting on the quote and re-joining with two quotes doubles each one.
    segments = value.split("'")
    return "''".join(segments)