CLAUDE.md rewritten (708 -> ~320 lines): four overlapping release sections collapsed to one, stale v1->v35 schema history dropped (it lives in CHANGELOG), marketplace endpoint internals and verbose process sections moved out or tightened. New focused docs: - docs/RELEASING.md - release process, deploy workflows, CI quirks (RELEASE_TEMPLATE.md folded in as an appendix) - docs/marketplace.md - marketplace ingestion + re-serving internals - docs/README.md - documentation index by audience, linked from README.md and CLAUDE.md Archived under docs/archive/: docs/superpowers/ (52 historical planning artifacts), HACKATHON.md, pd-ps-comments.md, security-audit-2026-04.md, future/NOTIFICATIONS.md. Removed the docs/auto-install.md stub. Fixed dangling links in connectors/jira/README.md and dev_docs/README.md, repointed code/doc references to archived paths.
126 lines
5 KiB
Python
126 lines
5 KiB
Python
"""Allowlists and policy for the connector → orchestrator trust boundary.
|
|
|
|
The orchestrator reads `_remote_attach` rows that connectors write into their
|
|
`extract.duckdb`, then calls `INSTALL`, `LOAD`, and `ATTACH` based on those
|
|
values. Treating the connector as adversarial (compromised image, supply-chain,
|
|
malicious fork) means the orchestrator picks **what** can be installed and
|
|
**which** env vars can be referenced — not the connector.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# DuckDB extensions the orchestrator is willing to load on behalf of a
|
|
# connector. Built-in extensions go in `_BUILTIN_EXTENSIONS`; community
|
|
# extensions go in `_COMMUNITY_EXTENSIONS`. The two sets are disjoint and
|
|
# tell the install path whether to issue `INSTALL ... FROM community` or
|
|
# only `LOAD`.
|
|
_BUILTIN_EXTENSIONS: frozenset[str] = frozenset() # none in current OSS
|
|
_COMMUNITY_EXTENSIONS: frozenset[str] = frozenset({
|
|
"keboola",
|
|
"bigquery",
|
|
})
|
|
|
|
# Env vars whose values may be passed as the auth `TOKEN` in `ATTACH`. The
|
|
# default is intentionally tight — every name in the runtime env that is not
|
|
# on this list cannot be exfiltrated to a connector-controlled URL.
|
|
# Operators add deployment-specific names via AGNES_REMOTE_ATTACH_TOKEN_ENVS.
|
|
_DEFAULT_TOKEN_ENVS: frozenset[str] = frozenset({
|
|
"KBC_TOKEN",
|
|
"KBC_STORAGE_TOKEN",
|
|
"KEBOOLA_STORAGE_TOKEN",
|
|
"GOOGLE_APPLICATION_CREDENTIALS", # path, not a secret value
|
|
})
|
|
|
|
# Names must additionally match this regex (defense against weird input).
|
|
_ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]{0,63}$")
|
|
|
|
|
|
def _parse_csv_env(name: str) -> set[str]:
|
|
"""Parse a comma-separated env var into a stripped set of non-empty tokens."""
|
|
raw = os.environ.get(name, "")
|
|
return {t.strip() for t in raw.split(",") if t.strip()}
|
|
|
|
|
|
def get_allowed_extensions() -> dict[str, set[str]]:
    """Build the effective extension allowlist, keyed by extension kind.

    The result has exactly two keys, "builtin" and "community", because the
    install path needs to know which kind an extension is.

    A non-empty AGNES_REMOTE_ATTACH_EXTENSIONS env var replaces the default
    community set wholesale. The built-in set always comes from
    `_BUILTIN_EXTENSIONS` and cannot be changed from the environment: a typo
    there would silently disable a working integration with no clear failure
    mode, and built-ins do not pose a supply-chain risk.

    The environment is re-read on every call and the returned sets are fresh
    copies, so callers may mutate them freely.
    """
    community = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
    if not community:
        # No operator override (unset, empty, or only separators): use defaults.
        community = set(_COMMUNITY_EXTENSIONS)
    return {"builtin": set(_BUILTIN_EXTENSIONS), "community": community}
|
|
|
|
|
|
def is_extension_allowed(extension: str) -> bool:
    """Return True if ``extension`` is on the built-in or community allowlist.

    Membership is an exact, case-sensitive match against the effective
    allowlist returned by `get_allowed_extensions`.
    """
    allowlist = get_allowed_extensions()
    return any(extension in allowlist[kind] for kind in ("builtin", "community"))
|
|
|
|
|
|
def is_builtin_extension(extension: str) -> bool:
    """Return True if ``extension`` is in the built-in part of the allowlist."""
    builtin_names = get_allowed_extensions()["builtin"]
    return extension in builtin_names
|
|
|
|
|
|
def get_allowed_token_envs() -> set[str]:
    """Compute the set of env-var names that may be used as an ATTACH token.

    A non-empty AGNES_REMOTE_ATTACH_TOKEN_ENVS *replaces* the default set
    instead of extending it, so an operator can shrink the allowlist as well
    as grow it. `log_effective_policy` logs the resulting set so that a typo
    in the override is visible.

    The environment is re-read on every call; the returned set is a fresh
    object the caller may mutate.
    """
    configured = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")
    if configured:
        return configured
    return set(_DEFAULT_TOKEN_ENVS)
|
|
|
|
|
|
def is_token_env_allowed(token_env: str) -> bool:
    """Return True if ``token_env`` may be read and passed as a TOKEN.

    Two checks are applied in order:

    1. Structural — the value must be a ``str`` that matches `_ENV_NAME_RE`
       in full (an uppercase letter followed by up to 63 uppercase letters,
       digits or underscores). This refuses anything that is not a valid env
       var name regardless of what the allowlist contains.
    2. Membership — the name must be in the effective allowlist returned by
       `get_allowed_token_envs`.

    Never raises for non-string input; such values are simply rejected.
    """
    if not isinstance(token_env, str):
        return False
    # fullmatch(), not match(): a `$` anchor also matches just before a
    # trailing newline, so match() would accept "NAME\n" as structurally valid.
    if _ENV_NAME_RE.fullmatch(token_env) is None:
        return False
    return token_env in get_allowed_token_envs()
|
|
|
|
|
|
def log_effective_policy() -> None:
    """Log the effective extension and token-env allowlists at INFO level.

    Called from app startup to make operator typos visible. Example: if
    AGNES_REMOTE_ATTACH_EXTENSIONS=httpfs is set with the intent to ADD
    httpfs, the override actually REPLACES the default, and the log line
    shows only httpfs — so the operator notices keboola and bigquery are
    missing.

    Emits one log record per call and keeps no state, so calling it more than
    once is harmless (the line is simply logged again).
    """
    # An override counts as "set" only if it yields at least one entry.
    extension_override = _parse_csv_env("AGNES_REMOTE_ATTACH_EXTENSIONS")
    token_env_override = _parse_csv_env("AGNES_REMOTE_ATTACH_TOKEN_ENVS")

    allowed = get_allowed_extensions()
    every_extension = allowed["builtin"] | allowed["community"]
    token_envs = get_allowed_token_envs()

    logger.info(
        "remote_attach policy: extensions=%s (override=%s), token_envs=%s (override=%s). "
        "Note: env-var overrides REPLACE the default — set both yours and the "
        "defaults if you want to add to them.",
        sorted(every_extension),
        bool(extension_override),
        sorted(token_envs),
        bool(token_env_override),
    )
|
|
|
|
|
|
def escape_sql_string_literal(value: str) -> str:
    """Return ``value`` with every single quote doubled.

    The result is meant to be placed between single quotes in a DuckDB
    string literal; the surrounding quotes are not added here. Per the
    original note this mirrors the escaping in `src/db.py:_attach_extracts`,
    so the read-only query path and the orchestrator rebuild path escape
    values the same way.
    """
    # Splitting on the quote and re-joining with a doubled quote is
    # equivalent to replacing each "'" with "''".
    return "''".join(value.split("'"))
|