Replaces the BigQuery wrap-view pattern with a discovery + scoped-fetch toolkit driven by the analyst's Claude session. Adds /api/v2/{catalog,schema,sample,scan,scan/estimate}, da catalog/schema/describe/fetch/snapshot/disk-info CLI commands, sqlglot-backed WHERE validator, process-local quota tracker, agent rails skill (cli/skills/agnes-data-querying.md). BREAKING: BQ wrap views off by default — set data_source.bigquery.legacy_wrap_views=true for one cycle. Backward-compat field_validator on primary_key. Catalog cache now matches documented 300s TTL with RBAC fresh per request. Cuts release v0.14.0.
951 lines
39 KiB
Python
951 lines
39 KiB
Python
"""Admin endpoints — table discovery, registry management, instance configuration.
|
|
|
|
Every gate on this router uses ``require_admin`` from ``app.auth.access``,
|
|
which checks Admin user_group membership for both OAuth session and PAT
|
|
callers via the same ``_user_group_ids`` lookup.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import threading
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel, Field, field_validator
|
|
from typing import Optional, List, Dict, Any
|
|
import duckdb
|
|
|
|
from app.auth.access import require_admin
|
|
from app.auth.dependencies import _get_db
|
|
from src.repositories.table_registry import TableRegistryRepository
|
|
from src.repositories.audit import AuditRepository
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Every route declared on this router is served under /api/admin and tagged "admin".
router = APIRouter(prefix="/api/admin", tags=["admin"])

# Serializes the read-modify-write of state/instance.yaml across the two
# endpoints that mutate the overlay (POST /server-config and POST /configure).
# Without it, two admins saving concurrently would each read the same overlay
# snapshot, merge their disjoint patches, and the second os.replace would silently
# drop the first patch. Single-process FastAPI workers; multi-worker deployments
# would need an OS-level file lock — documented limitation.
_overlay_write_lock = threading.Lock()
|
|
|
|
# SSRF protection: reject private/internal URLs for keboola_url
|
|
import ipaddress as _ipaddress
|
|
import socket as _socket
|
|
from urllib.parse import urlparse as _urlparse
|
|
|
|
|
|
def _validate_url_not_private(url: str, field_name: str = "url") -> None:
    """Raise HTTP 400 if the URL's host resolves to a private/reserved network.

    The hostname is resolved with ``socket.getaddrinfo`` and every returned
    address is classified with ``ipaddress`` instead of being matched against
    a hostname regex, so abbreviated IPv4/IPv6 forms (``fe80::1``, ``::1``,
    etc.) are all handled the same way.

    This is a point-in-time check: the name is resolved when this function
    runs. Whatever connects to the URL afterwards performs its own
    resolution, so this check alone does not pin the address that a later
    connection will actually reach.

    Args:
        url: Candidate URL supplied by the caller.
        field_name: Name of the field being validated; only used to build
            the error message.

    Raises:
        HTTPException: status 400 when the URL cannot be parsed, has no
            hostname, cannot be resolved, or resolves to a private,
            loopback, link-local, reserved, multicast or unspecified
            address (or to an address that cannot be classified at all).
    """
    private_detail = f"Invalid {field_name}: must not point to a private or reserved network"

    try:
        parsed = _urlparse(url)
    except ValueError as exc:
        raise HTTPException(
            status_code=400, detail=f"Invalid {field_name}: not a valid URL"
        ) from exc
    host = parsed.hostname or ""
    if not host:
        raise HTTPException(status_code=400, detail=f"Invalid {field_name}: missing hostname")

    # Reject well-known dangerous hostnames before DNS resolution
    if host.lower() in ("localhost", "localhost.localdomain"):
        raise HTTPException(status_code=400, detail=private_detail)

    # Resolve hostname to IP addresses and check each one
    try:
        addrinfos = _socket.getaddrinfo(host, None, proto=_socket.IPPROTO_TCP)
    except (OSError, ValueError) as exc:
        # OSError covers socket.gaierror; ValueError covers UnicodeError from
        # hostname encoding and embedded-NUL hostnames.
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {field_name}: could not resolve hostname",
        ) from exc

    for _family, _type, _proto, _canonname, sockaddr in addrinfos:
        try:
            ip = _ipaddress.ip_address(sockaddr[0])
        except ValueError as exc:
            # Fail closed: an address we cannot classify must not be waved
            # through. Skipping it would let the URL pass the SSRF gate with
            # that address never having been checked.
            raise HTTPException(status_code=400, detail=private_detail) from exc
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_reserved
            or ip.is_multicast
            or ip.is_unspecified
        ):
            raise HTTPException(status_code=400, detail=private_detail)
|
|
|
|
|
|
def _normalize_primary_key(v):
|
|
"""Coerce a string primary_key to ``[v]`` for backward compatibility.
|
|
|
|
The 0.14.0 contract is ``Optional[List[str]]`` so composite primary keys
|
|
(e.g. session-grain tables keyed on ``(session_id, event_date)``) round-
|
|
trip cleanly. Pre-0.14.0 callers sent a single string; Pydantic v2
|
|
refuses to coerce, so without this validator a CLI script posting
|
|
``"primary_key": "session_id"`` would now hit a 422. Wrap a bare string
|
|
in a one-element list so old and new callers both work.
|
|
"""
|
|
if v is None:
|
|
return v
|
|
if isinstance(v, str):
|
|
return [v]
|
|
return v
|
|
|
|
|
|
# Patches to these section paths must pass _validate_url_not_private. The
# tuple is `(section, *intermediate_keys, leaf_key)` — same SSRF gate the
# /configure wizard applies to keboola_url, so an admin can't sneak
# http://169.254.169.254/ in via the server-config editor's data_source patch.
# To protect another URL field, add its full key path as a new tuple here.
_URL_BEARING_FIELDS: tuple[tuple[str, ...], ...] = (
    ("data_source", "keboola", "stack_url"),
)
|
|
|
|
|
|
def _validate_urls_in_patch(sections: Dict[str, Dict[str, Any]]) -> None:
    """Run the SSRF check on every registered URL field the patch sets.

    For each key path in ``_URL_BEARING_FIELDS`` the patch is descended
    key by key. If the path is fully present and its leaf is a non-empty
    string, that string is handed to ``_validate_url_not_private`` (which
    raises on a rejected URL), with the dotted path as the field name.

    A path that is absent from the patch, or that runs into a non-dict
    node on the way down, is ignored: the patch does not touch that field,
    so there is nothing to validate.
    """
    for field_path in _URL_BEARING_FIELDS:
        root_key = field_path[0]
        leaf_key = field_path[-1]
        if root_key not in sections:
            continue

        cursor: Any = sections[root_key]
        for step in field_path[1:-1]:
            if not isinstance(cursor, dict) or step not in cursor:
                # Path is not present in this patch; nothing to check.
                break
            cursor = cursor[step]
        else:
            if isinstance(cursor, dict):
                candidate = cursor.get(leaf_key)
                if isinstance(candidate, str) and candidate:
                    _validate_url_not_private(candidate, field_name=".".join(field_path))
|
|
|
|
|
|
# --- Server-config (instance.yaml) editor -----------------------------------
#
# The /admin/server-config UI POSTs a partial dict here keyed by section
# (instance, data_source, email, telegram, jira, theme, server, auth) with
# the field values to merge into instance.yaml. Each save:
#   1. Loads the current instance.yaml (writable overlay first, then static).
#   2. Deep-merges the patch on top.
#   3. Writes to DATA_DIR/state/instance.yaml (the writable overlay).
#   4. Writes one audit_log entry tagged `instance_config.update` containing
#      a sanitized diff (secret-looking keys are masked).
# Hot-reload is OUT OF SCOPE for #91 — the response carries
# `restart_required=True` so the UI can show the banner.

# Sections an admin can mutate. Keep the list explicit so a typo'd section
# in the request body is rejected loudly instead of being silently merged
# into the YAML root and confusing future loads.
_EDITABLE_SECTIONS: tuple[str, ...] = (
    "instance",
    "data_source",
    "email",
    "telegram",
    "jira",
    "theme",
    "server",
    "auth",
)

# "Danger-zone" sections — flipping these can lock operators out (auth.*) or
# break OAuth callbacks (server.hostname/host). The UI shows a confirmation
# dialog before submitting them. The API accepts them; this list exists so
# the audit entry can flag the change as high-risk and the UI can surface
# the right warning copy.
_DANGER_SECTIONS: tuple[str, ...] = ("auth", "server")

# Keys whose values must be redacted from the audit diff. We match
# substring (case-insensitive) so `client_secret`, `api_token`,
# `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all
# get masked even when nested.
_SECRET_KEY_PATTERNS: tuple[str, ...] = (
    "secret",
    "token",
    "password",
    "api_key",
)
|
|
|
|
|
|
def _is_secret_key(key: str) -> bool:
    """Report whether a config key names a credential that must be masked.

    The key is lower-cased and tested for containing any of the fragments
    in ``_SECRET_KEY_PATTERNS``.
    """
    lowered = key.lower()
    for fragment in _SECRET_KEY_PATTERNS:
        if fragment in lowered:
            return True
    return False
|
|
|
|
|
|
def _mask(value: Any) -> str:
|
|
"""Replacement value used in the audit diff for secret fields.
|
|
|
|
We deliberately do NOT preserve length or any hint about the secret —
|
|
the diff is read by other admins, and there's no operator value to
|
|
leaking "the new SMTP password is 16 chars". `***` is enough to show
|
|
that the field changed without exposing it.
|
|
"""
|
|
if value in (None, ""):
|
|
return "<empty>"
|
|
return "***"
|
|
|
|
|
|
# Sentinel values produced by `_mask`. Any patch leaf that arrives at a
# secret-keyed slot still bearing one of these strings means the caller
# round-tripped the GET payload (which redacts secret-keyed children inside
# nested objects) without changing the value — `_strip_redacted_sentinels`
# drops the leaf so deep-merge preserves whatever the overlay already had,
# rather than persisting the placeholder on top of the real secret.
# Keep this set in sync with the two strings `_mask` returns.
_REDACTED_SENTINELS: frozenset = frozenset({"***", "<empty>"})
|
|
|
|
|
|
def _strip_redacted_sentinels(value: Any, key_hint: str = "") -> Any:
    """Return a copy of a patch subtree without echoed redaction placeholders.

    Write-side counterpart of ``_redact``. The GET handler replaces secret
    values with a placeholder; if a caller posts that payload straight
    back, the placeholder must not be stored over the real secret. Any
    dict entry whose key is secret-named and whose value is a string found
    in ``_REDACTED_SENTINELS`` is therefore omitted, so the later
    deep-merge leaves the stored value alone.

    Dicts and lists are rebuilt recursively; any other value is returned
    as-is. ``key_hint`` is only threaded through the recursion.
    """
    if isinstance(value, list):
        return [_strip_redacted_sentinels(element, key_hint) for element in value]
    if not isinstance(value, dict):
        return value

    kept: Dict[str, Any] = {}
    for name, child in value.items():
        is_echoed_placeholder = (
            _is_secret_key(name)
            and isinstance(child, str)
            and child in _REDACTED_SENTINELS
        )
        if not is_echoed_placeholder:
            kept[name] = _strip_redacted_sentinels(child, name)
    return kept
|
|
|
|
|
|
def _redact(value: Any, key_hint: str = "") -> Any:
    """Return a copy of a config subtree with credential values masked.

    * dict: each entry under a secret-named key is replaced wholesale by
      ``_mask``; every other entry is redacted recursively, with its own
      key passed down as the hint.
    * list: each element is redacted with the same ``key_hint``.
    * anything else: masked when ``key_hint`` (the key it was stored
      under) is secret-named, otherwise returned unchanged.
    """
    if isinstance(value, list):
        return [_redact(element, key_hint) for element in value]

    if isinstance(value, dict):
        cleaned = {}
        for name, child in value.items():
            if _is_secret_key(name):
                cleaned[name] = _mask(child)
            else:
                cleaned[name] = _redact(child, name)
        return cleaned

    if key_hint and _is_secret_key(key_hint):
        return _mask(value)
    return value
|
|
|
|
|
|
def _diff_dicts(before: dict, after: dict, path: str = "") -> List[Dict[str, Any]]:
    """Return a flat, audit-safe list of the fields that differ between two dicts.

    Output rows look like
    ``{"path": "email.smtp_host", "before": "...", "after": "..."}`` and are
    ordered by sorted key at every level.

    Values are compared RAW and only redacted when a row is emitted.
    Redacting the inputs first would turn a secret rotation (password A →
    password B) into "no diff", because both sides collapse to the same
    placeholder, and the audit log would miss one of the most
    security-relevant changes.

    Redaction rules for emitted rows:

    * Key is secret-named (``_is_secret_key``): the whole value — scalar,
      list or dict — is compared as one unit and reported as a single row
      with both sides passed through ``_mask``. The subtree is never
      descended into, so children with innocuous names under e.g.
      ``client_secrets`` are not written out in cleartext.
    * Any other key: both sides go through ``_redact``. Plain scalars are
      returned unchanged by it, while dicts and lists have their
      secret-keyed descendants masked.

    Recursion: dict-vs-dict recurses; dict-vs-missing/``None`` recurses with
    the absent side treated as ``{}``, so a newly added section is reported
    per field (``email.smtp_host``) instead of as one opaque blob. A
    dict↔scalar shape change is reported as one replacement row at the
    parent path so the scalar side is not lost.

    Args:
        before: Mapping representing the old state.
        after: Mapping representing the new state.
        path: Dotted prefix for the rows of this level; empty at the root.

    Returns:
        One row per changed field; an empty list when nothing differs.
    """
    changes: List[Dict[str, Any]] = []
    for key in sorted(set(before.keys()) | set(after.keys())):
        new_path = f"{path}.{key}" if path else key
        b_val = before.get(key)
        a_val = after.get(key)

        if _is_secret_key(key):
            # Secret-named key: mask the entire value and do not recurse.
            if b_val != a_val:
                changes.append({
                    "path": new_path,
                    "before": _mask(b_val),
                    "after": _mask(a_val),
                })
            continue

        b_is_dict = isinstance(b_val, dict)
        a_is_dict = isinstance(a_val, dict)

        # Dict-vs-dict, or dict-vs-None → recurse for per-field paths.
        if (b_is_dict and (a_is_dict or a_val is None)) or (a_is_dict and b_val is None):
            changes.extend(_diff_dicts(
                b_val if b_is_dict else {},
                a_val if a_is_dict else {},
                new_path,
            ))
            continue

        # Scalar/list change, or a dict↔scalar shape change (always unequal).
        # `_redact` leaves plain scalars under a non-secret key untouched and
        # masks secret-keyed members nested inside dicts or lists.
        if b_val != a_val:
            changes.append({
                "path": new_path,
                "before": _redact(b_val, key),
                "after": _redact(a_val, key),
            })
    return changes
|
|
|
|
|
|
def _deep_merge(base: dict, patch: dict) -> dict:
|
|
"""Merge `patch` into `base` recursively, returning a new dict.
|
|
|
|
Patch values overwrite base values. Dict-into-dict recurses; everything
|
|
else (lists, scalars, None) is replaced wholesale — admin sets
|
|
``email: {smtp_port: 465}`` and we don't try to re-merge nested ports.
|
|
"""
|
|
out = dict(base)
|
|
for key, value in patch.items():
|
|
if isinstance(value, dict) and isinstance(out.get(key), dict):
|
|
out[key] = _deep_merge(out[key], value)
|
|
else:
|
|
out[key] = value
|
|
return out
|
|
|
|
|
|
def _load_current_instance_yaml() -> dict:
    """Return instance.yaml as the editor sees it.

    Thin wrapper over ``app.instance_config.load_instance_config`` so the
    editor reads the configuration through the same helper as the rest of
    the app.

    Writers in this module call ``reset_cache()`` themselves before
    calling this function as part of their read-modify-write sequence;
    this function does not reset anything on its own.
    """
    # Imported at call time, as in the rest of this module's config helpers.
    from app.instance_config import load_instance_config as _load_config

    current = _load_config()
    return current
|
|
|
|
|
|
def _public_view(config: dict) -> dict:
    """Build the redacted copy of ``config`` that is sent to the admin UI.

    The input is deep-copied first and then passed through ``_redact``, so
    the caller's dict is left untouched and the returned structure shows
    *which* secret fields are populated without carrying their cleartext
    into the rendered page.
    """
    from copy import deepcopy

    snapshot = deepcopy(config)
    return _redact(snapshot)
|
|
|
|
|
|
class ServerConfigUpdateRequest(BaseModel):
    """Patch payload for POST /api/admin/server-config.

    Only the sections listed in `_EDITABLE_SECTIONS` are accepted; anything
    else is rejected with 400. `confirm_danger` must be true if the patch
    touches any danger-zone section (auth.*, server.*).
    """
    # Maps a section name to the partial dict to merge into that section.
    # Defaults to an empty dict; the POST handler rejects an empty patch.
    sections: Dict[str, Dict[str, Any]] = Field(
        default_factory=dict,
        description="Per-section patch dict (e.g. {'instance': {'name': 'X'}})",
    )
    # Explicit opt-in checked by the POST handler whenever the patch names a
    # section listed in `_DANGER_SECTIONS`.
    confirm_danger: bool = Field(
        default=False,
        description="Must be true to apply changes touching auth.* or server.*",
    )
|
|
|
|
|
|
@router.get("/server-config")
async def get_server_config(
    user: dict = Depends(require_admin),
):
    """Return the current instance.yaml with secrets redacted.

    Used by the /admin/server-config UI to prefill its form. The redacted
    payload mirrors the actual file shape, so the UI doesn't need to know
    the schema — it iterates over the editable sections and renders the
    fields it finds. Empty sections still show in the response so the form
    knows to render their headers.
    """
    visible = _public_view(_load_current_instance_yaml())

    # Emit an entry for every editable section, falling back to {} for the
    # ones the config does not define, so the form can render each header
    # and the operator can fill a section in from scratch.
    section_map = {}
    for name in _EDITABLE_SECTIONS:
        section_map[name] = visible.get(name, {})

    payload = {
        "sections": section_map,
        "editable_sections": list(_EDITABLE_SECTIONS),
        "danger_sections": list(_DANGER_SECTIONS),
        "secret_key_patterns": list(_SECRET_KEY_PATTERNS),
    }
    return payload
|
|
|
|
|
|
@router.post("/server-config")
async def update_server_config(
    request: ServerConfigUpdateRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Patch instance.yaml from the /admin/server-config editor.

    Accepts a partial patch keyed by section. Validates sections, refuses
    danger-zone edits without explicit confirmation, deep-merges into the
    current overlay, writes the file, and emits one audit entry per save
    with a sanitized diff. Returns ``restart_required=true`` so the UI can
    show the restart banner — hot-reload is a separate issue (see #91 Out
    of scope).
    """
    # Error responses raised here: 422 for an empty patch or a non-object
    # section, 400 for an unknown section / missing confirm_danger / URL
    # rejected by the SSRF gate, 500 when the existing overlay file cannot be
    # parsed or its top-level YAML node is not a mapping.
    import yaml

    if not request.sections:
        raise HTTPException(status_code=422, detail="sections cannot be empty")

    # Reject unknown sections loudly. Without this, a typo like "thmee"
    # would silently land in the YAML root and the operator wouldn't see
    # their colour change apply.
    unknown = sorted(set(request.sections.keys()) - set(_EDITABLE_SECTIONS))
    if unknown:
        raise HTTPException(
            status_code=400,
            detail=f"unknown section(s): {', '.join(unknown)}. "
                   f"Editable: {', '.join(_EDITABLE_SECTIONS)}",
        )

    # Danger-zone gate. The UI shows a confirmation dialog before posting
    # with confirm_danger=true; an API caller (CLI/script) has to pass it
    # explicitly so they can't fat-finger a hostname change.
    danger_touched = sorted(set(request.sections.keys()) & set(_DANGER_SECTIONS))
    if danger_touched and not request.confirm_danger:
        raise HTTPException(
            status_code=400,
            detail=f"section(s) {', '.join(danger_touched)} require confirm_danger=true",
        )

    # SSRF protection — same gate the /configure wizard applies to
    # keboola_url, but here it covers any URL-bearing field reachable via
    # the per-section patch (e.g. data_source.keboola.stack_url).
    _validate_urls_in_patch(request.sections)

    # Defense-in-depth: scrub redaction sentinels (`***` / `<empty>`) out of
    # secret-keyed leaves in the patch before they reach the deep-merge.
    # The client form does the same scrub, but an API caller round-tripping
    # the GET payload could otherwise overwrite real overlay secrets with
    # the placeholder shown in the form.
    scrubbed_sections: Dict[str, Dict[str, Any]] = {
        section: _strip_redacted_sentinels(patch, section)
        for section, patch in request.sections.items()
    }

    # Serialize read-modify-write across concurrent admin saves. Without the
    # lock, two saves would each read the same overlay snapshot, merge their
    # disjoint patches, and the second os.replace would silently drop the
    # first patch. The lock spans the cache-invalidate → load → merge →
    # atomic-write sequence; the audit log sits outside since it operates on
    # local snapshots.
    from app.instance_config import reset_cache
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    config_path = data_dir / "state" / "instance.yaml"
    config_path.parent.mkdir(parents=True, exist_ok=True)

    with _overlay_write_lock:
        # Drop the in-process cache so we read the latest on-disk state,
        # including any update that landed from a concurrent caller before
        # we acquired the lock.
        reset_cache()
        before = _load_current_instance_yaml()

        # Deep merge — section-by-section so we never accidentally delete a
        # sibling section the patch didn't touch. Use the redaction-scrubbed
        # patch so a round-tripped GET payload can't overwrite real secrets
        # with the `***` placeholder.
        after = dict(before)
        for section, patch in scrubbed_sections.items():
            if not isinstance(patch, dict):
                raise HTTPException(
                    status_code=422,
                    detail=f"section '{section}' must be an object, got {type(patch).__name__}",
                )
            if isinstance(after.get(section), dict):
                after[section] = _deep_merge(after[section], patch)
            else:
                after[section] = patch

        # Write only the sections the user actually patched in this request.
        # Two reasons:
        #   1. Persisting the full merged config (or every editable section)
        #      would snapshot non-editable static sections into the overlay,
        #      shadowing later operator updates to those sections in the
        #      static file (`_load_current_instance_yaml` merges static + overlay,
        #      overlay wins per leaf).
        #   2. The merged config has `${ENV_VAR}` placeholders RESOLVED to the
        #      runtime values by config.loader. Writing every editable section
        #      back would persist real cleartext secrets where the static file
        #      had only env-var references — turning `smtp_password:
        #      ${SMTP_PASSWORD}` into `smtp_password: hunter2` in the overlay.
        # By writing only the sections in `request.sections` we keep both the
        # static-evolution and the env-var-placeholder properties intact.
        overlay_payload: Dict[str, Any] = {}
        if config_path.exists():
            try:
                overlay_payload = yaml.safe_load(config_path.read_text()) or {}
            except Exception as e:
                # A corrupt overlay used to be silently replaced — that masked
                # disk corruption / partial writes / hand-edits and dropped
                # every previously-saved section on the next save. Refuse and
                # surface so the operator can investigate.
                logger.exception("server-config: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                           "back up and remove the file, or fix it by hand",
                ) from e
            if not isinstance(overlay_payload, dict):
                # The file parsed, but its root is a list or scalar. Treat it
                # like any other corrupt overlay instead of failing below
                # with an AttributeError on `.get`.
                logger.error(
                    "server-config: refusing to overwrite corrupt overlay at %s (top-level node is %s)",
                    config_path, type(overlay_payload).__name__,
                )
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} "
                           f"(top-level YAML node is {type(overlay_payload).__name__}, expected a mapping); "
                           "back up and remove the file, or fix it by hand",
                )
        for section, patch in scrubbed_sections.items():
            if section not in _EDITABLE_SECTIONS:
                continue
            # Deep-merge the patch into the existing overlay slot (an empty
            # dict when the overlay has nothing usable for this section), so
            # unrelated keys already saved in the overlay are preserved —
            # e.g. patching `email.smtp_host` doesn't blow away the
            # `email.smtp_password: ${SMTP_PASSWORD}` reference.
            existing = overlay_payload.get(section)
            if not isinstance(existing, dict):
                existing = {}
            overlay_payload[section] = _deep_merge(existing, patch)

        # Atomic via tmp + os.replace so two concurrent admin saves can't
        # interleave bytes and produce corrupt YAML (especially harmful since
        # auth.* is editable here — half-written file → operator lockout).
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(yaml.dump(overlay_payload, default_flow_style=False, sort_keys=False))
        os.replace(tmp_path, config_path)
        logger.info("server-config: wrote %d section(s) to %s",
                    len(request.sections), config_path)

        # Invalidate cached instance config so subsequent reads pick up the
        # change. Hot-reload of running modules (auth providers, SMTP client)
        # is out of scope — the restart banner tells the operator to bounce.
        reset_cache()

    # Audit entry — diff is computed on RAW values then `_diff_dicts`
    # redacts each row whose leaf key matches `_is_secret_key`. Pre-
    # masking the inputs would collapse a secret rotation into "no
    # diff" because both sides redact to ``***``, hiding the most
    # security-relevant changes from the audit log. We log even if no
    # fields changed so the operator's intent (touched the page, hit
    # save) is auditable.
    diff = _diff_dicts(before, after)
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="instance_config.update",
        resource="instance.yaml",
        params={
            "sections": sorted(request.sections.keys()),
            "danger_sections": danger_touched,
            "diff": diff,
            "diff_count": len(diff),
        },
    )

    return {
        "status": "ok",
        "restart_required": True,
        "sections_updated": sorted(request.sections.keys()),
        "diff_count": len(diff),
    }
|
|
|
|
|
|
# --- End server-config editor -----------------------------------------------
|
|
|
|
|
|
# Request body for POST /api/admin/register-table. Documented with comments
# rather than a docstring so the generated API schema is unchanged.
class RegisterTableRequest(BaseModel):
    # Display name; the handler derives the registry id from it.
    name: str
    folder: Optional[str] = None
    sync_strategy: str = "full_refresh"
    # Composite primary keys are real (session-grain MSA tables key on
    # `(session_id, event_date)`, browse rows on more). The frontend sends +
    # reads this as a list; backend stores it JSON-serialized in VARCHAR.
    # A bare string is accepted for backward compat — see _normalize_primary_key.
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
    query_mode: str = "local"
    sync_schedule: Optional[str] = None
    profile_after_sync: bool = True

    # Runs before field validation so a legacy single-string primary key is
    # wrapped in a list instead of being rejected.
    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        return _normalize_primary_key(v)
|
|
|
|
|
|
# Request body for PUT /api/admin/registry/{table_id}. Every field is
# optional; the handler only applies fields whose value is not None.
class UpdateTableRequest(BaseModel):
    name: Optional[str] = None
    sync_strategy: Optional[str] = None
    # List form allows composite keys; a bare string is wrapped in a list by
    # the validator below.
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
    query_mode: Optional[str] = None
    sync_schedule: Optional[str] = None
    profile_after_sync: Optional[bool] = None

    # Runs before field validation so a legacy single-string primary key is
    # wrapped in a list instead of being rejected.
    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        return _normalize_primary_key(v)
|
|
|
|
|
|
# Request body for POST /api/admin/configure.
class ConfigureRequest(BaseModel):
    data_source: str  # "keboola" | "bigquery" | "local"
    # Both Keboola fields are required by the handler when data_source is
    # "keboola"; keboola_url additionally goes through the SSRF check.
    keboola_token: Optional[str] = None
    keboola_url: Optional[str] = None
    # Required by the handler when data_source is "bigquery".
    bigquery_project: Optional[str] = None
    bigquery_location: Optional[str] = None
    instance_name: Optional[str] = None
    allowed_domain: Optional[str] = None
|
|
|
|
|
|
@router.get("/discover-tables")
async def discover_tables(
    user: dict = Depends(require_admin),
):
    """Discover all available tables from the configured data source."""
    # Only the "keboola" source implements discovery; any other source type
    # gets an empty result carrying an explanatory "error" field. Any failure
    # is logged with its traceback and reported to the caller as a 500.
    try:
        from app.instance_config import get_data_source_type
        source_type = get_data_source_type()

        if source_type == "keboola":
            from connectors.keboola.client import KeboolaClient
            from app.instance_config import get_value
            url = get_value("data_source", "keboola", "stack_url", default="")
            token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
            # Read the token from the configured env var, falling back to the
            # default variable name when that yields nothing.
            token = os.environ.get(token_env, "") if token_env else ""
            if not token:
                token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}
        else:
            return {"tables": [], "count": 0, "source": source_type, "error": "Discovery not implemented for this source"}
    except Exception as e:
        # Keep a server-side record (with traceback) and preserve the cause
        # on the raised exception; previously the failure only existed in
        # the response body.
        logger.exception("discover-tables: discovery failed")
        raise HTTPException(status_code=500, detail=f"Discovery failed: {e}") from e
|
|
|
|
|
|
@router.get("/registry")
async def list_registry(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Get full table registry."""
    # Returns every registry row together with the row count.
    rows = TableRegistryRepository(conn).list_all()
    return {"tables": rows, "count": len(rows)}
|
|
|
|
|
|
@router.post("/register-table", status_code=201)
async def register_table(
    request: RegisterTableRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Register a new table in the system."""
    # 422 for a blank name, 409 when the derived id is already registered.
    trimmed_name = (request.name or "").strip()
    if not trimmed_name:
        raise HTTPException(status_code=422, detail="Table name cannot be empty")

    # The id is the trimmed name, lower-cased, with spaces turned into
    # underscores. The name itself is stored exactly as submitted.
    table_id = trimmed_name.lower().replace(" ", "_")

    registry = TableRegistryRepository(conn)
    already_there = registry.get(table_id)
    if already_there:
        raise HTTPException(status_code=409, detail=f"Table '{table_id}' already registered")

    record = {
        "id": table_id,
        "name": request.name,
        "folder": request.folder,
        "sync_strategy": request.sync_strategy,
        "primary_key": request.primary_key,
        "description": request.description,
        "registered_by": user.get("email"),
        "source_type": request.source_type,
        "bucket": request.bucket,
        "source_table": request.source_table,
        "query_mode": request.query_mode,
        "sync_schedule": request.sync_schedule,
        "profile_after_sync": request.profile_after_sync,
    }
    registry.register(**record)

    return {"id": table_id, "name": request.name, "status": "registered"}
|
|
|
|
|
|
@router.put("/registry/{table_id}")
async def update_table(
    table_id: str,
    request: UpdateTableRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Update a registered table's configuration."""
    # 404 when the table is not registered. Only request fields that are not
    # None are applied; the response lists the names of those fields.
    repo = TableRegistryRepository(conn)

    # Single lookup: the same row serves as the existence check and as the
    # base for the merge, so it cannot turn into None between the two uses.
    existing = repo.get(table_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Table not found")

    updates = {k: v for k, v in request.model_dump().items() if v is not None}
    if updates:
        merged = {k: v for k, v in existing.items() if k != "registered_at"}
        merged.update(updates)
        merged.pop("id", None)  # avoid duplicate id kwarg
        repo.register(id=table_id, **merged)
    return {"id": table_id, "updated": list(updates.keys())}
|
|
|
|
|
|
@router.delete("/registry/{table_id}", status_code=204)
async def unregister_table(
    table_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Unregister a table from the system."""
    # 404 when the id is unknown; otherwise the row is removed and the
    # handler returns nothing (204 No Content).
    registry = TableRegistryRepository(conn)
    registered = registry.get(table_id)
    if not registered:
        raise HTTPException(status_code=404, detail="Table not found")
    registry.unregister(table_id)
    return None
|
|
|
|
|
|
@router.post("/configure")
async def configure_instance(
    request: ConfigureRequest,
    user: dict = Depends(require_admin),
):
    """Configure data source and instance settings via API.

    Writes config to the instance.yaml overlay under ``DATA_DIR/state`` and
    persists secrets to ``.env_overlay`` in the same directory.
    AI agents and the /setup wizard use this instead of manual file editing.

    Returns:
        dict with ``status``, ``data_source`` and ``connection`` keys.

    Raises:
        HTTPException: 400 for an unknown ``data_source``, for missing
            required fields, or when the Keboola connection test fails;
            500 when the existing overlay file cannot be parsed.
    """
    import yaml

    if request.data_source not in ("keboola", "bigquery", "local"):
        raise HTTPException(status_code=400, detail="data_source must be 'keboola', 'bigquery', or 'local'")

    # Validate credentials if provided
    if request.data_source == "keboola":
        if not request.keboola_token or not request.keboola_url:
            raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source")
        _validate_url_not_private(request.keboola_url, field_name="keboola_url")
        try:
            from connectors.keboola.client import KeboolaClient
            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
            client.test_connection()
        except Exception as e:
            # The underlying error is logged server-side only; the client gets
            # a generic message.
            logger.error("Keboola connection validation failed: %s", e)
            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.") from e

    elif request.data_source == "bigquery":
        if not request.bigquery_project:
            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")

    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
    # NOT to CONFIG_DIR which is mounted read-only in Docker.
    #
    # Narrow-overlay write strategy — must match `/api/admin/server-config`:
    #   1. Read overlay verbatim (do NOT fall back to static). Falling back
    #      would copy env-resolved cleartext secrets from the merged static
    #      file back into the overlay (e.g. `smtp_password: ${SMTP_PASSWORD}`
    #      → `smtp_password: hunter2`). The wizard only ever sets
    #      `instance`, `auth`, `data_source` here, so other sections must
    #      flow from the static file via `load_instance_config`'s deep-merge
    #      — they don't belong in the overlay at all.
    #   2. Patch only the sections this endpoint touches.
    #   3. Write the narrow overlay back atomically (tmp + os.replace).
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    config_path = data_dir / "state" / "instance.yaml"

    # Same serialization + corrupt-overlay handling as POST /server-config.
    with _overlay_write_lock:
        overlay: dict = {}
        if config_path.exists():
            try:
                overlay = yaml.safe_load(config_path.read_text()) or {}
            except Exception as e:
                logger.exception("configure: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                    "back up and remove the file, or fix it by hand",
                ) from e

        # Merge instance settings into the overlay only — never seed from the
        # env-resolved merged config.
        if request.instance_name:
            overlay.setdefault("instance", {})["name"] = request.instance_name

        if request.allowed_domain:
            overlay.setdefault("auth", {})["allowed_domain"] = request.allowed_domain

        # data_source is fully owned by this endpoint — replace wholesale.
        overlay["data_source"] = {"type": request.data_source}
        if request.data_source == "keboola":
            overlay["data_source"]["keboola"] = {
                "stack_url": request.keboola_url,
                "token_env": "KEBOOLA_STORAGE_TOKEN",
            }
        elif request.data_source == "bigquery":
            overlay["data_source"]["bigquery"] = {
                "project": request.bigquery_project,
                "location": request.bigquery_location or "us",
            }

        # Atomic write to writable data volume — same tmp + os.replace pattern
        # as the server-config editor so a concurrent save can't tear the file.
        config_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(yaml.dump(overlay, default_flow_style=False, sort_keys=False))
        os.replace(tmp_path, config_path)
        logger.info("Wrote instance config to %s", config_path)

    # Persist secrets to .env_overlay (in data volume, never in git)
    secrets_to_persist = {}
    if request.keboola_token:
        secrets_to_persist["KEBOOLA_STORAGE_TOKEN"] = request.keboola_token
    if request.keboola_url:
        secrets_to_persist["KEBOOLA_STACK_URL"] = request.keboola_url

    if secrets_to_persist:
        overlay_path = data_dir / "state" / ".env_overlay"
        overlay_path.parent.mkdir(parents=True, exist_ok=True)

        # Merge with existing overlay (KEY=VALUE lines; lines starting with
        # "#" are skipped and therefore not carried over).
        existing_overlay = {}
        if overlay_path.exists():
            for line in overlay_path.read_text().splitlines():
                if "=" in line and not line.startswith("#"):
                    k, v = line.split("=", 1)
                    existing_overlay[k.strip()] = v.strip()
        existing_overlay.update(secrets_to_persist)

        env_payload = "\n".join(f"{k}={v}" for k, v in existing_overlay.items()) + "\n"

        # Write the secrets to an owner-only temp file and swap it in with
        # os.replace. Creating the file with mode 0o600 up front means the
        # token is never on disk with default (umask) permissions, and the
        # atomic swap means a crash mid-write cannot truncate the secrets
        # already stored.
        env_tmp_path = overlay_path.with_name(overlay_path.name + ".tmp")
        fd = os.open(env_tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as env_file:
            try:
                # The creation mode is ignored when a stale temp file already
                # exists, so tighten it explicitly before writing. Best-effort,
                # as before: some filesystems do not support chmod.
                os.chmod(env_tmp_path, 0o600)
            except OSError:
                pass
            env_file.write(env_payload)
        os.replace(env_tmp_path, overlay_path)
        logger.info("Persisted %d secrets to .env_overlay", len(secrets_to_persist))

    # Inject into current process environment
    for k, v in secrets_to_persist.items():
        os.environ[k] = v

    # Invalidate cached instance config so next read picks up changes.
    # Use the public helper (matches `/api/admin/server-config`); reaching
    # into the private global silently breaks if the cache layout changes.
    from app.instance_config import reset_cache
    reset_cache()

    return {
        "status": "ok",
        "data_source": request.data_source,
        # NOTE(review): "verified" is reported for every non-local source, but
        # only the Keboola branch above actually tests a connection.
        "connection": "verified" if request.data_source != "local" else "local",
    }
|
|
|
|
|
|
def _discover_and_register_tables(conn: duckdb.DuckDBPyConnection, user_email: str) -> dict:
    """Discover tables from the configured source and register the new ones.

    Shared logic for API and sync. Only the ``keboola`` source type is
    handled; for any other type a zeroed summary is returned immediately.

    Returns:
        dict with ``registered``, ``skipped`` and ``errors`` counts, the list
        of newly registered ids under ``tables``, and the ``source`` type.
    """
    from app.instance_config import get_data_source_type, get_value

    source_type = get_data_source_type()
    if source_type != "keboola":
        return {"registered": 0, "skipped": 0, "errors": 0, "tables": [], "source": source_type}

    from connectors.keboola.client import KeboolaClient

    # Read from data_source.keboola (matches what /api/admin/configure writes)
    stack_url = get_value("data_source", "keboola", "stack_url", default="")
    token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
    token = os.environ.get(token_env, "") if token_env else ""
    # Fall back to the conventional variable when the configured one is empty.
    token = token or os.environ.get("KEBOOLA_STORAGE_TOKEN", "")

    keboola = KeboolaClient(token=token, url=stack_url)
    registry = TableRegistryRepository(conn)

    registered = skipped = errors = 0
    new_ids = []

    for table in keboola.discover_all_tables():
        raw_id = table.get("id", "")
        table_id = raw_id.strip().lower().replace(".", "_").replace(" ", "_")

        # An entry without a usable id cannot be registered.
        if not table_id:
            errors += 1
            continue

        # Already in the registry: leave the existing entry untouched.
        if registry.get(table_id):
            skipped += 1
            continue

        try:
            # Parse bucket from table ID (format: in.c-bucket.table_name)
            segments = raw_id.split(".")
            bucket = segments[1] if len(segments) > 1 else ""
            source_table = segments[2] if len(segments) > 2 else table.get("name", "")

            registry.register(
                id=table_id,
                name=table.get("name", table_id),
                source_type="keboola",
                bucket=bucket,
                source_table=source_table,
                query_mode="local",
                registered_by=user_email,
                description=f"Auto-discovered from Keboola: {raw_id}",
            )
        except Exception as e:
            # One bad table must not abort the rest of the batch.
            logger.warning("Failed to register %s: %s", table_id, e)
            errors += 1
        else:
            registered += 1
            new_ids.append(table_id)

    return {
        "registered": registered,
        "skipped": skipped,
        "errors": errors,
        "tables": new_ids,
        "source": "keboola",
    }
|
|
|
|
|
|
@router.post("/discover-and-register")
async def discover_and_register(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Discover tables from configured source and auto-register them.

    Combines discover-tables + register-table into one call.
    Skips already-registered tables. Used by /setup wizard and AI agents.

    Returns:
        The summary dict produced by ``_discover_and_register_tables``.

    Raises:
        HTTPException: 500 when discovery or registration raises.
    """
    try:
        return _discover_and_register_tables(conn, user.get("email", "admin"))
    except Exception as e:
        # Record the traceback server-side and keep the cause chained; the
        # response detail is unchanged.
        logger.exception("Discovery and registration failed")
        raise HTTPException(status_code=500, detail=f"Discovery and registration failed: {e}") from e
|