New `instance.support` (`AGNES_INSTANCE_SUPPORT` env override) config field renders operator-authored HTML in a mint-accent callout panel inside the welcome hero on /home, below the Overview footnotes. Designed for a one-line invitation pointing at a chat space, mailing list, or runbook so every user knows where to ask for help. - `get_instance_support()` helper mirrors `get_instance_overview()` (env > yaml > "" resolution, `| safe` filter trust boundary). - Wired into the home template context as `config.INSTANCE_SUPPORT`. - Template renders the callout inside the welcome hero, after the Overview footnotes block — empty yaml hides the block so the OSS stays vendor-neutral. - Registered in `_KNOWN_FIELDS["instance"]` so the field appears in `/admin/server-config` as "Available but unset" even before the operator populates it (discoverability for first-time setup). - 4 new tests cover the gated render path, the hidden-when-unset path, and independence from `instance.overview`. Operators who want to fill the block via terraform write the body to `modules/.../assets/support.html` in their infra repo and include it in the startup.sh yaml heredoc — the OSS template treats this as one more `| safe`-rendered field, no other plumbing needed.
4471 lines
190 KiB
Python
4471 lines
190 KiB
Python
"""Admin endpoints — table discovery, registry management, instance configuration.
|
||
|
||
Every gate on this router uses ``require_admin`` from ``app.auth.access``,
|
||
which checks Admin user_group membership for both OAuth session and PAT
|
||
callers via the same ``_user_group_ids`` lookup.
|
||
"""
|
||
|
||
import logging
|
||
import os
|
||
import threading
|
||
import uuid
|
||
from pathlib import Path
|
||
|
||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
|
||
from pydantic import BaseModel, Field, field_validator, model_validator
|
||
from typing import Optional, List, Dict, Any
|
||
import duckdb
|
||
|
||
from app.auth.access import require_admin
|
||
from app.auth.dependencies import _get_db
|
||
from src.repositories.table_registry import TableRegistryRepository
|
||
from src.repositories.audit import AuditRepository
|
||
from src.identifier_validation import (
|
||
is_safe_identifier as _is_safe_identifier,
|
||
is_safe_quoted_identifier as _is_safe_quoted_identifier,
|
||
)
|
||
from src.sql_safe import is_safe_project_id as _is_safe_project_id
|
||
from src.scheduler import is_valid_schedule
|
||
logger = logging.getLogger(__name__)
|
||
router = APIRouter(prefix="/api/admin", tags=["admin"])
|
||
|
||
# Serializes the read-modify-write of state/instance.yaml across the two
|
||
# endpoints that mutate the overlay (POST /server-config and POST /configure).
|
||
# Without it, two admins saving concurrently would each read the same overlay
|
||
# snapshot, merge their disjoint patches, and the second os.replace would silently
|
||
# drop the first patch. Single-process FastAPI workers; multi-worker deployments
|
||
# would need an OS-level file lock — documented limitation.
|
||
_overlay_write_lock = threading.Lock()
|
||
|
||
# Per-processor advisory locks for /api/admin/run-session-processor.
|
||
# Two trigger paths exist for the same processor (scheduler tick + manual
|
||
# admin POST). Without serialization, overlapping runs would re-process the
|
||
# same /data/user_sessions/* set, double-call the LLM, and pile up duplicate
|
||
# `verification_evidence` rows — the dedup short-circuit in
|
||
# VerificationProcessor only catches the create+contradiction branches, not
|
||
# create_evidence (per ADR Decision 3, which expects evidence to accumulate
|
||
# per distinct verification event). Lock is non-blocking → second caller
|
||
# gets 409 Conflict so the operator sees what happened instead of stacking
|
||
# behind a long-running tick.
|
||
_processor_run_locks: dict[str, threading.Lock] = {}
|
||
_processor_run_locks_mutex = threading.Lock()
|
||
|
||
|
||
def _get_processor_run_lock(name: str) -> threading.Lock:
|
||
"""Per-name lock factory; the registry mutex guards dict insertion so
|
||
two threads simultaneously asking for a never-seen processor don't
|
||
each install their own lock instance."""
|
||
with _processor_run_locks_mutex:
|
||
if name not in _processor_run_locks:
|
||
_processor_run_locks[name] = threading.Lock()
|
||
return _processor_run_locks[name]
|
||
|
||
|
||
# SSRF protection: reject private/internal URLs for keboola_url
|
||
import ipaddress as _ipaddress
|
||
import socket as _socket
|
||
from urllib.parse import urlparse as _urlparse
|
||
|
||
|
||
def _validate_url_not_private(url: str, field_name: str = "url") -> None:
|
||
"""Raise 400 if the URL host points to a private/reserved network.
|
||
|
||
Uses DNS resolution + ipaddress checks instead of hostname regex,
|
||
which correctly handles all IPv4/IPv6 addresses including abbreviated
|
||
forms (fe80::1, ::1, etc.) and DNS rebinding (resolves at check time).
|
||
"""
|
||
try:
|
||
parsed = _urlparse(url)
|
||
except Exception:
|
||
raise HTTPException(status_code=400, detail=f"Invalid {field_name}: not a valid URL")
|
||
host = parsed.hostname or ""
|
||
if not host:
|
||
raise HTTPException(status_code=400, detail=f"Invalid {field_name}: missing hostname")
|
||
|
||
# Reject well-known dangerous hostnames before DNS resolution
|
||
if host.lower() in ("localhost", "localhost.localdomain"):
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail=f"Invalid {field_name}: must not point to a private or reserved network",
|
||
)
|
||
|
||
# Resolve hostname to IP addresses and check each one
|
||
try:
|
||
addrinfos = _socket.getaddrinfo(host, None, proto=_socket.IPPROTO_TCP)
|
||
except Exception:
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail=f"Invalid {field_name}: could not resolve hostname",
|
||
)
|
||
|
||
for family, _type, _proto, _canonname, sockaddr in addrinfos:
|
||
ip_str = sockaddr[0]
|
||
try:
|
||
ip = _ipaddress.ip_address(ip_str)
|
||
except ValueError:
|
||
continue
|
||
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail=f"Invalid {field_name}: must not point to a private or reserved network",
|
||
)
|
||
|
||
|
||
def _unescape_shell_quoting(s: str | None) -> str | None:
|
||
"""Defensive normalization for descriptions arriving via shell-quoting tooling.
|
||
|
||
Some operators register tables with bash/curl invocations whose quoting
|
||
injects literal backslash escapes into the payload (e.g. ``Don\\'t`` or
|
||
embedded ``\\n`` instead of real newlines). The backend would otherwise
|
||
persist those bytes verbatim and the UI would render them verbatim too.
|
||
Mirrored in JS as ``unescapeShellQuoting`` in
|
||
``app/web/templates/admin_tables.html`` for already-stored rows.
|
||
"""
|
||
if not s:
|
||
return s
|
||
# Order matters: protect real backslashes first.
|
||
SENTINEL = "\x00"
|
||
return (
|
||
s.replace("\\\\", SENTINEL)
|
||
.replace("\\n", "\n")
|
||
.replace("\\r", "\r")
|
||
.replace("\\t", "\t")
|
||
.replace("\\'", "'")
|
||
.replace('\\"', '"')
|
||
.replace(SENTINEL, "\\")
|
||
)
|
||
|
||
|
||
def _normalize_primary_key(v):
|
||
"""Coerce a string primary_key to ``[v]`` for backward compatibility.
|
||
|
||
The 0.14.0 contract is ``Optional[List[str]]`` so composite primary keys
|
||
(e.g. session-grain tables keyed on ``(session_id, event_date)``) round-
|
||
trip cleanly. Pre-0.14.0 callers sent a single string; Pydantic v2
|
||
refuses to coerce, so without this validator a CLI script posting
|
||
``"primary_key": "session_id"`` would now hit a 422. Wrap a bare string
|
||
in a one-element list so old and new callers both work.
|
||
"""
|
||
if v is None:
|
||
return v
|
||
if isinstance(v, str):
|
||
return [v]
|
||
return v
|
||
|
||
|
||
# Patches to these section paths must pass _validate_url_not_private. The
|
||
# tuple is `(section, *intermediate_keys, leaf_key)` — same SSRF gate the
|
||
# /configure wizard applies to keboola_url, so an admin can't sneak
|
||
# http://169.254.169.254/ in via the server-config editor's data_source patch.
|
||
#
|
||
# Intentionally NOT included: ``("ai", "base_url")``. The openai_compat
|
||
# provider legitimately points at internal services (LiteLLM proxy on a
|
||
# private network, on-cluster vLLM endpoint, etc.) — see
|
||
# config/instance.yaml.example "LiteLLM proxy" example. SSRF blocking
|
||
# would break those valid setups. Operators with stricter posture should
|
||
# enforce the constraint upstream (firewall / egress proxy allowlist).
|
||
# Devin ANALYSIS_0001 on PR #141 5f649a4 review.
|
||
_URL_BEARING_FIELDS: tuple[tuple[str, ...], ...] = (
|
||
("data_source", "keboola", "stack_url"),
|
||
("marketplace", "curators_url"),
|
||
)
|
||
|
||
|
||
def _validate_urls_in_patch(sections: Dict[str, Dict[str, Any]]) -> None:
|
||
"""Apply SSRF protection to every URL-bearing field present in the patch.
|
||
|
||
Walks each registered ``(section, *path, leaf)`` against the incoming
|
||
patch and runs ``_validate_url_not_private`` on any string value found.
|
||
Missing intermediate keys / non-dict nodes are silently skipped — the
|
||
patch hasn't touched that field, no validation needed.
|
||
"""
|
||
for path in _URL_BEARING_FIELDS:
|
||
section = path[0]
|
||
if section not in sections:
|
||
continue
|
||
node: Any = sections[section]
|
||
for key in path[1:-1]:
|
||
if not isinstance(node, dict) or key not in node:
|
||
node = None
|
||
break
|
||
node = node[key]
|
||
if isinstance(node, dict):
|
||
value = node.get(path[-1])
|
||
if isinstance(value, str) and value:
|
||
_validate_url_not_private(value, field_name=".".join(path))
|
||
|
||
|
||
_LOCK_TTL_MIN = 60
|
||
_LOCK_TTL_MAX = 7 * 24 * 3600 # 604800 — one week
|
||
|
||
|
||
def _validate_materialize_section(sections: Dict[str, Dict[str, Any]]) -> None:
|
||
"""Validate the materialize section patch when present.
|
||
|
||
Checks field-level constraints that the Pydantic envelope can't enforce
|
||
(it only validates the outer shape, not nested leaf values).
|
||
"""
|
||
mat = sections.get("materialize")
|
||
if not isinstance(mat, dict):
|
||
return
|
||
ttl = mat.get("lock_ttl_seconds")
|
||
if ttl is None:
|
||
return
|
||
if not isinstance(ttl, int) or isinstance(ttl, bool):
|
||
raise HTTPException(
|
||
status_code=422,
|
||
detail="materialize.lock_ttl_seconds must be an integer",
|
||
)
|
||
if ttl < _LOCK_TTL_MIN or ttl > _LOCK_TTL_MAX:
|
||
raise HTTPException(
|
||
status_code=422,
|
||
detail=(
|
||
f"materialize.lock_ttl_seconds must be between "
|
||
f"{_LOCK_TTL_MIN} and {_LOCK_TTL_MAX} "
|
||
f"(got {ttl})"
|
||
),
|
||
)
|
||
|
||
|
||
# --- Server-config (instance.yaml) editor -----------------------------------
|
||
#
|
||
# The /admin/server-config UI POSTs a partial dict here keyed by section
|
||
# (any name listed in `_EDITABLE_SECTIONS` below — e.g. instance, data_source, auth) with
|
||
# the field values to merge into instance.yaml. Each save:
|
||
# 1. Loads the current instance.yaml (writable overlay first, then static).
|
||
# 2. Deep-merges the patch on top.
|
||
# 3. Writes to DATA_DIR/state/instance.yaml (the writable overlay).
|
||
# 4. Writes one audit_log entry tagged `instance_config.update` containing
|
||
# a sanitized diff (secret-looking keys are masked).
|
||
# Hot-reload is OUT OF SCOPE for #91 — the response carries
|
||
# `restart_required=True` so the UI can show the banner.
|
||
|
||
# Sections an admin can mutate. Keep the list explicit so a typo'd section
|
||
# in the request body is rejected loudly instead of being silently merged
|
||
# into the YAML root and confusing future loads.
|
||
_EDITABLE_SECTIONS: tuple[str, ...] = (
|
||
"instance",
|
||
"data_source",
|
||
"email",
|
||
"telegram",
|
||
"jira",
|
||
"theme",
|
||
"server",
|
||
"auth",
|
||
"ai",
|
||
"openmetadata",
|
||
"desktop",
|
||
"corporate_memory",
|
||
"materialize",
|
||
"guardrails",
|
||
"marketplace",
|
||
)
|
||
|
||
# "Danger-zone" sections — flipping these can lock operators out (auth.*) or
|
||
# break OAuth callbacks (server.hostname/host). The UI shows a confirmation
|
||
# dialog before submitting them. The API accepts them; this list exists so
|
||
# the audit entry can flag the change as high-risk and the UI can surface
|
||
# the right warning copy.
|
||
_DANGER_SECTIONS: tuple[str, ...] = ("auth", "server")
|
||
|
||
# Known-but-optional config fields per section. The /admin/server-config UI
|
||
# uses this registry alongside the YAML payload to render fields the operator
|
||
# might want to set even though they're not currently in instance.yaml.
|
||
#
|
||
# Schema per field:
#   {
#     "kind": "string" | "secret" | "bool" | "int" | "float" | "select" | "object" | "array" | "map",
#     "default": <type-appropriate default>   (optional)
#     "hint": "<one-line operator-facing help>"
#     "options": [...]                        (only for kind="select")
#     "fields": {<name>: <fieldspec>}         (only for kind="object", nested fields)
#     "item_kind": "string" | ...             (only for kind="array", element type)
#     "key_kind" / "value_kind"               (only for kind="map", key and value types)
#     "value_item_kind" / "value_fields"      (only for kind="map" whose values are arrays / objects)
#     "placeholder_from": [<path>, ...]       (optional; config path whose value is shown as placeholder)
#     "required": bool                        (defaults False; UI marks the label)
#   }
|
||
#
|
||
# Subagents 2-4 will populate the bodies. The registry enables the UI to
|
||
# render missing-but-known fields with placeholders + hints rather than
|
||
# forcing the operator to discover them via the JSON-patch textarea or
|
||
# hitting a runtime error first. The smoke fixture below
|
||
# (data_source.bigquery.billing_project) proves the renderer wiring works
|
||
# end-to-end so subagents 2-4 only have to add registry entries — they
|
||
# don't need to touch admin_server_config.html.
|
||
# Registry layout: top-level keys are instance.yaml section names; each maps
# a field name to its fieldspec dict. Sections with no commonly-missing
# optional fields are kept as empty dicts.
_KNOWN_FIELDS: dict[str, dict[str, dict]] = {
    "instance": {
        # UI theme — flips `<html data-theme="...">` so the
        # design-system tokens (`--ds-*`) switch palettes via CSS
        # without any markup change. Resolved by
        # `app/instance_config.py::get_instance_theme()`.
        "theme": {
            "kind": "select",
            "options": ["blue", "navy"],
            "default": "blue",
            "hint": (
                "Page-hero colour scheme. `blue` (default) uses the "
                "brand-blue hero + blue CTAs. `navy` opts into the "
                "darker palette with the dark navy hero gradient + "
                "mint-green CTAs and eyebrow accents."
            ),
        },
        # Operator-injected HTML/JS blocks rendered into base.html.
        # `kind: array` renders as a JSON textarea in the admin UI
        # (per admin_server_config.html:702-708 — arrays fall back to
        # the JSON path); the hint documents the per-item shape so the
        # operator knows what to paste. Resolved by
        # `app/instance_config.py::get_custom_scripts()`.
        "custom_scripts": {
            "kind": "array",
            "hint": (
                "Operator-injected HTML/JS blocks rendered into base.html. "
                "Each entry: {name: str, enabled: bool, placement: "
                "head_start|head_end|body_end, html: str}. Used for feedback "
                "widgets (Marker.io), analytics (GTM, PostHog), error capture "
                "(Sentry). Rendered with | safe — admin trust boundary. Review "
                "third-party widget privacy posture before enabling (most "
                "capture session data). Restart required after save."
            ),
        },
        # Operator-authored Support HTML rendered inside the welcome
        # hero on /home, below the operator-owned Overview footnotes.
        # Resolved by `app/instance_config.py::get_instance_support()`.
        # Typical content: a one-line invitation pointing at a chat
        # space, mailing list, or internal runbook. Empty value =
        # block hidden (OSS stays vendor-neutral).
        "support": {
            "kind": "string",
            "hint": (
                "HTML body rendered inside the welcome hero's Support "
                "block on /home (mint-accent panel below the Overview "
                "footnotes). Typically a one-line invitation linking to "
                "a chat space, mailing list, or runbook — e.g. "
                "'<p><strong>Need help?</strong> Drop into our "
                "<a href=\"https://chat.example.com/room/xxx\">Support</a> "
                "chat space.</p>'. Rendered with | safe — admin trust "
                "boundary (link target is operator-controlled). Empty "
                "value hides the block."
            ),
        },
    },
    "data_source": {
        "bigquery": {
            "kind": "object",
            "hint": "BigQuery connection knobs (read more in docs/DEPLOYMENT.md)",
            "fields": {
                "billing_project": {
                    "kind": "string",
                    "hint": (
                        "GCP project to bill BQ jobs against. Set when SA can read "
                        "the data project but cannot bill there (e.g. shared read-only "
                        "data project). Defaults to data_source.bigquery.project. "
                        "Mismatch → 403 USER_PROJECT_DENIED on every BQ call."
                    ),
                    # Issue #160 §4.7.5: when this field is empty in the
                    # admin form, the JS template shows "(defaults to <project>)"
                    # as placeholder text — surfacing the access.py:339-340
                    # fallback rule directly in the UI without the operator
                    # having to read source. Path is walked against the
                    # `original` config payload from GET /api/admin/server-config.
                    "placeholder_from": ["data_source", "bigquery", "project"],
                },
                "max_bytes_per_materialize": {
                    "kind": "int",
                    "default": 10737418240,
                    "hint": (
                        "Cost guardrail for query_mode='materialized' BQ scans (dry-run "
                        "check before running). Bytes processed; exceeds → registration "
                        "or sync rejected. 0 disables the gate. Default 10737418240 = 10 GiB."
                    ),
                },
                "bq_max_scan_bytes": {
                    "kind": "int",
                    "default": 5368709120,
                    "hint": (
                        "Cost guardrail for `agnes query --remote` against query_mode='remote' "
                        "BQ rows (dry-run check on the underlying SELECT before execute). "
                        "Bytes processed; exceeds → 400 remote_scan_too_large with a "
                        "`agnes snapshot create` suggestion. 0 disables the gate. Default 5368709120 = 5 GiB."
                    ),
                },
                "query_timeout_ms": {
                    "kind": "int",
                    "default": 600000,
                    "hint": (
                        "DuckDB BigQuery extension query timeout (milliseconds). Applied "
                        "via `SET bq_query_timeout_ms` after every `LOAD bigquery` on "
                        "every BQ-touching DuckDB session (orchestrator remote-view "
                        "ATTACH, BqAccess factory, standalone extractor). Extension "
                        "default is 90 000 ms = 90 s, which is too tight for analyst "
                        "queries against view-backed datasets — bumped to 600 000 ms = "
                        "10 min by default. Set 0 to fall through to the extension "
                        "default. Note: the underlying BQ jobs.query RPC caps the wait "
                        "at ~200 s per call; the extension polls on top, so the "
                        "effective ceiling is this value but each poll round-trip is "
                        "~200 s. DuckDB itself emits a warning when this is set above "
                        "~200 s — that warning is informational, not an error."
                    ),
                },
            },
        },
        "keboola": {
            "kind": "object",
            "hint": "Keboola Storage API connection",
            "fields": {
                # Path ("data_source", "keboola", "stack_url") is registered in
                # _URL_BEARING_FIELDS, so patches to it go through
                # _validate_url_not_private.
                # NOTE(review): the hint says "allowlist", but that validator
                # rejects private/reserved addresses (a denylist) — confirm
                # the intended wording.
                "stack_url": {
                    "kind": "string",
                    "hint": (
                        "e.g. https://connection.keboola.com (instance-specific stack URL). "
                        "Validated against private-IP allowlist on save (SSRF guard)."
                    ),
                },
                "project_id": {
                    "kind": "string",
                    "hint": "Keboola project ID (numeric, but kept as string in YAML).",
                },
            },
        },
    },
    "email": {
        # SMTP fields render via the populated path (always set when email
        # is enabled); no commonly-missing optional knobs at this layer.
    },
    "telegram": {
        # Rarely missing; leave empty.
    },
    "jira": {
        # Webhook + REST credentials always present when Jira is configured.
    },
    "theme": {
        # Cosmetic only; rarely missing.
    },
    "server": {
        # TLS / hostname knobs are mostly env-side; nothing to surface here.
    },
    "auth": {
        "allowed_domain": {
            "kind": "string",
            "hint": (
                "Comma-separated list of allowed sign-in email domains (e.g. "
                "'acme.com,acme-internal.com'). Single domain works too. Empty → no "
                "domain restriction (any verified Google identity can sign in)."
            ),
        },
    },
    "ai": {
        "base_url": {
            "kind": "string",
            "hint": (
                "Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM, etc.). "
                "Ignored when provider='anthropic'. Examples: https://litellm.example.com, "
                "https://openrouter.ai/api/v1."
            ),
        },
        "structured_output": {
            "kind": "select",
            "options": ["strict", "json", "auto"],
            "default": "auto",
            "hint": (
                "JSON-schema enforcement strategy. strict=Layer 1 only "
                "(Anthropic/OpenAI native, fail otherwise). json=Layer 1 + Layer 2 "
                "fallback. auto=all three layers including prompt-based JSON (most "
                "compatible, least strict)."
            ),
        },
    },
    "openmetadata": {
        "url": {
            "kind": "string",
            "hint": "Base URL of your OpenMetadata server (e.g. https://catalog.example.com).",
        },
        "token": {
            "kind": "secret",
            "hint": (
                "JWT bearer token. Use ${OPENMETADATA_TOKEN} env-var reference "
                "(don't paste secret directly)."
            ),
        },
        "cache_ttl_seconds": {
            "kind": "int",
            "default": 3600,
            "hint": "How long to cache catalog responses in-process. Default 3600s (1h).",
        },
        "verify_ssl": {
            "kind": "bool",
            "default": True,
            "hint": (
                "TLS verification. Default true. Set false ONLY for internal CAs / "
                "self-signed certs — sends the JWT over an unverified channel."
            ),
        },
    },
    "desktop": {
        "jwt_issuer": {
            "kind": "string",
            "default": "data-analyst",
            "hint": "JWT iss claim. Match what the desktop app verifies.",
        },
        "jwt_secret": {
            "kind": "secret",
            "hint": "JWT signing secret. Use ${DESKTOP_JWT_SECRET} env-var reference.",
        },
        "url_scheme": {
            "kind": "string",
            "default": "data-analyst",
            "hint": "Custom URL scheme registered by the desktop app (data-analyst://...).",
        },
    },
    # corporate_memory governance — optional. When the section is missing
    # from instance.yaml the system runs in legacy democratic-wiki mode
    # (no admin review). Schema mirrors config/instance.yaml.example
    # lines 224-317; renderer handles arbitrary depth + arrays + maps.
    "corporate_memory": {
        "distribution_mode": {
            "kind": "select",
            "options": ["mandatory_only", "admin_curated", "hybrid"],
            "default": "hybrid",
            "hint": (
                "How knowledge reaches users. mandatory_only = admin-only; "
                "admin_curated = admin + user voting as feedback; "
                "hybrid = default (mandatory from admin + optional from user voting)."
            ),
        },
        "approval_mode": {
            "kind": "select",
            "options": ["review_queue", "auto_publish", "threshold"],
            "default": "review_queue",
            "hint": (
                "How AI-extracted items enter the system. review_queue = admin "
                "approval required (default); auto_publish = live immediately; "
                "threshold = high-confidence auto, low-confidence to queue."
            ),
        },
        "review_period_months": {
            "kind": "int",
            "default": 6,
            "hint": "How often approved/mandatory items are flagged for re-review (months).",
        },
        "notify_on_new_items": {
            "kind": "bool",
            "default": True,
            "hint": "Notify km_admins when new pending items arrive.",
        },
        "sources": {
            "kind": "object",
            "hint": (
                "Knowledge-source ingestion. Each source has its own enabled "
                "flag + base confidence."
            ),
            "fields": {
                "claude_local_md": {
                    "kind": "object",
                    "fields": {
                        "enabled": {"kind": "bool", "default": True},
                        # "float" kind: fractional value; the hint states the
                        # expected 0-1 range.
                        "confidence_base": {
                            "kind": "float",
                            "default": 0.50,
                            "hint": "Confidence assigned to extractions from CLAUDE.local.md (0-1).",
                        },
                    },
                },
                "session_transcripts": {
                    "kind": "object",
                    "fields": {
                        "enabled": {"kind": "bool", "default": True},
                        "confidence_base": {"kind": "float", "default": 0.60},
                        "max_turns_per_session": {
                            "kind": "int",
                            "default": 100,
                            "hint": "Truncate transcripts longer than this many turns.",
                        },
                        "detection_types": {
                            "kind": "array",
                            "item_kind": "string",
                            "default": [
                                "correction",
                                "confirmation",
                                "unprompted_definition",
                            ],
                            "hint": (
                                "Which extraction patterns to detect. Each entry "
                                "is a detection-type tag."
                            ),
                        },
                    },
                },
            },
        },
        "extraction": {
            "kind": "object",
            "fields": {
                "model": {
                    "kind": "string",
                    "default": "claude-haiku-4-5-20251001",
                    "hint": "LLM used to extract knowledge. Override for cost or quality.",
                },
                "sensitivity_check": {"kind": "bool", "default": True},
                "contradiction_check": {"kind": "bool", "default": True},
            },
        },
        "confidence": {
            "kind": "object",
            "hint": "Confidence scoring + decay rules.",
            "fields": {
                # "map" kind: operator-chosen keys (typed by key_kind) mapped
                # to values typed by value_kind. Keys may contain dots that
                # are part of the key itself (see the hint).
                "base": {
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "float",
                    "default": {
                        "user_verification.correction": 0.90,
                        "user_verification.unprompted_definition": 0.90,
                        "user_verification.confirmation": 0.60,
                        "admin_mandate": 1.00,
                        "claude_local_md": 0.50,
                        "session_transcript": 0.50,
                    },
                    "hint": (
                        "Base score per source/detection. Keys are 'source_type' "
                        "or 'source_type.detection_type' (the dot is data, not "
                        "nesting)."
                    ),
                },
                "modifiers": {
                    # map<string, map<string, float>>. The renderer's structured
                    # editor for "map of objects with declared subfields" is a
                    # TODO (see admin_server_config.html); for now this falls
                    # back to a JSON textarea — admins editing it see the
                    # schema doc inline via the hint.
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "object",
                    "value_fields": {},  # signals the JSON-textarea fallback
                    "hint": (
                        "Per-key modifier step sizes applied to base when "
                        "optional signals are present (3-level dotted paths). "
                        "Edit as a JSON object — outer keys mirror confidence.base "
                        "keys; inner objects map signal name to bonus float."
                    ),
                },
                "decay": {
                    "kind": "object",
                    "fields": {
                        "mode": {
                            "kind": "select",
                            "options": ["linear", "exponential"],
                            "default": "exponential",
                        },
                        "half_life_months": {
                            "kind": "int",
                            "default": 12,
                            "hint": "Used when mode=exponential.",
                        },
                        "decay_rate_monthly": {
                            "kind": "float",
                            "default": 0.02,
                            "hint": "Used when mode=linear.",
                        },
                        "floor": {
                            "kind": "map",
                            "key_kind": "string",
                            "value_kind": "float",
                            "default": {
                                "admin_mandate": 0.50,
                                "user_verification": 0.40,
                                "default": 0.0,
                            },
                            "hint": (
                                "Per-source minimum confidence — items never decay "
                                "below this floor."
                            ),
                        },
                    },
                },
            },
        },
        "contradiction_detection": {
            "kind": "object",
            "fields": {
                "enabled": {"kind": "bool", "default": True},
                "max_candidates": {
                    "kind": "int",
                    "default": 10,
                    "hint": "Max contradiction candidates to evaluate per new item.",
                },
            },
        },
        "entity_resolution": {
            "kind": "object",
            "fields": {
                "enabled": {"kind": "bool", "default": True},
                # Map whose values are arrays: value_item_kind gives the
                # element type of each value list.
                "entities": {
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "array",
                    "value_item_kind": "string",
                    "default": {
                        "metrics": ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"],
                        "products": ["Platform", "API", "Dashboard"],
                    },
                    "hint": (
                        "Domain-entity vocabulary. Key = domain category; value = "
                        "canonical names list."
                    ),
                },
            },
        },
        "domain_owners": {
            "kind": "map",
            "key_kind": "string",
            "value_kind": "array",
            "value_item_kind": "string",
            "hint": (
                "Per-domain admin emails. Key = domain name; value = email list."
            ),
        },
        "domains": {
            "kind": "array",
            "item_kind": "string",
            "default": [
                "finance",
                "engineering",
                "product",
                "data",
                "operations",
                "infrastructure",
            ],
            "hint": (
                "Knowledge domains analysts can target. Each must match a key "
                "in domain_owners."
            ),
        },
    },
    # materialize — file-lock TTL for the concurrent-materialize safety net.
    # A single field; more knobs may follow as the feature matures.
    "materialize": {
        # The min/max quoted in the hint match _LOCK_TTL_MIN / _LOCK_TTL_MAX,
        # which _validate_materialize_section enforces on save.
        "lock_ttl_seconds": {
            "kind": "int",
            "default": 86400,
            "hint": (
                "How long (seconds) before a stale materialize lock file is "
                "reclaimed. The lock is a .parquet.lock sibling file; if the "
                "holder process is hard-killed, the next attempt reclaims the "
                "lock once the file's mtime is older than this TTL. "
                "Default 86400 (24 h). Min 60, max 604800 (7 days). "
                "Lower only if you know materializes never exceed the new value "
                "and your host regularly hard-kills processes."
            ),
        },
    },
    "guardrails": {
        "min_description_chars": {
            "kind": "int",
            "default": 60,
            "hint": (
                "Minimum character floor for skill / agent / plugin "
                "descriptions on flea-market uploads (the inline content "
                "guardrail). Real-world Claude skill descriptions cluster "
                "150–220 chars; the default 60 is the bottom of the bar "
                "to catch placeholders. Bump to 100+ to push submitters "
                "closer to the ecosystem norm. Min 1."
            ),
        },
        "min_command_description_chars": {
            "kind": "int",
            "default": 25,
            "hint": (
                "Minimum character floor for slash-command descriptions. "
                "Tighter than skills because commands are one-verb "
                "actions (\"run tests\", \"format code\"). Default 25. Min 1."
            ),
        },
        "min_distinct_words": {
            "kind": "int",
            "default": 5,
            "hint": (
                "Minimum number of DISTINCT words in any description "
                "string. Defends against padding-only descriptions like "
                "\"description description description\" that hit the "
                "character count but say nothing. Default 5. Min 1."
            ),
        },
        "min_body_chars": {
            "kind": "int",
            "default": 200,
            "hint": (
                "Minimum body-content floor for skill / agent files "
                "(the markdown after the YAML frontmatter). Real skill "
                "bodies run 500–2000 chars; the default 200 is a "
                "\"one paragraph\" floor that catches stubs. Min 1."
            ),
        },
        "enabled": {
            "kind": "bool",
            "default": True,
            "hint": (
                "Master kill-switch for the LLM guardrail tier. When "
                "False (or when ANTHROPIC_API_KEY / LLM_API_KEY is "
                "absent), uploads still run the inline mechanical "
                "checks but skip the LLM security + content-quality "
                "review and auto-approve. Default True."
            ),
        },
        "review_model": {
            "kind": "select",
            "default": "haiku",
            "options": ["haiku", "sonnet", "opus"],
            "hint": (
                "Anthropic model tier for the LLM security + content "
                "review. Haiku is the cheapest and fastest; Sonnet / "
                "Opus catch subtler prompt-injection + vague descriptions "
                "at proportionally higher per-upload cost."
            ),
        },
        "blocked_quota_per_day": {
            "kind": "int",
            "default": 50,
            "hint": (
                "Per-submitter cap on `blocked_llm` + `review_error` "
                "rows in the trailing 24h. Bounds the worst case where "
                "a bot loops on bundles that survive inline checks but "
                "trip the async LLM reviewer. Inline failures are "
                "hard-rejected upstream (no row, not counted). 0 "
                "disables the quota. Default 50."
            ),
        },
        "blocked_bundle_ttl_days": {
            "kind": "int",
            "default": 30,
            "hint": (
                "How many days to keep a blocked bundle's bytes on disk. "
                "The submission row + sha256 + size always survive; only "
                "the bytes get removed. 0 disables the purge entirely. "
                "Default 30."
            ),
        },
        "stuck_review_grace_seconds": {
            "kind": "int",
            "default": 1800,
            "hint": (
                "How long a submission may stay at `status='pending_llm'` "
                "before the reaper flips it to `review_error`. Default "
                "1800 (30 min) comfortably exceeds Sonnet / Opus p99 "
                "wall time. 0 disables the reaper."
            ),
        },
    },
    "marketplace": {
        # Path ("marketplace", "curators_url") is registered in
        # _URL_BEARING_FIELDS, so it gets the same SSRF check on save.
        "curators_url": {
            "kind": "string",
            "hint": (
                "URL the 'See all curators →' link on /marketplace points to "
                "(e.g. an internal wiki page listing curators accountable for "
                "the curated marketplace). Empty → the link is hidden. "
                "Validated against private-IP allowlist on save (SSRF guard)."
            ),
        },
    },
}
|
||
|
||
# Keys whose values must be redacted from the audit diff. We match
|
||
# substring (case-insensitive) so `client_secret`, `api_token`,
|
||
# `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all
|
||
# get masked even when nested.
|
||
_SECRET_KEY_PATTERNS: tuple[str, ...] = (
|
||
"secret",
|
||
"token",
|
||
"password",
|
||
"api_key",
|
||
)
|
||
|
||
|
||
def _is_secret_key(key: str) -> bool:
|
||
"""True if a config key holds a credential and should be masked in audit logs."""
|
||
k = key.lower()
|
||
return any(pat in k for pat in _SECRET_KEY_PATTERNS)
|
||
|
||
|
||
def _mask(value: Any) -> str:
|
||
"""Replacement value used in the audit diff for secret fields.
|
||
|
||
We deliberately do NOT preserve length or any hint about the secret —
|
||
the diff is read by other admins, and there's no operator value to
|
||
leaking "the new SMTP password is 16 chars". `***` is enough to show
|
||
that the field changed without exposing it.
|
||
"""
|
||
if value in (None, ""):
|
||
return "<empty>"
|
||
return "***"
|
||
|
||
|
||
# Sentinel values produced by `_mask`. Any patch leaf that arrives at a
|
||
# secret-keyed slot still bearing one of these strings means the caller
|
||
# round-tripped the GET payload (which redacts secret-keyed children inside
|
||
# nested objects) without changing the value — `_strip_redacted_sentinels`
|
||
# drops the leaf so deep-merge preserves whatever the overlay already had,
|
||
# rather than persisting the placeholder on top of the real secret.
|
||
_REDACTED_SENTINELS: frozenset[str] = frozenset({"***", "<empty>"})
|
||
|
||
|
||
def _strip_redacted_sentinels(value: Any, key_hint: str = "") -> Any:
    """Return a copy of ``value`` without secret-keyed redaction placeholders.

    Write-side counterpart of `_redact`: the GET handler masks secret-keyed
    children so the form never shows cleartext, and this function makes
    sure those placeholders do not travel back into the overlay when a
    caller round-trips the GET payload. It is defense-in-depth next to the
    client-side `scrubRedactedSecrets` in `admin_server_config.html`, so an
    API caller (CLI / script) that forgets to scrub still cannot overwrite
    a real secret with a placeholder through this endpoint.

    Args:
        value: Patch subtree (dict, list, or leaf).
        key_hint: Key under which ``value`` was found. It is forwarded
            through the recursion but not consulted by this function.

    Returns:
        Dicts and lists are rebuilt; a dict entry is dropped when its key
        matches `_is_secret_key` and its value is a string contained in
        `_REDACTED_SENTINELS`. Any other value is returned unchanged.
    """
    if isinstance(value, list):
        return [_strip_redacted_sentinels(element, key_hint) for element in value]
    if not isinstance(value, dict):
        return value
    return {
        name: _strip_redacted_sentinels(child, name)
        for name, child in value.items()
        if not (
            _is_secret_key(name)
            and isinstance(child, str)
            and child in _REDACTED_SENTINELS
        )
    }
|
||
|
||
|
||
def _redact(value: Any, key_hint: str = "") -> Any:
    """Return a copy of a config subtree with secret-looking fields masked.

    Args:
        value: Subtree to sanitize (dict, list, or leaf).
        key_hint: The parent key of ``value``. It lets a leaf such as
            ``"${KEBOOLA_TOKEN}"`` stored under ``token_env`` be masked:
            the value is not a credential itself, but the key says it
            points at one.

    Returns:
        For a dict, a new dict in which every entry whose key matches
        `_is_secret_key` is replaced wholesale by `_mask` and every other
        entry is redacted recursively. For a list, a new list of redacted
        items (each inheriting ``key_hint``). For a leaf, the `_mask`
        placeholder when ``key_hint`` is a secret key, else the leaf
        itself.
    """
    if isinstance(value, dict):
        sanitized = {}
        for name, child in value.items():
            sanitized[name] = _mask(child) if _is_secret_key(name) else _redact(child, name)
        return sanitized
    if isinstance(value, list):
        return [_redact(element, key_hint) for element in value]
    return _mask(value) if key_hint and _is_secret_key(key_hint) else value
|
||
|
||
|
||
def _diff_dicts(before: dict, after: dict, path: str = "") -> List[Dict[str, Any]]:
    """Return a flat, audit-safe list of the fields that differ between two dicts.

    Output: ``[{"path": "email.smtp_host", "before": "...", "after": "..."}]``,
    ordered by sorted key at every nesting level.

    The comparison runs on RAW values and redaction is applied only when a
    row is emitted. Pre-masking the inputs would collapse a secret rotation
    (password A → password B) into "no diff" because both sides redact to
    ``"***"``, and the audit log would silently miss one of the most
    security-relevant changes. Compare raw, redact when emitting.

    Redaction rules (kept consistent with `_redact`):

    * A key matching `_is_secret_key` yields at most ONE row at its own
      path with both sides replaced by `_mask`, whatever the value shape.
      The value is never descended into, so children with innocuous names
      (``client_secret: {value: ...}``) cannot leak.
    * Values under any other key are passed through `_redact` before being
      emitted, so secret-keyed entries nested inside a list or a replaced
      dict are masked while plain scalars are emitted unchanged.

    Dict values under non-secret keys are recursed into (treating a missing
    / ``None`` side as ``{}``), so adding a brand-new section reports
    per-field paths (``email.smtp_host``) rather than one opaque ``email``
    blob. A dict↔scalar shape change is recorded as a single replacement at
    the parent path; recursing with ``{}`` there would lose the scalar side
    entirely (``keboola: {…}`` → ``keboola: "disabled"``).

    Args:
        before: Config (sub)tree prior to the change.
        after: Config (sub)tree after the change.
        path: Dotted prefix of the current nesting level; empty at the root.

    Returns:
        One ``{"path", "before", "after"}`` dict per changed field.
    """
    changes: List[Dict[str, Any]] = []
    for key in sorted(set(before.keys()) | set(after.keys())):
        new_path = f"{path}.{key}" if path else key
        b_val = before.get(key)
        a_val = after.get(key)

        # Secret-named key: mask the whole value and stop here. Descending
        # would emit raw values for children whose own names look harmless.
        if _is_secret_key(key):
            if b_val != a_val:
                changes.append({
                    "path": new_path,
                    "before": _mask(b_val),
                    "after": _mask(a_val),
                })
            continue

        b_is_dict = isinstance(b_val, dict)
        a_is_dict = isinstance(a_val, dict)
        # Dict-vs-dict (or dict-vs-None) → recurse for per-field paths.
        if b_is_dict and a_is_dict:
            changes.extend(_diff_dicts(b_val, a_val, new_path))
        elif b_is_dict and a_val is None:
            changes.extend(_diff_dicts(b_val, {}, new_path))
        elif a_is_dict and b_val is None:
            changes.extend(_diff_dicts({}, a_val, new_path))
        # Dict↔scalar shape change, or a plain leaf change. `_redact` is a
        # no-op for scalars under a non-secret key, but masks secret-keyed
        # children of a replaced dict and of dicts nested inside lists, so
        # ${ENV_VAR}-resolved values never reach the audit log.
        elif b_is_dict != a_is_dict or b_val != a_val:
            changes.append({
                "path": new_path,
                "before": _redact(b_val, key),
                "after": _redact(a_val, key),
            })
    return changes
|
||
|
||
|
||
def _deep_merge(base: dict, patch: dict) -> dict:
|
||
"""Merge `patch` into `base` recursively, returning a new dict.
|
||
|
||
Patch values overwrite base values. Dict-into-dict recurses; everything
|
||
else (lists, scalars, None) is replaced wholesale — admin sets
|
||
``email: {smtp_port: 465}`` and we don't try to re-merge nested ports.
|
||
"""
|
||
out = dict(base)
|
||
for key, value in patch.items():
|
||
if isinstance(value, dict) and isinstance(out.get(key), dict):
|
||
out[key] = _deep_merge(out[key], value)
|
||
else:
|
||
out[key] = value
|
||
return out
|
||
|
||
|
||
def _load_current_instance_yaml() -> dict:
    """Return the editor's view of instance.yaml.

    Delegates to ``app.instance_config.load_instance_config`` (the deep
    merge of static + overlay), so the editor never sees a different view
    than the rest of the running app.

    Readers (GET /server-config) rely on the cache behind that helper and
    trust writers to invalidate it. Writers must call ``reset_cache()``
    explicitly *before* calling this function so their read-modify-write
    sequence starts from the latest on-disk state.
    """
    # Resolved at call time, like the original function-scope import.
    from app import instance_config

    return instance_config.load_instance_config()
|
||
|
||
|
||
def _public_view(config: dict) -> dict:
    """Return a copy of ``config`` that is safe to render in the admin UI form.

    The config is deep-copied and every secret-looking field is replaced by
    a placeholder via `_redact`, so an admin can see *which* fields are
    populated without the cleartext reaching the rendered HTML or the
    browser DevTools.
    """
    from copy import deepcopy

    snapshot = deepcopy(config)
    return _redact(snapshot)
|
||
|
||
|
||
class ServerConfigUpdateRequest(BaseModel):
    """Patch payload for POST /api/admin/server-config.

    Only the sections listed in `_EDITABLE_SECTIONS` are accepted; anything
    else is rejected with 400. `confirm_danger` must be true if the patch
    touches any danger-zone section (auth.*, server.*).
    """

    # Partial patch keyed by top-level section name; each value is the
    # (possibly nested) dict of fields to merge into that section.
    sections: Dict[str, Dict[str, Any]] = Field(
        default_factory=dict,
        description="Per-section patch dict (e.g. {'instance': {'name': 'X'}})",
    )
    # Explicit opt-in for danger-zone edits; defaults to False so a caller
    # has to ask for them deliberately.
    confirm_danger: bool = Field(
        default=False,
        description="Must be true to apply changes touching auth.* or server.*",
    )
|
||
|
||
|
||
# Optional BQ fields whose runtime defaults are documented but which used to
|
||
# be invisible in the editor when YAML omitted them. The data_source.bigquery
|
||
# subtree renders as a JSON textarea; a key that's absent from the GET
|
||
# payload literally cannot appear in the form for the operator to edit. We
|
||
# surface them with their documented defaults so the UI always shows them as
|
||
# editable knobs — see Phase J of the admin-tables-cleanup work.
|
||
#
|
||
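# - billing_project: defaults to the data project; an explicit value is
#   needed when the SA can read the data project but not bill against it.
#   Deliberately NOT seeded in the dict below (see the note inside it), so
#   it stays on the UI's "unset" rendering path.
# - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'`
#   (default 10 GiB; 0 disables; null falls through to the default).
# - bq_max_scan_bytes: seeded with 5368709120 (5 GiB).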
|
||
_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = {
|
||
# `billing_project` intentionally NOT seeded here. The empty-string
|
||
# default would inject `billing_project: ""` into every GET payload,
|
||
# which makes the JS `isUnset = (value === undefined)` check evaluate
|
||
# False — and the `(defaults to <project>)` placeholder feature
|
||
# (#160 §4.7.5) would never render. Leaving it absent keeps the
|
||
# field in the unset rendering path so placeholder_from fires.
|
||
# Devin Review iter #3 on PR #168.
|
||
"max_bytes_per_materialize": 10737418240,
|
||
"bq_max_scan_bytes": 5368709120,
|
||
}
|
||
|
||
|
||
def _ensure_bq_optional_fields(sections: Dict[str, Any]) -> None:
|
||
"""In-place: add missing BQ optional fields to data_source.bigquery so the
|
||
UI's JSON-textarea renders them as editable keys. Existing values are
|
||
preserved — only absent keys are populated with their documented default.
|
||
"""
|
||
ds = sections.get("data_source")
|
||
if not isinstance(ds, dict):
|
||
return
|
||
bq = ds.get("bigquery")
|
||
if not isinstance(bq, dict):
|
||
# No BQ subsection — leave alone. Non-BQ instances don't need these
|
||
# knobs, and creating an empty bigquery dict would be misleading.
|
||
return
|
||
for key, default in _BQ_OPTIONAL_FIELD_DEFAULTS.items():
|
||
bq.setdefault(key, default)
|
||
|
||
|
||
@router.get("/server-config")
async def get_server_config(
    user: dict = Depends(require_admin),
):
    """Return the current instance.yaml with secrets redacted.

    Feeds the /admin/server-config UI, which prefills its form from this
    payload. The redacted payload mirrors the actual file shape, so the UI
    needs no schema knowledge: it walks the editable sections and renders
    whatever fields it finds.

    Response keys: ``sections`` (one entry per editable section, ``{}``
    when the config has none), ``editable_sections``, ``danger_sections``,
    ``secret_key_patterns`` and ``known_fields``.
    """
    visible = _public_view(_load_current_instance_yaml())

    # One entry per editable section, even when the file omits it, so the
    # UI renders its header and the operator can populate it from scratch
    # without manual JSON edits.
    sections = {}
    for name in _EDITABLE_SECTIONS:
        sections[name] = visible.get(name, {})

    # Always surface the optional BQ knobs so the operator sees them in the
    # UI's JSON editor instead of having to know they exist (Phase J).
    _ensure_bq_optional_fields(sections)

    return {
        "sections": sections,
        "editable_sections": list(_EDITABLE_SECTIONS),
        "danger_sections": list(_DANGER_SECTIONS),
        "secret_key_patterns": list(_SECRET_KEY_PATTERNS),
        # Known-but-optional fields per section, so the UI can render
        # placeholders for fields the operator hasn't set yet (Phase J).
        # Adding entries is purely a data edit in `_KNOWN_FIELDS`.
        "known_fields": _KNOWN_FIELDS,
    }
|
||
|
||
|
||
@router.post("/server-config")
async def update_server_config(
    request: ServerConfigUpdateRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Patch instance.yaml from the /admin/server-config editor.

    Accepts a partial patch keyed by section. Validates sections, refuses
    danger-zone edits without explicit confirmation, deep-merges into the
    current overlay, writes the file atomically, and emits one audit entry
    per save with a sanitized diff. Always returns
    ``restart_required=true`` so the UI can show the restart banner —
    hot-reload is a separate issue (see #91 Out of scope).

    Raises:
        HTTPException: 422 when ``sections`` is empty or a section patch is
            not an object; 400 for unknown sections or danger-zone sections
            without ``confirm_danger``; 500 when the existing overlay file
            cannot be parsed or its top level is not a mapping (the file is
            left untouched). The ``_validate_*`` helpers called here may
            raise as well.
    """
    import yaml

    if not request.sections:
        raise HTTPException(status_code=422, detail="sections cannot be empty")

    # Reject unknown sections loudly. Without this, a typo like "thmee"
    # would silently land in the YAML root and the operator wouldn't see
    # their colour change apply.
    unknown = sorted(set(request.sections.keys()) - set(_EDITABLE_SECTIONS))
    if unknown:
        raise HTTPException(
            status_code=400,
            detail=f"unknown section(s): {', '.join(unknown)}. "
                   f"Editable: {', '.join(_EDITABLE_SECTIONS)}",
        )

    # Danger-zone gate. The UI shows a confirmation dialog before posting
    # with confirm_danger=true; an API caller (CLI/script) has to pass it
    # explicitly so they can't fat-finger a hostname change.
    danger_touched = sorted(set(request.sections.keys()) & set(_DANGER_SECTIONS))
    if danger_touched and not request.confirm_danger:
        raise HTTPException(
            status_code=400,
            detail=f"section(s) {', '.join(danger_touched)} require confirm_danger=true",
        )

    # SSRF protection — same gate the /configure wizard applies to
    # keboola_url, but here it covers any URL-bearing field reachable via
    # the per-section patch (e.g. data_source.keboola.stack_url).
    _validate_urls_in_patch(request.sections)

    # Field-level constraints for sections whose values have documented ranges.
    _validate_materialize_section(request.sections)

    # Defense-in-depth: scrub redaction sentinels (`***` / `<empty>`) out of
    # secret-keyed leaves in the patch before they reach the deep-merge.
    # The client form does the same scrub, but an API caller round-tripping
    # the GET payload could otherwise overwrite real overlay secrets with
    # the placeholder shown in the form.
    scrubbed_sections: Dict[str, Dict[str, Any]] = {
        section: _strip_redacted_sentinels(patch, section)
        for section, patch in request.sections.items()
    }

    # Serialize read-modify-write across concurrent admin saves. Without the
    # lock, two saves would each read the same overlay snapshot, merge their
    # disjoint patches, and the second os.replace would silently drop the
    # first patch. The lock spans the cache-invalidate → load → merge →
    # atomic-write sequence; the audit log sits outside since it operates on
    # local snapshots.
    from app.instance_config import reset_cache
    from app.secrets import _state_dir

    config_path = _state_dir() / "instance.yaml"
    config_path.parent.mkdir(parents=True, exist_ok=True)

    with _overlay_write_lock:
        # Drop the in-process cache so we read the latest on-disk state,
        # including any update that landed from a concurrent caller before
        # we acquired the lock.
        reset_cache()
        before = _load_current_instance_yaml()

        # Deep merge — section-by-section so we never accidentally delete a
        # sibling section the patch didn't touch. Use the redaction-scrubbed
        # patch so a round-tripped GET payload can't overwrite real secrets
        # with the `***` placeholder. `after` only feeds the audit diff; it
        # is never written to disk.
        after = dict(before)
        for section, patch in scrubbed_sections.items():
            if not isinstance(patch, dict):
                raise HTTPException(
                    status_code=422,
                    detail=f"section '{section}' must be an object, got {type(patch).__name__}",
                )
            if isinstance(after.get(section), dict):
                after[section] = _deep_merge(after[section], patch)
            else:
                after[section] = patch

        # Write only the sections the user actually patched in this request.
        # Two reasons:
        #   1. Persisting the full merged config (or every editable section)
        #      would snapshot non-editable static sections into the overlay,
        #      shadowing later operator updates to those sections in the
        #      static file (`_load_current_instance_yaml` merges static +
        #      overlay, overlay wins per leaf).
        #   2. The merged config has `${ENV_VAR}` placeholders RESOLVED to
        #      the runtime values by config.loader. Writing every editable
        #      section back would persist real cleartext secrets where the
        #      static file had only env-var references — turning
        #      `smtp_password: ${SMTP_PASSWORD}` into `smtp_password: hunter2`
        #      in the overlay.
        # By writing only the sections in `request.sections` we keep both the
        # static-evolution and the env-var-placeholder properties intact.
        overlay_payload: Dict[str, Any] = {}
        if config_path.exists():
            try:
                overlay_payload = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
            except Exception as e:
                # A corrupt overlay used to be silently replaced — that masked
                # disk corruption / partial writes / hand-edits and dropped
                # every previously-saved section on the next save. Refuse and
                # surface so the operator can investigate.
                logger.exception("server-config: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                           "back up and remove the file, or fix it by hand",
                ) from e
            # `safe_load` happily returns a list or a scalar for YAML that is
            # syntactically valid but not a mapping. Treat that as corrupt
            # too, instead of crashing on `.get` below with an opaque 500.
            if not isinstance(overlay_payload, dict):
                logger.error(
                    "server-config: refusing to overwrite overlay at %s: top-level value is %s, expected a mapping",
                    config_path, type(overlay_payload).__name__,
                )
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} "
                           f"(top-level YAML value is {type(overlay_payload).__name__}, expected a mapping); "
                           "back up and remove the file, or fix it by hand",
                )
        for section, patch in scrubbed_sections.items():
            if section not in _EDITABLE_SECTIONS:
                continue
            # Deep-merge the patch into the existing overlay slot, or into an
            # empty dict when the overlay has nothing usable for this section
            # — never into the merged `before`, whose ${ENV_VAR} placeholders
            # are already resolved. This preserves unrelated overlay keys the
            # operator didn't touch in this request.
            existing = overlay_payload.get(section)
            if not isinstance(existing, dict):
                existing = {}
            overlay_payload[section] = _deep_merge(existing, patch)

        # Atomic via tmp + os.replace so two concurrent admin saves can't
        # interleave bytes and produce corrupt YAML (especially harmful since
        # auth.* is editable here — half-written file → operator lockout).
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(
            yaml.dump(overlay_payload, default_flow_style=False, sort_keys=False),
            encoding="utf-8",
        )
        os.replace(tmp_path, config_path)
        logger.info("server-config: wrote %d section(s) to %s",
                    len(request.sections), config_path)

        # Invalidate cached instance config so subsequent reads pick up the
        # change. Hot-reload of running modules (auth providers, SMTP client)
        # is out of scope — the restart banner tells the operator to bounce.
        reset_cache()

    # Audit entry — diff is computed on RAW values, then `_diff_dicts`
    # redacts secret fields row by row. Pre-masking the inputs would collapse
    # a secret rotation into "no diff" because both sides redact to ``***``,
    # hiding the most security-relevant changes from the audit log. We log
    # even if no fields changed so the operator's intent (touched the page,
    # hit save) is auditable.
    diff = _diff_dicts(before, after)
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="instance_config.update",
        resource="instance.yaml",
        params={
            "sections": sorted(request.sections.keys()),
            "danger_sections": danger_touched,
            "diff": diff,
            "diff_count": len(diff),
        },
    )

    return {
        "status": "ok",
        "restart_required": True,
        "sections_updated": sorted(request.sections.keys()),
        "diff_count": len(diff),
    }
|
||
|
||
|
||
# --- End server-config editor -----------------------------------------------
|
||
|
||
|
||
# Source types accepted by /api/admin/register-table. Anything else is
# rejected with 422 — this keeps a typo'd source_type from silently landing
# in table_registry (where it would later confuse the orchestrator scan).
_VALID_SOURCE_TYPES: tuple[str, ...] = ("keboola", "bigquery", "jira", "local")
|
||
|
||
# Explicit allowlist of audit-payload keys whose values are credentials and
|
||
# must be masked. Substring-scan + ad-hoc whitelist (the previous shape) is
|
||
# fragile in two ways:
|
||
# 1. False positive: legit fields like `primary_key` get masked because
|
||
# they contain "key" — we then need a whitelist exception, which has
|
||
# to be kept in sync as new fields are added.
|
||
# 2. Over- and under-matching at once: a future field like
# `primary_key_hash` *would* be masked (defensible), but
# `not_actually_a_token` ALSO matches "token" and gets masked
# unnecessarily (one more false positive); conversely — the actual
# false negative — a brand-new credential field that doesn't contain
# one of the patterns (`auth_material`, `bearer`) silently leaks.
|
||
# Allowlist puts the burden on the developer adding a new secret-bearing
|
||
# field: they must add the literal key name here, which forces a code-
|
||
# review touch on the audit path. Audit the current Pydantic models
|
||
# (RegisterTableRequest / UpdateTableRequest / ConfigureRequest /
|
||
# ServerConfigUpdateRequest) when extending — the registry payloads don't
|
||
# currently carry credentials, but ConfigureRequest does (`keboola_token`)
|
||
# and could be routed through this sanitizer in the future.
|
||
_SECRET_FIELDS: frozenset = frozenset({
|
||
# ConfigureRequest — POST /api/admin/configure carries Keboola creds.
|
||
"keboola_token",
|
||
# Generic names that have appeared in earlier iterations of admin
|
||
# request bodies and could resurface — keep them masked defensively.
|
||
"api_token",
|
||
"auth_token",
|
||
"bot_token",
|
||
"client_secret",
|
||
"google_client_secret",
|
||
"google_oauth_client_secret",
|
||
"password",
|
||
"smtp_password",
|
||
"webapp_secret_key",
|
||
"bot_secret",
|
||
# Marketplace PATs (private repos) — see src/marketplace.py.
|
||
"marketplace_token",
|
||
"marketplace_pat",
|
||
})
|
||
|
||
|
||
def _sanitize_for_audit(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||
"""Mask credential-bearing fields in a request payload before audit_log.
|
||
|
||
Uses an explicit `_SECRET_FIELDS` allowlist (case-insensitive) instead
|
||
of substring matching. The trade-off is that adding a new secret field
|
||
requires updating the set — but that's the *point*: the test suite
|
||
asserts `not_actually_a_token` does NOT get masked, so a substring-
|
||
based regression would surface immediately, and a missing entry for a
|
||
real new credential gets caught at code review of the audit path.
|
||
"""
|
||
out: Dict[str, Any] = {}
|
||
for k, v in payload.items():
|
||
if k.lower() in _SECRET_FIELDS:
|
||
out[k] = "***" if v not in (None, "") else "<empty>"
|
||
else:
|
||
out[k] = v
|
||
return out
|
||
|
||
|
||
# Both the BigQuery and Keboola materialize paths funnel `source_query`
|
||
# through DuckDB (BQ via the bigquery extension's COPY translation, Keboola
|
||
# via an ATTACH'd extension and a direct COPY). DuckDB uses double quotes
|
||
# for quoted identifiers — backticks are a BigQuery-native syntactic form
|
||
# DuckDB's parser does not honor, so a backtick-quoted source_query either
|
||
# parse-errors at COPY time or silently scans nothing. Surfaced from the
|
||
# field validator on RegisterTableRequest AND the merged-record path in
|
||
# `update_table` so neither route can persist a backtick query.
|
||
# User-facing validation message; the adjacent string literals below are
# concatenated into a single sentence group at compile time.
_BACKTICK_REJECTION_MESSAGE = (
    "source_query uses BigQuery-native backtick identifiers (e.g. "
    "`project.dataset.table`), but the materialize path runs the SQL "
    "through DuckDB's BigQuery extension which uses DuckDB-flavor "
    "identifiers. Rewrite to DuckDB syntax: bq.\"dataset\".\"table\" "
    "(with the attached catalog alias `bq` plus double-quoted dataset/"
    "table). The instance is configured with the data project, so you "
    "don't need to repeat it in the FROM clause."
)
|
||
|
||
|
||
class RegisterTableRequest(BaseModel):
|
||
name: str
|
||
folder: Optional[str] = None
|
||
sync_strategy: str = Field(
|
||
default="full_refresh",
|
||
description=(
|
||
"Per-table extraction strategy. v26+: drives the Keboola "
|
||
"extractor's dispatcher in connectors/keboola/extractor.py. "
|
||
"Allowed values: 'full_refresh' (default; full table dump on "
|
||
"each sync), 'incremental' (Storage API changedSince + "
|
||
"primary-key dedup merge), 'partitioned' (per-partition "
|
||
"parquet files keyed by partition_by column, per-partition "
|
||
"merge for daily updates, chunked initial load). "
|
||
"Pre-v26 this field was inert; existing rows default to "
|
||
"'full_refresh' so behavior is unchanged unless an admin "
|
||
"opts a table in to incremental/partitioned."
|
||
),
|
||
)
|
||
# Composite primary keys are real (session-grain MSA tables key on
|
||
# `(session_id, event_date)`, browse rows on more). The frontend sends +
|
||
# reads this as a list; backend stores it JSON-serialized in VARCHAR.
|
||
# A bare string is accepted for backward compat — see _normalize_primary_key.
|
||
primary_key: Optional[List[str]] = None
|
||
description: Optional[str] = None
|
||
source_type: Optional[str] = None
|
||
bucket: Optional[str] = None
|
||
source_table: Optional[str] = None
|
||
# Backs query_mode='materialized'. Stored verbatim in
|
||
# table_registry.source_query (schema v20); the trigger pass runs it
|
||
# through the DuckDB BQ extension via BqAccess and writes the result
|
||
# to /data/extracts/bigquery/data/<id>.parquet.
|
||
source_query: Optional[str] = None
|
||
query_mode: str = "local"
|
||
sync_schedule: Optional[str] = None
|
||
profile_after_sync: bool = Field(
|
||
default=True,
|
||
deprecated=True,
|
||
description=(
|
||
"DEPRECATED: not consumed by the runtime (Agent 1 finding "
|
||
"2026-05-01). Profiler runs unconditionally on every synced "
|
||
"table; this flag has no effect. Field stays for back-compat."
|
||
),
|
||
)
|
||
# v26 — Keboola sync-strategy support fields. All optional; meaningful
|
||
# only when paired with the matching sync_strategy. Per-strategy
|
||
# required-field rules + conflict policy enforced in the model_validator
|
||
# below.
|
||
incremental_window_days: Optional[int] = None
|
||
max_history_days: Optional[int] = None
|
||
incremental_column: Optional[str] = None
|
||
where_filters: Optional[List[Dict[str, Any]]] = None
|
||
partition_by: Optional[str] = None
|
||
partition_granularity: Optional[str] = None
|
||
initial_load_chunk_days: Optional[int] = None
|
||
# v51 — fully-qualified BigQuery path. When set on a BigQuery row,
|
||
# the extractor uses ``project.dataset.table`` from this field instead
|
||
# of constructing the path from ``bucket`` + ``source_table`` against
|
||
# the globally-attached project. Decouples UX/RBAC ``bucket`` label
|
||
# from physical BQ dataset (issue #343). Format ``project.dataset.table``;
|
||
# validated by ``connectors.bigquery.extractor.parse_bq_fqn``.
|
||
bq_fqn: Optional[str] = Field(
|
||
default=None,
|
||
description=(
|
||
"Fully-qualified BigQuery path (``project.dataset.table``). "
|
||
"Only applies to source_type='bigquery'. When set, overrides "
|
||
"the legacy bucket+source_table path construction. Use this "
|
||
"to register a table whose BQ dataset name differs from the "
|
||
"Agnes ``bucket`` label (issue #343)."
|
||
),
|
||
)
|
||
|
||
@model_validator(mode="after")
|
||
def _check_mode_query_coherence(self):
|
||
"""Enforce query_mode ↔ source_query invariants up front so an admin
|
||
can't persist a remote/local row carrying an orphan source_query.
|
||
|
||
For BigQuery materialized rows, an empty source_query is allowed here
|
||
because _validate_bigquery_register_payload generates it from
|
||
bucket+source_table after this validator runs. For all other source
|
||
types (e.g. Keboola), source_query is still required for materialized.
|
||
"""
|
||
sq = (self.source_query or "").strip() or None
|
||
if self.query_mode != "materialized" and sq:
|
||
raise ValueError(
|
||
"source_query is only valid when query_mode='materialized'"
|
||
)
|
||
# BigQuery materialized auto-generates a full-table-dump SQL from
|
||
# `bucket`+`source_table` when source_query is omitted (see
|
||
# `register_table` BQ branch). Keboola materialized: a NULL
|
||
# source_query means "full-table export via Storage API
|
||
# export-async" — no SQL needed (the API takes a structured
|
||
# filter, see `connectors/keboola/storage_api.py:ExportFilter`).
|
||
# Other source_types (e.g. jira) don't support materialized mode
|
||
# and require an explicit source_query if the operator opts in.
|
||
if (
|
||
self.query_mode == "materialized"
|
||
and not sq
|
||
and self.source_type not in ("bigquery", "keboola")
|
||
):
|
||
raise ValueError(
|
||
f"query_mode='materialized' for source_type='{self.source_type}' "
|
||
"requires a non-empty source_query"
|
||
)
|
||
# Backtick guard stays for non-materialized rows (DuckDB-flavor SQL
|
||
# contract); materialized SQL is BigQuery-native and MUST allow
|
||
# backticks for dashed identifiers (e.g. `prj-org.dataset.table`).
|
||
if self.query_mode != "materialized" and sq and "`" in sq:
|
||
raise ValueError(_BACKTICK_REJECTION_MESSAGE)
|
||
# Normalise: stash the trimmed-or-None form so the persisted column
|
||
# never carries surrounding whitespace or empty-string sentinels.
|
||
self.source_query = sq
|
||
return self
|
||
|
||
    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        """Pass the raw ``primary_key`` input through `_normalize_primary_key`.

        Runs in ``mode="before"``, i.e. ahead of the field's own type
        validation.
        """
        return _normalize_primary_key(v)
|
||
|
||
    @field_validator("description", mode="before")
    @classmethod
    def _normalize_description(cls, v):
        """Pass the raw ``description`` input through `_unescape_shell_quoting`."""
        # Defensive normalization for descriptions arriving via shell-quoting
        # tooling that injects literal backslash escapes (e.g. `Don\'t`, `\n`).
        return _unescape_shell_quoting(v)
|
||
|
||
@field_validator("source_type", mode="before")
|
||
@classmethod
|
||
def _validate_source_type(cls, v):
|
||
# None is tolerated for backward compat with old CLI scripts that
|
||
# didn't set a source_type; the route resolves it later. Anything
|
||
# else must be in the canonical list.
|
||
if v in (None, ""):
|
||
return v
|
||
if v not in _VALID_SOURCE_TYPES:
|
||
raise ValueError(
|
||
f"source_type must be one of {sorted(_VALID_SOURCE_TYPES)}, got {v!r}"
|
||
)
|
||
return v
|
||
|
||
@field_validator("sync_schedule", mode="before")
|
||
@classmethod
|
||
def _validate_sync_schedule(cls, v):
|
||
# None / "" → no schedule, accepted.
|
||
# Any non-empty string (including pure whitespace) must parse as a
|
||
# valid schedule — otherwise it would be persisted and silently
|
||
# ignored by the runtime evaluator.
|
||
if v in (None, ""):
|
||
return v
|
||
if not is_valid_schedule(v):
|
||
raise ValueError(
|
||
f"sync_schedule must be 'every Nm' / 'every Nh' / "
|
||
f"'daily HH:MM[,HH:MM,...]', got {v!r}"
|
||
)
|
||
return v
|
||
|
||
@field_validator("sync_strategy", mode="before")
|
||
@classmethod
|
||
def _validate_sync_strategy(cls, v):
|
||
"""v26: enforce the strategy enum. NULL/empty → 'full_refresh' default.
|
||
|
||
Pre-v26 the column accepted any string (catalog/profiler metadata
|
||
only). Now the extractor dispatches off this value, so unknown
|
||
strings would silently fall through to the default branch and
|
||
confuse operators.
|
||
"""
|
||
if v in (None, ""):
|
||
return "full_refresh"
|
||
allowed = {"full_refresh", "incremental", "partitioned"}
|
||
if v not in allowed:
|
||
raise ValueError(
|
||
f"sync_strategy must be one of {sorted(allowed)}, got {v!r}"
|
||
)
|
||
return v
|
||
|
||
@field_validator("partition_granularity", mode="before")
@classmethod
def _validate_partition_granularity(cls, v):
    """Accept an unset granularity or one of ``day`` / ``month`` / ``year``.

    ``None`` / ``""`` are returned unchanged.

    Raises:
        ValueError: when ``v`` is set but is not an allowed granularity.
    """
    if v in (None, ""):
        return v
    # Tuple rather than set so an unhashable raw value (list / dict) is
    # rejected with ValueError below instead of raising TypeError from a
    # hash-based membership test.
    allowed = ("day", "month", "year")
    if v not in allowed:
        raise ValueError(
            f"partition_granularity must be one of {sorted(allowed)}, got {v!r}"
        )
    return v
|
||
|
||
@field_validator("where_filters", mode="before")
@classmethod
def _validate_where_filters(cls, v):
    """Validate filter shape via parse_filters from the keboola module.

    Accepts None / empty list, a JSON string, or a pre-parsed list.
    Returns the canonical list-of-dicts form for storage. Raises
    ValueError(InvalidFilterError message) on malformed shape so
    FastAPI returns 422 with a useful body. Placeholders are NOT
    resolved here — they're resolved at sync time so a misspelled
    token is caught when the next sync runs (admin can register a
    rolling-window filter today and the sync next month uses the
    same filter shape with a fresh date).
    """
    if v in (None, "", []):
        return None
    # Imported lazily, as in the rest of this module's connector usage.
    from connectors.keboola.where_filters import parse_filters, InvalidFilterError
    try:
        return parse_filters(v)
    except InvalidFilterError as e:
        # Chain explicitly so the original parser error is recorded as the
        # direct cause rather than as incidental context.
        raise ValueError(str(e)) from e
|
||
|
||
@model_validator(mode="after")
def _check_strategy_invariants(self):
    """v27 conflict policy + per-strategy required-field rules.

    Combinations that the extractor layer would silently mishandle are
    rejected here, so the row never lands in the registry only to
    confuse operators when the next sync misbehaves.

    - partitioned ⇒ partition_by required, query_mode='remote' rejected.
      partition_granularity defaults to 'month' if omitted.
    - incremental + where_filters → rejected. changedSince already does
      temporal filtering; layering server-side row filters on top is
      not supported by the extractor (legacy repo silently drops
      filters in this combination — match the rejection here).
    - partitioned + where_filters → rejected. extract_partitioned does
      not thread where_filters through to its chunked downloads;
      accepting the pair would persist a filter that gets silently
      ignored at sync time (Devin Review concern). Reject explicitly
      until threading lands.
    - query_mode='remote' + where_filters → rejected.
      _extract_via_extension (the remote/extension path) doesn't take a
      filters argument; accepting would silently drop them.
    """
    strategy = self.sync_strategy
    remote_mode = self.query_mode == "remote"
    filtered = bool(self.where_filters)

    if strategy == "partitioned":
        if not self.partition_by:
            raise ValueError(
                "sync_strategy='partitioned' requires partition_by to be set"
            )
        if remote_mode:
            raise ValueError(
                "sync_strategy='partitioned' is incompatible with query_mode='remote' "
                "— partitioned writes per-partition parquet files locally"
            )
        if filtered:
            raise ValueError(
                "sync_strategy='partitioned' is incompatible with where_filters "
                "in v27 — extract_partitioned does not thread where_filters "
                "through its chunked downloads; the filter would be silently "
                "ignored. Use 'full_refresh' for filter+full-overwrite, or "
                "wait for partitioned + where_filters wiring in a future PR."
            )
        # Only assign when the caller left the granularity unset.
        if not self.partition_granularity:
            self.partition_granularity = "month"
    elif strategy == "incremental" and filtered:
        raise ValueError(
            "sync_strategy='incremental' is incompatible with where_filters "
            "— changedSince already filters temporally; layering whereFilters "
            "on top is silently dropped by the extractor (use 'full_refresh' "
            "for filter+full-overwrite)"
        )

    # query_mode='remote' + where_filters: the DuckDB Keboola extension
    # path does not consume whereFilters. Accepting would silently drop
    # them at sync time. Caller must use query_mode='local' (Direct
    # extract) to apply filters.
    if remote_mode and filtered:
        raise ValueError(
            "query_mode='remote' is incompatible with where_filters "
            "— the DuckDB Keboola extension does not expose whereFilters. "
            "Use query_mode='local' (Direct extract) to apply server-side "
            "row filters."
        )

    return self
|
||
|
||
|
||
def _generate_materialized_source_query(
    bucket: str, source_table: str, project_id: str,
) -> str:
    """Build the canonical full-table-dump source_query for a materialized
    BQ row when admin only supplies dataset + table.

    The three name parts are validated in order (dataset, table, project);
    the first one that fails aborts with HTTPException(400). The returned
    text is BigQuery-native SQL — wrapped at materialize time into
    bigquery_query(...) by connectors.bigquery.extractor.materialize_query.
    """
    # (validator, value, error detail) — walked in order so the first
    # offending part determines the error the caller sees.
    guards = (
        (
            _is_safe_quoted_identifier,
            bucket,
            f"bigquery: dataset {bucket!r} is unsafe",
        ),
        (
            _is_safe_quoted_identifier,
            source_table,
            f"bigquery: source_table {source_table!r} is unsafe",
        ),
        (
            _is_safe_project_id,
            project_id,
            f"bigquery: data_source.bigquery.project {project_id!r} is malformed",
        ),
    )
    for accepts, candidate, message in guards:
        if not accepts(candidate):
            raise HTTPException(status_code=400, detail=message)
    return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`"
|
||
|
||
|
||
def _require_safe_bigquery_view_name(raw_name: str) -> None:
    """Raise HTTPException(400) unless ``raw_name`` is a safe DuckDB view name."""
    # CRITICAL: this checks the RAW name — the value that ``register_table``
    # actually persists to ``table_registry.name`` and which the BQ extractor
    # reads back as the DuckDB view name at next rebuild. Earlier revisions
    # normalized first (``strip().lower().replace(" ", "_")``) and then
    # checked, which let names like ``"my table"`` pass here, get stored
    # verbatim, and then blow up inside ``_init_extract`` at view-create
    # time — defeating the whole point of fast-fail-at-register. We do NOT
    # silently rewrite the operator's name; return 400 with a clear message
    # and let them retype with a corrected name.
    if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: view name {raw_name!r} is unsafe — must match "
                f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) "
                "with no leading/trailing whitespace"
            ),
        )


def _require_configured_bigquery_project() -> str:
    """Return ``data_source.bigquery.project``; HTTPException(400) if unset or malformed."""
    # Pull project from instance.yaml — single-project model in M1
    # (Decision: no per-table project field). Validating the format here
    # surfaces a config issue at registration rather than at first rebuild,
    # where the operator no longer has a request to look at.
    from app.instance_config import get_value

    project_id = get_value("data_source", "bigquery", "project", default="") or ""
    if not project_id:
        raise HTTPException(
            status_code=400,
            detail=(
                "bigquery: data_source.bigquery.project is not set in "
                "instance.yaml; configure it via /admin/server-config or "
                "/api/admin/configure first"
            ),
        )
    if not _is_safe_project_id(project_id):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: data_source.bigquery.project {project_id!r} "
                "is malformed — must match GCP project_id grammar "
                "^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
            ),
        )
    return project_id


def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
    """Enforce BQ-specific shape on a register/precheck request.

    Two BQ paths:

    - ``query_mode='materialized'`` — admin-registered SQL writes a parquet on
      schedule. Needs a safe view ``name`` and a configured project, plus
      EITHER a non-blank ``source_query`` (custom SQL) OR ``bucket`` +
      ``source_table``. In the latter case this function mutates the model:
      ``req.source_query`` is set to a server-generated full-table-dump SQL.

    - any other ``query_mode`` — remote view over a single BQ table.
      Requires ``bucket`` (BQ dataset) + ``source_table``, rejects wildcard
      names, validates the raw identifiers, and mutates the model by forcing
      ``query_mode='remote'`` (Decision 7 in #108).

    ``profile_after_sync`` is not touched on either path: per the Phase C
    notes below it is deprecated and no longer force-set here.

    Raises HTTPException(422) for missing required fields and
    HTTPException(400) for unsafe identifiers, wildcard names, or an
    unset / malformed project_id.
    """
    if req.query_mode == "materialized":
        # Materialized BQ rows: the SQL body replaces dataset+table refs.
        # source_query may be empty if admin supplied bucket+source_table —
        # in that case the server generates a full-table-dump SQL below.
        _require_safe_bigquery_view_name(req.name or "")
        project_id = _require_configured_bigquery_project()

        if not (req.source_query and req.source_query.strip()):
            # Server-generate from bucket+source_table. Trivial full-table
            # dump path; admin only sets dataset+table and the server
            # builds BQ-native SQL from instance.yaml's configured project.
            if not (req.bucket and req.source_table):
                raise HTTPException(
                    status_code=422,
                    detail=(
                        "bigquery materialized requires either source_query "
                        "(custom SQL) or bucket+source_table (server-generates "
                        "the full-table-dump SQL)"
                    ),
                )
            req.source_query = _generate_materialized_source_query(
                req.bucket, req.source_table, project_id,
            )

        # Phase C: profile_after_sync is now inert (Pydantic field marked
        # deprecated; not read by app/api/sync.py:410-438). The runtime
        # profiles every synced table unconditionally, so we no longer
        # force-set this here as a "signal."
        return

    if not req.bucket or not req.bucket.strip():
        raise HTTPException(
            status_code=422,
            detail="bigquery: 'bucket' (BQ dataset) is required",
        )
    if not req.source_table or not req.source_table.strip():
        raise HTTPException(
            status_code=422,
            detail="bigquery: 'source_table' is required",
        )
    # No wildcard / sharded BQ tables in M1 (Decision 8).
    if "*" in (req.source_table or "") or "*" in (req.bucket or ""):
        raise HTTPException(
            status_code=400,
            detail="bigquery: wildcard / sharded tables are not supported (see #108 M3+)",
        )

    # Strict identifier check on the raw DuckDB view name.
    _require_safe_bigquery_view_name(req.name or "")

    # Same fast-fail rule as the view name: validate the RAW value the
    # caller sent, not a stripped form. ``register_table`` persists ``bucket``
    # / ``source_table`` verbatim, and the BQ extractor splices them straight
    # into the ``ATTACH … AS bq_<bucket>`` and view DDL at next rebuild — so a
    # value with leading/trailing whitespace would otherwise pass validation,
    # get stored as-is, and explode inside DuckDB at view-create time. Surface
    # the offending raw value in the 400 detail and let the operator retype.
    raw_bucket = req.bucket
    if raw_bucket.strip() != raw_bucket or not _is_safe_quoted_identifier(raw_bucket):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: dataset {raw_bucket!r} is unsafe (only [A-Za-z0-9_.-] "
                "allowed, no leading/trailing whitespace)"
            ),
        )
    raw_source_table = req.source_table
    if raw_source_table.strip() != raw_source_table or not _is_safe_quoted_identifier(raw_source_table):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: source_table {raw_source_table!r} is unsafe (only "
                "[A-Za-z0-9_.-] allowed, no leading/trailing whitespace)"
            ),
        )

    # Only the validation matters on this path; the project id itself is
    # not spliced into anything here.
    _require_configured_bigquery_project()

    # Force the BQ-required mode (Decision 7). The orchestrator and
    # extractor both assume remote; persisting `local` here would later create
    # a profiling job against a non-existent parquet file.
    # Phase C: profile_after_sync is now inert (deprecated, not read by the
    # runtime); no longer force-set here.
    req.query_mode = "remote"
|
||
|
||
|
||
# Source types that don't depend on a `data_source.<name>.*` block — they
# get their data through a different ingestion path (e.g. Jira via
# webhooks). Registrations against these types are allowed regardless of
# the configured primary `data_source.type`.
# Immutable (frozenset) so the allowlist can't be mutated at runtime.
_SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE: frozenset[str] = frozenset({
    "jira",
    "local",
})
|
||
|
||
|
||
def _validate_source_type_configured(source_type: Optional[str]) -> None:
    """Refuse register-table requests whose ``source_type`` isn't actually
    configured on this instance.

    Pre-fix the route happily persisted e.g. ``source_type='keboola'`` on a
    BQ-only instance — the row landed in the registry but the scheduler had
    no Keboola URL/token to ATTACH against, so it silently never synced.
    No upfront error, no operator-visible signal until they noticed the
    table was missing from `agnes catalog`.

    A source_type is considered configured when:

    - it matches the instance's primary ``data_source.type``, OR
    - a non-empty ``data_source.<source_type>`` block exists in the
      effective `instance.yaml` (multi-source instances), OR
    - it's in the small allowlist of types that don't sit under
      `data_source.*` at all (Jira, local — see
      ``_SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE``).

    Special case: when the configured primary is ``'local'`` (the default
    when an instance is freshly bootstrapped and no `data_source.type` has
    been set yet), the validator stays permissive — refusing registrations
    here would block the first-time-setup workflow where the operator
    registers a few tables against a not-yet-fully-configured instance.
    The misconfiguration that this validator targets is the *explicit
    mismatch*: `type=bigquery` instance + `source_type=keboola` payload
    with no `data_source.keboola.*` block. That case still 422s.

    A bare/None source_type is tolerated for backward compat with legacy
    CLI scripts; the route resolves it later against
    ``get_data_source_type()``.
    """
    # Nothing to check for an unset type or one that never lives under
    # `data_source.*`.
    if not source_type or source_type in _SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE:
        return

    from app.instance_config import get_data_source_type, get_value

    primary = get_data_source_type()
    if primary == source_type:
        return

    # Multi-source: a non-empty `data_source.<source_type>` block counts as
    # configured. Empty dict / None / "" all count as "not configured".
    if get_value("data_source", source_type, default=None):
        return

    # Bootstrap-friendliness: a primary of 'local' means the instance hasn't
    # been pointed at a real source yet (or has been deliberately set to
    # local-only). Don't gate registrations in that state — the operator is
    # likely in the middle of first-time setup and will fill in the config
    # next. The check still fires when primary is an actual source type
    # (bigquery / keboola) and the requested source_type doesn't match
    # AND has no secondary block.
    if primary != "local":
        raise HTTPException(
            status_code=422,
            detail=(
                f"source_type={source_type!r} is not configured on this instance. "
                f"The configured data source is {primary!r}. To enable "
                f"a secondary source, set data_source.{source_type}.* fields in "
                "instance.yaml or via /admin/server-config."
            ),
        )
|
||
|
||
|
||
class UpdateTableRequest(BaseModel):
    # PUT body for editing a registered table. Every field is optional; the
    # per-field validators below only police values that were actually sent.
    # (Documented with comments rather than a class docstring so the model's
    # generated JSON-schema description stays exactly as it was.)
    name: Optional[str] = None
    sync_strategy: Optional[str] = Field(
        default=None,
        description=(
            "v26+: drives the Keboola extractor dispatcher. PUT-shape "
            "requires a value if sent. See RegisterTableRequest.sync_strategy."
        ),
    )
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
    source_query: Optional[str] = None
    query_mode: Optional[str] = None
    sync_schedule: Optional[str] = None
    profile_after_sync: Optional[bool] = Field(
        default=None,
        deprecated=True,
        description=(
            "DEPRECATED: not consumed by the runtime. See "
            "RegisterTableRequest.profile_after_sync."
        ),
    )
    # v26 — same fields as RegisterTableRequest, all optional. The PUT
    # handler overlays the body on the existing row and re-runs the
    # synthetic RegisterTableRequest validator on the merged record, so
    # cross-field invariants are checked against the post-update state.
    incremental_window_days: Optional[int] = None
    max_history_days: Optional[int] = None
    incremental_column: Optional[str] = None
    where_filters: Optional[List[Dict[str, Any]]] = None
    partition_by: Optional[str] = None
    partition_granularity: Optional[str] = None
    initial_load_chunk_days: Optional[int] = None
    # v51 — see RegisterTableRequest.bq_fqn. PUT lets an admin add or
    # clear bq_fqn on an existing row (cleared via explicit `null`,
    # per the PUT shape contract documented on the handler below).
    bq_fqn: Optional[str] = None

    @field_validator("sync_strategy", mode="before")
    @classmethod
    def _validate_sync_strategy(cls, v):
        """Accept an unset strategy or one of the three known strategies.

        Unlike the register model, ``None`` / ``""`` are returned
        unchanged here rather than defaulted.
        """
        if v in (None, ""):
            return v
        # Tuple, not set: membership compares with ``==`` and never hashes
        # ``v``, so an unhashable raw JSON value (list / dict) is rejected
        # with ValueError instead of raising TypeError, which Pydantic v2
        # would not report as a validation error.
        allowed = ("full_refresh", "incremental", "partitioned")
        if v not in allowed:
            raise ValueError(
                f"sync_strategy must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("partition_granularity", mode="before")
    @classmethod
    def _validate_partition_granularity(cls, v):
        """Accept an unset granularity or one of day / month / year."""
        if v in (None, ""):
            return v
        # Tuple for the same unhashable-input reason as sync_strategy.
        allowed = ("day", "month", "year")
        if v not in allowed:
            raise ValueError(
                f"partition_granularity must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("where_filters", mode="before")
    @classmethod
    def _validate_where_filters(cls, v):
        """Normalise filters via ``parse_filters``; None / "" / [] become None."""
        if v in (None, "", []):
            return None
        from connectors.keboola.where_filters import parse_filters, InvalidFilterError
        try:
            return parse_filters(v)
        except InvalidFilterError as e:
            # Chain explicitly so the parser error is kept as the cause.
            raise ValueError(str(e)) from e

    @model_validator(mode="after")
    def _check_mode_query_coherence(self):
        """PUT semantics — only the fields explicitly in the body are
        validated. The body is overlaid on the existing row at the handler
        level (see ``update_table``), so omitted fields keep their stored
        values and the synthetic ``RegisterTableRequest`` constructed against
        the merged record runs the strict cross-field check before persist.

        The only invariants enforceable from the PUT body alone:

        - explicit ``source_query='SELECT ...'`` paired with ``query_mode``
          that isn't materialized → coherent reject (the SQL would be dead);
        - explicit ``source_query='SELECT ...'`` without any ``query_mode``
          in the body → reject; the operator must commit to materialized;
        - explicit empty/whitespace ``source_query=''`` paired with
          ``query_mode='materialized'`` → reject (operator clearly
          mistyped — they sent the field).

        Pre-fix this validator also rejected ``{"query_mode": "materialized",
        "sync_schedule": "every 12h"}`` because ``source_query`` was None
        — but that's the canonical "edit the schedule on a materialized
        row" use-case from the Edit modal, which always sends
        ``query_mode`` to indicate intent. Devin BUG_0002 on PR #148
        commit 2219255.
        """
        if self.query_mode is None and self.source_query is None:
            return self

        sq_raw = self.source_query
        # Blank / whitespace-only SQL collapses to None.
        sq = (sq_raw or "").strip() or None

        # Operator explicitly sent source_query as empty/whitespace while
        # claiming materialized — typo / bad form data, reject.
        if (
            self.query_mode == "materialized"
            and sq_raw is not None
            and not sq
        ):
            raise ValueError(
                "query_mode='materialized' requires a non-empty source_query"
            )

        # source_query only makes sense with materialized mode. Allow None
        # (omitted) to flow through; only reject when explicitly set with
        # the wrong mode.
        if (
            self.query_mode is not None
            and self.query_mode != "materialized"
            and sq
        ):
            raise ValueError(
                "source_query is only valid when query_mode='materialized'"
            )
        if self.query_mode is None and sq:
            raise ValueError(
                "source_query requires query_mode='materialized' to be set "
                "in the same request"
            )

        # Normalise: drop whitespace-only strings to None so the persisted
        # column is clean. Don't touch when source_query was None to begin
        # with — that signals "PUT didn't touch this field, keep existing".
        if sq_raw is not None:
            self.source_query = sq
        return self

    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        """Delegate primary-key coercion to ``_normalize_primary_key``."""
        return _normalize_primary_key(v)

    @field_validator("description", mode="before")
    @classmethod
    def _normalize_description(cls, v):
        # Defensive normalization for descriptions arriving via shell-quoting
        # tooling that injects literal backslash escapes (e.g. `Don\'t`, `\n`).
        return _unescape_shell_quoting(v)

    # Duplicated from RegisterTableRequest — Pydantic v2 validators don't
    # inherit cleanly across unrelated BaseModel classes; a shared mixin
    # would be overkill for two fields.
    @field_validator("sync_schedule", mode="before")
    @classmethod
    def _validate_sync_schedule(cls, v):
        # None / "" → no schedule, accepted.
        # Any non-empty string (including pure whitespace) must parse as a
        # valid schedule — otherwise it would be persisted and silently
        # ignored by the runtime evaluator.
        if v in (None, ""):
            return v
        if not is_valid_schedule(v):
            raise ValueError(
                "sync_schedule must be 'every Nm' / 'every Nh' / "
                f"'daily HH:MM[,HH:MM,...]', got {v!r}"
            )
        return v
|
||
|
||
|
||
class ConfigureRequest(BaseModel):
    # Instance-configuration request body. ``data_source`` is the only
    # required field; every other field defaults to None.
    # NOTE(review): presumably the body of the /api/admin/configure route
    # referenced in this module's error messages — confirm against the handler.
    data_source: str  # "keboola" | "bigquery" | "local"
    keboola_token: Optional[str] = None
    keboola_url: Optional[str] = None
    bigquery_project: Optional[str] = None
    bigquery_location: Optional[str] = None
    instance_name: Optional[str] = None
    allowed_domain: Optional[str] = None
|
||
|
||
|
||
@router.get("/discover-tables")
async def discover_tables(
    user: dict = Depends(require_admin),
    dataset: Optional[str] = None,
):
    """Discover available tables from the configured data source.

    For ``data_source.type='keboola'`` returns the full Storage API table
    list (single round-trip). For ``data_source.type='bigquery'``:

    - Without ``dataset``: list datasets in the configured project.
    - With ``dataset=name``: list tables (BASE TABLE + VIEW) in that dataset.

    Two-step shape avoids paying the per-dataset list_tables cost up-front
    on projects with hundreds of datasets — the UI populates the dataset
    dropdown first, then fetches tables only for the selected dataset.

    Any other source type yields an empty result carrying an ``error``
    string instead of an HTTP error.

    Raises:
        HTTPException: re-raised unchanged when a callee produced one;
            otherwise a 500 wrapping any unexpected failure.
    """
    try:
        from app.instance_config import get_data_source_type
        source_type = get_data_source_type()

        if source_type == "keboola":
            from connectors.keboola.client import KeboolaClient
            from app.instance_config import get_value
            url = get_value("data_source", "keboola", "stack_url", default="")
            token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
            # Prefer the configured env var; fall back to the conventional
            # one when it is unset or empty.
            token = os.environ.get(token_env, "") if token_env else ""
            if not token:
                token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}

        if source_type == "bigquery":
            return _discover_bigquery(dataset=dataset)

        return {
            "tables": [],
            "count": 0,
            "source": source_type,
            "error": f"Discovery not implemented for source_type={source_type!r}",
        }
    except HTTPException:
        # Already shaped for the client (e.g. by the BQ path) — pass through.
        raise
    except Exception as e:
        # Chain so the underlying failure is recorded as the direct cause.
        raise HTTPException(status_code=500, detail=f"Discovery failed: {e}") from e
|
||
|
||
|
||
def _discover_bigquery(dataset: Optional[str]) -> Dict[str, Any]:
    """List BQ datasets (when ``dataset`` is None) or tables-in-dataset.

    Routes through ``BqAccess.client()`` so config / auth / error
    translation matches the rest of the BQ surface (#138 facade). Returns
    the same shape as the Keboola path so the UI doesn't have to branch.

    Raises:
        HTTPException: with the status mapped from ``BqAccessError.kind``
            when the client cannot be obtained or a listing call fails;
            502 when the failure cannot be translated.
    """
    from connectors.bigquery.access import (
        get_bq_access,
        BqAccessError,
        translate_bq_error,
    )

    try:
        bq = get_bq_access()
        client = bq.client()
    except BqAccessError as e:
        raise HTTPException(
            status_code=BqAccessError.HTTP_STATUS.get(e.kind, 500),
            detail={"error": e.message, "kind": e.kind, "details": e.details},
        ) from e

    try:
        if dataset is None:
            datasets = [
                {
                    "dataset_id": ds.dataset_id,
                    "full_id": f"{ds.project}.{ds.dataset_id}",
                }
                for ds in client.list_datasets()
            ]
            return {
                "datasets": sorted(datasets, key=lambda d: d["dataset_id"]),
                "count": len(datasets),
                "source": "bigquery",
            }

        # List tables in the named dataset. `list_tables` returns
        # `TableListItem` with `table_id` + `table_type` ('TABLE', 'VIEW',
        # 'MATERIALIZED_VIEW', 'EXTERNAL', 'SNAPSHOT'). UI maps TABLE → Type
        # selector "table" and VIEW/MATERIALIZED_VIEW → "view"; the rest are
        # passed through with their raw type so the operator can decide.
        tables = [
            {
                "table_id": t.table_id,
                "table_type": t.table_type,
                "full_id": f"{t.project}.{t.dataset_id}.{t.table_id}",
            }
            for t in client.list_tables(dataset)
        ]
        return {
            "tables": sorted(tables, key=lambda t: t["table_id"]),
            "count": len(tables),
            "source": "bigquery",
            "dataset": dataset,
        }
    except Exception as e:
        # `translate_bq_error` re-raises non-Google exceptions unchanged,
        # so wrap in HTTPException to keep the JSON-shape contract.
        try:
            err = translate_bq_error(e, bq.projects, bad_request_status="upstream_error")
        except Exception:
            raise HTTPException(status_code=502, detail=f"BQ discovery failed: {e}") from e
        raise HTTPException(
            status_code=BqAccessError.HTTP_STATUS.get(err.kind, 502),
            detail={"error": err.message, "kind": err.kind, "details": err.details},
        ) from e
|
||
|
||
|
||
@router.get("/registry")
async def list_registry(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Get full table registry.

    Every returned row carries a `last_sync_error` key taken from
    sync_state, so operators can see WHY a row isn't materializing without
    trawling scheduler logs. The value is None for rows that have never
    errored or have already recovered (status='ok'), and the per-row error
    message string otherwise.
    """
    tables = TableRegistryRepository(conn).list_all()

    # One batched read of sync_state errors instead of N+1 GETs against
    # `sync_state` for large registries. The sync_state row is keyed on
    # `table_id` which mirrors `table_registry.name` (see comment in
    # _run_materialized_pass / _build_manifest_for_user about name vs id).
    error_by_name: Dict[str, Optional[str]] = {}
    try:
        error_rows = conn.execute(
            "SELECT table_id, error FROM sync_state "
            "WHERE status = 'error' AND error IS NOT NULL AND error <> ''"
        ).fetchall()
        error_by_name = {row[0]: row[1] for row in error_rows}
    except Exception:
        # Defensive: an unreadable sync_state must not break the registry
        # response — operators just lose the last_sync_error column on
        # this call.
        logger.exception("Failed to read sync_state errors for registry")

    for entry in tables:
        # Sync_state.table_id == table_registry.name by convention.
        entry["last_sync_error"] = error_by_name.get(entry.get("name"))

    return {"tables": tables, "count": len(tables)}
|
||
|
||
|
||
# Wall-clock budget, in seconds, for the synchronous BQ materialization that
# runs after a successful BQ register. If the rebuild + view creation exceeds
# this, we hand the rest off to BackgroundTasks and return 202. 5s matches the
# UX contract in #108 ("Queryable as <view> within seconds") — long enough
# to cover a healthy GCE round-trip, short enough that a hung GCE call
# doesn't park the request handler.
_BQ_SYNC_REGISTER_TIMEOUT_S: float = 5.0
|
||
|
||
|
||
def _materialize_bigquery_extract() -> Dict[str, Any]:
    """Re-build the BigQuery extract.duckdb + master views.

    Shared by the synchronous (in-band) and async (BackgroundTask) code
    paths after a BQ register/update/delete. The imports live inside the
    function so non-BQ instances don't pay the import cost on app start.

    A FRESH system DB connection is opened instead of reusing the
    request-scoped one. The request handler closes its connection in a
    `finally` after the response, but BackgroundTask + the timeout-fallback
    daemon thread can both outlive that close — they would then operate on
    a closed handle (or one being torn down concurrently). A fresh handle is
    cheap (DuckDB is an embedded engine) and isolates the worker's lifetime
    from the request's.

    Returns the rebuild result dict (``{"errors": [...], "tables_registered":
    N, ...}``), or ``{}`` when the rebuild returned nothing, so the
    synchronous caller can propagate failures to the operator.
    Background-task callers ignore the return value, but the loud log inside
    ``_run_bigquery_materialize_with_timeout`` covers that path.
    """
    from connectors.bigquery import extractor as _bq_extractor
    from src.db import get_system_db
    from src.orchestrator import SyncOrchestrator

    worker_conn = get_system_db()
    try:
        rebuild_result = _bq_extractor.rebuild_from_registry(conn=worker_conn)
        SyncOrchestrator().rebuild()
        if rebuild_result:
            return rebuild_result
        return {}
    finally:
        # Best-effort close: a failure to close must not mask the result
        # or the exception already in flight.
        try:
            worker_conn.close()
        except Exception:
            pass
|
||
|
||
|
||
def _materialize_bigquery_extract_bg() -> None:
    """Run `_materialize_bigquery_extract` as a BackgroundTask and log failures.

    BackgroundTasks throw away return values, yet `rebuild_from_registry`
    reports auth / config / identifier problems through the ``errors`` list
    of its result. Both a crash and a non-empty ``errors`` list are logged at
    ERROR level so the failure is loud in the operator's logs even though
    the 202 response cannot carry the detail (Decision 3 in #108: a 202 is
    documented as "accepted, may not be queryable yet" — we don't block on
    it, but we shouldn't swallow it either).
    """
    try:
        outcome = _materialize_bigquery_extract()
    except Exception:
        logger.exception("BQ post-register background materialize crashed")
        return

    reported = (outcome or {}).get("errors") or []
    if not reported:
        return
    logger.error(
        "BQ post-register background materialize completed with %d error(s): %s",
        len(reported),
        reported,
    )
|
||
|
||
|
||
def _run_bigquery_materialize_with_timeout(
    background: BackgroundTasks,
) -> Dict[str, Any]:
    """Attempt the BQ materialization in-band, bounded by the wall-clock budget.

    The returned dict carries:
      - ``status``: one of "ok", "errors", "timeout" — the caller maps it to
        an HTTP code.
      - ``errors``: list of error dicts, present only when
        ``status == "errors"``. Either the ``errors`` list surfaced by
        ``rebuild_from_registry`` or a single ``{"error": "<Type>: <msg>"}``
        entry when the worker raised.

    How `register_table` maps the status:
      - "ok"      → 200 (synchronous success)
      - "errors"  → 500 (rebuild ran but failed — the registry row exists
                         but the view wasn't created)
      - "timeout" → 202 (rebuild handed to a BackgroundTask)

    The work runs on a daemon thread (so a hung GCE call can't park the
    request) which opens its OWN system DB connection (see
    `_materialize_bigquery_extract`). FastAPI already runs the sync route in
    a threadpool, so the wait below does not block the event loop; the
    daemon thread is still used so the wait stays bounded even if
    `rebuild_from_registry` ignores its own timeouts.
    """
    import threading

    finished_evt = threading.Event()
    failure: Dict[str, Any] = {}
    success: Dict[str, Any] = {}

    def _worker():
        try:
            success["result"] = _materialize_bigquery_extract()
        except Exception as caught:  # pragma: no cover — logged by the caller below
            failure["error"] = caught
        finally:
            finished_evt.set()

    threading.Thread(
        target=_worker, daemon=True, name="bq-register-rebuild"
    ).start()

    if not finished_evt.wait(_BQ_SYNC_REGISTER_TIMEOUT_S):
        # Budget exceeded. The daemon thread keeps running, and a
        # BackgroundTask is scheduled as well so the orchestrator is invoked
        # through the supported FastAPI path. `_INIT_EXTRACT_LOCK` in the BQ
        # extractor serializes the two file-swap calls so they can't tear
        # `extract.duckdb`; the orchestrator's own `_rebuild_lock` protects
        # the master-view rebuild step downstream.
        logger.info(
            "BQ post-register rebuild exceeded %ss budget — handing off to BackgroundTask",
            _BQ_SYNC_REGISTER_TIMEOUT_S,
        )
        background.add_task(_materialize_bigquery_extract_bg)
        return {"status": "timeout"}

    if "error" in failure:
        # The worker finished inside the budget but raised. That is a HARD
        # ERROR, not a timeout: report it as such so the operator sees the
        # real exception in the 500 body instead of a misleading 202 +
        # "still working in the background". Earlier revisions returned
        # ``{"status": "timeout"}`` here, which hid the failure until the
        # background retry logged the same exception.
        exc = failure["error"]
        logger.error(
            "BQ post-register rebuild raised within budget: %r",
            exc,
        )
        return {
            "status": "errors",
            "errors": [{"error": f"{type(exc).__name__}: {exc}"}],
        }

    # The worker returned normally, but `rebuild_from_registry` may still
    # have reported errors in its result (auth failure, missing project from
    # the overlay, unsafe identifier slipping the validator, ...). Without
    # this check those were only logged and the API claimed success.
    rebuild_errors = (success.get("result") or {}).get("errors") or []
    if rebuild_errors:
        logger.error(
            "BQ post-register rebuild reported %d error(s): %s",
            len(rebuild_errors),
            rebuild_errors,
        )
        return {"status": "errors", "errors": rebuild_errors}
    return {"status": "ok"}
|
||
|
||
|
||
@router.post(
    "/register-table",
    responses={
        200: {"description": "BigQuery row registered + materialized synchronously"},
        201: {"description": "Non-BigQuery row registered (no post-insert materialize)"},
        202: {"description": "BigQuery row registered; materialize continues in background"},
        409: {"description": "Table id or view name already in use"},
        500: {"description": "BigQuery row registered but post-insert rebuild failed"},
    },
)
def register_table(
    request: RegisterTableRequest,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Register a new table in the system.

    Behavior by source_type:
    - **bigquery**: validates BQ-specific shape (dataset / source_table /
      identifier safety / project_id format), forces query_mode='remote' and
      profile_after_sync=False, then synchronously rebuilds extract.duckdb +
      master views with a wall-clock budget. Returns 200 with the view name
      on success, 202 on budget overrun (rebuild continues in a
      BackgroundTask), or 500 if the synchronous rebuild ran but reported
      an error (e.g. auth failure, missing project, unsafe identifier).
      BigQuery rows with ``query_mode='materialized'`` skip the synchronous
      rebuild and return 201; they are picked up on the next sync tick.
    - other source types: insert-only, no post-register hook. Returns 201.

    Defined as a plain ``def`` (not ``async def``) so FastAPI runs it in a
    threadpool — the synchronous-materialize path waits on
    ``threading.Event.wait()``, which would otherwise block the asyncio
    event loop and stall every other request for up to
    ``_BQ_SYNC_REGISTER_TIMEOUT_S``. ``Depends(...)``, ``BackgroundTasks``,
    and ``JSONResponse`` all work the same in sync handlers; the rest of the
    admin module mixes both styles already.

    The route does NOT carry a default ``status_code`` — each branch returns
    its own JSONResponse with the right code. A blanket ``status_code=201``
    on the decorator would mislead OpenAPI consumers about the BQ branch.

    Always: 409 on id or view-name collision against the existing registry,
    422 on an empty name or a malformed / misplaced ``bq_fqn``, audit log
    entry on success.
    """
    from fastapi.responses import JSONResponse

    if not request.name or not request.name.strip():
        raise HTTPException(status_code=422, detail="Table name cannot be empty")
    repo = TableRegistryRepository(conn)
    # The id is the slugified display name: trimmed, lower-cased, spaces
    # replaced by underscores.
    table_id = request.name.strip().lower().replace(" ", "_")

    if repo.get(table_id):
        raise HTTPException(status_code=409, detail=f"Table '{table_id}' already registered")

    # View-name collision pre-check — distinct from the id collision above.
    # This compares the exact display name against every registry row, so a
    # row that carries the same `name` under a different id is rejected here,
    # before the orchestrator surfaces it as a silent overwrite at the next
    # rebuild.
    existing_by_name = next(
        (r for r in repo.list_all() if (r.get("name") or "") == request.name),
        None,
    )
    if existing_by_name is not None:
        raise HTTPException(
            status_code=409,
            detail=f"View name '{request.name}' is already in use by table id '{existing_by_name.get('id')}'",
        )

    # Refuse rows whose source_type isn't actually configured — pre-fix the
    # row landed in the registry but never synced because there was no
    # Keboola URL/token (or BQ project) to ATTACH against. Surfaces the
    # misconfig at registration time so the operator sees the gap before
    # they wonder why `agnes catalog` is missing the table.
    _validate_source_type_configured(request.source_type)

    # BQ rows go through the extra validation + post-insert materialization
    # contract from issue #108. Other source types keep the legacy insert-only
    # flow — Keboola materialization happens via the scheduled sync, Jira via
    # webhook, local via a manual extractor run.
    is_bigquery = request.source_type == "bigquery"
    if is_bigquery:
        _validate_bigquery_register_payload(request)

    # Phase C: profile_after_sync is no longer passed — the field is
    # deprecated and inert at the runtime layer. The DB column keeps its
    # schema default; the registry response no longer reflects request
    # values for this flag.
    # v51 — validate bq_fqn upfront. The extractor would catch a malformed
    # value at next rebuild and skip the row, but failing at register time
    # gives the admin a clean 422 with the specific complaint instead of
    # a silent "table registered but never materialized" state.
    if request.bq_fqn is not None and request.source_type != "bigquery":
        raise HTTPException(
            status_code=422,
            detail="bq_fqn only applies to source_type='bigquery'",
        )
    if request.bq_fqn is not None:
        from connectors.bigquery.extractor import parse_bq_fqn

        try:
            parse_bq_fqn(request.bq_fqn)
        except ValueError as e:
            # Chain the cause so the original parse error stays attached to
            # the HTTP error (same convention as the precheck route).
            raise HTTPException(status_code=422, detail=str(e)) from e

    repo.register(
        id=table_id,
        name=request.name,
        folder=request.folder,
        sync_strategy=request.sync_strategy,
        primary_key=request.primary_key,
        description=request.description,
        registered_by=user.get("email"),
        source_type=request.source_type,
        bucket=request.bucket,
        source_table=request.source_table,
        source_query=request.source_query,
        query_mode=request.query_mode,
        sync_schedule=request.sync_schedule,
        # v26 sync-strategy support fields. None for non-Keboola or
        # full_refresh tables; persisted as NULL.
        incremental_window_days=request.incremental_window_days,
        max_history_days=request.max_history_days,
        incremental_column=request.incremental_column,
        where_filters=request.where_filters,
        partition_by=request.partition_by,
        partition_granularity=request.partition_granularity,
        initial_load_chunk_days=request.initial_load_chunk_days,
        bq_fqn=request.bq_fqn,
    )

    # Audit entry — masked params; description kept raw (it's documentation).
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="register_table",
        resource=table_id,
        params=_sanitize_for_audit(request.model_dump()),
    )

    from app.api.v2_catalog import invalidate_for_table

    invalidate_for_table(table_id)

    if not is_bigquery:
        # Keboola / Jira / local rows are insert-only here. 201 Created — the
        # decorator no longer carries a default status, so each branch is
        # explicit about its code (BQ branch overrides via JSONResponse).
        return JSONResponse(
            status_code=201,
            content={"id": table_id, "name": request.name, "status": "registered"},
        )

    if request.query_mode == "materialized":
        # Materialized BQ rows are picked up by the trigger pass on the next
        # scheduled tick (or via POST /api/sync/trigger). No synchronous
        # rebuild — the COPY can scan multi-GB and would block the request.
        return JSONResponse(
            status_code=201,
            content={
                "id": table_id,
                "name": request.name,
                "status": "registered",
                "view_name": table_id,
                "message": (
                    "Materialized — parquet will be written on the next sync "
                    "tick. Trigger now via POST /api/sync/trigger."
                ),
            },
        )

    # BQ post-register: rebuild extract + master views, with timeout fallback.
    # Decision 1: 200 on synchronous success, 202 on timeout, 500 if the
    # synchronous rebuild surfaced errors. Distinct from the 201 Keboola
    # path above, so the BQ branch builds its own response.
    outcome = _run_bigquery_materialize_with_timeout(background)
    status = outcome.get("status")
    if status == "ok":
        return JSONResponse(
            status_code=200,
            content={
                "id": table_id,
                "name": request.name,
                "status": "ok",
                "view_name": table_id,
            },
        )
    if status == "errors":
        # Registry insert succeeded but the post-insert rebuild reported
        # errors — the row is in the registry but the master view was NOT
        # created. Surface the failure verbatim so the operator can fix
        # the underlying config (typically a missing
        # `data_source.bigquery.project` in the overlay or auth that lacks
        # bigquery.metadata.get on the dataset). The row stays in the
        # registry; a re-run after fixing the config picks up the existing
        # row and creates the view on the next register/update or
        # scheduler tick.
        return JSONResponse(
            status_code=500,
            content={
                "id": table_id,
                "name": request.name,
                "status": "rebuild_failed",
                "view_name": table_id,
                "errors": outcome.get("errors") or [],
                "message": (
                    "Registry row created but post-insert rebuild failed; "
                    "view is not queryable. See `errors` for details."
                ),
            },
        )
    # Default: timeout — rebuild continues on a BackgroundTask.
    return JSONResponse(
        status_code=202,
        content={
            "id": table_id,
            "name": request.name,
            "status": "accepted",
            "view_name": table_id,
            "message": "Registration accepted; materializing in background",
        },
    )
|
||
|
||
|
||
class PrecheckResponse(BaseModel):
    """Shape of the /api/admin/register-table/precheck response body.

    Exists for documentation, so OpenAPI consumers know what to expect. The
    route itself returns a plain dict, for backwards compatibility with the
    rest of the admin API, which doesn't use ``response_model``.
    """

    # Overall precheck verdict.
    ok: bool
    # Details about the table being prechecked.
    table: Dict[str, Any]
|
||
|
||
|
||
@router.post("/register-table/precheck")
def register_table_precheck(
    request: RegisterTableRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Dry-run a register-table payload; for BQ rows also confirm the source table exists.

    Nothing is written to the DB. The UI uses this to show row count + size +
    column count in the modal before the operator clicks Register, and the
    CLI's ``--dry-run`` uses it to print what *would* be registered without
    touching state. Pydantic validation is identical to register-table; for
    BQ rows an additional ``bigquery.Client(project).get_table(...)`` call is
    made and the GCP error is surfaced verbatim.

    A plain ``def`` (not ``async def``) on purpose, so FastAPI runs it in a
    threadpool: the BQ branch makes synchronous ``bigquery.Client(...)`` /
    ``client.get_table(...)`` calls that would otherwise block the asyncio
    event loop and stall every other request for the duration of the GCE
    round-trip. Same conversion as ``register_table``; ``Depends(...)``
    behaves identically in sync handlers.

    Raises:
        HTTPException: 422 for an empty name, 404 when the BigQuery table is
            not found, 403 when access is denied, 400 for any other
            BigQuery failure, 500 when google-cloud-bigquery is missing.
    """
    if not (request.name and request.name.strip()):
        raise HTTPException(status_code=422, detail="Table name cannot be empty")

    if request.source_type != "bigquery":
        # M1 only adds a BQ-specific precheck. Every other source type gets
        # a validation-only answer with the same endpoint shape, so the
        # CLI / UI can rely on one response layout across types.
        passthrough = {
            "name": request.name,
            "source_type": request.source_type,
            "bucket": request.bucket,
            "source_table": request.source_table,
            "rows": None,
            "size_bytes": None,
            "columns": [],
            "note": "precheck for non-bigquery sources is validation-only in M1",
        }
        return {"ok": True, "table": passthrough}

    # BQ-specific shape validation (forces query_mode/profile_after_sync,
    # checks identifier safety, validates project_id from instance.yaml).
    _validate_bigquery_register_payload(request)

    if request.query_mode == "materialized":
        # Materialized BQ rows have no `dataset.source_table` to round-trip —
        # the SQL body is the contract. Skip the BQ call and answer with a
        # validation-only precheck so the CLI's `--dry-run --query-mode
        # materialized` path doesn't crash on an empty fully-qualified name.
        materialized_info = {
            "name": request.name,
            "source_type": "bigquery",
            "query_mode": "materialized",
            "source_query": request.source_query,
            "rows": None,
            "size_bytes": None,
            "columns": [],
            "note": (
                "materialized precheck is validation-only — the SQL is "
                "evaluated for cost on each scheduled materialize tick"
            ),
        }
        return {"ok": True, "table": materialized_info}

    # Ask BigQuery whether the table exists and the SA can see it. The
    # imports stay local so google-cloud-bigquery is not pulled into the
    # import chain on non-BQ instances.
    try:
        from google.cloud import bigquery  # noqa: PLC0415
        from google.api_core import exceptions as google_exc  # noqa: PLC0415
    except ImportError as exc:
        raise HTTPException(
            status_code=500,
            detail=(
                "google-cloud-bigquery not installed; install the bigquery "
                f"extras to use BQ precheck ({exc})"
            ),
        ) from exc

    from app.instance_config import get_value

    project_id = get_value("data_source", "bigquery", "project", default="")
    dataset = (request.bucket or "").strip()
    source_table = (request.source_table or "").strip()
    fq_name = f"{project_id}.{dataset}.{source_table}"

    try:
        table_meta = bigquery.Client(project=project_id).get_table(fq_name)
    except google_exc.NotFound as exc:
        raise HTTPException(
            status_code=404,
            detail=f"BigQuery table not found: {fq_name} ({exc})",
        ) from exc
    except google_exc.Forbidden as exc:
        raise HTTPException(
            status_code=403,
            detail=(
                f"BigQuery access denied for {fq_name}: {exc}. "
                "Service account needs bigquery.metadata.get on the dataset."
            ),
        ) from exc
    except Exception as exc:
        # Auth errors, transient 5xx, malformed table refs — answered as 400
        # so the operator gets the GCP error verbatim and can fix their
        # config without us guessing the right HTTP code.
        raise HTTPException(
            status_code=400,
            detail=f"BigQuery precheck failed for {fq_name}: {exc}",
        ) from exc

    schema_columns = []
    for field in table_meta.schema or []:
        schema_columns.append({"name": field.name, "type": field.field_type})

    table_info = {
        "name": request.name,
        "source_type": "bigquery",
        "bucket": dataset,
        "source_table": source_table,
        "project_id": project_id,
        "rows": int(table_meta.num_rows or 0),
        "size_bytes": int(table_meta.num_bytes or 0),
        "columns": schema_columns,
        "column_count": len(schema_columns),
    }
    return {"ok": True, "table": table_info}
|
||
|
||
|
||
@router.put("/registry/{table_id}")
async def update_table(
    table_id: str,
    request: UpdateTableRequest,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Update a registered table's configuration.

    The request body is merged over the existing registry row
    (read-modify-write) and re-registered under the same id. Fields omitted
    from the body keep their stored value; fields sent as explicit ``null``
    are cleared.

    For BQ rows, schedules a background rebuild so the master view picks
    up changes (e.g. a renamed dataset) without waiting for the next
    scheduled sync.

    Returns:
        ``{"id": <table_id>, "updated": [<names of the fields in the body>]}``.

    Raises:
        HTTPException: 404 when the table id is unknown; 422 when the merged
            row is materialized without a ``source_query`` on a non-BigQuery
            source, or carries a malformed / misplaced ``bq_fqn``.
    """
    repo = TableRegistryRepository(conn)
    existing = repo.get(table_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Table not found")

    # `exclude_unset=True` honors the PUT-shape distinction between
    # "field omitted from body" (keep existing) vs "field sent as null"
    # (clear to NULL). Pre-v26 the handler used `model_dump()` filtered by
    # `if v is not None`, which collapsed both cases to "omitted" — meaning
    # an admin couldn't clear a field via PUT. v26 needs the clear path so
    # the Edit modal can switch a partitioned row back to full_refresh and
    # have the stale partition_by / partition_granularity / max_history_days
    # actually go away (otherwise those fields linger and either confuse the
    # dispatcher or trip the v26 conflict-policy validator on the next edit).
    #
    # Contract change (Devin Review finding 0001): callers that previously
    # sent explicit `null` to mean "no-op, keep existing" will now have the
    # field cleared. In practice this is fine — the only known caller is
    # the Edit modal, which pre-populates form fields from the existing row
    # and JSON-encodes the populated (non-null) value back. CLI register-table
    # only POSTs new rows, never PUTs nulls. If a future client needs the
    # old "null = no-op" semantics for some field, it should omit the field
    # from the body instead of sending null — that's the canonical PUT shape.
    updates = request.model_dump(exclude_unset=True)

    # BQ-shape validation runs BEFORE persisting whenever the merged record
    # would be a bigquery row (existing was BQ, or the patch flips it to BQ,
    # or the patch touches BQ-relevant fields on an already-BQ row). Without
    # this gate, an admin could PUT `bucket="evil\"; DROP --"` onto a BQ
    # row and the next rebuild would silently fail at view-create time —
    # surface the bad shape at PUT time instead.
    if updates:
        # Start from the stored row so untouched columns survive the upsert.
        # That includes the original `registered_at` — `repo.register`
        # accepts it as an optional kwarg; without it the upsert would stamp
        # a fresh `now()` on every edit (issue #130).
        merged = dict(existing)
        merged.update(updates)
        merged.pop("id", None)  # avoid duplicate id kwarg

        # v52 + v56: per-table docs fields (sample_questions /
        # things_to_know / pairs_well_with + grain / platforms /
        # partition_col / history / gotchas) live on table_registry
        # but have their own PATCH /registry/{id}/docs endpoint.
        # ``repo.register()`` doesn't know them; stripping here keeps
        # the read-modify-write loop (existing → merged → register) from
        # blowing up with TypeError when the docs columns are populated.
        for _docs_key in (
            "sample_questions", "things_to_know", "pairs_well_with",
            "grain", "platforms", "partition_col", "history", "gotchas",
        ):
            merged.pop(_docs_key, None)

        # When the merged record is not in materialized mode, drop any
        # source_query. A patch that only flips query_mode never mentions
        # source_query, so the value inherited from the stored row would
        # otherwise linger as an orphan on a remote/local row.
        if merged.get("query_mode") != "materialized":
            merged["source_query"] = None

        # Cross-source coherence: query_mode='materialized' requires a
        # non-empty source_query for ALL source types, not just BigQuery.
        # BQ rows without source_query can be server-generated from
        # bucket+source_table (handled by _validate_bigquery_register_payload
        # via the synthetic RegisterTableRequest below). Non-BQ rows (e.g.
        # Keboola) still require an explicit source_query at PUT time.
        if merged.get("query_mode") == "materialized":
            sq = merged.get("source_query")
            if not sq or not str(sq).strip():
                # BQ rows fall through to the validator below; non-BQ rows
                # have no server-generate fallback, so reject with 422.
                if merged.get("source_type") != "bigquery":
                    raise HTTPException(
                        status_code=422,
                        detail=(
                            "query_mode='materialized' requires a non-empty "
                            "source_query. To revert to a non-materialized mode, "
                            "PATCH query_mode='local' (Keboola) or 'remote' "
                            "(BigQuery) and the stale source_query is cleared "
                            "automatically."
                        ),
                    )
            # Backtick guard removed for materialized rows: the Task 2 wrapping
            # path (connectors.bigquery.extractor.materialize_query) now runs
            # admin SQL through the BQ jobs API using BQ-native syntax, which
            # requires backticks for dashed project/dataset identifiers.
            # Non-materialized rows still reject backticks in the model validator.

        if merged.get("source_type") == "bigquery":
            # Reuse the register-time validator. It mutates the request to
            # force query_mode='remote' / profile_after_sync=False (or to
            # leave a materialized row alone) — apply the same coercion to
            # `merged` so the persisted row matches.
            synthetic = RegisterTableRequest(
                name=merged.get("name") or table_id,
                bucket=merged.get("bucket"),
                source_table=merged.get("source_table"),
                source_query=merged.get("source_query"),
                source_type="bigquery",
                query_mode=merged.get("query_mode") or "remote",
                profile_after_sync=bool(merged.get("profile_after_sync") or False),
                primary_key=merged.get("primary_key"),
                description=merged.get("description"),
                folder=merged.get("folder"),
                sync_strategy=merged.get("sync_strategy") or "full_refresh",
                sync_schedule=merged.get("sync_schedule"),
            )
            _validate_bigquery_register_payload(synthetic)
            merged["query_mode"] = synthetic.query_mode
            merged["profile_after_sync"] = synthetic.profile_after_sync
            merged["source_query"] = synthetic.source_query

            # v51 — same bq_fqn validation as register-table. PUT can both
            # add a fresh bq_fqn or update an existing one; in either case
            # malformed values should reject at PUT time, not silently
            # land in the DB and break the next rebuild.
            if merged.get("bq_fqn"):
                from connectors.bigquery.extractor import parse_bq_fqn

                try:
                    parse_bq_fqn(merged["bq_fqn"])
                except ValueError as e:
                    # Chain the cause so the parse error stays attached to
                    # the HTTP error.
                    raise HTTPException(status_code=422, detail=str(e)) from e
        else:
            # Non-BQ row carrying bq_fqn is nonsensical — reject the same
            # way register-table does.
            if merged.get("bq_fqn"):
                raise HTTPException(
                    status_code=422,
                    detail="bq_fqn only applies to source_type='bigquery'",
                )

        repo.register(id=table_id, **merged)

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="update_table",
        resource=table_id,
        params=_sanitize_for_audit({"updated_fields": sorted(updates.keys()), **updates}),
    )

    # If we updated a BQ row (or one that's now BQ), refresh the extract in
    # the background so the view picks up renames / column-list changes.
    # Use the BG wrapper so any rebuild errors are logged at ERROR level
    # instead of being silently dropped by BackgroundTasks (which discards
    # return values).
    after = repo.get(table_id) or {}
    if after.get("source_type") == "bigquery":
        background.add_task(_materialize_bigquery_extract_bg)

    from app.api.v2_catalog import invalidate_for_table

    invalidate_for_table(table_id)

    return {"id": table_id, "updated": list(updates.keys())}
|
||
|
||
|
||
class _GotchaItem(BaseModel):
    """v56: one gotcha entry attached to a table's docs.

    An entry with ``key=True`` is the "Key gotcha", which the package detail
    page renders distinctly from the others.
    """

    # True flags this entry as the "Key gotcha"; defaults to a regular one.
    key: bool = False
    # Text of the gotcha.
    body: str
|
||
|
||
|
||
class TableDocsRequest(BaseModel):
    """Request body for the per-table docs endpoint.

    Carries the v52 fields (sample_questions / things_to_know /
    pairs_well_with) plus the structured v56 additions (grain / platforms /
    partition_col / history / gotchas) introduced for the /catalog/p/<slug>
    package detail page rewrite.

    Every field is optional. `[]` clears a list, `""` clears a scalar, and
    an omitted field is left untouched (Optional-is-no-op contract).
    ``platforms`` and ``gotchas`` accept at most 8 entries each.
    """

    # v52 fields.
    sample_questions: Optional[List[str]] = None
    things_to_know: Optional[str] = None
    pairs_well_with: Optional[List[str]] = None
    # v56 fields.
    grain: Optional[str] = None
    platforms: Optional[List[str]] = None
    partition_col: Optional[str] = None
    history: Optional[str] = None
    gotchas: Optional[List[_GotchaItem]] = None

    @field_validator("platforms")
    @classmethod
    def _check_platforms(cls, v: Optional[List[str]]) -> Optional[List[str]]:
        # None means "field omitted" and passes through; only the length of
        # a provided list is capped.
        if v is not None and len(v) > 8:
            raise ValueError("platforms: max 8 entries")
        return v

    @field_validator("gotchas")
    @classmethod
    def _check_gotchas(cls, v):
        # Same cap as platforms: an omitted field passes, a provided list
        # may hold at most 8 entries.
        if v is not None and len(v) > 8:
            raise ValueError("gotchas: max 8 entries")
        return v
|
||
|
||
|
||
@router.patch("/registry/{table_id}/docs")
async def update_table_docs(
    table_id: str,
    payload: TableDocsRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Persist the admin-authored per-table docs and echo the stored state.

    These docs are read by /catalog/t/<id> and, for the v56 structured
    fields, by the per-table extended section on /catalog/p/<slug>. The
    endpoint is separate from PUT /registry/{id} so admins can change these
    fields without re-submitting the whole registration payload.

    Raises:
        HTTPException: 404 (``table_not_found``) when the id is not in the
            registry.
    """
    registry = TableRegistryRepository(conn)
    if not registry.get(table_id):
        raise HTTPException(status_code=404, detail="table_not_found")

    # An empty-string ``things_to_know`` and an explicit `[]` for the two
    # v52 lists are requests to clear the stored value.
    wipe_notes = payload.things_to_know == ""
    wipe_questions = payload.sample_questions == []
    wipe_pairs = payload.pairs_well_with == []

    # v56 ``gotchas`` arrive as Pydantic models; the repo gets plain dicts
    # because its JSON serializer handles dicts, not _GotchaItem instances.
    if payload.gotchas is None:
        gotcha_dicts = None
    else:
        gotcha_dicts = [item.model_dump() for item in payload.gotchas]

    questions_arg = None if wipe_questions else payload.sample_questions
    notes_arg = None if wipe_notes else payload.things_to_know
    pairs_arg = None if wipe_pairs else payload.pairs_well_with

    registry.update_docs(
        table_id,
        sample_questions=questions_arg,
        things_to_know=notes_arg,
        pairs_well_with=pairs_arg,
        clear_sample_questions=wipe_questions,
        clear_things_to_know=wipe_notes,
        clear_pairs_well_with=wipe_pairs,
        # v56 — same Optional-is-no-op contract.
        grain=payload.grain,
        platforms=payload.platforms,
        partition_col=payload.partition_col,
        history=payload.history,
        gotchas=gotcha_dicts,
    )

    # Re-read the row and return it so the admin client can re-render
    # without a second GET, and so the test suite (and the eventual admin
    # UI) can inspect what landed in the DB.
    stored = registry.get(table_id) or {}
    return {
        "id": table_id,
        "sample_questions": stored.get("sample_questions") or [],
        "things_to_know": stored.get("things_to_know"),
        "pairs_well_with": stored.get("pairs_well_with") or [],
        "grain": stored.get("grain"),
        "platforms": stored.get("platforms") or [],
        "partition_col": stored.get("partition_col"),
        "history": stored.get("history"),
        "gotchas": stored.get("gotchas") or [],
    }
|
||
|
||
|
||
@router.delete("/registry/{table_id}", status_code=204)
async def unregister_table(
    table_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Unregister a table from the system.

    For BQ rows, schedules a background rebuild so the dropped row's
    master view is removed from analytics.duckdb (rather than hanging
    around until the next scheduled sync).

    For materialized rows, also removes the canonical parquet at
    `${DATA_DIR}/extracts/<source_type>/data/<name>.parquet` (the file is
    keyed by the registry ``name``, falling back to ``table_id`` when the
    row has no name) and clears the matching `sync_state` /
    `sync_history` rows. Without these cleanups, the manifest endpoint
    kept advertising the dropped table to `agnes pull` (sync_state-driven)
    (E2E sub-agent finding 2026-05-01).

    Args:
        table_id: Registry id of the table to drop.
        background: FastAPI background-task queue used for the BQ rebuild.
        user: Authenticated admin (from ``require_admin``); its ``id`` is
            written to the audit log.
        conn: System DuckDB connection.

    Raises:
        HTTPException: 404 when no registry row exists for ``table_id``.
    """
    repo = TableRegistryRepository(conn)
    existing = repo.get(table_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Table not found")

    # Capture everything we need from the row BEFORE it is deleted.
    was_bigquery = existing.get("source_type") == "bigquery"
    was_materialized = existing.get("query_mode") == "materialized"
    source_type = existing.get("source_type") or ""
    name = existing.get("name") or table_id

    repo.unregister(table_id)

    # Drop the canonical parquet for materialized rows. Path layout:
    # `${DATA_DIR}/extracts/<source_type>/data/<name>.parquet` — the
    # filename is keyed by `table_registry.name` (matches sync_state
    # bookkeeping convention). Defensively remove the `.parquet.tmp`
    # sibling too in case a prior materialize crashed mid-COPY. Failure
    # to remove is logged but doesn't fail the DELETE — the registry row
    # is already gone.
    if was_materialized and source_type in ("bigquery", "keboola"):
        data_dir = Path(os.environ.get("DATA_DIR", "./data"))
        base = data_dir / "extracts" / source_type / "data"
        for candidate in (
            base / f"{name}.parquet",
            base / f"{name}.parquet.tmp",
        ):
            # Each candidate is handled independently so a failure on the
            # main parquet does not prevent cleanup of the .tmp sibling.
            try:
                candidate.unlink()
            except (FileNotFoundError, NotADirectoryError):
                # Nothing to remove — the common case for the .tmp sibling.
                continue
            except Exception as e:
                logger.warning(
                    "Failed to remove materialized parquet for %s: %s — registry row is "
                    "still dropped; clean up the file manually if it lingers",
                    table_id, e,
                )
                continue
            logger.info(
                "Removed materialized parquet for unregistered table %s: %s",
                table_id, candidate,
            )

    # Clear sync_state for any source/mode (a row that was synced at any
    # point — local/materialized — has a sync_state entry that the manifest
    # serves regardless of registry state). Pre-fix, the manifest still
    # advertised the dropped table to `agnes pull` because sync_state was
    # never cleaned up, and analysts kept getting it through the manifest.
    try:
        conn.execute("DELETE FROM sync_state WHERE table_id = ?", [name])
        conn.execute("DELETE FROM sync_history WHERE table_id = ?", [name])
    except Exception as e:
        logger.warning(
            "Failed to clear sync_state for unregistered table %s: %s — "
            "manifest may still advertise the dropped row to agnes pull",
            table_id, e,
        )

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="unregister_table",
        resource=table_id,
        params=_sanitize_for_audit({
            "name": existing.get("name"),
            "source_type": existing.get("source_type"),
            "bucket": existing.get("bucket"),
            "source_table": existing.get("source_table"),
        }),
    )

    # Local import, as in the original handler.
    from app.api.v2_catalog import invalidate_for_table
    invalidate_for_table(table_id)

    if was_bigquery:
        background.add_task(_materialize_bigquery_extract_bg)
|
||
|
||
|
||
@router.post("/configure")
async def configure_instance(
    request: ConfigureRequest,
    user: dict = Depends(require_admin),
):
    """Configure data source and instance settings via API.

    Writes config to instance.yaml and persists secrets to .env_overlay.
    AI agents and the /setup wizard use this instead of manual file editing.

    Args:
        request: Desired data source plus optional instance name, allowed
            domain and source credentials.
        user: Authenticated admin (from ``require_admin``).

    Returns:
        ``{"status": "ok", "data_source": ..., "connection": ...}`` where
        ``connection`` is ``"local"`` for the local source and
        ``"verified"`` otherwise.

    Raises:
        HTTPException: 400 for an unknown data source, missing required
            credentials, or a failed Keboola connection test; 500 when the
            existing overlay file cannot be parsed into a mapping.
    """
    import yaml

    if request.data_source not in ("keboola", "bigquery", "local"):
        raise HTTPException(status_code=400, detail="data_source must be 'keboola', 'bigquery', or 'local'")

    # Validate credentials if provided
    if request.data_source == "keboola":
        if not request.keboola_token or not request.keboola_url:
            raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source")
        _validate_url_not_private(request.keboola_url, field_name="keboola_url")
        try:
            from connectors.keboola.client import KeboolaClient
            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
            client.test_connection()
        except Exception as e:
            # Details go to the log only; the client gets a generic message.
            logger.error("Keboola connection validation failed: %s", e)
            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.") from e

    elif request.data_source == "bigquery":
        if not request.bigquery_project:
            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")

    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
    # NOT to CONFIG_DIR which is mounted read-only in Docker.
    #
    # Narrow-overlay write strategy — must match `/api/admin/server-config`:
    #   1. Read overlay verbatim (do NOT fall back to static). Falling back
    #      would copy env-resolved cleartext secrets from the merged static
    #      file back into the overlay (e.g. `smtp_password: ${SMTP_PASSWORD}`
    #      → `smtp_password: hunter2`). The wizard only ever sets
    #      `instance`, `auth`, `data_source` here, so other sections must
    #      flow from the static file via `load_instance_config`'s deep-merge
    #      — they don't belong in the overlay at all.
    #   2. Patch only the sections this endpoint touches.
    #   3. Write the narrow overlay back atomically (tmp + os.replace).
    from app.secrets import _state_dir
    config_path = _state_dir() / "instance.yaml"

    # Same serialization + corrupt-overlay handling as POST /server-config.
    with _overlay_write_lock:
        overlay: dict = {}
        if config_path.exists():
            try:
                overlay = yaml.safe_load(config_path.read_text()) or {}
            except Exception as e:
                logger.exception("configure: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                           "back up and remove the file, or fix it by hand",
                ) from e
            if not isinstance(overlay, dict):
                # Valid YAML but not a mapping (e.g. a bare list or string):
                # treat it like a corrupt overlay instead of crashing on
                # `.setdefault` below or silently discarding its content.
                logger.error("configure: refusing to overwrite non-mapping overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} "
                           "(top-level YAML is not a mapping); "
                           "back up and remove the file, or fix it by hand",
                )

        # Merge instance settings into the overlay only — never seed from the
        # env-resolved merged config.
        if request.instance_name:
            overlay.setdefault("instance", {})["name"] = request.instance_name

        if request.allowed_domain:
            overlay.setdefault("auth", {})["allowed_domain"] = request.allowed_domain

        # data_source is fully owned by this endpoint — replace wholesale.
        overlay["data_source"] = {"type": request.data_source}
        if request.data_source == "keboola":
            overlay["data_source"]["keboola"] = {
                "stack_url": request.keboola_url,
                "token_env": "KEBOOLA_STORAGE_TOKEN",
            }
        elif request.data_source == "bigquery":
            overlay["data_source"]["bigquery"] = {
                "project": request.bigquery_project,
                "location": request.bigquery_location or "us",
            }

        # Seed an ai: block on first-time setup so LLM-driven services
        # (corporate_memory, verification_detector) can boot without manual
        # YAML editing. Only inserts when the overlay has no ai: yet AND an
        # appropriate env var is present — never overwrites operator config,
        # never writes a placeholder block (#176).
        if "ai" not in overlay:
            # ANTHROPIC_API_KEY wins over LLM_API_KEY when both are set.
            key_env = next(
                (
                    env_name
                    for env_name in ("ANTHROPIC_API_KEY", "LLM_API_KEY")
                    if os.environ.get(env_name, "").strip()
                ),
                None,
            )
            if key_env is not None:
                overlay["ai"] = {
                    "provider": "anthropic",
                    # Store the ${VAR} reference, never the cleartext key.
                    "api_key": "${%s}" % key_env,
                    "model": "claude-haiku-4-5-20251001",
                    "structured_output": "auto",
                }

        # Atomic write to writable data volume — same tmp + os.replace pattern
        # as the server-config editor so a concurrent save can't tear the file.
        config_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(yaml.dump(overlay, default_flow_style=False, sort_keys=False))
        os.replace(tmp_path, config_path)
        logger.info("Wrote instance config to %s", config_path)

    # Persist secrets to .env_overlay (in data volume, never in git)
    secrets_to_persist = {}
    if request.keboola_token:
        secrets_to_persist["KEBOOLA_STORAGE_TOKEN"] = request.keboola_token
    if request.keboola_url:
        secrets_to_persist["KEBOOLA_STACK_URL"] = request.keboola_url

    if secrets_to_persist:
        # Resolve via _state_dir() so the path matches app/main.py's
        # startup-time read of the same overlay. Without this, an operator
        # on the flat-mount layout (STATE_DIR=/data-state) would write
        # secrets to /data/state/.env_overlay here while the app reads
        # from /data-state/.env_overlay — silent loss on next restart.
        overlay_path = _state_dir() / ".env_overlay"
        overlay_path.parent.mkdir(parents=True, exist_ok=True)

        # Merge with existing overlay
        existing_overlay = {}
        if overlay_path.exists():
            for line in overlay_path.read_text().splitlines():
                if "=" in line and not line.startswith("#"):
                    k, v = line.split("=", 1)
                    existing_overlay[k.strip()] = v.strip()
        existing_overlay.update(secrets_to_persist)

        # Write to a tmp file that is created owner-only (0600) and swap it
        # into place. Writing the final path directly and chmod-ing
        # afterwards leaves a window where a freshly created file holds
        # secrets under default permissions, and a crash mid-write would
        # truncate the existing overlay.
        env_body = "\n".join(f"{k}={v}" for k, v in existing_overlay.items()) + "\n"
        env_tmp_path = overlay_path.with_name(overlay_path.name + ".tmp")
        fd = os.open(env_tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as fh:
            fh.write(env_body)
        try:
            # Covers a stale tmp file that pre-existed with laxer permissions.
            env_tmp_path.chmod(0o600)
        except OSError:
            # Best-effort: some volume drivers do not support chmod.
            pass
        os.replace(env_tmp_path, overlay_path)
        logger.info("Persisted %d secrets to .env_overlay", len(secrets_to_persist))

        # Inject into current process environment
        for k, v in secrets_to_persist.items():
            os.environ[k] = v

    # Invalidate cached instance config so next read picks up changes.
    # Use the public helper (matches `/api/admin/server-config`); reaching
    # into the private global silently breaks if the cache layout changes.
    from app.instance_config import reset_cache
    reset_cache()

    return {
        "status": "ok",
        "data_source": request.data_source,
        "connection": "verified" if request.data_source != "local" else "local",
    }
|
||
|
||
|
||
def _split_keboola_table_id(full_id: str, fallback_name: str = "") -> tuple[str, str]:
|
||
"""Split a Keboola table id into ``(bucket, source_table)``.
|
||
|
||
Keboola convention: ``<stage>.<bucket-id>.<table>`` where stage ∈
|
||
``{in, out, sys}`` and bucket-id typically starts with ``c-``
|
||
(e.g. ``in.c-finance.orders``). Storage API export-async needs the
|
||
FULL ``<stage>.<bucket-id>`` as the bucket arg — a stripped
|
||
``c-finance`` 404s. The 2-segment fallback covers id strings
|
||
without the stage prefix; the 0/1-segment path returns empty
|
||
bucket and uses ``fallback_name`` as the table name so the row
|
||
fails loud at sync time rather than silently registering with
|
||
no source coordinates.
|
||
"""
|
||
parts = (full_id or "").strip().split(".")
|
||
if len(parts) >= 3:
|
||
return ".".join(parts[:-1]), parts[-1]
|
||
if len(parts) == 2:
|
||
return parts[0], parts[1]
|
||
return "", fallback_name or full_id
|
||
|
||
|
||
def _build_keboola_discovery_plan(
    conn: duckdb.DuckDBPyConnection, discovered: list[dict],
) -> dict:
    """Inspect ``discovered`` (output of ``KeboolaClient.discover_all_tables``)
    against the live registry and bucket every entry into one of:

    - ``new``: not in registry, will be inserted.
    - ``existing_match``: row already in registry under the same id
      AND its ``(bucket, source_table)`` matches what discovery would
      write — no-op, nothing to do.
    - ``existing_drift``: a row in the registry conflicts with what
      discovery would write. Two flavours, both surfaced for operator
      visibility but **never overwritten**:

        1. Same registry id, different ``(bucket, source_table)`` —
           admin corrected the coordinates inline (rarer).
           ``drift_kind == "same_id_diff_coords"``.
        2. Different registry id but the discovered ``name`` clashes
           with an existing row's ``name`` (case-insensitive). Real
           example: registry has ``id='kbc_job', name='kbc_job',
           bucket='in.c-kbc_telemetry'``; Keboola exposes the same
           logical table at id ``in.c-keboola-storage.job`` (which
           slugs to a different ``table_id``). Without this
           check, auto-discovery would insert a duplicate ``kbc_job``
           whose Storage API export-async 404s.
           ``drift_kind == "name_collision"``.

    - ``invalid``: id couldn't produce a usable ``table_id`` slug.

    This function only reads the registry; it never writes.

    Args:
        conn: System DuckDB connection backing the table registry.
        discovered: Discovery payload; each item is read via the keys
            ``id``, ``name`` and ``bucket_id``.

    Returns:
        Dict with the four list-valued keys described above.
    """
    repo = TableRegistryRepository(conn)
    # Pre-load all keboola rows once so the name-collision lookup
    # below is O(1) per discovered entry. When the repo has no
    # `list_all` (single-row test stubs), the name-collision check is
    # simply skipped.
    try:
        all_rows = [r for r in repo.list_all() if r.get("source_type") == "keboola"]
    except AttributeError:
        all_rows = []
    by_name: dict[str, dict] = {
        (r.get("name") or "").strip().lower(): r for r in all_rows
    }

    plan = {"new": [], "existing_match": [], "existing_drift": [], "invalid": []}
    for table in discovered:
        full_id = (table.get("id") or "").strip()
        # Slug used as the registry primary key. Lowercase, dots/spaces
        # → underscores. Stable across discovery runs.
        table_id = full_id.lower().replace(".", "_").replace(" ", "_")
        if not table_id:
            plan["invalid"].append({
                "table_id": "",
                "full_id": full_id,
                "reason": "empty id from discovery payload",
            })
            continue

        # Prefer Keboola's authoritative `bucket_id` (separate field in
        # the API response, normalised by `discover_all_tables`) over
        # parsing the full id string. Fall back to the parser when
        # the API didn't return bucket_id (older fallback path inside
        # discover_all_tables).
        bucket = (table.get("bucket_id") or "").strip()
        name = (table.get("name") or "").strip()
        source_table = name
        if not bucket or not source_table:
            bucket, source_table = _split_keboola_table_id(full_id, source_table)

        entry = {
            "table_id": table_id,
            # `dict.get(key, default)` only falls back when the key is
            # absent; a present-but-empty/None name must also fall back to
            # the slug, and the stored name must be the same stripped value
            # the collision lookup below uses.
            "name": name or table_id,
            "full_id": full_id,
            "bucket": bucket,
            "source_table": source_table,
        }

        existing = repo.get(table_id)
        if existing is not None:
            ex_bucket = existing.get("bucket") or ""
            ex_source_table = existing.get("source_table") or ""
            if ex_bucket == bucket and ex_source_table == source_table:
                plan["existing_match"].append(entry)
            else:
                plan["existing_drift"].append({
                    **entry,
                    "registry_bucket": ex_bucket,
                    "registry_source_table": ex_source_table,
                    "registry_id": existing.get("id"),
                    "drift_kind": "same_id_diff_coords",
                })
            continue

        # No row at this id. Look for a name collision (admin
        # registered the same logical table under a different id).
        name_match = by_name.get(name.lower()) if name else None
        if name_match is not None:
            plan["existing_drift"].append({
                **entry,
                "registry_bucket": name_match.get("bucket") or "",
                "registry_source_table": name_match.get("source_table") or "",
                "registry_id": name_match.get("id"),
                "drift_kind": "name_collision",
            })
            continue

        plan["new"].append(entry)
    return plan
|
||
|
||
|
||
def _discover_and_register_tables(
    conn: duckdb.DuckDBPyConnection,
    user_email: str,
    *,
    dry_run: bool = False,
) -> dict:
    """Discover tables from the configured source and register the new ones.

    Behavior:
    - Only the configured source type ``keboola`` is handled here; for
      any other source type a zeroed summary is returned immediately.
    - Already-registered rows are NEVER overwritten. The plan
      classifies them as ``existing_match`` (counted as ``skipped``) or
      ``existing_drift`` (counted as ``drifted`` and listed under
      ``drift`` so the operator sees the divergence).
    - ``dry_run=True`` returns the plan without writing anything and
      additionally lists ``would_register``.

    Args:
        conn: System DuckDB connection backing the table registry.
        user_email: Recorded as ``registered_by`` on inserted rows.
        dry_run: When true, report the plan only.

    Returns:
        Summary dict with counters (``registered``, ``skipped``,
        ``errors``, ``drifted``), the affected ``tables`` ids, and — for
        the keboola source — ``drift`` and ``invalid`` detail lists.
    """
    from app.instance_config import get_data_source_type, get_value

    configured_source = get_data_source_type()
    if configured_source != "keboola":
        return {
            "registered": 0,
            "skipped": 0,
            "errors": 0,
            "drifted": 0,
            "tables": [],
            "source": configured_source,
            "dry_run": dry_run,
        }

    from connectors.keboola.client import KeboolaClient
    # Read from data_source.keboola (matches what /api/admin/configure writes)
    stack_url = get_value("data_source", "keboola", "stack_url", default="")
    token_var = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
    # Configured env var first; the conventional variable is the fallback.
    api_token = (os.environ.get(token_var, "") if token_var else "") or os.environ.get(
        "KEBOOLA_STORAGE_TOKEN", ""
    )

    keboola = KeboolaClient(token=api_token, url=stack_url)
    plan = _build_keboola_discovery_plan(conn, keboola.discover_all_tables())

    drifted_rows = plan["existing_drift"]
    drift_summary = []
    for row in drifted_rows:
        drift_summary.append({
            "table_id": row["table_id"],
            "discovery": {"bucket": row["bucket"], "source_table": row["source_table"]},
            "registry": {"bucket": row["registry_bucket"],
                         "source_table": row["registry_source_table"]},
        })

    if dry_run:
        return {
            "registered": 0,
            "skipped": len(plan["existing_match"]),
            "errors": len(plan["invalid"]),
            "drifted": len(drifted_rows),
            "tables": [item["table_id"] for item in plan["new"]],
            "would_register": [item["table_id"] for item in plan["new"]],
            "drift": drift_summary,
            "invalid": plan["invalid"],
            "source": "keboola",
            "dry_run": True,
        }

    registry = TableRegistryRepository(conn)
    inserted_ids = []
    failed = 0

    for item in plan["new"]:
        try:
            registry.register(
                id=item["table_id"],
                name=item["name"],
                source_type="keboola",
                bucket=item["bucket"],
                source_table=item["source_table"],
                # Keboola goes through Storage API export-async via the
                # materialized path (NULL source_query = full table). The
                # legacy `local` mode for Keboola was retired in v26 and
                # would no-op here anyway.
                query_mode="materialized",
                registered_by=user_email,
                description=f"Auto-discovered from Keboola: {item['full_id']}",
            )
        except Exception as e:
            # One bad row must not abort the rest of the batch.
            logger.warning("Failed to register %s: %s", item["table_id"], e)
            failed += 1
        else:
            inserted_ids.append(item["table_id"])

    if drifted_rows:
        logger.warning(
            "Auto-discover skipped %d row(s) where the admin-edited "
            "bucket/source_table differs from discovery — preserving "
            "the admin values. Run with dry_run=True to see the deltas.",
            len(drifted_rows),
        )

    return {
        "registered": len(inserted_ids),
        "skipped": len(plan["existing_match"]),
        "errors": failed + len(plan["invalid"]),
        "drifted": len(drifted_rows),
        "tables": inserted_ids,
        "drift": drift_summary,
        "invalid": plan["invalid"],
        "source": "keboola",
        "dry_run": False,
    }
|
||
|
||
|
||
@router.post("/discover-and-register")
async def discover_and_register(
    dry_run: bool = False,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Discover tables from configured source and auto-register them.

    Combines discover-tables + register-table into one call. Already-
    registered rows are NEVER overwritten — admin edits to bucket /
    source_table win. The response surfaces a ``drift`` array listing
    any rows where discovery would have written different coordinates
    than what's in the registry, so operators can audit divergence
    after a Keboola-side bucket rename / table move.

    Query params:
      - ``dry_run=true`` returns the plan without writing anything.
        Lists ``would_register``, ``drift``, and ``invalid`` so an
        operator can decide whether to proceed (or, in the drift case,
        which side they want to fix).

    Used by /setup wizard and AI agents.

    Raises:
        HTTPException: 500 wrapping any error raised during discovery or
            registration; the full traceback is logged server-side.
    """
    try:
        return _discover_and_register_tables(
            conn, user.get("email", "admin"), dry_run=dry_run,
        )
    except Exception as e:
        # HTTPException is rendered as a response and never reaches the
        # server's error logging, so record the traceback here — otherwise
        # the only trace of the failure is the one-line detail string.
        logger.exception("Discovery and registration failed")
        raise HTTPException(status_code=500, detail=f"Discovery and registration failed: {e}") from e
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Scheduler-driven LLM pipeline endpoints (#176)
|
||
#
|
||
# The scheduler container drives these via HTTP rather than running them
|
||
# in-process — same reasoning as the existing /api/marketplaces/sync-all
|
||
# job: DuckDB allows only one writer per file across processes, and the
|
||
# app keeps a long-lived handle on system.duckdb. Routing through the app
|
||
# inherits the existing connection without contention.
|
||
#
|
||
# Each endpoint is `def` (sync), so FastAPI runs it in a thread pool —
|
||
# the underlying jobs do blocking I/O (LLM calls, DuckDB writes,
|
||
# filesystem scans). Running on the asyncio thread would block health
|
||
# checks for the duration of a job.
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@router.post("/run-session-collector")
def run_session_collector(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the session-collector job from the scheduler.

    Walks /home/*/user/sessions/*.jsonl and copies new files into
    /data/user_sessions/<user>/. Idempotent — already-collected files
    are skipped.

    An audit row is written for every invocation, including failed ones.

    Returns:
        ``{"ok": rc == 0, "details": {"rc": rc, **stats}}``.

    Raises:
        HTTPException: 500 when the collector raised; the detail carries
            the exception type and message (raised after the audit row
            is written).
    """
    from services.session_collector import collector

    # Call run() not main(): main() does argparse.parse_args() which would
    # try to parse uvicorn's sys.argv and SystemExit(2) the worker.
    exit_code: int = 1  # pessimistic default, reported if run() raises
    run_stats: dict = {}
    failure: Optional[Exception] = None
    try:
        exit_code, run_stats = collector.run(dry_run=False, verbose=False)
    except Exception as exc:
        # Held back so audit_log + /admin/scheduler-runs reflect the
        # failure; re-raised as a 500 once the audit row is written.
        # Filesystem permission, OSError on /home walking, etc. are
        # realistic failure modes worth surfacing.
        failure = exc

    audit_params: dict = {"rc": exit_code, **run_stats}
    if failure is not None:
        audit_params["unhandled_error"] = f"{type(failure).__name__}: {failure}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_session_collector",
        resource="job:session-collector",
        params=audit_params,
    )

    if failure is None:
        return {"ok": exit_code == 0, "details": {"rc": exit_code, **run_stats}}

    raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])
|
||
|
||
|
||
@router.post("/run-session-processor")
def run_session_processor(
    processor: str = Query(..., description="Processor name (e.g. 'verification', 'usage')"),
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger one session-pipeline processor against /data/user_sessions/*.

    Replaces the per-processor /run-* endpoints with a single parametrized
    entry. The scheduler invokes this once per registered processor on its
    own cadence; processors are independent (one slow / failing processor
    can't block any other).

    Returns 400 if `processor` is unknown. The verification processor
    requires an LLM extractor — if the instance has no ai: config and no
    ANTHROPIC_API_KEY / LLM_API_KEY, it won't appear in the registry and
    the call returns 400 the same as a misspelled name.

    Returns:
        ``{"ok": <no errors reported>, "details": <runner stats>}``.

    Raises:
        HTTPException: 400 for an unknown processor, 409 when the same
            processor is already running, 500 when the run (including
            opening its DB connection) raised — in that case an audit row
            is written first.
    """
    from services.session_pipeline.runner import run_processor as _run_processor
    from services.session_processors import get_processor, list_processor_names
    from src.db import get_system_db

    proc = get_processor(processor)
    if proc is None:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unknown processor '{processor}'. "
                f"Known: {', '.join(list_processor_names())}"
            ),
        )

    # Reject overlapping invocations of the same processor (PR #232 review).
    # See `_get_processor_run_lock` docstring for why this matters
    # (verification_evidence row duplication on race).
    proc_lock = _get_processor_run_lock(processor)
    if not proc_lock.acquire(blocking=False):
        raise HTTPException(
            status_code=409,
            detail=f"Processor '{processor}' is already running",
        )

    job_conn = None
    stats: dict = {}
    job_error: Optional[Exception] = None
    try:
        # Open the connection INSIDE the try: if get_system_db() raises
        # while the lock is held, the finally below must still release it,
        # and the failure should be audited like any other runner error.
        job_conn = get_system_db()
        stats = _run_processor(job_conn, proc)
        # Rebuild daily rollups after a successful usage run so the
        # marketplace / admin dashboards see fresh aggregates. Runs on the
        # same connection while it's still open; incremental (last-7-days)
        # so it's cheap. Kept here (not in runner.py) to stay
        # processor-agnostic at the framework level.
        if processor == "usage" and stats.get("errors", 0) == 0:
            from services.session_processors.usage_lib import rebuild_rollups
            try:
                rebuild_rollups(job_conn)
            except Exception as rollup_exc:
                # A rollup failure does not fail the processor run.
                logger.warning("usage rollup rebuild failed: %s", rollup_exc)
    except Exception as e:
        # Capture and re-raise after audit so an unhandled runner error
        # (DuckDB lock, network blip, unexpected SDK type) still leaves a
        # row in audit_log — the /admin/scheduler-runs page is the
        # operator's only signal beyond docker logs.
        job_error = e
    finally:
        if job_conn is not None:
            try:
                job_conn.close()
            except Exception:
                # Best-effort close; the run outcome is already decided.
                pass
        # Always release, even if the runner raised. A leaked lock would
        # wedge the processor permanently until process restart.
        proc_lock.release()

    audit_params: dict = {
        "processor": processor,
        "scanned": stats.get("scanned", 0),
        "processed": stats.get("processed", 0),
        "skipped": stats.get("skipped", 0),
        "errors": stats.get("errors", 0),
        "items_extracted": stats.get("items_extracted", 0),
    }
    if job_error is not None:
        audit_params["unhandled_error"] = f"{type(job_error).__name__}: {job_error}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action=f"run_session_processor:{processor}",
        resource=f"job:session-processor:{processor}",
        params=audit_params,
    )

    if job_error is not None:
        raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])

    return {"ok": stats.get("errors", 0) == 0, "details": stats}
|
||
|
||
|
||
@router.post("/run-corporate-memory")
def run_corporate_memory(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the corporate-memory catalog refresh from the scheduler.

    Reads all CLAUDE.local.md files, sends them through the LLM with the
    existing catalog, and writes an updated catalog to knowledge.json.

    Returns:
        ``{"ok": <stats has no errors>, "details": <collector stats>}``.

    Raises:
        HTTPException: 500 in both failure modes — a ``ValueError`` from
            the collector (returned immediately, no audit row) or any
            other exception (returned after the audit row is written).
    """
    from services.corporate_memory.collector import collect_all

    # Fail-fast (#176): collect_all raises ValueError when no ai: block AND
    # no env keys are present. Surface the actionable factory message in a
    # 500 instead of letting it crash the request anonymously.
    run_stats: dict = {}
    failure: Optional[Exception] = None
    try:
        run_stats = collect_all(dry_run=False)
    except ValueError as cfg_err:
        # Already-translated misconfiguration → 500 with actionable message
        # but no audit row (the request never reached the LLM stage).
        raise HTTPException(status_code=500, detail=str(cfg_err))
    except Exception as exc:
        # Any other unhandled error is held back so audit_log +
        # /admin/scheduler-runs reflect the failure; surfaced as a 500
        # once the audit row is written.
        failure = exc

    audit_params: dict = {
        "items_new": run_stats.get("items_new", 0),
        "items_filtered": run_stats.get("items_filtered", 0),
        # `errors` is stored as a count here, not as the list itself.
        "errors": len(run_stats.get("errors", [])),
        "skipped": run_stats.get("skipped", False),
    }
    if failure is not None:
        audit_params["unhandled_error"] = f"{type(failure).__name__}: {failure}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_corporate_memory",
        resource="job:corporate-memory",
        params=audit_params,
    )

    if failure is None:
        return {"ok": not run_stats.get("errors"), "details": run_stats}

    raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Flea-market guardrails — admin endpoints
|
||
#
|
||
# Backs /admin/store/submissions (the human triage page) and the override /
|
||
# retry / delete-submission action buttons. Every action here writes an
|
||
# audit_log row so the trail of "who force-published what, and why" is
|
||
# permanent — same governance posture as the corporate-memory + scheduler
|
||
# runs surfaces.
|
||
# ---------------------------------------------------------------------------
|
||
|
||
import shutil as _shutil
|
||
|
||
|
||
@router.get("/store/submissions")
async def admin_list_store_submissions(
    status: Optional[str] = None,
    submitter: Optional[str] = None,
    type: Optional[str] = None,  # noqa: A002 — FastAPI query-param name
    name: Optional[str] = None,
    version: Optional[str] = None,
    sort: Optional[str] = None,
    order: Optional[str] = None,
    limit: int = 100,
    skip: int = 0,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """List flea-market guardrail submissions newest-first.

    All filters AND together. ``status`` is comma-separated
    (e.g. ``blocked_inline,blocked_llm``). ``submitter`` matches
    ``submitter_id`` exactly. ``type`` is one of ``skill`` / ``agent`` /
    ``plugin``. ``name`` and ``version`` are case-insensitive substrings.
    ``limit`` clamped to [1, 500]; negative ``skip`` is treated as 0.

    Returns:
        ``{"items": [...], "total": ..., "limit": ..., "skip": ...}``
        with the clamped ``limit`` / ``skip`` echoed back.

    Raises:
        HTTPException: 400 ``invalid_type`` for an unknown ``type``;
            400 ``invalid_sort_key`` when the repository rejects ``sort``.
    """
    from src.repositories.store_submissions import StoreSubmissionsRepository

    if status:
        # Split on commas, dropping blank tokens.
        statuses = [token.strip() for token in status.split(",") if token.strip()]
    else:
        statuses = None
    if type and type not in {"skill", "agent", "plugin"}:
        raise HTTPException(status_code=400, detail="invalid_type")
    limit = max(1, min(int(limit), 500))
    skip = max(0, int(skip))

    # v36+ chip routing: 'archived' / 'deleted' tokens in ?status=
    # are LIFECYCLE filters, not verdict filters. The repo handles the
    # JOIN-on-entity logic for archived; submission terminal marker
    # for deleted. Verdict tokens (approved, blocked_*, pending_*,
    # overridden, review_error) pass through unchanged. Only a status
    # list consisting of exactly one lifecycle token is rerouted.
    lifecycle = None
    if statuses in (["archived"], ["deleted"]):
        lifecycle = statuses[0]
        statuses = None

    submissions = StoreSubmissionsRepository(conn)
    try:
        items, total = submissions.list_for_admin(
            status=statuses,
            submitter_id=submitter or None,
            type_=type or None,
            name_substr=name or None,
            version_substr=version or None,
            sort_by=sort or None,
            sort_order=order or None,
            lifecycle=lifecycle,
            limit=limit, skip=skip,
        )
    except ValueError as e:
        # Sort key whitelist rejection (#23) — surface as 400 so the UI
        # can show the operator a meaningful message instead of 500.
        if not str(e).startswith("invalid_sort_key"):
            raise
        raise HTTPException(status_code=400, detail="invalid_sort_key")
    return {"items": items, "total": total, "limit": limit, "skip": skip}
|
||
|
||
|
||
@router.get("/store/submissions/{submission_id}")
async def admin_get_store_submission(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Return a single submission record.

    Raises:
        HTTPException: 404 ``submission_not_found`` when the repository
            has no row for ``submission_id``.
    """
    from src.repositories.store_submissions import StoreSubmissionsRepository

    record = StoreSubmissionsRepository(conn).get(submission_id)
    if record is not None:
        return record
    raise HTTPException(status_code=404, detail="submission_not_found")
|
||
|
||
|
||
class _OverrideRequest(BaseModel):
    # Request body for POST /store/submissions/{submission_id}/override.
    # ``reason`` is the admin's free-text justification; it is required and
    # must be between 4 and 2000 characters long.
    reason: str = Field(..., min_length=4, max_length=2000)
|
||
|
||
|
||
@router.post("/store/submissions/{submission_id}/override")
async def admin_override_store_submission(
    submission_id: str,
    body: _OverrideRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Force-publish a submission that the guardrails held or blocked.

    The submission is marked as overridden (via ``set_override``) and the
    linked store entity's visibility is set to ``approved``. If the
    submission corresponds to a version newer than the entity's current
    ``version_no``, that version is promoted as well. An audit row records
    who overrode, why, and the verdict data that was overridden.

    Raises:
        HTTPException: 404 ``submission_not_found``; 409
            ``cannot_override_status:<status>`` when the current status is
            not one of blocked_inline / blocked_llm / review_error /
            pending_llm; 409 ``cannot_override_legacy_without_entity`` when
            the row carries no ``entity_id``.
    """
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository

    submissions = StoreSubmissionsRepository(conn)
    sub = submissions.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")

    prior_status = sub["status"]
    overridable = ("blocked_inline", "blocked_llm", "review_error", "pending_llm")
    if prior_status not in overridable:
        raise HTTPException(
            status_code=409,
            detail=f"cannot_override_status:{prior_status}",
        )

    entity_id = sub.get("entity_id")
    if not entity_id:
        # v30+ rows ought to always carry entity_id. Legacy rows from the
        # pre-v30 inline-rollback design land here — refuse, since the only
        # path forward is Delete + asking the submitter to re-upload.
        raise HTTPException(
            status_code=409,
            detail="cannot_override_legacy_without_entity",
        )

    # Flip the submission and the entity first; version promotion below is
    # gated separately and never undoes these two writes.
    submissions.set_override(
        submission_id, admin_user_id=user["id"], reason=body.reason,
    )
    entities = StoreEntitiesRepository(conn)
    entities.set_visibility(entity_id, "approved")

    # Mirror the runner's deferred-promotion path: an override on a v2+
    # edit/restore must promote the overridden version and swap the on-disk
    # live bundle, otherwise installers keep receiving the prior approved
    # bytes the admin just asked to replace.
    entity_row = entities.get(entity_id) or {}

    # Resolve THIS submission's version entry by submission_id, NOT by
    # hash. A hash lookup breaks on byte-identical re-uploads (v2 same
    # content as v1): it finds the first history entry with that hash
    # (v1), the forward-only guard then skips the promote, and the entity
    # stays stuck at v1.
    from app.api.store import _version_no_for_submission

    target_version_no: Optional[int] = _version_no_for_submission(
        entity_row, submission_id,
    )

    promoted_to: Optional[int] = None
    if target_version_no is not None:
        live_version_no = int(entity_row.get("version_no") or 0)
        # Forward-only: overriding a stale v2 while v3 is already live must
        # NOT demote the live bundle. Status + visibility were flipped
        # above regardless; only the version promotion is gated.
        if target_version_no > live_version_no:
            # The helper swaps the live bundle first and then updates the
            # DB. A None result (source missing / swap failed) leaves the
            # status + visibility flip in place; the admin can re-trigger
            # via /rescan once the bundle is recovered.
            from app.api.store import promote_to_version

            promoted_to = promote_to_version(
                entity_id, target_version_no, entities,
            )
            if promoted_to is not None:
                # Re-read after promotion so the row reflects the new
                # version's name/type if a rename was bundled in.
                entity_row = entities.get(entity_id) or entity_row

    # v46: attribution lookup is live — the next UsageProcessor tick
    # preloads the newly-approved entity by name.

    audit_params = {
        "entity_id": entity_id,
        "reason": body.reason,
        "prior_status": prior_status,
        "prior_findings": sub.get("llm_findings"),
        "prior_inline": sub.get("inline_checks"),
        "promoted_to_version_no": promoted_to,
    }
    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.overridden",
        resource=f"store_submission:{submission_id}",
        params=audit_params,
        result="ok",
    )
    return {"ok": True, "submission_id": submission_id, "entity_id": entity_id}
|
||
|
||
|
||
@router.post("/store/submissions/{submission_id}/rescan")
async def admin_rescan_store_submission(
    submission_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Re-run **all** guardrail checks (inline + LLM) against the bundle
    this submission represents.

    Different from ``/retry``: rescan starts from scratch (re-runs the
    deterministic inline checks too) and is allowed regardless of
    current status. Use when check rules have changed and a previously-
    approved entity might now fail (or vice versa).

    Effects:
      * inline checks run sync; verdict written to ``inline_checks`` and
        ``llm_findings`` cleared
      * on inline fail → ``status='blocked_inline'``, entity hidden
      * on inline pass with guardrails enabled → ``status='pending_llm'``,
        entity visibility set to ``pending``; the LLM review is scheduled
        only when the provider is ready
      * on inline pass with guardrails disabled → ``status='approved'``,
        entity visibility ``approved`` and the submission's version is
        promoted if it is newer than the entity's current version
      * an audit_log row is recorded for every outcome

    Requires the bundle to still be on disk. Submissions without an
    ``entity_id`` cannot be rescanned — nothing to scan.

    Raises:
        HTTPException: 404 ``submission_not_found``; 409
            ``cannot_rescan_without_entity``; 410 ``bundle_missing``.
    """
    import json

    from app.api.store import (
        _plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )
    from src.db import get_system_db
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.store_guardrails import run_inline_checks
    from src.store_guardrails.runner import (
        default_api_key_loader,
        default_model_loader,
        run_llm_review,
    )
    from app.instance_config import (
        get_guardrails_enabled,
        get_guardrails_llm_provider_ready,
    )

    subs = StoreSubmissionsRepository(conn)
    sub = subs.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(status_code=409, detail="cannot_rescan_without_entity")

    ents = StoreEntitiesRepository(conn)
    entity = ents.get(entity_id)

    # Rescan the bundle this submission represents — not live. Falls back
    # to the live plugin dir for legacy rows that never seeded a
    # versions/v<N>/plugin/ directory.
    target_n = _version_no_for_submission(entity or {}, submission_id)
    if target_n is not None:
        plugin_dir = _submission_plugin_dir(entity_id, target_n)
        if not plugin_dir.exists():
            plugin_dir = _plugin_dir(entity_id)
    else:
        plugin_dir = _plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    description = (entity or {}).get("description")

    inline = run_inline_checks(
        plugin_dir, type_=sub["type"], description=description,
    )
    # Serialise the fresh inline verdict once; both branches persist it.
    inline_json = json.dumps(inline.to_response_dict())

    if not inline.passed:
        # Re-failed inline. Hide the entity (was approved or pending);
        # admin can either fix the bundle (PUT to recreate) or override.
        subs.conn.execute(
            "UPDATE store_submissions SET inline_checks = ?, llm_findings = NULL, "
            "status = 'blocked_inline', updated_at = current_timestamp "
            "WHERE id = ?",
            [inline_json, submission_id],
        )
        ents.set_visibility(entity_id, "hidden")
        AuditRepository(conn).log(
            user_id=user["id"],
            action="store.submission.rescan",
            resource=f"store_submission:{submission_id}",
            params={"entity_id": entity_id, "outcome": "blocked_inline"},
        )
        return {"ok": True, "submission_id": submission_id, "status": "blocked_inline"}

    # Inline passes. Three-state matrix:
    #   - intent False            → auto-approve (operator opt-out)
    #   - intent True + ready     → pending_llm, schedule LLM
    #   - intent True + not-ready → pending_llm, DO NOT schedule (admin
    #     retries after providing credentials)
    guardrails_enabled = get_guardrails_enabled()
    provider_ready = get_guardrails_llm_provider_ready()
    hold_for_review = guardrails_enabled
    schedule_async_llm = guardrails_enabled and provider_ready
    new_status = "pending_llm" if hold_for_review else "approved"

    subs.conn.execute(
        "UPDATE store_submissions SET inline_checks = ?, llm_findings = NULL, "
        "status = ?, updated_at = current_timestamp "
        "WHERE id = ?",
        [inline_json, new_status, submission_id],
    )
    if hold_for_review:
        ents.set_visibility(entity_id, "pending")
    else:
        ents.set_visibility(entity_id, "approved")
        # Guardrails explicitly disabled — immediately live. Promote the
        # rescanned submission's version forward with the shared helper;
        # without this a rescan that re-approved a non-current v2+ would
        # leave the entity stuck at the prior version (PR #330 review).
        from app.api.store import promote_to_version

        entity_row = ents.get(entity_id) or {}
        if target_n is not None and target_n > int(entity_row.get("version_no") or 0):
            promote_to_version(entity_id, target_n, ents)
        # v46: attribution lookup is live — no explicit refresh needed.

    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.rescan",
        resource=f"store_submission:{submission_id}",
        params={
            "entity_id": entity_id,
            "outcome": new_status,
            # Key name retained for audit-log compatibility.
            "guardrails_enabled": guardrails_enabled,
            "provider_ready": provider_ready,
        },
    )
    if schedule_async_llm:
        background.add_task(
            run_llm_review,
            submission_id,
            plugin_dir=plugin_dir,
            conn_factory=get_system_db,
            api_key_loader=default_api_key_loader,
            model_loader=default_model_loader,
        )
    return {"ok": True, "submission_id": submission_id, "status": new_status}
|
||
|
||
|
||
@router.post("/store/submissions/{submission_id}/retry")
async def admin_retry_store_submission(
    submission_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Queue a fresh LLM review for an existing submission.

    Only these statuses may be retried:
      * ``review_error`` — the LLM call failed and the admin is retrying
        once the underlying issue (rate limit, timeout, outage) clears.
      * ``blocked_llm`` — the admin disagrees with the prior verdict and
        wants it re-evaluated from a clean slate.
      * ``pending_llm`` — the submission was held while the LLM provider
        had no credentials; the admin sets the key and re-fires here.

    The submission is set back to ``pending_llm``, an audit row is
    written, and the review runs as a background task. The submission's
    plugin tree must still be on disk.

    Raises:
        HTTPException: 404 ``submission_not_found``; 409
            ``cannot_retry_status:<status>``; 409
            ``cannot_retry_without_entity``; 410 ``bundle_missing``.
    """
    from app.api.store import (
        _plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )
    from src.db import get_system_db
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.store_guardrails.runner import (
        default_api_key_loader,
        default_model_loader,
        run_llm_review,
    )

    submissions = StoreSubmissionsRepository(conn)
    sub = submissions.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")

    retryable = {"review_error", "blocked_llm", "pending_llm"}
    if sub["status"] not in retryable:
        raise HTTPException(
            status_code=409, detail=f"cannot_retry_status:{sub['status']}",
        )

    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(
            status_code=409, detail="cannot_retry_without_entity",
        )

    # Review the STAGED version's bytes — not live. For a v2+ edit held at
    # pending_llm or blocked_llm, live `plugin/` still holds the prior
    # approved version; reviewing it would yield a verdict for the wrong
    # bytes and could let never-reviewed staged bytes be promoted.
    entity = StoreEntitiesRepository(conn).get(entity_id) or {}
    version_no = _version_no_for_submission(entity, submission_id)

    plugin_dir = None
    if version_no is not None:
        staged_dir = _submission_plugin_dir(entity_id, version_no)
        if staged_dir.exists():
            plugin_dir = staged_dir
    if plugin_dir is None:
        # No version entry, or a legacy pre-v37 row whose version dir was
        # never seeded: fall back to the live bundle.
        plugin_dir = _plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    submissions.update_status(submission_id, status="pending_llm")
    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.retry",
        resource=f"store_submission:{submission_id}",
        params={"entity_id": entity_id},
    )

    review_kwargs = {
        "plugin_dir": plugin_dir,
        "conn_factory": get_system_db,
        "api_key_loader": default_api_key_loader,
        "model_loader": default_model_loader,
    }
    background.add_task(run_llm_review, submission_id, **review_kwargs)
    return {"ok": True, "submission_id": submission_id, "status": "pending_llm"}
|
||
|
||
|
||
@router.delete("/store/submissions/{submission_id}", status_code=204)
async def admin_delete_store_submission(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Permanently remove a submission row and, if linked, its entity.

    Intended for spam / accidental uploads where override-publish is the
    wrong call. When the submission carries an ``entity_id`` the entity's
    user installs, the entity row and its on-disk directory are removed
    too. The audit_log row keeps the key facts about what was deleted.

    Raises:
        HTTPException: 404 ``submission_not_found``.
    """
    from app.api.store import _entity_dir
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.repositories.user_store_installs import UserStoreInstallsRepository

    sub = StoreSubmissionsRepository(conn).get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")

    entity_id = sub.get("entity_id")
    if entity_id:
        # Installs first, then the entity row, then the bytes on disk.
        UserStoreInstallsRepository(conn).delete_all_for_entity(entity_id)
        StoreEntitiesRepository(conn).delete(entity_id)
        # Best-effort: a missing or partially removed directory is fine.
        _shutil.rmtree(_entity_dir(entity_id), ignore_errors=True)

    conn.execute("DELETE FROM store_submissions WHERE id = ?", [submission_id])

    deleted_snapshot = {
        "entity_id": entity_id,
        "submitter_id": sub.get("submitter_id"),
        "name": sub.get("name"),
        "status": sub.get("status"),
    }
    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.deleted",
        resource=f"store_submission:{submission_id}",
        params=deleted_snapshot,
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# v30: download blocked bundle for forensic inspection
|
||
# ---------------------------------------------------------------------------
|
||
|
||
from fastapi.responses import StreamingResponse
|
||
|
||
|
||
@router.get("/store/submissions/{submission_id}/bundle.zip")
async def admin_download_store_submission_bundle(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Stream the on-disk bundle as a fresh ZIP for admin inspection.

    Serves the forensic use case: the admin needs to inspect what a
    submitter actually tried to upload, not just the verdict. The bundle
    must still be on disk; a row without ``entity_id`` or with a missing
    directory yields 410. Every download is recorded in the audit log.

    The download filename is derived from the submission name, reduced to
    ``[A-Za-z0-9._-]`` so that a submitter-chosen name cannot break the
    ``Content-Disposition`` header.

    Raises:
        HTTPException: 404 ``submission_not_found``; 410
            ``bundle_purged_or_missing`` (no ``entity_id``); 410
            ``bundle_missing`` (directory not on disk).
    """
    import io as _io
    import re as _re
    import zipfile as _zipfile
    from pathlib import Path as _P
    from app.api.store import (
        _plugin_dir as _sp_plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )

    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository

    sub = StoreSubmissionsRepository(conn).get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(status_code=410, detail="bundle_purged_or_missing")

    # Resolve the STAGED bundle this submission represents, not live.
    # Under deferred promotion, live `plugin/` holds the prior approved
    # version — so for a blocked v2 row, live shows v1's safe bytes
    # while the staged v2 bytes (the actual risky upload the admin is
    # reviewing) sit in `versions/v2/plugin/`. Falls back to live for
    # legacy rows that never seeded a versions/ dir.
    ent = StoreEntitiesRepository(conn).get(entity_id) or {}
    target_n = _version_no_for_submission(ent, submission_id)
    if target_n is not None:
        plugin_dir = _submission_plugin_dir(entity_id, target_n)
        if not plugin_dir.exists():
            plugin_dir = _sp_plugin_dir(entity_id)
    else:
        plugin_dir = _sp_plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.bundle_downloaded",
        resource=f"store_submission:{submission_id}",
        params={"entity_id": entity_id, "name": sub.get("name")},
    )

    # Build the archive in memory; entries are added in sorted path order
    # with paths relative to the bundle root.
    buf = _io.BytesIO()
    with _zipfile.ZipFile(buf, "w", _zipfile.ZIP_DEFLATED) as zf:
        for f in sorted(_P(plugin_dir).rglob("*")):
            if not f.is_file():
                continue
            arcname = f.relative_to(plugin_dir).as_posix()
            zf.write(f, arcname)
    buf.seek(0)

    # The name is submitter-controlled. Quotes, backslashes, control
    # characters or non-latin-1 text would corrupt (or fail to encode in)
    # the quoted Content-Disposition value, so collapse everything outside
    # a conservative whitelist to "_".
    raw_name = str(sub.get("name") or "bundle")
    safe_name = _re.sub(r"[^A-Za-z0-9._-]+", "_", raw_name)
    safe_id = _re.sub(r"[^A-Za-z0-9._-]+", "_", submission_id[:8])
    filename = f"{safe_name}-{safe_id}.zip"
    return StreamingResponse(
        buf,
        media_type="application/zip",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# v30: scheduled TTL purge of blocked bundle bytes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@router.post("/run-blocked-purge")
async def run_blocked_purge(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Run the TTL purge of blocked bundle bytes and audit the outcome.

    Thin wrapper around
    :func:`src.store_guardrails.purge.purge_blocked_bundles`, using the
    configured blocked-bundle TTL (in days). Per the original notes the
    scheduler service calls this daily (under ``SCHEDULER_API_TOKEN``, like
    the corporate-memory + verification jobs); admins can also trigger it
    on demand from the UI.

    Returns ``{"ok": True, "details": <purge result>}``.
    """
    from app.instance_config import get_guardrails_blocked_bundle_ttl_days
    from src.store_guardrails.purge import purge_blocked_bundles

    ttl_days = get_guardrails_blocked_bundle_ttl_days()
    outcome = purge_blocked_bundles(conn, ttl_days=ttl_days)

    audit_params = {
        "ttl_days": ttl_days,
        "purged": outcome.get("purged", 0),
        "skipped": outcome.get("skipped", False),
    }
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_blocked_purge",
        resource="job:store-blocked-purge",
        params=audit_params,
    )
    return {"ok": True, "details": outcome}
|
||
|
||
|
||
@router.post("/run-reap-stuck-reviews")
async def run_reap_stuck_reviews(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Run the stuck-review reaper and audit the outcome.

    Thin wrapper around
    :func:`src.store_guardrails.reaper.reap_stuck_llm_reviews`, using the
    configured grace period (in seconds). Per the original notes the
    scheduler calls this every 15 minutes and admins can run it on demand
    if a worker crash is suspected; rows stuck at ``status='pending_llm'``
    beyond the grace period are flipped to ``review_error`` so the queue
    stops growing indefinitely.

    Returns ``{"ok": True, "details": <reaper result>}``.
    """
    from app.instance_config import get_guardrails_stuck_review_grace_seconds
    from src.store_guardrails.reaper import reap_stuck_llm_reviews

    grace_seconds = get_guardrails_stuck_review_grace_seconds()
    outcome = reap_stuck_llm_reviews(conn, grace_seconds=grace_seconds)

    audit_params = {
        "grace_seconds": grace_seconds,
        "reaped": outcome.get("reaped", 0),
        "skipped": outcome.get("skipped", False),
    }
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_reap_stuck_reviews",
        resource="job:store-reap-stuck-reviews",
        params=audit_params,
    )
    return {"ok": True, "details": outcome}
|