Add a generic, placement-aware mechanism for operators to inject HTML/JS into every page that extends base.html or base_login.html. Each entry takes name, enabled, placement (head_start | head_end | body_end), and html. Replaces the need for per-vendor helpers when shipping feedback widgets, analytics, or error-capture snippets. Trust boundary mirrors the existing instance.logo_svg / instance.overview pattern — admin-only, rendered with `| safe`. Resolved by app/instance_config.py::get_custom_scripts(), surfaced in /admin/server-config via _KNOWN_FIELDS["instance"]. Empty default keeps the OSS vendor-neutral; sample Marker.io block ships commented out in config/instance.yaml.example as the canonical example.
522 lines
26 KiB
Text
522 lines
26 KiB
Text
# AI Data Analyst - Instance Configuration
|
||
# ==========================================
|
||
# This is the main configuration file for your instance.
|
||
# Copy to instance.yaml and fill in your values.
|
||
#
|
||
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
|
||
# Non-secret values are set directly here.
|
||
|
||
# --- Config version ---
# Incremented when the config schema changes. Must match SUPPORTED_CONFIG_VERSIONS
# in config/loader.py. Currently only version 1 is supported.
config_version: 1
|
||
|
||
# --- Instance branding ---
# `name` drives the browser <title> text and page headings, so keep it
# populated even when a logo is configured.
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
|
||
# brand: "Agnes" # Product-name brand string used in analyst-facing UI copy
|
||
# (/home hero, /setup, /login messages, clipboard setup
|
||
# script). Default "Agnes". Set to e.g. "Foundry AI" to
|
||
# rebrand without forking. Distinct from `name` above:
|
||
# `name` is the deploying organization, `brand` is the
|
||
# product. Env override: AGNES_INSTANCE_BRAND.
|
||
# workspace_dir: "FoundryAI" # Filesystem-safe folder name for the analyst's local
|
||
# workspace (~/<workspace_dir>). When unset, derived from
|
||
# `brand` by stripping non-alphanumerics ("Foundry AI" ->
|
||
# "FoundryAI"). Set explicitly only if you want a folder
|
||
# name that differs from the auto-derivation. Env override:
|
||
# AGNES_WORKSPACE_DIR_NAME.
|
||
# logo_svg: | # Inline <svg> element rendered into the header brand slot.
|
||
# <svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg">
|
||
# <text y="22" font-size="24" fill="#333">Logo</text>
|
||
# </svg>
|
||
# # When set, the SVG replaces the text brand in the header.
|
||
# # `name` above still drives browser <title> text and page
|
||
# # headings — keep it populated. Env override:
|
||
# # AGNES_INSTANCE_LOGO_SVG.
|
||
# overview: | # Operator-authored Overview body rendered in the new
|
||
# <p>Free-form HTML — paragraphs, links, lists.</p>
|
||
# # Overview section on /home (between Getting Started and
|
||
# # Usage modes). Use for product framing, privacy posture,
|
||
# # what-data-flows summary — operator-specific copy stays
|
||
# # out of the OSS this way. HTML in, HTML out (same `| safe`
|
||
# # filter as news_intro). Empty/unset = section hidden.
|
||
# # Env override: AGNES_INSTANCE_OVERVIEW.
|
||
# sync_interval: "1 hour" # Cadence shown in analyst CLAUDE.md (e.g., "1 hour", "30 minutes", "daily")
|
||
# admin_email: "ops@acme.com" # Operator contact shown on /home GWS connector tile as
|
||
# an "Email admin" mailto button (analysts whose operator
|
||
# hasn't pre-provisioned a shared OAuth app can request
|
||
# one without leaving the workspace). Empty/unset hides
|
||
# the button. Env override: AGNES_INSTANCE_ADMIN_EMAIL.
|
||
# home: # Per-instance toggles for /home content blocks.
|
||
# show_automode: true # Render Step-3 auto-accept-mode block. Default true.
|
||
# # Env: AGNES_HOME_SHOW_AUTOMODE.
|
||
# show_status_frame: true # Render the 5-card status frame (Last sync, Sessions,
|
||
# # Prompts, Tokens, Projects). Visible only to onboarded
|
||
# # users regardless of this flag. Default true. Env:
|
||
# # AGNES_HOME_SHOW_STATUS_FRAME.
|
||
# custom_scripts: # Operator-injected HTML/JS blocks rendered into every
|
||
# # page that extends base.html or base_login.html. Use for feedback widgets
|
||
# # (Marker.io), analytics (GTM, PostHog), error capture
|
||
# # (Sentry), etc. Each entry needs name + enabled +
|
||
# # placement + html. Admin-only; rendered with `| safe`.
|
||
# # Review the widget's privacy posture before enabling —
|
||
# # most third-party widgets capture screenshots, console
|
||
# # logs, or user actions on submit. Resolved by
|
||
# # `app/instance_config.py::get_custom_scripts()`. No
|
||
# # env override (structure doesn't fit env vars cleanly).
|
||
# - name: "marker-io" # Example: Marker.io feedback widget.
|
||
# enabled: true # Kill switch — set false to disable without deleting.
|
||
# placement: "head_end" # head_start | head_end | body_end
|
||
# html: |
|
||
# <script>
|
||
# window.markerConfig = {
|
||
# project: 'YOUR_MARKER_IO_PROJECT_ID',
|
||
# source: 'snippet'
|
||
# };
|
||
# !function(e,r,a){if(!e.__Marker){e.__Marker={};var t=[],n={__cs:t};["show","hide","isVisible","capture","cancelCapture","unload","reload","isExtensionInstalled","setReporter","clearReporter","setCustomData","on","off"].forEach(function(e){n[e]=function(){var r=Array.prototype.slice.call(arguments);r.unshift(e),t.push(r)}}),e.Marker=n;var s=r.createElement("script");s.async=1,s.src="https://edge.marker.io/latest/shim.js";var i=r.getElementsByTagName("script")[0];i.parentNode.insertBefore(s,i)}}(window,document);
|
||
# </script>
|
||
|
||
# --- Server ---
server:
  hostname: ""  # DNS name (e.g., "data.acme.com")
  host: ""  # IP address
  app_dir: "/opt/data-analyst"  # Installation directory
  # --- Client setup (shown in "Get Started" on dashboard) ---
  # ssh_alias: "data-analyst"  # SSH config Host alias for analysts (default: "data-analyst")
  # ssh_key: "~/.ssh/data_analyst_server"  # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
  # project_dir: "data-analyst"  # Local project folder name (default: "data-analyst")
|
||
|
||
# --- Admin users ---
# Manage the server, own data files, get unlimited resource limits.
# SSH keys are used by server/setup.sh during provisioning.
# One list entry per admin; each entry carries a username and a public key.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."
|
||
|
||
# --- Deployment ---
deployment:
  method: "manual"  # manual | github_actions
  repo_url: ""  # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"
|
||
|
||
# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  allowed_domain: ""  # Email domain(s) for login, comma-separated (e.g., "acme.com" or "acme.com, partner.org")
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"
|
||
|
||
# --- Webapp username shaping ---
|
||
#
|
||
# By default, a user's OS account is derived from their full email:
|
||
# e.psimecek@acme.com -> e_psimecek_acme_com
|
||
#
|
||
# Two options let you control this:
|
||
#
|
||
# username_strip_domain: true
|
||
# Use only the local part of the email (before @).
|
||
# Safe when allowed_domain ensures all users share a single domain.
|
||
# e.psimecek@acme.com -> e_psimecek
|
||
# Keeps usernames short and readable.
|
||
#
|
||
# username_prefix: "myapp_"
|
||
# Prepend a fixed string to every webapp-created account name.
|
||
# Necessary when an external identity system (GCP OS Login, LDAP, SAML)
|
||
# already creates OS accounts in /home/ using the same naming scheme.
|
||
# Without a prefix, the webapp sees those existing OS accounts and refuses
|
||
# to register new analyst accounts ("already in use by a system account").
|
||
# With prefix "myapp_" and strip_domain true:
|
||
# e.psimecek@acme.com -> myapp_e_psimecek
|
||
# Linux enforces a 32-character username limit. Keep the prefix short.
|
||
# Changing or removing either option later will invalidate all existing
|
||
# analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
|
||
#
|
||
# username_strip_domain: false
|
||
# username_prefix: ""
|
||
# disabled_providers: # Hide auth methods from login page
|
||
# - "email" # Disable email magic link (use when Google OAuth is configured)
|
||
|
||
# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
# NOTE(review): with every key below commented out, `theme:` parses as null
# rather than an empty mapping - confirm the loader tolerates that.
theme:
  # primary: "#0073D1"  # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"  # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"  # Main text color
  # text_secondary: "#6B7280"  # Muted/secondary text
  # background: "#F5F7FA"  # Page background
  # surface: "#FFFFFF"  # Card/panel background
  # border: "#E5E7EB"  # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"  # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"
|
||
|
||
# --- Data source ---
# `type` selects the backend; the matching sub-section below holds its settings.
data_source:
  type: "keboola"  # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""  # e.g., "https://connection.keboola.com"
    project_id: ""
  bigquery:
    project: "${BIGQUERY_PROJECT}"  # GCP project hosting the data (used in FROM clause)
    location: "${BIGQUERY_LOCATION}"  # BigQuery location (e.g., "us-central1", "US")
|
||
# Uses ADC (Application Default Credentials) - VM service account on GCP
|
||
# Data can live in a different project -- use fully-qualified table IDs in data_description.md
|
||
# billing_project: "prj-billing" # GCP project to bill BQ jobs to / submit jobs from.
|
||
# # Defaults to `project`. Set when the SA has bigquery.data.* on
|
||
# # the data project but lacks serviceusage.services.use there.
|
||
# # Mismatch -> every BQ call 403 USER_PROJECT_DENIED.
|
||
# # `da diagnose` warns when this falls back to `project`.
|
||
# # Configurable via /admin/server-config UI.
|
||
# max_bytes_per_materialize: 10737418240
|
||
# # Cost guardrail (bytes) for query_mode='materialized' BQ scans.
|
||
# # Dry-run check before running; exceeding -> registration / sync
|
||
# # rejected. Default 10 GiB (10737418240). Set 0 to disable.
|
||
# # null falls through to default. Configurable via /admin/server-config UI.
|
||
# query_timeout_ms: 600000
|
||
# # DuckDB BigQuery extension query timeout (milliseconds).
|
||
# # Applied via `SET bq_query_timeout_ms` after every LOAD bigquery
|
||
# # on every BQ-touching DuckDB session. Extension default is
|
||
# # 90 000 ms = 90 s, which is too tight for analyst queries against
|
||
# # view-backed datasets -- bumped to 600 000 ms = 10 min by default.
|
||
# # Set 0 to fall through to the extension default. Configurable via
|
||
# # /admin/server-config UI.
|
||
# session_pool_size: 4
|
||
# # Number of pre-warmed DuckDB+bigquery-extension sessions kept
|
||
# # in a process-local pool. Each acquire amortizes the
|
||
# # ~0.5 s INSTALL/LOAD/CREATE-SECRET cost across requests; a fresh
|
||
# # build only happens when the pool is empty. Default 4. Set 0
|
||
# # to disable pooling (every acquire builds + closes a fresh
|
||
# # session; matches pre-pool behavior).
|
||
|
||
# --- OpenMetadata catalog (optional) ---
|
||
# Enriches table and column metadata from OpenMetadata REST API.
|
||
# If not configured, app works normally without catalog enrichment.
|
||
# All openmetadata.* fields configurable via /admin/server-config UI.
|
||
# openmetadata:
|
||
# url: "https://your-catalog.example.com"
|
||
# token: "${OPENMETADATA_TOKEN}" # JWT bearer token
|
||
# cache_ttl_seconds: 3600 # Cache TTL in seconds
|
||
# verify_ssl: true # set to false ONLY for internal
|
||
# # CAs / self-signed certs; defaults
|
||
# # to true. Setting false ships the
|
||
# # JWT over an unverified channel.
|
||
|
||
# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"  # e.g., "smtp.gmail.com"
  smtp_port: 587  # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"
|
||
|
||
# --- Desktop app (optional) ---
# All desktop.* fields configurable via /admin/server-config UI (rarely changed once set).
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"
|
||
|
||
# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""
|
||
|
||
# --- Jira integration (optional) ---
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""
|
||
|
||
# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"  # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"  # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"  # Required for provider='openai_compat' (LiteLLM,
  #   OpenRouter, vLLM). Ignored when provider='anthropic'.
  #   Configurable via /admin/server-config UI.
  model: "claude-haiku-4-5-20251001"  # any model available on your provider
  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  #   Layer 1 - "json_schema" (best):
  #     The provider enforces an exact schema. Every field, type, and structure
  #     is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  #   Layer 2 - "json_object" (good):
  #     The provider guarantees valid JSON, but does not enforce a specific schema.
  #     Fields may be missing or have wrong types. Available on most providers.
  #
  #   Layer 3 - "prompt" (acceptable):
  #     The AI is asked to respond in JSON via instructions in the prompt.
  #     No technical enforcement -- the model may still return invalid JSON.
  #     Works everywhere, but least reliable.
  #
  # "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #            Use when data quality is non-negotiable.
  # "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #            Good balance of quality and compatibility.
  # "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #            Use when you'd rather get imperfect data than no data.
  structured_output: "auto"
|
||
|
||
# Legacy format (still supported, equivalent to provider: "anthropic"):
|
||
# ai:
|
||
# anthropic_api_key: "${ANTHROPIC_API_KEY}"
|
||
|
||
# Examples:
|
||
# --- LiteLLM proxy ---
|
||
# ai:
|
||
# provider: "openai_compat"
|
||
# base_url: "https://litellm.example.com"
|
||
# api_key: "${LLM_API_KEY}"
|
||
# model: "claude-haiku-4-5-20251001"
|
||
# structured_output: "strict"
|
||
#
|
||
# --- OpenRouter ---
|
||
# ai:
|
||
# provider: "openai_compat"
|
||
# base_url: "https://openrouter.ai/api/v1"
|
||
# api_key: "${OPENROUTER_API_KEY}"
|
||
# model: "anthropic/claude-3-haiku"
|
||
# structured_output: "auto"
|
||
|
||
# --- Flea-market upload guardrails (optional) ---
|
||
# Controls the pre-publish check pipeline for skill/agent/plugin uploads
|
||
# to /store. See docs/STORE_GUARDRAILS.md for the full check catalogue.
|
||
#
|
||
# guardrails:
|
||
# # Master kill-switch. When false, inline manifest/security/quality
|
||
# # checks still run (they're free) but the LLM step is skipped and new
|
||
# # uploads are auto-approved. Useful for local dev without an LLM key.
|
||
# enabled: true
|
||
#
|
||
# # Anthropic model tier for the LLM security review.
|
||
# # haiku — ~$0.001/review, default, good enough for routine uploads
|
||
# # sonnet — ~$0.015/review, deeper reasoning, fewer false negatives
|
||
# # opus — ~$0.075/review, only for high-stakes deployments
|
||
# # You can also pin a concrete model ID (e.g. "claude-haiku-4-5-20251001").
|
||
# review_model: "haiku"
|
||
#
|
||
# # Per-submitter daily cap on inline-blocked uploads. Bounds disk +
|
||
# # admin-queue spam. Set to 0 to disable. Default 50.
|
||
# blocked_quota_per_day: 50
|
||
#
|
||
# # How many days to keep blocked bundle bytes on disk before the
|
||
# # daily TTL job purges them. Submission row + sha256 + size always
|
||
# # survive — only the bundle bytes go. Set to 0 to retain forever
|
||
# # (rely on admin Delete). Default 30.
|
||
# blocked_bundle_ttl_days: 30
|
||
|
||
# --- Corporate Memory governance (optional) ---
|
||
# Controls how AI-extracted knowledge is reviewed and distributed.
|
||
# If not present, system operates in legacy mode (democratic wiki, no admin review).
|
||
#
|
||
# The corporate_memory.* schema is editable via /admin/server-config UI; you can
|
||
# also continue to manage it via this YAML file. The UI surfaces every leaf with
|
||
# a hint, so use it to discover the schema if this comment block has aged.
|
||
#
|
||
# corporate_memory:
|
||
# # How knowledge reaches users:
|
||
# # "mandatory_only" — admin controls everything, no user voting
|
||
# # "admin_curated" — admin controls, users vote as feedback signal
|
||
# # "hybrid" — mandatory from admin + optional from user voting (default)
|
||
# distribution_mode: "hybrid"
|
||
#
|
||
# # How new AI-extracted items enter the system:
|
||
# # "review_queue" — nothing published without admin approval (default)
|
||
# # "auto_publish" — items go live immediately, admin intervenes retroactively
|
||
# # "threshold" — high-confidence auto-publish, low-confidence to review queue
|
||
# approval_mode: "review_queue"
|
||
#
|
||
# # Default review period for approved/mandatory items (months)
|
||
# review_period_months: 6
|
||
#
|
||
# # Notify km_admins about new pending items
|
||
# notify_on_new_items: true
|
||
#
|
||
# # --- V1 Context Engineering ---
|
||
#
|
||
# sources:
|
||
# claude_local_md:
|
||
# enabled: true
|
||
# confidence_base: 0.50
|
||
# session_transcripts:
|
||
# enabled: true
|
||
# confidence_base: 0.60
|
||
# max_turns_per_session: 100
|
||
# detection_types:
|
||
# - correction
|
||
# - confirmation
|
||
# - unprompted_definition
|
||
#
|
||
# extraction:
|
||
# model: "claude-haiku-4-5-20251001"
|
||
# sensitivity_check: true
|
||
# contradiction_check: true
|
||
#
|
||
# confidence:
|
||
# # Base score per extraction source. Key format: "source_type" or "source_type.detection_type"
|
||
# base:
|
||
# user_verification.correction: 0.90
|
||
# user_verification.unprompted_definition: 0.90
|
||
# user_verification.confirmation: 0.60
|
||
# admin_mandate: 1.00
|
||
# claude_local_md: 0.50
|
||
# session_transcript: 0.50
|
||
# # Per-key modifier step sizes applied to base when optional signals are present.
|
||
# modifiers:
|
||
# user_verification.correction:
|
||
# additional_verifiers: 0.05 # per extra unique verifier
|
||
# user_verification.unprompted_definition:
|
||
# additional_verifiers: 0.05
|
||
# user_verification.confirmation:
|
||
# admin_confirmed: 0.20
|
||
# session_transcript:
|
||
# user_confirmed_in_session: 0.20
|
||
# # Confidence decay applied to items as they age.
|
||
# decay:
|
||
# mode: exponential # linear | exponential
|
||
# half_life_months: 12 # used when mode=exponential
|
||
# decay_rate_monthly: 0.02 # used when mode=linear
|
||
# floor:
|
||
# admin_mandate: 0.50 # admin policies don't silently decay to zero
|
||
# user_verification: 0.40 # user-verified facts never fall below 0.40
|
||
# default: 0.0
|
||
#
|
||
# contradiction_detection:
|
||
# enabled: true
|
||
# max_candidates: 10
|
||
#
|
||
# entity_resolution:
|
||
# enabled: true
|
||
# entities:
|
||
# metrics: ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"]
|
||
# products: ["Platform", "API", "Dashboard"]
|
||
#
|
||
# domain_owners:
|
||
# finance: ["cfo@company.com"]
|
||
# engineering: ["cto@company.com"]
|
||
# product: ["pm@company.com"]
|
||
#
|
||
# domains:
|
||
# - finance
|
||
# - engineering
|
||
# - product
|
||
# - data
|
||
# - operations
|
||
# - infrastructure
|
||
|
||
# --- User groups for audience targeting (optional) ---
|
||
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
|
||
#
|
||
# groups:
|
||
# finance:
|
||
# label: "Finance & Analytics"
|
||
# members: ["analyst1@company.com", "analyst2@company.com"]
|
||
# engineering:
|
||
# label: "Engineering"
|
||
# members: ["dev1@company.com", "dev2@company.com"]
|
||
|
||
# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true  # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
datasets: {}

# --- Data catalog ---
catalog:
  categories: {}
  order: []
|
||
|
||
# --- Data profiler (optional) ---
|
||
# profiler:
|
||
# sample_size: 500000 # If table > this, sample this many rows; otherwise use all
|
||
# max_categorical_distinct: 50 # Treat as categorical if unique <= this
|
||
# top_values_limit: 10 # Top values per categorical column
|
||
# histogram_bins: 15 # Bins in histogram visualizations
|
||
# sample_rows_limit: 5 # Sample rows to show in UI "Sample" tab
|
||
# alert_high_missing_pct: 30.0 # Alert threshold for high missing %
|
||
# alert_missing_pct: 5.0 # Alert threshold for missing %
|
||
# alert_imbalance_pct: 60.0 # Alert threshold for imbalance %
|
||
# alert_high_cardinality: 50 # Alert threshold for high cardinality columns
|
||
|
||
# --- Remote query (optional) ---
|
||
# Settings for remote BigQuery queries via `python -m src.remote_query`.
|
||
# Used when tables have query_mode: "remote" in data_description.md.
|
||
# remote_query:
|
||
# timeout_seconds: 300 # BQ + DuckDB query timeout
|
||
# max_result_rows: 100000 # Max rows in final output
|
||
# max_bq_registration_rows: 500000 # Max rows per --register-bq sub-query
|
||
# default_format: "table" # Default output format
|
||
# output_dir: "/tmp/remote_query" # Directory for Parquet/CSV exports
|
||
|
||
# --- v2 API knobs (optional) ---
|
||
# Controls for the /api/v2/{catalog,schema,sample,scan,scan/estimate} endpoints.
|
||
# All values are optional — the defaults shown below are applied if keys are absent.
|
||
#
|
||
# api:
|
||
# # --- Scan / fetch limits ---
|
||
# scan:
|
||
# max_limit: 10000000 # Hard row cap per /api/v2/scan request (default: 10 M)
|
||
# max_result_bytes: 2147483648 # Hard byte cap on Arrow stream response: 2 GiB (default)
|
||
# # If exceeded, partial result returned with X-Agnes-Truncated header.
|
||
# max_concurrent_per_user: 5 # In-flight /api/v2/scan requests allowed per user (default: 5)
|
||
# # Note: quota is process-local; N replicas → effective N× cap.
|
||
# max_daily_bytes_per_user: 53687091200 # Per-user daily byte quota: 50 GiB (default). Resets at UTC midnight.
|
||
# bq_cost_per_tb_usd: 5.00 # Cost rate shown in /api/v2/scan/estimate response (default: $5/TB)
|
||
# request_timeout_seconds: 300 # Server-side timeout for a single scan request (default: 300 s)
|
||
# # --- Discovery cache TTLs ---
|
||
# catalog_cache_ttl_seconds: 300 # /api/v2/catalog response cache lifetime (default: 5 min)
|
||
# schema_cache_ttl_seconds: 3600 # /api/v2/schema/{table_id} cache lifetime (default: 1 h)
|
||
# sample_cache_ttl_seconds: 3600 # /api/v2/sample/{table_id} cache lifetime (default: 1 h)
|
||
# # Admins can force-refresh via POST /api/v2/sample/{id}?refresh=true
|
||
|
||
# --- Materialize concurrency safety (optional) ---
# Concurrency safety net for the materialize path (BQ + Keboola). When
# two materialize attempts race for the same table_id, the second one
# raises MaterializeInFlightError and skips. The lock is held in a
# .parquet.lock sibling file; if a holder process is hard-killed before
# kernel-level flock release, the next attempt reclaims the lock once
# the file's mtime is older than this TTL.
#
# Default 86400 (24h) is generous on purpose — anything shorter risks
# a long-running COPY being interrupted by its own scheduler successor.
# Lower it only if you know your materialize never exceeds the new
# value AND your host has a habit of hard-killing processes.
# Min 60 (1 minute), max 604800 (7 days). Configurable via /admin/server-config UI.
materialize:
  lock_ttl_seconds: 86400
|