# agnes-the-ai-analyst/config/instance.yaml.example

# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
# Non-secret values are set directly here.

# --- Config version ---
# Incremented when the config schema changes. Must match SUPPORTED_CONFIG_VERSIONS
# in config/loader.py. Currently only version 1 is supported.
config_version: 1

# --- Instance branding ---
# Branding values for this instance; commented-out keys are optional.
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
  # logo_svg: Full <svg> element for header logo (optional, default: Keboola logo)
  # Wrap the value in single quotes, as in the example, so the double quotes
  # inside the SVG markup need no escaping.
  # Example: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'
  # sync_interval: "1 hour"          # Cadence shown in analyst CLAUDE.md (e.g., "1 hour", "30 minutes", "daily")

# --- Server ---
# Empty strings below are placeholders; fill them in for your deployment.
server:
  hostname: ""                    # DNS name (e.g., "data.acme.com")
  host: ""                        # IP address
  app_dir: "/opt/data-analyst"    # Installation directory
  # --- Client setup (shown in "Get Started" on dashboard) ---
  # Each key is optional; uncomment it only to override the default noted on its line.
  # ssh_alias: "data-analyst"     # SSH config Host alias for analysts (default: "data-analyst")
  # ssh_key: "~/.ssh/data_analyst_server"  # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
  # project_dir: "data-analyst"   # Local project folder name (default: "data-analyst")

# --- Admin users ---
# Manage the server, own data files, get unlimited resource limits.
# SSH keys are used by server/setup.sh during provisioning.
# One list entry per admin: repeat the username / ssh_public_key pair to add more.
# The key shown is a truncated placeholder; replace it with the full public key.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."

# --- Deployment ---
# `repo_url` is an empty placeholder; the inline example shows the SSH clone-URL form.
deployment:
  method: "manual"                # manual | github_actions
  repo_url: ""                    # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"

# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  allowed_domain: ""              # Email domain(s) for login, comma-separated (e.g., "acme.com" or "acme.com, partner.org")
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  # NOTE(review): both keys are present in this example and resolve from .env --
  # confirm that an unset or empty environment variable counts as "not set".
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

  # --- Webapp username shaping ---
  #
  # By default, a user's OS account is derived from their full email:
  #   e.psimecek@acme.com  ->  e_psimecek_acme_com
  #
  # Two options let you control this:
  #
  # username_strip_domain: true
  #   Use only the local part of the email (before @).
  #   Safe when allowed_domain ensures all users share a single domain.
  #   e.psimecek@acme.com  ->  e_psimecek
  #   Keeps usernames short and readable.
  #
  # username_prefix: "myapp_"
  #   Prepend a fixed string to every webapp-created account name.
  #   Necessary when an external identity system (GCP OS Login, LDAP, SAML)
  #   already creates OS accounts in /home/ using the same naming scheme.
  #   Without a prefix, the webapp sees those existing OS accounts and refuses
  #   to register new analyst accounts ("already in use by a system account").
  #   With username_prefix: "myapp_" and username_strip_domain: true:
  #     e.psimecek@acme.com  ->  myapp_e_psimecek
  #   Linux enforces a 32-character username limit. Keep the prefix short.
  #   Changing or removing either option later will invalidate all existing
  #   analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
  #
  # Uncomment and edit to change the username shaping described above:
  # username_strip_domain: false
  # username_prefix: ""
  # disabled_providers:            # Hide auth methods from login page
  #   - "email"                    # Disable email magic link (use when Google OAuth is configured)
  # NOTE(review): email magic link and Google OAuth are the only login methods
  # described in this file, so disabling "email" without working Google
  # credentials would presumably leave no way to log in -- confirm.

# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
# Keep color values quoted: an unquoted leading `#` would start a YAML comment.
# NOTE(review): while every key below stays commented out, `theme:` parses as
# null rather than an empty mapping -- confirm the loader tolerates that.
theme:
  # primary: "#0073D1"              # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"         # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"         # Main text color
  # text_secondary: "#6B7280"       # Muted/secondary text
  # background: "#F5F7FA"           # Page background
  # surface: "#FFFFFF"              # Card/panel background
  # border: "#E5E7EB"               # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"                   # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"

# --- Data source ---
# `type` names the active backend. NOTE(review): presumably only the sub-block
# matching `type` is read -- confirm. No `local:` sub-block is shown in this example.
data_source:
  type: "keboola"                 # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""                 # e.g., "https://connection.keboola.com"
    project_id: ""
  bigquery:
    project: "${BIGQUERY_PROJECT}"       # GCP project hosting the data (used in FROM clause)
    location: "${BIGQUERY_LOCATION}"     # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in data_description.md
    # billing_project: "prj-billing"     # GCP project to bill BQ jobs to / submit jobs from.
    #                                    # Defaults to `project`. Set when the SA has bigquery.data.* on
    #                                    # the data project but lacks serviceusage.services.use there.
    #                                    # Mismatch -> every BQ call 403 USER_PROJECT_DENIED.
    #                                    # `da diagnose` warns when this falls back to `project`.
    #                                    # Configurable via /admin/server-config UI.
    # max_bytes_per_materialize: 10737418240
    #                                    # Cost guardrail (bytes) for query_mode='materialized' BQ scans.
    #                                    # Dry-run check before running; exceeding -> registration / sync
    #                                    # rejected. Default 10 GiB (10737418240). Set 0 to disable.
    #                                    # null falls through to default. Configurable via /admin/server-config UI.
    # query_timeout_ms: 600000
    #                                    # DuckDB BigQuery extension query timeout (milliseconds).
    #                                    # Applied via `SET bq_query_timeout_ms` after every LOAD bigquery
    #                                    # on every BQ-touching DuckDB session. Extension default is
    #                                    # 90000 ms (90 s), which is too tight for analyst queries against
    #                                    # view-backed datasets -- bumped to 600000 ms (10 min) by default.
    #                                    # Set 0 to fall through to the extension default. Configurable via
    #                                    # /admin/server-config UI.
    # session_pool_size: 4
    #                                    # Number of pre-warmed DuckDB+bigquery-extension sessions kept
    #                                    # in a process-local pool. Each acquire amortizes the
    #                                    # ~0.5 s INSTALL/LOAD/CREATE-SECRET cost across requests; a fresh
    #                                    # build only happens when the pool is empty. Default 4. Set 0
    #                                    # to disable pooling (every acquire builds + closes a fresh
    #                                    # session; matches pre-pool behavior).

# --- OpenMetadata catalog (optional) ---
# Enriches table and column metadata from OpenMetadata REST API.
# If not configured, app works normally without catalog enrichment.
# All openmetadata.* fields configurable via /admin/server-config UI.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"        # JWT bearer token
#   cache_ttl_seconds: 3600               # Cache TTL in seconds
#   verify_ssl: true                       # set to false ONLY for internal
#                                          # CAs / self-signed certs; defaults
#                                          # to true. Setting false ships the
#                                          # JWT over an unverified channel.

# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
# smtp_host, smtp_user and smtp_password resolve from the .env file.
# NOTE(review): confirm that leaving those variables unset is what selects the
# browser-display development mode described above.
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"          # e.g., "smtp.gmail.com"
  smtp_port: 587                     # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"

# --- Desktop app (optional) ---
# All desktop.* fields configurable via /admin/server-config UI (rarely changed once set).
# jwt_secret is a secret: its value comes from DESKTOP_JWT_SECRET in the .env file.
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"

# --- Telegram notifications (optional) ---
# bot_token is a secret resolved from TELEGRAM_BOT_TOKEN in the .env file;
# bot_username and domain_suffix are empty placeholders.
# NOTE(review): domain_suffix is not described anywhere in this file -- document its purpose.
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""

# --- Jira integration (optional) ---
# Secrets (api_token, webhook_secret, sla_api_token) resolve from the .env file;
# the remaining fields are empty placeholders.
# NOTE(review): sla_email / sla_api_token look like a second credential pair --
# document when they are needed in addition to email / api_token.
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""

# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
# api_key is a secret: its value comes from the .env file.
ai:
  provider: "anthropic"                    # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"          # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"
  #   Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM).
  #   Ignored when provider='anthropic'. Configurable via /admin/server-config UI.
  model: "claude-haiku-4-5-20251001"       # any model available on your provider
  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  # Layer 1 - "json_schema" (best):
  #   The provider enforces an exact schema. Every field, type, and structure
  #   is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  # Layer 2 - "json_object" (good):
  #   The provider guarantees valid JSON, but does not enforce a specific schema.
  #   Fields may be missing or have wrong types. Available on most providers.
  #
  # Layer 3 - "prompt" (acceptable):
  #   The AI is asked to respond in JSON via instructions in the prompt.
  #   No technical enforcement -- the model may still return invalid JSON.
  #   Works everywhere, but least reliable.
  #
  # "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #            Use when data quality is non-negotiable.
  # "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #            Good balance of quality and compatibility.
  # "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #            Use when you'd rather get imperfect data than no data.
  # Allowed values: "strict" | "json" | "auto" (described above).
  structured_output: "auto"

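# Legacy format (still supported, equivalent to provider: "anthropic").
# Use it INSTEAD of the `ai:` block above, never alongside it: a second
# top-level `ai:` key is a duplicate, and most parsers silently keep only the last one.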
# ai:
#   anthropic_api_key: "${ANTHROPIC_API_KEY}"

# Examples (each one replaces the `ai:` block above; do not add a second `ai:` key):
# --- LiteLLM proxy ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://litellm.example.com"
#   api_key: "${LLM_API_KEY}"
#   model: "claude-haiku-4-5-20251001"
#   structured_output: "strict"
#
# --- OpenRouter ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://openrouter.ai/api/v1"
#   api_key: "${OPENROUTER_API_KEY}"
#   model: "anthropic/claude-3-haiku"
#   structured_output: "auto"

# --- Flea-market upload guardrails (optional) ---
# Controls the pre-publish check pipeline for skill/agent/plugin uploads
# to /store. See docs/STORE_GUARDRAILS.md for the full check catalogue.
#
# guardrails:
#   # Master kill-switch. When false, inline manifest/security/quality
#   # checks still run (they're free) but the LLM step is skipped and new
#   # uploads are auto-approved. Useful for local dev without an LLM key.
#   enabled: true
#
#   # Anthropic model tier for the LLM security review.
#   # haiku  — ~$0.001/review, default, good enough for routine uploads
#   # sonnet — ~$0.015/review, deeper reasoning, fewer false negatives
#   # opus   — ~$0.075/review, only for high-stakes deployments
#   # You can also pin a concrete model ID (e.g. "claude-haiku-4-5-20251001").
#   review_model: "haiku"
#
#   # Per-submitter daily cap on inline-blocked uploads. Bounds disk +
#   # admin-queue spam. Set to 0 to disable. Default 50.
#   blocked_quota_per_day: 50
#
#   # How many days to keep blocked bundle bytes on disk before the
#   # daily TTL job purges them. Submission row + sha256 + size always
#   # survive — only the bundle bytes go. Set to 0 to retain forever
#   # (rely on admin Delete). Default 30.
#   blocked_bundle_ttl_days: 30

# --- Corporate Memory governance (optional) ---
# Controls how AI-extracted knowledge is reviewed and distributed.
# If not present, system operates in legacy mode (democratic wiki, no admin review).
#
# The corporate_memory.* schema is editable via /admin/server-config UI; you can
# also continue to manage it via this YAML file. The UI surfaces every leaf with
# a hint, so use it to discover the schema if this comment block has aged.
#
# corporate_memory:
#   # How knowledge reaches users:
#   # "mandatory_only" — admin controls everything, no user voting
#   # "admin_curated" — admin controls, users vote as feedback signal
#   # "hybrid" — mandatory from admin + optional from user voting (default)
#   distribution_mode: "hybrid"
#
#   # How new AI-extracted items enter the system:
#   # "review_queue" — nothing published without admin approval (default)
#   # "auto_publish" — items go live immediately, admin intervenes retroactively
#   # "threshold" — high-confidence auto-publish, low-confidence to review queue
#   approval_mode: "review_queue"
#
#   # Default review period for approved/mandatory items (months)
#   review_period_months: 6
#
#   # Notify km_admins about new pending items
#   notify_on_new_items: true
#
#   # --- V1 Context Engineering ---
#
#   sources:
#     claude_local_md:
#       enabled: true
#       confidence_base: 0.50
#     session_transcripts:
#       enabled: true
#       confidence_base: 0.60
#       max_turns_per_session: 100
#       detection_types:
#         - correction
#         - confirmation
#         - unprompted_definition
#
#   extraction:
#     model: "claude-haiku-4-5-20251001"
#     sensitivity_check: true
#     contradiction_check: true
#
#   confidence:
#     # Base score per extraction source. Key format: "source_type" or "source_type.detection_type"
#     base:
#       user_verification.correction: 0.90
#       user_verification.unprompted_definition: 0.90
#       user_verification.confirmation: 0.60
#       admin_mandate: 1.00
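#       claude_local_md: 0.50
#       session_transcript: 0.50
#       # NOTE(review): `sources.session_transcripts.confidence_base` above is 0.60 and
#       # uses the plural key, while this entry is 0.50 and singular -- confirm which
#       # value and spelling the loader actually reads.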
#     # Per-key modifier step sizes applied to base when optional signals are present.
#     modifiers:
#       user_verification.correction:
#         additional_verifiers: 0.05    # per extra unique verifier
#       user_verification.unprompted_definition:
#         additional_verifiers: 0.05
#       user_verification.confirmation:
#         admin_confirmed: 0.20
#       session_transcript:
#         user_confirmed_in_session: 0.20
#     # Confidence decay applied to items as they age.
#     decay:
#       mode: exponential               # linear | exponential
#       half_life_months: 12            # used when mode=exponential
#       decay_rate_monthly: 0.02        # used when mode=linear
#       floor:
#         admin_mandate: 0.50           # admin policies don't silently decay to zero
#         user_verification: 0.40       # user-verified facts never fall below 0.40
#         default: 0.0
#
#   contradiction_detection:
#     enabled: true
#     max_candidates: 10
#
#   entity_resolution:
#     enabled: true
#     entities:
#       metrics: ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"]
#       products: ["Platform", "API", "Dashboard"]
#
#   domain_owners:
#     finance: ["cfo@company.com"]
#     engineering: ["cto@company.com"]
#     product: ["pm@company.com"]
#
#   domains:
#     - finance
#     - engineering
#     - product
#     - data
#     - operations
#     - infrastructure

# --- User groups for audience targeting (optional) ---
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
#
# groups:
#   finance:
#     label: "Finance & Analytics"
#     members: ["analyst1@company.com", "analyst2@company.com"]
#   engineering:
#     label: "Engineering"
#     members: ["dev1@company.com", "dev2@company.com"]

# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# To use the example below, uncomment it AND delete the `users: {}` line;
# keeping both would define the top-level `users` key twice.
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true              # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
# `{}` is an explicit empty mapping. To add entries, replace it with a block
# mapping of the shape named in the heading, e.g.:
# username_mapping:
#   "jane.doe@acme.com": "jdoe"
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
# Empty by default.
# NOTE(review): the per-dataset schema is not shown in this example -- document it or link to it.
datasets: {}

# --- Data catalog ---
# Both keys are explicitly empty (`{}` mapping, `[]` list); a bare `key:` would load as null instead.
catalog:
  categories: {}
  order: []

# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000              # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50     # Treat as categorical if unique <= this
#   top_values_limit: 10             # Top values per categorical column
#   histogram_bins: 15               # Bins in histogram visualizations
#   sample_rows_limit: 5             # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0     # Alert threshold for high missing %
#   alert_missing_pct: 5.0           # Alert threshold for missing %
#   alert_imbalance_pct: 60.0        # Alert threshold for imbalance %
#   alert_high_cardinality: 50       # Alert threshold for high cardinality columns

# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300              # BQ + DuckDB query timeout
#   max_result_rows: 100000           # Max rows in final output
#   max_bq_registration_rows: 500000  # Max rows per --register-bq sub-query
#   default_format: "table"           # Default output format
#   output_dir: "/tmp/remote_query"   # Directory for Parquet/CSV exports

# --- v2 API knobs (optional) ---
# Controls for the /api/v2/{catalog,schema,sample,scan,scan/estimate} endpoints.
# All values are optional — the defaults shown below are applied if keys are absent.
#
# api:
#   # --- Scan / fetch limits ---
#   scan:
#     max_limit: 10000000             # Hard row cap per /api/v2/scan request (default: 10 M)
#     max_result_bytes: 2147483648    # Hard byte cap on Arrow stream response: 2 GiB (default)
#                                     # If exceeded, partial result returned with X-Agnes-Truncated header.
#     max_concurrent_per_user: 5      # In-flight /api/v2/scan requests allowed per user (default: 5)
#                                     # Note: quota is process-local; N replicas → effective N× cap.
#     max_daily_bytes_per_user: 53687091200  # Per-user daily byte quota: 50 GiB (default). Resets at UTC midnight.
#     bq_cost_per_tb_usd: 5.00        # Cost rate shown in /api/v2/scan/estimate response (default: $5/TB)
#     request_timeout_seconds: 300    # Server-side timeout for a single scan request (default: 300 s)
#   # --- Discovery cache TTLs ---
#   catalog_cache_ttl_seconds: 300    # /api/v2/catalog response cache lifetime (default: 5 min)
#   schema_cache_ttl_seconds: 3600    # /api/v2/schema/{table_id} cache lifetime (default: 1 h)
#   sample_cache_ttl_seconds: 3600    # /api/v2/sample/{table_id} cache lifetime (default: 1 h)
#                                     # Admins can force-refresh via POST /api/v2/sample/{id}?refresh=true

# --- Materialize concurrency safety (optional) ---
# Concurrency safety net for the materialize path (BQ + Keboola). When
# two materialize attempts race for the same table_id, the second one
# raises MaterializeInFlightError and skips. The lock is held in a
# .parquet.lock sibling file; if a holder process is hard-killed before
# kernel-level flock release, the next attempt reclaims the lock once
# the file's mtime is older than this TTL.
#
# Default 86400 (24h) is generous on purpose — anything shorter risks
# a long-running COPY being interrupted by its own scheduler successor.
# Lower it only if you know your materialize never exceeds the new
# value AND your host has a habit of hard-killing processes.
# Min 60 (1 minute), max 604800 (7 days). Configurable via /admin/server-config UI.
materialize:
  lock_ttl_seconds: 86400