# agnes-the-ai-analyst/config/instance.yaml.example

# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
# Non-secret values are set directly here.

# --- Instance branding ---
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
  # logo_svg (optional): full <svg> element used as the header logo.
  # When omitted, the default Keboola logo is used.
  # Single-quote the value so the double quotes inside the SVG need no escaping:
  # logo_svg: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'

# --- Server ---
server:
  hostname: ""                    # DNS name (e.g., "data.acme.com")
  host: ""                        # IP address
  app_dir: "/opt/data-analyst"    # Installation directory
  # --- Client setup (shown in "Get Started" on dashboard) ---
  # All three keys are optional; uncomment one only to override its default.
  # ssh_alias: "data-analyst"     # SSH config Host alias for analysts (default: "data-analyst")
  # ssh_key: "~/.ssh/data_analyst_server"  # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
  # project_dir: "data-analyst"   # Local project folder name (default: "data-analyst")

# --- Admin users ---
# Admins manage the server, own the data files, and are not subject to resource limits.
# SSH keys are used by server/setup.sh during provisioning.
admins:
  # A list: add one "- username / ssh_public_key" item per admin.
  - username: "admin"
    # Placeholder value - replace with the admin's complete public key line.
    ssh_public_key: "ssh-ed25519 AAAA..."

# --- Deployment ---
deployment:
  method: "manual"                # manual | github_actions
  repo_url: ""                    # e.g., "git@github.com:acme/ai-data-analyst.git"
  # NOTE(review): presumably the branch of repo_url that gets deployed - confirm.
  branch: "main"

# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  allowed_domain: ""              # Email domain(s) for login, comma-separated (e.g., "acme.com" or "acme.com, partner.org")
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

  # --- Webapp username shaping ---
  #
  # By default, a user's OS account name is derived from their full email:
  #   e.psimecek@acme.com  ->  e_psimecek_acme_com
  #
  # Two options let you control this:
  #
  # username_strip_domain: true
  #   Use only the local part of the email (before @).
  #   Safe when allowed_domain ensures all users share a single domain.
  #   e.psimecek@acme.com  ->  e_psimecek
  #   Keeps usernames short and readable.
  #
  # username_prefix: "myapp_"
  #   Prepend a fixed string to every webapp-created account name.
  #   Necessary when an external identity system (GCP OS Login, LDAP, SAML)
  #   already creates OS accounts in /home/ using the same naming scheme.
  #   Without a prefix, the webapp sees those existing OS accounts and refuses
  #   to register new analyst accounts ("already in use by a system account").
  #   With prefix "myapp_" and strip_domain true:
  #     e.psimecek@acme.com  ->  myapp_e_psimecek
  #   Linux enforces a 32-character username limit, so keep the prefix short.
  #
  # WARNING: changing or removing either option later will invalidate all
  # existing analyst accounts. Use username_mapping (top-level) to bridge
  # legacy accounts.
  #
  # username_strip_domain: false
  # username_prefix: ""

  # --- Login page providers (optional) ---
  # disabled_providers:            # Hide auth methods from login page
  #   - "email"                    # Disable email magic link (use when Google OAuth is configured)

# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
# The explicit empty mapping keeps `theme` from parsing as null while every key
# below is commented out. To customize, delete the `{}` and uncomment the keys
# you need (leaving `{}` in place with uncommented keys is a YAML syntax error).
theme: {}
  # primary: "#0073D1"              # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"         # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"         # Main text color
  # text_secondary: "#6B7280"       # Muted/secondary text
  # background: "#F5F7FA"           # Page background
  # surface: "#FFFFFF"              # Card/panel background
  # border: "#E5E7EB"               # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"                   # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"

# --- Data source ---
data_source:
  # NOTE(review): presumably only the subsection matching `type` is read -
  # confirm. This example shows no `local` subsection.
  type: "keboola"                 # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""                 # e.g., "https://connection.keboola.com"
    project_id: ""                # keep the quotes: an unquoted all-digit ID parses as an integer
  bigquery:
    project: "${BIGQUERY_PROJECT}"       # GCP project for job execution/billing
    location: "${BIGQUERY_LOCATION}"     # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in data_description.md

# --- OpenMetadata catalog (optional) ---
# Enriches table and column metadata from OpenMetadata REST API.
# If not configured, app works normally without catalog enrichment.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"        # JWT bearer token
#   cache_ttl_seconds: 3600               # Cache TTL in seconds

# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"   # placeholder domain - replace with your sender address
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"          # e.g., "smtp.gmail.com"
  smtp_port: 587                     # 587 for STARTTLS, 465 for SSL (unquoted, so it parses as an integer)
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"

# --- Desktop app (optional) ---
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"   # secret: set DESKTOP_JWT_SECRET in .env, not here
  url_scheme: "data-analyst"

# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"    # secret: set TELEGRAM_BOT_TOKEN in .env, not here
  bot_username: ""
  # NOTE(review): the purpose of domain_suffix is not described in this file - document it.
  domain_suffix: ""

# --- Jira integration (optional) ---
jira:
  # Non-secret values are set directly here; the ${...} values are secrets
  # whose actual values go in the .env file.
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  # NOTE(review): sla_email / sla_api_token look like a second credential pair
  # alongside email / api_token - confirm what it is used for and document it.
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""

# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"                    # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"          # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"  # required for openai_compat
  model: "claude-haiku-4-5-20251001"       # any model available on your provider
  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  # Layer 1 - "json_schema" (best):
  #   The provider enforces an exact schema. Every field, type, and structure
  #   is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  # Layer 2 - "json_object" (good):
  #   The provider guarantees valid JSON, but does not enforce a specific schema.
  #   Fields may be missing or have wrong types. Available on most providers.
  #
  # Layer 3 - "prompt" (acceptable):
  #   The AI is asked to respond in JSON via instructions in the prompt.
  #   No technical enforcement -- the model may still return invalid JSON.
  #   Works everywhere, but least reliable.
  #
  # The structured_output setting selects which layers may be used:
  #
  # "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #            Use when data quality is non-negotiable.
  # "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #            Good balance of quality and compatibility.
  # "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #            Use when you'd rather get imperfect data than no data.
  #
  # Allowed values: "strict" | "json" | "auto"
  structured_output: "auto"

# Legacy format (still supported, equivalent to provider: "anthropic"):
# ai:
#   anthropic_api_key: "${ANTHROPIC_API_KEY}"

# Examples:
# --- LiteLLM proxy ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://litellm.example.com"
#   api_key: "${LLM_API_KEY}"
#   model: "claude-haiku-4-5-20251001"
#   structured_output: "strict"
#
# --- OpenRouter ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://openrouter.ai/api/v1"
#   api_key: "${OPENROUTER_API_KEY}"
#   model: "anthropic/claude-3-haiku"
#   structured_output: "auto"

# --- Corporate Memory governance (optional) ---
# Controls how AI-extracted knowledge is reviewed and distributed.
# If not present, system operates in legacy mode (democratic wiki, no admin review).
#
# corporate_memory:
#   # How knowledge reaches users:
#   # "mandatory_only" — admin controls everything, no user voting
#   # "admin_curated" — admin controls, users vote as feedback signal
#   # "hybrid" — mandatory from admin + optional from user voting (default)
#   distribution_mode: "hybrid"
#
#   # How new AI-extracted items enter the system:
#   # "review_queue" — nothing published without admin approval (default)
#   # "auto_publish" — items go live immediately, admin intervenes retroactively
#   # "threshold" — high-confidence auto-publish, low-confidence to review queue
#   approval_mode: "review_queue"
#
#   # Default review period for approved/mandatory items (months)
#   review_period_months: 6
#
#   # Notify km_admins about new pending items
#   notify_on_new_items: true

# --- User groups for audience targeting (optional) ---
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
#
# groups:
#   finance:
#     label: "Finance & Analytics"
#     members: ["analyst1@company.com", "analyst2@company.com"]
#   engineering:
#     label: "Engineering"
#     members: ["dev1@company.com", "dev2@company.com"]

# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true              # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
# Empty by default. When enabling the commented example above, replace this
# line instead of adding a second `users:` key (duplicate keys are invalid YAML).
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
# Empty by default. Example entry:
#   username_mapping:
#     "jane.doe@acme.com": "jdoe"
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
# Empty mapping by default.
datasets: {}

# --- Data catalog ---
# Both keys are empty by default (`categories` is a mapping, `order` is a list).
catalog:
  categories: {}
  order: []

# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000              # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50     # Treat as categorical if unique <= this
#   top_values_limit: 10             # Top values per categorical column
#   histogram_bins: 15               # Bins in histogram visualizations
#   sample_rows_limit: 5             # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0     # Alert threshold for high missing %
#   alert_missing_pct: 5.0           # Alert threshold for missing %
#   alert_imbalance_pct: 60.0        # Alert threshold for imbalance %
#   alert_high_cardinality: 50       # Alert threshold for high cardinality columns

# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300              # BQ + DuckDB query timeout
#   max_result_rows: 100000           # Max rows in final output
#   max_bq_registration_rows: 500000  # Max rows per --register-bq sub-query
#   default_format: "table"           # Default output format
#   output_dir: "/tmp/remote_query"   # Directory for Parquet/CSV exports