agnes-the-ai-analyst/config/instance.yaml.example
Petr 95358448e6 Add modular LLM connector for Corporate Memory
Replace hardwired Anthropic API calls with a pluggable provider system.
Each deployment configures its AI provider in instance.yaml — switching
between Anthropic, LiteLLM, OpenRouter, or any OpenAI-compatible proxy
is a config change, not a code change.

New connectors/llm/ module:
- StructuredExtractor Protocol with extract_json() interface
- AnthropicExtractor: direct Anthropic SDK with retry + backoff
- OpenAICompatExtractor: any OpenAI-compatible proxy with three-layer
  structured output fallback (json_schema -> json_object -> prompt)
- Configurable structured_output policy (strict/json/auto)
- Custom exception hierarchy (auth/rate_limit/timeout/format/refusal)
- Zero secrets in logs: no API keys, prompts, or responses logged

Reviewed by: Google Gemini, Claude Sonnet, OpenAI GPT-5.4.
Security audit passed with all critical findings resolved.
2026-03-23 12:08:33 +01:00

242 lines
10 KiB
Text

# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
# Non-secret values are set directly here.
# --- Instance branding ---
instance:
  # Branding strings for this instance. All keys must stay indented under
  # `instance:`; at column 0 they would become unrelated top-level keys.
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
  # logo_svg: Full <svg> element for header logo (optional, default: Keboola logo)
  # Example: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'
# --- Server ---
server:
  # Where this instance is reachable and installed.
  hostname: ""  # DNS name (e.g., "data.acme.com")
  host: ""  # IP address
  app_dir: "/opt/data-analyst"  # Installation directory
# --- Client setup (shown in "Get Started" on dashboard) ---
# ssh_alias: "data-analyst" # SSH config Host alias for analysts (default: "data-analyst")
# ssh_key: "~/.ssh/data_analyst_server" # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
# project_dir: "data-analyst" # Local project folder name (default: "data-analyst")
# --- Admin users ---
# Manage the server, own data files, get unlimited resource limits.
# SSH keys are used by server/setup.sh during provisioning.
admins:
  # One list entry per admin. Both keys belong to the same entry, so
  # `ssh_public_key` must align with `username`, not with the dash.
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."  # placeholder - replace with the real public key
# --- Deployment ---
deployment:
  method: "manual"  # manual | github_actions
  repo_url: ""  # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"
# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  # Email domain(s) for login, comma-separated
  # (e.g., "acme.com" or "acme.com, partner.org").
  allowed_domain: ""
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

  # --- Webapp username shaping ---
  #
  # By default, a user's OS account is derived from their full email:
  #   e.psimecek@acme.com -> e_psimecek_acme_com
  #
  # Two options let you control this:
  #
  # username_strip_domain: true
  #   Use only the local part of the email (before @).
  #   Safe when allowed_domain ensures all users share a single domain.
  #     e.psimecek@acme.com -> e_psimecek
  #   Keeps usernames short and readable.
  #
  # username_prefix: "myapp_"
  #   Prepend a fixed string to every webapp-created account name.
  #   Necessary when an external identity system (GCP OS Login, LDAP, SAML)
  #   already creates OS accounts in /home/ using the same naming scheme.
  #   Without a prefix, the webapp sees those existing OS accounts and refuses
  #   to register new analyst accounts ("already in use by a system account").
  #   With prefix "myapp_" and strip_domain true:
  #     e.psimecek@acme.com -> myapp_e_psimecek
  #   Linux enforces a 32-character username limit. Keep the prefix short.
  #
  # Changing or removing either option later will invalidate all existing
  # analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
  #
  # To enable an option, uncomment it and keep it indented under `auth:`.
  #
  # username_strip_domain: false
  # username_prefix: ""
  # disabled_providers:  # Hide auth methods from login page
  #   - "email"  # Disable email magic link (use when Google OAuth is configured)
# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
theme:
  # Every key below is commented out, so `theme:` currently has no value
  # (it parses as YAML null). To override a default, uncomment the key and
  # keep its two-space indent so it stays nested under `theme:`.
  # primary: "#0073D1"  # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"  # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"  # Main text color
  # text_secondary: "#6B7280"  # Muted/secondary text
  # background: "#F5F7FA"  # Page background
  # surface: "#FFFFFF"  # Card/panel background
  # border: "#E5E7EB"  # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"  # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"
# --- Data source ---
data_source:
  # `type` names the backend; the sub-map with the matching name below
  # carries that backend's settings.
  type: "keboola"  # keboola | bigquery | local

  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""  # e.g., "https://connection.keboola.com"
    project_id: ""

  bigquery:
    project: "${BIGQUERY_PROJECT}"  # GCP project for job execution/billing
    location: "${BIGQUERY_LOCATION}"  # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in data_description.md
# --- OpenMetadata catalog (optional - Groupon-specific) ---
# Enriches table and column metadata from OpenMetadata REST API.
# If not configured, app works normally without catalog enrichment.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"  # JWT bearer token
#   cache_ttl_seconds: 3600  # Cache TTL in seconds
# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  # Sender identity and SMTP relay settings.
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"  # e.g., "smtp.gmail.com"
  smtp_port: 587  # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"
# --- Desktop app (optional) ---
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"  # secret - value comes from the environment
  url_scheme: "data-analyst"
# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"  # secret - value comes from the environment
  bot_username: ""
  domain_suffix: ""
# --- Jira integration (optional) ---
jira:
  # Keep these keys nested under `jira:`. At column 0, `email` would collide
  # with the top-level `email:` (SMTP) section and most parsers would
  # silently keep only the last one.
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""
# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"  # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"  # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"  # required for openai_compat
  model: "claude-haiku-4-5-20251001"  # any model available on your provider

  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  # Layer 1 - "json_schema" (best):
  #   The provider enforces an exact schema. Every field, type, and structure
  #   is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  # Layer 2 - "json_object" (good):
  #   The provider guarantees valid JSON, but does not enforce a specific schema.
  #   Fields may be missing or have wrong types. Available on most providers.
  #
  # Layer 3 - "prompt" (acceptable):
  #   The AI is asked to respond in JSON via instructions in the prompt.
  #   No technical enforcement -- the model may still return invalid JSON.
  #   Works everywhere, but least reliable.
  #
  # Accepted values for structured_output:
  #   "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #              Use when data quality is non-negotiable.
  #   "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #              Good balance of quality and compatibility.
  #   "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #              Use when you'd rather get imperfect data than no data.
  structured_output: "auto"
# Legacy format (still supported, equivalent to provider: "anthropic"):
# ai:
#   anthropic_api_key: "${ANTHROPIC_API_KEY}"
#
# Examples:
# --- LiteLLM proxy ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://litellm.example.com"
#   api_key: "${LLM_API_KEY}"
#   model: "claude-haiku-4-5-20251001"
#   structured_output: "strict"
#
# --- OpenRouter ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://openrouter.ai/api/v1"
#   api_key: "${OPENROUTER_API_KEY}"
#   model: "anthropic/claude-3-haiku"
#   structured_output: "auto"
# --- User display (for Corporate Memory avatars) ---
# Empty mapping by default.
# NOTE(review): the per-user entry schema is not shown in this file - confirm against the project docs.
users: {}
# --- Username mapping (webapp email -> server username, only if different) ---
# Empty mapping by default. Add an entry only for accounts whose server
# username differs from the one the webapp derives from the email.
username_mapping: {}
# --- Optional datasets (sync settings UI) ---
# Empty mapping by default.
# NOTE(review): the dataset entry schema is not shown in this file - confirm against the project docs.
datasets: {}
# --- Data catalog ---
catalog:
  # Both values are explicitly empty (`{}` / `[]`) rather than bare keys,
  # which would parse as null.
  categories: {}
  order: []
# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000  # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50  # Treat as categorical if unique <= this
#   top_values_limit: 10  # Top values per categorical column
#   histogram_bins: 15  # Bins in histogram visualizations
#   sample_rows_limit: 5  # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0  # Alert threshold for high missing %
#   alert_missing_pct: 5.0  # Alert threshold for missing %
#   alert_imbalance_pct: 60.0  # Alert threshold for imbalance %
#   alert_high_cardinality: 50  # Alert threshold for high cardinality columns
# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300  # BQ + DuckDB query timeout
#   max_result_rows: 100000  # Max rows in final output
#   max_bq_registration_rows: 500000  # Max rows per --register-bq sub-query
#   default_format: "table"  # Default output format
#   output_dir: "/tmp/remote_query"  # Directory for Parquet/CSV exports