# Replace hardwired Anthropic API calls with a pluggable provider system.
# Each deployment configures its AI provider in instance.yaml — switching
# between Anthropic, LiteLLM, OpenRouter, or any OpenAI-compatible proxy is
# a config change, not a code change.
#
# New connectors/llm/ module:
#   - StructuredExtractor Protocol with extract_json() interface
#   - AnthropicExtractor: direct Anthropic SDK with retry + backoff
#   - OpenAICompatExtractor: any OpenAI-compatible proxy with three-layer
#     structured output fallback (json_schema -> json_object -> prompt)
#   - Configurable structured_output policy (strict/json/auto)
#   - Custom exception hierarchy (auth/rate_limit/timeout/format/refusal)
#   - Zero secrets in logs: no API keys, prompts, or responses logged
#
# Reviewed by: Google Gemini, Claude Sonnet, OpenAI GPT-5.4.
# Security audit passed with all critical findings resolved.
#
# 242 lines, 10 KiB, Text
# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in the .env file.
# Non-secret values are set directly here.

# --- Instance branding ---
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
  # logo_svg: Full <svg> element for header logo (optional, default: Keboola logo)
  #   Example: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'

# --- Server ---
server:
  hostname: ""                    # DNS name (e.g., "data.acme.com")
  host: ""                        # IP address
  app_dir: "/opt/data-analyst"    # Installation directory
  # --- Client setup (shown in "Get Started" on dashboard) ---
  # ssh_alias: "data-analyst"               # SSH config Host alias for analysts (default: "data-analyst")
  # ssh_key: "~/.ssh/data_analyst_server"   # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
  # project_dir: "data-analyst"             # Local project folder name (default: "data-analyst")

# --- Admin users ---
# Admins manage the server, own the data files, and are exempt from resource limits.
# SSH keys are used by server/setup.sh during provisioning.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."

# --- Deployment ---
deployment:
  method: "manual"    # manual | github_actions
  repo_url: ""        # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"

# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  allowed_domain: ""    # Email domain(s) for login, comma-separated (e.g., "acme.com" or "acme.com, partner.org")
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

  # --- Webapp username shaping ---
  #
  # By default, a user's OS account is derived from their full email:
  #   e.psimecek@acme.com -> e_psimecek_acme_com
  #
  # Two options let you control this:
  #
  # username_strip_domain: true
  #   Use only the local part of the email (before @).
  #   Safe when allowed_domain ensures all users share a single domain.
  #     e.psimecek@acme.com -> e_psimecek
  #   Keeps usernames short and readable.
  #
  # username_prefix: "myapp_"
  #   Prepend a fixed string to every webapp-created account name.
  #   Necessary when an external identity system (GCP OS Login, LDAP, SAML)
  #   already creates OS accounts in /home/ using the same naming scheme.
  #   Without a prefix, the webapp sees those existing OS accounts and refuses
  #   to register new analyst accounts ("already in use by a system account").
  #   With prefix "myapp_" and strip_domain true:
  #     e.psimecek@acme.com -> myapp_e_psimecek
  #   Linux enforces a 32-character username limit. Keep the prefix short.
  #   Changing or removing either option later will invalidate all existing
  #   analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
  #
  # username_strip_domain: false
  # username_prefix: ""
  # disabled_providers:    # Hide auth methods from login page
  #   - "email"            # Disable email magic link (use when Google OAuth is configured)

# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
# Every option below is commented out, so `theme:` itself parses as null
# until at least one option is enabled.
theme:
  # primary: "#0073D1"                        # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"                   # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"   # Light tint backgrounds
  # text_primary: "#1A253C"                   # Main text color
  # text_secondary: "#6B7280"                 # Muted/secondary text
  # background: "#F5F7FA"                     # Page background
  # surface: "#FFFFFF"                        # Card/panel background
  # border: "#E5E7EB"                         # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"                             # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"

# --- Data source ---
data_source:
  type: "keboola"    # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""    # e.g., "https://connection.keboola.com"
    project_id: ""
  bigquery:
    project: "${BIGQUERY_PROJECT}"      # GCP project for job execution/billing
    location: "${BIGQUERY_LOCATION}"    # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in data_description.md

# --- OpenMetadata catalog (optional - Groupon-specific) ---
# Enriches table and column metadata from the OpenMetadata REST API.
# If not configured, the app works normally without catalog enrichment.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"    # JWT bearer token
#   cache_ttl_seconds: 3600           # Cache TTL in seconds

# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in the browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"    # e.g., "smtp.gmail.com"
  smtp_port: 587               # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"

# --- Desktop app (optional) ---
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"

# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""

# --- Jira integration (optional) ---
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""

# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"                       # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"             # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"   # required for openai_compat
  model: "claude-haiku-4-5-20251001"          # any model available on your provider
  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  # Layer 1 - "json_schema" (best):
  #   The provider enforces an exact schema. Every field, type, and structure
  #   is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  # Layer 2 - "json_object" (good):
  #   The provider guarantees valid JSON, but does not enforce a specific schema.
  #   Fields may be missing or have wrong types. Available on most providers.
  #
  # Layer 3 - "prompt" (acceptable):
  #   The AI is asked to respond in JSON via instructions in the prompt.
  #   No technical enforcement -- the model may still return invalid JSON.
  #   Works everywhere, but least reliable.
  #
  # "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #            Use when data quality is non-negotiable.
  # "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #            Good balance of quality and compatibility.
  # "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #            Use when you'd rather get imperfect data than no data.
  structured_output: "auto"

# Legacy format (still supported, equivalent to provider: "anthropic"):
# ai:
#   anthropic_api_key: "${ANTHROPIC_API_KEY}"

# Examples:
# --- LiteLLM proxy ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://litellm.example.com"
#   api_key: "${LLM_API_KEY}"
#   model: "claude-haiku-4-5-20251001"
#   structured_output: "strict"
#
# --- OpenRouter ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://openrouter.ai/api/v1"
#   api_key: "${OPENROUTER_API_KEY}"
#   model: "anthropic/claude-3-haiku"
#   structured_output: "auto"

# --- User display (for Corporate Memory avatars) ---
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
datasets: {}

# --- Data catalog ---
catalog:
  categories: {}
  order: []

# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000             # If a table has more rows than this, sample this many rows; otherwise use all rows
#   max_categorical_distinct: 50    # Treat a column as categorical if its number of unique values is <= this
#   top_values_limit: 10            # Top values per categorical column
#   histogram_bins: 15              # Bins in histogram visualizations
#   sample_rows_limit: 5            # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0    # Alert threshold for high missing %
#   alert_missing_pct: 5.0          # Alert threshold for missing %
#   alert_imbalance_pct: 60.0       # Alert threshold for imbalance %
#   alert_high_cardinality: 50      # Alert threshold for high cardinality columns

# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300                # BQ + DuckDB query timeout
#   max_result_rows: 100000             # Max rows in final output
#   max_bq_registration_rows: 500000    # Max rows per --register-bq sub-query
#   default_format: "table"             # Default output format
#   output_dir: "/tmp/remote_query"     # Directory for Parquet/CSV exports