Replaces the BigQuery wrap-view pattern with a discovery + scoped-fetch toolkit driven by the analyst's Claude session. Adds /api/v2/{catalog,schema,sample,scan,scan/estimate}, da catalog/schema/describe/fetch/snapshot/disk-info CLI commands, sqlglot-backed WHERE validator, process-local quota tracker, agent rails skill (cli/skills/agnes-data-querying.md). BREAKING: BQ wrap views off by default — set data_source.bigquery.legacy_wrap_views=true for one cycle. Backward-compat field_validator on primary_key. Catalog cache now matches documented 300s TTL with RBAC fresh per request. Cuts release v0.14.0.
311 lines
14 KiB
Text
311 lines
14 KiB
Text
# AI Data Analyst - Instance Configuration
|
||
# ==========================================
|
||
# This is the main configuration file for your instance.
|
||
# Copy to instance.yaml and fill in your values.
|
||
#
|
||
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
|
||
# Non-secret values are set directly here.
|
||
|
||
# --- Instance branding ---
# Display strings for this instance (name, subtitle, copyright notice).
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
  # logo_svg: Full <svg> element for header logo (optional, default: Keboola logo)
  # Example: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'
|
||
|
||
# --- Server ---
server:
  hostname: ""  # DNS name (e.g., "data.acme.com")
  host: ""  # IP address
  app_dir: "/opt/data-analyst"  # Installation directory
|
||
# --- Client setup (shown in "Get Started" on dashboard) ---
|
||
# ssh_alias: "data-analyst" # SSH config Host alias for analysts (default: "data-analyst")
|
||
# ssh_key: "~/.ssh/data_analyst_server" # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
|
||
# project_dir: "data-analyst" # Local project folder name (default: "data-analyst")
|
||
|
||
# --- Admin users ---
# Admins manage the server, own the data files, and have no resource limits.
# SSH keys are used by server/setup.sh during provisioning.
# One list item per admin; each item carries both keys below.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."
|
||
|
||
# --- Deployment ---
deployment:
  method: "manual"  # manual | github_actions
  repo_url: ""  # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"
|
||
|
||
# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  allowed_domain: ""  # Email domain(s) for login, comma-separated (e.g., "acme.com" or "acme.com, partner.org")
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"
|
||
|
||
# --- Webapp username shaping ---
|
||
#
|
||
# By default, a user's OS account is derived from their full email:
|
||
# e.psimecek@acme.com -> e_psimecek_acme_com
|
||
#
|
||
# Two options let you control this:
|
||
#
|
||
# username_strip_domain: true
|
||
# Use only the local part of the email (before @).
|
||
# Safe when allowed_domain ensures all users share a single domain.
|
||
# e.psimecek@acme.com -> e_psimecek
|
||
# Keeps usernames short and readable.
|
||
#
|
||
# username_prefix: "myapp_"
|
||
# Prepend a fixed string to every webapp-created account name.
|
||
# Necessary when an external identity system (GCP OS Login, LDAP, SAML)
|
||
# already creates OS accounts in /home/ using the same naming scheme.
|
||
# Without a prefix, the webapp sees those existing OS accounts and refuses
|
||
# to register new analyst accounts ("already in use by a system account").
|
||
# With prefix "myapp_" and strip_domain true:
|
||
# e.psimecek@acme.com -> myapp_e_psimecek
|
||
# Linux enforces a 32-character username limit. Keep the prefix short.
|
||
# Changing or removing either option later will invalidate all existing
|
||
# analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
|
||
#
|
||
# username_strip_domain: false
|
||
# username_prefix: ""
|
||
# disabled_providers: # Hide auth methods from login page
|
||
# - "email" # Disable email magic link (use when Google OAuth is configured)
|
||
|
||
# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
# NOTE: while every key below stays commented out, `theme` loads as null
# (not an empty mapping). Uncomment keys in place, keeping the 2-space indent.
theme:
  # primary: "#0073D1"  # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"  # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"  # Main text color
  # text_secondary: "#6B7280"  # Muted/secondary text
  # background: "#F5F7FA"  # Page background
  # surface: "#FFFFFF"  # Card/panel background
  # border: "#E5E7EB"  # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"  # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"
|
||
|
||
# --- Data source ---
# `type` names the backend; the `keboola` and `bigquery` sub-mappings hold
# the per-backend settings.
data_source:
  type: "keboola"  # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""  # e.g., "https://connection.keboola.com"
    project_id: ""
  bigquery:
    project: "${BIGQUERY_PROJECT}"  # GCP project hosting the data (used in FROM clause)
    location: "${BIGQUERY_LOCATION}"  # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in data_description.md
    # billing_project: ""  # Optional: GCP project to bill BQ jobs to / submit jobs from.
    #                      # Defaults to `project`. Set this when the SA has bigquery.data.* on
    #                      # the data project but lacks serviceusage.services.use there (i.e.,
    #                      # cross-project read pattern). Submission/billing target must be a
    #                      # project the SA can use; data project just needs read.
    # legacy_wrap_views: false  # Set true to restore pre-v2 wrap views for BQ VIEW/MATERIALIZED_VIEW
    #                           # tables in analytics.duckdb (migration escape hatch; default: false)
|
||
|
||
# --- OpenMetadata catalog (optional) ---
|
||
# Enriches table and column metadata from OpenMetadata REST API.
|
||
# If not configured, app works normally without catalog enrichment.
|
||
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"  # JWT bearer token
#   cache_ttl_seconds: 3600  # Cache TTL in seconds
|
||
|
||
# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"  # e.g., "smtp.gmail.com"
  smtp_port: 587  # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"
|
||
|
||
# --- Desktop app (optional) ---
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"
|
||
|
||
# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""
|
||
|
||
# --- Jira integration (optional) ---
# Secrets (API tokens, webhook secret) come from the environment via ${...}.
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""
|
||
|
||
# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"  # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"  # or "${LLM_API_KEY}" for proxy
  # base_url: "https://litellm.example.com"  # required for openai_compat
  model: "claude-haiku-4-5-20251001"  # any model available on your provider
  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  # Layer 1 - "json_schema" (best):
  #   The provider enforces an exact schema. Every field, type, and structure
  #   is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  # Layer 2 - "json_object" (good):
  #   The provider guarantees valid JSON, but does not enforce a specific schema.
  #   Fields may be missing or have wrong types. Available on most providers.
  #
  # Layer 3 - "prompt" (acceptable):
  #   The AI is asked to respond in JSON via instructions in the prompt.
  #   No technical enforcement -- the model may still return invalid JSON.
  #   Works everywhere, but least reliable.
  #
  # "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #            Use when data quality is non-negotiable.
  # "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #            Good balance of quality and compatibility.
  # "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #            Use when you'd rather get imperfect data than no data.
  structured_output: "auto"
|
||
|
||
# Legacy format (still supported, equivalent to provider: "anthropic"):
|
||
# ai:
|
||
# anthropic_api_key: "${ANTHROPIC_API_KEY}"
|
||
|
||
# Examples:
|
||
# --- LiteLLM proxy ---
|
||
# ai:
|
||
# provider: "openai_compat"
|
||
# base_url: "https://litellm.example.com"
|
||
# api_key: "${LLM_API_KEY}"
|
||
# model: "claude-haiku-4-5-20251001"
|
||
# structured_output: "strict"
|
||
#
|
||
# --- OpenRouter ---
|
||
# ai:
|
||
# provider: "openai_compat"
|
||
# base_url: "https://openrouter.ai/api/v1"
|
||
# api_key: "${OPENROUTER_API_KEY}"
|
||
# model: "anthropic/claude-3-haiku"
|
||
# structured_output: "auto"
|
||
|
||
# --- Corporate Memory governance (optional) ---
|
||
# Controls how AI-extracted knowledge is reviewed and distributed.
|
||
# If not present, system operates in legacy mode (democratic wiki, no admin review).
|
||
#
|
||
# corporate_memory:
|
||
# # How knowledge reaches users:
|
||
# # "mandatory_only" — admin controls everything, no user voting
|
||
# # "admin_curated" — admin controls, users vote as feedback signal
|
||
# # "hybrid" — mandatory from admin + optional from user voting (default)
|
||
# distribution_mode: "hybrid"
|
||
#
|
||
# # How new AI-extracted items enter the system:
|
||
# # "review_queue" — nothing published without admin approval (default)
|
||
# # "auto_publish" — items go live immediately, admin intervenes retroactively
|
||
# # "threshold" — high-confidence auto-publish, low-confidence to review queue
|
||
# approval_mode: "review_queue"
|
||
#
|
||
# # Default review period for approved/mandatory items (months)
|
||
# review_period_months: 6
|
||
#
|
||
# # Notify km_admins about new pending items
|
||
# notify_on_new_items: true
|
||
|
||
# --- User groups for audience targeting (optional) ---
|
||
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
|
||
#
|
||
# groups:
#   finance:
#     label: "Finance & Analytics"
#     members: ["analyst1@company.com", "analyst2@company.com"]
#   engineering:
#     label: "Engineering"
#     members: ["dev1@company.com", "dev2@company.com"]
|
||
|
||
# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# Entries are keyed by user email. Example:
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true  # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
datasets: {}
|
||
|
||
# --- Data catalog ---
# Both keys default to empty collections.
catalog:
  categories: {}
  order: []
|
||
|
||
# --- Data profiler (optional) ---
|
||
# profiler:
#   sample_size: 500000  # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50  # Treat as categorical if unique <= this
#   top_values_limit: 10  # Top values per categorical column
#   histogram_bins: 15  # Bins in histogram visualizations
#   sample_rows_limit: 5  # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0  # Alert threshold for high missing %
#   alert_missing_pct: 5.0  # Alert threshold for missing %
#   alert_imbalance_pct: 60.0  # Alert threshold for imbalance %
#   alert_high_cardinality: 50  # Alert threshold for high cardinality columns
|
||
|
||
# --- Remote query (optional) ---
|
||
# Settings for remote BigQuery queries via `python -m src.remote_query`.
|
||
# Used when tables have query_mode: "remote" in data_description.md.
|
||
# remote_query:
#   timeout_seconds: 300  # BQ + DuckDB query timeout
#   max_result_rows: 100000  # Max rows in final output
#   max_bq_registration_rows: 500000  # Max rows per --register-bq sub-query
#   default_format: "table"  # Default output format
#   output_dir: "/tmp/remote_query"  # Directory for Parquet/CSV exports
|
||
|
||
# --- v2 API knobs (optional) ---
|
||
# Controls for the /api/v2/{catalog,schema,sample,scan,scan/estimate} endpoints.
|
||
# All values are optional — the defaults shown below are applied if keys are absent.
|
||
#
|
||
# api:
|
||
# # --- Scan / fetch limits ---
|
||
# scan:
|
||
# max_limit: 10000000 # Hard row cap per /api/v2/scan request (default: 10 M)
|
||
# max_result_bytes: 2147483648 # Hard byte cap on Arrow stream response: 2 GB (default)
|
||
# # If exceeded, partial result returned with X-Agnes-Truncated header.
|
||
# max_concurrent_per_user: 5 # In-flight /api/v2/scan requests allowed per user (default: 5)
|
||
# # Note: quota is process-local; N replicas → effective N× cap.
|
||
# max_daily_bytes_per_user: 53687091200 # Per-user daily byte quota: 50 GB (default). Resets at UTC midnight.
|
||
# bq_cost_per_tb_usd: 5.00 # Cost rate shown in /api/v2/scan/estimate response (default: $5/TB)
|
||
# request_timeout_seconds: 300 # Server-side timeout for a single scan request (default: 300 s)
|
||
# # --- Discovery cache TTLs ---
|
||
# catalog_cache_ttl_seconds: 300 # /api/v2/catalog response cache lifetime (default: 5 min)
|
||
# schema_cache_ttl_seconds: 3600 # /api/v2/schema/{table_id} cache lifetime (default: 1 h)
|
||
# sample_cache_ttl_seconds: 3600 # /api/v2/sample/{table_id} cache lifetime (default: 1 h)
|
||
# # Admins can force-refresh via POST /api/v2/sample/{table_id}?refresh=true
|