# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
# Non-secret values are set directly here.

# --- Config version ---
# Incremented when the config schema changes. Must match SUPPORTED_CONFIG_VERSIONS
# in config/loader.py. Currently only version 1 is supported.
config_version: 1

# --- Instance branding ---
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"

  # Product-name brand string used in analyst-facing UI copy (/home hero, /setup,
  # /login messages, clipboard setup script). Default "Agnes". Set to e.g.
  # "Foundry AI" to rebrand without forking. Distinct from `name` above: `name` is
  # the deploying organization, `brand` is the product.
  # Env override: AGNES_INSTANCE_BRAND.
  # brand: "Agnes"

  # Filesystem-safe folder name for the analyst's local workspace
  # (~/<workspace_dir>). When unset, derived from `brand` by stripping
  # non-alphanumerics ("Foundry AI" -> "FoundryAI"). Set explicitly only if you
  # want a folder name that differs from the auto-derivation.
  # Env override: AGNES_WORKSPACE_DIR_NAME.
  # workspace_dir: "FoundryAI"

  # Inline <svg> element rendered into the header brand slot. When set, the SVG
  # replaces the text brand in the header. `name` above still drives browser
  # <title> text and page headings — keep it populated.
  # Env override: AGNES_INSTANCE_LOGO_SVG.
  # (The markup below is a placeholder; substitute your own SVG.)
  # logo_svg: |
  #   <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" role="img">
  #     <title>Logo</title>
  #     <!-- your logo shapes go here -->
  #   </svg>

  # Operator-authored Overview body rendered in the new Overview section on /home
  # (between Getting Started and Usage modes). Use for product framing, privacy
  # posture, what-data-flows summary — operator-specific copy stays out of the
  # OSS repo this way. HTML in, HTML out (same `| safe` filter as news_intro).
  # Empty/unset = section hidden.
  # Env override: AGNES_INSTANCE_OVERVIEW.
  # overview: |
  #   <p>Free-form HTML — paragraphs, links, lists.</p>

  # Cadence shown in analyst CLAUDE.md (e.g., "1 hour", "30 minutes", "daily").
  # sync_interval: "1 hour"

  # Operator contact shown on /home GWS connector tile as an "Email admin" mailto
  # button (analysts whose operator hasn't pre-provisioned a shared OAuth app can
  # request one without leaving the workspace). Empty/unset hides the button.
  # Env override: AGNES_INSTANCE_ADMIN_EMAIL.
  # admin_email: "ops@acme.com"

  # Per-instance toggles for /home content blocks.
  # home:
  #   # Render Step-3 auto-accept-mode block. Default true.
  #   # Env: AGNES_HOME_SHOW_AUTOMODE.
  #   show_automode: true
  #   # Render the 5-card status frame (Last sync, Sessions, Prompts, Tokens,
  #   # Projects). Visible only to onboarded users regardless of this flag.
  #   # Default true. Env: AGNES_HOME_SHOW_STATUS_FRAME.
  #   show_status_frame: true

  # Operator-injected HTML/JS blocks rendered into every page that extends
  # base.html. Use for feedback widgets (Marker.io), analytics (GTM, PostHog),
  # error capture (Sentry), etc. Each entry needs name + enabled + placement +
  # html. Admin-only; rendered with `| safe`. Review the widget's privacy posture
  # before enabling — most third-party widgets capture screenshots, console logs,
  # or user actions on submit. Resolved by
  # `app/instance_config.py::get_custom_scripts()`. No env override (structure
  # doesn't fit env vars cleanly).
  # custom_scripts:
  #   - name: "marker-io"  # Example: Marker.io feedback widget.
  #     enabled: true  # Kill switch — set false to disable without deleting.
  #     placement: "head_end"  # head_start | head_end | body_end
  #     html: |
  #       <!-- paste the widget vendor's embed snippet here -->

# --- Server ---
server:
  hostname: ""  # DNS name (e.g., "data.acme.com")
  host: ""  # IP address
  app_dir: "/opt/data-analyst"  # Installation directory

  # --- Client setup (shown in "Get Started" on dashboard) ---
  # SSH config Host alias for analysts (default: "data-analyst")
  # ssh_alias: "data-analyst"
  # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
  # ssh_key: "~/.ssh/data_analyst_server"
  # Local project folder name (default: "data-analyst")
  # project_dir: "data-analyst"

# --- Admin users ---
# Manage the server, own data files, get unlimited resource limits.
# SSH keys are used by server/setup.sh during provisioning.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."

# --- Deployment ---
deployment:
  method: "manual"  # manual | github_actions
  repo_url: ""  # e.g., "git@github.com:acme/ai-data-analyst.git"
  branch: "main"

# --- Authentication ---
# At minimum, set allowed_domain and webapp_secret_key.
# Email magic link auth works out of the box (no external service needed).
# Google OAuth is optional - add credentials to enable it.
auth:
  # Email domain(s) for login, comma-separated
  # (e.g., "acme.com" or "acme.com, partner.org")
  allowed_domain: ""
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

  # --- Webapp username shaping ---
  #
  # By default, a user's OS account is derived from their full email:
  #   e.psimecek@acme.com  ->  e_psimecek_acme_com
  #
  # Two options let you control this:
  #
  # username_strip_domain: true
  #   Use only the local part of the email (before @).
  #   Safe when allowed_domain ensures all users share a single domain.
  #     e.psimecek@acme.com  ->  e_psimecek
  #   Keeps usernames short and readable.
  #
  # username_prefix: "myapp_"
  #   Prepend a fixed string to every webapp-created account name.
  #   Necessary when an external identity system (GCP OS Login, LDAP, SAML)
  #   already creates OS accounts in /home/ using the same naming scheme.
  #   Without a prefix, the webapp sees those existing OS accounts and refuses
  #   to register new analyst accounts ("already in use by a system account").
  #   With prefix "myapp_" and strip_domain true:
  #     e.psimecek@acme.com  ->  myapp_e_psimecek
  #   Linux enforces a 32-character username limit. Keep the prefix short.
  #   Changing or removing either option later will invalidate all existing
  #   analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
  #
  # username_strip_domain: false
  # username_prefix: ""

  # Hide auth methods from login page
  # disabled_providers:
  #   - "email"  # Disable email magic link (use when Google OAuth is configured)

# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
theme:
  # primary: "#0073D1"  # Main brand color (buttons, links, accents)
  # primary_dark: "#005BA3"  # Hover/active state of primary
  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
  # text_primary: "#1A253C"  # Main text color
  # text_secondary: "#6B7280"  # Muted/secondary text
  # background: "#F5F7FA"  # Page background
  # surface: "#FFFFFF"  # Card/panel background
  # border: "#E5E7EB"  # Borders and dividers
  # font_primary: "'Inter', system-ui, sans-serif"
  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
  # radius: "6px"  # Border radius (cards, buttons, inputs)
  # success: "#10B77F"
  # warning: "#F59F0A"
  # error: "#EA580C"

# --- Data source ---
data_source:
  type: "keboola"  # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""  # e.g., "https://connection.keboola.com"
    project_id: ""
  bigquery:
    project: "${BIGQUERY_PROJECT}"  # GCP project hosting the data (used in FROM clause)
    location: "${BIGQUERY_LOCATION}"  # BigQuery location (e.g., "us-central1", "US")
    # Uses ADC (Application Default Credentials) - VM service account on GCP
    # Data can live in a different project -- use fully-qualified table IDs in
    # data_description.md

    # GCP project to bill BQ jobs to / submit jobs from. Defaults to `project`.
    # Set when the SA has bigquery.data.* on the data project but lacks
    # serviceusage.services.use there. Mismatch -> every BQ call
    # 403 USER_PROJECT_DENIED. `da diagnose` warns when this falls back to
    # `project`. Configurable via /admin/server-config UI.
    # billing_project: "prj-billing"

    # Cost guardrail (bytes) for query_mode='materialized' BQ scans.
    # Dry-run check before running; exceeding -> registration / sync rejected.
    # Default 10 GiB (10737418240). Set 0 to disable. null falls through to
    # default. Configurable via /admin/server-config UI.
    # max_bytes_per_materialize: 10737418240

    # DuckDB BigQuery extension query timeout (milliseconds).
    # Applied via `SET bq_query_timeout_ms` after every LOAD bigquery on every
    # BQ-touching DuckDB session. Extension default is 90 000 ms = 90 s, which
    # is too tight for analyst queries against view-backed datasets -- bumped
    # to 600 000 ms = 10 min by default. Set 0 to fall through to the extension
    # default. Configurable via /admin/server-config UI.
    # query_timeout_ms: 600000

    # Number of pre-warmed DuckDB+bigquery-extension sessions kept in a
    # process-local pool. Each acquire amortizes the ~0.5 s
    # INSTALL/LOAD/CREATE-SECRET cost across requests; a fresh build only
    # happens when the pool is empty. Default 4. Set 0 to disable pooling
    # (every acquire builds + closes a fresh session; matches pre-pool behavior).
    # session_pool_size: 4

# --- OpenMetadata catalog (optional) ---
# Enriches table and column metadata from OpenMetadata REST API.
# If not configured, app works normally without catalog enrichment.
# All openmetadata.* fields configurable via /admin/server-config UI.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"  # JWT bearer token
#   cache_ttl_seconds: 3600  # Cache TTL in seconds
#   # Set to false ONLY for internal CAs / self-signed certs; defaults to true.
#   # Setting false ships the JWT over an unverified channel.
#   verify_ssl: true

# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  smtp_host: "${SMTP_HOST}"  # e.g., "smtp.gmail.com"
  smtp_port: 587  # 587 for STARTTLS, 465 for SSL
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"

# --- Desktop app (optional) ---
# All desktop.* fields configurable via /admin/server-config UI (rarely changed once set).
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"

# --- Telegram notifications (optional) ---
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""

# --- Jira integration (optional) ---
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""

# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Provider: "anthropic" (direct API) or "openai_compat" (LiteLLM, OpenRouter, Azure, etc.)
ai:
  provider: "anthropic"  # or "openai_compat"
  api_key: "${ANTHROPIC_API_KEY}"  # or "${LLM_API_KEY}" for proxy
  # Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM).
  # Ignored when provider='anthropic'. Configurable via /admin/server-config UI.
  # base_url: "https://litellm.example.com"
  model: "claude-haiku-4-5-20251001"  # any model available on your provider

  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  #   Layer 1 - "json_schema" (best):
  #     The provider enforces an exact schema. Every field, type, and structure
  #     is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  #   Layer 2 - "json_object" (good):
  #     The provider guarantees valid JSON, but does not enforce a specific schema.
  #     Fields may be missing or have wrong types. Available on most providers.
  #
  #   Layer 3 - "prompt" (acceptable):
  #     The AI is asked to respond in JSON via instructions in the prompt.
  #     No technical enforcement -- the model may still return invalid JSON.
  #     Works everywhere, but least reliable.
  #
  #   "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #              Use when data quality is non-negotiable.
  #   "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #              Good balance of quality and compatibility.
  #   "auto"   = All three layers as progressive fallback. Maximum compatibility.
  #              Use when you'd rather get imperfect data than no data.
  structured_output: "auto"

# Legacy format (still supported, equivalent to provider: "anthropic"):
# ai:
#   anthropic_api_key: "${ANTHROPIC_API_KEY}"

# Examples:
# --- LiteLLM proxy ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://litellm.example.com"
#   api_key: "${LLM_API_KEY}"
#   model: "claude-haiku-4-5-20251001"
#   structured_output: "strict"
#
# --- OpenRouter ---
# ai:
#   provider: "openai_compat"
#   base_url: "https://openrouter.ai/api/v1"
#   api_key: "${OPENROUTER_API_KEY}"
#   model: "anthropic/claude-3-haiku"
#   structured_output: "auto"

# --- Flea-market upload guardrails (optional) ---
# Controls the pre-publish check pipeline for skill/agent/plugin uploads
# to /store. See docs/STORE_GUARDRAILS.md for the full check catalogue.
#
# guardrails:
#   # Master kill-switch. When false, inline manifest/security/quality
#   # checks still run (they're free) but the LLM step is skipped and new
#   # uploads are auto-approved. Useful for local dev without an LLM key.
#   enabled: true
#
#   # Anthropic model tier for the LLM security review.
#   #   haiku  — ~$0.001/review, default, good enough for routine uploads
#   #   sonnet — ~$0.015/review, deeper reasoning, fewer false negatives
#   #   opus   — ~$0.075/review, only for high-stakes deployments
#   # You can also pin a concrete model ID (e.g. "claude-haiku-4-5-20251001").
#   review_model: "haiku"
#
#   # Per-submitter daily cap on inline-blocked uploads. Bounds disk +
#   # admin-queue spam. Set to 0 to disable. Default 50.
#   blocked_quota_per_day: 50
#
#   # How many days to keep blocked bundle bytes on disk before the
#   # daily TTL job purges them. Submission row + sha256 + size always
#   # survive — only the bundle bytes go. Set to 0 to retain forever
#   # (rely on admin Delete). Default 30.
#   blocked_bundle_ttl_days: 30

# --- Corporate Memory governance (optional) ---
# Controls how AI-extracted knowledge is reviewed and distributed.
# If not present, system operates in legacy mode (democratic wiki, no admin review).
#
# The corporate_memory.* schema is editable via /admin/server-config UI; you can
# also continue to manage it via this YAML file. The UI surfaces every leaf with
# a hint, so use it to discover the schema if this comment block has aged.
#
# corporate_memory:
#   # How knowledge reaches users:
#   #   "mandatory_only" — admin controls everything, no user voting
#   #   "admin_curated"  — admin controls, users vote as feedback signal
#   #   "hybrid"         — mandatory from admin + optional from user voting (default)
#   distribution_mode: "hybrid"
#
#   # How new AI-extracted items enter the system:
#   #   "review_queue" — nothing published without admin approval (default)
#   #   "auto_publish" — items go live immediately, admin intervenes retroactively
#   #   "threshold"    — high-confidence auto-publish, low-confidence to review queue
#   approval_mode: "review_queue"
#
#   # Default review period for approved/mandatory items (months)
#   review_period_months: 6
#
#   # Notify km_admins about new pending items
#   notify_on_new_items: true
#
#   # --- V1 Context Engineering ---
#
#   sources:
#     claude_local_md:
#       enabled: true
#       confidence_base: 0.50
#     session_transcripts:
#       enabled: true
#       confidence_base: 0.60
#       max_turns_per_session: 100
#       detection_types:
#         - correction
#         - confirmation
#         - unprompted_definition
#
#   extraction:
#     model: "claude-haiku-4-5-20251001"
#     sensitivity_check: true
#     contradiction_check: true
#
#   confidence:
#     # Base score per extraction source.
#     # Key format: "source_type" or "source_type.detection_type"
#     base:
#       user_verification.correction: 0.90
#       user_verification.unprompted_definition: 0.90
#       user_verification.confirmation: 0.60
#       admin_mandate: 1.00
#       claude_local_md: 0.50
#       session_transcript: 0.50
#     # Per-key modifier step sizes applied to base when optional signals are present.
#     modifiers:
#       user_verification.correction:
#         additional_verifiers: 0.05  # per extra unique verifier
#       user_verification.unprompted_definition:
#         additional_verifiers: 0.05
#       user_verification.confirmation:
#         admin_confirmed: 0.20
#       session_transcript:
#         user_confirmed_in_session: 0.20
#     # Confidence decay applied to items as they age.
#     decay:
#       mode: exponential  # linear | exponential
#       half_life_months: 12  # used when mode=exponential
#       decay_rate_monthly: 0.02  # used when mode=linear
#       floor:
#         admin_mandate: 0.50  # admin policies don't silently decay to zero
#         user_verification: 0.40  # user-verified facts never fall below 0.40
#         default: 0.0
#
#   contradiction_detection:
#     enabled: true
#     max_candidates: 10
#
#   entity_resolution:
#     enabled: true
#     entities:
#       metrics: ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"]
#       products: ["Platform", "API", "Dashboard"]
#
#   domain_owners:
#     finance: ["cfo@company.com"]
#     engineering: ["cto@company.com"]
#     product: ["pm@company.com"]
#
#   domains:
#     - finance
#     - engineering
#     - product
#     - data
#     - operations
#     - infrastructure

# --- User groups for audience targeting (optional) ---
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
#
# groups:
#   finance:
#     label: "Finance & Analytics"
#     members: ["analyst1@company.com", "analyst2@company.com"]
#   engineering:
#     label: "Engineering"
#     members: ["dev1@company.com", "dev2@company.com"]

# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true  # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
users: {}

# --- Username mapping (webapp email -> server username, only if different) ---
username_mapping: {}

# --- Optional datasets (sync settings UI) ---
datasets: {}

# --- Data catalog ---
catalog:
  categories: {}
  order: []

# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000  # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50  # Treat as categorical if unique <= this
#   top_values_limit: 10  # Top values per categorical column
#   histogram_bins: 15  # Bins in histogram visualizations
#   sample_rows_limit: 5  # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0  # Alert threshold for high missing %
#   alert_missing_pct: 5.0  # Alert threshold for missing %
#   alert_imbalance_pct: 60.0  # Alert threshold for imbalance %
#   alert_high_cardinality: 50  # Alert threshold for high cardinality columns

# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300  # BQ + DuckDB query timeout
#   max_result_rows: 100000  # Max rows in final output
#   max_bq_registration_rows: 500000  # Max rows per --register-bq sub-query
#   default_format: "table"  # Default output format
#   output_dir: "/tmp/remote_query"  # Directory for Parquet/CSV exports

# --- v2 API knobs (optional) ---
# Controls for the /api/v2/{catalog,schema,sample,scan,scan/estimate} endpoints.
# All values are optional — the defaults shown below are applied if keys are absent.
#
# api:
#   # --- Scan / fetch limits ---
#   scan:
#     # Hard row cap per /api/v2/scan request (default: 10 M)
#     max_limit: 10000000
#     # Hard byte cap on Arrow stream response: 2 GiB (default).
#     # If exceeded, partial result returned with X-Agnes-Truncated header.
#     max_result_bytes: 2147483648
#     # In-flight /api/v2/scan requests allowed per user (default: 5).
#     # Note: quota is process-local; N replicas → effective N× cap.
#     max_concurrent_per_user: 5
#     # Per-user daily byte quota: 50 GiB (default). Resets at UTC midnight.
#     max_daily_bytes_per_user: 53687091200
#     # Cost rate shown in /api/v2/scan/estimate response (default: $5/TB)
#     bq_cost_per_tb_usd: 5.00
#     # Server-side timeout for a single scan request (default: 300 s)
#     request_timeout_seconds: 300
#   # --- Discovery cache TTLs ---
#   # /api/v2/catalog response cache lifetime (default: 5 min)
#   catalog_cache_ttl_seconds: 300
#   # /api/v2/schema/{table_id} cache lifetime (default: 1 h)
#   schema_cache_ttl_seconds: 3600
#   # /api/v2/sample/{table_id} cache lifetime (default: 1 h).
#   # Admins can force-refresh via POST /api/v2/sample/{id}?refresh=true
#   sample_cache_ttl_seconds: 3600

# --- Materialize concurrency safety (optional) ---
# Concurrency safety net for the materialize path (BQ + Keboola). When
# two materialize attempts race for the same table_id, the second one
# raises MaterializeInFlightError and skips. The lock is held in a
# .parquet.lock sibling file; if a holder process is hard-killed before
# kernel-level flock release, the next attempt reclaims the lock once
# the file's mtime is older than this TTL.
#
# Default 86400 (24h) is generous on purpose — anything shorter risks
# a long-running COPY being interrupted by its own scheduler successor.
# Lower it only if you know your materialize never exceeds the new
# value AND your host has a habit of hard-killing processes.
# Min 60 (1 minute), max 604800 (7 days). Configurable via /admin/server-config UI.
materialize:
  lock_ttl_seconds: 86400