agnes-the-ai-analyst/config/instance.yaml.example
ZdenekSrotyr b7a1795834
feat(scheduler): re-wire sync_schedule + script.schedule; tune via env; OpenMetadata TLS (#135)
Bundles 4 issues:
- #79 — table_registry.sync_schedule honored at runtime (API-side filter + Pydantic validators)
- #78 — script_registry.schedule honored via new POST /api/scripts/run-due (atomic claim, BackgroundTask exec, deploy-time safety validation)
- #77 — sidecar JOBS env-driven (SCHEDULER_DATA_REFRESH_INTERVAL/HEALTH_CHECK_INTERVAL/SCRIPT_RUN_INTERVAL/TICK_SECONDS)
- #89 — OpenMetadataClient verify=True default (BREAKING for self-signed)

Cuts release 0.19.0. See CHANGELOG for full notes incl. Known Limitations.
2026-04-29 22:06:30 +02:00

392 lines
17 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# AI Data Analyst - Instance Configuration
# ==========================================
# This is the main configuration file for your instance.
# Copy to instance.yaml and fill in your values.
#
# SECRET VALUES use ${ENV_VAR} syntax - actual values go in .env file.
# Non-secret values are set directly here.
# --- Config version ---
# Schema version of this file. It is bumped whenever the config schema changes
# and must match SUPPORTED_CONFIG_VERSIONS in config/loader.py.
# Version 1 is currently the only supported version.
config_version: 1
# --- Instance branding ---
# Branding strings for this instance.
# NOTE(review): presumably rendered in the web UI header/footer - confirm.
instance:
  name: "AI Data Analyst"
  subtitle: "Your Organization"
  copyright: "Your Organization"
# logo_svg: Full <svg> element for header logo (optional, default: Keboola logo)
# Example: '<svg width="120" height="30" viewBox="0 0 100 30" xmlns="http://www.w3.org/2000/svg"><text y="22" font-size="24" fill="#333">Logo</text></svg>'
# --- Server ---
server:
  # DNS name (e.g., "data.acme.com")
  hostname: ""
  # IP address
  host: ""
  # Installation directory
  app_dir: "/opt/data-analyst"
# --- Client setup (shown in "Get Started" on dashboard) ---
# ssh_alias: "data-analyst" # SSH config Host alias for analysts (default: "data-analyst")
# ssh_key: "~/.ssh/data_analyst_server" # SSH key path for analysts (default: "~/.ssh/data_analyst_server")
# project_dir: "data-analyst" # Local project folder name (default: "data-analyst")
# --- Admin users ---
# Admins manage the server, own the data files, and are not subject to
# resource limits.
# server/setup.sh uses the SSH public keys listed here during provisioning.
admins:
  - username: "admin"
    ssh_public_key: "ssh-ed25519 AAAA..."
# --- Deployment ---
deployment:
  # One of: manual | github_actions
  method: "manual"
  # e.g., "git@github.com:acme/ai-data-analyst.git"
  repo_url: ""
  branch: "main"
# --- Authentication ---
# Minimum setup: allowed_domain and webapp_secret_key.
# Email magic-link login works out of the box; no external service is needed.
# Google OAuth is optional - add the credentials below to enable it.
auth:
  # Email domain(s) allowed to log in, comma-separated
  # (e.g., "acme.com" or "acme.com, partner.org")
  allowed_domain: ""
  webapp_secret_key: "${WEBAPP_SECRET_KEY}"
  # Optional: Google OAuth (if not set, only email magic link is available)
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"
# --- Webapp username shaping ---
#
# By default, a user's OS account is derived from their full email:
# e.psimecek@acme.com -> e_psimecek_acme_com
#
# Two options let you control this:
#
# username_strip_domain: true
# Use only the local part of the email (before @).
# Safe when allowed_domain ensures all users share a single domain.
# e.psimecek@acme.com -> e_psimecek
# Keeps usernames short and readable.
#
# username_prefix: "myapp_"
# Prepend a fixed string to every webapp-created account name.
# Necessary when an external identity system (GCP OS Login, LDAP, SAML)
# already creates OS accounts in /home/ using the same naming scheme.
# Without a prefix, the webapp sees those existing OS accounts and refuses
# to register new analyst accounts ("already in use by a system account").
# With prefix "myapp_" and strip_domain true:
# e.psimecek@acme.com -> myapp_e_psimecek
# Linux enforces a 32-character username limit. Keep the prefix short.
# Changing or removing either option later will invalidate all existing
# analyst accounts. Use username_mapping (top-level) to bridge legacy accounts.
#
# username_strip_domain: false
# username_prefix: ""
# disabled_providers: # Hide auth methods from login page
# - "email" # Disable email magic link (use when Google OAuth is configured)
# --- Theme (optional) ---
# Customize colors, fonts, and shape to match your brand.
# All values are optional - defaults provide a clean blue theme.
# See docs/theme-reference.html for a visual guide.
#
# The section is written as an explicit empty mapping so that it loads as {}
# rather than null (a bare `theme:` with only commented-out children is null),
# matching `users: {}` / `datasets: {}` elsewhere in this file.
# To override values, replace `theme: {}` below with a `theme:` mapping and
# uncomment only the keys you need:
#
# theme:
#   primary: "#0073D1"                       # Main brand color (buttons, links, accents)
#   primary_dark: "#005BA3"                  # Hover/active state of primary
#   primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
#   text_primary: "#1A253C"                  # Main text color
#   text_secondary: "#6B7280"                # Muted/secondary text
#   background: "#F5F7FA"                    # Page background
#   surface: "#FFFFFF"                       # Card/panel background
#   border: "#E5E7EB"                        # Borders and dividers
#   font_primary: "'Inter', system-ui, sans-serif"
#   font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
#   radius: "6px"                            # Border radius (cards, buttons, inputs)
#   success: "#10B77F"
#   warning: "#F59F0A"
#   error: "#EA580C"
theme: {}
# --- Data source ---
data_source:
  # One of: keboola | bigquery | local
  type: "keboola"
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    # e.g., "https://connection.keboola.com"
    stack_url: ""
    project_id: ""
  bigquery:
    # GCP project hosting the data (used in FROM clause)
    project: "${BIGQUERY_PROJECT}"
    # BigQuery location (e.g., "us-central1", "US")
    location: "${BIGQUERY_LOCATION}"
# Uses ADC (Application Default Credentials) - VM service account on GCP
# Data can live in a different project -- use fully-qualified table IDs in data_description.md
# billing_project: "" # Optional: GCP project to bill BQ jobs to / submit jobs from.
# # Defaults to `project`. Set this when the SA has bigquery.data.* on
# # the data project but lacks serviceusage.services.use there (i.e.,
# # cross-project read pattern). Submission/billing target must be a
# # project the SA can use; data project just needs read.
# legacy_wrap_views: false # Set true to restore pre-v2 wrap views for BQ VIEW/MATERIALIZED_VIEW
# # tables in analytics.duckdb (migration escape hatch; default: false)
# --- OpenMetadata catalog (optional) ---
# Enriches table and column metadata from OpenMetadata REST API.
# If not configured, app works normally without catalog enrichment.
# openmetadata:
#   url: "https://your-catalog.example.com"
#   token: "${OPENMETADATA_TOKEN}"   # JWT bearer token
#   cache_ttl_seconds: 3600          # Cache TTL in seconds
#   # Defaults to true. Set to false ONLY for internal CAs / self-signed
#   # certs; setting false ships the JWT over an unverified channel.
#   verify_ssl: true
# --- Email delivery (optional, for magic link auth) ---
# Without SMTP, magic links are shown directly in the browser (development mode).
# For production, configure any SMTP relay (Gmail, Mailgun, SendGrid SMTP, etc.)
email:
  from_address: "noreply@example.com"
  from_name: "AI Data Analyst"
  # e.g., "smtp.gmail.com"
  smtp_host: "${SMTP_HOST}"
  # 587 for STARTTLS, 465 for SSL
  smtp_port: 587
  smtp_user: "${SMTP_USER}"
  smtp_password: "${SMTP_PASSWORD}"
# --- Desktop app (optional) ---
# jwt_secret is a secret: it uses ${ENV_VAR} syntax and the actual value
# belongs in the .env file.
desktop:
  jwt_issuer: "data-analyst"
  jwt_secret: "${DESKTOP_JWT_SECRET}"
  url_scheme: "data-analyst"
# --- Telegram notifications (optional) ---
# bot_token is a secret: it uses ${ENV_VAR} syntax and the actual value
# belongs in the .env file.
telegram:
  bot_token: "${TELEGRAM_BOT_TOKEN}"
  bot_username: ""
  domain_suffix: ""
# --- Jira integration (optional) ---
# Tokens and the webhook secret use ${ENV_VAR} syntax; the actual values
# belong in the .env file. Non-secret fields are set directly here.
# NOTE(review): sla_email / sla_api_token look like a second set of
# credentials used for SLA data - confirm against the Jira connector.
jira:
  domain: ""
  email: ""
  api_token: "${JIRA_API_TOKEN}"
  webhook_secret: "${JIRA_WEBHOOK_SECRET}"
  sla_email: ""
  sla_api_token: "${JIRA_SLA_API_TOKEN}"
  cloud_id: ""
# --- Corporate Memory AI (optional) ---
# Extracts shared knowledge from team members' CLAUDE.local.md files.
# Supported providers:
#   "anthropic"     - direct API
#   "openai_compat" - LiteLLM, OpenRouter, Azure, etc.
ai:
  # "anthropic" or "openai_compat"
  provider: "anthropic"
  # Use "${LLM_API_KEY}" instead when going through a proxy
  api_key: "${ANTHROPIC_API_KEY}"
  # Required for openai_compat:
  # base_url: "https://litellm.example.com"
  # Any model available on your provider
  model: "claude-haiku-4-5-20251001"

  # --- Structured output quality control ---
  # AI models can return JSON in three ways, each with different reliability:
  #
  #   Layer 1 - "json_schema" (best):
  #     The provider enforces an exact schema. Every field, type, and structure
  #     is guaranteed. Available on: Anthropic, OpenAI, Claude via LiteLLM.
  #
  #   Layer 2 - "json_object" (good):
  #     The provider guarantees valid JSON, but does not enforce a specific
  #     schema. Fields may be missing or have wrong types. Available on most
  #     providers.
  #
  #   Layer 3 - "prompt" (acceptable):
  #     The AI is asked to respond in JSON via instructions in the prompt.
  #     No technical enforcement -- the model may still return invalid JSON.
  #     Works everywhere, but least reliable.
  #
  # Accepted values:
  #   "strict" = only Layer 1. Fail if provider doesn't support json_schema.
  #              Use when data quality is non-negotiable.
  #   "json"   = Layer 1, fall back to Layer 2. No prompt-based fallback.
  #              Good balance of quality and compatibility.
  #   "auto"   = All three layers as progressive fallback. Maximum
  #              compatibility. Use when you'd rather get imperfect data
  #              than no data.
  structured_output: "auto"
# Legacy format (still supported, equivalent to provider: "anthropic"):
# ai:
# anthropic_api_key: "${ANTHROPIC_API_KEY}"
# Examples:
# --- LiteLLM proxy ---
# ai:
# provider: "openai_compat"
# base_url: "https://litellm.example.com"
# api_key: "${LLM_API_KEY}"
# model: "claude-haiku-4-5-20251001"
# structured_output: "strict"
#
# --- OpenRouter ---
# ai:
# provider: "openai_compat"
# base_url: "https://openrouter.ai/api/v1"
# api_key: "${OPENROUTER_API_KEY}"
# model: "anthropic/claude-3-haiku"
# structured_output: "auto"
# --- Corporate Memory governance (optional) ---
# Controls how AI-extracted knowledge is reviewed and distributed.
# If not present, system operates in legacy mode (democratic wiki, no admin review).
#
# corporate_memory:
# # How knowledge reaches users:
# # "mandatory_only" — admin controls everything, no user voting
# # "admin_curated" — admin controls, users vote as feedback signal
# # "hybrid" — mandatory from admin + optional from user voting (default)
# distribution_mode: "hybrid"
#
# # How new AI-extracted items enter the system:
# # "review_queue" — nothing published without admin approval (default)
# # "auto_publish" — items go live immediately, admin intervenes retroactively
# # "threshold" — high-confidence auto-publish, low-confidence to review queue
# approval_mode: "review_queue"
#
# # Default review period for approved/mandatory items (months)
# review_period_months: 6
#
# # Notify km_admins about new pending items
# notify_on_new_items: true
#
# # --- V1 Context Engineering ---
#
# sources:
# claude_local_md:
# enabled: true
# confidence_base: 0.50
# session_transcripts:
# enabled: true
# confidence_base: 0.60
# max_turns_per_session: 100
# detection_types:
# - correction
# - confirmation
# - unprompted_definition
#
# extraction:
# model: "claude-haiku-4-5-20251001"
# sensitivity_check: true
# contradiction_check: true
#
# confidence:
# # Base score per extraction source. Key format: "source_type" or "source_type.detection_type"
# base:
# user_verification.correction: 0.90
# user_verification.unprompted_definition: 0.90
# user_verification.confirmation: 0.60
# admin_mandate: 1.00
# claude_local_md: 0.50
# session_transcript: 0.50
# # Per-key modifier step sizes applied to base when optional signals are present.
# modifiers:
# user_verification.correction:
# additional_verifiers: 0.05 # per extra unique verifier
# user_verification.unprompted_definition:
# additional_verifiers: 0.05
# user_verification.confirmation:
# admin_confirmed: 0.20
# session_transcript:
# user_confirmed_in_session: 0.20
# # Confidence decay applied to items as they age.
# decay:
# mode: exponential # linear | exponential
# half_life_months: 12 # used when mode=exponential
# decay_rate_monthly: 0.02 # used when mode=linear
# floor:
# admin_mandate: 0.50 # admin policies don't silently decay to zero
# user_verification: 0.40 # user-verified facts never fall below 0.40
# default: 0.0
#
# contradiction_detection:
# enabled: true
# max_candidates: 10
#
# entity_resolution:
# enabled: true
# entities:
# metrics: ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"]
# products: ["Platform", "API", "Dashboard"]
#
# domain_owners:
# finance: ["cfo@company.com"]
# engineering: ["cto@company.com"]
# product: ["pm@company.com"]
#
# domains:
# - finance
# - engineering
# - product
# - data
# - operations
# - infrastructure
# --- User groups for audience targeting (optional) ---
# Used with Corporate Memory governance to target mandatory knowledge to specific groups.
#
# groups:
#   finance:
#     label: "Finance & Analytics"
#     members: ["analyst1@company.com", "analyst2@company.com"]
#   engineering:
#     label: "Engineering"
#     members: ["dev1@company.com", "dev2@company.com"]
# --- User display and permissions ---
# Corporate Memory avatars + optional km_admin flag for governance.
# Entries are keyed by the user's email address. Example:
#
# users:
#   admin@company.com:
#     display_name: "Admin User"
#     km_admin: true  # Corporate Memory admin (approve/mandate knowledge)
#   analyst@company.com:
#     display_name: "Analyst User"
users: {}
# --- Username mapping ---
# Maps a webapp email to a server username. List only the users whose server
# username differs; empty by default.
username_mapping: {}
# --- Optional datasets ---
# Used by the sync settings UI; empty by default.
datasets: {}
# --- Data catalog ---
# Both keys start empty: `categories` is a mapping and `order` is a list.
# NOTE(review): presumably `order` lists category keys in display order -
# confirm against the catalog renderer.
catalog:
  categories: {}
  order: []
# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000            # If a table has more rows than this, sample this many; otherwise use all rows
#   max_categorical_distinct: 50   # Treat a column as categorical if its unique values <= this
#   top_values_limit: 10           # Top values per categorical column
#   histogram_bins: 15             # Bins in histogram visualizations
#   sample_rows_limit: 5           # Sample rows to show in UI "Sample" tab
#   alert_high_missing_pct: 30.0   # Alert threshold for high missing %
#   alert_missing_pct: 5.0         # Alert threshold for missing %
#   alert_imbalance_pct: 60.0      # Alert threshold for imbalance %
#   alert_high_cardinality: 50     # Alert threshold for high cardinality columns
# --- Remote query (optional) ---
# Settings for remote BigQuery queries via `python -m src.remote_query`.
# Used when tables have query_mode: "remote" in data_description.md.
# remote_query:
#   timeout_seconds: 300               # BQ + DuckDB query timeout
#   max_result_rows: 100000            # Max rows in final output
#   max_bq_registration_rows: 500000   # Max rows per --register-bq sub-query
#   default_format: "table"            # Default output format
#   output_dir: "/tmp/remote_query"    # Directory for Parquet/CSV exports
# --- v2 API knobs (optional) ---
# Controls for the /api/v2/{catalog,schema,sample,scan,scan/estimate} endpoints.
# All values are optional — the defaults shown below are applied if keys are absent.
#
# api:
# # --- Scan / fetch limits ---
# scan:
# max_limit: 10000000 # Hard row cap per /api/v2/scan request (default: 10 M)
# max_result_bytes: 2147483648 # Hard byte cap on Arrow stream response: 2 GB (default)
# # If exceeded, partial result returned with X-Agnes-Truncated header.
# max_concurrent_per_user: 5 # In-flight /api/v2/scan requests allowed per user (default: 5)
# # Note: quota is process-local; N replicas → effective N× cap.
# max_daily_bytes_per_user: 53687091200 # Per-user daily byte quota: 50 GB (default). Resets at UTC midnight.
# bq_cost_per_tb_usd: 5.00 # Cost rate shown in /api/v2/scan/estimate response (default: $5/TB)
# request_timeout_seconds: 300 # Server-side timeout for a single scan request (default: 300 s)
# # --- Discovery cache TTLs ---
# catalog_cache_ttl_seconds: 300 # /api/v2/catalog response cache lifetime (default: 5 min)
# schema_cache_ttl_seconds: 3600 # /api/v2/schema/{table_id} cache lifetime (default: 1 h)
# sample_cache_ttl_seconds: 3600 # /api/v2/sample/{table_id} cache lifetime (default: 1 h)
# # Admins can force-refresh via POST /api/v2/sample/{id}?refresh=true