Merge pull request #1 from keboola/feature/v2-fastapi-duckdb-docker-cli

feat: multi-instance deployment (14 must-have items)
2026-04-10 18:08:03 +02:00 · 2026-04-10 18:08:03 +02:00 · dbc57d1de3
commit dbc57d1de3
parent b7a3c8dd13 5836bcde4c
27 changed files with 7008 additions and 67 deletions
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@ -1,8 +1,9 @@
-name: Build & Push
+# SUPERSEDED by release.yml — CalVer tagging with stable/dev channels.
 # Kept for manual trigger only. Automated builds use release.yml.
 name: Build & Push (legacy)
 on:
-  push:
+  workflow_dispatch: {}
    branches: [main]
 jobs:
  test:
@ -24,27 +25,3 @@ jobs:
        run: pytest tests/ -v --tb=short
        env:
          TESTING: "1"
  build-and-push:
    needs: test
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
    steps:
      - uses: actions/checkout@v5
      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v7
        with:
          push: true
          tags: |
            ghcr.io/${{ github.repository }}:latest
            ghcr.io/${{ github.repository }}:${{ github.sha }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,148 @@
 # Release pipeline: run tests, claim a version tag, build and push the image,
 # then smoke-test the pushed image.
 name: Release
 # Triggered by every push to main and to any feature/** branch.
 on:
  push:
    branches: [main, "feature/**"]
 # contents: write is needed because the build job pushes a git tag;
 # packages: write is needed to push images to ghcr.io.
 permissions:
  contents: write
  packages: write
 jobs:
  # Runs the pytest suite; build-and-push only starts once this job succeeds
  # (it declares `needs: test`).
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - uses: actions/setup-python@v6
        with:
          # Quoted so YAML keeps it a string rather than a float.
          python-version: "3.13"
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Install dependencies
        # Installs the project plus its "dev" extra into the runner's system Python.
        run: uv pip install --system ".[dev]"
      - name: Run tests
        run: pytest tests/ -v --tb=short
        env:
          # NOTE(review): presumably switches the app to test defaults
          # (app/auth/jwt.py checks TESTING) — confirm.
          TESTING: "1"
  # Claims a unique CalVer git tag, then builds the image and pushes it to GHCR
  # under three tags: the channel name, the versioned tag, and sha-<short sha>.
  build-and-push:
    needs: test
    runs-on: ubuntu-latest
    outputs:
      image_tag: ${{ steps.meta.outputs.versioned_tag }}
      version: ${{ steps.meta.outputs.version }}
      channel: ${{ steps.meta.outputs.channel }}
    steps:
      - uses: actions/checkout@v5
        with:
          # Full history and all tags are required to compute the next version number.
          fetch-depth: 0
          fetch-tags: true
      - name: Claim version tag (with retry to avoid race conditions)
        id: meta
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          YEAR_MONTH=$(date +%Y.%m)
          # Read the ref and SHA from the runner-provided environment variables
          # instead of interpolating ${{ ... }} expressions into the script text:
          # branch names matching feature/** may contain shell metacharacters,
          # and an interpolated expression would be executed as code.
          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
            CHANNEL="stable"
          else
            CHANNEL="dev"
          fi
          SHORT_SHA="${GITHUB_SHA:0:7}"
          # Claim a unique version by pushing a git tag BEFORE building.
          # Retry up to 5 times if another CI run took our N.
          TAG_CLAIMED=false
          for ATTEMPT in 1 2 3 4 5; do
            git fetch --tags --force
            # Use max(N) not count — safe even if tags are deleted.
            # N is shared across channels: the glob matches stable-* and dev-* tags.
            MAX_N=$(git tag -l "*-${YEAR_MONTH}.*" | sed 's/.*\.//' | sort -n | tail -1)
            N=$(( ${MAX_N:-0} + 1 ))
            VERSION="${YEAR_MONTH}.${N}"
            TAG="${CHANNEL}-${VERSION}"
            git tag -a "$TAG" -m "Release $TAG"
            # stderr is left visible so that genuine push failures (auth, network)
            # are not mistaken for a tag collision.
            if git push origin "$TAG"; then
              echo "Claimed tag $TAG (attempt $ATTEMPT)"
              TAG_CLAIMED=true
              break
            else
              echo "Could not push tag $TAG (likely already claimed), retrying... (attempt $ATTEMPT)"
              git tag -d "$TAG"
              sleep 2
            fi
          done
          if [ "$TAG_CLAIMED" != "true" ]; then
            echo "::error::Failed to claim a unique version tag after 5 attempts"
            exit 1
          fi
          echo "channel=${CHANNEL}" >> "$GITHUB_OUTPUT"
          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
          echo "versioned_tag=${TAG}" >> "$GITHUB_OUTPUT"
          echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
          echo "Channel: ${CHANNEL}"
          echo "Version: ${VERSION}"
          echo "Versioned tag: ${TAG}"
      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v7
        with:
          push: true
          # Passed to the Dockerfile's AGNES_VERSION / RELEASE_CHANNEL build args.
          build-args: |
            AGNES_VERSION=${{ steps.meta.outputs.version }}
            RELEASE_CHANNEL=${{ steps.meta.outputs.channel }}
          tags: |
            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.channel }}
            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.versioned_tag }}
            ghcr.io/${{ github.repository }}:sha-${{ steps.meta.outputs.short_sha }}
  # Starts the freshly pushed image with docker compose and runs the smoke-test
  # script against it. Runs only for pushes to main.
  smoke-test:
    needs: build-and-push
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      # Brings up the `app` service, then polls /api/health every 3 s for up to
      # 60 s; the wait ends as soon as the reported status is anything other
      # than "unhealthy".
      # NOTE(review): AGNES_TAG is presumably read by docker-compose.prod.yml to
      # select the image tag — confirm.
      - name: Start Agnes from built image
        run: |
          # Create empty .env (docker-compose.yml requires env_file: .env, gitignored)
          touch .env
          # Use prod compose (GHCR images) + CI overlay (test secrets)
          export AGNES_TAG="${{ needs.build-and-push.outputs.image_tag }}"
          docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml up -d app
          # Wait for healthy (max 60s)
          timeout 60 bash -c 'until curl -sf http://localhost:8000/api/health | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d[\"status\"]!=\"unhealthy\" else 1)"; do sleep 3; done'
      - name: Run smoke tests
        run: bash scripts/smoke-test.sh http://localhost:8000
      # The next two steps run only when an earlier step failed: dump compose
      # logs to a file and upload it as a build artifact.
      - name: Collect logs on failure
        if: failure()
        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml logs > smoke-test-logs.txt
      - name: Upload logs
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: smoke-test-logs
          path: smoke-test-logs.txt
      # Always remove containers and volumes, regardless of the outcome.
      - name: Teardown
        if: always()
        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml down -v
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,33 @@
 # Changelog
 All notable changes to Agnes AI Data Analyst are documented in this file.
 Format: [CalVer](https://calver.org/) `YYYY.MM.N` with channels `stable` and `dev`.
 ---
 ## stable-2026.04.1 (unreleased)
 Multi-instance deployment and self-service setup.
 ### Added
 - CalVer versioning with `stable` and `dev` release channels
 - `/api/health` now returns `version`, `channel`, and `schema_version`
 - Auto-generated JWT and session secrets with file persistence (`/data/state/.jwt_secret`, `/data/state/.session_secret`)
 - Pre-migration snapshot of `system.duckdb` before schema upgrades
 - `POST /api/admin/configure` for headless data source configuration
 - `POST /api/admin/discover-and-register` combined table discovery and registration
 - `/setup` web wizard for first-time instance setup
 - `scripts/smoke-test.sh` for post-deploy verification
 - Smoke test job in CI (Docker-in-CI after every stable release built from `main`)
 - OpenAPI snapshot test for breaking change detection
 - Custom connector mount support (`connectors/custom/`)
 - Startup banner logging version, channel, and schema version
 - Schema migration safety tests (idempotency, data preservation, snapshot)
 - `CHANGELOG.md` and release notes template
 ### Breaking Changes
 None.
 ### Migration Guide
 No action required. Existing instances upgrade seamlessly.
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -154,7 +154,7 @@ Auth providers in `app/auth/` (FastAPI-based):
 ## Key Implementation Details
 ### DuckDB Schema (src/db.py)
- Schema v2 with auto-migration from v1
+- Schema v3 with auto-migration from v1→v2→v3
 - `table_registry`: id, name, source_type, bucket, source_table, query_mode, sync_schedule, etc.
 - `sync_state`, `sync_history`: track extraction progress
 - `users`, `dataset_permissions`, `audit_log`: auth + RBAC
--- a/5
+++ b/5
@ -6,6 +6,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf
 # Install uv for fast dependency management
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 ARG AGNES_VERSION=dev
 ARG RELEASE_CHANNEL=dev
 ENV AGNES_VERSION=${AGNES_VERSION}
 ENV RELEASE_CHANNEL=${RELEASE_CHANNEL}
 WORKDIR /app
 # Copy application code
--- a/6
+++ b/6
@ -1,6 +1,6 @@
 # Agnes AI Data Analyst — Development Makefile
-.PHONY: help test lint dev docker
+.PHONY: help test lint dev docker update-openapi-snapshot
 help:
 	@echo "Available targets:"
@ -20,3 +20,7 @@ docker:
 lint:
 	@ruff check . 2>/dev/null || echo "ruff not installed: pip install ruff"
 update-openapi-snapshot:
 	TESTING=1 python scripts/generate_openapi.py > tests/snapshots/openapi.json
 	@echo "Snapshot updated. Review diff and commit."
--- a/app/api/admin.py
+++ b/app/api/admin.py
@ -1,7 +1,9 @@
-"""Admin endpoints — table discovery, registry management."""
+"""Admin endpoints — table discovery, registry management, instance configuration."""
 import logging
 import os
 import uuid
 from pathlib import Path
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
@ -42,6 +44,16 @@ class UpdateTableRequest(BaseModel):
    profile_after_sync: Optional[bool] = None
 class ConfigureRequest(BaseModel):
    # Request body for POST /configure. Documented with comments rather than a
    # class docstring so the generated OpenAPI schema stays unchanged.
    # Must be one of the three listed values; the endpoint rejects anything else with 400.
    data_source: str  # "keboola" | "bigquery" | "local"
    # Both Keboola fields are required by the endpoint when data_source == "keboola".
    keboola_token: Optional[str] = None
    keboola_url: Optional[str] = None
    # Required by the endpoint when data_source == "bigquery".
    bigquery_project: Optional[str] = None
    # The endpoint writes "us" to the config when this is omitted.
    bigquery_location: Optional[str] = None
    # Stored under instance.name in instance.yaml when provided.
    instance_name: Optional[str] = None
    # Stored under auth.allowed_domain in instance.yaml when provided.
    allowed_domain: Optional[str] = None
@router.get("/discover-tables")
 async def discover_tables(
    user: dict = Depends(require_role(Role.ADMIN)),
@ -53,10 +65,12 @@ async def discover_tables(
        if source_type == "keboola":
            from connectors.keboola.client import KeboolaClient
            import os
            from app.instance_config import get_value
-            url = get_value("keboola", "url", default="")
+            url = get_value("data_source", "keboola", "stack_url", default="")
-            token = os.environ.get(get_value("keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN"), "")
+            token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
            token = os.environ.get(token_env, "") if token_env else ""
            if not token:
                token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}
@ -144,3 +158,208 @@ async def unregister_table(
    if not repo.get(table_id):
        raise HTTPException(status_code=404, detail="Table not found")
    repo.unregister(table_id)
@router.post("/configure")
async def configure_instance(
    request: ConfigureRequest,
    user: dict = Depends(require_role(Role.ADMIN)),
):
    """Configure data source and instance settings via API.
    Writes config to instance.yaml and persists secrets to .env_overlay.
    AI agents and the /setup wizard use this instead of manual file editing.
    """
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # operation description in the OpenAPI schema.
    #
    # Raises HTTPException(400) for an unknown data_source, missing required
    # fields, values containing line breaks, or a failed Keboola connection test.
    # Returns {"status", "data_source", "connection"}.
    import yaml

    if request.data_source not in ("keboola", "bigquery", "local"):
        raise HTTPException(status_code=400, detail="data_source must be 'keboola', 'bigquery', or 'local'")

    # .env_overlay is a line-oriented KEY=VALUE file that is re-read with
    # str.splitlines(); a value containing any line boundary would inject
    # additional entries into the environment on the next load.
    for field_name in ("keboola_token", "keboola_url"):
        field_value = getattr(request, field_name)
        if field_value and field_value.splitlines() != [field_value]:
            raise HTTPException(status_code=400, detail=f"{field_name} must not contain line breaks")

    # Validate credentials if provided
    if request.data_source == "keboola":
        if not request.keboola_token or not request.keboola_url:
            raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source")
        try:
            from connectors.keboola.client import KeboolaClient
            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
            client.test_connection()
        except Exception as e:
            # Details go to the server log only; the client gets a generic message.
            logger.error("Keboola connection validation failed: %s", e)
            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.")
    elif request.data_source == "bigquery":
        if not request.bigquery_project:
            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")

    def _load_yaml_mapping(path):
        """Parse *path* as YAML; return {} when it is unreadable or not a mapping."""
        try:
            loaded = yaml.safe_load(path.read_text())
        except Exception as exc:
            logger.warning("Ignoring unreadable config %s: %s", path, exc)
            return {}
        return loaded if isinstance(loaded, dict) else {}

    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
    # NOT to CONFIG_DIR which is mounted read-only in Docker.
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    config_path = data_dir / "state" / "instance.yaml"

    # Load existing API-generated config, or fall back to read-only CONFIG_DIR config
    if config_path.exists():
        existing = _load_yaml_mapping(config_path)
    else:
        ro_path = Path(os.environ.get("CONFIG_DIR", "./config")) / "instance.yaml"
        existing = _load_yaml_mapping(ro_path) if ro_path.exists() else {}

    def _section(name):
        """Return existing[name] as a dict, replacing a missing or null/non-dict value."""
        section = existing.get(name)
        if not isinstance(section, dict):
            section = existing[name] = {}
        return section

    # Merge instance settings
    if request.instance_name:
        _section("instance")["name"] = request.instance_name
    if request.allowed_domain:
        _section("auth")["allowed_domain"] = request.allowed_domain

    # Merge data source config (secrets as env var references)
    existing["data_source"] = {"type": request.data_source}
    if request.data_source == "keboola":
        existing["data_source"]["keboola"] = {
            "stack_url": request.keboola_url,
            "token_env": "KEBOOLA_STORAGE_TOKEN",
        }
    elif request.data_source == "bigquery":
        existing["data_source"]["bigquery"] = {
            "project": request.bigquery_project,
            "location": request.bigquery_location or "us",
        }

    # Write to writable data volume
    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(yaml.dump(existing, default_flow_style=False, sort_keys=False))
    logger.info("Wrote instance config to %s", config_path)

    # Persist secrets to .env_overlay (in data volume, never in git)
    secrets_to_persist = {}
    if request.keboola_token:
        secrets_to_persist["KEBOOLA_STORAGE_TOKEN"] = request.keboola_token
    if request.keboola_url:
        secrets_to_persist["KEBOOLA_STACK_URL"] = request.keboola_url
    if secrets_to_persist:
        overlay_path = data_dir / "state" / ".env_overlay"
        overlay_path.parent.mkdir(parents=True, exist_ok=True)
        # Merge with existing overlay
        existing_overlay = {}
        if overlay_path.exists():
            for line in overlay_path.read_text().splitlines():
                if "=" in line and not line.startswith("#"):
                    k, v = line.split("=", 1)
                    existing_overlay[k.strip()] = v.strip()
        existing_overlay.update(secrets_to_persist)
        # Create the file with owner-only permissions from the start, so the
        # secrets are never on disk with default (umask-derived) permissions.
        fd = os.open(overlay_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as overlay_file:
            overlay_file.write(
                "\n".join(f"{k}={v}" for k, v in existing_overlay.items()) + "\n"
            )
        try:
            # The mode passed to os.open only applies on creation; tighten a
            # pre-existing file as well.
            overlay_path.chmod(0o600)
        except OSError:
            pass  # best effort: chmod may be unsupported on this platform/filesystem
        logger.info("Persisted %d secrets to .env_overlay", len(secrets_to_persist))
        # Inject into current process environment
        for k, v in secrets_to_persist.items():
            os.environ[k] = v

    # Invalidate cached instance config so next read picks up changes
    import app.instance_config as ic
    ic._instance_config = None

    return {
        "status": "ok",
        "data_source": request.data_source,
        # NOTE(review): "verified" is also reported for bigquery, although only
        # the Keboola branch above performs a connection test.
        "connection": "verified" if request.data_source != "local" else "local",
    }
 def _discover_and_register_tables(conn: duckdb.DuckDBPyConnection, user_email: str) -> dict:
    """Discover Keboola tables and register the ones not yet in the registry.

    Used by both the admin API and the sync job. For any data source other
    than "keboola" nothing is discovered and an all-zero summary is returned.

    Args:
        conn: connection handed to TableRegistryRepository.
        user_email: recorded as ``registered_by`` on every new registry entry.

    Returns:
        Dict with ``registered``, ``skipped`` and ``errors`` counts, the list
        of newly registered table ids under ``tables``, and ``source``.
    """
    from app.instance_config import get_data_source_type, get_value
    source_type = get_data_source_type()
    if source_type != "keboola":
        return {"registered": 0, "skipped": 0, "errors": 0, "tables": [], "source": source_type}
    from connectors.keboola.client import KeboolaClient
    # Settings live under data_source.keboola (matches what /api/admin/configure writes)
    stack_url = get_value("data_source", "keboola", "stack_url", default="")
    env_name = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
    api_token = os.environ.get(env_name, "") if env_name else ""
    if not api_token:
        # Fall back to the conventional variable when the configured one is unset/empty.
        api_token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
    keboola = KeboolaClient(token=api_token, url=stack_url)
    found = keboola.discover_all_tables()
    registry = TableRegistryRepository(conn)
    counts = {"registered": 0, "skipped": 0, "errors": 0}
    new_ids = []
    for entry in found:
        raw_id = entry.get("id", "")
        # Registry id: trimmed, lower-cased, with dots and spaces turned into underscores.
        table_id = raw_id.strip().lower().replace(".", "_").replace(" ", "_")
        if not table_id:
            counts["errors"] += 1
            continue
        if registry.get(table_id):
            counts["skipped"] += 1
            continue
        try:
            # Source id format: in.c-bucket.table_name
            segments = raw_id.split(".")
            if len(segments) > 1:
                bucket = segments[1]
            else:
                bucket = ""
            if len(segments) > 2:
                source_table = segments[2]
            else:
                source_table = entry.get("name", "")
            registry.register(
                id=table_id,
                name=entry.get("name", table_id),
                source_type="keboola",
                bucket=bucket,
                source_table=source_table,
                query_mode="local",
                registered_by=user_email,
                description=f"Auto-discovered from Keboola: {raw_id}",
            )
            counts["registered"] += 1
            new_ids.append(table_id)
        except Exception as e:
            # One failing table must not abort the whole run; count it and continue.
            logger.warning("Failed to register %s: %s", table_id, e)
            counts["errors"] += 1
    return {**counts, "tables": new_ids, "source": "keboola"}
@router.post("/discover-and-register")
async def discover_and_register(
    user: dict = Depends(require_role(Role.ADMIN)),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Discover tables from configured source and auto-register them.
    Combines discover-tables + register-table into one call.
    Skips already-registered tables. Used by /setup wizard and AI agents.
    """
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # operation description in the OpenAPI schema.
    try:
        # "admin" is the fallback registered_by value when the user dict has no email.
        return _discover_and_register_tables(conn, user.get("email", "admin"))
    except Exception as e:
        # Record the full traceback server-side; previously the failure was only
        # visible in the HTTP response.
        logger.exception("Discovery and registration failed")
        raise HTTPException(status_code=500, detail=f"Discovery and registration failed: {e}") from e
--- a/app/api/health.py
+++ b/app/api/health.py
@ -1,11 +1,13 @@
 """Health check endpoint — structured diagnostics for AI agents."""
 import os
 from datetime import datetime, timezone
 from fastapi import APIRouter, Depends
 import duckdb
 from app.auth.dependencies import _get_db
 from src.db import SCHEMA_VERSION
 from src.repositories.sync_state import SyncStateRepository
 router = APIRouter(tags=["health"])
@ -69,6 +71,9 @@ async def health_check(conn: duckdb.DuckDBPyConnection = Depends(_get_db)):
    return {
        "status": overall,
        "version": os.environ.get("AGNES_VERSION", "dev"),
        "channel": os.environ.get("RELEASE_CHANNEL", "dev"),
        "schema_version": SCHEMA_VERSION,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "services": checks,
    }
--- a/app/api/sync.py
+++ b/app/api/sync.py
@ -63,6 +63,27 @@ def _run_sync(tables: Optional[List[str]] = None):
        finally:
            sys_conn.close()
        if not table_configs:
            # Auto-discover tables on first sync when registry is empty
            if source_type == "keboola" and os.environ.get("KEBOOLA_STORAGE_TOKEN"):
                logger.info("No tables registered — running auto-discovery from Keboola")
                try:
                    from app.api.admin import _discover_and_register_tables
                    auto_conn = get_system_db()
                    try:
                        result = _discover_and_register_tables(auto_conn, "auto-discovery")
                        logger.info("Auto-discovered %d tables, skipped %d", result["registered"], result["skipped"])
                    finally:
                        auto_conn.close()
                    # Re-read table configs after auto-registration
                    sys_conn2 = get_system_db()
                    try:
                        table_configs = TableRegistryRepository(sys_conn2).list_local(source_type)
                    finally:
                        sys_conn2.close()
                except Exception as e:
                    logger.warning("Auto-discovery failed: %s", e)
            if not table_configs:
                logger.warning("No tables to sync for source_type=%s", source_type)
                return
@ -113,6 +134,29 @@ print(json.dumps(result))
        else:
            print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)
        # Run custom connectors (Tier A: local mount)
        connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
        if connectors_dir.exists():
            for connector_dir in sorted(connectors_dir.iterdir()):
                if not connector_dir.is_dir():
                    continue
                extractor = connector_dir / "extractor.py"
                if not extractor.exists():
                    continue
                logger.info("Running custom connector: %s", connector_dir.name)
                try:
                    custom_result = subprocess.run(
                        [sys.executable, str(extractor)],
                        env=env, capture_output=True, text=True, timeout=600,
                        cwd=str(Path(__file__).parent.parent.parent),
                    )
                    if custom_result.returncode != 0:
                        logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
                    else:
                        logger.info("Custom connector %s completed", connector_dir.name)
                except subprocess.TimeoutExpired:
                    logger.error("Custom connector %s timed out", connector_dir.name)
        # Rebuild master views (reads extract.duckdb files, no write conflict)
        from src.orchestrator import SyncOrchestrator
        orch = SyncOrchestrator()
--- a/app/auth/jwt.py
+++ b/app/auth/jwt.py
@ -7,22 +7,22 @@ from typing import Optional
 import jwt
-SECRET_KEY = os.environ.get("JWT_SECRET_KEY", "")
+def _get_secret_key() -> str:
-
+    """Load JWT secret - from env, file, or auto-generated."""
 if not SECRET_KEY:
    if os.environ.get("TESTING", "").lower() in ("1", "true"):
-        SECRET_KEY = "test-jwt-secret-key-minimum-32-chars!!"
+        return os.environ.get("JWT_SECRET_KEY", "test-jwt-secret-key-minimum-32-chars!!")
-    else:
+    from app.secrets import get_jwt_secret
-        raise RuntimeError(
+    key = get_jwt_secret()
-            "JWT_SECRET_KEY environment variable is required. "
+    if len(key) < 32:
            "Generate one: python -c \"import secrets; print(secrets.token_hex(32))\""
        )
 elif len(SECRET_KEY) < 32 and os.environ.get("TESTING", "").lower() not in ("1", "true"):
        import warnings as _warnings
        _warnings.warn(
-        f"JWT_SECRET_KEY is {len(SECRET_KEY)} chars — minimum 32 recommended",
+            f"JWT_SECRET_KEY is {len(key)} chars — minimum 32 recommended",
            UserWarning, stacklevel=2,
        )
    return key
 SECRET_KEY = _get_secret_key()
 ALGORITHM = "HS256"
 ACCESS_TOKEN_EXPIRE_HOURS = 24  # 24 hours
--- a/app/instance_config.py
+++ b/app/instance_config.py
@ -11,15 +11,34 @@ _instance_config: Optional[dict] = None
 def load_instance_config() -> dict:
-    """Load instance.yaml using the existing config loader."""
+    """Load instance.yaml — checks API-generated config first, then static config.
    Search order:
    1. DATA_DIR/state/instance.yaml (written by /api/admin/configure, writable)
    2. CONFIG_DIR/instance.yaml (static, read-only in Docker)
    3. Empty dict with defaults (if neither exists)
    """
    global _instance_config
    if _instance_config is not None:
        return _instance_config
    # First, try API-generated config in writable data volume
    import yaml
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    api_config_path = data_dir / "state" / "instance.yaml"
    if api_config_path.exists():
        try:
-        from config.loader import load_instance_config as _load, get_instance_value
+            _instance_config = yaml.safe_load(api_config_path.read_text()) or {}
            logger.info("Loaded instance.yaml from %s", api_config_path)
            return _instance_config
        except Exception as e:
            logger.warning(f"Could not load API-generated instance.yaml: {e}")
    # Fall back to static config (may have strict validation)
    try:
        from config.loader import load_instance_config as _load
        _instance_config = _load()
-        logger.info("Loaded instance.yaml")
+        logger.info("Loaded instance.yaml from config/")
    except Exception as e:
        logger.warning(f"Could not load instance.yaml: {e}. Using defaults.")
        _instance_config = {}
--- a/app/main.py
+++ b/app/main.py
@ -48,8 +48,8 @@ def create_app() -> FastAPI:
    )
    # Session middleware (required for OAuth state)
-    import secrets as _secrets
+    from app.secrets import get_session_secret
-    session_secret = os.environ.get("SESSION_SECRET", os.environ.get("JWT_SECRET_KEY", _secrets.token_hex(32)))
+    session_secret = get_session_secret()
    app.add_middleware(SessionMiddleware, secret_key=session_secret)
    # CORS for CLI and external clients
@ -62,6 +62,14 @@ def create_app() -> FastAPI:
        allow_headers=["*"],
    )
    # Load .env_overlay (persisted by /api/admin/configure)
    _overlay = Path(os.environ.get("DATA_DIR", "./data")) / "state" / ".env_overlay"
    if _overlay.exists():
        for line in _overlay.read_text().splitlines():
            if "=" in line and not line.startswith("#"):
                k, v = line.split("=", 1)
                os.environ.setdefault(k.strip(), v.strip())
    # Load instance config on startup
    try:
        from app.instance_config import load_instance_config
@ -70,6 +78,15 @@ def create_app() -> FastAPI:
    except Exception as e:
        logger.warning(f"Could not load instance config: {e}")
    # Startup banner
    from src.db import SCHEMA_VERSION
    logger.info(
        "Agnes %s | channel: %s | schema v%s",
        os.environ.get("AGNES_VERSION", "dev"),
        os.environ.get("RELEASE_CHANNEL", "dev"),
        SCHEMA_VERSION,
    )
    # Seed admin user for testing/CI (when SEED_ADMIN_EMAIL is set)
    seed_email = os.environ.get("SEED_ADMIN_EMAIL")
    if seed_email:
--- a/app/secrets.py
+++ b/app/secrets.py
@ -0,0 +1,43 @@
 """Auto-generate and persist secrets that survive container restarts."""
 import logging
 import os
 import secrets
 from pathlib import Path
 logger = logging.getLogger(__name__)
 def _load_or_generate(env_var: str, file_name: str) -> str:
    """Return a secret from the environment, a state file, or a fresh random value.

    Lookup order:
    1. The environment variable ``env_var``, when set and non-empty.
    2. The file ``DATA_DIR/state/<file_name>`` (DATA_DIR defaults to ``./data``),
       when it exists and is non-empty after stripping whitespace.
    3. A newly generated 64-character hex token, written to that file so that
       later calls and restarts reuse it.

    Args:
        env_var: name of the environment variable that overrides the file.
        file_name: file name inside ``DATA_DIR/state`` used for persistence.

    Returns:
        The secret value.
    """
    val = os.environ.get(env_var, "")
    if val:
        return val
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    secret_path = data_dir / "state" / file_name
    if secret_path.exists():
        val = secret_path.read_text().strip()
        if val:
            return val
        logger.warning("Secret file %s is empty, regenerating", secret_path)
    secret_path.parent.mkdir(parents=True, exist_ok=True)
    val = secrets.token_hex(32)
    # Create the file with owner-only permissions from the start, so the secret
    # is never on disk with default (umask-derived) permissions.
    fd = os.open(secret_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as secret_file:
        secret_file.write(val)
    try:
        # The mode passed to os.open only applies on creation; tighten a
        # pre-existing (empty) file as well.
        secret_path.chmod(0o600)
    except OSError:
        pass  # chmod not supported on all platforms (e.g., Windows)
    logger.info(
        "Auto-generated %s -> %s (set %s in .env to use a fixed value)",
        file_name, secret_path, env_var,
    )
    return val
 def get_jwt_secret() -> str:
    """Return the JWT signing secret (env JWT_SECRET_KEY, else file .jwt_secret, else generated)."""
    return _load_or_generate("JWT_SECRET_KEY", ".jwt_secret")
 def get_session_secret() -> str:
    """Return the session secret (env SESSION_SECRET, else file .session_secret, else generated)."""
    return _load_or_generate("SESSION_SECRET", ".session_secret")
--- a/app/web/router.py
+++ b/app/web/router.py
@ -120,6 +120,7 @@ _URL_MAP = {
    "email_auth.login_email_form": "/login/email",
    "email_auth.send_magic_link": "/auth/email/send-link",
    "register": "/auth/password/setup",
    "setup": "/setup",
 }
@ -177,6 +178,18 @@ async def index(request: Request, user: Optional[dict] = Depends(get_optional_us
    return RedirectResponse(url="/login", status_code=302)
@router.get("/setup", response_class=HTMLResponse)
async def setup_wizard(request: Request, conn: duckdb.DuckDBPyConnection = Depends(_get_db)):
    """First-time setup wizard. Redirects to dashboard if users already exist."""
    # NOTE(review): the docstring says "dashboard" but the redirect target is
    # /login. It is kept verbatim because FastAPI publishes route docstrings as
    # the operation description.
    has_users = False
    try:
        row = conn.execute("SELECT COUNT(*) FROM users").fetchone()
        has_users = row[0] > 0
    except Exception:
        # No users table yet — treat as a fresh instance and show setup.
        has_users = False
    if has_users:
        return RedirectResponse(url="/login", status_code=302)
    return templates.TemplateResponse(request, "setup.html", _build_context(request))
@router.get("/login", response_class=HTMLResponse)
 async def login_page(request: Request):
    providers = []
--- a/app/web/templates/setup.html
+++ b/app/web/templates/setup.html
@ -0,0 +1,267 @@
 {% extends "base_login.html" %}
 {% block title %}Setup - Agnes AI Data Analyst{% endblock %}
 {% block content %}
 <div class="login-page">
    <div class="login-card-wrapper" style="max-width: 520px; margin: 40px auto; padding: 0 20px;">
        <div class="login-card" style="max-width: 520px;">
            <h2 id="wizard-title">Setup Agnes</h2>
            <p class="login-description" id="wizard-description">
                Create your admin account to get started.
            </p>
            <!-- Progress -->
            <div style="display: flex; gap: 8px; margin-bottom: 24px;">
                <div id="step-dot-1" style="flex: 1; height: 4px; border-radius: 2px; background: var(--primary, #2563eb);"></div>
                <div id="step-dot-2" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
                <div id="step-dot-3" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
                <div id="step-dot-4" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
            </div>
            <!-- Status message -->
            <div id="status-msg" style="display: none; padding: 10px 14px; border-radius: 6px; margin-bottom: 16px; font-size: 14px;"></div>
            <!-- Step 1: Create Admin -->
            <div id="step-1">
                <!-- Each label is bound to its input via for/id so clicking the label focuses the field
                     and assistive technology announces it; autocomplete hints help password managers. -->
                <form id="admin-form" onsubmit="return createAdmin(event)">
                    <label for="admin-email" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Email</label>
                    <input type="email" id="admin-email" required autocomplete="email" placeholder="admin@company.com"
                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
                    <label for="admin-password" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Password</label>
                    <input type="password" id="admin-password" required minlength="8" autocomplete="new-password" placeholder="Min. 8 characters"
                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
                    <button type="submit" class="btn btn-primary" style="width: 100%;" id="btn-admin">
                        Create Admin Account
                    </button>
                </form>
            </div>
            <!-- Step 2: Data Source -->
            <div id="step-2" style="display: none;">
                <!-- Labels are bound to their controls via for/id; the storage token field opts out of
                     browser autofill so a saved login password is not offered for it. -->
                <form id="source-form" onsubmit="return configureSource(event)">
                    <label for="data-source" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Data Source</label>
                    <select id="data-source" onchange="toggleSourceFields()"
                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
                        <option value="keboola">Keboola</option>
                        <option value="bigquery">BigQuery</option>
                        <option value="local">Local / CSV</option>
                    </select>
                    <div id="keboola-fields">
                        <label for="keboola-url" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Keboola URL</label>
                        <input type="url" id="keboola-url" placeholder="https://connection.keboola.com"
                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
                        <label for="keboola-token" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Storage API Token</label>
                        <input type="password" id="keboola-token" autocomplete="off" placeholder="Your Keboola storage token"
                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
                    </div>
                    <div id="bigquery-fields" style="display: none;">
                        <label for="bq-project" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">GCP Project</label>
                        <input type="text" id="bq-project" placeholder="my-gcp-project"
                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
                        <label for="bq-location" style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Location</label>
                        <input type="text" id="bq-location" value="us" placeholder="us"
                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
                    </div>
                    <button type="submit" class="btn btn-primary" style="width: 100%;" id="btn-source">
                        Configure Data Source
                    </button>
                    <button type="button" onclick="skipToStep(4)" class="btn btn-secondary" style="width: 100%; margin-top: 8px;" id="btn-skip-source">
                        Skip (configure later)
                    </button>
                </form>
            </div>
            <!-- Step 3: Discover Tables -->
            <div id="step-3" style="display: none;">
                <p style="font-size: 14px; color: #6b7280; margin-bottom: 16px;">
                    Discover and register tables from your data source.
                </p>
                <button onclick="discoverTables()" class="btn btn-primary" style="width: 100%;" id="btn-discover">
                    Discover Tables
                </button>
                <div id="discover-result" style="display: none; margin-top: 12px; padding: 12px; background: #f0fdf4; border-radius: 6px; font-size: 14px;"></div>
                <button onclick="goToStep(4)" class="btn btn-primary" style="width: 100%; margin-top: 12px; display: none;" id="btn-next-sync">
                    Continue
                </button>
            </div>
            <!-- Step 4: First Sync & Done -->
            <div id="step-4" style="display: none;">
                <p style="font-size: 14px; color: #6b7280; margin-bottom: 16px;">
                    Start the first data sync and go to your dashboard.
                </p>
                <button onclick="triggerSync()" class="btn btn-primary" style="width: 100%;" id="btn-sync">
                    Start First Sync
                </button>
                <a href="/dashboard" class="btn btn-primary" style="width: 100%; margin-top: 12px; display: none; text-align: center; text-decoration: none;" id="btn-dashboard">
                    Go to Dashboard
                </a>
            </div>
        </div>
    </div>
 </div>
 <script>
 let token = '';
 const steps = {
    1: { title: 'Setup Agnes', desc: 'Create your admin account to get started.' },
    2: { title: 'Data Source', desc: 'Connect to your data source.' },
    3: { title: 'Discover Tables', desc: 'Find and register tables from your data source.' },
    4: { title: 'Almost Done', desc: 'Start syncing data and open your dashboard.' },
 };
 /**
  * Display `msg` in the #status-msg banner.
  * type === 'error' selects the red palette; any other value selects the green one.
  */
 function showStatus(msg, type) {
    const isError = type === 'error';
    const banner = document.getElementById('status-msg');
    const palette = isError
        ? { background: '#fef2f2', color: '#dc2626' }
        : { background: '#f0fdf4', color: '#16a34a' };
    banner.textContent = msg;
    banner.style.display = 'block';
    banner.style.background = palette.background;
    banner.style.color = palette.color;
 }
 /** Hide the #status-msg banner (its text is left untouched). */
 function hideStatus() {
    const banner = document.getElementById('status-msg');
    banner.style.display = 'none';
 }
 /**
  * Switch the wizard to step `n` (1-4).
  * Clears the status banner, shows only the #step-n panel, colours the progress
  * dots up to and including `n`, and updates the title/description from `steps`.
  */
 function goToStep(n) {
    hideStatus();
    [1, 2, 3, 4].forEach((idx) => {
        const panel = document.getElementById('step-' + idx);
        panel.style.display = idx === n ? 'block' : 'none';
        const dot = document.getElementById('step-dot-' + idx);
        dot.style.background = idx <= n ? 'var(--primary, #2563eb)' : '#e5e7eb';
    });
    const heading = document.getElementById('wizard-title');
    heading.textContent = steps[n].title;
    const blurb = document.getElementById('wizard-description');
    blurb.textContent = steps[n].desc;
 }
 // Thin alias for goToStep(n); used by the "Skip (configure later)" button's onclick.
 function skipToStep(n) {
    goToStep(n);
 }
 /**
  * Show the credential fields that match the selected data source.
  * Only 'keboola' and 'bigquery' have a field group; any other selection hides both.
  */
 function toggleSourceFields() {
    const selected = document.getElementById('data-source').value;
    for (const kind of ['keboola', 'bigquery']) {
        const group = document.getElementById(kind + '-fields');
        group.style.display = selected === kind ? 'block' : 'none';
    }
 }
 /**
  * POST `body` as JSON to `url` and return the parsed JSON response.
  *
  * Adds a Bearer Authorization header when a setup token is held.
  * On HTTP 401 the token is cleared (memory and sessionStorage), a
  * "session expired" banner is shown, and Error('Session expired') is thrown.
  * On any other non-2xx status an Error is thrown whose message is taken from
  * the response's `detail` field when available.
  *
  * @param {string} url  Endpoint to call.
  * @param {*} [body]    JSON-serialisable payload; when undefined no body is sent.
  * @returns {Promise<*>} Parsed JSON response body.
  * @throws {Error} On 401, on a non-OK status, or when a successful response is not JSON.
  */
 async function apiCall(url, body) {
    const headers = { 'Content-Type': 'application/json' };
    if (token) headers['Authorization'] = 'Bearer ' + token;
    const resp = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
    if (resp.status === 401) {
        token = '';
        sessionStorage.removeItem('setup_token');
        showStatus('Session expired. Please refresh the page and start over.', 'error');
        throw new Error('Session expired');
    }
    // Error pages from a proxy or crashed server may not be JSON; do not let the
    // parser's SyntaxError ("Unexpected token <") become the user-facing message.
    let data;
    let parsed = true;
    try {
        data = await resp.json();
    } catch (parseErr) {
        parsed = false;
    }
    if (!resp.ok) {
        const detail = parsed && data ? data.detail : null;
        let message = 'Request failed';
        if (typeof detail === 'string' && detail) {
            message = detail;
        } else if (Array.isArray(detail) && detail.length > 0) {
            // e.g. a list of validation errors, each carrying a `msg` field
            message = detail
                .map((item) => (item && item.msg) || JSON.stringify(item))
                .join('; ');
        } else if (!parsed) {
            message = 'Request failed (HTTP ' + resp.status + ')';
        }
        throw new Error(message);
    }
    if (!parsed) throw new Error('Server returned a non-JSON response');
    return data;
 }
 /**
  * Submit handler for the admin form (step 1).
  * Posts the email/password to /auth/bootstrap, keeps the returned access token
  * in memory and in sessionStorage, then advances to step 2. Errors are shown in
  * the status banner. Always returns false; the default submit is cancelled.
  */
 async function createAdmin(e) {
    e.preventDefault();
    const submitBtn = document.getElementById('btn-admin');
    submitBtn.disabled = true;
    submitBtn.textContent = 'Creating...';
    try {
        const credentials = {
            email: document.getElementById('admin-email').value,
            password: document.getElementById('admin-password').value,
        };
        const response = await apiCall('/auth/bootstrap', credentials);
        token = response.access_token;
        sessionStorage.setItem('setup_token', token);
        goToStep(2);
    } catch (failure) {
        showStatus(failure.message, 'error');
    } finally {
        // Restore the button whether or not the request succeeded.
        submitBtn.disabled = false;
        submitBtn.textContent = 'Create Admin Account';
    }
    return false;
 }
 /**
  * Submit handler for the data-source form (step 2).
  * Posts the selected source and its fields to /api/admin/configure, then
  * advances to step 4 for 'local' or step 3 for any other source and shows a
  * success banner. Errors are shown in the status banner.
  * Always returns false; the default submit is cancelled via preventDefault().
  */
 async function configureSource(e) {
    e.preventDefault();
    const btn = document.getElementById('btn-source');
    btn.disabled = true;
    btn.textContent = 'Verifying...';
    try {
        const src = document.getElementById('data-source').value;
        const body = { data_source: src };
        if (src === 'keboola') {
            body.keboola_url = document.getElementById('keboola-url').value;
            body.keboola_token = document.getElementById('keboola-token').value;
        } else if (src === 'bigquery') {
            body.bigquery_project = document.getElementById('bq-project').value;
            body.bigquery_location = document.getElementById('bq-location').value;
        }
        await apiCall('/api/admin/configure', body);
        // goToStep() hides the status banner, so navigate first and report
        // success afterwards; otherwise the message is never visible.
        goToStep(src === 'local' ? 4 : 3);
        showStatus('Connection verified!', 'success');
    } catch (err) {
        showStatus(err.message, 'error');
    } finally {
        btn.disabled = false;
        btn.textContent = 'Configure Data Source';
    }
    return false;
 }
 /**
  * Click handler for "Discover Tables" (step 3).
  * Posts to /api/admin/discover-and-register, shows the registered/skipped
  * counts from the response, hides the discover button and reveals "Continue".
  * Errors are shown in the status banner.
  */
 async function discoverTables() {
    const btn = document.getElementById('btn-discover');
    btn.disabled = true;
    btn.textContent = 'Discovering...';
    try {
        // Go through apiCall() so an expired token (401) is handled the same way
        // as in the other steps. With no body argument JSON.stringify(undefined)
        // is undefined, so the request is sent without a body, as before.
        const data = await apiCall('/api/admin/discover-and-register');
        const el = document.getElementById('discover-result');
        el.style.display = 'block';
        el.textContent = `Registered ${data.registered} tables, skipped ${data.skipped}.`;
        document.getElementById('btn-next-sync').style.display = 'block';
        btn.style.display = 'none';
    } catch (err) {
        showStatus(err.message, 'error');
    } finally {
        btn.disabled = false;
        btn.textContent = 'Discover Tables';
    }
 }
 /**
  * Click handler for "Start First Sync" (step 4).
  * Posts to /api/sync/trigger; on a 2xx response hides the sync button, reveals
  * the dashboard link and shows a success banner. On a network error or a
  * non-2xx response the error is shown and the button is re-enabled for retry.
  */
 async function triggerSync() {
    const btn = document.getElementById('btn-sync');
    btn.disabled = true;
    btn.textContent = 'Starting sync...';
    try {
        const headers = {};
        if (token) headers['Authorization'] = 'Bearer ' + token;
        const resp = await fetch('/api/sync/trigger', { method: 'POST', headers });
        // fetch() only rejects on network failure; an HTTP error status must be
        // checked explicitly or a 401/500 would be reported as "Sync started!".
        if (!resp.ok) {
            let detail = '';
            try {
                const data = await resp.json();
                if (data && typeof data.detail === 'string') detail = data.detail;
            } catch (parseErr) {
                // Non-JSON error body: fall back to the generic message below.
            }
            throw new Error(detail || 'Failed to start sync (HTTP ' + resp.status + ')');
        }
        btn.style.display = 'none';
        document.getElementById('btn-dashboard').style.display = 'block';
        showStatus('Sync started! You can now go to your dashboard.', 'success');
    } catch (err) {
        showStatus(err.message, 'error');
        btn.disabled = false;
        btn.textContent = 'Start First Sync';
    }
 }
 // Restore token from sessionStorage (in case of page reload).
 // Only the token is restored: nothing here calls goToStep(), so after a reload
 // the markup's default (step 1 visible, steps 2-4 hidden) is what is displayed.
 const savedToken = sessionStorage.getItem('setup_token');
 if (savedToken) token = savedToken;
 </script>
 {% endblock %}
--- a/docker-compose.ci.yml
+++ b/docker-compose.ci.yml
@ -0,0 +1,11 @@
 # CI smoke test overlay — minimal config for testing in GitHub Actions.
 # Usage: docker compose -f docker-compose.yml -f docker-compose.ci.yml up -d
 services:
  app:
    environment:
      - JWT_SECRET_KEY=smoke-test-ci-key-minimum-32-chars-xx
      - SESSION_SECRET=smoke-test-session-key-32-chars-min-x
      - DATA_DIR=/data
      - TESTING=0
    ports:
      - "8000:8000"
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@ -1,17 +1,18 @@
 # Production override — uses pre-built GHCR image instead of local build.
 # Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
 # Override tag: AGNES_TAG=stable-2026.04.3 docker compose -f ... up -d
 services:
  app:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  scheduler:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  extract:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  telegram-bot:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  ws-gateway:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  corporate-memory:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  session-collector:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -7,6 +7,7 @@ services:
    volumes:
      - data:/data
      - ./config:/app/config:ro
      # - ./custom-connectors:/app/connectors/custom:ro  # Tier A: AI-generated connectors
    env_file: .env
    environment:
      - DATA_DIR=/data
--- a/docs/RELEASE_TEMPLATE.md
+++ b/docs/RELEASE_TEMPLATE.md
@ -0,0 +1,37 @@
 # Release Notes Template
 Use this template when adding a new entry to `CHANGELOG.md`.
 ---
 ## stable-YYYY.MM.N
 **Image:** `ghcr.io/keboola/agnes-the-ai-analyst:stable-YYYY.MM.N`
 **Digest:** `sha256:...` (from `docker inspect --format='{{index .RepoDigests 0}}'`)
 **Date:** YYYY-MM-DD
 ### Added
 - Feature description
 ### Changed
 - Change description
 ### Fixed
 - Bug fix description
 ### Breaking Changes
 - Description of breaking change
 - **Migration guide:** Steps to upgrade from previous version
 ### Deprecated
 - Description of deprecated feature (will be removed in YYYY.MM.N)
 ---
 ## Guidelines
 - Every merge to `main` creates a new `stable-YYYY.MM.N` release
 - Include the image digest for verification with `cosign verify`
 - Breaking changes require `BREAKING:` prefix in commit message
 - Migration guides must include exact commands or config changes
 - If a release deprecates the previous stable, note it explicitly
--- a/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md
+++ b/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md
@ -0,0 +1,527 @@
 # Multi-Instance Deployment & Versioning — Design Spec
 ## Goal
 Make Agnes deployable to 20+ independent customer instances via self-service, with safe versioning that prevents one customer's PR from breaking another's deployment.
 ## Context
 Agnes is an open-source AI Data Analyst platform. Customers (or their AI agents) deploy it as a Docker image on their own infrastructure. Each instance connects to different data sources (Keboola, BigQuery, Jira, custom).
 **Key constraints:**
 - Customers range from semi-technical to non-technical, assisted by AI agents
 - Cloud-agnostic (GCP, AWS, Azure, on-prem, VPS)
 - One repo, one Docker image, many instances
 - Community PRs must not break existing customers
 - AI agent is the primary "installer" and "developer"
 ---
 ## 1. Versioning & Release Channels
 ### CalVer: `YYYY.MM.N`
 Format: year.month.sequential-number. Example: `2026.04.1`, `2026.04.2`, `2026.05.1`.
 No manual release decisions. Every merge to main is a release.
 ### Three channels
 | Channel | Floating tag | Versioned tag | Source | Who uses it |
 |---------|-------------|---------------|--------|-------------|
 | **dev** | `:dev` | `:dev-2026.04.N` | Every CI-passing push on any feature branch | Developers, PR testing |
 | **stable** | `:stable` | `:stable-2026.04.N` | Every merge to main + CI pass | All production customers |
 | **deprecated** | — | `:deprecated-2026.04.N` | Previous stable after breaking change or failed smoke test | Grace period (30 days) |
 Every image also gets a `:sha-abc1234` tag for exact commit traceability.
 ### Tag lifecycle
 ```
 feature branch push → CI ✅ → :dev + :dev-2026.04.N + :sha-abc1234
                         ❌ → nothing pushed
 merge to main       → CI ✅ → :stable + :stable-2026.04.N + :sha-abc1234
                         ❌ → merge blocked (CI required)
                                │
                                ▼
                         smoke test on canary VM
                                │
                         ✅ → :stable confirmed
                         ❌ → alert, rollback canary to previous :stable
                              broken build tagged :deprecated-2026.04.N
 ```
 ### Version numbering
 CalVer `YYYY.MM.N`, where N is a single auto-incrementing counter per month shared by the dev and stable channels (a deprecated tag reuses the number of the build it retags).
 Example timeline:
 ```
 Apr 8  feature/foo push     → :dev-2026.04.1
 Apr 8  feature/bar push     → :dev-2026.04.2
 Apr 8  merge foo to main    → :stable-2026.04.3
 Apr 9  feature/baz push     → :dev-2026.04.4
 Apr 9  merge bar to main    → :stable-2026.04.5
 ```
 This avoids confusion — version `2026.04.3` exists only once, in one channel.
 ### Customer pins version
 ```yaml
 # docker-compose.prod.yml
 # Auto-update (recommended): always latest stable
 image: ghcr.io/keboola/agnes-the-ai-analyst:stable
 # Pinned: specific stable release, manual update
 image: ghcr.io/keboola/agnes-the-ai-analyst:stable-2026.04.3
 # Testing: latest dev
 image: ghcr.io/keboola/agnes-the-ai-analyst:dev
 # Testing: specific dev build
 image: ghcr.io/keboola/agnes-the-ai-analyst:dev-2026.04.2
 ```
 ### Main = stable
 - `main` branch is always releasable
 - Every merge to main triggers a new stable release
 - Feature branches are the dev channel
 - No promotion pipeline, no manual approval for releases
 - Smoke test is a post-deploy safety net, not a gate
 ---
 ## 2. Breaking Change Detection
 ### What is a breaking change
 - `_meta` table schema change (add/remove column)
 - `_remote_attach` table schema change
 - API endpoint removed or response field removed
 - DuckDB system schema migration that drops data
 - CLI command removed or argument renamed
 - `instance.yaml` required key added
 ### Automated detection in CI
 Every PR runs:
 1. **Contract tests**: `_meta` and `_remote_attach` schema validation against frozen spec
 2. **OpenAPI diff**: Compare PR's `openapi.json` against main's. Flag removed endpoints/fields.
 3. **DuckDB schema diff**: Compare table definitions in system.duckdb
 4. **Config diff**: Compare `instance.yaml.example` required keys
 5. **Full connector matrix**: ALL connectors tested, not just changed ones
 If breaking change detected:
 - PR gets `BREAKING` label automatically
 - Requires 2 reviewers (elevated review)
 - Commit message must have `BREAKING:` prefix
 - CHANGELOG.md entry with migration guide required
 - On merge: previous stable tagged as `:deprecated-YYYY.MM.N`
 ### Deprecated channel
 When a breaking change merges:
 1. Previous stable image retagged to `:deprecated-2026.04.N`
 2. New build becomes `:stable` + `:stable-2026.04.(N+1)`
 3. Health endpoint on deprecated version shows warning:
   ```json
   {"warnings": ["Running deprecated version 2026.04.3. Update to stable."]}
   ```
 4. Deprecated images removed from GHCR after 30 days
 ---
 ## 3. Smoke Test (Post-Deploy Safety Net)
 ### What it tests
 Automated sequence run on canary VM after every `:stable` deploy:
 ```
 1. GET  /api/health                    → status != "unhealthy"
 2. POST /auth/token                    → 200 (valid credentials)
 3. GET  /api/catalog/tables            → count > 0
 4. POST /api/query {sql: "SELECT 1"}   → 200 + rows
 5. POST /api/sync/trigger              → 200
 6. (wait 30s)
 7. GET  /api/health                    → check no new errors
 ```
 ### On failure
 1. Alert (GitHub issue + optional webhook)
 2. Canary VM rolled back to previous stable: `docker compose pull && docker compose up -d` with previous tag
 3. Failed build tagged `:deprecated-YYYY.MM.N`
 4. `:stable` tag reverted to previous good build
 ### Implementation
 GitHub Actions workflow triggered after the build-and-push workflow completes:
 ```yaml
 smoke-test:
  needs: build-and-push
  runs-on: ubuntu-latest
  steps:
    - name: Deploy to canary
      run: |
        gcloud compute ssh canary-vm --command="
          cd /opt/agnes &&
          docker compose pull &&
          docker compose up -d"
    - name: Wait for healthy
      run: |
        for i in $(seq 1 30); do
          STATUS=$(curl -sf canary:8000/api/health | jq -r .status)
          [ "$STATUS" != "unhealthy" ] && break
          sleep 10
        done
    - name: Run smoke tests
      run: |
        # auth, catalog, query, sync checks
        ./scripts/smoke-test.sh canary:8000
    - name: Rollback on failure
      if: failure()
      run: |
        # retag and rollback
 ```
 ---
 ## 4. Self-Service Deployment
 ### Target experience
 Customer (or their AI agent) goes from zero to running instance:
 ```bash
 # 1. Get the code
 git clone https://github.com/keboola/agnes-the-ai-analyst.git
 cd agnes-the-ai-analyst
 # 2. Start it
 docker compose up -d
 # 3. Open browser or use API
 # First visit: /setup wizard (no users exist)
 # Or headless: curl -X POST localhost:8000/auth/bootstrap ...
 ```
 ### Two setup modes
 **A) Interactive (browser):**
 - First visit when no users exist → redirected to `/setup`
 - Step 1: Create admin account (email + password)
 - Step 2: Choose data source (Keboola / BigQuery / CSV / Custom)
 - Step 3: Enter credentials (token, URL)
 - Step 4: Auto-discover and register tables
 - Step 5: Trigger first sync
 - Done → redirect to dashboard
 **B) Headless (AI agent / CLI):**
 ```bash
 # Bootstrap admin
 curl -X POST http://localhost:8000/auth/bootstrap \
  -H "Content-Type: application/json" \
  -d '{"email":"admin@company.com","password":"SecurePass123!"}'
 # Configure data source
 curl -X POST http://localhost:8000/api/admin/configure \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"data_source":"keboola","keboola_token":"...","keboola_url":"..."}'
 # Discover and register tables
 curl -X POST http://localhost:8000/api/admin/discover-and-register \
  -H "Authorization: Bearer $TOKEN"
 # Trigger first sync
 curl -X POST http://localhost:8000/api/sync/trigger \
  -H "Authorization: Bearer $TOKEN"
 ```
 Both modes lead to the same result. AI agents use the headless mode.
 ### Auto-configuration
 On first `docker compose up` with no `.env`:
 - `JWT_SECRET_KEY` auto-generated and persisted to `/data/state/.jwt_secret`
 - `SESSION_SECRET` auto-generated similarly
 - App starts in "setup mode" — only `/setup`, `/auth/bootstrap`, and `/api/health` accessible
 On first `docker compose up` with `.env` containing `KEBOOLA_STORAGE_TOKEN`:
 - Auto-discovers tables from Keboola on first sync
 - Skips manual table registration step
 ### What customer must provide
 | Required | Optional |
 |----------|----------|
 | Server with Docker | Custom domain + TLS |
 | Admin email + password | Google OAuth credentials |
 | Data source credentials (Keboola token OR BigQuery creds OR CSV files) | Telegram bot token |
 | | Jira webhook secret |
 ### What customer must NOT do
 - Edit YAML manually (setup wizard generates `instance.yaml`)
 - Generate JWT secret (auto-generated)
 - Register tables manually (auto-discovery)
 - Understand DuckDB internals
 ---
 ## 5. Custom Connectors (Three Tiers)
 All tiers produce the same output: `extract.duckdb` with `_meta` table + `data/*.parquet`. Orchestrator treats them identically.
 ### Tier A: Local mount (fastest, AI-generated)
 Customer's AI agent generates a connector. Lives outside Docker image, survives updates.
 ```
 /opt/agnes/
 ├── docker-compose.yml              ← official image
 ├── docker-compose.override.yml     ← customer additions
 └── custom-connectors/
    └── snowflake/
        ├── extractor.py
        └── requirements.txt
 ```
 ```yaml
 # docker-compose.override.yml
 services:
  app:
    volumes:
      - ./custom-connectors:/app/connectors/custom:ro
 ```
 Orchestrator scans `connectors/custom/*/` in addition to built-in connectors.
 **How the AI agent creates one:**
 1. Reads CLAUDE.md → understands extract.duckdb contract
 2. Reads existing connector as reference (e.g., `connectors/keboola/extractor.py`)
 3. Generates `custom-connectors/snowflake/extractor.py`
 4. Runs contract test to validate output
 5. Done — orchestrator picks it up on next rebuild
 **Requirements for this to work:**
 - CLAUDE.md must perfectly describe the contract
 - Contract test must be runnable standalone
 - Existing connectors must be readable as examples
 - Clear error messages when contract doesn't match
 ### Tier B: Standalone container (complex dependencies)
 For connectors needing their own runtime (Java, .NET, heavy Python packages).
 ```yaml
 # docker-compose.override.yml
 services:
  connector-sap:
    build: ./custom-connectors/sap
    volumes:
      - data:/data
    environment:
      - DATA_DIR=/data
      - SAP_HOST=...
    profiles:
      - extract
 ```
 Connector is its own Docker image. Writes to `/data/extracts/sap/extract.duckdb`. Orchestrator finds it automatically.
 ### Tier C: Community PR (shared with all)
 Connector contributed to main repo via PR. After merge, available in official image for all customers.
 ```
 connectors/
 ├── keboola/          ← built-in
 ├── bigquery/         ← built-in
 ├── jira/             ← built-in
 └── snowflake/        ← community contributed
 ```
 **PR requirements:**
 - Must pass contract tests
 - Must include tests
 - Must not modify shared code (orchestrator, API, auth)
 - CI runs full connector matrix
 ---
 ## 6. CI/CD Pipeline
 ### On feature branch push
 ```yaml
 ci.yml:
  - tests (all 654+)
  - contract tests (all connectors)
  - docker build
  - push :dev + :dev-YYYY.MM.N + :sha-xxx to GHCR
 ```
 ### On merge to main
 ```yaml
 release.yml:
  - tests (all)
  - contract tests (all connectors)
  - breaking change detection (OpenAPI diff, schema diff)
  - docker build
  - push :stable + :stable-YYYY.MM.N + :sha-xxx to GHCR
  - trigger smoke test on canary
 smoke-test.yml (triggered):
  - deploy to canary VM
  - run smoke test sequence
  - on failure: rollback canary, tag build as deprecated, create alert
 ```
 ### On PR
 ```yaml
 pr-check.yml:
  - tests
  - contract tests
  - breaking change detection
  - label PR: "BREAKING" if detected
  - require 2 reviewers if breaking
 ```
 ---
 ## 7. Infrastructure (Cloud-Agnostic)
 ### Primary: Docker Compose
 Works everywhere Docker runs. This is the default and only required deployment method.
 ```bash
 git clone https://github.com/keboola/agnes-the-ai-analyst.git
 cd agnes-the-ai-analyst
 docker compose up -d
 ```
 ### Optional: Terraform (GCP)
 For automated provisioning. Lives in `infra/` with GCS remote state backend.
 ```bash
 cd infra
 terraform workspace new customer-name
 terraform apply -var-file=instances/customer-name.tfvars
 ```
 Creates VM, installs Docker, clones repo, generates `.env` and `instance.yaml`, starts Docker Compose.
 ### Optional: Caddy TLS
 Production profile adds Caddy reverse proxy with automatic Let's Encrypt:
 ```bash
 DOMAIN=data.customer.com docker compose --profile production up -d
 ```
 ### Directory layout on customer server
 ```
 /opt/agnes/                           ← git clone
 ├── docker-compose.yml                ← official
 ├── docker-compose.prod.yml           ← GHCR images
 ├── docker-compose.override.yml       ← customer customizations
 ├── .env                              ← secrets (gitignored)
 ├── config/
 │   └── instance.yaml                 ← generated by setup wizard
 ├── custom-connectors/                ← Tier A connectors
 │   └── snowflake/
 └── Caddyfile                         ← TLS config
 /data/                                ← Docker volume (persistent)
 ├── state/system.duckdb               ← users, registry, sync state
 ├── analytics/server.duckdb           ← views into extracts
 └── extracts/                         ← per-source data
    ├── keboola/extract.duckdb
    ├── bigquery/extract.duckdb
    └── snowflake/extract.duckdb      ← from custom connector
 ```
 ---
 ## 8. AI Agent as Primary Installer
 CLAUDE.md and documentation must be optimized for AI agent consumption:
 ### CLAUDE.md requirements
 - Complete extract.duckdb contract with exact SQL for `_meta` and `_remote_attach`
 - Step-by-step setup instructions with exact curl commands
 - Existing connectors as reference for AI-generated new ones
 - Clear error messages explaining what went wrong and how to fix
 ### API requirements
 - All setup operations available as API calls (not just UI)
 - Self-describing error messages: `"Missing KEBOOLA_STORAGE_TOKEN. Set it in .env or pass via /api/admin/configure"`
 - `/api/health` returns structured diagnostics AI agent can parse
 - `/api/admin/configure` accepts data source config without file editing
 ### Documentation requirements
 - Machine-readable (no screenshots, no "click here")
 - Every manual step has an equivalent API/CLI command
 - QUICKSTART.md optimized for copy-paste by AI agent
 ---
 ## 9. What Needs to Be Built
 ### Must have (blocks multi-instance)
 | # | What | Effort |
 |---|------|--------|
 | 1 | CalVer auto-tagging in CI (release.yml) | 1 day |
 | 2 | Smoke test script + CI workflow | 1 day |
 | 3 | Breaking change detection in CI (OpenAPI diff, contract diff) | 2 days |
 | 4 | `/setup` wizard (web) + `/api/admin/configure` (headless) | 3 days |
 | 5 | Auto-generate JWT_SECRET_KEY on first start | 0.5 day |
 | 6 | Auto-discovery for Keboola tables on first sync | 1 day |
 | 7 | Custom connector mount support in orchestrator | 1 day |
 | 8 | `CHANGELOG.md` + release notes template | 0.5 day |
 | 9 | Health endpoint version + channel info | 0.5 day |
 ### Should have (improves experience)
 | # | What | Effort |
 |---|------|--------|
 | 10 | Deprecated version warning in health endpoint | 0.5 day |
 | 11 | `/api/admin/discover-and-register` auto-discovery endpoint | 1 day |
 | 12 | Standalone container connector example (Tier B) | 0.5 day |
 | 13 | CLAUDE.md optimization for AI agent setup | 1 day |
 | 14 | Terraform module refactor for multi-workspace | 1 day |
 ### Nice to have (future)
 | # | What |
 |---|------|
 | 15 | Community connector contribution guide |
 | 16 | Instance health dashboard (central monitoring) |
 | 17 | Automated backup (GCP disk snapshots) |
 | 18 | Usage analytics (opt-in telemetry) |
 ---
 ## Non-Goals
 - Multi-tenancy in single process (each customer = separate instance)
 - Kubernetes/Helm (Docker Compose is sufficient for target scale)
 - Paid tier / license keys (open-source, monetization TBD)
 - GUI for connector development (AI agent + CLAUDE.md is sufficient)
--- a/scripts/generate_openapi.py
+++ b/scripts/generate_openapi.py
@ -0,0 +1,16 @@
 """Generate OpenAPI snapshot from the current FastAPI app."""
 import json
 import os
 import sys
 # Put the repository root (the parent of this scripts/ directory) first on
 # sys.path so the `app` package resolves when the script is run directly.
 _script_dir = os.path.dirname(os.path.abspath(__file__))
 _repo_root = os.path.dirname(_script_dir)
 sys.path.insert(0, _repo_root)
 # Fallback settings, applied only when the caller has not exported them.
 # NOTE(review): set ahead of the import below, presumably because app.main
 # reads them at import time — confirm.
 for _var, _fallback in (
     ("TESTING", "1"),
     ("JWT_SECRET_KEY", "snapshot-generation-key-32-chars-min!!"),
 ):
     os.environ.setdefault(_var, _fallback)
 from app.main import create_app  # noqa: E402
 app = create_app()
 schema = app.openapi()
 # Sorted keys and a fixed indent keep the emitted snapshot text stable.
 sys.stdout.write(json.dumps(schema, indent=2, sort_keys=True) + "\n")
--- a/scripts/smoke-test.sh
+++ b/scripts/smoke-test.sh
@ -0,0 +1,97 @@
 #!/usr/bin/env bash
 # Agnes smoke test — verifies a running instance is functional.
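 # Usage: ./scripts/smoke-test.sh [base-url]
 # Default: http://localhost:8000
 # Set SMOKE_TOKEN to an existing access token to run the authenticated
 # checks when bootstrap is skipped because users already exist.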
 set -euo pipefail
 HOST="${1:-http://localhost:8000}"
 PASS=0
 FAIL=0
 TOKEN=""
 check() {
    # Report one named result and bump the matching global counter.
    #   $1  label printed after PASS/FAIL
    #   $2  the literal string "true" counts as a pass; anything else is a failure
    local label="$1" verdict="$2"
    case "$verdict" in
        true)
            echo "  PASS $label"
            PASS=$((PASS + 1))
            ;;
        *)
            echo "  FAIL $label"
            FAIL=$((FAIL + 1))
            ;;
    esac
 }
 echo "Smoke test: $HOST"
 echo "---"
 # 1. Health check — abort the whole run if the instance cannot be reached
 #    or reports itself as unhealthy; later steps would be meaningless.
 HEALTH=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable")
 case "$HEALTH" in
    unhealthy|unreachable)
        echo "  FATAL: health=$HEALTH"
        exit 1
        ;;
 esac
 check "health ($HEALTH)" "true"
 # 2. Health has version fields (version, channel, schema_version)
 HAS_VERSION=$(curl -sf "$HOST/api/health" | python3 -c "
 import sys,json
 d=json.load(sys.stdin)
 print('true' if 'version' in d and 'channel' in d and 'schema_version' in d else 'false')
 " 2>/dev/null || echo "false")
 check "health version fields" "$HAS_VERSION"
 # 3. Bootstrap (only works on fresh DB; 403 means users exist)
 # The response body goes to a private mktemp file instead of a fixed,
 # predictable /tmp path, and is removed when the script exits.
 BOOT_BODY=$(mktemp "${TMPDIR:-/tmp}/smoke_boot.XXXXXX")
 trap 'rm -f "$BOOT_BODY"' EXIT
 # When the request fails curl has already printed "000" through -w, so the
 # fallback assigns the value rather than echoing a second "000" after it.
 BOOT_HTTP=$(curl -s -o "$BOOT_BODY" -w "%{http_code}" -X POST "$HOST/auth/bootstrap" \
  -H "Content-Type: application/json" \
  -d '{"email":"smoke@test.local","name":"Smoke Test","password":"SmokeTest123!"}' 2>/dev/null) || BOOT_HTTP="000"
 if [ "$BOOT_HTTP" = "200" ]; then
    # Fresh instance: keep the new admin's token for the authenticated steps.
    TOKEN=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['access_token'])" "$BOOT_BODY" 2>/dev/null || echo "")
    check "bootstrap (new admin)" "true"
 elif [ "$BOOT_HTTP" = "403" ]; then
    # Users already exist: fall back to a caller-supplied token, if any.
    TOKEN="${SMOKE_TOKEN:-}"
    echo "  SKIP bootstrap (users exist)"
 else
    check "bootstrap (HTTP $BOOT_HTTP)" "false"
 fi
 # 4. Query SELECT 1 (requires auth)
 if [ -z "$TOKEN" ]; then
    echo "  SKIP query (no token)"
 else
    # Passes when the response JSON carries a non-empty "rows" list.
    QUERY_OK=$(curl -sf -X POST "$HOST/api/query" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"sql":"SELECT 1 as test"}' | python3 -c "
 import sys,json
 d=json.load(sys.stdin)
 print('true' if len(d.get('rows',[])) > 0 else 'false')
 " 2>/dev/null || echo "false")
    check "query SELECT 1" "$QUERY_OK"
 fi
 # 5. Sync trigger
 if [ -n "$TOKEN" ]; then
    # When the request fails curl has already printed "000" through -w, so the
    # fallback assigns the value rather than echoing a second "000" after it.
    SYNC_HTTP=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$HOST/api/sync/trigger" \
      -H "Authorization: Bearer $TOKEN" 2>/dev/null) || SYNC_HTTP="000"
    # Both 200 and 202 count as a successful trigger.
    if [[ "$SYNC_HTTP" =~ ^(200|202)$ ]]; then
        check "sync trigger" "true"
    else
        check "sync trigger (HTTP $SYNC_HTTP)" "false"
    fi
 else
    echo "  SKIP sync (no token)"
 fi
 # 6. Post-sync health (wait briefly)
 sleep 5
 HEALTH2=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable")
 case "$HEALTH2" in
    unhealthy|unreachable)
        check "post-sync health ($HEALTH2)" "false"
        ;;
    *)
        check "post-sync health ($HEALTH2)" "true"
        ;;
 esac
 # Results — exit non-zero when any check failed
 echo ""
 echo "Results: $PASS passed, $FAIL failed"
 if [ "$FAIL" -ne 0 ]; then
    exit 1
 fi
--- a/src/db.py
+++ b/src/db.py
@ -4,12 +4,16 @@ Provides get_system_db() for the system state database
 and get_analytics_db() for the analytics database with parquet views.
 """
 import logging
 import os
 import re
 import shutil
 from pathlib import Path
 import duckdb
 logger = logging.getLogger(__name__)
 _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
 SCHEMA_VERSION = 3
@ -260,6 +264,25 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
    """Create tables if they don't exist. Apply migrations if schema version changed."""
    current = get_schema_version(conn)
    if current < SCHEMA_VERSION:
        # Snapshot before migration for rollback support
        if current > 0:
            try:
                db_path = Path(os.environ.get("DATA_DIR", "./data")) / "state" / "system.duckdb"
                if db_path.exists():
                    # Flush WAL to main DB file before copying
                    try:
                        conn.execute("CHECKPOINT")
                    except Exception:
                        pass  # CHECKPOINT may fail on read-only or in-memory DBs
                    snapshot = db_path.parent / "system.duckdb.pre-migrate"
                    shutil.copy2(str(db_path), str(snapshot))
                    # Also copy WAL if it still exists (belt and suspenders)
                    wal_path = Path(str(db_path) + ".wal")
                    if wal_path.exists():
                        shutil.copy2(str(wal_path), str(snapshot) + ".wal")
                    logger.info("Pre-migration snapshot saved: %s", snapshot)
            except Exception as e:
                logger.warning("Could not create pre-migration snapshot: %s", e)
        conn.execute(_SYSTEM_SCHEMA)
        if current == 0:
            conn.execute(
--- a/tests/snapshots/openapi.json
+++ b/tests/snapshots/openapi.json
--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -144,6 +144,205 @@ class TestGetAnalyticsDb:
            conn.close()
 class TestMigrationSafety:
    """Tests for schema migration correctness, idempotency, and safety snapshots.
    Old-version databases are built by hand with the helpers below and then
    passed through _ensure_schema / get_system_db to exercise the migration path.
    """
    # Minimal v2 table_registry (no is_public column — that comes in v3)
    _V2_TABLE_REGISTRY = """
        CREATE TABLE table_registry (
            id VARCHAR PRIMARY KEY,
            name VARCHAR NOT NULL,
            source_type VARCHAR,
            bucket VARCHAR,
            source_table VARCHAR,
            sync_strategy VARCHAR DEFAULT 'full_refresh',
            query_mode VARCHAR DEFAULT 'local',
            sync_schedule VARCHAR,
            profile_after_sync BOOLEAN DEFAULT true,
            primary_key VARCHAR,
            folder VARCHAR,
            description TEXT,
            registered_by VARCHAR,
            registered_at TIMESTAMP DEFAULT current_timestamp
        );
    """
    # Stub versions of the remaining tables so _ensure_schema doesn't fail.
    # Kept in one place because every hand-built old-version DB needs them.
    _STUB_TABLE_DDL = (
        "CREATE TABLE IF NOT EXISTS users (id VARCHAR PRIMARY KEY, email VARCHAR)",
        "CREATE TABLE IF NOT EXISTS sync_state (table_id VARCHAR PRIMARY KEY)",
        "CREATE TABLE IF NOT EXISTS sync_history (id VARCHAR PRIMARY KEY, table_id VARCHAR)",
        "CREATE TABLE IF NOT EXISTS user_sync_settings (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
        "CREATE TABLE IF NOT EXISTS knowledge_items (id VARCHAR PRIMARY KEY, title VARCHAR)",
        "CREATE TABLE IF NOT EXISTS knowledge_votes (item_id VARCHAR, user_id VARCHAR, PRIMARY KEY(item_id, user_id))",
        "CREATE TABLE IF NOT EXISTS audit_log (id VARCHAR PRIMARY KEY, action VARCHAR)",
        "CREATE TABLE IF NOT EXISTS telegram_links (user_id VARCHAR PRIMARY KEY, chat_id BIGINT)",
        "CREATE TABLE IF NOT EXISTS pending_codes (code VARCHAR PRIMARY KEY, chat_id BIGINT)",
        "CREATE TABLE IF NOT EXISTS script_registry (id VARCHAR PRIMARY KEY, name VARCHAR, source TEXT)",
        "CREATE TABLE IF NOT EXISTS table_profiles (table_id VARCHAR PRIMARY KEY, profile JSON)",
        "CREATE TABLE IF NOT EXISTS dataset_permissions (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
    )
    def _seed_schema_version(self, conn, version):
        """Create the schema_version table on conn and record the given version."""
        conn.execute(
            "CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);"
            f"INSERT INTO schema_version (version) VALUES ({int(version)});"
        )
    def _create_stub_tables(self, conn):
        """Create every stand-in table listed in _STUB_TABLE_DDL on conn."""
        for ddl in self._STUB_TABLE_DDL:
            conn.execute(ddl)
    def _create_v2_db(self, db_path):
        """Create a minimal v2-schema DuckDB file at db_path."""
        import duckdb as _duckdb
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = _duckdb.connect(str(db_path))
        try:
            self._seed_schema_version(conn, 2)
            conn.execute(self._V2_TABLE_REGISTRY)
            self._create_stub_tables(conn)
        finally:
            conn.close()
    def test_v2_to_v3_migration(self, tmp_path, monkeypatch):
        """v2 DB is migrated to the current SCHEMA_VERSION and gains the is_public column."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import duckdb as _duckdb
        from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION
        db_path = tmp_path / "state" / "system.duckdb"
        self._create_v2_db(db_path)
        conn = _duckdb.connect(str(db_path))
        try:
            _ensure_schema(conn)
            assert get_schema_version(conn) == SCHEMA_VERSION
            cols = {
                r[0]
                for r in conn.execute(
                    "SELECT column_name FROM information_schema.columns WHERE table_name='table_registry'"
                ).fetchall()
            }
            assert "is_public" in cols
        finally:
            conn.close()
    def test_migration_idempotency(self, tmp_path, monkeypatch):
        """Calling _ensure_schema twice on a fresh DB raises no error and leaves version at SCHEMA_VERSION."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import duckdb as _duckdb
        from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION
        db_path = tmp_path / "state" / "system.duckdb"
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = _duckdb.connect(str(db_path))
        try:
            _ensure_schema(conn)
            _ensure_schema(conn)
            assert get_schema_version(conn) == SCHEMA_VERSION
        finally:
            conn.close()
    def test_migration_preserves_data(self, tmp_path, monkeypatch):
        """Data inserted before migration is preserved after migration runs."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import duckdb as _duckdb
        from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION
        db_path = tmp_path / "state" / "system.duckdb"
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = _duckdb.connect(str(db_path))
        try:
            # Build a v1 schema manually
            self._seed_schema_version(conn, 1)
            conn.execute("""
                CREATE TABLE table_registry (
                    id VARCHAR PRIMARY KEY,
                    name VARCHAR NOT NULL,
                    folder VARCHAR,
                    sync_strategy VARCHAR,
                    primary_key VARCHAR,
                    description TEXT,
                    registered_by VARCHAR,
                    registered_at TIMESTAMP DEFAULT current_timestamp
                );
            """)
            conn.execute(
                "INSERT INTO table_registry (id, name, description) VALUES ('row1', 'MyTable', 'kept')"
            )
            self._create_stub_tables(conn)
            _ensure_schema(conn)
            assert get_schema_version(conn) == SCHEMA_VERSION
            row = conn.execute(
                "SELECT name, description FROM table_registry WHERE id='row1'"
            ).fetchone()
            assert row is not None, "Pre-migration row was lost"
            assert row[0] == "MyTable"
            assert row[1] == "kept"
        finally:
            conn.close()
    def test_pre_migration_snapshot_created(self, tmp_path, monkeypatch):
        """A pre-migrate snapshot is written when migrating an existing (non-fresh) DB."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        from src.db import get_system_db
        # Create a v2 DB at the expected path before calling get_system_db
        db_path = tmp_path / "state" / "system.duckdb"
        self._create_v2_db(db_path)
        conn = get_system_db()
        try:
            snapshot = tmp_path / "state" / "system.duckdb.pre-migrate"
            assert snapshot.exists(), "Pre-migration snapshot was not created"
        finally:
            conn.close()
    def test_no_snapshot_on_fresh_db(self, tmp_path, monkeypatch):
        """No pre-migrate snapshot is created when initialising a brand-new DB."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        from src.db import get_system_db
        conn = get_system_db()
        try:
            snapshot = tmp_path / "state" / "system.duckdb.pre-migrate"
            assert not snapshot.exists(), "Snapshot should not exist for a fresh DB"
        finally:
            conn.close()
    def test_future_version_is_noop(self, tmp_path, monkeypatch):
        """_ensure_schema does nothing when schema_version > SCHEMA_VERSION."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import duckdb as _duckdb
        from src.db import _ensure_schema, get_schema_version
        db_path = tmp_path / "state" / "system.duckdb"
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = _duckdb.connect(str(db_path))
        try:
            self._seed_schema_version(conn, 99)
            _ensure_schema(conn)
            assert get_schema_version(conn) == 99
        finally:
            conn.close()
 class TestGetAnalyticsDbReadonly:
    def test_analytics_readonly_rejects_malicious_dir_name(self, tmp_path, monkeypatch):
        """Directories with SQL-injection chars in their name are skipped."""
--- a/tests/test_openapi_snapshot.py
+++ b/tests/test_openapi_snapshot.py
@ -0,0 +1,73 @@
 """OpenAPI snapshot test — detect breaking API changes.
 Compares the current app's OpenAPI schema against a committed snapshot.
 Fails if any path or HTTP method has been removed (breaking change).
 To update the snapshot after an intentional change:
    make update-openapi-snapshot
 """
 import json
 import os
 from pathlib import Path
 import pytest
 SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "openapi.json"
@pytest.fixture(scope="module")
 def current_schema():
    os.environ.setdefault("TESTING", "1")
    from app.main import create_app
    app = create_app()
    return app.openapi()
 def test_snapshot_exists():
    """The committed OpenAPI snapshot file must be present on disk."""
    missing_hint = "No OpenAPI snapshot found. Generate one with: make update-openapi-snapshot"
    assert SNAPSHOT_PATH.exists(), missing_hint
 def test_no_removed_paths(current_schema):
    """Every path recorded in the snapshot must still exist in the current schema."""
    if not SNAPSHOT_PATH.exists():
        pytest.skip("No snapshot to compare against")
    recorded = json.loads(SNAPSHOT_PATH.read_text()).get("paths", {})
    live = current_schema.get("paths", {})
    # Paths present in the snapshot but absent from the running app.
    removed = set(recorded).difference(live)
    assert not removed, (
        f"BREAKING: {len(removed)} API path(s) removed: {sorted(removed)}\n"
        "If intentional, run: make update-openapi-snapshot"
    )
 def test_no_removed_methods(current_schema):
    """No HTTP methods should be removed from existing paths.
    Only paths present in both the snapshot and the current schema are
    compared; paths that disappeared entirely are not reported here.
    """
    if not SNAPSHOT_PATH.exists():
        pytest.skip("No snapshot to compare against")
    snapshot = json.loads(SNAPSHOT_PATH.read_text())
    current_paths = current_schema.get("paths", {})
    snapshot_paths = snapshot.get("paths", {})
    # Operation keys a path entry may carry, including 'trace'. Anything else
    # (e.g. 'parameters') is a non-HTTP key and is ignored.
    http_methods = {"get", "post", "put", "delete", "patch", "head", "options", "trace"}
    breaking = []
    # Sorted so the failure message lists paths in a stable order.
    for path in sorted(set(snapshot_paths) & set(current_paths)):
        removed_methods = set(snapshot_paths[path]) - set(current_paths[path])
        removed_http = removed_methods & http_methods
        if removed_http:
            breaking.append(f"  {path}: {sorted(removed_http)}")
    assert not breaking, (
        f"BREAKING: HTTP methods removed from {len(breaking)} path(s):\n"
        + "\n".join(breaking)
        + "\nIf intentional, run: make update-openapi-snapshot"
    )
--- a/tests/test_security.py
+++ b/tests/test_security.py
@ -304,26 +304,37 @@ class TestJwtClaims:
 # ---- JWT Secret Hardening ----
 class TestJwtSecretHardening:
-    def test_raises_without_jwt_secret_in_non_test_env(self):
+    def test_auto_generates_jwt_secret_when_absent(self, tmp_path):
-        """Module-level code must raise RuntimeError when JWT_SECRET_KEY is absent
+        """When JWT_SECRET_KEY is absent and TESTING is not set,
-        and TESTING is not set, preventing accidental production deploys with no secret."""
+        the secret is auto-generated and persisted to a file."""
        saved_key = os.environ.pop("JWT_SECRET_KEY", None)
        saved_testing = os.environ.pop("TESTING", None)
-        # Eject any cached module so the re-import re-executes module-level code
+        saved_data_dir = os.environ.get("DATA_DIR")
        os.environ["DATA_DIR"] = str(tmp_path)
        # Eject cached modules so the re-import re-executes module-level code
        sys.modules.pop("app.auth.jwt", None)
        sys.modules.pop("app.secrets", None)
        try:
            with pytest.raises(RuntimeError, match="JWT_SECRET_KEY environment variable is required"):
            importlib.import_module("app.auth.jwt")
            secret_file = tmp_path / "state" / ".jwt_secret"
            assert secret_file.exists(), "JWT secret file should be auto-generated"
            secret = secret_file.read_text().strip()
            assert len(secret) == 64, "Auto-generated secret should be 64 hex chars (32 bytes)"
        finally:
            # Restore environment before re-importing so the module loads cleanly
            if saved_key is not None:
                os.environ["JWT_SECRET_KEY"] = saved_key
            if saved_testing is not None:
                os.environ["TESTING"] = saved_testing
            if saved_data_dir is not None:
                os.environ["DATA_DIR"] = saved_data_dir
            else:
                os.environ.pop("DATA_DIR", None)
            # If neither was set (bare test run), use TESTING flag so reload works
            if saved_key is None and saved_testing is None:
                os.environ["TESTING"] = "1"
            sys.modules.pop("app.auth.jwt", None)
            sys.modules.pop("app.secrets", None)
            importlib.import_module("app.auth.jwt")
            # Clean up the temporary TESTING flag if we added it
            if saved_key is None and saved_testing is None: