Merge pull request #1 from keboola/feature/v2-fastapi-duckdb-docker-cli

feat: multi-instance deployment (14 must-have items)
2026-04-10 18:08:03 +02:00 · 2026-04-10 18:08:03 +02:00 · dbc57d1de3
commit dbc57d1de3
parent b7a3c8dd13 5836bcde4c
27 changed files with 7008 additions and 67 deletions
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@ -1,8 +1,9 @@
-name: Build & Push
+# SUPERSEDED by release.yml — CalVer tagging with stable/dev channels.
+# Kept for manual trigger only. Automated builds use release.yml.
+name: Build & Push (legacy)

 on:
-  push:
-    branches: [main]
+  workflow_dispatch: {}

 jobs:
  test:
@ -24,27 +25,3 @@ jobs:
        run: pytest tests/ -v --tb=short
        env:
          TESTING: "1"
-
-  build-and-push:
-    needs: test
-    runs-on: ubuntu-latest
-    permissions:
-      packages: write
-      contents: read
-    steps:
-      - uses: actions/checkout@v5
-
-      - name: Log in to GHCR
-        uses: docker/login-action@v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v7
-        with:
-          push: true
-          tags: |
-            ghcr.io/${{ github.repository }}:latest
-            ghcr.io/${{ github.repository }}:${{ github.sha }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,148 @@
+name: Release
+
+on:
+  push:
+    branches: [main, "feature/**"]
+
+permissions:
+  contents: write
+  packages: write
+
+jobs:
+  test:
+    # Runs the pytest suite; build-and-push declares `needs: test`, so no image
+    # is built or tagged unless this job succeeds.
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Install dependencies
+        # Installs the project together with its "dev" extra.
+        run: uv pip install --system ".[dev]"
+
+      - name: Run tests
+        run: pytest tests/ -v --tb=short
+        env:
+          TESTING: "1"
+
+  build-and-push:
+    # Claims a unique CalVer git tag, then builds the image and pushes it to
+    # GHCR under three tags: the moving channel tag (stable/dev), the versioned
+    # tag (<channel>-YYYY.MM.N) and a short-SHA tag.
+    needs: test
+    runs-on: ubuntu-latest
+    outputs:
+      image_tag: ${{ steps.meta.outputs.versioned_tag }}
+      version: ${{ steps.meta.outputs.version }}
+      channel: ${{ steps.meta.outputs.channel }}
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          # Full history and all tags are required to compute the next N.
+          fetch-depth: 0
+          fetch-tags: true
+
+      - name: Claim version tag (with retry to avoid race conditions)
+        id: meta
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script text: branch names are free-form and must never be parsed
+          # as shell code by the runner.
+          REF: ${{ github.ref }}
+          SHA: ${{ github.sha }}
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+          YEAR_MONTH=$(date +%Y.%m)
+          if [[ "$REF" == "refs/heads/main" ]]; then
+            CHANNEL="stable"
+          else
+            CHANNEL="dev"
+          fi
+          SHORT_SHA=$(echo "$SHA" | cut -c1-7)
+
+          # Claim a unique version by pushing a git tag BEFORE building.
+          # Retry up to 5 times if another CI run took our N.
+          TAG_CLAIMED=false
+          for ATTEMPT in 1 2 3 4 5; do
+            git fetch --tags --force
+            # Use max(N) not count — safe even if tags are deleted
+            MAX_N=$(git tag -l "*-${YEAR_MONTH}.*" | sed 's/.*\.//' | sort -n | tail -1)
+            N=$(( ${MAX_N:-0} + 1 ))
+            VERSION="${YEAR_MONTH}.${N}"
+            TAG="${CHANNEL}-${VERSION}"
+
+            git tag -a "$TAG" -m "Release $TAG"
+            # stderr is left visible so a rejected push (permissions, network)
+            # can be told apart from a genuine tag collision in the job log.
+            if git push origin "$TAG"; then
+              echo "Claimed tag $TAG (attempt $ATTEMPT)"
+              TAG_CLAIMED=true
+              break
+            else
+              echo "Could not push tag $TAG (already claimed or push rejected), retrying... (attempt $ATTEMPT)"
+              git tag -d "$TAG"
+              sleep 2
+            fi
+          done
+
+          if [ "$TAG_CLAIMED" != "true" ]; then
+            echo "::error::Failed to claim a unique version tag after 5 attempts"
+            exit 1
+          fi
+
+          echo "channel=${CHANNEL}" >> "$GITHUB_OUTPUT"
+          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+          echo "versioned_tag=${TAG}" >> "$GITHUB_OUTPUT"
+          echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
+
+          echo "Channel: ${CHANNEL}"
+          echo "Version: ${VERSION}"
+          echo "Versioned tag: ${TAG}"
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v7
+        with:
+          push: true
+          # Baked into the image as AGNES_VERSION / RELEASE_CHANNEL.
+          build-args: |
+            AGNES_VERSION=${{ steps.meta.outputs.version }}
+            RELEASE_CHANNEL=${{ steps.meta.outputs.channel }}
+          tags: |
+            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.channel }}
+            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.versioned_tag }}
+            ghcr.io/${{ github.repository }}:sha-${{ steps.meta.outputs.short_sha }}
+
+  smoke-test:
+    # Boots the image that build-and-push just published and runs
+    # scripts/smoke-test.sh against it. Only runs for pushes to main.
+    needs: build-and-push
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Start Agnes from built image
+        run: |
+          # Create empty .env (docker-compose.yml requires env_file: .env, gitignored)
+          touch .env
+          # Use prod compose (GHCR images) + CI overlay (test secrets)
+          export AGNES_TAG="${{ needs.build-and-push.outputs.image_tag }}"
+          docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml up -d app
+          # Wait for healthy (max 60s)
+          # Polls /api/health every 3s; any status other than "unhealthy" counts as ready.
+          timeout 60 bash -c 'until curl -sf http://localhost:8000/api/health | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d[\"status\"]!=\"unhealthy\" else 1)"; do sleep 3; done'
+
+      - name: Run smoke tests
+        run: bash scripts/smoke-test.sh http://localhost:8000
+
+      # The next two steps run only when an earlier step failed, and publish
+      # the compose logs as a build artifact.
+      - name: Collect logs on failure
+        if: failure()
+        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml logs > smoke-test-logs.txt
+
+      - name: Upload logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: smoke-test-logs
+          path: smoke-test-logs.txt
+
+      # Always runs, so containers and volumes are removed even after a failure.
+      - name: Teardown
+        if: always()
+        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml down -v
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,33 @@
+# Changelog
+
+All notable changes to Agnes AI Data Analyst are documented in this file.
+
+Format: [CalVer](https://calver.org/) `YYYY.MM.N` with channels `stable` and `dev`.
+
+---
+
+## stable-2026.04.1 (unreleased)
+
+Multi-instance deployment and self-service setup.
+
+### Added
+- CalVer versioning with `stable` and `dev` release channels
+- `/api/health` now returns `version`, `channel`, and `schema_version`
+- Auto-generated JWT and session secrets with file persistence (`/data/state/.jwt_secret`)
+- Pre-migration snapshot of `system.duckdb` before schema upgrades
+- `POST /api/admin/configure` for headless data source configuration
+- `POST /api/admin/discover-and-register` combined table discovery and registration
+- `/setup` web wizard for first-time instance setup
+- `scripts/smoke-test.sh` for post-deploy verification
+- Smoke test job in CI (starts the freshly built Docker image and runs `scripts/smoke-test.sh` after every stable release from `main`)
+- OpenAPI snapshot test for breaking change detection
+- Custom connector mount support (`connectors/custom/`)
+- Startup banner logging version, channel, and schema version
+- Schema migration safety tests (idempotency, data preservation, snapshot)
+- `CHANGELOG.md` and release notes template
+
+### Breaking Changes
+None.
+
+### Migration Guide
+No action required. Existing instances upgrade seamlessly.
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -154,7 +154,7 @@ Auth providers in `app/auth/` (FastAPI-based):
 ## Key Implementation Details

 ### DuckDB Schema (src/db.py)
- Schema v2 with auto-migration from v1
+- Schema v3 with auto-migration from v1→v2→v3
 - `table_registry`: id, name, source_type, bucket, source_table, query_mode, sync_schedule, etc.
 - `sync_state`, `sync_history`: track extraction progress
 - `users`, `dataset_permissions`, `audit_log`: auth + RBAC
--- a/5
+++ b/5
@ -6,6 +6,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf
 # Install uv for fast dependency management
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv

+ARG AGNES_VERSION=dev
+ARG RELEASE_CHANNEL=dev
+ENV AGNES_VERSION=${AGNES_VERSION}
+ENV RELEASE_CHANNEL=${RELEASE_CHANNEL}
+
 WORKDIR /app

 # Copy application code
--- a/6
+++ b/6
@ -1,6 +1,6 @@
 # Agnes AI Data Analyst — Development Makefile

-.PHONY: help test lint dev docker
+.PHONY: help test lint dev docker update-openapi-snapshot

 help:
 	@echo "Available targets:"
@ -20,3 +20,7 @@ docker:

 lint:
 	@ruff check . 2>/dev/null || echo "ruff not installed: pip install ruff"
+
+# Regenerate the OpenAPI snapshot. Output goes to a temp file first so that a
+# failing generator cannot leave a truncated tests/snapshots/openapi.json.
+update-openapi-snapshot:
+	TESTING=1 python scripts/generate_openapi.py > tests/snapshots/openapi.json.tmp \
+		|| { rm -f tests/snapshots/openapi.json.tmp; exit 1; }
+	mv tests/snapshots/openapi.json.tmp tests/snapshots/openapi.json
+	@echo "Snapshot updated. Review diff and commit."
--- a/app/api/admin.py
+++ b/app/api/admin.py
@ -1,7 +1,9 @@
-"""Admin endpoints — table discovery, registry management."""
+"""Admin endpoints — table discovery, registry management, instance configuration."""

 import logging
+import os
 import uuid
+from pathlib import Path

 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
@ -42,6 +44,16 @@ class UpdateTableRequest(BaseModel):
    profile_after_sync: Optional[bool] = None


+class ConfigureRequest(BaseModel):
+    """Request body for POST /configure: data source and instance settings."""
+
+    data_source: str  # "keboola" | "bigquery" | "local"
+    # Both Keboola fields are required by configure_instance when data_source == "keboola".
+    keboola_token: Optional[str] = None
+    keboola_url: Optional[str] = None
+    # bigquery_project is required when data_source == "bigquery";
+    # configure_instance falls back to "us" when bigquery_location is omitted.
+    bigquery_project: Optional[str] = None
+    bigquery_location: Optional[str] = None
+    # Optional; written to instance.name and auth.allowed_domain in instance.yaml.
+    instance_name: Optional[str] = None
+    allowed_domain: Optional[str] = None
+
+
@router.get("/discover-tables")
 async def discover_tables(
    user: dict = Depends(require_role(Role.ADMIN)),
@ -53,10 +65,12 @@ async def discover_tables(

        if source_type == "keboola":
            from connectors.keboola.client import KeboolaClient
-            import os
            from app.instance_config import get_value
-            url = get_value("keboola", "url", default="")
-            token = os.environ.get(get_value("keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN"), "")
+            url = get_value("data_source", "keboola", "stack_url", default="")
+            token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
+            token = os.environ.get(token_env, "") if token_env else ""
+            if not token:
+                token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}
@ -144,3 +158,208 @@ async def unregister_table(
    if not repo.get(table_id):
        raise HTTPException(status_code=404, detail="Table not found")
    repo.unregister(table_id)
+
+
+def _read_yaml_mapping(path: Path) -> dict:
+    """Return the mapping stored in the YAML file at ``path``.
+
+    Returns an empty dict when the file is missing, cannot be parsed, or its
+    top-level value is not a mapping, so callers can always merge into it.
+    """
+    import yaml
+
+    if not path.exists():
+        return {}
+    try:
+        loaded = yaml.safe_load(path.read_text())
+    except Exception as e:
+        logger.warning("Ignoring unreadable config %s: %s", path, e)
+        return {}
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _config_section(config: dict, key: str) -> dict:
+    """Return ``config[key]`` as a dict, replacing a missing or non-dict value."""
+    section = config.get(key)
+    if not isinstance(section, dict):
+        section = {}
+        config[key] = section
+    return section
+
+
+def _write_private_file(path: Path, content: str) -> None:
+    """Write ``content`` to ``path``, creating the file with mode 0600.
+
+    The mode is applied at creation time so the content is never on disk with
+    wider default permissions; the chmod afterwards tightens files that
+    already existed.
+    """
+    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as fh:
+        fh.write(content)
+    try:
+        path.chmod(0o600)
+    except OSError:
+        pass  # chmod not supported on all platforms
+
+
+@router.post("/configure")
+async def configure_instance(
+    request: ConfigureRequest,
+    user: dict = Depends(require_role(Role.ADMIN)),
+):
+    """Configure data source and instance settings via API.
+
+    Writes config to DATA_DIR/state/instance.yaml and persists secrets to
+    DATA_DIR/state/.env_overlay. AI agents and the /setup wizard use this
+    instead of manual file editing.
+
+    Raises:
+        HTTPException(400): unknown ``data_source``, missing required fields,
+            a Keboola value containing a line break, or a failed Keboola
+            connection test.
+
+    Returns:
+        dict with ``status``, ``data_source`` and ``connection``.
+    """
+    import yaml
+
+    if request.data_source not in ("keboola", "bigquery", "local"):
+        raise HTTPException(status_code=400, detail="data_source must be 'keboola', 'bigquery', or 'local'")
+
+    # Values are persisted one per line in .env_overlay, which is read back
+    # into os.environ at startup; a line break inside a value would define
+    # additional variables there.
+    for field_name in ("keboola_token", "keboola_url"):
+        value = getattr(request, field_name)
+        if value and value.splitlines() != [value]:
+            raise HTTPException(status_code=400, detail=f"{field_name} must not contain line breaks")
+
+    # Validate credentials if provided
+    if request.data_source == "keboola":
+        if not request.keboola_token or not request.keboola_url:
+            raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source")
+        try:
+            from connectors.keboola.client import KeboolaClient
+            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
+            client.test_connection()
+        except Exception as e:
+            # Details go to the server log only; the client gets a generic message.
+            logger.error("Keboola connection validation failed: %s", e)
+            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.") from e
+
+    elif request.data_source == "bigquery":
+        if not request.bigquery_project:
+            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")
+
+    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
+    # NOT to CONFIG_DIR which is mounted read-only in Docker.
+    state_dir = Path(os.environ.get("DATA_DIR", "./data")) / "state"
+    config_path = state_dir / "instance.yaml"
+
+    # Load existing API-generated config, or fall back to read-only CONFIG_DIR config
+    if config_path.exists():
+        existing = _read_yaml_mapping(config_path)
+    else:
+        existing = _read_yaml_mapping(Path(os.environ.get("CONFIG_DIR", "./config")) / "instance.yaml")
+
+    # Merge instance settings
+    if request.instance_name:
+        _config_section(existing, "instance")["name"] = request.instance_name
+
+    if request.allowed_domain:
+        _config_section(existing, "auth")["allowed_domain"] = request.allowed_domain
+
+    # Merge data source config (secrets as env var references)
+    existing["data_source"] = {"type": request.data_source}
+    if request.data_source == "keboola":
+        existing["data_source"]["keboola"] = {
+            "stack_url": request.keboola_url,
+            "token_env": "KEBOOLA_STORAGE_TOKEN",
+        }
+    elif request.data_source == "bigquery":
+        existing["data_source"]["bigquery"] = {
+            "project": request.bigquery_project,
+            "location": request.bigquery_location or "us",
+        }
+
+    # Write to writable data volume
+    state_dir.mkdir(parents=True, exist_ok=True)
+    config_path.write_text(yaml.dump(existing, default_flow_style=False, sort_keys=False))
+    logger.info("Wrote instance config to %s", config_path)
+
+    # Persist secrets to .env_overlay (in data volume, never in git)
+    secrets_to_persist = {}
+    if request.keboola_token:
+        secrets_to_persist["KEBOOLA_STORAGE_TOKEN"] = request.keboola_token
+    if request.keboola_url:
+        secrets_to_persist["KEBOOLA_STACK_URL"] = request.keboola_url
+
+    if secrets_to_persist:
+        overlay_path = state_dir / ".env_overlay"
+
+        # Merge with existing overlay
+        existing_overlay = {}
+        if overlay_path.exists():
+            for line in overlay_path.read_text().splitlines():
+                if "=" in line and not line.startswith("#"):
+                    k, v = line.split("=", 1)
+                    existing_overlay[k.strip()] = v.strip()
+        existing_overlay.update(secrets_to_persist)
+
+        _write_private_file(
+            overlay_path,
+            "\n".join(f"{k}={v}" for k, v in existing_overlay.items()) + "\n",
+        )
+        logger.info("Persisted %d secrets to .env_overlay", len(secrets_to_persist))
+
+        # Inject into current process environment
+        for k, v in secrets_to_persist.items():
+            os.environ[k] = v
+
+    # Invalidate cached instance config so next read picks up changes
+    import app.instance_config as ic
+    ic._instance_config = None
+
+    return {
+        "status": "ok",
+        "data_source": request.data_source,
+        "connection": "verified" if request.data_source != "local" else "local",
+    }
+
+
+def _discover_and_register_tables(conn: duckdb.DuckDBPyConnection, user_email: str) -> dict:
+    """Discover tables from the configured source and add new ones to the registry.
+
+    Shared by the /discover-and-register endpoint and the sync job. Only the
+    "keboola" source type is handled; any other type returns zero counts.
+
+    Args:
+        conn: connection handed to TableRegistryRepository.
+        user_email: value recorded as ``registered_by`` on new entries.
+
+    Returns:
+        dict with ``registered``, ``skipped`` and ``errors`` counts, the list
+        of newly registered table ids under ``tables``, and ``source``.
+    """
+    from app.instance_config import get_data_source_type, get_value
+
+    source_type = get_data_source_type()
+    if source_type != "keboola":
+        return {"registered": 0, "skipped": 0, "errors": 0, "tables": [], "source": source_type}
+
+    from connectors.keboola.client import KeboolaClient
+
+    # Same keys that /api/admin/configure writes under data_source.keboola.
+    stack_url = get_value("data_source", "keboola", "stack_url", default="")
+    env_name = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
+    storage_token = os.environ.get(env_name, "") if env_name else ""
+    if not storage_token:
+        # Configured variable unset or empty: fall back to the default name.
+        storage_token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
+
+    keboola = KeboolaClient(token=storage_token, url=stack_url)
+    found = keboola.discover_all_tables()
+
+    registry = TableRegistryRepository(conn)
+    counts = {"registered": 0, "skipped": 0, "errors": 0}
+    new_ids = []
+
+    for entry in found:
+        raw_id = entry.get("id", "")
+        # Registry id: lower-cased source id with dots and spaces as underscores.
+        table_id = raw_id.strip().lower().replace(".", "_").replace(" ", "_")
+        if not table_id:
+            counts["errors"] += 1
+            continue
+
+        if registry.get(table_id):
+            counts["skipped"] += 1
+            continue
+
+        try:
+            # Source ids look like in.c-bucket.table_name
+            segments = raw_id.split(".")
+            bucket = segments[1] if len(segments) > 1 else ""
+            if len(segments) > 2:
+                source_table = segments[2]
+            else:
+                source_table = entry.get("name", "")
+
+            registry.register(
+                id=table_id,
+                name=entry.get("name", table_id),
+                source_type="keboola",
+                bucket=bucket,
+                source_table=source_table,
+                query_mode="local",
+                registered_by=user_email,
+                description=f"Auto-discovered from Keboola: {raw_id}",
+            )
+        except Exception as e:
+            logger.warning("Failed to register %s: %s", table_id, e)
+            counts["errors"] += 1
+        else:
+            counts["registered"] += 1
+            new_ids.append(table_id)
+
+    return {
+        "registered": counts["registered"],
+        "skipped": counts["skipped"],
+        "errors": counts["errors"],
+        "tables": new_ids,
+        "source": "keboola",
+    }
+
+
+@router.post("/discover-and-register")
+async def discover_and_register(
+    user: dict = Depends(require_role(Role.ADMIN)),
+    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
+):
+    """Discover tables from configured source and auto-register them.
+
+    Combines discover-tables + register-table into one call.
+    Skips already-registered tables. Used by /setup wizard and AI agents.
+
+    Raises:
+        HTTPException(500): discovery or registration raised an unexpected
+            error. An HTTPException raised further down is passed through
+            unchanged.
+    """
+    try:
+        return _discover_and_register_tables(conn, user.get("email", "admin"))
+    except HTTPException:
+        # Keep the status code chosen by the callee instead of rewrapping as 500.
+        raise
+    except Exception as e:
+        # Record the traceback server-side; the response carries only the message.
+        logger.exception("Discovery and registration failed")
+        raise HTTPException(status_code=500, detail=f"Discovery and registration failed: {e}") from e
--- a/app/api/health.py
+++ b/app/api/health.py
@ -1,11 +1,13 @@
 """Health check endpoint — structured diagnostics for AI agents."""

+import os
 from datetime import datetime, timezone

 from fastapi import APIRouter, Depends
 import duckdb

 from app.auth.dependencies import _get_db
+from src.db import SCHEMA_VERSION
 from src.repositories.sync_state import SyncStateRepository

 router = APIRouter(tags=["health"])
@ -69,6 +71,9 @@ async def health_check(conn: duckdb.DuckDBPyConnection = Depends(_get_db)):

    return {
        "status": overall,
+        "version": os.environ.get("AGNES_VERSION", "dev"),
+        "channel": os.environ.get("RELEASE_CHANNEL", "dev"),
+        "schema_version": SCHEMA_VERSION,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "services": checks,
    }
--- a/app/api/sync.py
+++ b/app/api/sync.py
@ -64,8 +64,29 @@ def _run_sync(tables: Optional[List[str]] = None):
            sys_conn.close()

        if not table_configs:
-            logger.warning("No tables to sync for source_type=%s", source_type)
-            return
+            # Auto-discover tables on first sync when registry is empty
+            if source_type == "keboola" and os.environ.get("KEBOOLA_STORAGE_TOKEN"):
+                logger.info("No tables registered — running auto-discovery from Keboola")
+                try:
+                    from app.api.admin import _discover_and_register_tables
+                    auto_conn = get_system_db()
+                    try:
+                        result = _discover_and_register_tables(auto_conn, "auto-discovery")
+                        logger.info("Auto-discovered %d tables, skipped %d", result["registered"], result["skipped"])
+                    finally:
+                        auto_conn.close()
+                    # Re-read table configs after auto-registration
+                    sys_conn2 = get_system_db()
+                    try:
+                        table_configs = TableRegistryRepository(sys_conn2).list_local(source_type)
+                    finally:
+                        sys_conn2.close()
+                except Exception as e:
+                    logger.warning("Auto-discovery failed: %s", e)
+
+            if not table_configs:
+                logger.warning("No tables to sync for source_type=%s", source_type)
+                return

        # Serialize configs — strip non-serializable fields
        serializable = []
@ -113,6 +134,29 @@ print(json.dumps(result))
        else:
            print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)

+        # Run custom connectors (Tier A: local mount)
+        connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
+        if connectors_dir.exists():
+            for connector_dir in sorted(connectors_dir.iterdir()):
+                if not connector_dir.is_dir():
+                    continue
+                extractor = connector_dir / "extractor.py"
+                if not extractor.exists():
+                    continue
+                logger.info("Running custom connector: %s", connector_dir.name)
+                try:
+                    custom_result = subprocess.run(
+                        [sys.executable, str(extractor)],
+                        env=env, capture_output=True, text=True, timeout=600,
+                        cwd=str(Path(__file__).parent.parent.parent),
+                    )
+                    if custom_result.returncode != 0:
+                        logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
+                    else:
+                        logger.info("Custom connector %s completed", connector_dir.name)
+                except subprocess.TimeoutExpired:
+                    logger.error("Custom connector %s timed out", connector_dir.name)
+
        # Rebuild master views (reads extract.duckdb files, no write conflict)
        from src.orchestrator import SyncOrchestrator
        orch = SyncOrchestrator()
--- a/app/auth/jwt.py
+++ b/app/auth/jwt.py
@ -7,22 +7,22 @@ from typing import Optional

 import jwt

-SECRET_KEY = os.environ.get("JWT_SECRET_KEY", "")
-
-if not SECRET_KEY:
+def _get_secret_key() -> str:
+    """Load JWT secret - from env, file, or auto-generated."""
    if os.environ.get("TESTING", "").lower() in ("1", "true"):
-        SECRET_KEY = "test-jwt-secret-key-minimum-32-chars!!"
-    else:
-        raise RuntimeError(
-            "JWT_SECRET_KEY environment variable is required. "
-            "Generate one: python -c \"import secrets; print(secrets.token_hex(32))\""
+        return os.environ.get("JWT_SECRET_KEY", "test-jwt-secret-key-minimum-32-chars!!")
+    from app.secrets import get_jwt_secret
+    key = get_jwt_secret()
+    if len(key) < 32:
+        import warnings as _warnings
+        _warnings.warn(
+            f"JWT_SECRET_KEY is {len(key)} chars — minimum 32 recommended",
+            UserWarning, stacklevel=2,
        )
-elif len(SECRET_KEY) < 32 and os.environ.get("TESTING", "").lower() not in ("1", "true"):
-    import warnings as _warnings
-    _warnings.warn(
-        f"JWT_SECRET_KEY is {len(SECRET_KEY)} chars — minimum 32 recommended",
-        UserWarning, stacklevel=2,
-    )
+    return key
+
+
+SECRET_KEY = _get_secret_key()

 ALGORITHM = "HS256"
 ACCESS_TOKEN_EXPIRE_HOURS = 24  # 24 hours
--- a/app/instance_config.py
+++ b/app/instance_config.py
@ -11,15 +11,34 @@ _instance_config: Optional[dict] = None


 def load_instance_config() -> dict:
-    """Load instance.yaml using the existing config loader."""
+    """Load instance.yaml — checks API-generated config first, then static config.
+
+    Search order:
+    1. DATA_DIR/state/instance.yaml (written by /api/admin/configure, writable)
+    2. CONFIG_DIR/instance.yaml (static, read-only in Docker)
+    3. Empty dict with defaults (if neither exists)
+    """
    global _instance_config
    if _instance_config is not None:
        return _instance_config

+    # First, try API-generated config in writable data volume
+    import yaml
+    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
+    api_config_path = data_dir / "state" / "instance.yaml"
+    if api_config_path.exists():
+        try:
+            _instance_config = yaml.safe_load(api_config_path.read_text()) or {}
+            logger.info("Loaded instance.yaml from %s", api_config_path)
+            return _instance_config
+        except Exception as e:
+            logger.warning(f"Could not load API-generated instance.yaml: {e}")
+
+    # Fall back to static config (may have strict validation)
    try:
-        from config.loader import load_instance_config as _load, get_instance_value
+        from config.loader import load_instance_config as _load
        _instance_config = _load()
-        logger.info("Loaded instance.yaml")
+        logger.info("Loaded instance.yaml from config/")
    except Exception as e:
        logger.warning(f"Could not load instance.yaml: {e}. Using defaults.")
        _instance_config = {}
--- a/app/main.py
+++ b/app/main.py
@ -48,8 +48,8 @@ def create_app() -> FastAPI:
    )

    # Session middleware (required for OAuth state)
-    import secrets as _secrets
-    session_secret = os.environ.get("SESSION_SECRET", os.environ.get("JWT_SECRET_KEY", _secrets.token_hex(32)))
+    from app.secrets import get_session_secret
+    session_secret = get_session_secret()
    app.add_middleware(SessionMiddleware, secret_key=session_secret)

    # CORS for CLI and external clients
@ -62,6 +62,14 @@ def create_app() -> FastAPI:
        allow_headers=["*"],
    )

+    # Load .env_overlay (persisted by /api/admin/configure)
+    _overlay = Path(os.environ.get("DATA_DIR", "./data")) / "state" / ".env_overlay"
+    if _overlay.exists():
+        for line in _overlay.read_text().splitlines():
+            if "=" in line and not line.startswith("#"):
+                k, v = line.split("=", 1)
+                os.environ.setdefault(k.strip(), v.strip())
+
    # Load instance config on startup
    try:
        from app.instance_config import load_instance_config
@ -70,6 +78,15 @@ def create_app() -> FastAPI:
    except Exception as e:
        logger.warning(f"Could not load instance config: {e}")

+    # Startup banner
+    from src.db import SCHEMA_VERSION
+    logger.info(
+        "Agnes %s | channel: %s | schema v%s",
+        os.environ.get("AGNES_VERSION", "dev"),
+        os.environ.get("RELEASE_CHANNEL", "dev"),
+        SCHEMA_VERSION,
+    )
+
    # Seed admin user for testing/CI (when SEED_ADMIN_EMAIL is set)
    seed_email = os.environ.get("SEED_ADMIN_EMAIL")
    if seed_email:
--- a/app/secrets.py
+++ b/app/secrets.py
@ -0,0 +1,43 @@
+"""Auto-generate and persist secrets that survive container restarts."""
+import logging
+import os
+import secrets
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def _load_or_generate(env_var: str, file_name: str) -> str:
+    """Load secret from env var, or from file, or generate and persist.
+
+    Lookup order:
+    1. the ``env_var`` environment variable, if non-empty;
+    2. ``DATA_DIR/state/<file_name>``, if it exists and is non-empty;
+    3. a new 64-hex-character random value, written to that file.
+
+    Args:
+        env_var: name of the environment variable that overrides the file.
+        file_name: file name under ``DATA_DIR/state`` used for persistence.
+
+    Returns:
+        The secret value.
+    """
+    val = os.environ.get(env_var, "")
+    if val:
+        return val
+    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
+    secret_path = data_dir / "state" / file_name
+    if secret_path.exists():
+        val = secret_path.read_text().strip()
+        if val:
+            return val
+        logger.warning("Secret file %s is empty, regenerating", secret_path)
+    secret_path.parent.mkdir(parents=True, exist_ok=True)
+    val = secrets.token_hex(32)
+    # Create the file with mode 0600 up front so the secret is never on disk
+    # with wider default permissions, not even briefly.
+    fd = os.open(secret_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as fh:
+        fh.write(val)
+    try:
+        # Tightens a pre-existing (empty) file, whose mode os.open leaves as is.
+        secret_path.chmod(0o600)
+    except OSError:
+        pass  # chmod not supported on all platforms (e.g., Windows)
+    logger.info(
+        "Auto-generated %s -> %s (set %s in .env to use a fixed value)",
+        file_name, secret_path, env_var,
+    )
+    return val
+
+
+def get_jwt_secret() -> str:
+    """Return the JWT signing secret.
+
+    Taken from JWT_SECRET_KEY when set, otherwise from the persisted
+    ``.jwt_secret`` file, which is generated on first use.
+    """
+    jwt_secret = _load_or_generate(env_var="JWT_SECRET_KEY", file_name=".jwt_secret")
+    return jwt_secret
+
+
+def get_session_secret() -> str:
+    """Return the session secret.
+
+    Taken from SESSION_SECRET when set, otherwise from the persisted
+    ``.session_secret`` file, which is generated on first use.
+    """
+    session_secret = _load_or_generate(env_var="SESSION_SECRET", file_name=".session_secret")
+    return session_secret
--- a/app/web/router.py
+++ b/app/web/router.py
@ -120,6 +120,7 @@ _URL_MAP = {
    "email_auth.login_email_form": "/login/email",
    "email_auth.send_magic_link": "/auth/email/send-link",
    "register": "/auth/password/setup",
+    "setup": "/setup",
 }


@ -177,6 +178,18 @@ async def index(request: Request, user: Optional[dict] = Depends(get_optional_us
    return RedirectResponse(url="/login", status_code=302)


+@router.get("/setup", response_class=HTMLResponse)
+async def setup_wizard(request: Request, conn: duckdb.DuckDBPyConnection = Depends(_get_db)):
+    """Serve the first-time setup wizard.
+
+    Redirects to /login as soon as the users table holds at least one row.
+    If the count query fails, the wizard is rendered anyway.
+    """
+    has_users = False
+    try:
+        row = conn.execute("SELECT COUNT(*) FROM users").fetchone()
+        has_users = row[0] > 0
+    except Exception:
+        # No users table yet — show setup
+        pass
+    if has_users:
+        return RedirectResponse(url="/login", status_code=302)
+    return templates.TemplateResponse(request, "setup.html", _build_context(request))
+
+
@router.get("/login", response_class=HTMLResponse)
 async def login_page(request: Request):
    providers = []
--- a/app/web/templates/setup.html
+++ b/app/web/templates/setup.html
@ -0,0 +1,267 @@
+{% extends "base_login.html" %}
+
+{% block title %}Setup - Agnes AI Data Analyst{% endblock %}
+
+{% block content %}
+<div class="login-page">
+    <div class="login-card-wrapper" style="max-width: 520px; margin: 40px auto; padding: 0 20px;">
+        <div class="login-card" style="max-width: 520px;">
+            <h2 id="wizard-title">Setup Agnes</h2>
+            <p class="login-description" id="wizard-description">
+                Create your admin account to get started.
+            </p>
+
+            <!-- Progress -->
+            <div style="display: flex; gap: 8px; margin-bottom: 24px;">
+                <div id="step-dot-1" style="flex: 1; height: 4px; border-radius: 2px; background: var(--primary, #2563eb);"></div>
+                <div id="step-dot-2" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
+                <div id="step-dot-3" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
+                <div id="step-dot-4" style="flex: 1; height: 4px; border-radius: 2px; background: #e5e7eb;"></div>
+            </div>
+
+            <!-- Status message -->
+            <div id="status-msg" style="display: none; padding: 10px 14px; border-radius: 6px; margin-bottom: 16px; font-size: 14px;"></div>
+
+            <!-- Step 1: Create Admin -->
+            <div id="step-1">
+                <form id="admin-form" onsubmit="return createAdmin(event)">
+                    <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Email</label>
+                    <input type="email" id="admin-email" required placeholder="admin@company.com"
+                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
+
+                    <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Password</label>
+                    <input type="password" id="admin-password" required minlength="8" placeholder="Min. 8 characters"
+                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
+
+                    <button type="submit" class="btn btn-primary" style="width: 100%;" id="btn-admin">
+                        Create Admin Account
+                    </button>
+                </form>
+            </div>
+
+            <!-- Step 2: Data Source -->
+            <div id="step-2" style="display: none;">
+                <form id="source-form" onsubmit="return configureSource(event)">
+                    <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Data Source</label>
+                    <select id="data-source" onchange="toggleSourceFields()"
+                        style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
+                        <option value="keboola">Keboola</option>
+                        <option value="bigquery">BigQuery</option>
+                        <option value="local">Local / CSV</option>
+                    </select>
+
+                    <div id="keboola-fields">
+                        <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Keboola URL</label>
+                        <input type="url" id="keboola-url" placeholder="https://connection.keboola.com"
+                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
+
+                        <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Storage API Token</label>
+                        <input type="password" id="keboola-token" placeholder="Your Keboola storage token"
+                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
+                    </div>
+
+                    <div id="bigquery-fields" style="display: none;">
+                        <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">GCP Project</label>
+                        <input type="text" id="bq-project" placeholder="my-gcp-project"
+                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 12px; font-size: 14px; box-sizing: border-box;">
+
+                        <label style="display: block; margin-bottom: 4px; font-size: 14px; font-weight: 500;">Location</label>
+                        <input type="text" id="bq-location" value="us" placeholder="us"
+                            style="width: 100%; padding: 10px 12px; border: 1px solid #d1d5db; border-radius: 6px; margin-bottom: 16px; font-size: 14px; box-sizing: border-box;">
+                    </div>
+
+                    <button type="submit" class="btn btn-primary" style="width: 100%;" id="btn-source">
+                        Configure Data Source
+                    </button>
+                    <button type="button" onclick="skipToStep(4)" class="btn btn-secondary" style="width: 100%; margin-top: 8px;" id="btn-skip-source">
+                        Skip (configure later)
+                    </button>
+                </form>
+            </div>
+
+            <!-- Step 3: Discover Tables -->
+            <div id="step-3" style="display: none;">
+                <p style="font-size: 14px; color: #6b7280; margin-bottom: 16px;">
+                    Discover and register tables from your data source.
+                </p>
+                <button onclick="discoverTables()" class="btn btn-primary" style="width: 100%;" id="btn-discover">
+                    Discover Tables
+                </button>
+                <div id="discover-result" style="display: none; margin-top: 12px; padding: 12px; background: #f0fdf4; border-radius: 6px; font-size: 14px;"></div>
+                <button onclick="goToStep(4)" class="btn btn-primary" style="width: 100%; margin-top: 12px; display: none;" id="btn-next-sync">
+                    Continue
+                </button>
+            </div>
+
+            <!-- Step 4: First Sync & Done -->
+            <div id="step-4" style="display: none;">
+                <p style="font-size: 14px; color: #6b7280; margin-bottom: 16px;">
+                    Start the first data sync and go to your dashboard.
+                </p>
+                <button onclick="triggerSync()" class="btn btn-primary" style="width: 100%;" id="btn-sync">
+                    Start First Sync
+                </button>
+                <a href="/dashboard" class="btn btn-primary" style="width: 100%; margin-top: 12px; display: none; text-align: center; text-decoration: none;" id="btn-dashboard">
+                    Go to Dashboard
+                </a>
+            </div>
+        </div>
+    </div>
+</div>
+
+<script>
+// Bearer token obtained from /auth/bootstrap; sent as the Authorization header on later requests.
+let token = '';
+// Card title and description for each wizard step, keyed by step number (read by goToStep).
+const steps = {
+    1: { title: 'Setup Agnes', desc: 'Create your admin account to get started.' },
+    2: { title: 'Data Source', desc: 'Connect to your data source.' },
+    3: { title: 'Discover Tables', desc: 'Find and register tables from your data source.' },
+    4: { title: 'Almost Done', desc: 'Start syncing data and open your dashboard.' },
+};
+
+function showStatus(msg, type) {
+    const el = document.getElementById('status-msg');
+    el.textContent = msg;
+    el.style.display = 'block';
+    el.style.background = type === 'error' ? '#fef2f2' : '#f0fdf4';
+    el.style.color = type === 'error' ? '#dc2626' : '#16a34a';
+}
+
+function hideStatus() {
+    document.getElementById('status-msg').style.display = 'none';
+}
+
+function goToStep(n) {
+    hideStatus();
+    for (let i = 1; i <= 4; i++) {
+        document.getElementById('step-' + i).style.display = i === n ? 'block' : 'none';
+        document.getElementById('step-dot-' + i).style.background = i <= n ? 'var(--primary, #2563eb)' : '#e5e7eb';
+    }
+    document.getElementById('wizard-title').textContent = steps[n].title;
+    document.getElementById('wizard-description').textContent = steps[n].desc;
+}
+
+function skipToStep(n) {
+    goToStep(n);
+}
+
+function toggleSourceFields() {
+    const src = document.getElementById('data-source').value;
+    document.getElementById('keboola-fields').style.display = src === 'keboola' ? 'block' : 'none';
+    document.getElementById('bigquery-fields').style.display = src === 'bigquery' ? 'block' : 'none';
+}
+
+async function apiCall(url, body) {
+    const headers = { 'Content-Type': 'application/json' };
+    if (token) headers['Authorization'] = 'Bearer ' + token;
+    const resp = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
+    if (resp.status === 401) {
+        token = '';
+        sessionStorage.removeItem('setup_token');
+        showStatus('Session expired. Please refresh the page and start over.', 'error');
+        throw new Error('Session expired');
+    }
+    const data = await resp.json();
+    if (!resp.ok) throw new Error(data.detail || 'Request failed');
+    return data;
+}
+
+async function createAdmin(e) {
+    e.preventDefault();
+    const btn = document.getElementById('btn-admin');
+    btn.disabled = true;
+    btn.textContent = 'Creating...';
+    try {
+        const data = await apiCall('/auth/bootstrap', {
+            email: document.getElementById('admin-email').value,
+            password: document.getElementById('admin-password').value,
+        });
+        token = data.access_token;
+        sessionStorage.setItem('setup_token', token);
+        goToStep(2);
+    } catch (err) {
+        showStatus(err.message, 'error');
+    } finally {
+        btn.disabled = false;
+        btn.textContent = 'Create Admin Account';
+    }
+    return false;
+}
+
+async function configureSource(e) {
+    e.preventDefault();
+    const btn = document.getElementById('btn-source');
+    btn.disabled = true;
+    btn.textContent = 'Verifying...';
+    try {
+        const src = document.getElementById('data-source').value;
+        const body = { data_source: src };
+        if (src === 'keboola') {
+            body.keboola_url = document.getElementById('keboola-url').value;
+            body.keboola_token = document.getElementById('keboola-token').value;
+        } else if (src === 'bigquery') {
+            body.bigquery_project = document.getElementById('bq-project').value;
+            body.bigquery_location = document.getElementById('bq-location').value;
+        }
+        await apiCall('/api/admin/configure', body);
+        showStatus('Connection verified!', 'success');
+        if (src === 'local') {
+            goToStep(4);
+        } else {
+            goToStep(3);
+        }
+    } catch (err) {
+        showStatus(err.message, 'error');
+    } finally {
+        btn.disabled = false;
+        btn.textContent = 'Configure Data Source';
+    }
+    return false;
+}
+
+async function discoverTables() {
+    const btn = document.getElementById('btn-discover');
+    btn.disabled = true;
+    btn.textContent = 'Discovering...';
+    try {
+        const headers = { 'Content-Type': 'application/json' };
+        if (token) headers['Authorization'] = 'Bearer ' + token;
+        const resp = await fetch('/api/admin/discover-and-register', { method: 'POST', headers });
+        const data = await resp.json();
+        if (!resp.ok) throw new Error(data.detail || 'Discovery failed');
+
+        const el = document.getElementById('discover-result');
+        el.style.display = 'block';
+        el.textContent = `Registered ${data.registered} tables, skipped ${data.skipped}.`;
+        document.getElementById('btn-next-sync').style.display = 'block';
+        btn.style.display = 'none';
+    } catch (err) {
+        showStatus(err.message, 'error');
+    } finally {
+        btn.disabled = false;
+        btn.textContent = 'Discover Tables';
+    }
+}
+
+async function triggerSync() {
+    const btn = document.getElementById('btn-sync');
+    btn.disabled = true;
+    btn.textContent = 'Starting sync...';
+    try {
+        const headers = {};
+        if (token) headers['Authorization'] = 'Bearer ' + token;
+        await fetch('/api/sync/trigger', { method: 'POST', headers });
+        btn.style.display = 'none';
+        document.getElementById('btn-dashboard').style.display = 'block';
+        showStatus('Sync started! You can now go to your dashboard.', 'success');
+    } catch (err) {
+        showStatus(err.message, 'error');
+        btn.disabled = false;
+        btn.textContent = 'Start First Sync';
+    }
+}
+
+// Restore token from sessionStorage (in case of page reload)
+const savedToken = sessionStorage.getItem('setup_token');
+if (savedToken) token = savedToken;
+</script>
+{% endblock %}
--- a/docker-compose.ci.yml
+++ b/docker-compose.ci.yml
@ -0,0 +1,11 @@
+# CI smoke test overlay — minimal config for testing in GitHub Actions.
+# Usage: docker compose -f docker-compose.yml -f docker-compose.ci.yml up -d
+services:
+  app:
+    environment:
+      - JWT_SECRET_KEY=smoke-test-ci-key-minimum-32-chars-xx
+      - SESSION_SECRET=smoke-test-session-key-32-chars-min-x
+      - DATA_DIR=/data
+      - TESTING=0
+    ports:
+      - "8000:8000"
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@ -1,17 +1,18 @@
 # Production override — uses pre-built GHCR image instead of local build.
 # Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
+# Override tag: AGNES_TAG=stable-2026.04.3 docker compose -f ... up -d
 services:
  app:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  scheduler:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  extract:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  telegram-bot:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  ws-gateway:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  corporate-memory:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  session-collector:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -7,6 +7,7 @@ services:
    volumes:
      - data:/data
      - ./config:/app/config:ro
+      # - ./custom-connectors:/app/connectors/custom:ro  # Tier A: AI-generated connectors
    env_file: .env
    environment:
      - DATA_DIR=/data
--- a/docs/RELEASE_TEMPLATE.md
+++ b/docs/RELEASE_TEMPLATE.md
@ -0,0 +1,37 @@
+# Release Notes Template
+
+Use this template when adding a new entry to `CHANGELOG.md`.
+
+---
+
+## stable-YYYY.MM.N
+
+**Image:** `ghcr.io/keboola/agnes-the-ai-analyst:stable-YYYY.MM.N`
+**Digest:** `sha256:...` (from `docker inspect --format='{{index .RepoDigests 0}}'`)
+**Date:** YYYY-MM-DD
+
+### Added
+- Feature description
+
+### Changed
+- Change description
+
+### Fixed
+- Bug fix description
+
+### Breaking Changes
+- Description of breaking change
+- **Migration guide:** Steps to upgrade from previous version
+
+### Deprecated
+- Description of deprecated feature (will be removed in YYYY.MM.N)
+
+---
+
+## Guidelines
+
+- Every merge to `main` creates a new `stable-YYYY.MM.N` release
+- Include the image digest for verification with `cosign verify`
+- Breaking changes require `BREAKING:` prefix in commit message
+- Migration guides must include exact commands or config changes
+- If a release deprecates the previous stable, note it explicitly
--- a/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md
+++ b/docs/superpowers/specs/2026-04-09-multi-instance-deployment-design.md
@ -0,0 +1,527 @@
+# Multi-Instance Deployment & Versioning — Design Spec
+
+## Goal
+
+Make Agnes deployable to 20+ independent customer instances via self-service, with safe versioning that prevents one customer's PR from breaking another's deployment.
+
+## Context
+
+Agnes is an open-source AI Data Analyst platform. Customers (or their AI agents) deploy it as a Docker image on their own infrastructure. Each instance connects to different data sources (Keboola, BigQuery, Jira, custom).
+
+**Key constraints:**
+- Customers range from semi-technical to non-technical, assisted by AI agents
+- Cloud-agnostic (GCP, AWS, Azure, on-prem, VPS)
+- One repo, one Docker image, many instances
+- Community PRs must not break existing customers
+- AI agent is the primary "installer" and "developer"
+
+---
+
+## 1. Versioning & Release Channels
+
+### CalVer: `YYYY.MM.N`
+
+Format: year.month.sequential-number. Example: `2026.04.1`, `2026.04.2`, `2026.05.1`.
+
+No manual release decisions. Every merge to main is a release.
+
+### Three channels
+
+| Channel | Floating tag | Versioned tag | Source | Who uses it |
+|---------|-------------|---------------|--------|-------------|
+| **dev** | `:dev` | `:dev-2026.04.N` | Every CI-passing push on any feature branch | Developers, PR testing |
+| **stable** | `:stable` | `:stable-2026.04.N` | Every merge to main + CI pass | All production customers |
+| **deprecated** | — | `:deprecated-2026.04.N` | Previous stable after breaking change or failed smoke test | Grace period (30 days) |
+
+Every image also gets a `:sha-abc1234` tag for exact commit traceability.
+
+### Tag lifecycle
+
+```
+feature branch push → CI ✅ → :dev + :dev-2026.04.N + :sha-abc1234
+                         ❌ → nothing pushed
+
+merge to main       → CI ✅ → :stable + :stable-2026.04.N + :sha-abc1234
+                         ❌ → merge blocked (CI required)
+                                │
+                                ▼
+                         smoke test on canary VM
+                                │
+                         ✅ → :stable confirmed
+                         ❌ → alert, rollback canary to previous :stable
+                              broken build tagged :deprecated-2026.04.N
+```
+
+### Version numbering
+
+CalVer `YYYY.MM.N` where N is a single auto-incrementing counter per month, shared by the dev and stable channels.
+
+Example timeline:
+```
+Apr 8  feature/foo push     → :dev-2026.04.1
+Apr 8  feature/bar push     → :dev-2026.04.2
+Apr 8  merge foo to main    → :stable-2026.04.3
+Apr 9  feature/baz push     → :dev-2026.04.4
+Apr 9  merge bar to main    → :stable-2026.04.5
+```
+
+This avoids confusion — version `2026.04.3` exists only once, in one channel.
+
+### Customer pins version
+
+```yaml
+# docker-compose.prod.yml
+
+# Auto-update (recommended): always latest stable
+image: ghcr.io/keboola/agnes-the-ai-analyst:stable
+
+# Pinned: specific stable release, manual update
+image: ghcr.io/keboola/agnes-the-ai-analyst:stable-2026.04.3
+
+# Testing: latest dev
+image: ghcr.io/keboola/agnes-the-ai-analyst:dev
+
+# Testing: specific dev build
+image: ghcr.io/keboola/agnes-the-ai-analyst:dev-2026.04.2
+```
+
+### Main = stable
+
+- `main` branch is always releasable
+- Every merge to main triggers a new stable release
+- Feature branches are the dev channel
+- No promotion pipeline, no manual approval for releases
+- Smoke test is a post-deploy safety net, not a gate
+
+---
+
+## 2. Breaking Change Detection
+
+### What is a breaking change
+
+- `_meta` table schema change (add/remove column)
+- `_remote_attach` table schema change
+- API endpoint removed or response field removed
+- DuckDB system schema migration that drops data
+- CLI command removed or argument renamed
+- `instance.yaml` required key added
+
+### Automated detection in CI
+
+Every PR runs:
+
+1. **Contract tests**: `_meta` and `_remote_attach` schema validation against frozen spec
+2. **OpenAPI diff**: Compare PR's `openapi.json` against main's. Flag removed endpoints/fields.
+3. **DuckDB schema diff**: Compare table definitions in system.duckdb
+4. **Config diff**: Compare `instance.yaml.example` required keys
+5. **Full connector matrix**: ALL connectors tested, not just changed ones
+
+If breaking change detected:
+- PR gets `BREAKING` label automatically
+- Requires 2 reviewers (elevated review)
+- Commit message must have `BREAKING:` prefix
+- CHANGELOG.md entry with migration guide required
+- On merge: previous stable tagged as `:deprecated-YYYY.MM.N`
+
+### Deprecated channel
+
+When a breaking change merges:
+1. Previous stable image retagged to `:deprecated-2026.04.N`
+2. New build becomes `:stable` + `:stable-2026.04.(N+1)`
+3. Health endpoint on deprecated version shows warning:
+   ```json
+   {"warnings": ["Running deprecated version 2026.04.3. Update to stable."]}
+   ```
+4. Deprecated images removed from GHCR after 30 days
+
+---
+
+## 3. Smoke Test (Post-Deploy Safety Net)
+
+### What it tests
+
+Automated sequence run on canary VM after every `:stable` deploy:
+
+```
+1. GET  /api/health                    → status != "unhealthy"
+2. POST /auth/token                    → 200 (valid credentials)
+3. GET  /api/catalog/tables            → count > 0
+4. POST /api/query {sql: "SELECT 1"}   → 200 + rows
+5. POST /api/sync/trigger              → 200
+6. (wait 30s)
+7. GET  /api/health                    → check no new errors
+```
+
+### On failure
+
+1. Alert (GitHub issue + optional webhook)
+2. Canary VM rolled back to previous stable: `docker compose pull && docker compose up -d` with previous tag
+3. Failed build tagged `:deprecated-YYYY.MM.N`
+4. `:stable` tag reverted to previous good build
+
+### Implementation
+
+GitHub Actions workflow triggered after the build-and-push workflow completes:
+
+```yaml
+smoke-test:
+  needs: build-and-push
+  runs-on: ubuntu-latest
+  steps:
+    - name: Deploy to canary
+      run: |
+        gcloud compute ssh canary-vm --command="
+          cd /opt/agnes &&
+          docker compose pull &&
+          docker compose up -d"
+
+    - name: Wait for healthy
+      run: |
+        # Poll for up to 5 minutes (30 x 10s). An empty or "null" status means
+        # the app is not answering yet, so it must not count as healthy.
+        for i in $(seq 1 30); do
+          STATUS=$(curl -sf canary:8000/api/health | jq -r .status || true)
+          case "$STATUS" in
+            ""|null|unhealthy) sleep 10 ;;
+            *) exit 0 ;;
+          esac
+        done
+        # Fail the step so "Rollback on failure" runs when the canary never comes up.
+        echo "Canary did not become healthy in time" >&2
+        exit 1
+
+    - name: Run smoke tests
+      run: |
+        # auth, catalog, query, sync checks
+        ./scripts/smoke-test.sh canary:8000
+
+    - name: Rollback on failure
+      if: failure()
+      run: |
+        # retag and rollback
+```
+
+---
+
+## 4. Self-Service Deployment
+
+### Target experience
+
+Customer (or their AI agent) goes from zero to running instance:
+
+```bash
+# 1. Get the code
+git clone https://github.com/keboola/agnes-the-ai-analyst.git
+cd agnes-the-ai-analyst
+
+# 2. Start it
+docker compose up -d
+
+# 3. Open browser or use API
+# First visit: /setup wizard (no users exist)
+# Or headless: curl -X POST localhost:8000/auth/bootstrap ...
+```
+
+### Two setup modes
+
+**A) Interactive (browser):**
+- First visit when no users exist → redirected to `/setup`
+- Step 1: Create admin account (email + password)
+- Step 2: Choose data source (Keboola / BigQuery / CSV / Custom)
+- Step 3: Enter credentials (token, URL)
+- Step 4: Auto-discover and register tables
+- Step 5: Trigger first sync
+- Done → redirect to dashboard
+
+**B) Headless (AI agent / CLI):**
+```bash
+# Bootstrap admin
+curl -X POST http://localhost:8000/auth/bootstrap \
+  -H "Content-Type: application/json" \
+  -d '{"email":"admin@company.com","password":"SecurePass123!"}'
+
+# Configure data source
+curl -X POST http://localhost:8000/api/admin/configure \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{"data_source":"keboola","keboola_token":"...","keboola_url":"..."}'
+
+# Discover and register tables
+curl -X POST http://localhost:8000/api/admin/discover-and-register \
+  -H "Authorization: Bearer $TOKEN"
+
+# Trigger first sync
+curl -X POST http://localhost:8000/api/sync/trigger \
+  -H "Authorization: Bearer $TOKEN"
+```
+
+Both modes lead to same result. AI agent uses headless.
+
+### Auto-configuration
+
+On first `docker compose up` with no `.env`:
+- `JWT_SECRET_KEY` auto-generated and persisted to `/data/state/.jwt_secret`
+- `SESSION_SECRET` auto-generated similarly
+- App starts in "setup mode" — only `/setup`, `/auth/bootstrap`, and `/api/health` accessible
+
+On first `docker compose up` with `.env` containing `KEBOOLA_STORAGE_TOKEN`:
+- Auto-discovers tables from Keboola on first sync
+- Skips manual table registration step
+
+### What customer must provide
+
+| Required | Optional |
+|----------|----------|
+| Server with Docker | Custom domain + TLS |
+| Admin email + password | Google OAuth credentials |
+| Data source credentials (Keboola token OR BigQuery creds OR CSV files) | Telegram bot token |
+| | Jira webhook secret |
+
+### What the customer does NOT need to do
+
+- Edit YAML manually (setup wizard generates `instance.yaml`)
+- Generate JWT secret (auto-generated)
+- Register tables manually (auto-discovery)
+- Understand DuckDB internals
+
+---
+
+## 5. Custom Connectors (Three Tiers)
+
+All tiers produce the same output: `extract.duckdb` with `_meta` table + `data/*.parquet`. Orchestrator treats them identically.
+
+### Tier A: Local mount (fastest, AI-generated)
+
+Customer's AI agent generates a connector. Lives outside Docker image, survives updates.
+
+```
+/opt/agnes/
+├── docker-compose.yml              ← official image
+├── docker-compose.override.yml     ← customer additions
+└── custom-connectors/
+    └── snowflake/
+        ├── extractor.py
+        └── requirements.txt
+```
+
+```yaml
+# docker-compose.override.yml
+services:
+  app:
+    volumes:
+      - ./custom-connectors:/app/connectors/custom:ro
+```
+
+Orchestrator scans `connectors/custom/*/` in addition to built-in connectors.
+
+**How the AI agent creates one:**
+1. Reads CLAUDE.md → understands extract.duckdb contract
+2. Reads existing connector as reference (e.g., `connectors/keboola/extractor.py`)
+3. Generates `custom-connectors/snowflake/extractor.py`
+4. Runs contract test to validate output
+5. Done — orchestrator picks it up on next rebuild
+
+**Requirements for this to work:**
+- CLAUDE.md must perfectly describe the contract
+- Contract test must be runnable standalone
+- Existing connectors must be readable as examples
+- Clear error messages when contract doesn't match
+
+### Tier B: Standalone container (complex dependencies)
+
+For connectors needing their own runtime (Java, .NET, heavy Python packages).
+
+```yaml
+# docker-compose.override.yml
+services:
+  connector-sap:
+    build: ./custom-connectors/sap
+    volumes:
+      - data:/data
+    environment:
+      - DATA_DIR=/data
+      - SAP_HOST=...
+    profiles:
+      - extract
+```
+
+Connector is its own Docker image. Writes to `/data/extracts/sap/extract.duckdb`. Orchestrator finds it automatically.
+
+### Tier C: Community PR (shared with all)
+
+Connector contributed to main repo via PR. After merge, available in official image for all customers.
+
+```
+connectors/
+├── keboola/          ← built-in
+├── bigquery/         ← built-in
+├── jira/             ← built-in
+└── snowflake/        ← community contributed
+```
+
+**PR requirements:**
+- Must pass contract tests
+- Must include tests
+- Must not modify shared code (orchestrator, API, auth)
+- CI runs full connector matrix
+
+---
+
+## 6. CI/CD Pipeline
+
+### On feature branch push
+
+```yaml
+ci.yml:
+  - tests (all 654+)
+  - contract tests (all connectors)
+  - docker build
+  - push :dev + :dev-YYYY.MM.N + :sha-xxx to GHCR
+```
+
+### On merge to main
+
+```yaml
+release.yml:
+  - tests (all)
+  - contract tests (all connectors)
+  - breaking change detection (OpenAPI diff, schema diff)
+  - docker build
+  - push :stable + :stable-YYYY.MM.N + :sha-xxx to GHCR
+  - trigger smoke test on canary
+
+smoke-test.yml (triggered):
+  - deploy to canary VM
+  - run smoke test sequence
+  - on failure: rollback canary, tag build as deprecated, create alert
+```
+
+### On PR
+
+```yaml
+pr-check.yml:
+  - tests
+  - contract tests
+  - breaking change detection
+  - label PR: "BREAKING" if detected
+  - require 2 reviewers if breaking
+```
+
+---
+
+## 7. Infrastructure (Cloud-Agnostic)
+
+### Primary: Docker Compose
+
+Works everywhere Docker runs. This is the default and only required deployment method.
+
+```bash
+git clone https://github.com/keboola/agnes-the-ai-analyst.git
+cd agnes-the-ai-analyst
+docker compose up -d
+```
+
+### Optional: Terraform (GCP)
+
+For automated provisioning. Lives in `infra/` with GCS remote state backend.
+
+```bash
+cd infra
+terraform workspace new customer-name
+terraform apply -var-file=instances/customer-name.tfvars
+```
+
+Creates VM, installs Docker, clones repo, generates `.env` and `instance.yaml`, starts Docker Compose.
+
+### Optional: Caddy TLS
+
+Production profile adds Caddy reverse proxy with automatic Let's Encrypt:
+
+```bash
+DOMAIN=data.customer.com docker compose --profile production up -d
+```
+
+### Directory layout on customer server
+
+```
+/opt/agnes/                           ← git clone
+├── docker-compose.yml                ← official
+├── docker-compose.prod.yml           ← GHCR images
+├── docker-compose.override.yml       ← customer customizations
+├── .env                              ← secrets (gitignored)
+├── config/
+│   └── instance.yaml                 ← generated by setup wizard
+├── custom-connectors/                ← Tier A connectors
+│   └── snowflake/
+└── Caddyfile                         ← TLS config
+
+/data/                                ← Docker volume (persistent)
+├── state/system.duckdb               ← users, registry, sync state
+├── analytics/server.duckdb           ← views into extracts
+└── extracts/                         ← per-source data
+    ├── keboola/extract.duckdb
+    ├── bigquery/extract.duckdb
+    └── snowflake/extract.duckdb      ← from custom connector
+```
+
+---
+
+## 8. AI Agent as Primary Installer
+
+CLAUDE.md and documentation must be optimized for AI agent consumption:
+
+### CLAUDE.md requirements
+- Complete extract.duckdb contract with exact SQL for `_meta` and `_remote_attach`
+- Step-by-step setup instructions with exact curl commands
+- Existing connectors as reference for AI-generated new ones
+- Clear error messages explaining what went wrong and how to fix
+
+### API requirements
+- All setup operations available as API calls (not just UI)
+- Self-describing error messages: `"Missing KEBOOLA_STORAGE_TOKEN. Set it in .env or pass via /api/admin/configure"`
+- `/api/health` returns structured diagnostics AI agent can parse
+- `/api/admin/configure` accepts data source config without file editing
+
+### Documentation requirements
+- Machine-readable (no screenshots, no "click here")
+- Every manual step has an equivalent API/CLI command
+- QUICKSTART.md optimized for copy-paste by AI agent
+
+---
+
+## 9. What Needs to Be Built
+
+### Must have (blocks multi-instance)
+
+| # | What | Effort |
+|---|------|--------|
+| 1 | CalVer auto-tagging in CI (release.yml) | 1 day |
+| 2 | Smoke test script + CI workflow | 1 day |
+| 3 | Breaking change detection in CI (OpenAPI diff, contract diff) | 2 days |
+| 4 | `/setup` wizard (web) + `/api/admin/configure` (headless) | 3 days |
+| 5 | Auto-generate JWT_SECRET_KEY on first start | 0.5 day |
+| 6 | Auto-discovery for Keboola tables on first sync | 1 day |
+| 7 | Custom connector mount support in orchestrator | 1 day |
+| 8 | `CHANGELOG.md` + release notes template | 0.5 day |
+| 9 | Health endpoint version + channel info | 0.5 day |
+
+### Should have (improves experience)
+
+| # | What | Effort |
+|---|------|--------|
+| 10 | Deprecated version warning in health endpoint | 0.5 day |
+| 11 | `/api/admin/discover-and-register` auto-discovery endpoint | 1 day |
+| 12 | Standalone container connector example (Tier B) | 0.5 day |
+| 13 | CLAUDE.md optimization for AI agent setup | 1 day |
+| 14 | Terraform module refactor for multi-workspace | 1 day |
+
+### Nice to have (future)
+
+| # | What |
+|---|------|
+| 15 | Community connector contribution guide |
+| 16 | Instance health dashboard (central monitoring) |
+| 17 | Automated backup (GCP disk snapshots) |
+| 18 | Usage analytics (opt-in telemetry) |
+
+---
+
+## Non-Goals
+
+- Multi-tenancy in single process (each customer = separate instance)
+- Kubernetes/Helm (Docker Compose is sufficient for target scale)
+- Paid tier / license keys (open-source, monetization TBD)
+- GUI for connector development (AI agent + CLAUDE.md is sufficient)
--- a/scripts/generate_openapi.py
+++ b/scripts/generate_openapi.py
@ -0,0 +1,16 @@
+"""Print the current FastAPI app's OpenAPI schema to stdout as JSON."""
+
+import json
+import os
+import sys
+
+# Parent of this script's directory goes first on sys.path so `app` is importable.
+_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, _REPO_ROOT)
+os.environ.setdefault("TESTING", "1")
+os.environ.setdefault("JWT_SECRET_KEY", "snapshot-generation-key-32-chars-min!!")
+
+from app.main import create_app  # noqa: E402
+
+# Sorted keys and fixed indentation keep the output stable from run to run.
+sys.stdout.write(json.dumps(create_app().openapi(), indent=2, sort_keys=True) + "\n")
--- a/scripts/smoke-test.sh
+++ b/scripts/smoke-test.sh
@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# Agnes smoke test — verifies a running instance is functional.
+# Usage: ./scripts/smoke-test.sh [base-url]      (default: http://localhost:8000)
+# Env:   SMOKE_TOKEN — bearer token used when bootstrap reports that users already exist (403)
+set -euo pipefail
+
+HOST="${1:-http://localhost:8000}"  # base URL; endpoint paths are appended to it
+PASS=0                              # number of checks that passed
+FAIL=0                              # number of checks that failed
+TOKEN=""                            # bearer token; stays empty when none is available
+
+check() {
+    # Print one result line for $1 and bump PASS or FAIL; only the literal "true" in $2 passes.
+    local label="$1" outcome="$2"
+    case "$outcome" in
+        true) echo "  PASS $label"
+              PASS=$((PASS + 1)) ;;
+        *)    echo "  FAIL $label"
+              FAIL=$((FAIL + 1)) ;;
+    esac
+}
+
+echo "Smoke test: $HOST"
+echo "---"
+
+# 1. Health check — a failure here aborts the run before any other check
+HEALTH=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable")
+if [ "$HEALTH" = "unhealthy" ] || [ "$HEALTH" = "unreachable" ]; then
+    echo "  FATAL: health=$HEALTH"
+    exit 1
+fi
+check "health ($HEALTH)" "true"
+
+# 2. Health has version fields
+HAS_VERSION=$(curl -sf "$HOST/api/health" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+print('true' if 'version' in d and 'channel' in d and 'schema_version' in d else 'false')
+" 2>/dev/null || echo "false")
+check "health version fields" "$HAS_VERSION"
+
+# 3. Bootstrap (only works on fresh DB; 403 means users exist)
+# Body goes to a private temp file; curl -w itself prints 000 when it gets no response.
+BOOT_BODY=$(mktemp); trap 'rm -f "$BOOT_BODY"' EXIT
+BOOT_HTTP=$(curl -s -o "$BOOT_BODY" -w "%{http_code}" -X POST "$HOST/auth/bootstrap" \
+  -H "Content-Type: application/json" \
+  -d '{"email":"smoke@test.local","name":"Smoke Test","password":"SmokeTest123!"}' 2>/dev/null || true)
+
+if [ "$BOOT_HTTP" = "200" ]; then
+    TOKEN=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['access_token'])" "$BOOT_BODY" 2>/dev/null || echo "")
+    check "bootstrap (new admin)" "true"
+elif [ "$BOOT_HTTP" = "403" ]; then
+    TOKEN="${SMOKE_TOKEN:-}"
+    echo "  SKIP bootstrap (users exist)"
+else
+    check "bootstrap (HTTP ${BOOT_HTTP:-000})" "false"
+fi
+
+# 4. Query SELECT 1 (requires auth)
+if [ -n "$TOKEN" ]; then
+    QUERY_OK=$(curl -sf -X POST "$HOST/api/query" \
+      -H "Authorization: Bearer $TOKEN" \
+      -H "Content-Type: application/json" \
+      -d '{"sql":"SELECT 1 as test"}' | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+print('true' if len(d.get('rows',[])) > 0 else 'false')
+" 2>/dev/null || echo "false")
+    check "query SELECT 1" "$QUERY_OK"
+else
+    echo "  SKIP query (no token)"
+fi
+
+# 5. Sync trigger
+if [ -n "$TOKEN" ]; then
+    SYNC_HTTP=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$HOST/api/sync/trigger" \
+      -H "Authorization: Bearer $TOKEN" 2>/dev/null || true)  # -w already printed the code
+    if [[ "$SYNC_HTTP" =~ ^(200|202)$ ]]; then
+        check "sync trigger" "true"
+    else
+        check "sync trigger (HTTP ${SYNC_HTTP:-000})" "false"
+    fi
+else
+    echo "  SKIP sync (no token)"
+fi
+
+# 6. Post-sync health (wait briefly)
+sleep 5
+HEALTH2=$(curl -sf "$HOST/api/health" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])" 2>/dev/null || echo "unreachable")
+case "$HEALTH2" in
+    unhealthy|unreachable) check "post-sync health ($HEALTH2)" "false" ;;
+    *)                     check "post-sync health ($HEALTH2)" "true" ;;
+esac
+
+# Results
+printf '\nResults: %s passed, %s failed\n' "$PASS" "$FAIL"
+[ "$FAIL" -eq 0 ] || exit 1
--- a/src/db.py
+++ b/src/db.py
@ -4,12 +4,16 @@ Provides get_system_db() for the system state database
 and get_analytics_db() for the analytics database with parquet views.
 """

+import logging
 import os
 import re
+import shutil
 from pathlib import Path

 import duckdb

+logger = logging.getLogger(__name__)
+
 _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")

 SCHEMA_VERSION = 3
@ -260,6 +264,25 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
    """Create tables if they don't exist. Apply migrations if schema version changed."""
    current = get_schema_version(conn)
    if current < SCHEMA_VERSION:
+        # Snapshot before migration for rollback support
+        if current > 0:
+            try:
+                db_path = Path(os.environ.get("DATA_DIR", "./data")) / "state" / "system.duckdb"
+                if db_path.exists():
+                    # Flush WAL to main DB file before copying
+                    try:
+                        conn.execute("CHECKPOINT")
+                    except Exception:
+                        pass  # CHECKPOINT may fail on read-only or in-memory DBs
+                    snapshot = db_path.parent / "system.duckdb.pre-migrate"
+                    shutil.copy2(str(db_path), str(snapshot))
+                    # Also copy WAL if it still exists (belt and suspenders)
+                    wal_path = Path(str(db_path) + ".wal")
+                    if wal_path.exists():
+                        shutil.copy2(str(wal_path), str(snapshot) + ".wal")
+                    logger.info("Pre-migration snapshot saved: %s", snapshot)
+            except Exception as e:
+                logger.warning("Could not create pre-migration snapshot: %s", e)
        conn.execute(_SYSTEM_SCHEMA)
        if current == 0:
            conn.execute(
--- a/tests/snapshots/openapi.json
+++ b/tests/snapshots/openapi.json
--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -144,6 +144,205 @@ class TestGetAnalyticsDb:
            conn.close()


+class TestMigrationSafety:
+    """Tests for schema migration correctness, idempotency, and safety snapshots."""
+
+    # Minimal v2 table_registry (no is_public column — that comes in v3)
+    _V2_TABLE_REGISTRY = """
+        CREATE TABLE table_registry (
+            id VARCHAR PRIMARY KEY,
+            name VARCHAR NOT NULL,
+            source_type VARCHAR,
+            bucket VARCHAR,
+            source_table VARCHAR,
+            sync_strategy VARCHAR DEFAULT 'full_refresh',
+            query_mode VARCHAR DEFAULT 'local',
+            sync_schedule VARCHAR,
+            profile_after_sync BOOLEAN DEFAULT true,
+            primary_key VARCHAR,
+            folder VARCHAR,
+            description TEXT,
+            registered_by VARCHAR,
+            registered_at TIMESTAMP DEFAULT current_timestamp
+        );
+    """
+
+    # Minimal stand-ins for the other system tables, shared by every hand-built
+    # pre-migration database so the fixtures cannot drift apart.
+    _STUB_TABLE_DDL = (
+        "CREATE TABLE IF NOT EXISTS users (id VARCHAR PRIMARY KEY, email VARCHAR)",
+        "CREATE TABLE IF NOT EXISTS sync_state (table_id VARCHAR PRIMARY KEY)",
+        "CREATE TABLE IF NOT EXISTS sync_history (id VARCHAR PRIMARY KEY, table_id VARCHAR)",
+        "CREATE TABLE IF NOT EXISTS user_sync_settings (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
+        "CREATE TABLE IF NOT EXISTS knowledge_items (id VARCHAR PRIMARY KEY, title VARCHAR)",
+        "CREATE TABLE IF NOT EXISTS knowledge_votes (item_id VARCHAR, user_id VARCHAR, PRIMARY KEY(item_id, user_id))",
+        "CREATE TABLE IF NOT EXISTS audit_log (id VARCHAR PRIMARY KEY, action VARCHAR)",
+        "CREATE TABLE IF NOT EXISTS telegram_links (user_id VARCHAR PRIMARY KEY, chat_id BIGINT)",
+        "CREATE TABLE IF NOT EXISTS pending_codes (code VARCHAR PRIMARY KEY, chat_id BIGINT)",
+        "CREATE TABLE IF NOT EXISTS script_registry (id VARCHAR PRIMARY KEY, name VARCHAR, source TEXT)",
+        "CREATE TABLE IF NOT EXISTS table_profiles (table_id VARCHAR PRIMARY KEY, profile JSON)",
+        "CREATE TABLE IF NOT EXISTS dataset_permissions (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
+    )
+
+    def _create_stub_tables(self, conn):
+        """Create the stub tables from _STUB_TABLE_DDL on an open connection."""
+        for ddl in self._STUB_TABLE_DDL:
+            conn.execute(ddl)
+
+    def _create_v2_db(self, db_path):
+        """Create a minimal v2-schema DuckDB file at db_path."""
+        import duckdb as _duckdb
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        conn = _duckdb.connect(str(db_path))
+        try:
+            conn.execute(
+                "CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);"
+                "INSERT INTO schema_version (version) VALUES (2);"
+            )
+            conn.execute(self._V2_TABLE_REGISTRY)
+            # Stub out remaining tables so _ensure_schema doesn't fail
+            self._create_stub_tables(conn)
+        finally:
+            conn.close()
+
+    def test_v2_to_v3_migration(self, tmp_path, monkeypatch):
+        """v2 DB migrated to v3: schema_version=3 and is_public column added."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb as _duckdb
+        from src.db import _ensure_schema, get_schema_version
+
+        db_path = tmp_path / "state" / "system.duckdb"
+        self._create_v2_db(db_path)
+
+        conn = _duckdb.connect(str(db_path))
+        try:
+            # Migrate the hand-built v2 file in place
+            _ensure_schema(conn)
+            assert get_schema_version(conn) == 3
+            cols = {
+                r[0]
+                for r in conn.execute(
+                    "SELECT column_name FROM information_schema.columns WHERE table_name='table_registry'"
+                ).fetchall()
+            }
+            assert "is_public" in cols
+        finally:
+            conn.close()
+
+    def test_migration_idempotency(self, tmp_path, monkeypatch):
+        """Calling _ensure_schema twice on a fresh DB raises no error and leaves version at SCHEMA_VERSION."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb as _duckdb
+        from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION
+
+        db_path = tmp_path / "state" / "system.duckdb"
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        conn = _duckdb.connect(str(db_path))
+        try:
+            _ensure_schema(conn)
+            # Second call runs against an already up-to-date schema
+            _ensure_schema(conn)
+            assert get_schema_version(conn) == SCHEMA_VERSION
+        finally:
+            conn.close()
+
+    def test_migration_preserves_data(self, tmp_path, monkeypatch):
+        """Data inserted before migration is preserved after migration runs."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb as _duckdb
+        from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION
+
+        db_path = tmp_path / "state" / "system.duckdb"
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        conn = _duckdb.connect(str(db_path))
+        try:
+            # Build a v1 schema manually
+            conn.execute(
+                "CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);"
+                "INSERT INTO schema_version (version) VALUES (1);"
+            )
+            conn.execute("""
+                CREATE TABLE table_registry (
+                    id VARCHAR PRIMARY KEY,
+                    name VARCHAR NOT NULL,
+                    folder VARCHAR,
+                    sync_strategy VARCHAR,
+                    primary_key VARCHAR,
+                    description TEXT,
+                    registered_by VARCHAR,
+                    registered_at TIMESTAMP DEFAULT current_timestamp
+                );
+            """)
+            conn.execute(
+                "INSERT INTO table_registry (id, name, description) VALUES ('row1', 'MyTable', 'kept')"
+            )
+            # Stub remaining tables
+            self._create_stub_tables(conn)
+
+            _ensure_schema(conn)
+
+            # Version is current and the row written before migration is intact
+            assert get_schema_version(conn) == SCHEMA_VERSION
+            row = conn.execute(
+                "SELECT name, description FROM table_registry WHERE id='row1'"
+            ).fetchone()
+            assert row is not None, "Pre-migration row was lost"
+            assert row[0] == "MyTable"
+            assert row[1] == "kept"
+        finally:
+            conn.close()
+
+    def test_pre_migration_snapshot_created(self, tmp_path, monkeypatch):
+        """A pre-migrate snapshot is written when migrating an existing (non-fresh) DB."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        from src.db import get_system_db
+
+        # Create a v2 DB at the expected path before calling get_system_db
+        db_path = tmp_path / "state" / "system.duckdb"
+        self._create_v2_db(db_path)
+
+        conn = get_system_db()
+        try:
+            # The snapshot is expected next to the database file
+            snapshot = tmp_path / "state" / "system.duckdb.pre-migrate"
+            assert snapshot.exists(), "Pre-migration snapshot was not created"
+        finally:
+            conn.close()
+
+    def test_no_snapshot_on_fresh_db(self, tmp_path, monkeypatch):
+        """No pre-migrate snapshot is created when initialising a brand-new DB."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        from src.db import get_system_db
+
+        # tmp_path is empty, so no database file exists before this call
+        conn = get_system_db()
+        try:
+            snapshot = tmp_path / "state" / "system.duckdb.pre-migrate"
+            assert not snapshot.exists(), "Snapshot should not exist for a fresh DB"
+        finally:
+            conn.close()
+
+    def test_future_version_is_noop(self, tmp_path, monkeypatch):
+        """_ensure_schema does nothing when schema_version > SCHEMA_VERSION."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb as _duckdb
+        from src.db import _ensure_schema, get_schema_version
+
+        db_path = tmp_path / "state" / "system.duckdb"
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        conn = _duckdb.connect(str(db_path))
+        try:
+            # Stamp the database with a version higher than the one asserted elsewhere (3)
+            conn.execute(
+                "CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);"
+                "INSERT INTO schema_version (version) VALUES (99);"
+            )
+            _ensure_schema(conn)
+            assert get_schema_version(conn) == 99
+        finally:
+            conn.close()
+
+
 class TestGetAnalyticsDbReadonly:
    def test_analytics_readonly_rejects_malicious_dir_name(self, tmp_path, monkeypatch):
        """Directories with SQL-injection chars in their name are skipped."""
--- a/tests/test_openapi_snapshot.py
+++ b/tests/test_openapi_snapshot.py
@ -0,0 +1,73 @@
+"""OpenAPI snapshot test — detect breaking API changes.
+
+Compares the current app's OpenAPI schema against a committed snapshot.
+Fails if any path or HTTP method has been removed (breaking change).
+
+To update the snapshot after an intentional change:
+    make update-openapi-snapshot
+"""
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "openapi.json"
+
+
+@pytest.fixture(scope="module")
+def current_schema():
+    """OpenAPI schema of a freshly created app, built once per test module."""
+    os.environ.setdefault("TESTING", "1")  # set before app.main is imported
+    from app.main import create_app
+
+    return create_app().openapi()
+
+
+def test_snapshot_exists():
+    """The OpenAPI snapshot file must be present at SNAPSHOT_PATH."""
+    hint = "No OpenAPI snapshot found. Generate one with: make update-openapi-snapshot"
+    present = SNAPSHOT_PATH.exists()
+    assert present, hint
+
+
+def test_no_removed_paths(current_schema):
+    """Every path recorded in the snapshot must still exist in the current schema."""
+    if not SNAPSHOT_PATH.exists():
+        pytest.skip("No snapshot to compare against")
+
+    recorded = json.loads(SNAPSHOT_PATH.read_text()).get("paths", {})
+    live = current_schema.get("paths", {})
+
+    # Paths that exist only in the current schema are additions and are allowed.
+    removed = sorted(p for p in recorded if p not in live)
+    assert not removed, (
+        f"BREAKING: {len(removed)} API path(s) removed: {removed}\n"
+        "If intentional, run: make update-openapi-snapshot"
+    )
+
+
+def test_no_removed_methods(current_schema):
+    """No HTTP operation may be removed from a path that still exists."""
+    if not SNAPSHOT_PATH.exists():
+        pytest.skip("No snapshot to compare against")
+
+    # Operation keys of an OpenAPI Path Item (including 'trace'); other keys
+    # such as 'parameters' are not operations, so dropping them is not reported.
+    http_methods = {"get", "put", "post", "delete", "options", "head", "patch", "trace"}
+    snapshot_paths = json.loads(SNAPSHOT_PATH.read_text()).get("paths", {})
+    current_paths = current_schema.get("paths", {})
+
+    breaking = []
+    # Only paths present on both sides are inspected; sorted() keeps the report order stable.
+    for path in sorted(set(snapshot_paths) & set(current_paths)):
+        removed_http = (set(snapshot_paths[path]) - set(current_paths[path])) & http_methods
+        if removed_http:
+            breaking.append(f"  {path}: {sorted(removed_http)}")
+
+    assert not breaking, (
+        f"BREAKING: HTTP methods removed from {len(breaking)} path(s):\n"
+        + "\n".join(breaking)
+        + "\nIf intentional, run: make update-openapi-snapshot"
+    )
--- a/tests/test_security.py
+++ b/tests/test_security.py
@ -304,26 +304,37 @@ class TestJwtClaims:
 # ---- JWT Secret Hardening ----

 class TestJwtSecretHardening:
-    def test_raises_without_jwt_secret_in_non_test_env(self):
-        """Module-level code must raise RuntimeError when JWT_SECRET_KEY is absent
-        and TESTING is not set, preventing accidental production deploys with no secret."""
+    def test_auto_generates_jwt_secret_when_absent(self, tmp_path):
+        """When JWT_SECRET_KEY is absent and TESTING is not set,
+        the secret is auto-generated and persisted to a file."""
        saved_key = os.environ.pop("JWT_SECRET_KEY", None)
        saved_testing = os.environ.pop("TESTING", None)
-        # Eject any cached module so the re-import re-executes module-level code
+        saved_data_dir = os.environ.get("DATA_DIR")
+        os.environ["DATA_DIR"] = str(tmp_path)
+        # Eject cached modules so the re-import re-executes module-level code
        sys.modules.pop("app.auth.jwt", None)
+        sys.modules.pop("app.secrets", None)
        try:
-            with pytest.raises(RuntimeError, match="JWT_SECRET_KEY environment variable is required"):
-                importlib.import_module("app.auth.jwt")
+            importlib.import_module("app.auth.jwt")
+            secret_file = tmp_path / "state" / ".jwt_secret"
+            assert secret_file.exists(), "JWT secret file should be auto-generated"
+            secret = secret_file.read_text().strip()
+            assert len(secret) == 64, "Auto-generated secret should be 64 hex chars (32 bytes)"
        finally:
            # Restore environment before re-importing so the module loads cleanly
            if saved_key is not None:
                os.environ["JWT_SECRET_KEY"] = saved_key
            if saved_testing is not None:
                os.environ["TESTING"] = saved_testing
+            if saved_data_dir is not None:
+                os.environ["DATA_DIR"] = saved_data_dir
+            else:
+                os.environ.pop("DATA_DIR", None)
            # If neither was set (bare test run), use TESTING flag so reload works
            if saved_key is None and saved_testing is None:
                os.environ["TESTING"] = "1"
            sys.modules.pop("app.auth.jwt", None)
+            sys.modules.pop("app.secrets", None)
            importlib.import_module("app.auth.jwt")
            # Clean up the temporary TESTING flag if we added it
            if saved_key is None and saved_testing is None: