fix: address PR review findings — config write, CalVer, error handling

- Config writes to DATA_DIR/state/instance.yaml (writable) instead of CONFIG_DIR (read-only :ro in Docker) - instance_config.py checks DATA_DIR/state/ first, then falls back to CONFIG_DIR for backward compat - CalVer counter is now global across channels (*-YYYY.MM.*) per spec - Keboola error messages sanitized — log full error, return generic msg - chmod in secrets.py wrapped in try/except for Windows compat - Setup wizard JS handles 401 (expired JWT) with user-facing message - deploy.yml changed to workflow_dispatch only (no duplicate test runs) - Smoke test uses docker-compose.prod.yml + AGNES_TAG instead of sed - docker-compose.prod.yml uses ${AGNES_TAG:-stable} env var 663 tests pass. 8 E2E verification tests pass.
2026-04-10 13:16:40 +02:00 · 2026-04-10 13:16:40 +02:00 · 49f109bf73
commit 49f109bf73
parent 6c53082295
7 changed files with 67 additions and 30 deletions
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@ -1,11 +1,9 @@
 # SUPERSEDED by release.yml — CalVer tagging with stable/dev channels.
-# This workflow is kept for backward compatibility but only runs tests.
+# Kept for manual trigger only. Automated builds use release.yml.
 # Image build and push is handled by release.yml.
 name: Build & Push (legacy)
 on:
-  push:
+  workflow_dispatch: {}
    branches: [main]
 jobs:
  test:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -53,8 +53,9 @@ jobs:
            CHANNEL="dev"
          fi
-          # Count existing tags for this channel+month to get next N
+          # Count existing tags GLOBALLY across all channels for this month
-          EXISTING=$(git tag -l "${CHANNEL}-${YEAR_MONTH}.*" | wc -l | tr -d ' ')
+          # (spec requires unique N per month: dev-2026.04.1 and stable-2026.04.2, never both .1)
          EXISTING=$(git tag -l "*-${YEAR_MONTH}.*" | wc -l | tr -d ' ')
          N=$((EXISTING + 1))
          VERSION="${YEAR_MONTH}.${N}"
          SHORT_SHA=$(echo "${{ github.sha }}" | cut -c1-7)
@ -104,10 +105,9 @@ jobs:
      - name: Start Agnes from built image
        run: |
-          # Override image to use the just-built version
+          # Use prod compose (GHCR images) + CI overlay (test secrets)
-          export AGNES_IMAGE="ghcr.io/${{ github.repository }}:${{ needs.build-and-push.outputs.image_tag }}"
+          export AGNES_TAG="${{ needs.build-and-push.outputs.image_tag }}"
-          sed -i "s|build: \.|image: ${AGNES_IMAGE}|g" docker-compose.yml
+          docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml up -d app
          docker compose -f docker-compose.yml -f docker-compose.ci.yml up -d
          # Wait for healthy (max 60s)
          timeout 60 bash -c 'until curl -sf http://localhost:8000/api/health | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d[\"status\"]!=\"unhealthy\" else 1)"; do sleep 3; done'
@ -116,7 +116,7 @@ jobs:
      - name: Collect logs on failure
        if: failure()
-        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml logs > smoke-test-logs.txt
+        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml logs > smoke-test-logs.txt
      - name: Upload logs
        if: failure()
@ -127,4 +127,4 @@ jobs:
      - name: Teardown
        if: always()
-        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml down -v
+        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml down -v
--- a/app/api/admin.py
+++ b/app/api/admin.py
@ -182,23 +182,33 @@ async def configure_instance(
            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
            client.verify_token()
        except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Keboola connection failed: {e}")
+            logger.error("Keboola connection validation failed: %s", e)
            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.")
    elif request.data_source == "bigquery":
        if not request.bigquery_project:
            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")
-    # Build instance.yaml config (secrets as ${ENV_VAR} references)
+    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
-    config_dir = Path(os.environ.get("CONFIG_DIR", "./config"))
+    # NOT to CONFIG_DIR which is mounted read-only in Docker.
-    config_path = config_dir / "instance.yaml"
+    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    config_path = data_dir / "state" / "instance.yaml"
-    # Load existing config or start fresh
+    # Load existing API-generated config, or fall back to read-only CONFIG_DIR config
    existing = {}
    if config_path.exists():
        try:
            existing = yaml.safe_load(config_path.read_text()) or {}
        except Exception:
            existing = {}
    else:
        # Try loading from read-only config as base
        ro_path = Path(os.environ.get("CONFIG_DIR", "./config")) / "instance.yaml"
        if ro_path.exists():
            try:
                existing = yaml.safe_load(ro_path.read_text()) or {}
            except Exception:
                existing = {}
    # Merge instance settings
    if request.instance_name:
@ -220,8 +230,8 @@ async def configure_instance(
            "location": request.bigquery_location or "us",
        }
-    # Write instance.yaml
+    # Write to writable data volume
-    config_dir.mkdir(parents=True, exist_ok=True)
+    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(yaml.dump(existing, default_flow_style=False, sort_keys=False))
    logger.info("Wrote instance config to %s", config_path)
--- a/app/instance_config.py
+++ b/app/instance_config.py
@ -11,15 +11,34 @@ _instance_config: Optional[dict] = None
 def load_instance_config() -> dict:
-    """Load instance.yaml using the existing config loader."""
+    """Load instance.yaml — checks API-generated config first, then static config.
    Search order:
    1. DATA_DIR/state/instance.yaml (written by /api/admin/configure, writable)
    2. CONFIG_DIR/instance.yaml (static, read-only in Docker)
    3. Empty dict with defaults (if neither exists)
    """
    global _instance_config
    if _instance_config is not None:
        return _instance_config
    # First, try API-generated config in writable data volume
    import yaml
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
    api_config_path = data_dir / "state" / "instance.yaml"
    if api_config_path.exists():
        try:
-        from config.loader import load_instance_config as _load, get_instance_value
+            _instance_config = yaml.safe_load(api_config_path.read_text()) or {}
            logger.info("Loaded instance.yaml from %s", api_config_path)
            return _instance_config
        except Exception as e:
            logger.warning(f"Could not load API-generated instance.yaml: {e}")
    # Fall back to static config (may have strict validation)
    try:
        from config.loader import load_instance_config as _load
        _instance_config = _load()
-        logger.info("Loaded instance.yaml")
+        logger.info("Loaded instance.yaml from config/")
    except Exception as e:
        logger.warning(f"Could not load instance.yaml: {e}. Using defaults.")
        _instance_config = {}
--- a/app/secrets.py
+++ b/app/secrets.py
@ -19,7 +19,10 @@ def _load_or_generate(env_var: str, file_name: str) -> str:
    secret_path.parent.mkdir(parents=True, exist_ok=True)
    val = secrets.token_hex(32)
    secret_path.write_text(val)
    try:
        secret_path.chmod(0o600)
    except OSError:
        pass  # chmod not supported on all platforms (e.g., Windows)
    logger.info(
        "Auto-generated %s -> %s (set %s in .env to use a fixed value)",
        file_name, secret_path, env_var,
--- a/app/web/templates/setup.html
+++ b/app/web/templates/setup.html
@ -154,6 +154,12 @@ async function apiCall(url, body) {
    const headers = { 'Content-Type': 'application/json' };
    if (token) headers['Authorization'] = 'Bearer ' + token;
    const resp = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
    if (resp.status === 401) {
        token = '';
        sessionStorage.removeItem('setup_token');
        showStatus('Session expired. Please refresh the page and start over.', 'error');
        throw new Error('Session expired');
    }
    const data = await resp.json();
    if (!resp.ok) throw new Error(data.detail || 'Request failed');
    return data;
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@ -1,17 +1,18 @@
 # Production override — uses pre-built GHCR image instead of local build.
 # Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
 # Override tag: AGNES_TAG=stable-2026.04.3 docker compose -f ... up -d
 services:
  app:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  scheduler:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  extract:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  telegram-bot:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  ws-gateway:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  corporate-memory:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}
  session-collector:
-    image: ghcr.io/keboola/agnes-the-ai-analyst:latest
+    image: ghcr.io/keboola/agnes-the-ai-analyst:${AGNES_TAG:-stable}