# Release workflow: runs the test suite, claims a CalVer git tag, builds and
# pushes the container image to GHCR, then (for main only) smoke-tests the
# published image with automatic rollback of :stable on failure and runs a
# bind-mount end-to-end check.
name: Release

on:
  push:
    branches:
      - main
      - "**"  # build :dev-<branch-slug> image for any branch push (e.g. feature/x, zs/edit, fix/y)
    paths-ignore:
      - "docs/**"
      - "*.md"
      - "LICENSE"
  # Branch creation. Required because `paths-ignore` on the `push` event
  # diffs the new ref against the default branch — a branch created from
  # main with no extra commits has zero diff, so every file matches
  # paths-ignore and the workflow is skipped. Devs spinning up a personal
  # branch off main to deploy main's exact state to their dev VM
  # (`:dev-<user-prefix>-latest` floating tag) need an image to be published,
  # so we trigger explicitly on branch create. Tag creates are filtered out
  # at the job level so we don't double-build with `keboola-deploy.yml`
  # (which owns `keboola-deploy-*` tag pushes).
  create:
  workflow_dispatch:  # manual trigger for explicit dev-<branch-slug> builds

permissions:
  contents: write
  packages: write
  # `issues: write` lets the smoke-test job's rollback step open a
  # GitHub issue alerting operators when an auto-rollback fires. Without
  # this, the `gh issue create` call hits 403 and the `|| echo` fallback
  # silently swallows it — operators see :stable revert with no alert.
  issues: write

# When a developer pushes a brand-new branch with code changes, GitHub fires
# both a `create` and a `push` event for the same commit. Without
# concurrency control, both runs would claim distinct CalVer version tags
# (dev-YYYY.MM.N and dev-YYYY.MM.N+1) and race to push overlapping floating
# tags (:dev, :dev-<branch-slug>, :dev-<user-prefix>-latest). Group by ref and
# cancel in-progress duplicates so only the most recent event survives — the
# zero-diff case (only `create` fires, no `push`) is unaffected since
# there's only one run.
concurrency:
  group: release-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Install the package with dev extras, run advisory lint/type checks, then
  # the pytest suite. Only the pytest step can fail the job.
  test:
    # Skip the `create` event for tags — those are owned by keboola-deploy.yml
    # and shouldn't double-build here. Branch creates DO run.
    if: github.event_name != 'create' || github.event.ref_type == 'branch'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - uses: actions/setup-python@v6
        with:
          python-version: "3.13"

      - name: Install uv
        uses: astral-sh/setup-uv@v7

      - name: Install dependencies
        run: uv pip install --system ".[dev]"

      - name: Lint with ruff
        run: |
          pip install ruff
          ruff check . || true
        continue-on-error: true  # Don't block on pre-existing lint issues; can tighten later

      - name: Type check with mypy
        run: |
          pip install mypy
          mypy src/ app/ cli/ connectors/ --ignore-missing-imports --no-error-summary || true
        continue-on-error: true  # Don't block on mypy initially, can tighten later

      - name: Run tests
        run: pytest tests/ -v --tb=short
        env:
          TESTING: "1"

  build-and-push:
    needs: test
    # Publish on:
    #   - any push (main → :stable-* / non-main → :dev-* + :dev-<branch-slug>);
    #   - branch creation (a fresh branch off main with no extra commits
    #     should still produce a `:dev-<branch-slug>` + `:dev-<user-prefix>-latest`
    #     image so the developer's VM, which pins to that floating tag,
    #     can deploy main's exact state without manually changing code);
    #   - manual workflow_dispatch.
    # Tag creates are excluded — `keboola-deploy.yml` owns tag pushes.
    if: |
      github.event_name == 'push' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'create' && github.event.ref_type == 'branch')
    runs-on: ubuntu-latest
    outputs:
      image_tag: ${{ steps.meta.outputs.versioned_tag }}
      version: ${{ steps.meta.outputs.version }}
      channel: ${{ steps.meta.outputs.channel }}
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0
          fetch-tags: true

      - name: Claim version tag (with retry to avoid race conditions)
        id: meta
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          YEAR_MONTH=$(date +%Y.%m)

          # SECURITY: read the ref and SHA from the runner-provided
          # GITHUB_REF / GITHUB_SHA environment variables instead of
          # interpolating `github.ref` / `github.sha` expressions into this
          # script. Expressions are expanded into the script text before bash
          # parses it, and this workflow runs for arbitrary branch names, so
          # a branch name containing `$(...)`, backticks or quotes would be
          # executed as shell code with this job's write permissions.
          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
            CHANNEL="stable"
          else
            CHANNEL="dev"
          fi

          SHORT_SHA="${GITHUB_SHA:0:7}"

          # Claim a unique version by pushing a git tag BEFORE building.
          # Retry up to 5 times if another CI run took our N.
          TAG_CLAIMED=false
          for ATTEMPT in 1 2 3 4 5; do
            git fetch --tags --force
            # Use max(N) not count — safe even if tags are deleted
            MAX_N=$(git tag -l "*-${YEAR_MONTH}.*" | sed 's/.*\.//' | sort -n | tail -1)
            N=$(( ${MAX_N:-0} + 1 ))
            VERSION="${YEAR_MONTH}.${N}"
            TAG="${CHANNEL}-${VERSION}"

            git tag -a "$TAG" -m "Release $TAG"
            if git push origin "$TAG" 2>/dev/null; then
              echo "Claimed tag $TAG (attempt $ATTEMPT)"
              TAG_CLAIMED=true
              break
            else
              echo "Tag $TAG already exists, retrying... (attempt $ATTEMPT)"
              git tag -d "$TAG"
              sleep 2
            fi
          done

          if [ "$TAG_CLAIMED" != "true" ]; then
            echo "::error::Failed to claim a unique version tag after 5 attempts"
            exit 1
          fi

          echo "channel=${CHANNEL}" >> "$GITHUB_OUTPUT"
          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
          echo "versioned_tag=${TAG}" >> "$GITHUB_OUTPUT"
          echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"

          # Per-branch slug for dev builds (enables branch-aware dev VMs)
          if [[ "$GITHUB_REF" != "refs/heads/main" ]]; then
            BRANCH_NAME="${GITHUB_REF#refs/heads/}"
            BRANCH_SLUG=$(echo "$BRANCH_NAME" | sed 's|^feature/||' | sed 's|[^a-zA-Z0-9-]|-|g' | tr '[:upper:]' '[:lower:]' | cut -c1-50)
            echo "branch_slug=${BRANCH_SLUG}" >> "$GITHUB_OUTPUT"
            echo "Branch slug: ${BRANCH_SLUG}"

            # User prefix for <user-prefix>/<topic> branches — powers the
            # dev-<user-prefix>-latest alias tag so each developer's personal VM
            # can pin to their prefix and auto-pull the latest push. Common
            # Git Flow prefixes are skipped so `feature/x`, `fix/y` etc.
            # don't create noisy -latest tags.
            if [[ "$BRANCH_NAME" == *"/"* ]]; then
              USER_PREFIX=$(echo "$BRANCH_NAME" | cut -d/ -f1 | sed 's|[^a-zA-Z0-9-]|-|g' | tr '[:upper:]' '[:lower:]')
              case "$USER_PREFIX" in
                feature|fix|hotfix|bugfix|docs|chore|test|ci|ops|refactor|perf|style|build)
                  echo "Branch prefix '$USER_PREFIX' is a Git Flow convention — skipping dev-*-latest alias"
                  ;;
                *)
                  echo "user_prefix=${USER_PREFIX}" >> "$GITHUB_OUTPUT"
                  echo "User prefix: ${USER_PREFIX} (will push dev-${USER_PREFIX}-latest alias)"
                  ;;
              esac
            fi
          fi

          echo "Channel: ${CHANNEL}"
          echo "Version: ${VERSION}"
          echo "Versioned tag: ${TAG}"

      - name: Extract package version from pyproject.toml
        id: pkgver
        run: |
          # Single source of truth for the product version: the
          # pyproject.toml [project] table. The CalVer "${YEAR_MONTH}.${N}"
          # claimed above stays as the git / image tag (release identity),
          # but AGNES_VERSION — what /api/version, /cli/latest, and
          # `da --version` all expose — tracks the package version.
          VERSION=$(grep '^version' pyproject.toml | head -1 | sed -E 's/^version\s*=\s*"([^"]+)".*/\1/')
          if [ -z "$VERSION" ]; then
            echo "::error::Could not extract version from pyproject.toml"
            exit 1
          fi
          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
          echo "Package version: ${VERSION}"

      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v7
        with:
          push: true
          build-args: |
            AGNES_VERSION=${{ steps.pkgver.outputs.version }}
            RELEASE_CHANNEL=${{ steps.meta.outputs.channel }}
            AGNES_COMMIT_SHA=${{ github.sha }}
            AGNES_TAG=${{ steps.meta.outputs.versioned_tag }}
          # The last two entries evaluate to an empty line when they do not
          # apply (stable channel, or no user prefix was emitted).
          tags: |
            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.channel }}
            ghcr.io/${{ github.repository }}:${{ steps.meta.outputs.versioned_tag }}
            ghcr.io/${{ github.repository }}:sha-${{ steps.meta.outputs.short_sha }}
            ${{ steps.meta.outputs.channel == 'dev' && format('ghcr.io/{0}:dev-{1}', github.repository, steps.meta.outputs.branch_slug) || '' }}
            ${{ steps.meta.outputs.channel == 'dev' && steps.meta.outputs.user_prefix != '' && format('ghcr.io/{0}:dev-{1}-latest', github.repository, steps.meta.outputs.user_prefix) || '' }}

  # Boots the just-published image via docker compose, runs the smoke-test
  # script against it and, if any step fails, re-points :stable at the
  # previous stable tag and opens a GitHub issue. Runs for main only.
  smoke-test:
    needs: build-and-push
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0
          fetch-tags: true

      # Required for the rollback step's `docker push` to GHCR. The
      # `build-and-push` job logs in for itself; this job needs its own
      # login since GitHub Actions tokens are scoped per-job. Without it,
      # the rollback hits "unauthenticated: User cannot be authenticated
      # with the token provided" and silently leaves :stable pointing at
      # the broken image (real incident: PR #137 / 4ec5ff44).
      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start Agnes from built image
        run: |
          # Create empty .env (docker-compose.yml requires env_file: .env, gitignored)
          touch .env
          # Use prod compose (GHCR images) + CI overlay (test secrets)
          export AGNES_TAG="${{ needs.build-and-push.outputs.image_tag }}"
          docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml up -d app
          # Wait for healthy (max 60s)
          timeout 60 bash -c 'until curl -sf http://localhost:8000/api/health | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d[\"status\"]!=\"unhealthy\" else 1)"; do sleep 3; done'

      - name: Run smoke tests
        run: bash scripts/smoke-test.sh http://localhost:8000

      - name: Automatic rollback on failure
        if: failure()
        env:
          # Required for the `gh issue create` call below — without GH_TOKEN
          # the gh CLI fails the auth check and the issue creation falls
          # through the `|| echo` fallback, so an operator never sees the
          # rollback alert.
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          IMAGE_TAG="${{ needs.build-and-push.outputs.image_tag }}"
          VERSION="${{ needs.build-and-push.outputs.version }}"
          DEPRECATED_TAG="deprecated-${VERSION}"
          REPO="ghcr.io/${{ github.repository }}"

          echo "Smoke test failed — initiating rollback"

          # Tag the current (failed) image as :deprecated-YYYY.MM.N
          docker pull "${REPO}:${IMAGE_TAG}"
          docker tag "${REPO}:${IMAGE_TAG}" "${REPO}:${DEPRECATED_TAG}"
          docker push "${REPO}:${DEPRECATED_TAG}"
          echo "Tagged failed image as ${REPO}:${DEPRECATED_TAG}"

          # Revert :stable to the previous known-good image: the stable tag
          # that sorts immediately below the failed one. Selecting relative
          # to IMAGE_TAG (rather than "second newest") matters when the
          # failed tag is the only stable tag — "second newest" would then
          # resolve to the failed tag itself and report a rollback that
          # never happened. With no older tag, PREV_TAG stays empty and the
          # warning branch below fires.
          PREV_TAG=$(git tag -l "stable-*" --sort=-version:refname \
            | awk -v failed="$IMAGE_TAG" 'seen { print; exit } $0 == failed { seen = 1 }')
          if [ -n "$PREV_TAG" ]; then
            docker pull "${REPO}:${PREV_TAG}"
            docker tag "${REPO}:${PREV_TAG}" "${REPO}:stable"
            docker push "${REPO}:stable"
            echo "Reverted :stable to ${PREV_TAG}"
          else
            echo "WARNING: No previous stable tag found — cannot revert :stable automatically"
          fi

          # Create a GitHub issue alerting about the failure
          ISSUE_TITLE="Smoke test failure — rollback to ${PREV_TAG:-unknown}"
          ISSUE_BODY="## Automatic Rollback Report\n\nThe smoke test for image \`${IMAGE_TAG}\` failed.\n\n- **Failed image**: \`${REPO}:${IMAGE_TAG}\`\n- **Deprecated tag**: \`${REPO}:${DEPRECATED_TAG}\`\n- **Rolled back to**: \`${PREV_TAG:-N/A}\`\n- **Commit**: \`${{ github.sha }}\`\n- **Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\nPlease investigate and fix before re-deploying."
          gh issue create --title "$ISSUE_TITLE" --body "$(echo -e "$ISSUE_BODY")" --label "bug" || echo "Failed to create GitHub issue (gh CLI may not be available)"

      - name: Collect logs on failure
        if: failure()
        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml logs > smoke-test-logs.txt

      - name: Upload logs
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: smoke-test-logs
          path: smoke-test-logs.txt

      - name: Teardown
        if: always()
        run: docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ci.yml down -v

  # Reproduces the deploy shape that broke agnes-development on 2026-04-29:
  # the production stack uses docker-compose.host-mount.yml to bind-mount /data
  # from the host PD instead of using a Docker named volume. Docker initializes
  # a fresh named volume from the image's /data dir (which the Dockerfile
  # chowns to agnes:agnes BEFORE switching USER), so the existing smoke-test
  # job above never reproduces the "host /data is root-owned, container is
  # USER agnes" scenario. This job pre-creates a host dir, applies the same
  # chown the startup-script does on the GCE VM, and asserts the smoke
  # passes — locking in the chown contract so removing it from
  # startup-script.sh.tpl or flipping the Dockerfile uid breaks CI.
  e2e-bind-mount:
    needs: build-and-push
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Pre-create /data with root-owned subdirs (mimics fresh GCE PD)
        run: |
          sudo mkdir -p /tmp/agnes-data/{state,analytics,extracts}
          sudo chown -R 0:0 /tmp/agnes-data
          ls -la /tmp/agnes-data

      - name: Negative test — image must fail to write before chown
        run: |
          IMAGE="ghcr.io/${{ github.repository }}:${{ needs.build-and-push.outputs.image_tag }}"
          # USER agnes (uid 999) writing to root-owned dir must fail.
          if docker run --rm -v /tmp/agnes-data:/data "$IMAGE" \
              sh -c "touch /data/state/.probe" 2>/dev/null; then
            echo "REGRESSION: write to root-owned /data unexpectedly succeeded"
            echo " Either USER agnes is no longer enforced, or uid pin changed."
            exit 1
          fi
          echo "OK: write correctly fails — operator chown is required"

      - name: Apply startup-script chown (uid:gid 999:999)
        run: sudo chown -R 999:999 /tmp/agnes-data

      - name: Boot stack with bind-mounted /data + run smoke
        run: |
          touch .env
          export AGNES_TAG="${{ needs.build-and-push.outputs.image_tag }}"
          # Override the `data` volume to bind-mount /tmp/agnes-data, mirroring
          # the production host-mount.yml overlay shape.
          cat > docker-compose.bind-test.yml <<'EOF'
          volumes:
            data:
              driver: local
              driver_opts:
                type: none
                o: bind,rbind
                device: /tmp/agnes-data
          EOF
          docker compose \
            -f docker-compose.yml \
            -f docker-compose.prod.yml \
            -f docker-compose.ci.yml \
            -f docker-compose.bind-test.yml \
            up -d app
          timeout 60 bash -c 'until curl -sf http://localhost:8000/api/health | python3 -c "import sys,json; d=json.load(sys.stdin); sys.exit(0 if d[\"status\"]!=\"unhealthy\" else 1)"; do sleep 3; done'
          bash scripts/smoke-test.sh http://localhost:8000

      - name: Collect logs on failure
        if: failure()
        run: |
          docker compose \
            -f docker-compose.yml -f docker-compose.prod.yml \
            -f docker-compose.ci.yml -f docker-compose.bind-test.yml \
            logs > bind-mount-logs.txt 2>&1 || true
          ls -la /tmp/agnes-data /tmp/agnes-data/state 2>&1 | tee -a bind-mount-logs.txt

      - name: Upload logs
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: e2e-bind-mount-logs
          path: bind-mount-logs.txt

      - name: Teardown
        if: always()
        run: |
          docker compose \
            -f docker-compose.yml -f docker-compose.prod.yml \
            -f docker-compose.ci.yml -f docker-compose.bind-test.yml \
            down -v || true
          sudo rm -rf /tmp/agnes-data || true