diff --git a/.github/workflows/keboola-deploy.yml b/.github/workflows/keboola-deploy.yml index d5bd49b..57b09d8 100644 --- a/.github/workflows/keboola-deploy.yml +++ b/.github/workflows/keboola-deploy.yml @@ -3,10 +3,9 @@ name: Keboola Deploy # Tag-triggered build for Keboola's internal dev instance. # # Why a separate workflow: the default release.yml builds an image for *every* push -# to *every* branch, which means Keboola's `agnes-dev` VM (pinned to `:dev` or -# similar floating tag) sees whoever pushed last — Vojta, Minas, anyone. That -# convenience for Groupon-side dev VMs (per-developer `dev--latest` aliases) -# is a footgun for shared instances. +# to *every* branch, which means a shared dev VM pinned to a floating tag like +# `:dev` sees whoever pushed last. That convenience for per-developer dev VMs +# (`dev--latest` aliases) is a footgun for shared instances. # # This workflow runs ONLY when an operator explicitly creates a `keboola-deploy-*` # git tag. The image is published with two tags: diff --git a/CHANGELOG.md b/CHANGELOG.md index be0c34c..8ce4d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,65 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C clips length to 64 chars, and routes the final filename through `safe_join_under`. +### Changed + +- **BREAKING (ops)**: Generic ops scripts moved out of the customer-named + `scripts/grpn/` directory into `scripts/ops/` as part of the OSS + vendor-neutralization (issue #88): + - `scripts/grpn/agnes-tls-rotate.sh` → `scripts/ops/agnes-tls-rotate.sh` + - `scripts/grpn/agnes-auto-upgrade.sh` → `scripts/ops/agnes-auto-upgrade.sh` + + Downstream consumer infra repos that copy these scripts onto VMs (e.g. via + their own `startup.sh`) must update the source path. The OSS-shipped + `infra/modules/customer-instance/` Terraform module is unaffected — it + embeds equivalent logic inline via heredoc and does not source-by-path + from `scripts/`. 
Script behaviour and env vars are unchanged. Cross-refs + in `README.md`, `CLAUDE.md`, `docs/DEPLOYMENT.md`, `Caddyfile`, and + `docker-compose.yml` were updated. + +- **OSS neutralization (wave 2 — code, tests, planning docs)**. Customer + identifiers replaced with placeholders across the codebase to ready the + repo for public release (issue #88): + + - **Code docstrings**: `connectors/openmetadata/{client,transformer,enricher}.py`, + `src/catalog_export.py`, `scripts/duckdb_manager.py` — `prj-grp-…` → + `my-bq-project` / `prj-example-1234`, `AIAgent.FoundryAI` → + `AIAgent.MyAgent` (in docstrings) / `AIAgent.Example` (in test fixtures), + `FoundryAIDataModel` → `AnalyticsDataModel`. + - **Test fixtures** in `tests/test_openmetadata_enricher.py`, + `tests/test_duckdb_manager.py`, `tests/test_catalog_export.py`, + `tests/test_openmetadata_transformer.py` — same set of replacements, + behaviour-preserving (157 tests still green). + - **Terraform module** `infra/modules/customer-instance/variables.tf`: + `customer_name` description rewritten in English, examples switched + from `keboola, grpn` to `acme, example`. + - **Workflow** `.github/workflows/keboola-deploy.yml`: comment "Groupon-side + dev VMs" → generic "per-developer dev VMs". + - **Caddyfile**: TLS-rotation cross-ref updated to `scripts/ops/…` and + Keboola-specific aside removed. + - **Auth docs** `docs/auth-groups.md` and the OAuth probe in + `scripts/debug/probe_google_groups.py`: GCP project name `kids-ai-data-analysis` + replaced with placeholder `acme-internal-prod`. + - **Planning docs** under `docs/superpowers/plans/` and `…/specs/`: the + five hackathon-era documents (`2026-04-21-deployment-log.md`, + `…-multi-customer-deployment.md`, `…-issues-14-and-10.md`, + `…-hackathon-dry-run.md`, the spec) had `34.77.94.14` / `34.77.102.61` + replaced with `<dev-ip>` / `<prod-ip>`, `Groupon`/`GRPN`/`grpn` + with `Acme`/`another-customer`, and `prj-grp-…` with `prj-example-…`. 
+ +### Removed + +- Customer-specific manual-deploy helper `scripts/grpn/Makefile` and its + README, plus the corresponding hackathon deploy log under + `docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md`. These + documented one operator's hand-rolled stopgap for an org-policy-blocked + Terraform flow and do not belong in vendor-neutral OSS. +- `scripts/switch-dev-vm.sh` — hackathon-era helper hardcoded to a specific + shared dev VM. Per-developer dev VMs are + the supported pattern now; operators who need an equivalent should use + `gcloud compute ssh <your-vm> --command "sed -i …/.env && sudo /usr/local/bin/agnes-auto-upgrade.sh"` + with their own VM details. + ## [0.11.5] — 2026-04-27 Follow-up release for PR #73: addresses four rounds of Devin AI review on the role-management-complete branch. No new public-API surface; the user-visible payoff is that v8→v9-migrated installations now work end-to-end (login flows, user list, admin nav, privilege revocation), and `make local-dev` startup is finally quiet. @@ -211,7 +270,7 @@ First tagged semver release. The `version = "2.x"` strings that appeared in earl - Bootstrap backdoor closed when passwordless seed admin exists. - urllib3 1.26→2.6.3 (resolves 4 Dependabot security alerts). - argon2-cffi adopted for password hashing. -- See [docs/padak-security.md](docs/padak-security.md) for the full audit. +- See [docs/security-audit-2026-04.md](docs/security-audit-2026-04.md) for the full audit (renamed from `docs/padak-security.md` in #94). ### Fixed — Other diff --git a/CLAUDE.md b/CLAUDE.md index 7488b87..9d33e9c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,7 +33,7 @@ docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compos --profile tls up -d ``` -See `docs/DEPLOYMENT.md` → **TLS** for cert provisioning + `scripts/grpn/agnes-tls-rotate.sh` (daily refetch from `TLS_FULLCHAIN_URL`, `SIGUSR1` reload on diff, no-op when unchanged). 
The infra repo's `startup.sh` installs this as a systemd timer automatically. +See `docs/DEPLOYMENT.md` → **TLS** for cert provisioning + `scripts/ops/agnes-tls-rotate.sh` (daily refetch from `TLS_FULLCHAIN_URL`, `SIGUSR1` reload on diff, no-op when unchanged). The infra repo's `startup.sh` installs this as a systemd timer automatically. ## Project Structure diff --git a/Caddyfile b/Caddyfile index b5a242d..cf34742 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,9 +1,9 @@ {$DOMAIN:localhost} { # Cert provisioning. Driven by env var CADDY_TLS: # - unset (default) → cert-file mode for corporate PKI (rotated by - # scripts/grpn/agnes-tls-rotate.sh into /data/state/certs/). + # scripts/ops/agnes-tls-rotate.sh into /data/state/certs/). # - "tls " → Let's Encrypt auto-issue, e.g. "tls ops@example.com" - # (used by public-internet deployments like Keboola dev). + # (used by public-internet deployments). # - "tls internal" → Caddy-managed self-signed cert (lab/dev only, # browser warning on every visit). # diff --git a/README.md b/README.md index a389cba..b5d98e5 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compos --profile tls up -d ``` -Once running, the FastAPI app is available at `http://localhost:8000` (or `https://$DOMAIN` in TLS mode). See [`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) for cert provisioning + auto-rotation via `scripts/grpn/agnes-tls-rotate.sh`. Trigger a manual sync: +Once running, the FastAPI app is available at `http://localhost:8000` (or `https://$DOMAIN` in TLS mode). See [`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) for cert provisioning + auto-rotation via `scripts/ops/agnes-tls-rotate.sh`. 
Trigger a manual sync: ```bash curl -X POST http://localhost:8000/api/sync/trigger diff --git a/config/.env.template b/config/.env.template index d45cb34..0d63274 100644 --- a/config/.env.template +++ b/config/.env.template @@ -65,7 +65,7 @@ SESSION_SECRET= # python -c "import secrets; print(secrets.token_he # trusts X-Forwarded-Proto / X-Forwarded-For from the reverse proxy. # ── TLS TERMINATION (Caddy in cert-file mode) ─────── -# When TLS_FULLCHAIN_URL is set, scripts/grpn/agnes-tls-rotate.sh fetches +# When TLS_FULLCHAIN_URL is set, scripts/ops/agnes-tls-rotate.sh fetches # the cert daily from this URL and reloads Caddy on diff (zero downtime). # Empty -> no TLS, app serves plain HTTP on :8000. See docs/DEPLOYMENT.md # -> TLS for the full bring-up flow. diff --git a/config/instance.yaml.example b/config/instance.yaml.example index a711738..93fa555 100644 --- a/config/instance.yaml.example +++ b/config/instance.yaml.example @@ -111,7 +111,7 @@ data_source: # Uses ADC (Application Default Credentials) - VM service account on GCP # Data can live in a different project -- use fully-qualified table IDs in data_description.md -# --- OpenMetadata catalog (optional - Groupon-specific) --- +# --- OpenMetadata catalog (optional) --- # Enriches table and column metadata from OpenMetadata REST API. # If not configured, app works normally without catalog enrichment. # openmetadata: diff --git a/connectors/openmetadata/client.py b/connectors/openmetadata/client.py index 7784b3d..6f4e0ac 100644 --- a/connectors/openmetadata/client.py +++ b/connectors/openmetadata/client.py @@ -152,7 +152,7 @@ class OpenMetadataClient: by data product membership (queryFilter is unreliable for dataProducts field). Args: - data_product_name: Name of the data product (e.g., "FoundryAIDataModel") + data_product_name: Name of the data product (e.g., "AnalyticsDataModel") entity_type: Filter by entity type (e.g., "metric", "table"). Empty = all types. 
limit: Maximum number of results to fetch before filtering diff --git a/connectors/openmetadata/enricher.py b/connectors/openmetadata/enricher.py index 42d8213..b30ef45 100644 --- a/connectors/openmetadata/enricher.py +++ b/connectors/openmetadata/enricher.py @@ -176,8 +176,8 @@ class CatalogEnricher: Derive OpenMetadata FQN from table config. Auto-derivation: bigquery.{table_config.id} - Example: table_config.id = "prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2" - -> FQN = "bigquery.prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2" + Example: table_config.id = "my-bq-project.marketing.roi_datamart_v2" + -> FQN = "bigquery.my-bq-project.marketing.roi_datamart_v2" Args: table_config: Configuration with id and optional catalog_fqn diff --git a/connectors/openmetadata/transformer.py b/connectors/openmetadata/transformer.py index b66cb3f..e2c50d2 100644 --- a/connectors/openmetadata/transformer.py +++ b/connectors/openmetadata/transformer.py @@ -184,7 +184,7 @@ def has_tag(tags: List[Dict[str, Any]], tag_fqn: str) -> bool: Args: tags: List of tag dicts from OpenMetadata - tag_fqn: Fully qualified tag name to check (e.g., "AIAgent.FoundryAI") + tag_fqn: Fully qualified tag name to check (e.g., "AIAgent.MyAgent") Returns: True if the tag is found diff --git a/docker-compose.yml b/docker-compose.yml index a40d10f..5953135 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -112,7 +112,7 @@ services: restart: unless-stopped # TLS reverse proxy. Corporate-CA certs mounted from /data/state/certs - # (managed by scripts/grpn/agnes-tls-rotate.sh on the VM). For local + # (managed by scripts/ops/agnes-tls-rotate.sh on the VM). For local # development without certs, run without --profile tls and hit :8000 # directly. 
caddy: diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index e5f9f8c..8cfdb95 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -124,7 +124,7 @@ Both modes converge: once the CA publishes the signed chain at `TLS_FULLCHAIN_UR #### Automatic rotation -`scripts/grpn/agnes-tls-rotate.sh` is the single entry point — it handles fetch, self-signed fallback, auto-generation on missing key, atomic cert swap, and Caddy reload. Env vars it reads: +`scripts/ops/agnes-tls-rotate.sh` is the single entry point — it handles fetch, self-signed fallback, auto-generation on missing key, atomic cert swap, and Caddy reload. Env vars it reads: | Var | Required | Schemes | Notes | |---|---|---|---| diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 3383aba..2ba16c5 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -64,5 +64,5 @@ Open the project in Claude Code. The CLAUDE.md file will guide the AI assistant ## Hackathon -Point the shared `agnes-dev` VM at your branch image with `scripts/switch-dev-vm.sh `. See [`HACKATHON.md`](HACKATHON.md) for the full deploy-and-develop playbook. +See [`HACKATHON.md`](HACKATHON.md) for the deploy-and-develop playbook. Per-developer dev VMs are the supported pattern — point your VM at your branch image with `gcloud compute ssh <your-vm> --command "sudo sed -i 's/^AGNES_TAG=.*/AGNES_TAG=dev-<branch-slug>/' /opt/agnes/.env && sudo /usr/local/bin/agnes-auto-upgrade.sh"`. diff --git a/docs/auth-groups.md b/docs/auth-groups.md index f0b05e1..ce24711 100644 --- a/docs/auth-groups.md +++ b/docs/auth-groups.md @@ -4,7 +4,7 @@ How Agnes pulls a user's group memberships at Google sign-in and where they end ## Google Cloud setup (per OAuth client / project) -In the GCP project hosting the OAuth client (for Keboola dev: `kids-ai-data-analysis`): +In the GCP project hosting the OAuth client (e.g. `acme-internal-prod`): 1. **Enable Cloud Identity API** — `APIs & Services → Library → "Cloud Identity API" → Enable`. 2. 
**OAuth consent screen → Data Access → Add or Remove Scopes** — manually add: diff --git a/docs/padak-security.md b/docs/security-audit-2026-04.md similarity index 98% rename from docs/padak-security.md rename to docs/security-audit-2026-04.md index 8b2fafa..93f0ca7 100644 --- a/docs/padak-security.md +++ b/docs/security-audit-2026-04.md @@ -30,7 +30,7 @@ Known issues already in flight are marked with their tracking links so we do not - **URL map:** `app/web/router.py:119` — `"password_auth.reset_request": "/auth/password/reset"` - **Backend:** `app/auth/providers/password.py` only registers `/login`, `/login/web`, `/setup`. No `/reset` handler is wired. - **Related dead code:** templates `password_reset.html` and `password_setup.html` exist but no route renders them — indicates an abandoned reset flow. -- **Tracking:** [padak/keboola_agent_cli#206](https://github.com/padak/keboola_agent_cli/issues/206) +- **Tracking:** tracked upstream in the auth-CLI repo - **Confidence:** broken. ### 3. `[HIGH]` No rate limiting on any auth endpoint @@ -185,4 +185,4 @@ Several patterns looked scary at first glance but are correctly defended: **Backlog (single tracking issue with checkboxes):** - #11–24 — timing comparisons, theme XSS gating, config reload, RBAC unification, schedule validation, silent-except triage, dead templates cleanup. -The missing `/auth/password/reset` endpoint (#2) is already tracked in [padak/keboola_agent_cli#206](https://github.com/padak/keboola_agent_cli/issues/206). +The missing `/auth/password/reset` endpoint (#2) is already tracked upstream in the auth-CLI repo. 
diff --git a/docs/superpowers/plans/2026-04-21-deployment-log.md b/docs/superpowers/plans/2026-04-21-deployment-log.md index cf1bec3..b5fc6ad 100644 --- a/docs/superpowers/plans/2026-04-21-deployment-log.md +++ b/docs/superpowers/plans/2026-04-21-deployment-log.md @@ -16,7 +16,7 @@ Startup stav: Keboola prod/dev Agnes běžel z osobního forku `padak/tmp_oss` ( - **Public repo:** `keboola/agnes-the-ai-analyst` (app + TF modul) - **Privátní repo:** `keboola/agnes-infra-{customer}` (pro Keboolu `keboola/agnes-infra-keboola`) -- **GCP projekt:** `kids-ai-data-analysis` (Keboola) — pozn.: ponechán, owner `petr@keboola.com` +- **GCP projekt:** `internal-prod` (Keboola) — pozn.: ponechán, owner `petr@keboola.com` - **Deploy SA:** `agnes-deploy@.iam.gserviceaccount.com` - **TF state bucket:** `gs://agnes--tfstate//` - **VM SA:** `agnes--vm@.iam.gserviceaccount.com` (scope: secretmanager.secretAccessor) @@ -36,7 +36,7 @@ Startup stav: Keboola prod/dev Agnes běžel z osobního forku `padak/tmp_oss` ( 2. **GHCR image public:** `docker manifest inspect ghcr.io/keboola/agnes-the-ai-analyst:stable` funguje bez auth. 3. **Snapshot boot disku:** `data-analyst-pre-migration-20260421` (safety net před Fází 2). 4. **Per-branch tagging v release.yml:** commit `0ade45c` — přidává `:dev-` tag. **Nepushnuto** do origin kvůli chybějícímu `workflow` scope; uložen jako patch `~/.agnes-keys/0ade45c-workflow-per-branch-tag.patch`. -5. **bootstrap-gcp.sh:** Vytváří SA + role + tfstate bucket + SA key. Spuštěno na `kids-ai-data-analysis`. Vytvořen `agnes-deploy` SA, bucket `gs://agnes-kids-ai-data-analysis-tfstate`, klíč uložen do `~/.agnes-keys/agnes-deploy-kids-ai-data-analysis-key.json`. +5. **bootstrap-gcp.sh:** Vytváří SA + role + tfstate bucket + SA key. Spuštěno na `internal-prod`. Vytvořen `agnes-deploy` SA, bucket `gs://agnes-internal-prod-tfstate`, klíč uložen do `~/.agnes-keys/agnes-deploy-internal-prod-key.json`. 6. 
**Secret Manager:** `keboola-storage-token`, `jwt-secret-key` nahrány (obě s PŘEDCHOZÍMI hodnotami — `jwt-secret-key` aby existing JWT tokeny zůstaly validní; `keboola-storage-token` pro kontinuitu syncu). Rotace tokenu odložena do Fáze 2 completion. 7. **fetch-env-from-secrets.sh:** VM-side skript, který stahuje secrets a skládá `.env`. 8. **Deploy MVP na staré VM `data-analyst`:** @@ -64,16 +64,16 @@ Startup stav: Keboola prod/dev Agnes běžel z osobního forku `padak/tmp_oss` ( - GitHub secret `GCP_SA_KEY` nahrán z `~/.agnes-keys/agnes-deploy-*.json` - Environmenty `dev` a `prod` vytvořeny přes `gh api` 14. **Terraform apply Keboola instance:** 12 resources vytvořeno: - - `agnes-prod` VM + `agnes-prod-data` disk (50 GB) + `agnes-prod-ip` (34.77.102.61) - - `agnes-dev` VM + `agnes-dev-data` disk (20 GB) + `agnes-dev-ip` (34.77.94.14) + - `agnes-prod` VM + `agnes-prod-data` disk (50 GB) + `agnes-prod-ip` () + - `agnes-dev` VM + `agnes-dev-data` disk (20 GB) + `agnes-dev-ip` () - Firewall `agnes-keboola-allow-web` - `agnes-keboola-vm` SA + IAM binding - `agnes-keboola-jwt-secret` + version - - TF state v `gs://agnes-kids-ai-data-analysis-tfstate/keboola/` + - TF state v `gs://agnes-internal-prod-tfstate/keboola/` 15. **Data migration starý prod → nový prod (~2 min):** - `docker compose down` na starém prod VM - `tar czf /tmp/agnes-data.tar.gz -C /var/lib/docker/volumes/app_data/_data .` (1.8 GB) - - `gsutil cp` do `gs://agnes-kids-ai-data-analysis-tfstate/migration/agnes-data-20260421-1624.tar.gz` + - `gsutil cp` do `gs://agnes-internal-prod-tfstate/migration/agnes-data-20260421-1624.tar.gz` - **Problém:** `agnes-keboola-vm` SA neměl `storage.objectViewer` na bucketu → `gsutil iam ch serviceAccount:...:objectViewer gs://...` (dočasné, pro download) - `docker compose down` na novém prod VM - `gsutil cp` z bucketu na nový VM + `tar xzf ... 
-C /data` @@ -83,19 +83,19 @@ Startup stav: Keboola prod/dev Agnes běžel z osobního forku `padak/tmp_oss` ( ## Klíčové hodnoty (kopíruj pro další zákazníky) ``` -GCP_PROJECT_ID = kids-ai-data-analysis +GCP_PROJECT_ID = internal-prod CUSTOMER_NAME = keboola -DEPLOY_SA = agnes-deploy@kids-ai-data-analysis.iam.gserviceaccount.com -TFSTATE_BUCKET = gs://agnes-kids-ai-data-analysis-tfstate +DEPLOY_SA = agnes-deploy@internal-prod.iam.gserviceaccount.com +TFSTATE_BUCKET = gs://agnes-internal-prod-tfstate TFSTATE_PREFIX = keboola -VM_SA = agnes-keboola-vm@kids-ai-data-analysis.iam.gserviceaccount.com +VM_SA = agnes-keboola-vm@internal-prod.iam.gserviceaccount.com JWT_SECRET = agnes-keboola-jwt-secret (TF-managed) KEBOOLA_TOKEN_SECRET = keboola-storage-token (manuálně vytvořený) INFRA_MODULE_REF = infra-v1.0.0 (github.com/keboola/agnes-the-ai-analyst) -PROD_IP = 34.77.102.61 (agnes-prod) -DEV_IP = 34.77.94.14 (agnes-dev) -STARÝ PROD IP (legacy) = 35.195.96.98 (data-analyst — po stabilitě smazat) -STARÝ DEV IP (legacy) = 34.62.223.189 (data-analyst-dev — po stabilitě smazat) +PROD_IP = (agnes-prod) +DEV_IP = (agnes-dev) +STARÝ PROD IP (legacy) = (data-analyst — po stabilitě smazat) +STARÝ DEV IP (legacy) = (data-analyst-dev — po stabilitě smazat) ``` ## Známá omezení / TODO @@ -128,8 +128,8 @@ Migrace dat zkopírovala users table, takže heslo je platné i na novém prod. 
## Co zbývá (uživatelské akce) - [ ] **Approve prod environment** v `apply.yml` runu (https://github.com/keboola/agnes-infra-keboola/actions/runs/24731681502) — jinak se state neaplikuje na prod -- [ ] **Změnit heslo admin usera** z `1234` (http://34.77.102.61:8000/login → profil) -- [ ] **Rotovat Keboola Storage token** v Keboola UI → `gcloud secrets versions add keboola-storage-token --data-file=- --project=kids-ai-data-analysis` → restart app containerů na obou VMs (cron to zachytí při dalším tiku nebo `sudo /usr/local/bin/agnes-auto-upgrade.sh`) +- [ ] **Změnit heslo admin usera** z `1234` (http://:8000/login → profil) +- [ ] **Rotovat Keboola Storage token** v Keboola UI → `gcloud secrets versions add keboola-storage-token --data-file=- --project=internal-prod` → restart app containerů na obou VMs (cron to zachytí při dalším tiku nebo `sudo /usr/local/bin/agnes-auto-upgrade.sh`) ## Aktualizace průběhu (2026-04-21 pozdně) @@ -193,8 +193,8 @@ Migrace dat zkopírovala users table, takže heslo je platné i na novém prod. | Resource | Value | |---|---| -| **Prod VM** | `agnes-prod` @ 34.77.102.61 (e2-small, 50GB /data PD, daily snapshot, uptime check) | -| **Dev VM** | `agnes-dev` @ 34.77.94.14 (e2-small, 20GB /data PD, daily snapshot, uptime check) | +| **Prod VM** | `agnes-prod` @ (e2-small, 50GB /data PD, daily snapshot, uptime check) | +| **Dev VM** | `agnes-dev` @ (e2-small, 20GB /data PD, daily snapshot, uptime check) | | **Staré VMs** | 🗑️ smazané | | **Image tagy** | prod `:stable`, dev `:dev`, feature branches `:dev-` (aktivní po v1.4) | | **Auto-upgrade** | Cron `*/5 * * * *` — reads AGNES_TAG z .env, digest change → restart | @@ -205,9 +205,9 @@ Migrace dat zkopírovala users table, takže heslo je platné i na novém prod. 
| **Firewall** | Web 80/443 + 8000 (jen když TLS off); SSH na IAP range only | | **Login prod** | `zdenek.srotyr@keboola.com` / `1234` *(pending: user rotate)* | | **Login dev** | `admin@keboola.com` / `1234` *(pending: user rotate)* | -| **TF state** | `gs://agnes-kids-ai-data-analysis-tfstate/keboola/` (versioned, GCS backend) | -| **Deploy SA** | `agnes-deploy@kids-ai-data-analysis.iam.gserviceaccount.com` | -| **VM SA** (scope: secretmanager.secretAccessor per-secret) | `agnes-keboola-vm@kids-ai-data-analysis.iam.gserviceaccount.com` | +| **TF state** | `gs://agnes-internal-prod-tfstate/keboola/` (versioned, GCS backend) | +| **Deploy SA** | `agnes-deploy@internal-prod.iam.gserviceaccount.com` | +| **VM SA** (scope: secretmanager.secretAccessor per-secret) | `agnes-keboola-vm@internal-prod.iam.gserviceaccount.com` | | **Secrets** | `keboola-storage-token` (manual), `agnes-keboola-jwt-secret` (TF), `jwt-secret-key` (legacy) | | **Public upstream repo** | https://github.com/keboola/agnes-the-ai-analyst | | **Template repo** | https://github.com/keboola/agnes-infra-template (is_template=true, ref infra-v1.4.0) | @@ -228,12 +228,12 @@ Podle [`docs/ONBOARDING.md`](../../ONBOARDING.md) — cíl: < 1 hodina. Klíčov 8. `POST /auth/bootstrap` admin user 9. Otestovat `/api/health` + login -Předpokládám, že nový zákazník (např. GRPN) projde všech 9 kroků za **~30–45 min** včetně čekání na TF apply. +Předpokládám, že nový zákazník (např. another-customer) projde všech 9 kroků za **~30–45 min** včetně čekání na TF apply. ## Budoucí one-click deploy -Cíl: pro nového zákazníka `{customer}` (např. `grpn`) by mělo stačit: +Cíl: pro nového zákazníka `{customer}` (např. `another-customer`) by mělo stačit: ```bash # 1. 
Vytvořit GCP projekt (má billing) diff --git a/docs/superpowers/plans/2026-04-21-hackathon-dry-run.md b/docs/superpowers/plans/2026-04-21-hackathon-dry-run.md index 279a007..ee6b42a 100644 --- a/docs/superpowers/plans/2026-04-21-hackathon-dry-run.md +++ b/docs/superpowers/plans/2026-04-21-hackathon-dry-run.md @@ -36,14 +36,14 @@ Before starting, the executing agent MUST verify all of the following. If any fa Expected: line containing `Logged in to github.com` and a line listing scopes that include `workflow`. If `workflow` scope is missing, abort with message: `Run: gh auth refresh -h github.com -s workflow`. -- [ ] **`gcloud` authenticated** to project `kids-ai-data-analysis`. Run: +- [ ] **`gcloud` authenticated** to project `internal-prod`. Run: ```bash gcloud config get-value project gcloud auth list --filter=status:ACTIVE --format="value(account)" ``` - Expected: project is `kids-ai-data-analysis`, at least one active account. If not, abort with message: `Run: gcloud config set project kids-ai-data-analysis && gcloud auth login`. + Expected: project is `internal-prod`, at least one active account. If not, abort with message: `Run: gcloud config set project internal-prod && gcloud auth login`. - [ ] **SSH to `agnes-dev` works** (OS Login). Run: @@ -98,7 +98,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa - [ ] **Step 1.2: Capture prod health** ```bash - curl -sf --max-time 10 http://34.77.102.61:8000/api/health > /tmp/dryrun-baseline/prod-health.json + curl -sf --max-time 10 http://:8000/api/health > /tmp/dryrun-baseline/prod-health.json cat /tmp/dryrun-baseline/prod-health.json | python3 -m json.tool ``` @@ -107,7 +107,7 @@ Before starting, the executing agent MUST verify all of the following. 
If any fa - [ ] **Step 1.3: Capture dev health** ```bash - curl -sf --max-time 10 http://34.77.94.14:8000/api/health > /tmp/dryrun-baseline/dev-health.json + curl -sf --max-time 10 http://:8000/api/health > /tmp/dryrun-baseline/dev-health.json cat /tmp/dryrun-baseline/dev-health.json | python3 -m json.tool ``` @@ -297,7 +297,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa ```bash # Poll /api/health for up to 90s for i in $(seq 1 30); do - STATUS=$(curl -s --max-time 5 http://34.77.94.14:8000/api/health | jq -r '.status' 2>/dev/null || echo "down") + STATUS=$(curl -s --max-time 5 http://:8000/api/health | jq -r '.status' 2>/dev/null || echo "down") echo "[$i/30] status=$STATUS" if [ "$STATUS" = "healthy" ] || [ "$STATUS" = "degraded" ]; then break @@ -412,7 +412,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa ```bash cd /tmp/agnes-infra-keboola/terraform - export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.agnes-keys/agnes-deploy-kids-ai-data-analysis-key.json" + export GOOGLE_APPLICATION_CREDENTIALS="$HOME/.agnes-keys/agnes-deploy-internal-prod-key.json" [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ] || { echo "SA key not found — skipping plan"; exit 2; } terraform init -input=false -upgrade=false terraform plan -input=false -no-color -out=/tmp/dryrun-tfplan.bin > /tmp/dryrun-tfplan.txt 2>&1 @@ -609,7 +609,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa ```bash for i in $(seq 1 30); do - STATUS=$(curl -s --max-time 5 http://34.77.94.14:8000/api/health | jq -r '.status' 2>/dev/null || echo down) + STATUS=$(curl -s --max-time 5 http://:8000/api/health | jq -r '.status' 2>/dev/null || echo down) echo "[$i/30] status=$STATUS" [ "$STATUS" = "healthy" ] || [ "$STATUS" = "degraded" ] && break sleep 3 @@ -651,7 +651,7 @@ Before starting, the executing agent MUST verify all of the following. 
If any fa - [ ] **Step 6.5: Final health check on prod (must match baseline)** ```bash - curl -sf --max-time 10 http://34.77.102.61:8000/api/health > /tmp/dryrun-baseline/prod-health-after.json + curl -sf --max-time 10 http://:8000/api/health > /tmp/dryrun-baseline/prod-health-after.json BEFORE=$(jq -r '.status' /tmp/dryrun-baseline/prod-health.json) AFTER=$(jq -r '.status' /tmp/dryrun-baseline/prod-health-after.json) echo "Prod status before: $BEFORE / after: $AFTER" @@ -667,7 +667,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa ## Task 6: Cleanup — - agnes-dev AGNES_TAG restored to: $(cat /tmp/dryrun-baseline/dev-env.txt) - - agnes-dev health after restore: $(curl -s --max-time 5 http://34.77.94.14:8000/api/health | jq -r '.status') + - agnes-dev health after restore: $(curl -s --max-time 5 http://:8000/api/health | jq -r '.status') - agnes-dev image: matches baseline? - Throwaway branches deleted: feature, smoke - Prod status unchanged: @@ -737,10 +737,10 @@ Before starting, the executing agent MUST verify all of the following. If any fa echo "[4/4] Waiting for app to become healthy..." for i in $(seq 1 30); do - STATUS=$(curl -s --max-time 5 http://34.77.94.14:8000/api/health | python3 -c 'import sys,json; print(json.load(sys.stdin).get("status","down"))' 2>/dev/null || echo down) + STATUS=$(curl -s --max-time 5 http://:8000/api/health | python3 -c 'import sys,json; print(json.load(sys.stdin).get("status","down"))' 2>/dev/null || echo down) echo " [$i/30] status=$STATUS" if [ "$STATUS" = "healthy" ] || [ "$STATUS" = "degraded" ]; then - echo "OK — agnes-dev now running $TAG. Open http://34.77.94.14:8000" + echo "OK — agnes-dev now running $TAG. Open http://:8000" exit 0 fi sleep 3 @@ -790,7 +790,7 @@ Before starting, the executing agent MUST verify all of the following. If any fa 1. Configure required status check 'test' on main branch of keboola/agnes-the-ai-analyst. 2. 
Pin prod image_tag in agnes-infra-keboola/terraform/terraform.tfvars from "stable" to "stable-2026.04.XX" (current running version). Revert after hackathon. - 3. Rotate admin password '1234' on prod (34.77.102.61:8000/login) and dev (34.77.94.14:8000/login). + 3. Rotate admin password '1234' on prod (:8000/login) and dev (:8000/login). 4. Wire notification_channel_ids in tfvars so uptime alerts actually notify someone. 5. Share the hackathon 1-pager + switch-dev-vm.sh via the team Slack channel. 6. Review PR $(cat /tmp/dryrun-baseline/deliverable-pr.txt) and merge if switch-dev-vm.sh looks good. diff --git a/docs/superpowers/plans/2026-04-21-issues-14-and-10.md b/docs/superpowers/plans/2026-04-21-issues-14-and-10.md index 7e6fae0..f7ffcb7 100644 --- a/docs/superpowers/plans/2026-04-21-issues-14-and-10.md +++ b/docs/superpowers/plans/2026-04-21-issues-14-and-10.md @@ -87,10 +87,10 @@ gcloud compute ssh "$VM" --zone="$ZONE" --quiet --command \ echo "[4/4] Waiting for app to become healthy..." for i in $(seq 1 30); do - STATUS=$(curl -s --max-time 5 http://34.77.94.14:8000/api/health | python3 -c 'import sys,json; print(json.load(sys.stdin).get("status","down"))' 2>/dev/null || echo down) + STATUS=$(curl -s --max-time 5 http://:8000/api/health | python3 -c 'import sys,json; print(json.load(sys.stdin).get("status","down"))' 2>/dev/null || echo down) echo " [$i/30] status=$STATUS" if [ "$STATUS" = "healthy" ] || [ "$STATUS" = "degraded" ]; then - echo "OK — agnes-dev now running $TAG. Open http://34.77.94.14:8000" + echo "OK — agnes-dev now running $TAG. Open http://:8000" exit 0 fi sleep 3 diff --git a/docs/superpowers/plans/2026-04-21-multi-customer-deployment.md b/docs/superpowers/plans/2026-04-21-multi-customer-deployment.md index a14df06..daa4b49 100644 --- a/docs/superpowers/plans/2026-04-21-multi-customer-deployment.md +++ b/docs/superpowers/plans/2026-04-21-multi-customer-deployment.md @@ -38,10 +38,10 @@ Tyto kroky vyžadují externí akce (oprávnění, Keboola UI). 
Musí být hotov ### Task 0.1: Ověřit přístupová práva -- [ ] **Step 1: Ověřit, že máš `iam.serviceAccountAdmin` na kids-ai-data-analysis** +- [ ] **Step 1: Ověřit, že máš `iam.serviceAccountAdmin` na internal-prod** ```bash -gcloud projects get-iam-policy kids-ai-data-analysis --format=json \ +gcloud projects get-iam-policy internal-prod --format=json \ | python3 -c "import json, sys; d=json.load(sys.stdin); \ me='zdenek.srotyr@keboola.com'; \ roles=[b['role'] for b in d['bindings'] if any(me in m for m in b.get('members', []))]; \ @@ -54,7 +54,7 @@ Expected: seznam rolí, nebo poznámka "NO DIRECT ROLES". Poslat mu odkaz na tuhle dokumentaci: https://cloud.google.com/iam/docs/understanding-roles#iam-roles -Napsat Petrovi ve Slacku / emailu: "Potřebuji dočasně roli `iam.serviceAccountAdmin` a `resourcemanager.projectIamAdmin` na projektu `kids-ai-data-analysis` pro vytvoření Agnes deploy SA. Zrušíme, jakmile bude hotovo." +Napsat Petrovi ve Slacku / emailu: "Potřebuji dočasně roli `iam.serviceAccountAdmin` a `resourcemanager.projectIamAdmin` na projektu `internal-prod` pro vytvoření Agnes deploy SA. Zrušíme, jakmile bude hotovo." - [ ] **Step 3: Ověřit, že image `ghcr.io/keboola/agnes-the-ai-analyst` je public** @@ -72,7 +72,7 @@ Expected: `"public"`. Pokud `"private"`, změnit přes GitHub UI: Keboola org gcloud compute disks snapshot data-analyst \ --zone=europe-west1-b \ --snapshot-names=data-analyst-pre-migration-$(date +%Y%m%d) \ - --project=kids-ai-data-analysis + --project=internal-prod ``` Expected: `Created snapshot data-analyst-pre-migration-YYYYMMDD`. @@ -80,7 +80,7 @@ Expected: `Created snapshot data-analyst-pre-migration-YYYYMMDD`. 
- [ ] **Step 2: Ověřit snapshot** ```bash -gcloud compute snapshots list --project=kids-ai-data-analysis \ +gcloud compute snapshots list --project=internal-prod \ --filter="name~pre-migration" --format="table(name, status, diskSizeGb, creationTimestamp)" ``` @@ -246,11 +246,11 @@ echo "" chmod +x scripts/bootstrap-gcp.sh ``` -- [ ] **Step 3: Spustit skript na kids-ai-data-analysis** +- [ ] **Step 3: Spustit skript na internal-prod** ```bash cd "/Users/zdeneksrotyr/Library/Mobile Documents/com~apple~CloudDocs/Sources/VsCode/component_factory/tmp_oss" -./scripts/bootstrap-gcp.sh kids-ai-data-analysis +./scripts/bootstrap-gcp.sh internal-prod ``` Expected: na konci výpis "HOTOVO" + instrukce. @@ -260,8 +260,8 @@ Pokud selže na "Permission denied": viz Task 0.1 step 2 (požádat Petra). - [ ] **Step 4: Ověřit SA a bucket** ```bash -gcloud iam service-accounts list --project=kids-ai-data-analysis --filter="email~agnes-deploy" --format="value(email)" -gsutil ls -b gs://agnes-kids-ai-data-analysis-tfstate +gcloud iam service-accounts list --project=internal-prod --filter="email~agnes-deploy" --format="value(email)" +gsutil ls -b gs://agnes-internal-prod-tfstate ``` Expected: SA email + bucket URL. @@ -288,7 +288,7 @@ read -s NEW_TOKEN echo -n "$NEW_TOKEN" | gcloud secrets create keboola-storage-token \ --data-file=- \ --replication-policy=automatic \ - --project=kids-ai-data-analysis + --project=internal-prod unset NEW_TOKEN ``` @@ -300,7 +300,7 @@ Expected: `Created secret [keboola-storage-token]`. openssl rand -hex 32 | gcloud secrets create jwt-secret-key \ --data-file=- \ --replication-policy=automatic \ - --project=kids-ai-data-analysis + --project=internal-prod ``` Expected: `Created secret [jwt-secret-key]`. @@ -308,7 +308,7 @@ Expected: `Created secret [jwt-secret-key]`. 
- [ ] **Step 4: Ověřit secrets** ```bash -gcloud secrets list --project=kids-ai-data-analysis --format="table(name, createTime)" +gcloud secrets list --project=internal-prod --format="table(name, createTime)" ``` Expected: dva secrets — keboola-storage-token, jwt-secret-key. @@ -318,9 +318,9 @@ Expected: dva secrets — keboola-storage-token, jwt-secret-key. ```bash for secret in keboola-storage-token jwt-secret-key; do gcloud secrets add-iam-policy-binding "$secret" \ - --member="serviceAccount:agnes-deploy@kids-ai-data-analysis.iam.gserviceaccount.com" \ + --member="serviceAccount:agnes-deploy@internal-prod.iam.gserviceaccount.com" \ --role=roles/secretmanager.secretAccessor \ - --project=kids-ai-data-analysis + --project=internal-prod done ``` @@ -436,7 +436,7 @@ git commit -m "infra: prod compose pulls from GHCR via AGNES_TAG env (default :s - [ ] **Step 1: SSH na prod VM a zastavit kontejnery** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'cd /home/deploy/app && docker compose down'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'cd /home/deploy/app && docker compose down'" ``` Expected: `Container app-app-1 Stopped`, `Container app-scheduler-1 Stopped`. @@ -445,7 +445,7 @@ Expected: `Container app-app-1 Stopped`, `Container app-scheduler-1 Stopped`. 
```bash # Ověřit aktuální SA -gcloud compute instances describe data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis \ +gcloud compute instances describe data-analyst --zone=europe-west1-b --project=internal-prod \ --format="value(serviceAccounts[0].email)" ``` @@ -454,7 +454,7 @@ Pokud výstup `327445566538-compute@developer.gserviceaccount.com` (default SA), Přidat mu explicitně secretmanager.secretAccessor (idempotentní): ```bash -gcloud projects add-iam-policy-binding kids-ai-data-analysis \ +gcloud projects add-iam-policy-binding internal-prod \ --member="serviceAccount:327445566538-compute@developer.gserviceaccount.com" \ --role="roles/secretmanager.secretAccessor" \ --condition=None @@ -466,13 +466,13 @@ gcloud projects add-iam-policy-binding kids-ai-data-analysis \ gcloud compute scp \ "/Users/zdeneksrotyr/Library/Mobile Documents/com~apple~CloudDocs/Sources/VsCode/component_factory/tmp_oss/scripts/fetch-env-from-secrets.sh" \ data-analyst:/tmp/fetch-env.sh \ - --zone=europe-west1-b --project=kids-ai-data-analysis + --zone=europe-west1-b --project=internal-prod ``` - [ ] **Step 4: Spustit fetch-env skript pod uživatelem deploy** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo install -m 755 -o deploy -g deploy /tmp/fetch-env.sh /home/deploy/app/fetch-env.sh && sudo -u deploy bash -c 'cd /home/deploy/app && ./fetch-env.sh'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo install -m 755 -o deploy -g deploy /tmp/fetch-env.sh /home/deploy/app/fetch-env.sh && sudo -u deploy bash -c 'cd /home/deploy/app && ./fetch-env.sh'" ``` Expected: `Wrote /home/deploy/app/.env (chmod 600)`. @@ -480,7 +480,7 @@ Expected: `Wrote /home/deploy/app/.env (chmod 600)`. 
- [ ] **Step 5: Zkontrolovat .env na VM (bez vypisování hodnot)** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'ls -la /home/deploy/app/.env && wc -l /home/deploy/app/.env && cut -d= -f1 /home/deploy/app/.env'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'ls -la /home/deploy/app/.env && wc -l /home/deploy/app/.env && cut -d= -f1 /home/deploy/app/.env'" ``` Expected: soubor 600 mode, 7 řádků, klíče: JWT_SECRET_KEY, DATA_DIR, DATA_SOURCE, KEBOOLA_STORAGE_TOKEN, KEBOOLA_STACK_URL, SEED_ADMIN_EMAIL, LOG_LEVEL. @@ -488,13 +488,13 @@ Expected: soubor 600 mode, 7 řádků, klíče: JWT_SECRET_KEY, DATA_DIR, DATA_S - [ ] **Step 6: Aktualizovat docker-compose.yml konfiguraci na VM na pulling z GHCR** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'cd /home/deploy/app && git fetch origin feature/v2-fastapi-duckdb-docker-cli && git reset --hard origin/feature/v2-fastapi-duckdb-docker-cli'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'cd /home/deploy/app && git fetch origin feature/v2-fastapi-duckdb-docker-cli && git reset --hard origin/feature/v2-fastapi-duckdb-docker-cli'" ``` **Pozor:** VM má starý remote `ZdenekSrotyr/tmp_oss`. Tohle tedy nebude fungovat, pokud se ten repo smazal. 
Alternativa: nahradit origin remote za keboola/agnes-the-ai-analyst: ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'cd /home/deploy/app && git remote set-url origin https://github.com/keboola/agnes-the-ai-analyst.git && git fetch origin main && git reset --hard origin/main'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'cd /home/deploy/app && git remote set-url origin https://github.com/keboola/agnes-the-ai-analyst.git && git fetch origin main && git reset --hard origin/main'" ``` Expected: HEAD is now at `<sha>` `<commit message>`. @@ -502,7 +502,7 @@ Expected: HEAD is now at `<sha>` `<commit message>`. - [ ] **Step 7: Pullnout image z GHCR a nastartovat s novým override** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'cd /home/deploy/app && export AGNES_TAG=stable && docker compose -f docker-compose.yml -f docker-compose.prod.yml pull && docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'cd /home/deploy/app && export AGNES_TAG=stable && docker compose -f docker-compose.yml -f docker-compose.prod.yml pull && docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d'" ``` Expected: `Container app-app-1 Started`, `Container app-scheduler-1 Started`. @@ -512,7 +512,7 @@ Expected: `Container app-app-1 Started`, `Container app-scheduler-1 Started`. ```bash # Počkat 30 sekund sleep 30 -curl -s --max-time 10 http://35.195.96.98:8000/api/health | python3 -m json.tool | head -10 +curl -s --max-time 10 http://<PROD_IP>:8000/api/health | python3 -m json.tool | head -10 ``` Expected: `"status": "healthy"` nebo `"degraded"` (stale tables jsou OK). Ne `connection refused`.
@@ -520,7 +520,7 @@ Expected: `"status": "healthy"` nebo `"degraded"` (stale tables jsou OK). Ne `co - [ ] **Step 9: Ověřit, že app používá nový image** ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo docker inspect app-app-1 --format '{{.Config.Image}}'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo docker inspect app-app-1 --format '{{.Config.Image}}'" ``` Expected: `ghcr.io/keboola/agnes-the-ai-analyst:stable` (ne `app-app`). @@ -528,7 +528,7 @@ Expected: `ghcr.io/keboola/agnes-the-ai-analyst:stable` (ne `app-app`). - [ ] **Step 10: Ověřit login** ```bash -curl -sS --max-time 5 -X POST http://35.195.96.98:8000/auth/password/login \ +curl -sS --max-time 5 -X POST http://<PROD_IP>:8000/auth/password/login \ -H "Content-Type: application/json" \ -d '{"email":"zdenek.srotyr@keboola.com","password":"1234"}' 2>&1 | python3 -c "import sys,json; d=json.load(sys.stdin); print('OK — role:', d.get('role'))" ``` @@ -563,12 +563,12 @@ git commit -m "docs: document Secret Manager-backed .env for production" - [ ] **Step 1: Opakovat Task 1.6 steps 1-10 proti data-analyst-dev VM** -Stejné příkazy, jen zaměnit `data-analyst` za `data-analyst-dev` a IP `35.195.96.98` za `34.62.223.189`. +Stejné příkazy, jen zaměnit `data-analyst` za `data-analyst-dev` a IP `<PROD_IP>` za `<DEV_IP>`. - [ ] **Step 2: Verify** ```bash -curl -s --max-time 10 http://34.62.223.189:8000/api/health | python3 -m json.tool | head -3 +curl -s --max-time 10 http://<DEV_IP>:8000/api/health | python3 -m json.tool | head -3 ``` Expected: valid JSON s `"status"`. @@ -608,13 +608,13 @@ Expected: `Not Found (HTTP 404)`. Ověřit, že nová verze tokenu funguje: ```bash -curl -s --max-time 10 http://35.195.96.98:8000/api/sync/status 2>&1 | python3 -m json.tool | head -20 +curl -s --max-time 10 http://<PROD_IP>:8000/api/sync/status 2>&1 | python3 -m json.tool | head -20 ``` Expected: nějaký valid JSON.
Pokud `401 Unauthorized` nebo `Invalid token`, app ještě má cached starý token — restartovat: ```bash -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo -u deploy bash -c 'cd /home/deploy/app && docker compose restart app'" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo -u deploy bash -c 'cd /home/deploy/app && docker compose restart app'" ``` ### Task 1.10: Checkpoint — Fáze 1 hotová @@ -625,7 +625,7 @@ Přes UI nebo: ```bash read -s NEW_PASSWORD -TOKEN=$(curl -sS -X POST http://35.195.96.98:8000/auth/password/login \ +TOKEN=$(curl -sS -X POST http://<PROD_IP>:8000/auth/password/login \ -H "Content-Type: application/json" \ -d '{"email":"zdenek.srotyr@keboola.com","password":"1234"}' | python3 -c "import sys,json;print(json.load(sys.stdin)['access_token'])") # [Volba: použít admin endpoint pro změnu hesla, pokud existuje — jinak přes UI] @@ -690,7 +690,7 @@ variable "zone" { } variable "customer_name" { - description = "Krátké identifikátor zákazníka (např. keboola, grpn). Použije se v prefixu resourců." + description = "Krátký identifikátor zákazníka (např. keboola, another-customer). Použije se v prefixu resourců."
type = string validation { condition = can(regex("^[a-z][a-z0-9-]{1,20}$", var.customer_name)) @@ -830,7 +830,7 @@ resource "google_compute_firewall" "web" { ports = ["22", "80", "443", "8000"] } - source_ranges = ["0.0.0.0/0"] + source_ranges = ["0.0.0.0/0"] target_tags = ["agnes-${var.customer_name}"] } @@ -1166,7 +1166,7 @@ terraform { google = { source = "hashicorp/google", version = "~> 5.0" } } backend "gcs" { - bucket = "agnes-kids-ai-data-analysis-tfstate" + bucket = "agnes-internal-prod-tfstate" prefix = "keboola" } } @@ -1206,7 +1206,7 @@ variable "dev_instances" { type = any, default = [] } EOF cat > terraform/terraform.tfvars.example <<'EOF' -gcp_project_id = "kids-ai-data-analysis" +gcp_project_id = "internal-prod" seed_admin_email = "zdenek.srotyr@keboola.com" keboola_stack_url = "https://connection.us-east4.gcp.keboola.com/" @@ -1250,7 +1250,7 @@ git push -u origin main ```bash # Klíč vytvořený v Task 1.2 step 3 gh secret set GCP_SA_KEY --repo keboola/agnes-infra-keboola \ - < ../tmp_oss/agnes-deploy-kids-ai-data-analysis-key.json + < ../tmp_oss/agnes-deploy-internal-prod-key.json ``` **Poznámka:** Pokud klíč ne už smazal, re-generate: `gcloud iam service-accounts keys create ...`. @@ -1280,7 +1280,7 @@ Už máme z Task 0.2.
Pokud je snapshot starší než 24 h, udělat nový: gcloud compute disks snapshot data-analyst \ --zone=europe-west1-b \ --snapshot-names=data-analyst-migration-$(date +%Y%m%d-%H%M) \ - --project=kids-ai-data-analysis + --project=internal-prod ``` - [ ] **Step 2: Terraform apply — vytvoří nové VMs (`agnes-prod`, `agnes-dev`) vedle starých** @@ -1307,15 +1307,15 @@ NEW_PROD_IP=$(cd ~/.../agnes-infra-keboola/terraform && terraform output -raw pr # (nebo použít oslogin → další prerekvizita) # Alternativa: udělat z druhé strany — SSH na starou VM, rsync na novou -gcloud compute ssh data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo docker compose -f /home/deploy/app/docker-compose.yml -f /home/deploy/app/docker-compose.prod.yml down" +gcloud compute ssh data-analyst --zone=europe-west1-b --project=internal-prod --command="sudo docker compose -f /home/deploy/app/docker-compose.yml -f /home/deploy/app/docker-compose.prod.yml down" # Rsync přes gcloud compute scp recursive (funguje jen z lokálu) -gcloud compute scp --recurse --zone=europe-west1-b --project=kids-ai-data-analysis \ +gcloud compute scp --recurse --zone=europe-west1-b --project=internal-prod \ data-analyst:/home/deploy/app/data-volume/ \ agnes-prod:/data/ # Spustit app na nové VM znovu -gcloud compute ssh agnes-prod --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo docker compose -f /opt/agnes/docker-compose.yml -f /opt/agnes/docker-compose.prod.yml restart" +gcloud compute ssh agnes-prod --zone=europe-west1-b --project=internal-prod --command="sudo docker compose -f /opt/agnes/docker-compose.yml -f /opt/agnes/docker-compose.prod.yml restart" ``` **Alternativně (čistěji):** restore ze snapshotu přes `gcloud compute disks create --source-snapshot`, pak attach místo prázdného data disku. @@ -1347,8 +1347,8 @@ Stejné kroky 1-5. 
- [ ] **Step 7: Vypnout staré VMs (zatím NEmazat — jen stop)** ```bash -gcloud compute instances stop data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis -gcloud compute instances stop data-analyst-dev --zone=europe-west1-b --project=kids-ai-data-analysis +gcloud compute instances stop data-analyst --zone=europe-west1-b --project=internal-prod +gcloud compute instances stop data-analyst-dev --zone=europe-west1-b --project=internal-prod ``` - [ ] **Step 8: Ověřit, že nový prod běží minimálně 24 h bez problému** @@ -1361,13 +1361,13 @@ curl -s "http://$NEW_PROD_IP:8000/api/health" | python3 -m json.tool - [ ] **Step 9: Po 24h stability smazat staré VMs + jejich disky + statické IP** ```bash -gcloud compute instances delete data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --quiet -gcloud compute instances delete data-analyst-dev --zone=europe-west1-b --project=kids-ai-data-analysis --quiet +gcloud compute instances delete data-analyst --zone=europe-west1-b --project=internal-prod --quiet +gcloud compute instances delete data-analyst-dev --zone=europe-west1-b --project=internal-prod --quiet -gcloud compute disks delete data-analyst --zone=europe-west1-b --project=kids-ai-data-analysis --quiet 2>&1 || true -gcloud compute disks delete data-analyst-dev --zone=europe-west1-b --project=kids-ai-data-analysis --quiet 2>&1 || true +gcloud compute disks delete data-analyst --zone=europe-west1-b --project=internal-prod --quiet 2>&1 || true +gcloud compute disks delete data-analyst-dev --zone=europe-west1-b --project=internal-prod --quiet 2>&1 || true -gcloud compute addresses delete data-analyst-ip --region=europe-west1 --project=kids-ai-data-analysis --quiet 2>&1 || true +gcloud compute addresses delete data-analyst-ip --region=europe-west1 --project=internal-prod --quiet 2>&1 || true ``` - [ ] **Step 10: Checkpoint — Fáze 2 hotová** @@ -1546,7 +1546,7 @@ Expected: `HTTP/2 200` (ne 301, ne TLS error). 
- [ ] **Step 1: SSH na dev VM a ověřit, že watchtower běží** ```bash -gcloud compute ssh agnes-dev --zone=europe-west1-b --project=kids-ai-data-analysis --command="sudo docker ps | grep watchtower" +gcloud compute ssh agnes-dev --zone=europe-west1-b --project=internal-prod --command="sudo docker ps | grep watchtower" ``` Expected: container `watchtower` STATUS `Up X minutes`. @@ -1567,7 +1567,7 @@ Počkat ~ 5-10 min (CI build + watchtower poll interval 5 min). ```bash # Kontrola image sha na dev VM -gcloud compute ssh agnes-dev --zone=europe-west1-b --project=kids-ai-data-analysis \ +gcloud compute ssh agnes-dev --zone=europe-west1-b --project=internal-prod \ --command="sudo docker inspect app-app-1 --format '{{.Image}}' && sudo docker image inspect \$(sudo docker inspect app-app-1 --format '{{.Image}}') --format '{{.Created}}'" ``` @@ -1588,7 +1588,7 @@ metadata = { - [ ] **Step 2: Zkontrolovat, že uživatelé mají `roles/compute.osAdminLogin` na projektu** ```bash -gcloud projects get-iam-policy kids-ai-data-analysis \ +gcloud projects get-iam-policy internal-prod \ --flatten="bindings[].members" \ --filter="bindings.role=roles/compute.osAdminLogin" \ --format="value(bindings.members)" @@ -1597,7 +1597,7 @@ gcloud projects get-iam-policy kids-ai-data-analysis \ Pokud prázdné, přidat: ```bash -gcloud projects add-iam-policy-binding kids-ai-data-analysis \ +gcloud projects add-iam-policy-binding internal-prod \ --member=user:zdenek.srotyr@keboola.com \ --role=roles/compute.osAdminLogin ``` @@ -1605,7 +1605,7 @@ gcloud projects add-iam-policy-binding kids-ai-data-analysis \ - [ ] **Step 3: Test SSH přes OS Login** ```bash -gcloud compute ssh agnes-prod --zone=europe-west1-b --project=kids-ai-data-analysis --command="whoami" +gcloud compute ssh agnes-prod --zone=europe-west1-b --project=internal-prod --command="whoami" ``` Expected: username ve formátu `zdenek_srotyr_keboola_com` (OS Login generated). 
@@ -1615,7 +1615,7 @@ Expected: username ve formátu `zdenek_srotyr_keboola_com` (OS Login generated). - [ ] **Step 1: Ověřit, že VM SA má jen secretmanager.secretAccessor** ```bash -gcloud projects get-iam-policy kids-ai-data-analysis \ +gcloud projects get-iam-policy internal-prod \ --flatten="bindings[].members" \ --filter="bindings.members:agnes-keboola-vm@" \ --format="value(bindings.role)" @@ -1806,7 +1806,7 @@ V PR: - [ ] **Step 1: Smazat lokální SA key** ```bash -rm ~/.../agnes-deploy-kids-ai-data-analysis-key.json +rm ~/.../agnes-deploy-internal-prod-key.json ``` - [ ] **Step 2: Na GCP smazat starý klíč (key rotation)** @@ -1814,8 +1814,8 @@ rm ~/.../agnes-deploy-kids-ai-data-analysis-key.json ```bash # Seznam klíčů gcloud iam service-accounts keys list \ - --iam-account=agnes-deploy@kids-ai-data-analysis.iam.gserviceaccount.com \ - --project=kids-ai-data-analysis + --iam-account=agnes-deploy@internal-prod.iam.gserviceaccount.com \ + --project=internal-prod ``` Po ověření, že GH Actions s novým klíčem funguje (po úspěšném prvním apply), smazat starý. @@ -1824,7 +1824,7 @@ Po ověření, že GH Actions s novým klíčem funguje (po úspěšném prvním ## Fáze 6 — Template repo + onboarding playbook -**Goal fáze:** Druhý zákazník (GRPN) se dá nasadit za < 1 hodinu. +**Goal fáze:** Druhý zákazník (another-customer) se dá nasadit za < 1 hodinu. ### Task 6.1: Vytvořit `keboola/agnes-infra-template` diff --git a/docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md b/docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md deleted file mode 100644 index f1574e2..0000000 --- a/docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md +++ /dev/null @@ -1,79 +0,0 @@ -# GRPN deploy learnings — hackathon 2026-04-22 - -Running log of constraints encountered while deploying Agnes to GRPN's `prj-grp-foundryai-dev-7c37` on an existing VM (`foundryai-development`). 
Recorded during deploy; each entry captures the constraint, workaround, and what it implies for our Terraform flow. - -## Constraints hit - -### 1. No `projectIamAdmin` on human identity - -- **Signal:** `bootstrap-gcp.sh` failed on `gcloud projects add-iam-policy-binding` with `[e_zsrotyr@groupon.com] does not have permission ... setIamPolicy`. -- **Root cause:** `roles/editor` intentionally excludes `resourcemanager.projects.setIamPolicy`. -- **Workaround (hackathon):** Skip `bootstrap-gcp.sh`. Deploy on existing VM with docker-compose; use VM's existing SA without adding any new IAM bindings. -- **Implication for TF flow:** For a proper per-customer deploy, the GRPN admin must grant `roles/resourcemanager.projectIamAdmin` to either the onboarding engineer or directly to `agnes-deploy` SA. Or onboarding becomes two-phase: engineer creates SA + bucket; admin grants roles. - -### 2. Organization policy `iam.disableServiceAccountKeyCreation` - -- **Signal:** `gcloud iam service-accounts keys create` returned `Key creation is not allowed on this service account`. -- **Root cause:** Org-level `constraints/iam.disableServiceAccountKeyCreation` applies to all projects in the organization. Intentional security posture — static SA JSON keys are the highest-risk credential type. -- **Workaround:** Can't produce a `GCP_SA_KEY` GitHub secret for CI/CD. Options: - - **WIF (Workload Identity Federation)**: GitHub Actions OIDC → GCP, no static keys. Requires bootstrap updates (create WIF pool + provider + binding on deploy SA). - - **Skip CI/CD for GRPN**: Run `terraform apply` only from developer laptops with user ADC (`gcloud auth application-default login`). Works for hackathon, does not scale. -- **Implication for TF flow:** Our current bootstrap + `apply.yml` assume SA JSON key. GRPN (and any org with this org policy) requires WIF path. Track as follow-up; for hackathon we skip CI entirely. - -### 3. 
Resource-level `setIamPolicy` also blocked - -- **Signal:** `gcloud secrets add-iam-policy-binding` returned `Permission 'secretmanager.secrets.setIamPolicy' denied`. -- **Root cause:** `editor` does not grant `setIamPolicy` on any resource, even secret-level. Stricter than standard GCP default; likely additional org policies. -- **Workaround:** Don't use Secret Manager for hackathon secrets. Store JWT + any tokens directly in `.env` on the VM with `chmod 600`. -- **Implication:** Our module's secret-based `.env` assembly from Secret Manager needs a fallback path when `setIamPolicy` is blocked. For now: document that customers who can't grant IAM must bake secrets into `.env` manually (still via `scp`, not git). - -### 4. VM has no external IP (IAP tunnel only) - -- **Signal:** `gcloud compute ssh` auto-falls-back to IAP tunnel; direct IP access from browser impossible. -- **Root cause:** GRPN VMs are created in a private VPC. Standard security posture. Our module default (`access_config { nat_ip = ... }`) is the opposite — external IP by default. -- **Workaround:** Browser access via IAP tunnel: `gcloud compute start-iap-tunnel foundryai-development 8000 --local-host-port=localhost:8000`. Then `http://localhost:8000`. -- **Implication:** Our module needs an `external_ip` variable (default `true`) that customers can disable. Plus docs for IAP tunnel access pattern. - -### 5. VM's SA scopes include `cloud-platform` (default overkill) - -- **Signal:** `grpn-sa-foundryai-execution@...` has `cloud-platform` scope — full GCP access. -- **Root cause:** GRPN's default compute SA configuration. -- **Workaround:** Use VM's existing SA; it already has enough (BigQuery datasets, Compute, etc.). No need to create a dedicated `agnes-vm` SA (and we couldn't anyway — would need `projectIamAdmin`). -- **Implication:** For hackathon OK. For production the SA is overprovisioned — different customer than us, our opinion doesn't apply. - -### 6. 
Docker not pre-installed - -- **Signal:** `docker: command not found` on fresh VM. -- **Root cause:** VM is generic Ubuntu, no opinions about Docker. -- **Workaround:** `curl -fsSL https://get.docker.com | sudo sh` + `sudo apt install docker-compose-plugin`. Took ~30 s. -- **Implication:** Any non-TF-managed VM will need this. Our module's startup script already does this; manual deploys need it inline or a small bootstrap script. - -### 7. `/data` did not exist - -- **Signal:** `df /data` → No such file or directory. -- **Root cause:** Fresh VM, no persistent disk attached for data. -- **Workaround:** `mkdir -p /data/{state,analytics,extracts}` on boot disk. Ephemeral — data lives with VM. Acceptable for hackathon. -- **Implication:** For production this would mean no data survives VM recreate. Module's persistent-disk + `host-mount` overlay is the right long-term answer. For hackathon, boot disk is fine. - -## Derived follow-ups (post-hackathon) - -- [ ] **Add WIF path to `bootstrap-gcp.sh`** — alternative to SA JSON key. Detect `iam.disableServiceAccountKeyCreation` constraint and switch automatically. -- [ ] **Make `external_ip` + `iap_only` optional in customer-instance module** — GRPN-style customers need VMs without NAT. -- [ ] **Document two-phase bootstrap flow** — engineer creates SA, admin grants roles. Or admin runs the script on behalf. -- [ ] **Fallback `.env` assembly** — when Secret Manager is blocked, allow operator to `scp` secrets. 
-- [ ] **Customer onboarding checklist addition** — verify required project IAM before onboarding starts: - - `resourcemanager.projects.setIamPolicy` (for adding binding to SA) - - `iam.serviceAccountKeys.create` — check org policy `iam.disableServiceAccountKeyCreation` → if true, mandate WIF - - `compute.firewalls.create` (for firewall rules) - - `compute.disks.create`, `compute.instances.create` (for VM) - - `secretmanager.*` (for secrets) - - `storage.buckets.create` (for tfstate bucket, if hosted in customer project) - -## Hackathon deploy summary (live) - -- VM: `foundryai-development` in `prj-grp-foundryai-dev-7c37`, zone `us-central1-a`, e2-medium, 30GB boot, IAP-only access -- Data source: `csv` (no external data ingest needed for hackathon) -- App directory: `/opt/agnes/`, docker-compose fetched from upstream `main` -- Data directory: `/data` on boot disk (ephemeral) -- Secrets: plain `.env` with chmod 600 (org policy blocks Secret Manager IAM bindings) -- Access: IAP tunnel on port 8000 diff --git a/docs/superpowers/specs/2026-04-21-multi-customer-deployment-spec.md b/docs/superpowers/specs/2026-04-21-multi-customer-deployment-spec.md index 3853a11..ea81cab 100644 --- a/docs/superpowers/specs/2026-04-21-multi-customer-deployment-spec.md +++ b/docs/superpowers/specs/2026-04-21-multi-customer-deployment-spec.md @@ -13,7 +13,7 @@ Zavést *production-grade* nasazení Agnes, které: 3. Je **anonymizované** — jeden zákazník nevidí existenci ani identitu ostatních. 4. Má **auto-deploy s rozumnými gates** — feature branch push → dev VM aktualizace do minut; merge do main → prod s review gate. 5. Podporuje **branch-aware dev environments** — víc vývojářů paralelně, každý na své branchi, bez interference. -6. **Škáluje O(1) na zákazníka** — přidání GRPN vedle Keboola znamená jen klonování šablony, ne změnu upstream. +6. **Škáluje O(1) na zákazníka** — přidání another-customer vedle Keboola znamená jen klonování šablony, ne změnu upstream. ## 2. 
Model — Pure Self-Deploy @@ -26,7 +26,7 @@ Zavést *production-grade* nasazení Agnes, které: Keboola jako upstream **nemá žádný přístup k zákaznickým GCP projektům**. Zákazník zodpovídá za svoje nasazení. -Keboola interní produkční Agnes instance je **speciální případ zákazníka** — Keboola IT vlastní `kids-ai-data-analysis` GCP projekt a spravuje tam svou Agnes stejně jako to bude dělat GRPN ve svém GCP. +Keboola interní produkční Agnes instance je **speciální případ zákazníka** — Keboola IT vlastní `internal-prod` GCP projekt a spravuje tam svou Agnes stejně jako to bude dělat another-customer ve svém GCP. ### 2.2 Budoucí rozšíření (out of scope pro tuto vlnu) @@ -99,7 +99,7 @@ Přesně ta samá struktura jako template, jen s konkrétními hodnotami v `terr # keboola/agnes-infra-keboola/terraform/terraform.tfvars # (gitignored, nebo lokálně v Secret Manageru — viz §6) -gcp_project_id = "kids-ai-data-analysis" +gcp_project_id = "internal-prod" region = "europe-west1" zone = "europe-west1-b" @@ -407,7 +407,7 @@ Všechny designové otázky, které vznikly během brainstormingu, jsou vyřeše | Prod upgrade režim | Per-instance volba auto/pinned, default auto | | TLS | Caddy default, flex na gcp-lb/cloudflare | | DNS | Zákazník si řeší sám, default jen IP | -| GCP projekt pro Keboola | `kids-ai-data-analysis` zůstává | +| GCP projekt pro Keboola | `internal-prod` zůstává | | Dev VM model | Seznam `dev_instances` v tfvars, per-položka image_tag | | `ZdenekSrotyr/tmp_oss` | Smazat po Fázi 1 | diff --git a/infra/modules/customer-instance/outputs.tf b/infra/modules/customer-instance/outputs.tf index cb8e353..b7403a7 100644 --- a/infra/modules/customer-instance/outputs.tf +++ b/infra/modules/customer-instance/outputs.tf @@ -1,20 +1,20 @@ output "instance_ips" { - description = "Mapa { name => external IP }" + description = "Map of { name => external IP }." 
value = { for k, v in google_compute_address.ip : k => v.address } } output "prod_ip" { - description = "External IP prod instance" + description = "External IP of the production instance." value = google_compute_address.ip[var.prod_instance.name].address } output "vm_service_account" { - description = "Email VM SA (pro další IAM bindings, např. BigQuery)" + description = "VM service-account email (for additional IAM bindings, e.g. BigQuery)." value = google_service_account.vm.email } output "jwt_secret_name" { - description = "Plný název JWT secretu v Secret Manageru" + description = "Full name of the JWT secret in Secret Manager." value = google_secret_manager_secret.jwt.name } diff --git a/infra/modules/customer-instance/variables.tf b/infra/modules/customer-instance/variables.tf index 2e7011c..1fcce5f 100644 --- a/infra/modules/customer-instance/variables.tf +++ b/infra/modules/customer-instance/variables.tf @@ -1,5 +1,5 @@ variable "gcp_project_id" { - description = "GCP project ID kde bude instance nasazená" + description = "GCP project ID where the instance will be deployed." type = string } @@ -16,16 +16,16 @@ variable "zone" { } variable "customer_name" { - description = "Krátký identifikátor zákazníka (např. keboola, grpn). Použije se v prefixu resourců." + description = "Short customer identifier (e.g. acme, example). Used as a prefix for created resources." type = string validation { condition = can(regex("^[a-z][a-z0-9-]{1,20}$", var.customer_name)) - error_message = "customer_name musí být lowercase, začínat písmenem, 2-21 znaků." + error_message = "customer_name must be lowercase, start with a letter, 2-21 chars." } } variable "prod_instance" { - description = "Prod VM konfigurace" + description = "Production VM configuration." type = object({ name = string machine_type = optional(string, "e2-small") @@ -40,7 +40,7 @@ variable "prod_instance" { variable "dev_instances" { description = <<-EOT - Seznam dev VMs. Prázdné pole = žádné dev VMs. 
+ List of dev VMs. Empty list = no dev VMs. tls_mode + domain are optional and default to plain HTTP on :8000. Set tls_mode = "caddy" + domain to enable Caddy + Let's Encrypt (or whatever @@ -57,31 +57,31 @@ variable "dev_instances" { } variable "seed_admin_email" { - description = "Email prvního admin usera" + description = "Email of the initial admin user." type = string } variable "enable_seed_password" { - description = "Pokud true, seed admin user dostane hned password_hash ze seed_admin_password (dev helper). Ponech false v prod — admin si heslo nastaví přes /auth/bootstrap nebo Google OAuth." + description = "If true, the seed admin user immediately gets a password_hash from seed_admin_password (dev helper). Keep false in prod — the admin sets a password via /auth/bootstrap or Google OAuth." type = bool default = false } variable "seed_admin_password" { - description = "Plain-text heslo pro seed admina. Použije se jen když enable_seed_password=true. POZOR: ukládá se do Terraform state." + description = "Plain-text password for the seed admin. Only used when enable_seed_password=true. WARNING: stored in Terraform state." type = string default = "" sensitive = true } variable "data_source" { - description = "Typ data source — keboola | bigquery | csv" + description = "Data source type — keboola | bigquery | csv." type = string default = "keboola" } variable "keboola_stack_url" { - description = "Keboola Stack URL (pokud data_source = keboola)" + description = "Keboola Stack URL (used when data_source = keboola)." type = string default = "" } diff --git a/scripts/debug/probe_google_groups.py b/scripts/debug/probe_google_groups.py index f80ba52..c5f9d2c 100755 --- a/scripts/debug/probe_google_groups.py +++ b/scripts/debug/probe_google_groups.py @@ -16,8 +16,8 @@ How to get an access token (Easiest path): Google's OAuth 2.0 Playground (https://developers.google.com/oauthplayground/) 1. Click the gear icon (top right) → tick "Use your own OAuth credentials" - 2. 
Paste your Client ID + Secret (from kids-ai-data-analysis project, - same OAuth client agnes-dev uses) + 2. Paste your Client ID + Secret (the same OAuth client your Agnes + deployment uses) 3. Step 1: pick scopes. For comparison test all of: https://www.googleapis.com/auth/cloud-identity.groups.readonly https://www.googleapis.com/auth/cloud-identity.groups diff --git a/scripts/duckdb_manager.py b/scripts/duckdb_manager.py index 5f99954..20c0e00 100644 --- a/scripts/duckdb_manager.py +++ b/scripts/duckdb_manager.py @@ -177,7 +177,7 @@ def _get_bq_project_from_table_id(table_id: str) -> Optional[str]: """Extract BQ project ID from a fully-qualified table ID. Args: - table_id: e.g. "prj-grp-dataview-prod-1ff9.finance_unit_economics.unit_economics" + table_id: e.g. "my-bq-project.finance_unit_economics.unit_economics" Returns: Project ID or None if format doesn't match BQ convention diff --git a/scripts/fetch-env-from-secrets.sh b/scripts/fetch-env-from-secrets.sh index d0de77a..79a4aee 100755 --- a/scripts/fetch-env-from-secrets.sh +++ b/scripts/fetch-env-from-secrets.sh @@ -13,7 +13,7 @@ ENV_FILE="${APP_DIR}/.env" # Non-secret config (override via environment or hardcoded defaults) DATA_SOURCE="${DATA_SOURCE:-keboola}" KEBOOLA_STACK_URL="${KEBOOLA_STACK_URL:-https://connection.us-east4.gcp.keboola.com/}" -SEED_ADMIN_EMAIL="${SEED_ADMIN_EMAIL:-zdenek.srotyr@keboola.com}" +SEED_ADMIN_EMAIL="${SEED_ADMIN_EMAIL:?SEED_ADMIN_EMAIL must be set}" LOG_LEVEL="${LOG_LEVEL:-info}" DATA_DIR="${DATA_DIR:-/data}" AGNES_TAG="${AGNES_TAG:-stable}" @@ -38,7 +38,7 @@ AGNES_TAG=${AGNES_TAG} EOF chmod 600 "${ENV_FILE}" -# Chown je best-effort — pokud skript neběží jako root, ignoruj +# chown is best-effort — ignore if the script isn't running as root. chown deploy:deploy "${ENV_FILE}" 2>/dev/null || true echo "Done. ${ENV_FILE} has $(wc -l < "${ENV_FILE}") lines, chmod 600." 
diff --git a/scripts/grpn/Makefile b/scripts/grpn/Makefile deleted file mode 100644 index d4c129e..0000000 --- a/scripts/grpn/Makefile +++ /dev/null @@ -1,146 +0,0 @@ -# Makefile — Agnes on foundryai-development (GRPN hackathon deploy) -# -# This is a manual-deploy helper used while the full Terraform flow is -# blocked by GRPN org policies (iam.disableServiceAccountKeyCreation, -# no projectIamAdmin delegation). It targets the existing VM -# foundryai-development in prj-grp-foundryai-dev-7c37. -# -# Once WIF + Terraform is unblocked, this file moves to a private -# keboola/agnes-infra-grpn repo and most targets become obsolete. -# -# Usage: -# make -C scripts/grpn help -# make -C scripts/grpn deploy -# make -C scripts/grpn status -# make -C scripts/grpn tunnel - -SHELL := /bin/bash - -# -------- overridable config (safe defaults for GRPN foundryai-development) -------- -PROJECT ?= prj-grp-foundryai-dev-7c37 -ZONE ?= us-central1-a -VM ?= foundryai-development -APP_DIR ?= /opt/agnes -LOCAL_PORT ?= 8000 -VM_PORT ?= 8000 -IMAGE ?= ghcr.io/keboola/agnes-the-ai-analyst -ADMIN_EMAIL ?= e_zsrotyr@groupon.com - -# compose files (note: host-mount overlay binds /data from host = boot-disk ephemeral for this VM) -COMPOSE_FILES = -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.host-mount.yml -COMPOSE = sudo docker compose $(COMPOSE_FILES) -SSH = gcloud compute ssh $(VM) --zone=$(ZONE) --project=$(PROJECT) -SCP = gcloud compute scp --zone=$(ZONE) --project=$(PROJECT) - -# -------- help -------- -.PHONY: help -help: - @echo "Agnes @ $(VM) (project: $(PROJECT), zone: $(ZONE))" - @echo "" - @echo " make deploy Pull latest :stable image, recreate containers (zero-downtime if healthy)" - @echo " make deploy-tag TAG=stable-2026.04.83 Pull a specific tag instead of floating :stable" - @echo " make status Health + version endpoint" - @echo " make logs Tail app logs (ctrl-c to exit)" - @echo " make logs-scheduler Tail scheduler logs" - @echo " make restart docker 
compose restart (keeps state)" - @echo " make stop docker compose stop (containers down, volumes preserved)" - @echo " make start docker compose up -d" - @echo " make recreate docker compose down + up -d (fresh containers, same data)" - @echo "" - @echo " make ssh Open interactive SSH session to the VM" - @echo " make tunnel Start IAP tunnel; open http://localhost:$(LOCAL_PORT) in browser" - @echo " make open Start tunnel AND open browser (macOS only)" - @echo "" - @echo " make bootstrap-admin PASSWORD= Create admin (first-time only; 403 once any user has password)" - @echo " make set-data-source SOURCE=bigquery Edit .env DATA_SOURCE; restart app" - @echo "" - @echo " make install-cron Install auto-upgrade cron (pulls :stable every 5 min, restarts on digest change)" - @echo " make uninstall-cron Remove auto-upgrade cron" - @echo "" - @echo " make env Show .env keys (values NOT printed)" - @echo " make version What version/channel/commit is running now" - @echo " make ps docker ps on the VM" - -# -------- deployment -------- -.PHONY: deploy deploy-tag recreate restart start stop -deploy: - $(SSH) --command='cd $(APP_DIR) && $(COMPOSE) pull && $(COMPOSE) up -d' - @$(MAKE) --no-print-directory status - -deploy-tag: - @test -n "$(TAG)" || (echo "Usage: make deploy-tag TAG=stable-2026.04.83" >&2; exit 2) - $(SSH) --command='cd $(APP_DIR) && sudo sed -i "s|^AGNES_TAG=.*|AGNES_TAG=$(TAG)|" .env && $(COMPOSE) pull && $(COMPOSE) up -d' - @$(MAKE) --no-print-directory status - -recreate: - $(SSH) --command='cd $(APP_DIR) && $(COMPOSE) down && $(COMPOSE) up -d' - @$(MAKE) --no-print-directory status - -restart: - $(SSH) --command='cd $(APP_DIR) && $(COMPOSE) restart' - -start: - $(SSH) --command='cd $(APP_DIR) && $(COMPOSE) up -d' - -stop: - $(SSH) --command='cd $(APP_DIR) && $(COMPOSE) down' - -# -------- observability -------- -.PHONY: status version ps env logs logs-scheduler -status: - @echo "=== health (via IAP tunnel on VM) ===" - @$(SSH) --command='curl -sf --max-time 
10 http://localhost:$(VM_PORT)/api/health' 2>&1 | tail -1 | python3 -m json.tool 2>/dev/null | head -10 || echo "not healthy" - -version: - @$(SSH) --command='curl -sf --max-time 10 http://localhost:$(VM_PORT)/api/version' 2>&1 | tail -1 | python3 -m json.tool 2>/dev/null | head -10 || echo "unreachable" - -ps: - $(SSH) --command='sudo docker ps --format "table {{.Names}}\t{{.Image}}\t{{.Status}}"' - -env: - @echo "=== .env keys on VM (values not shown) ===" - $(SSH) --command='sudo cut -d= -f1 $(APP_DIR)/.env' - -logs: - $(SSH) --command='sudo docker logs -f --tail 100 agnes-app-1' - -logs-scheduler: - $(SSH) --command='sudo docker logs -f --tail 100 agnes-scheduler-1' - -# -------- access -------- -.PHONY: ssh tunnel open -ssh: - $(SSH) - -tunnel: - @echo "Starting IAP tunnel — http://localhost:$(LOCAL_PORT) is now Agnes" - @echo "Leave this terminal open; Ctrl-C to stop." - gcloud compute start-iap-tunnel $(VM) $(VM_PORT) \ - --local-host-port=localhost:$(LOCAL_PORT) \ - --zone=$(ZONE) --project=$(PROJECT) - -open: - @( gcloud compute start-iap-tunnel $(VM) $(VM_PORT) \ - --local-host-port=localhost:$(LOCAL_PORT) \ - --zone=$(ZONE) --project=$(PROJECT) & \ - sleep 4 && open "http://localhost:$(LOCAL_PORT)/login" && wait ) - -# -------- one-off operations -------- -.PHONY: bootstrap-admin set-data-source install-cron uninstall-cron -bootstrap-admin: - @test -n "$(PASSWORD)" || (echo "Usage: make bootstrap-admin PASSWORD=" >&2; exit 2) - @$(SSH) --command='curl -sS -X POST http://localhost:$(VM_PORT)/auth/bootstrap \ - -H "Content-Type: application/json" \ - -d "{\"email\":\"$(ADMIN_EMAIL)\",\"password\":\"$(PASSWORD)\"}"' 2>&1 | tail -1 | python3 -m json.tool 2>/dev/null | head -8 - -set-data-source: - @test -n "$(SOURCE)" || (echo "Usage: make set-data-source SOURCE=bigquery|csv|keboola" >&2; exit 2) - $(SSH) --command='sudo sed -i "s|^DATA_SOURCE=.*|DATA_SOURCE=$(SOURCE)|" $(APP_DIR)/.env && cd $(APP_DIR) && $(COMPOSE) up -d --force-recreate app' - @$(MAKE) 
--no-print-directory status - -install-cron: - $(SCP) agnes-auto-upgrade.sh $(VM):/tmp/agnes-auto-upgrade.sh - $(SSH) --command='sudo install -m 755 /tmp/agnes-auto-upgrade.sh /usr/local/bin/agnes-auto-upgrade.sh && rm /tmp/agnes-auto-upgrade.sh && ( sudo crontab -l 2>/dev/null | grep -v agnes-auto-upgrade || true; echo "*/5 * * * * /usr/local/bin/agnes-auto-upgrade.sh >> /var/log/agnes-auto-upgrade.log 2>&1" ) | sudo crontab - && echo "cron installed"' - -uninstall-cron: - $(SSH) --command='( sudo crontab -l 2>/dev/null | grep -v agnes-auto-upgrade ) | sudo crontab - && sudo rm -f /usr/local/bin/agnes-auto-upgrade.sh && echo "cron removed"' diff --git a/scripts/grpn/README.md b/scripts/grpn/README.md deleted file mode 100644 index 3a3b1be..0000000 --- a/scripts/grpn/README.md +++ /dev/null @@ -1,179 +0,0 @@ -# Manual deploy helper — Agnes on an existing VM (GRPN pattern) - -A `make`-based helper for deploying and operating Agnes on an **existing** GCE VM when the full Terraform flow is blocked — typically by organization policies that forbid SA JSON key creation or by missing IAM delegation. This is the pattern we used on GRPN's `foundryai-development` during the 2026-04-22 hackathon. - -It is **not** a replacement for the full Terraform module — only a stopgap while the proper flow is being unblocked. See [Migration path](#migration-path) below. 
- -## When to use this - -Use this helper when **all** are true: - -- A target VM already exists in the customer's GCP project (we don't create it) -- You (or the deploy SA) do **not** have `roles/resourcemanager.projectIamAdmin` on that project, **or** the org has `constraints/iam.disableServiceAccountKeyCreation` enabled -- The customer is OK with a single-VM, single-node Agnes (no prod + dev split for now) -- Data persistence on the VM's boot disk is acceptable (no persistent disk attached → data loss on VM recreate) - -Any of those false → go the Terraform route via [`docs/HACKATHON.md`](../../docs/HACKATHON.md) Part 1. - -## What it does (and doesn't) - -| Aspect | Manual helper (this) | Full Terraform flow | -|---|---|---| -| VM provisioning | Reuses existing VM | Creates a dedicated `agnes-prod` + optional `agnes-dev` VMs | -| Docker install | Inline `curl get.docker.com \| sh` on first deploy | Part of the module's startup script | -| Secrets | Plain `.env` on VM (`chmod 600`) | GCP Secret Manager, read by VM SA | -| Service account | Uses the VM's existing SA, whatever that is | Dedicated `agnes--vm` with scoped `secretmanager.secretAccessor` only | -| Data persistence | Boot disk, ephemeral across VM recreate | Separate persistent disk (`/data` bind-mount), daily snapshot + 30-day retention | -| Auto-upgrade | `install-cron` target deploys the same cron script the module uses | Built into the startup script | -| Monitoring / alerts | None | Uptime check + alert policy per VM | -| Backup | None | Daily snapshot schedule | -| Branch-aware dev VMs | Not supported (single VM) | `dev_instances` list — one VM per branch/engineer | -| CI/CD | None — manual `make deploy` | GitHub Actions: PR → plan → apply (dev auto, prod gated) | - -The helper covers the **runtime** aspects (pull image, restart, logs, access) but skips the infra-as-code posture. - -## One-time setup - -Done for GRPN during the 2026-04-22 hackathon. 
Re-useable template for any future customer in a similar constrained environment: - -### 1. Verify access to the VM - -```bash -gcloud compute ssh $VM --zone=$ZONE --project=$PROJECT --command='whoami' -``` - -If this works, you have SSH via OS Login or your own key. IAP tunnel auto-kicks in if the VM has no external IP. No further auth setup is needed. - -### 2. Install Docker + compose plugin - -```bash -gcloud compute ssh $VM --zone=$ZONE --project=$PROJECT --command=" - curl -fsSL https://get.docker.com | sudo sh - sudo apt-get install -y -qq docker-compose-plugin -" -``` - -### 3. Prepare app directory and data root - -```bash -gcloud compute ssh $VM --zone=$ZONE --project=$PROJECT --command=" - sudo mkdir -p /opt/agnes /data/state /data/analytics /data/extracts - sudo chown -R \$USER:\$USER /opt/agnes - cd /opt/agnes - curl -fsSL https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.yml -o docker-compose.yml - curl -fsSL https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.prod.yml -o docker-compose.prod.yml - curl -fsSL https://raw.githubusercontent.com/keboola/agnes-the-ai-analyst/main/docker-compose.host-mount.yml -o docker-compose.host-mount.yml -" -``` - -### 4. Write `.env` (plain, chmod 600) - -```bash -JWT=$(openssl rand -hex 32) -cat > /tmp/agnes-env < -LOG_LEVEL=info -AGNES_TAG=stable -EOF -gcloud compute scp /tmp/agnes-env $VM:/tmp/.env --zone=$ZONE --project=$PROJECT -gcloud compute ssh $VM --zone=$ZONE --project=$PROJECT --command=" - sudo install -m 600 -o \$USER -g \$USER /tmp/.env /opt/agnes/.env - rm /tmp/.env -" -rm /tmp/agnes-env -``` - -If `DATA_SOURCE=keboola`, add `KEBOOLA_STORAGE_TOKEN=...` + `KEBOOLA_STACK_URL=...` lines. Same for any BQ / custom data source credentials — they all live in this one `.env`. - -### 5. First boot - -```bash -make deploy -make bootstrap-admin PASSWORD= -``` - -`deploy` pulls the image + starts containers. 
`bootstrap-admin` hits `/auth/bootstrap` to activate the seed admin. - -### 6. (Optional) Auto-upgrade - -```bash -make install-cron -``` - -Installs the same 5-minute polling cron used by the Terraform module. After this, every new `:stable` image digest is picked up within ~5 min without any human action. - -## Everyday operations - -From the repo root (tested defaults target GRPN's `foundryai-development`): - -```bash -make -C scripts/grpn help # list all targets -make -C scripts/grpn status # is it up? -make -C scripts/grpn version # what's deployed right now -make -C scripts/grpn logs # tail app logs -make -C scripts/grpn deploy # pull :stable + recreate -make -C scripts/grpn tunnel # IAP tunnel → http://localhost:8000 -``` - -## Configuration - -All targets read overridable variables at the top of `Makefile`. Defaults target GRPN's `foundryai-development`. For other VMs/projects: - -```bash -# one-off override -make -C scripts/grpn status \ - PROJECT=other-project \ - ZONE=us-central1-a \ - VM=other-vm - -# or fork this Makefile into `scripts//Makefile` with different defaults -``` - -| Variable | Default | Purpose | -|---|---|---| -| `PROJECT` | `prj-grp-foundryai-dev-7c37` | GCP project ID | -| `ZONE` | `us-central1-a` | VM zone | -| `VM` | `foundryai-development` | Instance name | -| `APP_DIR` | `/opt/agnes` | Where compose files + `.env` live on the VM | -| `LOCAL_PORT` | `8000` | Local port for `tunnel` target | -| `VM_PORT` | `8000` | Port the app listens on inside the VM | -| `IMAGE` | `ghcr.io/keboola/agnes-the-ai-analyst` | GHCR image repo | -| `ADMIN_EMAIL` | `e_zsrotyr@groupon.com` | Default bootstrap email | - -## Files - -``` -scripts/grpn/ -├── Makefile # the helper itself -├── agnes-auto-upgrade.sh # deployed by `make install-cron` to /usr/local/bin/ -└── README.md # this file -``` - -Plus the deploy log: [`docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md`](../../docs/superpowers/plans/2026-04-22-grpn-deploy-learnings.md) — lists all 
the org-policy constraints encountered and their workarounds. - -## Migration path - -Once the blockers are lifted, move to the proper Terraform flow: - -1. **Get `roles/resourcemanager.projectIamAdmin`** on the customer project (ask the GRPN admin to grant it). -2. **Create a WIF pool + provider** in the customer project (doesn't require SA JSON keys; bypasses `iam.disableServiceAccountKeyCreation`). Draft patch pending on [`bootstrap-gcp.sh`](../bootstrap-gcp.sh) — track via GitHub issue tagged `wif`. -3. **Migrate**: run the new `bootstrap-gcp.sh --wif`, create a private infra repo from [`keboola/agnes-infra-template`](https://github.com/keboola/agnes-infra-template), `terraform apply` → this creates a **new** Agnes VM alongside the existing `foundryai-development`. -4. **Optional** — move data from the manual VM to the TF VM with a `tar` snapshot through GCS (see the original migration in [`docs/superpowers/plans/2026-04-21-deployment-log.md`](../../docs/superpowers/plans/2026-04-21-deployment-log.md) "Data migration" section). -5. **Decommission** the manual deploy: `make stop` + delete `/opt/agnes/` on the VM. - -## Caveats - -- **Single VM, single point of failure.** No dev/prod split. -- **No automatic backups.** If someone deletes the VM, data is gone (30-day boot-disk retention from GCP default only). -- **Plain-text secrets in `.env`.** Acceptable for IAP-only internal VM; **not** acceptable if the VM ever gets an external IP. -- **No drift detection.** Anyone with SSH can hand-edit `.env` or compose files without leaving an audit trail. The Terraform flow's `ignore_changes` + `-replace` pattern is the correct version of this. 
- -## See also - -- [`docs/HACKATHON.md`](../../docs/HACKATHON.md) — the full TL;DR for deploy and develop (the TF path) -- [`docs/ONBOARDING.md`](../../docs/ONBOARDING.md) — detailed per-customer Terraform onboarding -- [`docs/DEPLOYMENT.md`](../../docs/DEPLOYMENT.md) — comparison of TF vs docker-compose deployment strategies -- [`infra/modules/customer-instance/`](../../infra/modules/customer-instance/) — the Terraform module this helper shadows diff --git a/scripts/grpn/agnes-auto-upgrade.sh b/scripts/ops/agnes-auto-upgrade.sh similarity index 100% rename from scripts/grpn/agnes-auto-upgrade.sh rename to scripts/ops/agnes-auto-upgrade.sh diff --git a/scripts/grpn/agnes-tls-rotate.sh b/scripts/ops/agnes-tls-rotate.sh similarity index 100% rename from scripts/grpn/agnes-tls-rotate.sh rename to scripts/ops/agnes-tls-rotate.sh diff --git a/scripts/switch-dev-vm.sh b/scripts/switch-dev-vm.sh deleted file mode 100755 index 2f665c8..0000000 --- a/scripts/switch-dev-vm.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash -# switch-dev-vm.sh — point the shared hackathon dev VM at the caller's branch image. -# -# Usage: -# scripts/switch-dev-vm.sh -# scripts/switch-dev-vm.sh hack-zs-metrics -# -# Prerequisite: your branch has been pushed and the release.yml workflow has completed, -# producing ghcr.io/keboola/agnes-the-ai-analyst:dev-. -# -# The slug is derived from your branch name by stripping the leading "feature/" and -# replacing non-alphanumeric chars with "-". For branch "feature/hack-zs-metrics" the slug -# is "hack-zs-metrics". -set -euo pipefail - -if [ $# -ne 1 ]; then - echo "Usage: $0 " >&2 - echo "Example: $0 hack-zs-metrics" >&2 - exit 2 -fi - -SLUG="$1" -VM="agnes-dev" -ZONE="europe-west1-b" -TAG="dev-$SLUG" -IMAGE="ghcr.io/keboola/agnes-the-ai-analyst:$TAG" - -echo "[1/4] Verifying $IMAGE exists on GHCR..." -docker manifest inspect "$IMAGE" > /dev/null || { - echo "ERROR: $IMAGE not found on GHCR. Did your release.yml run finish?" 
>&2 - echo "Check: gh run list --branch feature/$SLUG --workflow release.yml" >&2 - exit 1 -} - -echo "[2/4] Updating AGNES_TAG on $VM to $TAG..." -gcloud compute ssh "$VM" --zone="$ZONE" --quiet --command "\ - sudo sed -i 's|^AGNES_TAG=.*|AGNES_TAG=$TAG|' /opt/agnes/.env && \ - sudo grep -E '^AGNES_TAG=' /opt/agnes/.env" - -echo "[3/4] Triggering auto-upgrade..." -gcloud compute ssh "$VM" --zone="$ZONE" --quiet --command \ - "sudo /usr/local/bin/agnes-auto-upgrade.sh 2>&1 | tail -10" - -echo "[4/4] Waiting for app to become healthy..." -for i in $(seq 1 30); do - STATUS=$(curl -s --max-time 5 http://34.77.94.14:8000/api/health | python3 -c 'import sys,json; print(json.load(sys.stdin).get("status","down"))' 2>/dev/null || echo down) - echo " [$i/30] status=$STATUS" - if [ "$STATUS" = "healthy" ] || [ "$STATUS" = "degraded" ]; then - echo "OK — agnes-dev now running $TAG. Open http://34.77.94.14:8000" - exit 0 - fi - sleep 3 -done -echo "ERROR: agnes-dev did not become healthy in 90s. SSH in and check: docker compose logs" >&2 -exit 1 diff --git a/src/catalog_export.py b/src/catalog_export.py index bbe4242..76c645c 100644 --- a/src/catalog_export.py +++ b/src/catalog_export.py @@ -140,7 +140,7 @@ def export_metrics( client: Initialized OpenMetadata API client docs_dir: Base docs directory (e.g., /data/docs) catalog_url: Catalog URL for header comments - filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.FoundryAI") + filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.MyAgent") data_product: If set, discover metrics via data product assets (preferred over filter_tag) Returns: diff --git a/tests/test_catalog_export.py b/tests/test_catalog_export.py index 9d9d04d..48f1a62 100644 --- a/tests/test_catalog_export.py +++ b/tests/test_catalog_export.py @@ -298,7 +298,7 @@ class TestExportMetrics: def test_export_metrics_filter_tag_keeps_matching(self, tmp_path: Path, mock_client): """Only metrics with the filter_tag are 
exported.""" tagged = _make_raw_metric(name="M1", fqn="M1", category_tag="MetricCategory.finance") - tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"}) + tagged["tags"].append({"tagFQN": "AIAgent.Example", "name": "Example"}) untagged = _make_raw_metric( name="Live Deals", fqn="LiveDeals", category_tag="MetricCategory.supply" @@ -307,7 +307,7 @@ class TestExportMetrics: mock_client.get_metrics.return_value = [tagged, untagged] docs = tmp_path / "docs" - count = export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI") + count = export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.Example") assert count == 1 assert (docs / "metrics" / "finance" / "m1.yml").exists() @@ -328,7 +328,7 @@ class TestExportMetrics: def test_export_metrics_filter_tag_cleans_stale_untagged(self, tmp_path: Path, mock_client): """Stale files from previously-exported untagged metrics get cleaned up.""" tagged = _make_raw_metric(name="M1", fqn="M1", category_tag="MetricCategory.finance") - tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"}) + tagged["tags"].append({"tagFQN": "AIAgent.Example", "name": "Example"}) mock_client.get_metrics.return_value = [tagged] docs = tmp_path / "docs" @@ -337,7 +337,7 @@ class TestExportMetrics: stale = stale_dir / "livedeals.yml" stale.write_text(AUTO_GENERATED_MARKER + "\nname: livedeals\n") - export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI") + export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.Example") assert not stale.exists() diff --git a/tests/test_duckdb_manager.py b/tests/test_duckdb_manager.py index ba5233d..cae7955 100644 --- a/tests/test_duckdb_manager.py +++ b/tests/test_duckdb_manager.py @@ -89,13 +89,13 @@ tables: primary_key: "id" sync_strategy: "full_refresh" - - id: "prj-grp-dataview-prod-1ff9.finance.revenue" + - id: "prj-example-1234.finance.revenue" name: "revenue" description: "Remote BQ table" primary_key: "id" 
query_mode: "remote" - - id: "prj-grp-dataview-prod-1ff9.marketing.campaigns" + - id: "prj-example-1234.marketing.campaigns" name: "campaigns" description: "Hybrid table" primary_key: "id" @@ -111,7 +111,7 @@ tables: table = pa.table({"id": [1, 2], "name": ["a", "b"]}) pq.write_table(table, crm_dir / "company.parquet") - marketing_dir = tmp_path / "server" / "parquet" / "prj-grp-dataview-prod-1ff9.marketing" + marketing_dir = tmp_path / "server" / "parquet" / "prj-example-1234.marketing" marketing_dir.mkdir(parents=True) campaigns_table = pa.table({"id": [10], "campaign": ["test"]}) pq.write_table(campaigns_table, marketing_dir / "campaigns.parquet") @@ -134,13 +134,13 @@ def tmp_project_remote_only(tmp_path): ```yaml tables: - - id: "prj-grp-dataview-prod-1ff9.finance.revenue" + - id: "prj-example-1234.finance.revenue" name: "revenue" description: "Remote BQ table" primary_key: "id" query_mode: "remote" - - id: "prj-grp-dataview-prod-1ff9.finance.costs" + - id: "prj-example-1234.finance.costs" name: "costs" description: "Remote BQ table" primary_key: "id" @@ -165,9 +165,9 @@ class TestGetBqProjectFromTableId: def test_valid_bq_table_id(self): result = _get_bq_project_from_table_id( - "prj-grp-dataview-prod-1ff9.finance.table" + "prj-example-1234.finance.table" ) - assert result == "prj-grp-dataview-prod-1ff9" + assert result == "prj-example-1234" def test_valid_bq_table_id_different_project(self): result = _get_bq_project_from_table_id( diff --git a/tests/test_openmetadata_enricher.py b/tests/test_openmetadata_enricher.py index 4a33f71..afb4c5c 100644 --- a/tests/test_openmetadata_enricher.py +++ b/tests/test_openmetadata_enricher.py @@ -19,7 +19,7 @@ from connectors.openmetadata.enricher import ( def sample_table_config(): """Sample table configuration.""" return TableConfig( - id="prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2", + id="prj-example-1234.marketing.roi_datamart_v2", name="roi_datamart_v2", ) @@ -30,7 +30,7 @@ def sample_om_response(): return { 
"id": "table-uuid", "name": "roi_datamart_v2", - "fullyQualifiedName": "bigquery.prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2", + "fullyQualifiedName": "bigquery.prj-example-1234.marketing.roi_datamart_v2", "description": "Daily ROI analytics", "columns": [ { @@ -132,12 +132,12 @@ def test_enrich_table_cache_hit(): columns={"id": CatalogColumnData(description="ID", data_type="BIGINT")}, ) enricher._cache_entry( - "bigquery.prj-grp-dataview-prod-1ff9.marketing.test", + "bigquery.prj-example-1234.marketing.test", cached_data, ) table_config = TableConfig( - id="prj-grp-dataview-prod-1ff9.marketing.test", + id="prj-example-1234.marketing.test", name="test", ) @@ -164,7 +164,7 @@ def test_enrich_table_cache_expiry(): description="Old data", columns={}, ) - fqn = "bigquery.prj-grp-dataview-prod-1ff9.marketing.test" + fqn = "bigquery.prj-example-1234.marketing.test" enricher._cache[fqn] = { "data": cached_data, "fetched_at": datetime.now() - timedelta(seconds=2), # 2 seconds old @@ -188,12 +188,12 @@ def test_derive_fqn_auto(): ) table_config = TableConfig( - id="prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2", + id="prj-example-1234.marketing.roi_datamart_v2", name="roi_datamart_v2", ) fqn = enricher._derive_fqn(table_config) - assert fqn == "bigquery.prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2" + assert fqn == "bigquery.prj-example-1234.marketing.roi_datamart_v2" def test_derive_fqn_explicit_override(): @@ -209,7 +209,7 @@ def test_derive_fqn_explicit_override(): ) table_config = TableConfig( - id="prj-grp-dataview-prod-1ff9.marketing.roi_datamart_v2", + id="prj-example-1234.marketing.roi_datamart_v2", name="roi_datamart_v2", ) table_config.catalog_fqn = "bigquery.custom.fqn.override" diff --git a/tests/test_openmetadata_transformer.py b/tests/test_openmetadata_transformer.py index 505453c..9ddfbd3 100644 --- a/tests/test_openmetadata_transformer.py +++ b/tests/test_openmetadata_transformer.py @@ -336,24 +336,24 @@ class TestHasTag: def 
test_has_tag_present(self): """Returns True when tag with matching FQN is in the list.""" tags = [ - {"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"}, + {"tagFQN": "AIAgent.Example", "name": "Example"}, {"tagFQN": "Tier.Tier1"}, ] - assert has_tag(tags, "AIAgent.FoundryAI") is True + assert has_tag(tags, "AIAgent.Example") is True def test_has_tag_absent(self): """Returns False when tag is not in the list.""" tags = [{"tagFQN": "Tier.Tier2"}] - assert has_tag(tags, "AIAgent.FoundryAI") is False + assert has_tag(tags, "AIAgent.Example") is False def test_has_tag_empty_list(self): """Returns False for empty tag list.""" - assert has_tag([], "AIAgent.FoundryAI") is False + assert has_tag([], "AIAgent.Example") is False def test_has_tag_partial_match(self): """Does not match partial FQN.""" - tags = [{"tagFQN": "AIAgent.FoundryAI_v2"}] - assert has_tag(tags, "AIAgent.FoundryAI") is False + tags = [{"tagFQN": "AIAgent.Example_v2"}] + assert has_tag(tags, "AIAgent.Example") is False class TestExtractTagNames: @@ -768,7 +768,7 @@ class TestTableToYamlDict: }, ], "tags": [ - {"name": "FoundryAI", "tagFQN": "AIAgent.FoundryAI"}, + {"name": "Example", "tagFQN": "AIAgent.Example"}, {"tagFQN": "Tier.Tier1"}, ], "owners": [ @@ -781,7 +781,7 @@ class TestTableToYamlDict: assert result["fqn"] == "bigquery.prj.dataset.order_economics" assert result["description"] == "Order-level economics data" assert result["owners"] == ["data_team"] - assert result["tags"] == ["FoundryAI", "Tier1"] + assert result["tags"] == ["Example", "Tier1"] assert result["tier"] == "Tier1" # Columns