"""Test fixtures for the clean-bootstrap test suite (Task 20). Boots a real FastAPI server via uvicorn subprocess so end-to-end paths exercise the same wsgi/asgi stack as production (cookie sessions, PAT verify, JWT, DB locks). Pre-seeds two users + three tables in the system DB before the subprocess starts so the test can authenticate immediately. Subprocess (not in-thread uvicorn): isolates the test's `DATA_DIR` and the load-bearing module-level singletons in `src.db` (cached system DB connection) and `app.instance_config` from the parent test runner. The existing E2E pattern in tests/test_e2e_corporate_memory.py uses the same shape; we reuse it here for the same reasons. Public API: - `fastapi_test_server` — yields `_ServerHandle` with `.url` + `.shutdown()`. - `web_session` — `httpx.Client` authenticated as admin via the form-login endpoint (cookie session). - `test_pat` — string PAT for analyst with grants to the `local` and `materialized` test tables. - `test_pat_no_grants` — string PAT for analyst with zero grants. - `zero_grants_workspace` — `Path` to a workspace where `agnes init` has run with `test_pat_no_grants` (no parquets, no rules). - `NONEXISTENT_TABLE` — module constant for Task 21's smoke matrix. """ from __future__ import annotations import os import socket import subprocess import sys import time from dataclasses import dataclass from pathlib import Path from typing import Iterator import httpx import pytest NONEXISTENT_TABLE = "__nonexistent__" ADMIN_EMAIL = "admin@example.com" ADMIN_PASSWORD = "test-admin-password-123" ANALYST_EMAIL = "analyst@example.com" ANALYST_PASSWORD = "test-analyst-password-123" # Test table fixtures — one per query_mode the manifest filter cares about. LOCAL_TABLE_ID = "local_tbl" MATERIALIZED_TABLE_ID = "materialized_tbl" REMOTE_TABLE_ID = "remote_tbl" # --------------------------------------------------------------------------- # Server handle # --------------------------------------------------------------------------- @dataclass class _ServerHandle: url: str data_dir: Path proc: subprocess.Popen admin_user_id: str analyst_user_id: str everyone_group_id: str admin_group_id: str def shutdown(self) -> None: if self.proc.poll() is None: self.proc.terminate() try: self.proc.wait(timeout=5) except subprocess.TimeoutExpired: self.proc.kill() # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _find_free_port() -> int: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("127.0.0.1", 0)) return s.getsockname()[1] def _seed_db(data_dir: Path) -> dict: """Open system.duckdb under DATA_DIR, run migrations, seed users + tables. Done before the uvicorn subprocess boots so: 1. The test can immediately log in as admin/analyst (passwords set up). 2. The subprocess inherits a ready DB on first request — no race between startup migrations and the test's first HTTP call. 3. We close the parent's connection at the end so the subprocess can acquire DuckDB's file lock. """ # Restrict the system-db path resolution to our tmp path. _seed_db # mutates module state in src.db; the caller's get_system_db() cache # is reset by tests/conftest._reset_module_caches but we still want # the subprocess child to see the same DATA_DIR. os.environ["DATA_DIR"] = str(data_dir) (data_dir / "state").mkdir(parents=True, exist_ok=True) (data_dir / "analytics").mkdir(parents=True, exist_ok=True) (data_dir / "extracts").mkdir(parents=True, exist_ok=True) # Defer imports until after DATA_DIR is set so any module that reads # the env at import time picks up our path. import uuid from argon2 import PasswordHasher from src.db import ( SYSTEM_ADMIN_GROUP, SYSTEM_EVERYONE_GROUP, close_system_db, get_system_db, ) from src.repositories.table_registry import TableRegistryRepository from src.repositories.user_group_members import UserGroupMembersRepository from src.repositories.users import UserRepository conn = get_system_db() try: ph = PasswordHasher() # --- Users ----------------------------------------------------- users = UserRepository(conn) admin_id = str(uuid.uuid4()) analyst_id = str(uuid.uuid4()) users.create( id=admin_id, email=ADMIN_EMAIL, name="Admin Tester", password_hash=ph.hash(ADMIN_PASSWORD), ) users.create( id=analyst_id, email=ANALYST_EMAIL, name="Analyst Tester", password_hash=ph.hash(ANALYST_PASSWORD), ) # --- System groups (Admin / Everyone are seeded by _ensure_schema). admin_group_row = conn.execute( "SELECT id FROM user_groups WHERE name = ?", [SYSTEM_ADMIN_GROUP], ).fetchone() everyone_group_row = conn.execute( "SELECT id FROM user_groups WHERE name = ?", [SYSTEM_EVERYONE_GROUP], ).fetchone() assert admin_group_row, "Admin system group not seeded" assert everyone_group_row, "Everyone system group not seeded" admin_group_id = admin_group_row[0] everyone_group_id = everyone_group_row[0] members = UserGroupMembersRepository(conn) members.add_member(admin_id, admin_group_id, source="system_seed") # The analyst is in Everyone so RBAC checks against the Everyone # group resolve cleanly. Admin is implicitly in Everyone for # historical reasons but the marketplace filter treats Admin as a # regular group, so we add it explicitly to be unambiguous. members.add_member(admin_id, everyone_group_id, source="system_seed") members.add_member(analyst_id, everyone_group_id, source="system_seed") # --- Tables (one per query_mode) ------------------------------- tables = TableRegistryRepository(conn) tables.register( id=LOCAL_TABLE_ID, name=LOCAL_TABLE_ID, source_type="keboola", bucket="test", source_table=LOCAL_TABLE_ID, query_mode="local", ) tables.register( id=MATERIALIZED_TABLE_ID, name=MATERIALIZED_TABLE_ID, source_type="bigquery", bucket="test", source_table=MATERIALIZED_TABLE_ID, query_mode="materialized", ) tables.register( id=REMOTE_TABLE_ID, name=REMOTE_TABLE_ID, source_type="bigquery", bucket="test", source_table=REMOTE_TABLE_ID, query_mode="remote", ) # --- Parquet files + sync_state for non-remote tables ----------- # The manifest builder iterates `sync_state` (not table_registry) and # `/api/data/{tid}/download` looks up parquet files under # `data_dir/extracts/.../data/`. Seeding both lets `agnes init` # exercise the full download path, not just the registry-only stub. # Each parquet is a single-row DuckDB COPY — minimal but valid (PAR1 # magic + metadata) so client-side `_is_valid_parquet` passes. from src.repositories.sync_state import SyncStateRepository sync_repo = SyncStateRepository(conn) extracts_data = data_dir / "extracts" / "test" / "data" extracts_data.mkdir(parents=True, exist_ok=True) for tid in (LOCAL_TABLE_ID, MATERIALIZED_TABLE_ID): parquet_path = extracts_data / f"{tid}.parquet" # COPY ... TO creates a real parquet via DuckDB's writer. conn.execute( f"COPY (SELECT 1 AS id, 'sample' AS label) " f"TO '{parquet_path}' (FORMAT PARQUET)" ) # Compute MD5 the same way `app/api/sync.py:_file_hash` and # `cli/lib/pull.py:_file_md5` do — chunked 8k reads. import hashlib h = hashlib.md5() with open(parquet_path, "rb") as fh: for chunk in iter(lambda: fh.read(8192), b""): h.update(chunk) sync_repo.update_sync( table_id=tid, rows=1, file_size_bytes=parquet_path.stat().st_size, hash=h.hexdigest(), ) finally: conn.close() # CRITICAL: release DuckDB's file lock so the uvicorn subprocess can # open the DB. The parent's cached connection is held by src.db at # module level; without close_system_db() the child blocks forever # on its first get_system_db() call. close_system_db() return { "admin_user_id": admin_id, "analyst_user_id": analyst_id, "admin_group_id": admin_group_id, "everyone_group_id": everyone_group_id, } def _wait_for_server(url: str, timeout_s: float = 30.0) -> None: """Poll /api/health until the server answers 200 or timeout.""" deadline = time.monotonic() + timeout_s while time.monotonic() < deadline: try: resp = httpx.get(f"{url}/api/health", timeout=1.0) if resp.status_code == 200: return except Exception: pass time.sleep(0.2) raise RuntimeError(f"Server at {url} not ready within {timeout_s}s") def _login_session(server_url: str, email: str, password: str) -> httpx.Client: """Form-login and return an httpx.Client with the access cookie set. The /auth/password/login/web endpoint sets `access_token` as an HttpOnly cookie and 302s to /dashboard. Using `follow_redirects=False` means we capture the cookie without chasing the redirect chain. """ client = httpx.Client(base_url=server_url, follow_redirects=False, timeout=10.0) resp = client.post( "/auth/password/login/web", data={"email": email, "password": password}, ) assert resp.status_code == 302, ( f"login expected 302 redirect, got {resp.status_code}: {resp.text[:300]}" ) # Sanity: the redirect Location must NOT carry ?error=invalid (form-login # bounces back to /login/password on bad creds with status 302 too). target = resp.headers.get("location", "") assert "error=" not in target, f"login failed: redirected to {target}" return client def _mint_pat(server_url: str, email: str, password: str, *, name: str) -> str: """Log in as the user via web-form, then POST /auth/tokens. Returns the raw JWT (returned exactly once by the create endpoint). PATs cannot mint other PATs (require_session_token), so we must use a cookie session, not a previously-minted PAT. """ session = _login_session(server_url, email, password) try: resp = session.post( "/auth/tokens", json={"name": name, "ttl_seconds": 3600}, ) assert resp.status_code == 201, ( f"PAT mint failed: {resp.status_code} {resp.text[:300]}" ) token = resp.json().get("token") assert token and isinstance(token, str), f"no token in response: {resp.text}" return token finally: session.close() def _grant_table_access(web_session: httpx.Client, group_id: str, table_id: str) -> None: """POST /api/admin/grants for `(group, "table", table_id)`. Idempotent: a 409 from the unique constraint is swallowed so the fixture can be reused with a pre-existing grant. """ resp = web_session.post( "/api/admin/grants", json={ "group_id": group_id, "resource_type": "table", "resource_id": table_id, }, ) if resp.status_code not in (201, 409): raise AssertionError( f"grant create failed: {resp.status_code} {resp.text[:300]}" ) # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def fastapi_test_server(tmp_path: Path) -> Iterator[_ServerHandle]: """Boot a real FastAPI server in a uvicorn subprocess against tmp_path DATA_DIR. Pre-seeds: - Two users: admin@example.com (Admin group) and analyst@example.com (Everyone group only). Both have argon2-hashed passwords usable via `/auth/password/login/web`. - Two system groups (Admin, Everyone) — created by `_ensure_schema`. - Three tables in `table_registry`, one per query_mode (local, materialized, remote). The subprocess inherits `DATA_DIR=tmp_path/agnes-data` plus whatever `JWT_SECRET_KEY` / `TESTING` is in the environment, so the parent process can verify JWTs against the same secret it issued. Port is allocated via `_find_free_port` so xdist workers don't collide. Server is shut down via SIGTERM in the fixture teardown. """ data_dir = tmp_path / "agnes-data" data_dir.mkdir(parents=True, exist_ok=True) seeded = _seed_db(data_dir) port = _find_free_port() url = f"http://127.0.0.1:{port}" env = os.environ.copy() env["DATA_DIR"] = str(data_dir) env["TESTING"] = "1" env["JWT_SECRET_KEY"] = os.environ.get( "JWT_SECRET_KEY", "test-secret-key-minimum-32-characters!!" ) # Disable LOCAL_DEV_MODE — the smoke test must exercise real auth. env.pop("LOCAL_DEV_MODE", None) proc = subprocess.Popen( [sys.executable, "-m", "uvicorn", "app.main:app", "--host", "127.0.0.1", "--port", str(port), "--log-level", "warning"], env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) try: _wait_for_server(url) except RuntimeError: proc.terminate() try: stdout = proc.stdout.read().decode("utf-8", errors="replace") if proc.stdout else "" except Exception: stdout = "" proc.wait(timeout=2) pytest.fail(f"fastapi_test_server failed to start on {url}\nstdout:\n{stdout[:3000]}") handle = _ServerHandle( url=url, data_dir=data_dir, proc=proc, admin_user_id=seeded["admin_user_id"], analyst_user_id=seeded["analyst_user_id"], admin_group_id=seeded["admin_group_id"], everyone_group_id=seeded["everyone_group_id"], ) try: yield handle finally: handle.shutdown() @pytest.fixture def web_session(fastapi_test_server: _ServerHandle) -> Iterator[httpx.Client]: """Authenticated httpx.Client (cookie session) for admin@example.com. Cookies persist across requests on the same client, so subsequent requests against admin-gated endpoints (e.g. POST /api/admin/grants) succeed without re-attaching the JWT. """ client = _login_session( fastapi_test_server.url, ADMIN_EMAIL, ADMIN_PASSWORD, ) try: yield client finally: client.close() @pytest.fixture def test_pat( fastapi_test_server: _ServerHandle, web_session: httpx.Client, ) -> str: """Mint a PAT for analyst@example.com with two table grants. Grants applied (to the Everyone group, which the analyst is a member of): `local_tbl` and `materialized_tbl`. The third seeded table (`remote_tbl`) is intentionally left ungranted so smoke matrices can distinguish "remote skip" from "no access". Memory items / mandatory rules: not seeded by this fixture; Tasks 21 and 22 will add them when needed via the same `web_session` admin client. Keeping memory off the critical path makes the fixture cheaper and the failure surface smaller. """ everyone_id = fastapi_test_server.everyone_group_id _grant_table_access(web_session, everyone_id, LOCAL_TABLE_ID) _grant_table_access(web_session, everyone_id, MATERIALIZED_TABLE_ID) return _mint_pat( fastapi_test_server.url, ANALYST_EMAIL, ANALYST_PASSWORD, name="test-pat-with-grants", ) @pytest.fixture def test_pat_no_grants(fastapi_test_server: _ServerHandle) -> str: """Mint a PAT for analyst@example.com with no resource_grants. The analyst is still in the Everyone group (so they can authenticate and call /api/sync/manifest), but no group they belong to has any table grants. The manifest will return zero tables; `agnes init` completes (no manifest_unauthorized error) with an empty workspace. """ return _mint_pat( fastapi_test_server.url, ANALYST_EMAIL, ANALYST_PASSWORD, name="test-pat-no-grants", ) @pytest.fixture def zero_grants_workspace( tmp_path: Path, fastapi_test_server: _ServerHandle, test_pat_no_grants: str, ) -> Path: """Run `agnes init` with the no-grants PAT; return the workspace path. Subprocess invocation (not in-process Typer call) so the test exercises the same path the paste-prompt installer uses. The CLI binary is the editable install at `.venv/bin/agnes`; we pass `AGNES_CONFIG_DIR=/agnes-config` so this test does not stomp on the developer's `~/.config/agnes/`. """ workspace = tmp_path / "workspace" workspace.mkdir() config_dir = tmp_path / "agnes-config" config_dir.mkdir() # Always invoke via `python -m cli.main` rather than the .venv/bin/agnes # console-script shim. The shim reads `from cli.main import app`, which # depends on `_editable_impl_agnes_the_ai_analyst.pth` being on disk and # discoverable. On macOS + iCloud Drive, the leading-underscore .pth # files get re-hidden by the system between unrelated tasks and the # shim then fails with `ModuleNotFoundError: No module named 'cli'`. # `python -m` uses the same interpreter without depending on .pth-file # visibility for CLI dispatch, so it is robust against that race. cmd: list[str] = [sys.executable, "-m", "cli.main"] env = os.environ.copy() env["AGNES_CONFIG_DIR"] = str(config_dir) env["AGNES_LOCAL_DIR"] = str(workspace) result = subprocess.run( cmd + [ "init", "--server-url", fastapi_test_server.url, "--token", test_pat_no_grants, "--workspace", str(workspace), ], env=env, capture_output=True, text=True, timeout=60, ) assert result.returncode == 0, ( f"agnes init failed (exit={result.returncode}):\n" f"--- stdout ---\n{result.stdout}\n" f"--- stderr ---\n{result.stderr}" ) return workspace