agnes-the-ai-analyst/tests/test_api_design_rules.py

"""API design rule enforcement — prevents new violations from accumulating.

Existing violations are captured in allowlists: visible, deliberate,
and documented so they can be shrunk over time.

See: https://github.com/keboola/agnes-the-ai-analyst/issues/337
"""

import os
from pathlib import Path

import pytest

SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "openapi.json"
_HTTP_METHODS = {"get", "post", "put", "delete", "patch", "head", "options"}

# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def spec():
    """Boot the app in test mode — same fixture strategy as test_openapi_snapshot."""
    os.environ.setdefault("TESTING", "1")
    from app.main import create_app

    return create_app().openapi()


def _ops(spec):
    for path, methods in spec.get("paths", {}).items():
        for method, op in methods.items():
            if method in _HTTP_METHODS:
                yield path, method, op


# ---------------------------------------------------------------------------
# Rule 1 — No new verbs in URL path segments
#
# Rationale: verb-in-URL encodes intent in the path rather than the HTTP method,
# which breaks REST client assumptions, prevents generic caching/retry logic,
# and makes the API surface harder to discover.
#
# Exceptions: RPC-style command-bus operations where the HTTP method genuinely
# cannot express the intent (e.g. fire-and-forget triggers, state machines).
# These are explicitly listed below so the allowlist is self-documenting.
# ---------------------------------------------------------------------------

_VERBS = frozenset({
    "trigger", "run", "activate", "deactivate", "approve", "reject", "revoke",
    "register", "discover", "refresh", "reset", "send", "import", "export",
    "push", "pull", "enable", "disable", "rebuild", "reload", "bulk", "precheck",
    "rescan",
})

# Existing violations — grandfathered. Do not extend this list.
# Each entry should include a brief note on why it is intentional RPC.
_VERB_PATH_ALLOWLIST = frozenset({
    # Command-bus triggers — fire-and-forget, no idiomatic REST resource
    "/api/sync/trigger",
    "/api/scripts/run",
    "/api/scripts/run-due",
    "/api/scripts/{script_id}/run",
    "/api/marketplaces/{marketplace_id}/sync",
    "/api/marketplaces/sync-all",
    # State transitions on governance resources
    "/api/memory/admin/approve",
    "/api/memory/admin/reject",
    "/api/memory/admin/revoke",
    "/api/memory/admin/bulk-update",
    # User lifecycle — activate/deactivate map to a boolean field (acceptable PATCH candidate)
    "/api/users/{user_id}/activate",
    "/api/users/{user_id}/deactivate",
    "/api/users/{user_id}/reset-password",
    # Admin operations — discovery + registration (complex multi-step, no single resource)
    "/api/admin/discover-and-register",
    "/api/admin/discover-tables",
    "/api/admin/register-table",
    "/api/admin/register-table/precheck",
    "/api/admin/metadata/{table_id}/push",
    "/api/admin/metrics/import",
    # Profile refresh — triggers async re-profiling of table metadata
    "/api/catalog/profile/{table_name}/refresh",
    # BQ metadata cache refresh — on-demand operator trigger for a single registry row
    "/api/v2/metadata-cache/refresh",
    # Cache warmup — manual trigger (idempotent fire-and-forget)
    "/api/admin/cache-warmup/run",
    # Store submission rescan — re-runs guardrail scan on an existing submission
    "/api/admin/store/submissions/{submission_id}/rescan",
    # Telemetry export — GET because it streams a report, not a resource collection
    "/api/admin/telemetry/export",
    # Auth flows — /auth/* uses verb-style paths by convention across the industry
    "/auth/email/send-link",
    "/auth/password/reset",
    "/auth/password/reset/confirm",
    "/auth/password/setup",
    "/auth/password/setup/confirm",
    "/auth/password/setup/request",
    # Sync sub-resources — "sync" is the resource namespace here, not the verb
    "/api/sync/manifest",
    "/api/sync/settings",
    "/api/sync/table-subscriptions",
})


def test_no_new_verbs_in_path(spec):
    """New path segments must not contain action verbs."""
    violations = []
    for path, method, _ in _ops(spec):
        if path in _VERB_PATH_ALLOWLIST:
            continue
        segs = [s for s in path.split("/") if s and not s.startswith("{")]
        hits = [s for s in segs if s.lower() in _VERBS]
        if hits:
            violations.append(f"  {method.upper():6} {path}  (verbs: {hits})")

    assert not violations, (
        f"{len(violations)} new verb-in-URL violation(s):\n" + "\n".join(violations) + "\n\n"
        "Fix: model the action as a resource state change (noun + HTTP method).\n"
        "If the operation is genuinely RPC (fire-and-forget, state machine), add to "
        "_VERB_PATH_ALLOWLIST with a comment explaining why."
    )


# ---------------------------------------------------------------------------
# Rule 2 — DELETE must return 204 No Content
#
# Rationale: DELETE is idempotent; 204 signals successful removal without a
# response body. Returning 200 with a body on DELETE conflates "removed" with
# "here is the removed representation" — which is a read concern, not a write one.
#
# No allowlist: the two pre-existing violations were fixed in this PR.
# ---------------------------------------------------------------------------


def test_delete_returns_204(spec):
    """DELETE operations must declare 204 No Content."""
    violations = []
    for path, method, op in _ops(spec):
        if method != "delete":
            continue
        codes = set(op.get("responses", {}).keys())
        if "204" not in codes:
            violations.append(f"  DELETE {path}  (declares: {sorted(codes)})")

    assert not violations, (
        f"{len(violations)} DELETE endpoint(s) not declaring 204:\n" + "\n".join(violations) + "\n\n"
        "Fix: return Response(status_code=204) and remove any response body.\n"
        "If the endpoint intentionally returns content after deletion, return 200 and "
        "add a response_model — then add it to an allowlist here with a comment."
    )


# ---------------------------------------------------------------------------
# Rule 3 — True creator POSTs must declare 201 Created
#
# Heuristic: a POST is a "creator" if the same path also has a GET method
# (i.e. it is a collection endpoint with read+write).  Pure RPC commands
# (/api/query, /api/sync/trigger) have no GET counterpart and are excluded.
#
# Allowlist: false positives from the heuristic (upserts, config saves,
# auth flows that respond with 200 by design).
# ---------------------------------------------------------------------------

_CREATOR_POST_ALLOWLIST = frozenset({
    # Config upserts — update existing config, not create a new resource
    "/api/admin/server-config",
    "/api/sync/settings",
    # Subscription upsert — sets per-table enabled flags, not a pure create
    "/api/sync/table-subscriptions",
    # Auth flows — 200 is conventional for token/session responses
    "/auth/email/verify",
    "/auth/password/reset",
    "/auth/password/setup",
    # Register/update upsert — saves config, not a pure create
    "/api/admin/initial-workspace",
    # Saved-view upsert — ON CONFLICT updates existing name rather than creating
    "/api/admin/observability/views",
})


def test_creator_post_declares_201(spec):
    """POST on a collection endpoint (path also has GET) must declare 201 or 202."""
    violations = []
    paths = spec.get("paths", {})
    for path, methods in paths.items():
        if "post" not in methods or "get" not in methods:
            continue
        if path in _CREATOR_POST_ALLOWLIST:
            continue
        last = path.rstrip("/").split("/")[-1]
        if last.startswith("{"):
            continue  # item endpoint, not collection
        op = methods["post"]
        codes = set(op.get("responses", {}).keys())
        if "201" not in codes and "202" not in codes:
            violations.append(f"  POST {path}  (declares: {sorted(codes)})")

    assert not violations, (
        f"{len(violations)} creator POST(s) missing 201/202:\n" + "\n".join(violations) + "\n\n"
        "Fix: add responses={{201: {{...}}}} (sync create) or 202 (async create) to the decorator.\n"
        "If the POST is an upsert or config save rather than a create, add to "
        "_CREATOR_POST_ALLOWLIST with a comment."
    )


# ---------------------------------------------------------------------------
# Rule 4 — Protected /api/* endpoints must declare 401 and 403
#
# Rationale: auth errors are real contract elements. Clients (including LLMs)
# that read the spec to understand retry / fallback behaviour need to know
# these codes exist.  The declarations are injected centrally via
# _add_auth_error_responses() in app/main.py, so per-route boilerplate is
# not required.
#
# Public paths: intentionally unauthenticated (health probes, auth entry points).
# ---------------------------------------------------------------------------

_PUBLIC_API_PATHS = frozenset({
    "/api/health",
    "/api/health/detailed",
    "/api/version",
})


def test_protected_endpoints_declare_auth_errors(spec):
    """Every /api/* endpoint not in PUBLIC must declare 401 and 403."""
    violations = []
    for path, method, op in _ops(spec):
        if not path.startswith("/api/"):
            continue
        if path in _PUBLIC_API_PATHS:
            continue
        codes = set(op.get("responses", {}).keys())
        missing = [c for c in ("401", "403") if c not in codes]
        if missing:
            violations.append(
                f"  {method.upper():6} {path}  (missing: {', '.join(missing)})"
            )

    assert not violations, (
        f"{len(violations)} protected endpoint(s) missing auth error declarations:\n"
        + "\n".join(violations[:40])
        + ("\n  … (truncated)" if len(violations) > 40 else "")
        + "\n\nFix: ensure the path is covered by _add_auth_error_responses() in app/main.py, "
        "or add to _PUBLIC_API_PATHS above if it is genuinely unauthenticated."
    )