# agnes-the-ai-analyst/tests/test_bq_cost_guardrail.py

"""materialize_query refuses to run when dry-run estimate exceeds the cap.

The cap is wired through `data_source.bigquery.max_bytes_per_materialize`
(read by the trigger pass; default 10 GiB; set 0 to disable). The dry-run
itself reuses `app.api.v2_scan._bq_dry_run_bytes` so cost-estimate logic
lives in exactly one place. Fail-open behaviour (DuckDB-syntax SQL the
native BQ client can't parse → estimate=0 → COPY proceeds with a warning)
is documented and exercised here too.
"""
import duckdb
import pytest
from contextlib import contextmanager
from unittest.mock import MagicMock, patch

from connectors.bigquery.access import BqAccess, BqProjects
from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError


def _bq_with_seed(tables: dict[str, str] | None = None) -> BqAccess:
    """Return a stub ``BqAccess`` backed by seeded in-memory DuckDB sessions
    (same recipe as test_bq_materialize).

    ``tables`` maps a dotted table reference to the SELECT body used to
    create it. Every session opened through the returned object is a fresh
    in-memory DuckDB connection with a second in-memory database attached
    as ``bq``; the schema part of each reference (everything before the
    last dot) is created before the tables themselves.

    A `bigquery_query(project, sql_text)` table macro is registered as well,
    so the wrapping added by `_wrap_admin_sql_for_jobs_api` (Task 2 — routes
    COPY through the BQ jobs API for views) resolves against the in-memory
    tables without needing the real BQ extension.
    """
    seed = tables if tables else {}

    @contextmanager
    def _seeded_session(_projects):
        db = duckdb.connect(":memory:")
        try:
            # Collect the setup statements first, then run them in order:
            # attach -> schemas -> tables -> macro stub.
            setup_sql = ["ATTACH ':memory:' AS bq"]
            setup_sql.extend(
                f"CREATE SCHEMA IF NOT EXISTS {schema}"
                for schema in {name.rsplit(".", 1)[0] for name in seed}
            )
            setup_sql.extend(
                f"CREATE OR REPLACE TABLE {name} AS {select_body}"
                for name, select_body in seed.items()
            )
            # Stand-in for bigquery_query(): lets materialize_query's wrapped
            # COPY run against the in-memory bq catalog without the real
            # BQ extension.
            setup_sql.append(
                "CREATE OR REPLACE MACRO bigquery_query(project, sql_text) "
                "AS TABLE SELECT * FROM query(sql_text)"
            )
            for statement in setup_sql:
                db.execute(statement)
            yield db
        finally:
            db.close()

    projects = BqProjects(billing="test-billing", data="test-data")
    return BqAccess(
        projects,
        client_factory=lambda _p: MagicMock(),
        duckdb_session_factory=_seeded_session,
    )


def test_refuses_when_estimate_exceeds_cap(tmp_path):
    """A 100 GiB dry-run estimate against a 10 GiB cap must raise
    MaterializeBudgetError carrying the table id, the estimate and the cap."""
    gib = 2**30
    estimated_bytes = 100 * gib
    cap_bytes = 10 * gib

    target_dir = tmp_path / "extracts" / "bigquery"
    target_dir.mkdir(parents=True)

    access = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    fake_dry_run = patch(
        "app.api.v2_scan._bq_dry_run_bytes", return_value=estimated_bytes
    )
    with fake_dry_run, pytest.raises(MaterializeBudgetError) as raised:
        materialize_query(
            table_id="huge",
            sql="SELECT * FROM bq.test.tiny",
            bq=access,
            output_dir=str(target_dir),
            max_bytes=cap_bytes,
        )

    budget_error = raised.value
    assert budget_error.table_id == "huge"
    assert budget_error.current == estimated_bytes
    assert budget_error.limit == cap_bytes


def test_proceeds_when_estimate_under_cap(tmp_path):
    """A 1024-byte dry-run estimate under a 10 GiB cap lets the
    materialization run; the returned stats report the single seeded row."""
    target_dir = tmp_path / "extracts" / "bigquery"
    target_dir.mkdir(parents=True)

    access = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
    call_kwargs = dict(
        table_id="tiny",
        sql="SELECT * FROM bq.test.tiny",
        bq=access,
        output_dir=str(target_dir),
        max_bytes=10 * 2**30,
    )

    with patch("app.api.v2_scan._bq_dry_run_bytes", return_value=1024):
        result = materialize_query(**call_kwargs)

    assert result["rows"] == 1


def test_no_cap_skips_dry_run(tmp_path):
    """Without a max_bytes argument the dry-run helper is never invoked and
    the materialization still produces the seeded row."""
    target_dir = tmp_path / "extracts" / "bigquery"
    target_dir.mkdir(parents=True)
    access = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    # max_bytes is deliberately omitted so the callee's default applies.
    call_kwargs = dict(
        table_id="t1",
        sql="SELECT * FROM bq.test.tiny",
        bq=access,
        output_dir=str(target_dir),
    )

    with patch("app.api.v2_scan._bq_dry_run_bytes") as dry_run_spy:
        result = materialize_query(**call_kwargs)

    dry_run_spy.assert_not_called()
    assert result["rows"] == 1


def test_zero_max_bytes_skips_dry_run(tmp_path):
    """max_bytes=0 is the sentinel that switches the guardrail off (config
    docs): the dry-run helper is not called and the row is materialized."""
    target_dir = tmp_path / "extracts" / "bigquery"
    target_dir.mkdir(parents=True)
    access = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    call_kwargs = dict(
        table_id="t1",
        sql="SELECT * FROM bq.test.tiny",
        bq=access,
        output_dir=str(target_dir),
        max_bytes=0,
    )

    with patch("app.api.v2_scan._bq_dry_run_bytes") as dry_run_spy:
        result = materialize_query(**call_kwargs)

    dry_run_spy.assert_not_called()
    assert result["rows"] == 1


def test_dry_run_failure_is_fail_open(tmp_path, caplog):
    """A dry-run that raises (DuckDB syntax, missing google lib, transient
    upstream failure) must not block the COPY: the run proceeds and a
    warning mentioning "fail-open" is logged. Operators who need hard-fail
    watch logs for that warning."""
    import logging

    target_dir = tmp_path / "extracts" / "bigquery"
    target_dir.mkdir(parents=True)
    access = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    # Capture WARNING+ records from the extractor's logger only.
    capture_warnings = caplog.at_level(
        logging.WARNING, logger="connectors.bigquery.extractor"
    )
    exploding_dry_run = patch(
        "app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
    )
    with capture_warnings, exploding_dry_run:
        result = materialize_query(
            table_id="t1",
            sql="SELECT * FROM bq.test.tiny",
            bq=access,
            output_dir=str(target_dir),
            max_bytes=10 * 2**30,
        )

    assert result["rows"] == 1
    assert "fail-open" in caplog.text