- tests/test_bq_cost_guardrail.py: assert fail-open warning is logged (test previously only proved fail-open doesn't crash; review note: warning is the only operator-visible signal of the silent failure). - extractor._wrap_admin_sql_for_jobs_api: docstring no longer claims DuckDB-flavor SQL is rejected — the function performs no inner-SQL validation; the v24 migration + register-time validator are the real enforcement points. - extractor.materialize_query: safe_path uses _escape_sql_string_literal instead of inline replace, for one-place-to-update consistency. - extractor: import hashlib hoisted to module-level imports.
153 lines
5.3 KiB
Python
153 lines
5.3 KiB
Python
"""materialize_query refuses to run when dry-run estimate exceeds the cap.
|
|
|
|
The cap is wired through `data_source.bigquery.max_bytes_per_materialize`
|
|
(read by the trigger pass; default 10 GiB; set 0 to disable). The dry-run
|
|
itself reuses `app.api.v2_scan._bq_dry_run_bytes` so cost-estimate logic
|
|
lives in exactly one place. Fail-open behaviour (DuckDB-syntax SQL the
|
|
native BQ client can't parse → estimate=0 → COPY proceeds with a warning)
|
|
is documented and exercised here too.
|
|
"""
|
|
import duckdb
|
|
import pytest
|
|
from contextlib import contextmanager
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from connectors.bigquery.access import BqAccess, BqProjects
|
|
from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
|
|
|
|
|
|
def _bq_with_seed(tables: dict[str, str] | None = None) -> BqAccess:
    """Return a stub ``BqAccess`` backed by a seeded in-memory DuckDB.

    ``tables`` maps a qualified table reference (for example
    ``"bq.test.tiny"``) to the SELECT body used to create that table. Each
    session opened through the returned object attaches an in-memory catalog
    named ``bq``, creates the schemas and tables, and yields the connection.

    A ``bigquery_query(project, sql_text)`` table macro is also defined so
    that SQL wrapped by ``_wrap_admin_sql_for_jobs_api`` resolves against the
    seeded tables instead of requiring the real BigQuery extension (same
    recipe as test_bq_materialize).
    """
    seed = tables or {}

    @contextmanager
    def _session(_projects):
        conn = duckdb.connect(":memory:")
        try:
            # Everything before the first dot-separated tail is the schema,
            # e.g. "bq.test.tiny" -> "bq.test".
            schemas = {ref.rsplit(".", 1)[0] for ref in seed}

            setup_sql = ["ATTACH ':memory:' AS bq"]
            setup_sql.extend(
                f"CREATE SCHEMA IF NOT EXISTS {schema}" for schema in schemas
            )
            setup_sql.extend(
                f"CREATE OR REPLACE TABLE {ref} AS {body}"
                for ref, body in seed.items()
            )
            # Stand-in for the BQ extension's bigquery_query(): the project
            # argument is ignored and the SQL text runs via query().
            setup_sql.append(
                "CREATE OR REPLACE MACRO bigquery_query(project, sql_text) "
                "AS TABLE SELECT * FROM query(sql_text)"
            )

            for statement in setup_sql:
                conn.execute(statement)
            yield conn
        finally:
            conn.close()

    projects = BqProjects(billing="test-billing", data="test-data")
    return BqAccess(
        projects,
        client_factory=lambda _p: MagicMock(),
        duckdb_session_factory=_session,
    )
|
|
|
|
|
|
def test_refuses_when_estimate_exceeds_cap(tmp_path):
    """An estimate above ``max_bytes`` raises ``MaterializeBudgetError``.

    The error must carry the table id, the estimated byte count and the cap.
    """
    cap_bytes = 10 * 2**30
    estimated_bytes = 100 * 2**30

    out_dir = tmp_path / "extracts" / "bigquery"
    out_dir.mkdir(parents=True)

    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    # Force the dry-run estimate so no real BigQuery call is made.
    dry_run = patch(
        "app.api.v2_scan._bq_dry_run_bytes", return_value=estimated_bytes
    )
    with dry_run, pytest.raises(MaterializeBudgetError) as exc:
        materialize_query(
            table_id="huge",
            sql="SELECT * FROM bq.test.tiny",
            bq=bq,
            output_dir=str(out_dir),
            max_bytes=cap_bytes,
        )

    raised = exc.value
    assert raised.table_id == "huge"
    assert raised.current == estimated_bytes
    assert raised.limit == cap_bytes
|
|
|
|
|
|
def test_proceeds_when_estimate_under_cap(tmp_path):
    """An estimate below ``max_bytes`` lets the materialization run."""
    out_dir = tmp_path / "extracts" / "bigquery"
    out_dir.mkdir(parents=True)

    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
    call_kwargs = dict(
        table_id="tiny",
        sql="SELECT * FROM bq.test.tiny",
        bq=bq,
        output_dir=str(out_dir),
        max_bytes=10 * 2**30,
    )

    # 1 KiB estimate against a 10 GiB cap.
    with patch("app.api.v2_scan._bq_dry_run_bytes", return_value=1024):
        stats = materialize_query(**call_kwargs)

    assert stats["rows"] == 1
|
|
|
|
|
|
def test_no_cap_skips_dry_run(tmp_path):
    """Omitting ``max_bytes`` means the dry-run helper is never invoked."""
    out_dir = tmp_path / "extracts" / "bigquery"
    out_dir.mkdir(parents=True)
    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    dry_run_patch = patch("app.api.v2_scan._bq_dry_run_bytes")
    with dry_run_patch as mock_dry:
        # max_bytes is deliberately not passed.
        stats = materialize_query(
            table_id="t1",
            sql="SELECT * FROM bq.test.tiny",
            bq=bq,
            output_dir=str(out_dir),
        )

    mock_dry.assert_not_called()
    assert stats["rows"] == 1
|
|
|
|
|
|
def test_zero_max_bytes_skips_dry_run(tmp_path):
    """``max_bytes=0`` is the documented sentinel that disables the guardrail."""
    out_dir = tmp_path / "extracts" / "bigquery"
    out_dir.mkdir(parents=True)
    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    dry_run_patch = patch("app.api.v2_scan._bq_dry_run_bytes")
    with dry_run_patch as mock_dry:
        stats = materialize_query(
            table_id="t1",
            sql="SELECT * FROM bq.test.tiny",
            bq=bq,
            output_dir=str(out_dir),
            max_bytes=0,
        )

    mock_dry.assert_not_called()
    assert stats["rows"] == 1
|
|
|
|
|
|
def test_dry_run_failure_is_fail_open(tmp_path, caplog):
    """A failing dry-run must not block the materialization.

    Whatever makes the estimate raise (DuckDB syntax, missing google lib,
    transient upstream failure), the COPY still runs and a warning is logged.
    That warning is what operators who want hard-fail have to watch for, so
    its presence is asserted here.
    """
    import logging

    out_dir = tmp_path / "extracts" / "bigquery"
    out_dir.mkdir(parents=True)
    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})

    capture_warnings = caplog.at_level(
        logging.WARNING, logger="connectors.bigquery.extractor"
    )
    failing_dry_run = patch(
        "app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
    )
    with capture_warnings, failing_dry_run:
        stats = materialize_query(
            table_id="t1",
            sql="SELECT * FROM bq.test.tiny",
            bq=bq,
            output_dir=str(out_dir),
            max_bytes=10 * 2**30,
        )

    assert stats["rows"] == 1
    assert "fail-open" in caplog.text
|