agnes-the-ai-analyst/tests/test_bq_materialize.py
ZdenekSrotyr d8a2299633 fix(bq-materialize): wrap admin SQL in bigquery_query() so views work
Pre-fix, materialize ran the admin source_query as 'COPY (sql) TO parquet'
through the DuckDB BQ extension session. The extension defaults to the
BQ Storage Read API for bq.<ds>.<tbl> references, which rejects views
('non-table entities cannot be read with the storage API'). The fix
always wraps admin SQL into bigquery_query('<billing>', '<inner>') so
COPY uses the BQ jobs API uniformly for tables and views.

Cost guardrail dry-run now operates on the inner SQL (BQ-native), so
the BQ Python client parses it and the cap engages — pre-fix, the dry-run
hit 'Table-valued function not found: bigquery_query' and failed open.
2026-05-04 16:40:40 +02:00

150 lines
5.1 KiB
Python

"""BigQuery `materialize_query` writes parquet via BqAccess + DuckDB COPY.
The function takes a `BqAccess` instance so the BQ extension session and
SECRET token live in one place across the codebase (cf. `v2_scan` / `v2_sample`
/ `v2_schema`). Tests inject a stub BqAccess whose `duckdb_session()` yields
an in-memory connection with a pre-attached `bq` catalog containing fixture
tables, exercising the COPY path end-to-end without any GCP traffic.
"""
import duckdb
import pytest
from contextlib import contextmanager
from pathlib import Path
from unittest.mock import MagicMock
from connectors.bigquery.access import BqAccess, BqProjects
from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
def _make_stub_bq(tables: dict[str, str] | None = None) -> BqAccess:
    """Build a `BqAccess` backed entirely by in-memory fakes.

    `tables` maps DuckDB three-part references (e.g. `'bq.test.orders'`) to
    the SELECT expression used to seed each one. Passing `None` (or an empty
    mapping) yields a stub with no fixture tables.

    The DuckDB session factory attaches an in-memory database under the
    alias `bq`, creates the schemas and tables described by `tables`, and
    registers a `bigquery_query(project, sql_text)` table macro that simply
    runs `sql_text` locally, so SQL wrapped in `bigquery_query(...)` resolves
    without the real BQ extension.

    The client factory returns a `MagicMock` whose `.query(...)` hands back a
    job reporting `total_bytes_processed == 0`.
    """
    fixtures = tables if tables else {}

    @contextmanager
    def _open_session(_projects):
        db = duckdb.connect(":memory:")
        try:
            db.execute("ATTACH ':memory:' AS bq")
            # Everything left of the final dot is the catalog-qualified schema.
            needed_schemas = {name.rsplit(".", 1)[0] for name in fixtures}
            for schema_name in needed_schemas:
                db.execute(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
            for name, select_sql in fixtures.items():
                db.execute(f"CREATE OR REPLACE TABLE {name} AS {select_sql}")
            # Local stand-in for the BQ extension's table function: ignores
            # the project argument and evaluates the SQL text in this session.
            db.execute(
                "CREATE OR REPLACE MACRO bigquery_query(project, sql_text) "
                "AS TABLE SELECT * FROM query(sql_text)"
            )
            yield db
        finally:
            db.close()

    def _build_client(_projects):
        # Every .query(sql, job_config=...) call returns the same fake job,
        # which reports zero bytes processed.
        fake_job = MagicMock(total_bytes_processed=0)
        fake_client = MagicMock()
        fake_client.query.return_value = fake_job
        return fake_client

    projects = BqProjects(billing="test-billing", data="test-data")
    return BqAccess(
        projects,
        client_factory=_build_client,
        duckdb_session_factory=_open_session,
    )
def test_materialize_writes_parquet_and_returns_stats(tmp_path):
    """Happy path: an aggregate over a seeded table lands as parquet.

    Checks that `data/orders_summary.parquet` is created under the output
    directory, that the returned stats carry the row count, a positive size
    and the `materialized` query mode, and that the file reads back with the
    expected rows.
    """
    output_root = tmp_path / "extracts" / "bigquery"
    output_root.mkdir(parents=True)

    seed_sql = (
        "SELECT 'EU' AS region, 100 AS revenue UNION ALL "
        "SELECT 'US' AS region, 250 AS revenue"
    )
    stub = _make_stub_bq({"bq.test.orders": seed_sql})

    result = materialize_query(
        table_id="orders_summary",
        sql="SELECT region, SUM(revenue) AS revenue FROM bq.test.orders GROUP BY 1",
        bq=stub,
        output_dir=str(output_root),
    )

    written = output_root / "data" / "orders_summary.parquet"
    assert written.exists()
    assert result["rows"] == 2
    assert result["size_bytes"] > 0
    assert result["query_mode"] == "materialized"

    # Read the file back through a fresh DuckDB connection to prove it is
    # a valid parquet with the aggregated content.
    readback_sql = (
        f"SELECT region, revenue FROM read_parquet('{written}') ORDER BY region"
    )
    fetched = duckdb.connect().execute(readback_sql).fetchall()
    assert fetched == [("EU", 100), ("US", 250)]
def test_materialize_atomic_on_failure(tmp_path):
    """A query against a missing table must leave no parquet or tmp file."""
    output_root = tmp_path / "extracts" / "bigquery"
    output_root.mkdir(parents=True)
    data_dir = output_root / "data"
    final_file = data_dir / "broken.parquet"
    leftover_tmp = data_dir / "broken.parquet.tmp"

    stub = _make_stub_bq({"bq.test.orders": "SELECT 1 AS n"})
    failing_call = dict(
        table_id="broken",
        sql="SELECT * FROM bq.test.does_not_exist",
        bq=stub,
        output_dir=str(output_root),
    )

    with pytest.raises(Exception):
        materialize_query(**failing_call)

    # Neither the final artifact nor its temporary sibling may survive.
    assert not final_file.exists()
    assert not leftover_tmp.exists()
def test_materialize_rejects_unsafe_table_id(tmp_path):
    """A path-traversal style `table_id` is refused with a `ValueError`.

    The id is used to name the parquet file, so `../etc/passwd` must be
    rejected with an error message matching "unsafe".
    """
    output_root = tmp_path / "extracts" / "bigquery"
    output_root.mkdir(parents=True)
    stub = _make_stub_bq()
    hostile_id = "../etc/passwd"

    with pytest.raises(ValueError, match="unsafe"):
        materialize_query(
            table_id=hostile_id,
            sql="SELECT 1",
            bq=stub,
            output_dir=str(output_root),
        )
def test_materialize_overwrites_existing_parquet(tmp_path):
    """Materializing the same `table_id` twice keeps only the second result."""
    out = tmp_path / "extracts" / "bigquery"
    out.mkdir(parents=True)
    stub = _make_stub_bq({"bq.test.tiny": "SELECT 1 AS n"})

    # Two runs in sequence against the same target file; the later one wins.
    for statement in ("SELECT 1 AS n", "SELECT 2 AS n"):
        materialize_query(
            table_id="t1",
            sql=statement,
            bq=stub,
            output_dir=str(out),
        )

    readback_sql = f"SELECT n FROM read_parquet('{out}/data/t1.parquet')"
    fetched = duckdb.connect().execute(readback_sql).fetchall()
    assert fetched == [(2,)]