fix(api): sample endpoint returns 500 for materialized BQ tables (#341)

* fix(api): v2 sample endpoint returns 500 for materialized BQ tables

build_sample in app/api/v2_sample.py checked only source_type ==
'bigquery' before routing to _fetch_bq_sample, so materialized
tables (source_type='bigquery', query_mode='materialized') attempted
a live BigQuery query for data that lives locally as parquet —
causing an unhandled exception and HTTP 500.

Fix mirrors the existing guard already in v2_schema.py (#261): skip
_fetch_bq_sample when query_mode='materialized' and fall through to
the local parquet read path. The parquet is the source of truth for
any materialized source regardless of source_type.

Regression test test_materialized_bq_table_reads_parquet_not_bq
patches _fetch_bq_sample with a sentinel, registers a materialized
BQ table, calls build_sample, and asserts (a) the sentinel was never
hit and (b) rows came from the local parquet.

Credit @davidrybar-grpn (#341, cleaned + rebased onto post-#340 main).

* release: 0.54.28 — v2 sample endpoint materialized-BQ 500 fix

---------

Co-authored-by: ZdenekSrotyr <zdenek.srotyr@keboola.com>
This commit is contained in:
David Rybar 2026-05-18 22:57:32 +02:00 committed by GitHub
parent 86933a2cb5
commit e11f03eb60
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 69 additions and 2 deletions

View file

@ -10,6 +10,19 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
## [Unreleased]
## [0.54.28] — 2026-05-18
### Fixed
- `/api/v2/sample` (and `agnes describe`) no longer returns HTTP 500
for materialized BigQuery tables (`source_type='bigquery'`,
`query_mode='materialized'`). The handler previously routed any
`source_type='bigquery'` row to `_fetch_bq_sample` regardless of
query mode, attempting a live BigQuery query for data that lives
locally as parquet. Fix mirrors the existing guard in
`app/api/v2_schema.py` from #261 — materialized tables fall through
to the local parquet read path. Regression-locked by
`test_materialized_bq_table_reads_parquet_not_bq`. Closes #341.
## [0.54.27] — 2026-05-18
### Fixed

View file

@ -144,7 +144,7 @@ def build_sample(
if cached is not None:
return cached
if source_type == "bigquery":
if source_type == "bigquery" and (row.get("query_mode") or "") != "materialized":
rows = _fetch_bq_sample(bq, row.get("bucket") or "", row.get("source_table") or table_id, n)
else:
from app.utils import get_data_dir

View file

@ -1,6 +1,6 @@
[project]
name = "agnes-the-ai-analyst"
version = "0.54.27"
version = "0.54.28"
description = "Agnes — AI Data Analyst platform for AI analytical systems"
requires-python = ">=3.11,<3.14"
license = "MIT"

View file

@ -162,6 +162,60 @@ class TestSampleEndpoint:
finally:
conn.close()
def test_materialized_bq_table_reads_parquet_not_bq(self, reload_db, monkeypatch):
    """Regression: materialized BigQuery tables are sampled from local parquet.

    build_sample used to route every source_type='bigquery' row to
    _fetch_bq_sample, so a table registered with query_mode='materialized'
    triggered a live BQ query for data that lives locally as parquet,
    causing HTTP 500. After the fix, query_mode='materialized' must always
    fall through to the local parquet read path, regardless of source_type.

    The test swaps _fetch_bq_sample for a recording stub, writes a two-row
    parquet file under the data dir, registers a materialized BQ table, and
    checks that the stub was never invoked and the rows match the parquet.
    """
    import duckdb as _duckdb
    from app.api import v2_sample
    from app.utils import get_data_dir

    # Start from an empty sample cache so build_sample cannot short-circuit
    # on a cached result.
    v2_sample._sample_cache.clear()

    # Recording stub: any call into the BigQuery path leaves a mark here.
    bq_hits = []

    def _recording_bq_fetch(*args, **kwargs):
        bq_hits.append(True)
        return []

    monkeypatch.setattr(v2_sample, "_fetch_bq_sample", _recording_bq_fetch)

    # Write the local parquet file that the materialized table should be read from.
    extract_dir = get_data_dir() / "extracts" / "bigquery" / "data"
    extract_dir.mkdir(parents=True, exist_ok=True)
    parquet_file = extract_dir / "order_economics.parquet"
    copy_sql = (
        "COPY (SELECT 'Los Angeles' AS customer_city, 100 AS orders "
        "UNION ALL SELECT 'New York', 80 AS orders) "
        f"TO '{parquet_file}' (FORMAT PARQUET)"
    )
    writer = _duckdb.connect(":memory:")
    try:
        writer.execute(copy_sql)
    finally:
        writer.close()

    conn = reload_db.get_system_db()
    try:
        _ensure_admin1(conn)
        from src.repositories.table_registry import TableRegistryRepository

        registry = TableRegistryRepository(conn)
        registry.register(
            id="order_economics",
            name="order_economics",
            source_type="bigquery",
            query_mode="materialized",
            bucket="finance_unit_economics",
            source_table="order_economics",
        )
        user = {"id": "admin1", "email": "a@x.com"}
        data = v2_sample.build_sample(conn, user, "order_economics", n=5, bq=_bq())
    finally:
        conn.close()

    # (a) The BigQuery path was never taken.
    assert not bq_hits, "_fetch_bq_sample must not be called for materialized tables"
    # (b) The returned rows are exactly the ones written to the parquet file.
    assert data["table_id"] == "order_economics"
    sampled_rows = data["rows"]
    assert len(sampled_rows) == 2
    assert {row["customer_city"] for row in sampled_rows} == {"Los Angeles", "New York"}
class TestBqAccessErrors:
"""Issue #134: structured 502 translation on BQ errors in sample path.