Three behavioural improvements driven by the sub-agent end-to-end test
findings, plus scheduler tweaks to prevent the post-deploy contention
burst we measured.
CATALOG (catalog-side bugs the test agents tripped on):
- new entity_type field per remote row (BASE TABLE / VIEW /
MATERIALIZED VIEW). For views, rows + size_bytes return null
instead of the misleading 0 that __TABLES__ reports.
- where_examples now validates against the table's actual schema
(cached known_columns from refresh). The pre-fix behavior
blindly advertised `country_code = 'CZ'` on tables with no
country_code column — the sub-agent tests reliably hit this on
unit_economics.
- new known_columns + entity_type columns on bq_metadata_cache;
populated by bq_metadata_refresh.refresh_one from the same
fetch_bq_columns_full call (no extra BQ roundtrip) plus a
cheap INFORMATION_SCHEMA.TABLES lookup for table_type.
QUERY COST-GUARD:
- remote_scan_too_large suggestion now names views explicitly:
`Target(s) <ids> are VIEW or MATERIALIZED VIEW. BigQuery does
not push LIMIT into the view body — SELECT * FROM <view>
LIMIT 1 still runs the full underlying scan.` Programmatic
consumers get a new view_targets field on the error detail.
SCHEDULER HYGIENE (the post-deploy 1-minute window where
concurrent parquet downloads dropped to ~1 MB/s):
- SCHEDULER_STARTUP_GRACE_SECONDS (default 60) holds the first
tick so the burst doesn't overlap cache_warmup writes.
- SCHEDULER_BQ_METADATA_INITIAL_OFFSET_MAX_SECONDS (default 900)
randomises bq-metadata-refresh's first-fire offset.
TESTS:
- test_bq_metadata_cache_repo: entity_type + known_columns round-trip
- test_v2_catalog_remote_metadata: where_examples validation, views
return null rows/size_bytes, cold rows have empty examples
- test_api_query_guardrail: VIEW-aware suggestion text + view_targets
- test_connectors_bigquery_metadata: entity_type lookup mock + new
fields in TableMetadata expectations
- test_scheduler_sidecar: grace + jitter env-var resolution
261 lines
9.7 KiB
Python
261 lines
9.7 KiB
Python
"""BigQuery metadata provider — 5 paths from spec test plan:
|
|
happy / sentinel / VIEW / region-typo / both-paths-fail."""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.api._metadata_models import MetadataRequest, TableMetadata
|
|
|
|
|
|
@pytest.fixture
|
|
def req():
|
|
return MetadataRequest(
|
|
table_id="orders", bucket="dwh_base", source_table="orders_2024",
|
|
)
|
|
|
|
|
|
def _bq_with_session(table_storage_rows=None, columns_rows=None,
|
|
table_storage_raises=None, columns_raises=None,
|
|
legacy_tables_rows=None, legacy_tables_raises=None,
|
|
entity_type_rows=None, entity_type_raises=None,
|
|
projects_data="data-proj", projects_billing="billing-proj"):
|
|
"""Mock `BqAccess` whose `duckdb_session()` returns a context manager
|
|
routing `.execute(...)` based on the inner SQL string."""
|
|
bq = MagicMock()
|
|
bq.projects.data = projects_data
|
|
bq.projects.billing = projects_billing
|
|
|
|
def execute(outer_sql, params):
|
|
inner_sql = params[1] if len(params) > 1 else ""
|
|
if "TABLE_STORAGE" in inner_sql:
|
|
if table_storage_raises:
|
|
raise table_storage_raises
|
|
return MagicMock(
|
|
fetchone=lambda: table_storage_rows[0] if table_storage_rows else None,
|
|
fetchall=lambda: table_storage_rows or [],
|
|
)
|
|
if "INFORMATION_SCHEMA.COLUMNS" in inner_sql:
|
|
if columns_raises:
|
|
raise columns_raises
|
|
return MagicMock(
|
|
fetchall=lambda: columns_rows or [],
|
|
)
|
|
if "INFORMATION_SCHEMA.TABLES" in inner_sql:
|
|
# entity_type lookup added in 0.50.0 — order matters: this check
|
|
# must come BEFORE __TABLES__ because the substring overlaps.
|
|
if entity_type_raises:
|
|
raise entity_type_raises
|
|
return MagicMock(
|
|
fetchone=lambda: entity_type_rows[0] if entity_type_rows else None,
|
|
)
|
|
if "__TABLES__" in inner_sql:
|
|
if legacy_tables_raises:
|
|
raise legacy_tables_raises
|
|
return MagicMock(
|
|
fetchone=lambda: legacy_tables_rows[0] if legacy_tables_rows else None,
|
|
)
|
|
raise AssertionError(f"unexpected SQL: {inner_sql[:80]}")
|
|
|
|
session = MagicMock()
|
|
session.execute.side_effect = execute
|
|
cm = MagicMock()
|
|
cm.__enter__.return_value = session
|
|
cm.__exit__.return_value = False
|
|
bq.duckdb_session.return_value = cm
|
|
return bq
|
|
|
|
|
|
def _location_get_value(*keys, default=None):
|
|
"""Mock for `app.instance_config.get_value` matching its multi-positional
|
|
signature. Returns 'us-central1' for the BQ location key, default otherwise.
|
|
Regression-anchored to Devin Review #1: the prior buggy single-string call
|
|
silently dropped the configured location; this fixture intentionally
|
|
requires the correct ('data_source', 'bigquery', 'location') tuple."""
|
|
if keys == ("data_source", "bigquery", "location"):
|
|
return "us-central1"
|
|
return default
|
|
|
|
|
|
def test_happy_path_returns_full_metadata(req, monkeypatch):
|
|
"""TABLE_STORAGE returns rows+size, COLUMNS returns partition+cluster."""
|
|
from connectors.bigquery import metadata
|
|
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.metadata.get_value",
|
|
_location_get_value,
|
|
raising=False,
|
|
)
|
|
|
|
bq = _bq_with_session(
|
|
table_storage_rows=[(1234567, 5_000_000)],
|
|
columns_rows=[
|
|
("event_date", "DATE", "NO", "YES", None),
|
|
("country", "STRING", "YES", "NO", 1),
|
|
("user_id", "STRING", "NO", "NO", None),
|
|
],
|
|
entity_type_rows=[("BASE TABLE",)],
|
|
)
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
result = metadata.fetch(req)
|
|
assert result == TableMetadata(
|
|
rows=1234567,
|
|
size_bytes=5_000_000,
|
|
partition_by="event_date",
|
|
clustered_by=["country"],
|
|
entity_type="BASE TABLE",
|
|
known_columns=["event_date", "country", "user_id"],
|
|
)
|
|
|
|
|
|
def test_sentinel_unconfigured_returns_none_no_query(req):
|
|
"""`bq.projects.data == ''` → return None before any query."""
|
|
from connectors.bigquery import metadata
|
|
bq = _bq_with_session(projects_data="")
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
assert metadata.fetch(req) is None
|
|
bq.duckdb_session.assert_not_called()
|
|
|
|
|
|
def test_view_path_returns_metadata_with_null_rows_size(req, monkeypatch):
|
|
"""VIEW: TABLE_STORAGE empty + __TABLES__ empty → rows/size = None;
|
|
partition + cluster from COLUMNS still surface."""
|
|
from connectors.bigquery import metadata
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.metadata.get_value",
|
|
_location_get_value,
|
|
raising=False,
|
|
)
|
|
bq = _bq_with_session(
|
|
table_storage_rows=[], # view → no row
|
|
legacy_tables_rows=[], # view also absent from __TABLES__
|
|
columns_rows=[
|
|
("event_date", "DATE", "NO", "YES", None),
|
|
],
|
|
entity_type_rows=[("VIEW",)],
|
|
)
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
result = metadata.fetch(req)
|
|
assert result is not None
|
|
assert result.rows is None
|
|
assert result.size_bytes is None
|
|
assert result.partition_by == "event_date"
|
|
assert result.entity_type == "VIEW"
|
|
assert result.known_columns == ["event_date"]
|
|
|
|
|
|
def test_region_typo_falls_through_to_legacy_tables(req, monkeypatch):
|
|
"""TABLE_STORAGE raises (typo'd region) → fall through to __TABLES__."""
|
|
from connectors.bigquery import metadata
|
|
|
|
def typo_get_value(*keys, default=None):
|
|
if keys == ("data_source", "bigquery", "location"):
|
|
return "us-central" # typo!
|
|
return default
|
|
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.metadata.get_value",
|
|
typo_get_value,
|
|
raising=False,
|
|
)
|
|
bq = _bq_with_session(
|
|
table_storage_raises=RuntimeError("Not found: ..."),
|
|
legacy_tables_rows=[(100, 2048)],
|
|
columns_rows=[("event_date", "DATE", "NO", "YES", None)],
|
|
)
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
result = metadata.fetch(req)
|
|
assert result is not None
|
|
assert result.rows == 100
|
|
assert result.size_bytes == 2048
|
|
|
|
|
|
def test_both_paths_fail_returns_metadata_with_partition_only(req, monkeypatch):
|
|
"""Both TABLE_STORAGE and __TABLES__ fail → rows/size None, partition still fills."""
|
|
from connectors.bigquery import metadata
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.metadata.get_value",
|
|
_location_get_value,
|
|
raising=False,
|
|
)
|
|
bq = _bq_with_session(
|
|
table_storage_raises=RuntimeError("BQ down"),
|
|
legacy_tables_raises=RuntimeError("BQ still down"),
|
|
columns_rows=[("event_date", "DATE", "NO", "YES", None)],
|
|
)
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
result = metadata.fetch(req)
|
|
assert result is not None
|
|
assert result.rows is None
|
|
assert result.size_bytes is None
|
|
assert result.partition_by == "event_date"
|
|
|
|
|
|
def test_location_config_uses_multi_positional_get_value_args(req, monkeypatch):
|
|
"""Devin Review #1 regression: `get_value` was called with a single
|
|
dot-separated string `'data_source.bigquery.location'`, but the function
|
|
iterates over separate positional keys — so the call always returned None
|
|
and the BQ location config was never read.
|
|
|
|
This test records every call to `get_value` and asserts that the location
|
|
lookup goes through the correct multi-positional form
|
|
(`'data_source', 'bigquery', 'location'`)."""
|
|
from connectors.bigquery import metadata
|
|
|
|
calls: list[tuple] = []
|
|
|
|
def recording_get_value(*keys, default=None):
|
|
calls.append(keys)
|
|
if keys == ("data_source", "bigquery", "location"):
|
|
return "europe-west1"
|
|
return default
|
|
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.metadata.get_value",
|
|
recording_get_value,
|
|
raising=False,
|
|
)
|
|
|
|
captured: dict = {}
|
|
|
|
def execute(outer_sql, params):
|
|
if "TABLE_STORAGE" in (params[1] if len(params) > 1 else ""):
|
|
captured["table_storage_sql"] = params[1]
|
|
return MagicMock(fetchone=lambda: (5, 10))
|
|
return MagicMock(fetchall=lambda: [], fetchone=lambda: None)
|
|
|
|
bq = MagicMock()
|
|
bq.projects.data = "data-proj"
|
|
bq.projects.billing = "billing-proj"
|
|
session = MagicMock()
|
|
session.execute.side_effect = execute
|
|
cm = MagicMock()
|
|
cm.__enter__.return_value = session
|
|
cm.__exit__.return_value = False
|
|
bq.duckdb_session.return_value = cm
|
|
|
|
with patch("connectors.bigquery.metadata.get_bq_access", return_value=bq):
|
|
metadata.fetch(req)
|
|
|
|
# The fix: `get_value("data_source", "bigquery", "location")` must appear.
|
|
assert ("data_source", "bigquery", "location") in calls, (
|
|
f"expected ('data_source','bigquery','location') tuple in get_value "
|
|
f"calls, got: {calls}"
|
|
)
|
|
# And the configured location must reach the TABLE_STORAGE SQL — proving
|
|
# the value was actually consumed, not just looked up.
|
|
assert "region-europe-west1" in captured.get("table_storage_sql", ""), (
|
|
f"location config was not propagated to BQ SQL: "
|
|
f"{captured.get('table_storage_sql', '<no SQL captured>')}"
|
|
)
|
|
|
|
|
|
def test_bq_access_error_returns_none(req):
|
|
"""get_bq_access() raises BqAccessError → return None gracefully."""
|
|
from connectors.bigquery import metadata
|
|
from connectors.bigquery.access import BqAccessError
|
|
with patch(
|
|
"connectors.bigquery.metadata.get_bq_access",
|
|
side_effect=BqAccessError("not_configured", "not configured"),
|
|
):
|
|
assert metadata.fetch(req) is None
|