fix(bq): #160 wrap views via bigquery_query() for VIEW/MATERIALIZED_VIEW

Issue #160: da query --remote against query_mode='remote' BQ rows whose
underlying entity is a VIEW or MATERIALIZED_VIEW returned a DuckDB catalog
error because the extractor (with legacy_wrap_views=False default since
the v2 fetch primitives release) skipped master-view creation for those
entity types — but kept inserting the _meta row, leaving operators with a
registered name that resolves to nothing.

Always create a master view for entity types we have proven runtime support
for in this codebase:

  BASE TABLE          → bq."<dataset>"."<source_table>"
                        (Storage Read API path; predicate pushdown)
  VIEW / MAT_VIEW     → bigquery_query('<project>', 'SELECT * FROM `proj.ds.tbl`')
                        (jobs API path; no pushdown — the upcoming /api/query
                        cost guardrail bounds the scan; was the legacy
                        legacy_wrap_views=True branch SQL form, just always-on)

For other entity types (EXTERNAL, SNAPSHOT, CLONE, future), log a warning
and SKIP both the master view AND the _meta row. The registry row remains
intact so /api/v2/scan still works for `da fetch`; we just don't expose a
stale _meta entry that the orchestrator would later strand.

The legacy_wrap_views config knob is still readable in this commit (read
returns the value, which is then ignored). Removal across the rest of
the codebase happens in the follow-up REFACTOR commit.

tests/test_bigquery_extractor.py:
- Add 3 RED tests covering the new always-wrap behavior:
  test_view_creates_wrap_view_with_default_config,
  test_materialized_view_creates_wrap_view_with_default_config,
  test_unsupported_entity_type_skips_meta_and_view.
- Fix pre-existing flakiness in test_main_exits_when_project_missing
  by resetting app.instance_config cache before the no-project mock —
  the prior test populates the cache with a project, and removing the
  legacy_wrap_views get_value() call surfaced this latent ordering bug.
This commit is contained in:
ZdenekSrotyr 2026-05-03 16:23:35 +02:00
parent 4aa96f2546
commit 10d7bd62f8
2 changed files with 138 additions and 18 deletions

View file

@ -222,25 +222,26 @@ def _init_extract_locked(
f"BQ entity {project_id}.{dataset}.{source_table} not found"
)
legacy_wrap_views = bool(
get_value("data_source", "bigquery", "legacy_wrap_views", default=False)
)
# Issue #160: always create a master view for query_mode='remote'
# rows we have proven runtime support for.
# BASE TABLE → catalog path (Storage Read API, predicate pushdown)
# VIEW / MATERIALIZED_VIEW → bigquery_query() (jobs API, no
# pushdown — cost is bounded by the /api/query guardrail)
# Other entity types (EXTERNAL, SNAPSHOT, CLONE, future) are
# logged + skipped, with NO _meta row, since orchestrator-side
# master-view creation requires a corresponding inner view.
if entity_type == "BASE TABLE":
# Storage Read API — fast for full scans
view_sql = (
f'CREATE OR REPLACE VIEW "{table_name}" AS '
f'SELECT * FROM bq."{dataset}"."{source_table}"'
)
conn.execute(view_sql)
elif legacy_wrap_views:
# Backwards compatibility — for one release cycle only.
if entity_type not in ("VIEW", "MATERIALIZED_VIEW"):
logger.warning(
"Unknown BQ entity type %r for %s.%s.%s — using bigquery_query() path",
entity_type, project_id, dataset, source_table,
)
# VIEW or MATERIALIZED_VIEW — use jobs API
elif entity_type in ("VIEW", "MATERIALIZED_VIEW"):
# `dataset` and `source_table` are validated above by
# validate_quoted_identifier; project_id is validated at
# the entry boundary of init_extract (lines 152-160).
# The .replace("'", "''") is defense-in-depth on the
# inline literal.
bq_inner = f"SELECT * FROM `{project_id}.{dataset}.{source_table}`"
bq_inner_escaped = bq_inner.replace("'", "''")
view_sql = (
@ -249,13 +250,17 @@ def _init_extract_locked(
)
conn.execute(view_sql)
else:
# Default: VIEW / MATERIALIZED_VIEW are recorded in _meta but no master
# view created. Analyst must use `da fetch` (v2 primitives) to materialise
# a snapshot locally.
logger.info(
"Skipping wrap view for %s entity %s.%s.%s — use `da fetch`",
# Unverified entity type. Skip both the wrap view and
# the _meta row. The registry row remains; /api/v2/scan
# can still operate from it (builds BQ SQL from
# bucket+source_table), and `da fetch` works.
logger.warning(
"Unverified BQ entity_type %r for %s.%s.%s — master view skipped. "
"Use `da fetch` for this row, or file an issue with a repro to "
"request native support.",
entity_type, project_id, dataset, source_table,
)
continue # Do NOT insert _meta — no inner view to point at.
conn.execute(
"INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",

View file

@ -557,6 +557,12 @@ class TestExtractorMainModule:
def test_main_exits_when_project_missing(self, tmp_path, monkeypatch):
"""__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing."""
# Reset the app.instance_config cache — `test_main_reads_data_source_bigquery_project`
# above populated it with a config that has the project set, and the
# cache survives across tests. Without this reset, `_resolve_bq_project_id`
# returns the stale cached value instead of the no-project mock below.
from app.instance_config import reset_cache
reset_cache()
monkeypatch.setattr(
"config.loader.load_instance_config",
lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project
@ -642,6 +648,115 @@ class TestDropWrapViewForBQViews:
assert any("bigquery_query(" in s for s in myview_sqls), \
f"legacy wrap view should use bigquery_query(); got: {myview_sqls}"
# ---- #160 always-wrap tests ------------------------------------------
# Issue #160: query_mode='remote' BQ rows whose entity is VIEW or
# MATERIALIZED_VIEW must get a master view via bigquery_query() by default.
# Today (legacy_wrap_views=False default) they don't — `da query --remote`
# against such a table returns DuckDB catalog error. Fix: always wrap.
def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
"""VIEW entity must get a bigquery_query() wrap view by default
(not just when legacy_wrap_views=True). Closes #160."""
from connectors.bigquery.extractor import init_extract
import app.instance_config as _ic
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW")
real_connect = duckdb.connect
captured = []
def safe_connect(*a, **kw):
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
# Default config — no monkeypatch setting legacy_wrap_views
init_extract(
str(tmp_path),
"my-project",
[{"name": "ue", "bucket": "finance", "source_table": "ue", "description": ""}],
)
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ue"' in s]
assert view_sqls != [], \
f"VIEW entity must produce a wrap view by default; captured={captured}"
assert any("bigquery_query(" in s for s in view_sqls), \
f"VIEW wrap view must use bigquery_query(); got: {view_sqls}"
assert any("`my-project.finance.ue`" in s for s in view_sqls), \
f"wrap view must reference full project.dataset.table path; got: {view_sqls}"
def test_materialized_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
"""MATERIALIZED_VIEW entity must get a bigquery_query() wrap view by default."""
from connectors.bigquery.extractor import init_extract
import app.instance_config as _ic
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
monkeypatch.setattr(
"connectors.bigquery.extractor._detect_table_type",
lambda *a, **kw: "MATERIALIZED_VIEW",
)
real_connect = duckdb.connect
captured = []
def safe_connect(*a, **kw):
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
init_extract(
str(tmp_path),
"my-project",
[{"name": "mv", "bucket": "ds", "source_table": "mv", "description": ""}],
)
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"mv"' in s]
assert view_sqls != [], \
f"MATERIALIZED_VIEW must produce a wrap view by default; captured={captured}"
assert any("bigquery_query(" in s for s in view_sqls)
def test_unsupported_entity_type_skips_meta_and_view(self, tmp_path, monkeypatch):
"""For entity_types we don't have proven runtime support for
(EXTERNAL, SNAPSHOT, CLONE, future types), skip BOTH the master
view AND the _meta row. Today the _meta row is inserted
unconditionally orchestrator sees a `_meta` entry pointing to a
non-existent inner view, then skips master-view creation, leaving
the operator with a registered-but-unqueryable name."""
from connectors.bigquery.extractor import init_extract
import app.instance_config as _ic
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
monkeypatch.setattr(
"connectors.bigquery.extractor._detect_table_type",
lambda *a, **kw: "EXTERNAL",
)
real_connect = duckdb.connect
captured = []
def safe_connect(*a, **kw):
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
init_extract(
str(tmp_path),
"my-project",
[{"name": "ext_tbl", "bucket": "ds", "source_table": "ext_tbl", "description": ""}],
)
# No CREATE VIEW for ext_tbl
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ext_tbl"' in s]
assert view_sqls == [], \
f"unsupported entity_type must NOT produce a wrap view; got {view_sqls}"
# _meta row also skipped — no INSERT INTO _meta for ext_tbl
c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
try:
meta = c.execute("SELECT table_name FROM _meta").fetchall()
assert ("ext_tbl",) not in meta, \
f"unsupported entity_type must NOT insert _meta row; got {meta}"
finally:
c.close()
class TestInitExtractProjectIdValidation:
"""init_extract must reject unsafe project_id before any auth or DB work."""