fix(bq): #160 wrap views via bigquery_query() for VIEW/MATERIALIZED_VIEW
Issue #160: da query --remote against query_mode='remote' BQ rows whose underlying entity is a VIEW or MATERIALIZED_VIEW returned a DuckDB catalog error because the extractor (with legacy_wrap_views=False default since the v2 fetch primitives release) skipped master-view creation for those entity types — but kept inserting the _meta row, leaving operators with a registered name that resolves to nothing. Always create a master view for entity types we have proven runtime support for in this codebase: BASE TABLE → bq."<dataset>"."<source_table>" (Storage Read API path; predicate pushdown) VIEW / MAT_VIEW → bigquery_query('<project>', 'SELECT * FROM `proj.ds.tbl`') (jobs API path; no pushdown — the upcoming /api/query cost guardrail bounds the scan; was the legacy legacy_wrap_views=True branch SQL form, just always-on) For other entity types (EXTERNAL, SNAPSHOT, CLONE, future), log a warning and SKIP both the master view AND the _meta row. The registry row remains intact so /api/v2/scan still works for `da fetch`; we just don't expose a stale _meta entry that the orchestrator would later strand. The legacy_wrap_views config knob is still readable in this commit (read returns the value, which is then ignored). Removal across the rest of the codebase happens in the follow-up REFACTOR commit. tests/test_bigquery_extractor.py: - Add 3 RED tests covering the new always-wrap behavior: test_view_creates_wrap_view_with_default_config, test_materialized_view_creates_wrap_view_with_default_config, test_unsupported_entity_type_skips_meta_and_view. - Fix pre-existing flakiness in test_main_exits_when_project_missing by resetting app.instance_config cache before the no-project mock — the prior test populates the cache with a project, and removing the legacy_wrap_views get_value() call surfaced this latent ordering bug.
This commit is contained in:
parent
4aa96f2546
commit
10d7bd62f8
2 changed files with 138 additions and 18 deletions
|
|
@ -222,25 +222,26 @@ def _init_extract_locked(
|
|||
f"BQ entity {project_id}.{dataset}.{source_table} not found"
|
||||
)
|
||||
|
||||
legacy_wrap_views = bool(
|
||||
get_value("data_source", "bigquery", "legacy_wrap_views", default=False)
|
||||
)
|
||||
|
||||
# Issue #160: always create a master view for query_mode='remote'
|
||||
# rows we have proven runtime support for.
|
||||
# BASE TABLE → catalog path (Storage Read API, predicate pushdown)
|
||||
# VIEW / MATERIALIZED_VIEW → bigquery_query() (jobs API, no
|
||||
# pushdown — cost is bounded by the /api/query guardrail)
|
||||
# Other entity types (EXTERNAL, SNAPSHOT, CLONE, future) are
|
||||
# logged + skipped, with NO _meta row, since orchestrator-side
|
||||
# master-view creation requires a corresponding inner view.
|
||||
if entity_type == "BASE TABLE":
|
||||
# Storage Read API — fast for full scans
|
||||
view_sql = (
|
||||
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||
f'SELECT * FROM bq."{dataset}"."{source_table}"'
|
||||
)
|
||||
conn.execute(view_sql)
|
||||
elif legacy_wrap_views:
|
||||
# Backwards compatibility — for one release cycle only.
|
||||
if entity_type not in ("VIEW", "MATERIALIZED_VIEW"):
|
||||
logger.warning(
|
||||
"Unknown BQ entity type %r for %s.%s.%s — using bigquery_query() path",
|
||||
entity_type, project_id, dataset, source_table,
|
||||
)
|
||||
# VIEW or MATERIALIZED_VIEW — use jobs API
|
||||
elif entity_type in ("VIEW", "MATERIALIZED_VIEW"):
|
||||
# `dataset` and `source_table` are validated above by
|
||||
# validate_quoted_identifier; project_id is validated at
|
||||
# the entry boundary of init_extract (lines 152-160).
|
||||
# The .replace("'", "''") is defense-in-depth on the
|
||||
# inline literal.
|
||||
bq_inner = f"SELECT * FROM `{project_id}.{dataset}.{source_table}`"
|
||||
bq_inner_escaped = bq_inner.replace("'", "''")
|
||||
view_sql = (
|
||||
|
|
@ -249,13 +250,17 @@ def _init_extract_locked(
|
|||
)
|
||||
conn.execute(view_sql)
|
||||
else:
|
||||
# Default: VIEW / MATERIALIZED_VIEW are recorded in _meta but no master
|
||||
# view created. Analyst must use `da fetch` (v2 primitives) to materialise
|
||||
# a snapshot locally.
|
||||
logger.info(
|
||||
"Skipping wrap view for %s entity %s.%s.%s — use `da fetch`",
|
||||
# Unverified entity type. Skip both the wrap view and
|
||||
# the _meta row. The registry row remains; /api/v2/scan
|
||||
# can still operate from it (builds BQ SQL from
|
||||
# bucket+source_table), and `da fetch` works.
|
||||
logger.warning(
|
||||
"Unverified BQ entity_type %r for %s.%s.%s — master view skipped. "
|
||||
"Use `da fetch` for this row, or file an issue with a repro to "
|
||||
"request native support.",
|
||||
entity_type, project_id, dataset, source_table,
|
||||
)
|
||||
continue # Do NOT insert _meta — no inner view to point at.
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
|
||||
|
|
|
|||
|
|
@ -557,6 +557,12 @@ class TestExtractorMainModule:
|
|||
|
||||
def test_main_exits_when_project_missing(self, tmp_path, monkeypatch):
|
||||
"""__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing."""
|
||||
# Reset the app.instance_config cache — `test_main_reads_data_source_bigquery_project`
|
||||
# above populated it with a config that has the project set, and the
|
||||
# cache survives across tests. Without this reset, `_resolve_bq_project_id`
|
||||
# returns the stale cached value instead of the no-project mock below.
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
monkeypatch.setattr(
|
||||
"config.loader.load_instance_config",
|
||||
lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project
|
||||
|
|
@ -642,6 +648,115 @@ class TestDropWrapViewForBQViews:
|
|||
assert any("bigquery_query(" in s for s in myview_sqls), \
|
||||
f"legacy wrap view should use bigquery_query(); got: {myview_sqls}"
|
||||
|
||||
# ---- #160 always-wrap tests ------------------------------------------
|
||||
# Issue #160: query_mode='remote' BQ rows whose entity is VIEW or
|
||||
# MATERIALIZED_VIEW must get a master view via bigquery_query() by default.
|
||||
# Today (legacy_wrap_views=False default) they don't — `da query --remote`
|
||||
# against such a table returns DuckDB catalog error. Fix: always wrap.
|
||||
|
||||
def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
|
||||
"""VIEW entity must get a bigquery_query() wrap view by default
|
||||
(not just when legacy_wrap_views=True). Closes #160."""
|
||||
from connectors.bigquery.extractor import init_extract
|
||||
import app.instance_config as _ic
|
||||
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||
monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW")
|
||||
|
||||
real_connect = duckdb.connect
|
||||
captured = []
|
||||
|
||||
def safe_connect(*a, **kw):
|
||||
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||
|
||||
# Default config — no monkeypatch setting legacy_wrap_views
|
||||
init_extract(
|
||||
str(tmp_path),
|
||||
"my-project",
|
||||
[{"name": "ue", "bucket": "finance", "source_table": "ue", "description": ""}],
|
||||
)
|
||||
|
||||
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ue"' in s]
|
||||
assert view_sqls != [], \
|
||||
f"VIEW entity must produce a wrap view by default; captured={captured}"
|
||||
assert any("bigquery_query(" in s for s in view_sqls), \
|
||||
f"VIEW wrap view must use bigquery_query(); got: {view_sqls}"
|
||||
assert any("`my-project.finance.ue`" in s for s in view_sqls), \
|
||||
f"wrap view must reference full project.dataset.table path; got: {view_sqls}"
|
||||
|
||||
def test_materialized_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
|
||||
"""MATERIALIZED_VIEW entity must get a bigquery_query() wrap view by default."""
|
||||
from connectors.bigquery.extractor import init_extract
|
||||
import app.instance_config as _ic
|
||||
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor._detect_table_type",
|
||||
lambda *a, **kw: "MATERIALIZED_VIEW",
|
||||
)
|
||||
|
||||
real_connect = duckdb.connect
|
||||
captured = []
|
||||
|
||||
def safe_connect(*a, **kw):
|
||||
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||
|
||||
init_extract(
|
||||
str(tmp_path),
|
||||
"my-project",
|
||||
[{"name": "mv", "bucket": "ds", "source_table": "mv", "description": ""}],
|
||||
)
|
||||
|
||||
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"mv"' in s]
|
||||
assert view_sqls != [], \
|
||||
f"MATERIALIZED_VIEW must produce a wrap view by default; captured={captured}"
|
||||
assert any("bigquery_query(" in s for s in view_sqls)
|
||||
|
||||
def test_unsupported_entity_type_skips_meta_and_view(self, tmp_path, monkeypatch):
|
||||
"""For entity_types we don't have proven runtime support for
|
||||
(EXTERNAL, SNAPSHOT, CLONE, future types), skip BOTH the master
|
||||
view AND the _meta row. Today the _meta row is inserted
|
||||
unconditionally → orchestrator sees a `_meta` entry pointing to a
|
||||
non-existent inner view, then skips master-view creation, leaving
|
||||
the operator with a registered-but-unqueryable name."""
|
||||
from connectors.bigquery.extractor import init_extract
|
||||
import app.instance_config as _ic
|
||||
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor._detect_table_type",
|
||||
lambda *a, **kw: "EXTERNAL",
|
||||
)
|
||||
|
||||
real_connect = duckdb.connect
|
||||
captured = []
|
||||
|
||||
def safe_connect(*a, **kw):
|
||||
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||
|
||||
init_extract(
|
||||
str(tmp_path),
|
||||
"my-project",
|
||||
[{"name": "ext_tbl", "bucket": "ds", "source_table": "ext_tbl", "description": ""}],
|
||||
)
|
||||
|
||||
# No CREATE VIEW for ext_tbl
|
||||
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ext_tbl"' in s]
|
||||
assert view_sqls == [], \
|
||||
f"unsupported entity_type must NOT produce a wrap view; got {view_sqls}"
|
||||
|
||||
# _meta row also skipped — no INSERT INTO _meta for ext_tbl
|
||||
c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
|
||||
try:
|
||||
meta = c.execute("SELECT table_name FROM _meta").fetchall()
|
||||
assert ("ext_tbl",) not in meta, \
|
||||
f"unsupported entity_type must NOT insert _meta row; got {meta}"
|
||||
finally:
|
||||
c.close()
|
||||
|
||||
|
||||
class TestInitExtractProjectIdValidation:
|
||||
"""init_extract must reject unsafe project_id before any auth or DB work."""
|
||||
|
|
|
|||
Loading…
Reference in a new issue