fix(bq): #160 wrap views via bigquery_query() for VIEW/MATERIALIZED_VIEW
Issue #160: da query --remote against query_mode='remote' BQ rows whose underlying entity is a VIEW or MATERIALIZED_VIEW returned a DuckDB catalog error because the extractor (with legacy_wrap_views=False default since the v2 fetch primitives release) skipped master-view creation for those entity types — but kept inserting the _meta row, leaving operators with a registered name that resolves to nothing. Always create a master view for entity types we have proven runtime support for in this codebase: BASE TABLE → bq."<dataset>"."<source_table>" (Storage Read API path; predicate pushdown) VIEW / MAT_VIEW → bigquery_query('<project>', 'SELECT * FROM `proj.ds.tbl`') (jobs API path; no pushdown — the upcoming /api/query cost guardrail bounds the scan; was the legacy legacy_wrap_views=True branch SQL form, just always-on) For other entity types (EXTERNAL, SNAPSHOT, CLONE, future), log a warning and SKIP both the master view AND the _meta row. The registry row remains intact so /api/v2/scan still works for `da fetch`; we just don't expose a stale _meta entry that the orchestrator would later strand. The legacy_wrap_views config knob is still readable in this commit (read returns the value, which is then ignored). Removal across the rest of the codebase happens in the follow-up REFACTOR commit. tests/test_bigquery_extractor.py: - Add 3 RED tests covering the new always-wrap behavior: test_view_creates_wrap_view_with_default_config, test_materialized_view_creates_wrap_view_with_default_config, test_unsupported_entity_type_skips_meta_and_view. - Fix pre-existing flakiness in test_main_exits_when_project_missing by resetting app.instance_config cache before the no-project mock — the prior test populates the cache with a project, and removing the legacy_wrap_views get_value() call surfaced this latent ordering bug.
This commit is contained in:
parent
4aa96f2546
commit
10d7bd62f8
2 changed files with 138 additions and 18 deletions
|
|
@ -222,25 +222,26 @@ def _init_extract_locked(
|
||||||
f"BQ entity {project_id}.{dataset}.{source_table} not found"
|
f"BQ entity {project_id}.{dataset}.{source_table} not found"
|
||||||
)
|
)
|
||||||
|
|
||||||
legacy_wrap_views = bool(
|
# Issue #160: always create a master view for query_mode='remote'
|
||||||
get_value("data_source", "bigquery", "legacy_wrap_views", default=False)
|
# rows we have proven runtime support for.
|
||||||
)
|
# BASE TABLE → catalog path (Storage Read API, predicate pushdown)
|
||||||
|
# VIEW / MATERIALIZED_VIEW → bigquery_query() (jobs API, no
|
||||||
|
# pushdown — cost is bounded by the /api/query guardrail)
|
||||||
|
# Other entity types (EXTERNAL, SNAPSHOT, CLONE, future) are
|
||||||
|
# logged + skipped, with NO _meta row, since orchestrator-side
|
||||||
|
# master-view creation requires a corresponding inner view.
|
||||||
if entity_type == "BASE TABLE":
|
if entity_type == "BASE TABLE":
|
||||||
# Storage Read API — fast for full scans
|
|
||||||
view_sql = (
|
view_sql = (
|
||||||
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||||
f'SELECT * FROM bq."{dataset}"."{source_table}"'
|
f'SELECT * FROM bq."{dataset}"."{source_table}"'
|
||||||
)
|
)
|
||||||
conn.execute(view_sql)
|
conn.execute(view_sql)
|
||||||
elif legacy_wrap_views:
|
elif entity_type in ("VIEW", "MATERIALIZED_VIEW"):
|
||||||
# Backwards compatibility — for one release cycle only.
|
# `dataset` and `source_table` are validated above by
|
||||||
if entity_type not in ("VIEW", "MATERIALIZED_VIEW"):
|
# validate_quoted_identifier; project_id is validated at
|
||||||
logger.warning(
|
# the entry boundary of init_extract (lines 152-160).
|
||||||
"Unknown BQ entity type %r for %s.%s.%s — using bigquery_query() path",
|
# The .replace("'", "''") is defense-in-depth on the
|
||||||
entity_type, project_id, dataset, source_table,
|
# inline literal.
|
||||||
)
|
|
||||||
# VIEW or MATERIALIZED_VIEW — use jobs API
|
|
||||||
bq_inner = f"SELECT * FROM `{project_id}.{dataset}.{source_table}`"
|
bq_inner = f"SELECT * FROM `{project_id}.{dataset}.{source_table}`"
|
||||||
bq_inner_escaped = bq_inner.replace("'", "''")
|
bq_inner_escaped = bq_inner.replace("'", "''")
|
||||||
view_sql = (
|
view_sql = (
|
||||||
|
|
@ -249,13 +250,17 @@ def _init_extract_locked(
|
||||||
)
|
)
|
||||||
conn.execute(view_sql)
|
conn.execute(view_sql)
|
||||||
else:
|
else:
|
||||||
# Default: VIEW / MATERIALIZED_VIEW are recorded in _meta but no master
|
# Unverified entity type. Skip both the wrap view and
|
||||||
# view created. Analyst must use `da fetch` (v2 primitives) to materialise
|
# the _meta row. The registry row remains; /api/v2/scan
|
||||||
# a snapshot locally.
|
# can still operate from it (builds BQ SQL from
|
||||||
logger.info(
|
# bucket+source_table), and `da fetch` works.
|
||||||
"Skipping wrap view for %s entity %s.%s.%s — use `da fetch`",
|
logger.warning(
|
||||||
|
"Unverified BQ entity_type %r for %s.%s.%s — master view skipped. "
|
||||||
|
"Use `da fetch` for this row, or file an issue with a repro to "
|
||||||
|
"request native support.",
|
||||||
entity_type, project_id, dataset, source_table,
|
entity_type, project_id, dataset, source_table,
|
||||||
)
|
)
|
||||||
|
continue # Do NOT insert _meta — no inner view to point at.
|
||||||
|
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
|
"INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
|
||||||
|
|
|
||||||
|
|
@ -557,6 +557,12 @@ class TestExtractorMainModule:
|
||||||
|
|
||||||
def test_main_exits_when_project_missing(self, tmp_path, monkeypatch):
|
def test_main_exits_when_project_missing(self, tmp_path, monkeypatch):
|
||||||
"""__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing."""
|
"""__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing."""
|
||||||
|
# Reset the app.instance_config cache — `test_main_reads_data_source_bigquery_project`
|
||||||
|
# above populated it with a config that has the project set, and the
|
||||||
|
# cache survives across tests. Without this reset, `_resolve_bq_project_id`
|
||||||
|
# returns the stale cached value instead of the no-project mock below.
|
||||||
|
from app.instance_config import reset_cache
|
||||||
|
reset_cache()
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"config.loader.load_instance_config",
|
"config.loader.load_instance_config",
|
||||||
lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project
|
lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project
|
||||||
|
|
@ -642,6 +648,115 @@ class TestDropWrapViewForBQViews:
|
||||||
assert any("bigquery_query(" in s for s in myview_sqls), \
|
assert any("bigquery_query(" in s for s in myview_sqls), \
|
||||||
f"legacy wrap view should use bigquery_query(); got: {myview_sqls}"
|
f"legacy wrap view should use bigquery_query(); got: {myview_sqls}"
|
||||||
|
|
||||||
|
# ---- #160 always-wrap tests ------------------------------------------
|
||||||
|
# Issue #160: query_mode='remote' BQ rows whose entity is VIEW or
|
||||||
|
# MATERIALIZED_VIEW must get a master view via bigquery_query() by default.
|
||||||
|
# Today (legacy_wrap_views=False default) they don't — `da query --remote`
|
||||||
|
# against such a table returns DuckDB catalog error. Fix: always wrap.
|
||||||
|
|
||||||
|
def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
|
||||||
|
"""VIEW entity must get a bigquery_query() wrap view by default
|
||||||
|
(not just when legacy_wrap_views=True). Closes #160."""
|
||||||
|
from connectors.bigquery.extractor import init_extract
|
||||||
|
import app.instance_config as _ic
|
||||||
|
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW")
|
||||||
|
|
||||||
|
real_connect = duckdb.connect
|
||||||
|
captured = []
|
||||||
|
|
||||||
|
def safe_connect(*a, **kw):
|
||||||
|
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||||
|
|
||||||
|
# Default config — no monkeypatch setting legacy_wrap_views
|
||||||
|
init_extract(
|
||||||
|
str(tmp_path),
|
||||||
|
"my-project",
|
||||||
|
[{"name": "ue", "bucket": "finance", "source_table": "ue", "description": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ue"' in s]
|
||||||
|
assert view_sqls != [], \
|
||||||
|
f"VIEW entity must produce a wrap view by default; captured={captured}"
|
||||||
|
assert any("bigquery_query(" in s for s in view_sqls), \
|
||||||
|
f"VIEW wrap view must use bigquery_query(); got: {view_sqls}"
|
||||||
|
assert any("`my-project.finance.ue`" in s for s in view_sqls), \
|
||||||
|
f"wrap view must reference full project.dataset.table path; got: {view_sqls}"
|
||||||
|
|
||||||
|
def test_materialized_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
|
||||||
|
"""MATERIALIZED_VIEW entity must get a bigquery_query() wrap view by default."""
|
||||||
|
from connectors.bigquery.extractor import init_extract
|
||||||
|
import app.instance_config as _ic
|
||||||
|
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"connectors.bigquery.extractor._detect_table_type",
|
||||||
|
lambda *a, **kw: "MATERIALIZED_VIEW",
|
||||||
|
)
|
||||||
|
|
||||||
|
real_connect = duckdb.connect
|
||||||
|
captured = []
|
||||||
|
|
||||||
|
def safe_connect(*a, **kw):
|
||||||
|
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||||
|
|
||||||
|
init_extract(
|
||||||
|
str(tmp_path),
|
||||||
|
"my-project",
|
||||||
|
[{"name": "mv", "bucket": "ds", "source_table": "mv", "description": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"mv"' in s]
|
||||||
|
assert view_sqls != [], \
|
||||||
|
f"MATERIALIZED_VIEW must produce a wrap view by default; captured={captured}"
|
||||||
|
assert any("bigquery_query(" in s for s in view_sqls)
|
||||||
|
|
||||||
|
def test_unsupported_entity_type_skips_meta_and_view(self, tmp_path, monkeypatch):
|
||||||
|
"""For entity_types we don't have proven runtime support for
|
||||||
|
(EXTERNAL, SNAPSHOT, CLONE, future types), skip BOTH the master
|
||||||
|
view AND the _meta row. Today the _meta row is inserted
|
||||||
|
unconditionally → orchestrator sees a `_meta` entry pointing to a
|
||||||
|
non-existent inner view, then skips master-view creation, leaving
|
||||||
|
the operator with a registered-but-unqueryable name."""
|
||||||
|
from connectors.bigquery.extractor import init_extract
|
||||||
|
import app.instance_config as _ic
|
||||||
|
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"connectors.bigquery.extractor._detect_table_type",
|
||||||
|
lambda *a, **kw: "EXTERNAL",
|
||||||
|
)
|
||||||
|
|
||||||
|
real_connect = duckdb.connect
|
||||||
|
captured = []
|
||||||
|
|
||||||
|
def safe_connect(*a, **kw):
|
||||||
|
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||||
|
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
|
||||||
|
|
||||||
|
init_extract(
|
||||||
|
str(tmp_path),
|
||||||
|
"my-project",
|
||||||
|
[{"name": "ext_tbl", "bucket": "ds", "source_table": "ext_tbl", "description": ""}],
|
||||||
|
)
|
||||||
|
|
||||||
|
# No CREATE VIEW for ext_tbl
|
||||||
|
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ext_tbl"' in s]
|
||||||
|
assert view_sqls == [], \
|
||||||
|
f"unsupported entity_type must NOT produce a wrap view; got {view_sqls}"
|
||||||
|
|
||||||
|
# _meta row also skipped — no INSERT INTO _meta for ext_tbl
|
||||||
|
c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
|
||||||
|
try:
|
||||||
|
meta = c.execute("SELECT table_name FROM _meta").fetchall()
|
||||||
|
assert ("ext_tbl",) not in meta, \
|
||||||
|
f"unsupported entity_type must NOT insert _meta row; got {meta}"
|
||||||
|
finally:
|
||||||
|
c.close()
|
||||||
|
|
||||||
|
|
||||||
class TestInitExtractProjectIdValidation:
|
class TestInitExtractProjectIdValidation:
|
||||||
"""init_extract must reject unsafe project_id before any auth or DB work."""
|
"""init_extract must reject unsafe project_id before any auth or DB work."""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue