From 10d7bd62f881ce1ffbfd100408c2df3bc75839fb Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Sun, 3 May 2026 16:23:35 +0200 Subject: [PATCH] fix(bq): #160 wrap views via bigquery_query() for VIEW/MATERIALIZED_VIEW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #160: da query --remote against query_mode='remote' BQ rows whose underlying entity is a VIEW or MATERIALIZED_VIEW returned a DuckDB catalog error because the extractor (with legacy_wrap_views=False default since the v2 fetch primitives release) skipped master-view creation for those entity types — but kept inserting the _meta row, leaving operators with a registered name that resolves to nothing. Always create a master view for entity types we have proven runtime support for in this codebase: BASE TABLE → bq.""."" (Storage Read API path; predicate pushdown) VIEW / MAT_VIEW → bigquery_query('', 'SELECT * FROM `proj.ds.tbl`') (jobs API path; no pushdown — the upcoming /api/query cost guardrail bounds the scan; was the legacy legacy_wrap_views=True branch SQL form, just always-on) For other entity types (EXTERNAL, SNAPSHOT, CLONE, future), log a warning and SKIP both the master view AND the _meta row. The registry row remains intact so /api/v2/scan still works for `da fetch`; we just don't expose a stale _meta entry that the orchestrator would later strand. The legacy_wrap_views config knob is still readable in this commit (read returns the value, which is then ignored). Removal across the rest of the codebase happens in the follow-up REFACTOR commit. tests/test_bigquery_extractor.py: - Add 3 RED tests covering the new always-wrap behavior: test_view_creates_wrap_view_with_default_config, test_materialized_view_creates_wrap_view_with_default_config, test_unsupported_entity_type_skips_meta_and_view. - Fix pre-existing flakiness in test_main_exits_when_project_missing by resetting app.instance_config cache before the no-project mock — the prior test populates the cache with a project, and removing the legacy_wrap_views get_value() call surfaced this latent ordering bug. --- connectors/bigquery/extractor.py | 41 ++++++----- tests/test_bigquery_extractor.py | 115 +++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 18 deletions(-) diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py index e3e7838..bf95c2b 100644 --- a/connectors/bigquery/extractor.py +++ b/connectors/bigquery/extractor.py @@ -222,25 +222,26 @@ def _init_extract_locked( f"BQ entity {project_id}.{dataset}.{source_table} not found" ) - legacy_wrap_views = bool( - get_value("data_source", "bigquery", "legacy_wrap_views", default=False) - ) - + # Issue #160: always create a master view for query_mode='remote' + # rows we have proven runtime support for. + # BASE TABLE → catalog path (Storage Read API, predicate pushdown) + # VIEW / MATERIALIZED_VIEW → bigquery_query() (jobs API, no + # pushdown — cost is bounded by the /api/query guardrail) + # Other entity types (EXTERNAL, SNAPSHOT, CLONE, future) are + # logged + skipped, with NO _meta row, since orchestrator-side + # master-view creation requires a corresponding inner view. if entity_type == "BASE TABLE": - # Storage Read API — fast for full scans view_sql = ( f'CREATE OR REPLACE VIEW "{table_name}" AS ' f'SELECT * FROM bq."{dataset}"."{source_table}"' ) conn.execute(view_sql) - elif legacy_wrap_views: - # Backwards compatibility — for one release cycle only. - if entity_type not in ("VIEW", "MATERIALIZED_VIEW"): - logger.warning( - "Unknown BQ entity type %r for %s.%s.%s — using bigquery_query() path", - entity_type, project_id, dataset, source_table, - ) - # VIEW or MATERIALIZED_VIEW — use jobs API + elif entity_type in ("VIEW", "MATERIALIZED_VIEW"): + # `dataset` and `source_table` are validated above by + # validate_quoted_identifier; project_id is validated at + # the entry boundary of init_extract (lines 152-160). + # The .replace("'", "''") is defense-in-depth on the + # inline literal. bq_inner = f"SELECT * FROM `{project_id}.{dataset}.{source_table}`" bq_inner_escaped = bq_inner.replace("'", "''") view_sql = ( @@ -249,13 +250,17 @@ def _init_extract_locked( ) conn.execute(view_sql) else: - # Default: VIEW / MATERIALIZED_VIEW are recorded in _meta but no master - # view created. Analyst must use `da fetch` (v2 primitives) to materialise - # a snapshot locally. - logger.info( - "Skipping wrap view for %s entity %s.%s.%s — use `da fetch`", + # Unverified entity type. Skip both the wrap view and + # the _meta row. The registry row remains; /api/v2/scan + # can still operate from it (builds BQ SQL from + # bucket+source_table), and `da fetch` works. + logger.warning( + "Unverified BQ entity_type %r for %s.%s.%s — master view skipped. " + "Use `da fetch` for this row, or file an issue with a repro to " + "request native support.", entity_type, project_id, dataset, source_table, ) + continue # Do NOT insert _meta — no inner view to point at. conn.execute( "INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')", diff --git a/tests/test_bigquery_extractor.py b/tests/test_bigquery_extractor.py index cbc1fcd..f6879c3 100644 --- a/tests/test_bigquery_extractor.py +++ b/tests/test_bigquery_extractor.py @@ -557,6 +557,12 @@ class TestExtractorMainModule: def test_main_exits_when_project_missing(self, tmp_path, monkeypatch): """__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing.""" + # Reset the app.instance_config cache — `test_main_reads_data_source_bigquery_project` + # above populated it with a config that has the project set, and the + # cache survives across tests. Without this reset, `_resolve_bq_project_id` + # returns the stale cached value instead of the no-project mock below. + from app.instance_config import reset_cache + reset_cache() monkeypatch.setattr( "config.loader.load_instance_config", lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project @@ -642,6 +648,115 @@ class TestDropWrapViewForBQViews: assert any("bigquery_query(" in s for s in myview_sqls), \ f"legacy wrap view should use bigquery_query(); got: {myview_sqls}" + # ---- #160 always-wrap tests ------------------------------------------ + # Issue #160: query_mode='remote' BQ rows whose entity is VIEW or + # MATERIALIZED_VIEW must get a master view via bigquery_query() by default. + # Today (legacy_wrap_views=False default) they don't — `da query --remote` + # against such a table returns DuckDB catalog error. Fix: always wrap. + + def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch): + """VIEW entity must get a bigquery_query() wrap view by default + (not just when legacy_wrap_views=True). Closes #160.""" + from connectors.bigquery.extractor import init_extract + import app.instance_config as _ic + monkeypatch.setattr(_ic, "_instance_config", None, raising=False) + monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") + monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW") + + real_connect = duckdb.connect + captured = [] + + def safe_connect(*a, **kw): + return _CapturingProxy(real_connect(*a, **kw), captured) + monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) + + # Default config — no monkeypatch setting legacy_wrap_views + init_extract( + str(tmp_path), + "my-project", + [{"name": "ue", "bucket": "finance", "source_table": "ue", "description": ""}], + ) + + view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ue"' in s] + assert view_sqls != [], \ + f"VIEW entity must produce a wrap view by default; captured={captured}" + assert any("bigquery_query(" in s for s in view_sqls), \ + f"VIEW wrap view must use bigquery_query(); got: {view_sqls}" + assert any("`my-project.finance.ue`" in s for s in view_sqls), \ + f"wrap view must reference full project.dataset.table path; got: {view_sqls}" + + def test_materialized_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch): + """MATERIALIZED_VIEW entity must get a bigquery_query() wrap view by default.""" + from connectors.bigquery.extractor import init_extract + import app.instance_config as _ic + monkeypatch.setattr(_ic, "_instance_config", None, raising=False) + monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") + monkeypatch.setattr( + "connectors.bigquery.extractor._detect_table_type", + lambda *a, **kw: "MATERIALIZED_VIEW", + ) + + real_connect = duckdb.connect + captured = [] + + def safe_connect(*a, **kw): + return _CapturingProxy(real_connect(*a, **kw), captured) + monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) + + init_extract( + str(tmp_path), + "my-project", + [{"name": "mv", "bucket": "ds", "source_table": "mv", "description": ""}], + ) + + view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"mv"' in s] + assert view_sqls != [], \ + f"MATERIALIZED_VIEW must produce a wrap view by default; captured={captured}" + assert any("bigquery_query(" in s for s in view_sqls) + + def test_unsupported_entity_type_skips_meta_and_view(self, tmp_path, monkeypatch): + """For entity_types we don't have proven runtime support for + (EXTERNAL, SNAPSHOT, CLONE, future types), skip BOTH the master + view AND the _meta row. Today the _meta row is inserted + unconditionally → orchestrator sees a `_meta` entry pointing to a + non-existent inner view, then skips master-view creation, leaving + the operator with a registered-but-unqueryable name.""" + from connectors.bigquery.extractor import init_extract + import app.instance_config as _ic + monkeypatch.setattr(_ic, "_instance_config", None, raising=False) + monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") + monkeypatch.setattr( + "connectors.bigquery.extractor._detect_table_type", + lambda *a, **kw: "EXTERNAL", + ) + + real_connect = duckdb.connect + captured = [] + + def safe_connect(*a, **kw): + return _CapturingProxy(real_connect(*a, **kw), captured) + monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) + + init_extract( + str(tmp_path), + "my-project", + [{"name": "ext_tbl", "bucket": "ds", "source_table": "ext_tbl", "description": ""}], + ) + + # No CREATE VIEW for ext_tbl + view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() and '"ext_tbl"' in s] + assert view_sqls == [], \ + f"unsupported entity_type must NOT produce a wrap view; got {view_sqls}" + + # _meta row also skipped — no INSERT INTO _meta for ext_tbl + c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True) + try: + meta = c.execute("SELECT table_name FROM _meta").fetchall() + assert ("ext_tbl",) not in meta, \ + f"unsupported entity_type must NOT insert _meta row; got {meta}" + finally: + c.close() + class TestInitExtractProjectIdValidation: """init_extract must reject unsafe project_id before any auth or DB work."""