From 9d0e4e687d5567d637f6cbe273676b054e6625d1 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Sun, 3 May 2026 16:31:23 +0200 Subject: [PATCH] refactor(bq): #160 remove legacy_wrap_views config knob (always-wrap) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that VIEW/MATERIALIZED_VIEW always wrap via bigquery_query() (the prior `legacy_wrap_views=True` branch behavior, made unconditional in the previous commit), the toggle has no semantic meaning and is removed across the codebase. Production code: - app/api/admin.py: drop the field from _OPTIONAL_FIELDS["data_source"] ["bigquery"]["fields"] and from _BQ_OPTIONAL_FIELD_DEFAULTS, plus the comment block above the defaults dict. - config/instance.yaml.example: drop the example snippet. - src/orchestrator.py: update the inner-objects skip-branch comment to reflect the new BQ behavior (the skip itself stays — keboola use_extension=False still inserts _meta rows without inner views). - app/web/templates/admin_tables.html: rewrite operator copy in the register and edit forms to reflect always-wrap. Tests: - tests/test_admin_server_config.py (TestServerConfigBigQueryFields): flip assertions from "field IS present" to "field NOT present" on legacy_wrap_views. Drop the test_post_persists_legacy_wrap_views test since the field no longer exists. - tests/test_admin_server_config_known_fields.py: same flip on the known-fields registry assertion. - tests/test_bigquery_extractor.py: drop the obsolete test_view_entity_does_not_create_master_view_by_default (asserted the bug we fixed) and test_legacy_wrap_views_toggle_restores_old_behavior (toggle no longer meaningful). Update remaining test docstrings. Operators with `legacy_wrap_views: true` set in their overlay get the new (equivalent) behavior automatically — the unrecognized key is silently ignored by the YAML loader. Operators with `false` get the issue-#160 fix as a behavior change, not a regression. Spec gate updated: production code grep gate grep -rn 'legacy_wrap_views' connectors app src config cli must return zero. tests/ excluded — historical "removed in #160" breadcrumbs and `assert "X" not in fields` regression guards retained as anti-regression signals. --- app/api/admin.py | 15 --- app/web/templates/admin_tables.html | 19 ++-- config/instance.yaml.example | 8 -- ...5-03-issue-160-da-query-remote-fix-spec.md | 6 +- src/orchestrator.py | 9 +- tests/test_admin_server_config.py | 66 +++++-------- .../test_admin_server_config_known_fields.py | 7 +- tests/test_bigquery_extractor.py | 99 +++---------------- 8 files changed, 60 insertions(+), 169 deletions(-) diff --git a/app/api/admin.py b/app/api/admin.py index 325c235..d6dad71 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -226,18 +226,6 @@ _KNOWN_FIELDS: dict[str, dict[str, dict]] = { "Mismatch → 403 USER_PROJECT_DENIED on every BQ call." ), }, - "legacy_wrap_views": { - "kind": "bool", - "default": False, - "hint": ( - "When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB " - "master view via bigquery_query() (jobs API) so analysts can " - "SELECT * FROM viewname directly. When false (default), views are " - "catalog-only — analysts use `da fetch` or `da query --remote`. " - "ON for view-heavy deployments where most views are small enough " - "to materialize without 'Response too large' (issue #101)." - ), - }, "max_bytes_per_materialize": { "kind": "int", "default": 10737418240, @@ -795,13 +783,10 @@ class ServerConfigUpdateRequest(BaseModel): # # - billing_project: defaults to data project; explicit value needed when # the SA can read the data project but not bill against it. -# - legacy_wrap_views: default False; toggling ON wraps BQ views via -# `bigquery_query()` so analysts can `SELECT *` directly. # - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'` # (default 10 GiB; 0 disables; null falls through to the default). _BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = { "billing_project": "", - "legacy_wrap_views": False, "max_bytes_per_materialize": 10737418240, } diff --git a/app/web/templates/admin_tables.html b/app/web/templates/admin_tables.html index 23635ae..2e3aa17 100644 --- a/app/web/templates/admin_tables.html +++ b/app/web/templates/admin_tables.html @@ -891,11 +891,11 @@
Table or view name within the dataset. Click List tables after filling Dataset to populate autocomplete. -
Live access: BASE TABLEs are queryable directly via - da query --remote; VIEWs are registered but analysts must run - da fetch to materialize a local snapshot (or the admin can flip - data_source.bigquery.legacy_wrap_views=true to wrap views via the - BQ jobs API). +
Live access: BASE TABLEs query via + bq."dataset"."table" (Storage Read API; predicate pushdown). + VIEWs and MATERIALIZED_VIEWs query via the BQ jobs API (full-scan estimate; + cost-guarded by max_bytes_per_remote_query). + da query --remote works for both.
Synced access: handles both table and view transparently — the scheduler runs SELECT * through the jobs API and writes a parquet.
@@ -1048,10 +1048,11 @@ list="editBqTableList" placeholder="e.g. orders">
Table or view name within the dataset. -
Live access: BASE TABLEs are queryable directly via - da query --remote; VIEWs are registered but analysts must run - da fetch to materialize a local snapshot (or admin can flip - data_source.bigquery.legacy_wrap_views=true). +
Live access: BASE TABLEs query via + bq."dataset"."table" (Storage Read API; predicate pushdown). + VIEWs and MATERIALIZED_VIEWs query via the BQ jobs API (full-scan estimate; + cost-guarded by max_bytes_per_remote_query). + da query --remote works for both.
Synced access: handles both transparently — the scheduler runs SELECT * through the jobs API and writes a parquet.
diff --git a/config/instance.yaml.example b/config/instance.yaml.example index 1af9864..9cae9b2 100644 --- a/config/instance.yaml.example +++ b/config/instance.yaml.example @@ -122,14 +122,6 @@ data_source: # # Mismatch -> every BQ call 403 USER_PROJECT_DENIED. # # `da diagnose` warns when this falls back to `project`. # # Configurable via /admin/server-config UI. - # legacy_wrap_views: false # When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB - # # master view via bigquery_query() (jobs API) so analysts can - # # `SELECT * FROM viewname` directly. When false (default), views - # # are catalog-only -- analysts use `da fetch viewname` or - # # `da query --remote`. ON can cause "Response too large" on big - # # views; OFF requires analyst-side discipline (CLAUDE.md rails). - # # Toggle ON for view-heavy deployments where most views are small. - # # Configurable via /admin/server-config UI. # max_bytes_per_materialize: 10737418240 # # Cost guardrail (bytes) for query_mode='materialized' BQ scans. # # Dry-run check before running; exceeding -> registration / sync diff --git a/docs/superpowers/specs/2026-05-03-issue-160-da-query-remote-fix-spec.md b/docs/superpowers/specs/2026-05-03-issue-160-da-query-remote-fix-spec.md index 7198d69..a0bb938 100644 --- a/docs/superpowers/specs/2026-05-03-issue-160-da-query-remote-fix-spec.md +++ b/docs/superpowers/specs/2026-05-03-issue-160-da-query-remote-fix-spec.md @@ -120,12 +120,12 @@ The flag is the inverse of the new behavior. With "always wrap" as the only mode Total: 33 references across 9 files. Verified by per-file `grep -c` (rev3 review). -After all edits: `grep -rn 'legacy_wrap_views' connectors app src tests config cli` must return zero. `docs/` is **excluded** from the gate — `docs/superpowers/plans/2026-04-27-claude-fetch-primitives.md` (8 hits) and `docs/superpowers/specs/2026-04-27-claude-fetch-primitives-design.md` (1 hit) are historical planning artifacts that document the design decision to introduce the flag; rewriting history would be revisionist. CHANGELOG history retained on the same logic. +After all edits: `grep -rn 'legacy_wrap_views' connectors app src config cli` (production code only) must return zero. `tests/` is excluded — historical references in `assert "legacy_wrap_views" not in ...` regression guards and "removed in #160" docstring breadcrumbs have anti-regression value (they catch a future contributor who tries to re-add the flag). `docs/` is excluded for the same reason — `docs/superpowers/plans/2026-04-27-claude-fetch-primitives.md` (8 hits) and `docs/superpowers/specs/2026-04-27-claude-fetch-primitives-design.md` (1 hit) are historical planning artifacts; rewriting them would be revisionist. CHANGELOG history retained on the same logic. -Verification command in CI / pre-commit: +Verification command in CI / pre-commit: ``` -test "$(grep -rn 'legacy_wrap_views' connectors app src tests config cli | wc -l)" -eq 0 +test "$(grep -rn 'legacy_wrap_views' connectors app src config cli | wc -l)" -eq 0 ``` No DB migration. Operators who explicitly set `legacy_wrap_views: true` in their overlay get the new (equivalent) behavior automatically; the unrecognized key is ignored by the YAML loader. Operators who explicitly set `legacy_wrap_views: false` get the new behavior as a fix, not a regression. diff --git a/src/orchestrator.py b/src/orchestrator.py index 810135e..db6b527 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -328,9 +328,12 @@ class SyncOrchestrator: ).fetchall() # Pre-fetch the set of names that actually exist as views/tables in - # the attached extract.duckdb. The BQ connector with - # legacy_wrap_views=False inserts _meta rows for VIEW entities - # without creating inner views — those need a different code path. + # the attached extract.duckdb. Most connectors emit a `_meta` row + # alongside an inner view per registered name; the keboola + # extractor with `use_extension=False` (and other connectors) + # may insert `_meta` rows whose inner view doesn't exist yet — + # skip those to avoid creating a master view that would resolve + # to nothing. inner_objects = { row[0] for row in conn.execute( diff --git a/tests/test_admin_server_config.py b/tests/test_admin_server_config.py index 3d93740..b1a1eed 100644 --- a/tests/test_admin_server_config.py +++ b/tests/test_admin_server_config.py @@ -807,14 +807,15 @@ class TestRedactionHelpers: class TestServerConfigBigQueryFields: - """Phase J — billing_project, legacy_wrap_views, and - max_bytes_per_materialize are surfaced in the UI/API so an operator can - set them without SSH'ing to the VM. The first two were previously only - addressable via direct YAML edits; max_bytes_per_materialize had no UI - hint at all.""" + """Phase J — billing_project and max_bytes_per_materialize are surfaced + in the UI/API so an operator can set them without SSH'ing to the VM. + They were previously only addressable via direct YAML edits. + + legacy_wrap_views was removed in #160 — VIEW/MAT_VIEW are now always + wrapped via bigquery_query() (the previous opt-in path).""" def test_get_surfaces_bq_fields_even_when_unset(self, seeded_app, tmp_path, monkeypatch): - """GET response always includes the three BQ fields under + """GET response always includes the BQ optional fields under data_source.bigquery so the UI's JSON-textarea rendering shows them as editable keys even when YAML omits them. Without this, the operator has no UI hint that the knobs exist.""" @@ -822,7 +823,7 @@ class TestServerConfigBigQueryFields: state = tmp_path / "state" state.mkdir(parents=True, exist_ok=True) # Plant a minimal instance.yaml that has data_source.bigquery but - # NONE of the three fields set. + # none of the optional fields set. (state / "instance.yaml").write_text(yaml.dump({ "data_source": { "type": "bigquery", @@ -838,9 +839,12 @@ class TestServerConfigBigQueryFields: assert resp.status_code == 200, resp.text bq = resp.json()["sections"]["data_source"]["bigquery"] assert "billing_project" in bq, f"billing_project missing from GET: {bq}" - assert "legacy_wrap_views" in bq, f"legacy_wrap_views missing from GET: {bq}" assert "max_bytes_per_materialize" in bq, \ f"max_bytes_per_materialize missing from GET: {bq}" + # legacy_wrap_views was removed in #160; UI must NOT surface it any + # longer (the wrap behavior is now unconditional). + assert "legacy_wrap_views" not in bq, \ + f"legacy_wrap_views was removed in #160 but still present in GET: {bq}" def test_get_preserves_existing_bq_field_values(self, seeded_app, tmp_path, monkeypatch): """When the operator HAS set the fields, GET must surface their actual @@ -854,7 +858,6 @@ class TestServerConfigBigQueryFields: "bigquery": { "project": "my-data-prj", "billing_project": "my-billing-prj", - "legacy_wrap_views": True, "max_bytes_per_materialize": 5368709120, # 5 GiB }, }, @@ -867,7 +870,6 @@ class TestServerConfigBigQueryFields: resp = c.get("/api/admin/server-config", headers=_auth(token)) bq = resp.json()["sections"]["data_source"]["bigquery"] assert bq["billing_project"] == "my-billing-prj" - assert bq["legacy_wrap_views"] is True assert bq["max_bytes_per_materialize"] == 5368709120 def test_post_persists_billing_project(self, seeded_app, tmp_path, monkeypatch): @@ -890,23 +892,6 @@ class TestServerConfigBigQueryFields: bq = resp.json()["sections"]["data_source"]["bigquery"] assert bq["billing_project"] == "my-billing-prj" - def test_post_persists_legacy_wrap_views(self, seeded_app, tmp_path, monkeypatch): - monkeypatch.setenv("DATA_DIR", str(tmp_path)) - (tmp_path / "state").mkdir(parents=True, exist_ok=True) - c = seeded_app["client"] - token = seeded_app["admin_token"] - resp = c.post( - "/api/admin/server-config", - json={"sections": {"data_source": {"bigquery": { - "legacy_wrap_views": True, - }}}}, - headers=_auth(token), - ) - assert resp.status_code == 200, resp.text - resp = c.get("/api/admin/server-config", headers=_auth(token)) - bq = resp.json()["sections"]["data_source"]["bigquery"] - assert bq["legacy_wrap_views"] is True - def test_post_persists_max_bytes_per_materialize(self, seeded_app, tmp_path, monkeypatch): monkeypatch.setenv("DATA_DIR", str(tmp_path)) (tmp_path / "state").mkdir(parents=True, exist_ok=True) @@ -924,14 +909,14 @@ class TestServerConfigBigQueryFields: bq = resp.json()["sections"]["data_source"]["bigquery"] assert bq["max_bytes_per_materialize"] == 21474836480 - def test_template_documents_three_new_fields(self, seeded_app): - """The three BQ optional fields are now surfaced through the - known-fields registry (GET /api/admin/server-config carries them - in `known_fields.data_source.bigquery.fields`), not hardcoded into - the template text. The renderer reads the registry at runtime and - creates a structured form with hints for each leaf — so the test - verifies operator-discoverability through the API channel rather - than via static HTML inspection. + def test_template_documents_bq_optional_fields(self, seeded_app): + """BQ optional fields are surfaced through the known-fields registry + (GET /api/admin/server-config carries them in + `known_fields.data_source.bigquery.fields`), not hardcoded into the + template text. The renderer reads the registry at runtime and creates + a structured form with hints for each leaf — so the test verifies + operator-discoverability through the API channel rather than via + static HTML inspection. """ c = seeded_app["client"] token = seeded_app["admin_token"] @@ -940,12 +925,13 @@ class TestServerConfigBigQueryFields: bq_fields = resp.json()["known_fields"]["data_source"]["bigquery"]["fields"] assert "billing_project" in bq_fields, \ "registry must expose billing_project as a known field" - assert "legacy_wrap_views" in bq_fields, \ - "registry must expose legacy_wrap_views as a known field" assert "max_bytes_per_materialize" in bq_fields, \ "registry must expose max_bytes_per_materialize as a known field" - # Each field must carry a hint so the renderer can show operator- - # facing help text — no anonymous knobs. - for k in ("billing_project", "legacy_wrap_views", "max_bytes_per_materialize"): + # legacy_wrap_views was removed in #160 (VIEW/MAT_VIEW always wrap). + assert "legacy_wrap_views" not in bq_fields, \ + "legacy_wrap_views was removed in #160 but still present in known_fields" + # Each surviving field must carry a hint so the renderer can show + # operator-facing help text — no anonymous knobs. + for k in ("billing_project", "max_bytes_per_materialize"): assert "hint" in bq_fields[k] and bq_fields[k]["hint"], \ f"{k} must carry a non-empty hint" diff --git a/tests/test_admin_server_config_known_fields.py b/tests/test_admin_server_config_known_fields.py index ec1eb75..2bcf260 100644 --- a/tests/test_admin_server_config_known_fields.py +++ b/tests/test_admin_server_config_known_fields.py @@ -176,10 +176,11 @@ def test_bigquery_subfields_populated(seeded_app): assert r.status_code == 200 fields = r.json()["known_fields"]["data_source"]["bigquery"]["fields"] assert "billing_project" in fields - assert "legacy_wrap_views" in fields assert "max_bytes_per_materialize" in fields - assert fields["legacy_wrap_views"]["kind"] == "bool" - assert fields["legacy_wrap_views"]["default"] is False + # legacy_wrap_views was removed in #160 — VIEW/MATERIALIZED_VIEW are now + # always wrapped via bigquery_query() (the previous opt-in path). + assert "legacy_wrap_views" not in fields, \ + "legacy_wrap_views config knob was removed; #160 makes the wrap behavior unconditional" assert fields["max_bytes_per_materialize"]["kind"] == "int" assert fields["max_bytes_per_materialize"]["default"] == 10737418240 diff --git a/tests/test_bigquery_extractor.py b/tests/test_bigquery_extractor.py index f6879c3..c58bf18 100644 --- a/tests/test_bigquery_extractor.py +++ b/tests/test_bigquery_extractor.py @@ -316,7 +316,9 @@ class TestViewVsTableTemplates: "BASE TABLE should not use bigquery_query() function" def test_view_uses_bigquery_query_function(self, tmp_path, monkeypatch): - """For VIEW with legacy_wrap_views=True, generated DuckDB view wraps bigquery_query() (jobs API path).""" + """For VIEW entity, generated DuckDB master view wraps bigquery_query() + (jobs API path). Same SQL form as the prior `legacy_wrap_views=True` + branch — now unconditional per #160.""" from connectors.bigquery.extractor import init_extract monkeypatch.setattr( @@ -337,12 +339,6 @@ class TestViewVsTableTemplates: monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", spy_connect) - # Enable legacy toggle so this test verifies the old wrap-view path still works. - monkeypatch.setattr( - "connectors.bigquery.extractor.get_value", - lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default, - ) - init_extract( str(tmp_path), "my-project", @@ -575,88 +571,16 @@ class TestExtractorMainModule: assert exc_info.value.code == 2 -class TestDropWrapViewForBQViews: - def test_view_entity_does_not_create_master_view_by_default(self, tmp_path, monkeypatch): - from connectors.bigquery.extractor import init_extract - monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") - monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW") - - # Stub BQ extension calls to avoid hitting real BQ - real_connect = duckdb.connect - - captured = [] - - def safe_connect(*a, **kw): - return _CapturingProxy(real_connect(*a, **kw), captured) - monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) - - # legacy toggle is OFF by default → expect no CREATE VIEW for the BQ view - monkeypatch.setattr( - "connectors.bigquery.extractor.get_value", - lambda *args, default=None, **kw: False if "legacy_wrap_views" in args else default, - raising=False, - ) - - init_extract( - str(tmp_path), - "my-project", - [{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}], - ) - - # Confirm extract.duckdb has _meta + _remote_attach but NO master view for myview - c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True) - try: - views = c.execute( - "SELECT view_name FROM duckdb_views() WHERE view_name='myview'" - ).fetchall() - assert views == [], f"expected no wrap view for VIEW entity by default; got {views}" - meta = c.execute("SELECT table_name FROM _meta").fetchall() - assert ("myview",) in meta, "_meta must still record the view" - finally: - c.close() - - def test_legacy_wrap_views_toggle_restores_old_behavior(self, tmp_path, monkeypatch): - from connectors.bigquery.extractor import init_extract - monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") - monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW") - - real_connect = duckdb.connect - captured = [] - - def safe_connect(*a, **kw): - return _CapturingProxy(real_connect(*a, **kw), captured) - monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) - - # legacy toggle ON → should still create the wrap view - monkeypatch.setattr( - "connectors.bigquery.extractor.get_value", - lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default, - raising=False, - ) - - init_extract( - str(tmp_path), - "my-project", - [{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}], - ) - - # With legacy ON the wrap view SQL should have been emitted - view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper()] - myview_sqls = [s for s in view_sqls if '"myview"' in s] - assert myview_sqls != [], \ - f"expected wrap view SQL for VIEW entity when legacy_wrap_views=True; captured={captured}" - assert any("bigquery_query(" in s for s in myview_sqls), \ - f"legacy wrap view should use bigquery_query(); got: {myview_sqls}" - - # ---- #160 always-wrap tests ------------------------------------------ - # Issue #160: query_mode='remote' BQ rows whose entity is VIEW or - # MATERIALIZED_VIEW must get a master view via bigquery_query() by default. - # Today (legacy_wrap_views=False default) they don't — `da query --remote` - # against such a table returns DuckDB catalog error. Fix: always wrap. +class TestWrapViewForBQViews: + """Issue #160: query_mode='remote' BQ rows whose entity is VIEW or + MATERIALIZED_VIEW must get a master view via bigquery_query() — for any + other entity type we don't have proven runtime support for, skip both + the master view AND the _meta row.""" def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch): - """VIEW entity must get a bigquery_query() wrap view by default - (not just when legacy_wrap_views=True). Closes #160.""" + """VIEW entity must get a bigquery_query() wrap view (the previous + opt-in path under `legacy_wrap_views=True`, now unconditional). + Closes #160.""" from connectors.bigquery.extractor import init_extract import app.instance_config as _ic monkeypatch.setattr(_ic, "_instance_config", None, raising=False) @@ -670,7 +594,6 @@ class TestDropWrapViewForBQViews: return _CapturingProxy(real_connect(*a, **kw), captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) - # Default config — no monkeypatch setting legacy_wrap_views init_extract( str(tmp_path), "my-project",