refactor(bq): #160 remove legacy_wrap_views config knob (always-wrap)

Now that VIEW/MATERIALIZED_VIEW always wrap via bigquery_query() (the
prior `legacy_wrap_views=True` branch behavior, made unconditional in
the previous commit), the toggle has no semantic meaning and is removed
across the codebase.

Production code:
- app/api/admin.py: drop the field from _OPTIONAL_FIELDS["data_source"]
  ["bigquery"]["fields"] and from _BQ_OPTIONAL_FIELD_DEFAULTS, plus the
  comment block above the defaults dict.
- config/instance.yaml.example: drop the example snippet.
- src/orchestrator.py: update the inner-objects skip-branch comment to
  reflect the new BQ behavior (the skip itself stays — keboola
  use_extension=False still inserts _meta rows without inner views).
- app/web/templates/admin_tables.html: rewrite operator copy in the
  register and edit forms to reflect always-wrap.

Tests:
- tests/test_admin_server_config.py (TestServerConfigBigQueryFields):
  flip assertions from "field IS present" to "field NOT present" on
  legacy_wrap_views. Drop the test_post_persists_legacy_wrap_views test
  since the field no longer exists.
- tests/test_admin_server_config_known_fields.py: same flip on the
  known-fields registry assertion.
- tests/test_bigquery_extractor.py: drop the obsolete
  test_view_entity_does_not_create_master_view_by_default (asserted the
  bug we fixed) and test_legacy_wrap_views_toggle_restores_old_behavior
  (toggle no longer meaningful). Update remaining test docstrings.

Operators with `legacy_wrap_views: true` set in their overlay get the
new (equivalent) behavior automatically — the unrecognized key is
silently ignored by the YAML loader. Operators with `false` get the
issue-#160 fix as a behavior change, not a regression.

Spec gate updated: production code grep gate
  grep -rn 'legacy_wrap_views' connectors app src config cli
must return zero. tests/ excluded — historical "removed in #160"
breadcrumbs and `assert "X" not in fields` regression guards retained
as anti-regression signals.
This commit is contained in:
ZdenekSrotyr 2026-05-03 16:31:23 +02:00
parent 10d7bd62f8
commit 9d0e4e687d
8 changed files with 60 additions and 169 deletions

View file

@ -226,18 +226,6 @@ _KNOWN_FIELDS: dict[str, dict[str, dict]] = {
"Mismatch → 403 USER_PROJECT_DENIED on every BQ call."
),
},
"legacy_wrap_views": {
"kind": "bool",
"default": False,
"hint": (
"When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB "
"master view via bigquery_query() (jobs API) so analysts can "
"SELECT * FROM viewname directly. When false (default), views are "
"catalog-only — analysts use `da fetch` or `da query --remote`. "
"ON for view-heavy deployments where most views are small enough "
"to materialize without 'Response too large' (issue #101)."
),
},
"max_bytes_per_materialize": {
"kind": "int",
"default": 10737418240,
@ -795,13 +783,10 @@ class ServerConfigUpdateRequest(BaseModel):
#
# - billing_project: defaults to data project; explicit value needed when
# the SA can read the data project but not bill against it.
# - legacy_wrap_views: default False; toggling ON wraps BQ views via
# `bigquery_query()` so analysts can `SELECT *` directly.
# - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'`
# (default 10 GiB; 0 disables; null falls through to the default).
_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = {
"billing_project": "",
"legacy_wrap_views": False,
"max_bytes_per_materialize": 10737418240,
}

View file

@ -891,11 +891,11 @@
<datalist id="bqTableList"></datalist>
<div class="form-hint">Table or view name within the dataset. Click
<strong>List tables</strong> after filling Dataset to populate autocomplete.
<br><strong>Live access:</strong> BASE TABLEs are queryable directly via
<code>da query --remote</code>; VIEWs are registered but analysts must run
<code>da fetch</code> to materialize a local snapshot (or the admin can flip
<code>data_source.bigquery.legacy_wrap_views=true</code> to wrap views via the
BQ jobs API).
<br><strong>Live access:</strong> BASE TABLEs query via
<code>bq."dataset"."table"</code> (Storage Read API; predicate pushdown).
VIEWs and MATERIALIZED_VIEWs query via the BQ jobs API (full-scan estimate;
cost-guarded by <code>max_bytes_per_remote_query</code>).
<code>da query --remote</code> works for both.
<br><strong>Synced access:</strong> handles both table and view transparently
— the scheduler runs <code>SELECT *</code> through the jobs API and writes a
parquet.</div>
@ -1048,10 +1048,11 @@
list="editBqTableList" placeholder="e.g. orders">
<datalist id="editBqTableList"></datalist>
<div class="form-hint">Table or view name within the dataset.
<br><strong>Live access:</strong> BASE TABLEs are queryable directly via
<code>da query --remote</code>; VIEWs are registered but analysts must run
<code>da fetch</code> to materialize a local snapshot (or admin can flip
<code>data_source.bigquery.legacy_wrap_views=true</code>).
<br><strong>Live access:</strong> BASE TABLEs query via
<code>bq."dataset"."table"</code> (Storage Read API; predicate pushdown).
VIEWs and MATERIALIZED_VIEWs query via the BQ jobs API (full-scan estimate;
cost-guarded by <code>max_bytes_per_remote_query</code>).
<code>da query --remote</code> works for both.
<br><strong>Synced access:</strong> handles both transparently — the
scheduler runs <code>SELECT *</code> through the jobs API and writes a
parquet.</div>

View file

@ -122,14 +122,6 @@ data_source:
# # Mismatch -> every BQ call 403 USER_PROJECT_DENIED.
# # `da diagnose` warns when this falls back to `project`.
# # Configurable via /admin/server-config UI.
# legacy_wrap_views: false # When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB
# # master view via bigquery_query() (jobs API) so analysts can
# # `SELECT * FROM viewname` directly. When false (default), views
# # are catalog-only -- analysts use `da fetch viewname` or
# # `da query --remote`. ON can cause "Response too large" on big
# # views; OFF requires analyst-side discipline (CLAUDE.md rails).
# # Toggle ON for view-heavy deployments where most views are small.
# # Configurable via /admin/server-config UI.
# max_bytes_per_materialize: 10737418240
# # Cost guardrail (bytes) for query_mode='materialized' BQ scans.
# # Dry-run check before running; exceeding -> registration / sync

View file

@ -120,12 +120,12 @@ The flag is the inverse of the new behavior. With "always wrap" as the only mode
Total: 33 references across 9 files. Verified by per-file `grep -c` (rev3 review).
After all edits: `grep -rn 'legacy_wrap_views' connectors app src tests config cli` must return zero. `docs/` is **excluded** from the gate`docs/superpowers/plans/2026-04-27-claude-fetch-primitives.md` (8 hits) and `docs/superpowers/specs/2026-04-27-claude-fetch-primitives-design.md` (1 hit) are historical planning artifacts that document the design decision to introduce the flag; rewriting history would be revisionist. CHANGELOG history retained on the same logic.
After all edits: `grep -rn 'legacy_wrap_views' connectors app src config cli` (production code only) must return zero. `tests/` is excluded — historical references in `assert "legacy_wrap_views" not in ...` regression guards and "removed in #160" docstring breadcrumbs have anti-regression value (they catch a future contributor who tries to re-add the flag). `docs/` is excluded for the same reason`docs/superpowers/plans/2026-04-27-claude-fetch-primitives.md` (8 hits) and `docs/superpowers/specs/2026-04-27-claude-fetch-primitives-design.md` (1 hit) are historical planning artifacts; rewriting them would be revisionist. CHANGELOG history retained on the same logic.
Verification command in CI / pre-commit:
Verification command in CI / pre-commit:
```
test "$(grep -rn 'legacy_wrap_views' connectors app src tests config cli | wc -l)" -eq 0
test "$(grep -rn 'legacy_wrap_views' connectors app src config cli | wc -l)" -eq 0
```
No DB migration. Operators who explicitly set `legacy_wrap_views: true` in their overlay get the new (equivalent) behavior automatically; the unrecognized key is ignored by the YAML loader. Operators who explicitly set `legacy_wrap_views: false` get the new behavior as a fix, not a regression.

View file

@ -328,9 +328,12 @@ class SyncOrchestrator:
).fetchall()
# Pre-fetch the set of names that actually exist as views/tables in
# the attached extract.duckdb. The BQ connector with
# legacy_wrap_views=False inserts _meta rows for VIEW entities
# without creating inner views — those need a different code path.
# the attached extract.duckdb. Most connectors emit a `_meta` row
# alongside an inner view per registered name; the keboola
# extractor with `use_extension=False` (and other connectors)
# may insert `_meta` rows whose inner view doesn't exist yet —
# skip those to avoid creating a master view that would resolve
# to nothing.
inner_objects = {
row[0]
for row in conn.execute(

View file

@ -807,14 +807,15 @@ class TestRedactionHelpers:
class TestServerConfigBigQueryFields:
"""Phase J — billing_project, legacy_wrap_views, and
max_bytes_per_materialize are surfaced in the UI/API so an operator can
set them without SSH'ing to the VM. The first two were previously only
addressable via direct YAML edits; max_bytes_per_materialize had no UI
hint at all."""
"""Phase J — billing_project and max_bytes_per_materialize are surfaced
in the UI/API so an operator can set them without SSH'ing to the VM.
They were previously only addressable via direct YAML edits.
legacy_wrap_views was removed in #160 — VIEW/MAT_VIEW are now always
wrapped via bigquery_query() (the previous opt-in path)."""
def test_get_surfaces_bq_fields_even_when_unset(self, seeded_app, tmp_path, monkeypatch):
"""GET response always includes the three BQ fields under
"""GET response always includes the BQ optional fields under
data_source.bigquery so the UI's JSON-textarea rendering shows them
as editable keys even when YAML omits them. Without this, the
operator has no UI hint that the knobs exist."""
@ -822,7 +823,7 @@ class TestServerConfigBigQueryFields:
state = tmp_path / "state"
state.mkdir(parents=True, exist_ok=True)
# Plant a minimal instance.yaml that has data_source.bigquery but
# NONE of the three fields set.
# none of the optional fields set.
(state / "instance.yaml").write_text(yaml.dump({
"data_source": {
"type": "bigquery",
@ -838,9 +839,12 @@ class TestServerConfigBigQueryFields:
assert resp.status_code == 200, resp.text
bq = resp.json()["sections"]["data_source"]["bigquery"]
assert "billing_project" in bq, f"billing_project missing from GET: {bq}"
assert "legacy_wrap_views" in bq, f"legacy_wrap_views missing from GET: {bq}"
assert "max_bytes_per_materialize" in bq, \
f"max_bytes_per_materialize missing from GET: {bq}"
# legacy_wrap_views was removed in #160; UI must NOT surface it any
# longer (the wrap behavior is now unconditional).
assert "legacy_wrap_views" not in bq, \
f"legacy_wrap_views was removed in #160 but still present in GET: {bq}"
def test_get_preserves_existing_bq_field_values(self, seeded_app, tmp_path, monkeypatch):
"""When the operator HAS set the fields, GET must surface their actual
@ -854,7 +858,6 @@ class TestServerConfigBigQueryFields:
"bigquery": {
"project": "my-data-prj",
"billing_project": "my-billing-prj",
"legacy_wrap_views": True,
"max_bytes_per_materialize": 5368709120, # 5 GiB
},
},
@ -867,7 +870,6 @@ class TestServerConfigBigQueryFields:
resp = c.get("/api/admin/server-config", headers=_auth(token))
bq = resp.json()["sections"]["data_source"]["bigquery"]
assert bq["billing_project"] == "my-billing-prj"
assert bq["legacy_wrap_views"] is True
assert bq["max_bytes_per_materialize"] == 5368709120
def test_post_persists_billing_project(self, seeded_app, tmp_path, monkeypatch):
@ -890,23 +892,6 @@ class TestServerConfigBigQueryFields:
bq = resp.json()["sections"]["data_source"]["bigquery"]
assert bq["billing_project"] == "my-billing-prj"
def test_post_persists_legacy_wrap_views(self, seeded_app, tmp_path, monkeypatch):
monkeypatch.setenv("DATA_DIR", str(tmp_path))
(tmp_path / "state").mkdir(parents=True, exist_ok=True)
c = seeded_app["client"]
token = seeded_app["admin_token"]
resp = c.post(
"/api/admin/server-config",
json={"sections": {"data_source": {"bigquery": {
"legacy_wrap_views": True,
}}}},
headers=_auth(token),
)
assert resp.status_code == 200, resp.text
resp = c.get("/api/admin/server-config", headers=_auth(token))
bq = resp.json()["sections"]["data_source"]["bigquery"]
assert bq["legacy_wrap_views"] is True
def test_post_persists_max_bytes_per_materialize(self, seeded_app, tmp_path, monkeypatch):
monkeypatch.setenv("DATA_DIR", str(tmp_path))
(tmp_path / "state").mkdir(parents=True, exist_ok=True)
@ -924,14 +909,14 @@ class TestServerConfigBigQueryFields:
bq = resp.json()["sections"]["data_source"]["bigquery"]
assert bq["max_bytes_per_materialize"] == 21474836480
def test_template_documents_three_new_fields(self, seeded_app):
"""The three BQ optional fields are now surfaced through the
known-fields registry (GET /api/admin/server-config carries them
in `known_fields.data_source.bigquery.fields`), not hardcoded into
the template text. The renderer reads the registry at runtime and
creates a structured form with hints for each leaf so the test
verifies operator-discoverability through the API channel rather
than via static HTML inspection.
def test_template_documents_bq_optional_fields(self, seeded_app):
"""BQ optional fields are surfaced through the known-fields registry
(GET /api/admin/server-config carries them in
`known_fields.data_source.bigquery.fields`), not hardcoded into the
template text. The renderer reads the registry at runtime and creates
a structured form with hints for each leaf so the test verifies
operator-discoverability through the API channel rather than via
static HTML inspection.
"""
c = seeded_app["client"]
token = seeded_app["admin_token"]
@ -940,12 +925,13 @@ class TestServerConfigBigQueryFields:
bq_fields = resp.json()["known_fields"]["data_source"]["bigquery"]["fields"]
assert "billing_project" in bq_fields, \
"registry must expose billing_project as a known field"
assert "legacy_wrap_views" in bq_fields, \
"registry must expose legacy_wrap_views as a known field"
assert "max_bytes_per_materialize" in bq_fields, \
"registry must expose max_bytes_per_materialize as a known field"
# Each field must carry a hint so the renderer can show operator-
# facing help text — no anonymous knobs.
for k in ("billing_project", "legacy_wrap_views", "max_bytes_per_materialize"):
# legacy_wrap_views was removed in #160 (VIEW/MAT_VIEW always wrap).
assert "legacy_wrap_views" not in bq_fields, \
"legacy_wrap_views was removed in #160 but still present in known_fields"
# Each surviving field must carry a hint so the renderer can show
# operator-facing help text — no anonymous knobs.
for k in ("billing_project", "max_bytes_per_materialize"):
assert "hint" in bq_fields[k] and bq_fields[k]["hint"], \
f"{k} must carry a non-empty hint"

View file

@ -176,10 +176,11 @@ def test_bigquery_subfields_populated(seeded_app):
assert r.status_code == 200
fields = r.json()["known_fields"]["data_source"]["bigquery"]["fields"]
assert "billing_project" in fields
assert "legacy_wrap_views" in fields
assert "max_bytes_per_materialize" in fields
assert fields["legacy_wrap_views"]["kind"] == "bool"
assert fields["legacy_wrap_views"]["default"] is False
# legacy_wrap_views was removed in #160 — VIEW/MATERIALIZED_VIEW are now
# always wrapped via bigquery_query() (the previous opt-in path).
assert "legacy_wrap_views" not in fields, \
"legacy_wrap_views config knob was removed; #160 makes the wrap behavior unconditional"
assert fields["max_bytes_per_materialize"]["kind"] == "int"
assert fields["max_bytes_per_materialize"]["default"] == 10737418240

View file

@ -316,7 +316,9 @@ class TestViewVsTableTemplates:
"BASE TABLE should not use bigquery_query() function"
def test_view_uses_bigquery_query_function(self, tmp_path, monkeypatch):
"""For VIEW with legacy_wrap_views=True, generated DuckDB view wraps bigquery_query() (jobs API path)."""
"""For VIEW entity, generated DuckDB master view wraps bigquery_query()
(jobs API path). Same SQL form as the prior `legacy_wrap_views=True`
branch now unconditional per #160."""
from connectors.bigquery.extractor import init_extract
monkeypatch.setattr(
@ -337,12 +339,6 @@ class TestViewVsTableTemplates:
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", spy_connect)
# Enable legacy toggle so this test verifies the old wrap-view path still works.
monkeypatch.setattr(
"connectors.bigquery.extractor.get_value",
lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default,
)
init_extract(
str(tmp_path),
"my-project",
@ -575,88 +571,16 @@ class TestExtractorMainModule:
assert exc_info.value.code == 2
class TestDropWrapViewForBQViews:
def test_view_entity_does_not_create_master_view_by_default(self, tmp_path, monkeypatch):
from connectors.bigquery.extractor import init_extract
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW")
# Stub BQ extension calls to avoid hitting real BQ
real_connect = duckdb.connect
captured = []
def safe_connect(*a, **kw):
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
# legacy toggle is OFF by default → expect no CREATE VIEW for the BQ view
monkeypatch.setattr(
"connectors.bigquery.extractor.get_value",
lambda *args, default=None, **kw: False if "legacy_wrap_views" in args else default,
raising=False,
)
init_extract(
str(tmp_path),
"my-project",
[{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}],
)
# Confirm extract.duckdb has _meta + _remote_attach but NO master view for myview
c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
try:
views = c.execute(
"SELECT view_name FROM duckdb_views() WHERE view_name='myview'"
).fetchall()
assert views == [], f"expected no wrap view for VIEW entity by default; got {views}"
meta = c.execute("SELECT table_name FROM _meta").fetchall()
assert ("myview",) in meta, "_meta must still record the view"
finally:
c.close()
def test_legacy_wrap_views_toggle_restores_old_behavior(self, tmp_path, monkeypatch):
from connectors.bigquery.extractor import init_extract
monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW")
real_connect = duckdb.connect
captured = []
def safe_connect(*a, **kw):
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
# legacy toggle ON → should still create the wrap view
monkeypatch.setattr(
"connectors.bigquery.extractor.get_value",
lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default,
raising=False,
)
init_extract(
str(tmp_path),
"my-project",
[{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}],
)
# With legacy ON the wrap view SQL should have been emitted
view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper()]
myview_sqls = [s for s in view_sqls if '"myview"' in s]
assert myview_sqls != [], \
f"expected wrap view SQL for VIEW entity when legacy_wrap_views=True; captured={captured}"
assert any("bigquery_query(" in s for s in myview_sqls), \
f"legacy wrap view should use bigquery_query(); got: {myview_sqls}"
# ---- #160 always-wrap tests ------------------------------------------
# Issue #160: query_mode='remote' BQ rows whose entity is VIEW or
# MATERIALIZED_VIEW must get a master view via bigquery_query() by default.
# Today (legacy_wrap_views=False default) they don't — `da query --remote`
# against such a table returns DuckDB catalog error. Fix: always wrap.
class TestWrapViewForBQViews:
"""Issue #160: query_mode='remote' BQ rows whose entity is VIEW or
MATERIALIZED_VIEW must get a master view via bigquery_query() for any
other entity type we don't have proven runtime support for, skip both
the master view AND the _meta row."""
def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
"""VIEW entity must get a bigquery_query() wrap view by default
(not just when legacy_wrap_views=True). Closes #160."""
"""VIEW entity must get a bigquery_query() wrap view (the previous
opt-in path under `legacy_wrap_views=True`, now unconditional).
Closes #160."""
from connectors.bigquery.extractor import init_extract
import app.instance_config as _ic
monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
@ -670,7 +594,6 @@ class TestDropWrapViewForBQViews:
return _CapturingProxy(real_connect(*a, **kw), captured)
monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect)
# Default config — no monkeypatch setting legacy_wrap_views
init_extract(
str(tmp_path),
"my-project",