Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors.

Backend (BigQuery + Keboola, schema v20):
- schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19)
- connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state
- connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc
- connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (Keboola Storage API has different cost model); legacy bucket-download path skips query_mode='materialized' rows
- app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query
- app/api/admin.py: RegisterTableRequest accepts source_query; model_validator enforces coherence of mode↔source_query↔bucket; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into GET /server-config payload

Operator + CLI surface:
- da admin register-table --query / --query-mode materialized
- scripts/smoke-test-materialized-bq.sh — end-to-end smoke for operators

Tests (incl. spike + integration + regression):
- test_db_migration_v20, test_table_registry_source_query
- test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips
- test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds)
- test_sync_trigger_materialized, test_sync_trigger_keboola_materialized
- test_api_admin_materialized, test_cli_admin_materialized
- test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e

Cost: BQ uses bigquery_query() (jobs API, view-aware) — works on tables, views, materialized views uniformly. Keboola uses ATTACH+COPY parquet through the DuckDB extension.
322 lines
11 KiB
Python
322 lines
11 KiB
Python
"""Admin API accepts source_query when query_mode='materialized', rejects
mismatches between mode and query field.

Tests that hit the remote-mode register path require `stub_bq_extractor`
to bypass the post-register rebuild's real-BQ traffic. Materialized-only
tests skip the background (BG) rebuild path (the 201 fast-path returns
before any rebuild fires) so they don't need the stub.

Covers PR #145 (re-implementation against 0.24.0 base):
- RegisterTableRequest + UpdateTableRequest model_validators
- _validate_bigquery_register_payload materialized branch (skips bucket/
  source_table checks, requires source_query)
- register_table 201 response for materialized BQ rows (no synchronous
  materialize — cron tick or manual /api/sync/trigger picks them up)
- update_table clears stale source_query when switching mode away from
  materialized

Uses the seeded_app fixture (not defined in this module; presumably
provided by conftest) together with local bq_instance / stub_bq_extractor
fixtures that mirror tests/test_admin_bq_register.py, for parity with the
existing BQ test surface.
"""
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
def _auth(token):
|
|
return {"Authorization": f"Bearer {token}"}
|
|
|
|
|
|
@pytest.fixture
def stub_bq_extractor(monkeypatch):
    """Mirror tests/test_admin_bq_register.py — bypasses real-BQ traffic
    in the post-register rebuild path so the test stays offline. Required
    whenever the test seeds a remote-mode BQ row via the HTTP API.

    Patches ``connectors.bigquery.extractor.rebuild_from_registry`` with a
    MagicMock returning a canned success summary, and replaces
    ``src.orchestrator.SyncOrchestrator`` with a factory that hands back a
    fresh MagicMock on every call.

    Returns:
        The MagicMock standing in for ``rebuild_from_registry``.
    """
    rebuild_mock = MagicMock(return_value={
        "project_id": "my-test-project",
        "tables_registered": 1, "errors": [], "skipped": False,
    })
    monkeypatch.setattr(
        "connectors.bigquery.extractor.rebuild_from_registry",
        rebuild_mock,
    )
    monkeypatch.setattr(
        "src.orchestrator.SyncOrchestrator",
        lambda *a, **kw: MagicMock(),
    )
    return rebuild_mock
|
|
|
|
|
|
@pytest.fixture
def bq_instance(monkeypatch):
    """Force instance.yaml to look like a BigQuery deployment.

    Mirrors tests/test_admin_bq_register.py::bq_instance so the
    project_id read inside _validate_bigquery_register_payload succeeds.

    Yields:
        The fake config dict that ``load_instance_config`` is patched to
        return.
    """
    fake_cfg = {
        "data_source": {
            "type": "bigquery",
            "bigquery": {"project": "my-test-project", "location": "us"},
        },
    }
    monkeypatch.setattr(
        "app.instance_config.load_instance_config",
        lambda: fake_cfg,
        raising=False,
    )
    from app.instance_config import reset_cache

    # Reset once after installing the patch, before the test body runs ...
    reset_cache()
    yield fake_cfg
    # ... and once more on teardown, after the test body has finished.
    reset_cache()
|
|
|
|
|
|
def _materialized_payload(**overrides):
|
|
p = {
|
|
"name": "orders_90d",
|
|
"source_type": "bigquery",
|
|
"query_mode": "materialized",
|
|
"source_query": "SELECT date FROM `prj.ds.orders`",
|
|
"sync_schedule": "every 6h",
|
|
}
|
|
p.update(overrides)
|
|
return p
|
|
|
|
|
|
def test_register_materialized_requires_source_query(seeded_app, bq_instance):
    """Registering a materialized row without source_query yields a 4xx
    whose detail mentions source_query or materialized."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json={
            "name": "missing_query",
            "source_type": "bigquery",
            "query_mode": "materialized",
            # source_query missing
        },
        headers=_auth(token),
    )
    assert 400 <= r.status_code < 500, r.json()
    # detail may be a string or a structured error list; stringify either.
    detail = str(r.json().get("detail", "")).lower()
    assert "source_query" in detail or "materialized" in detail
|
|
|
|
|
|
def test_register_materialized_accepts_source_query(seeded_app, bq_instance):
    """A well-formed materialized payload registers with 201, status
    'registered', and a message mentioning 'Materialized'."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json=_materialized_payload(name="orders_90d_a"),
        headers=_auth(token),
    )
    assert r.status_code == 201, r.json()
    body = r.json()
    assert body["status"] == "registered"
    assert "Materialized" in body.get("message", "")
|
|
|
|
|
|
def test_register_remote_rejects_source_query(seeded_app, bq_instance):
    """A remote-mode BigQuery registration carrying source_query is
    rejected with a 4xx."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json={
            "name": "live_orders",
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": "orders",
            "query_mode": "remote",
            "source_query": "SELECT 1",
        },
        headers=_auth(token),
    )
    assert 400 <= r.status_code < 500, r.json()
|
|
|
|
|
|
def test_register_local_rejects_source_query(seeded_app):
    """A local-mode Keboola registration carrying source_query is
    rejected with a 4xx."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json={
            "name": "kbc_orders",
            "source_type": "keboola",
            "query_mode": "local",
            "source_query": "SELECT 1",
        },
        headers=_auth(token),
    )
    assert 400 <= r.status_code < 500, r.json()
|
|
|
|
|
|
def test_register_materialized_with_empty_source_query_rejected(seeded_app, bq_instance):
    """A materialized registration whose source_query is the empty string
    is rejected with a 4xx."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json=_materialized_payload(name="empty_q", source_query=""),
        headers=_auth(token),
    )
    assert 400 <= r.status_code < 500, r.json()
|
|
|
|
|
|
def test_update_source_query_alone_requires_query_mode(seeded_app, bq_instance, stub_bq_extractor):
    """PUT body with source_query but no query_mode is incoherent — reject
    so non-materialized rows can't carry an orphan source_query."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Seed a remote-mode row.
    r = c.post(
        "/api/admin/register-table",
        json={
            "name": "live_orphan",
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": "orders",
            "query_mode": "remote",
        },
        headers=_auth(token),
    )
    assert r.status_code in (200, 202), r.json()  # synchronous or async
    table_id = r.json()["id"]

    # source_query on its own, against the remote-mode row seeded above.
    r2 = c.put(
        f"/api/admin/registry/{table_id}",
        json={"source_query": "SELECT 1"},
        headers=_auth(token),
    )
    assert 400 <= r2.status_code < 500, r2.json()
|
|
|
|
|
|
def test_update_schedule_only_on_materialized_row_succeeds(
    seeded_app, bq_instance, stub_bq_extractor,
):
    """REGRESSION (Devin BUG_0002 on 2219255): an admin editing only the
    sync_schedule of a materialized row sends `{query_mode: 'materialized',
    sync_schedule: '...'}` (the Edit modal always sends query_mode for BQ
    rows). Pre-fix the UpdateTableRequest validator rejected this with 422
    because source_query wasn't in the body — even though the existing row
    already had one.

    The PUT semantics overlay the body on the existing row, so omitted
    source_query keeps the stored value. The synthetic RegisterTableRequest
    constructed against the merged record at the handler still runs the
    strict cross-field check, so the truly-broken case (materialized
    without ANY source_query, even on existing) is still caught."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Seed a materialized row with a real source_query.
    r = c.post("/api/admin/register-table", json={
        "name": "schedule_edit_target",
        "source_type": "bigquery",
        "query_mode": "materialized",
        "source_query": "SELECT 1",
        "sync_schedule": "every 1h",
    }, headers=_auth(token))
    assert r.status_code == 201, r.json()
    table_id = r.json()["id"]

    # Edit ONLY the schedule. UI's saveTableEdit sends query_mode for BQ
    # rows even when the operator didn't change it.
    r2 = c.put(f"/api/admin/registry/{table_id}", json={
        "query_mode": "materialized",
        "sync_schedule": "every 12h",
    }, headers=_auth(token))
    assert r2.status_code == 200, r2.json()

    # Verify the schedule changed and source_query survived.
    r3 = c.get("/api/admin/registry", headers=_auth(token))
    # Check the listing call itself first, so a failed GET reports the
    # response body instead of a KeyError on "tables".
    assert r3.status_code == 200, r3.json()
    row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
    assert row is not None, f"Table {table_id} not found in registry"
    assert row["sync_schedule"] == "every 12h"
    assert row["source_query"] == "SELECT 1"  # preserved across edit
    assert row["query_mode"] == "materialized"
|
|
|
|
|
|
def test_update_materialized_with_explicit_empty_source_query_rejected(
    seeded_app, bq_instance, stub_bq_extractor,
):
    """The validator is relaxed for an OMITTED source_query on update, but
    explicitly setting it to an empty / whitespace string while claiming
    materialized is still a typo and must be rejected (not silently
    persisted as NULL)."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Seed a valid materialized row to edit.
    r = c.post("/api/admin/register-table", json={
        "name": "explicit_empty",
        "source_type": "bigquery",
        "query_mode": "materialized",
        "source_query": "SELECT 1",
    }, headers=_auth(token))
    assert r.status_code == 201, r.json()
    table_id = r.json()["id"]

    r2 = c.put(f"/api/admin/registry/{table_id}", json={
        "query_mode": "materialized",
        "source_query": "",  # explicitly empty
    }, headers=_auth(token))
    assert 400 <= r2.status_code < 500, r2.json()
|
|
|
|
|
|
def test_update_materialized_to_remote_clears_source_query(
    seeded_app, bq_instance, stub_bq_extractor,
):
    """When admin switches a materialized table to remote/local, the stale
    source_query must be cleared in the DB — otherwise the registry shows
    a non-materialized row carrying an orphan SQL body."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]

    # Seed a materialized table with a source_query.
    r = c.post(
        "/api/admin/register-table",
        json=_materialized_payload(name="switcher"),
        headers=_auth(token),
    )
    assert r.status_code == 201, r.json()
    table_id = r.json()["id"]

    # Switch to remote — must include bucket+source_table for the new mode
    # (the merged validator runs the BQ payload check on the merged record).
    r2 = c.put(
        f"/api/admin/registry/{table_id}",
        json={
            "query_mode": "remote",
            "bucket": "analytics",
            "source_table": "orders_90d",
        },
        headers=_auth(token),
    )
    assert r2.status_code == 200, r2.json()

    # Verify in the registry: query_mode flipped, source_query cleared.
    r3 = c.get("/api/admin/registry", headers=_auth(token))
    assert r3.status_code == 200, r3.json()
    row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
    assert row is not None, f"Table {table_id} not found in registry"
    assert row["query_mode"] == "remote"
    # Either NULL or empty string counts as cleared.
    assert row["source_query"] in (None, "")
|
|
|
|
|
|
def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_instance):
    """After a materialized registration, the registry listing reports the
    row as materialized with the submitted source_query."""
    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/admin/register-table",
        json=_materialized_payload(
            name="persist_q",
            source_query="SELECT col FROM `prj.ds.t` WHERE x = 1",
        ),
        headers=_auth(token),
    )
    assert r.status_code == 201, r.json()
    table_id = r.json()["id"]

    r2 = c.get("/api/admin/registry", headers=_auth(token))
    # Check the listing call itself first, so a failed GET reports the
    # response body instead of a KeyError on "tables".
    assert r2.status_code == 200, r2.json()
    row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None)
    assert row is not None, f"Table {table_id} not found in registry"
    assert row["query_mode"] == "materialized"
    assert "WHERE x = 1" in row["source_query"]
|