Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors. Backend (BigQuery + Keboola, schema v20): - schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19) - connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state - connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc - connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (Keboola Storage API has different cost model); legacy bucket-download path skips query_mode='materialized' rows - app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query - app/api/admin.py: RegisterTableRequest accepts source_query; model_validator coheres mode↔source_query↔bucket; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into GET /server-config payload Operator + CLI surface: - da admin register-table --query / --query-mode materialized - scripts/smoke-test-materialized-bq.sh — end-to-end smoke for operators Tests (incl. 
spike + integration + regression): - test_db_migration_v20, test_table_registry_source_query - test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips - test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds) - test_sync_trigger_materialized, test_sync_trigger_keboola_materialized - test_api_admin_materialized, test_cli_admin_materialized - test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e Cost: BQ uses bigquery_query() (jobs API, view-aware) — works on tables, views, materialized views uniformly. Keboola uses ATTACH+COPY parquet through the DuckDB extension.
95 lines
3.2 KiB
Python
95 lines
3.2 KiB
Python
"""Tests for the Keboola materialize_query path."""
|
|
import hashlib
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
from connectors.keboola import extractor as kbe
|
|
|
|
|
|
def test_materialize_query_writes_parquet_and_returns_metadata(tmp_path, monkeypatch):
    """Check that materialize_query writes a parquet file and returns its metadata.

    Mock-mode: a fake KeboolaAccess yields a real in-memory DuckDB
    connection that accepts `COPY ... TO '...' (FORMAT PARQUET)`, so a
    small parquet is written via duckdb's own primitive on a tmp DB.

    The returned dict is checked for ``table_id``, ``path``, ``rows``,
    ``bytes`` and ``md5``; the MD5 is recomputed from the file on disk.
    """
    import duckdb
    from contextlib import contextmanager

    real_conn = duckdb.connect(":memory:")
    try:
        # Pre-create a small relation the fake materialize "copies".
        real_conn.execute(
            "CREATE TABLE t AS SELECT 1 AS x, 'hello' AS y UNION ALL SELECT 2, 'world'"
        )

        class FakeAccess:
            """Stand-in access object exposing only ``duckdb_session()``."""

            @contextmanager
            def duckdb_session(self):
                # Hand out the shared in-memory connection; the test owns
                # its lifetime, so nothing is closed when the session exits.
                yield real_conn

        fake_access = FakeAccess()

        output_dir = tmp_path / "out"
        output_dir.mkdir()

        # Submit a query that selects from the in-memory table (not a real
        # Keboola bucket — the test verifies the COPY/parquet/hash path,
        # not the extension behavior).
        result = kbe.materialize_query(
            table_id="example_subset",
            sql="SELECT * FROM t",
            keboola_access=fake_access,
            output_dir=output_dir,
        )

        parquet_path = output_dir / "example_subset.parquet"
        assert parquet_path.exists()
        assert result["table_id"] == "example_subset"
        assert result["path"] == str(parquet_path)
        assert result["rows"] == 2
        assert result["bytes"] > 0
        # MD5 of the bytes should match what we recompute.
        expected_md5 = hashlib.md5(parquet_path.read_bytes()).hexdigest()
        assert result["md5"] == expected_md5
    finally:
        # Release the DuckDB connection even when an assertion fails.
        real_conn.close()
|
|
|
|
|
|
def test_materialize_query_zero_rows_logs_warning(tmp_path, caplog):
    """Check that an empty result set reports zero rows and logs a warning.

    The source table is created with ``WHERE FALSE`` so it has a schema
    but no rows. With log capture at WARNING level, the captured text
    must mention either "0 rows" or "empty" (case-insensitive).
    """
    import duckdb
    from contextlib import contextmanager

    real_conn = duckdb.connect(":memory:")
    try:
        real_conn.execute("CREATE TABLE t AS SELECT 1 AS x WHERE FALSE")

        class FakeAccess:
            """Stand-in access object exposing only ``duckdb_session()``."""

            @contextmanager
            def duckdb_session(self):
                # Hand out the shared in-memory connection; the test owns
                # its lifetime, so nothing is closed when the session exits.
                yield real_conn

        output_dir = tmp_path / "out"
        output_dir.mkdir()

        with caplog.at_level("WARNING"):
            result = kbe.materialize_query(
                table_id="empty_subset",
                sql="SELECT * FROM t",
                keboola_access=FakeAccess(),
                output_dir=output_dir,
            )
        assert result["rows"] == 0
        assert "0 rows" in caplog.text or "empty" in caplog.text.lower()
    finally:
        # Release the DuckDB connection even when an assertion fails.
        real_conn.close()
|
|
|
|
|
|
def test_materialize_query_rejects_unsafe_table_id(tmp_path):
    """Defense: an unsafe table_id must be refused before any session is opened.

    table_id is interpolated into the parquet filename, so SQL/
    path-traversal-unsafe values have to be rejected up-front (mirror of
    BQ materialize_query's validation). The access double fails the test
    if its session is ever requested.
    """
    target_dir = tmp_path / "out"
    target_dir.mkdir()

    class _UntouchableAccess:
        """Access double whose session must never be requested."""

        def duckdb_session(self):
            raise AssertionError("should not be called")

    call_kwargs = {
        "table_id": "../../etc/passwd",
        "sql": "SELECT 1",
        "keboola_access": _UntouchableAccess(),
        "output_dir": target_dir,
    }

    with pytest.raises(ValueError, match="table_id"):
        kbe.materialize_query(**call_kwargs)
|