Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors.

Backend (BigQuery + Keboola, schema v20):
- schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19)
- connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state
- connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc
- connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (Keboola Storage API has a different cost model); legacy bucket-download path skips query_mode='materialized' rows
- app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query
- app/api/admin.py: RegisterTableRequest accepts source_query; model_validator enforces coherence of mode↔source_query↔bucket; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into GET /server-config payload

Operator + CLI surface:
- da admin register-table --query / --query-mode materialized
- scripts/smoke-test-materialized-bq.sh — end-to-end smoke for operators

Tests (incl. spike + integration + regression):
- test_db_migration_v20, test_table_registry_source_query
- test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips
- test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds)
- test_sync_trigger_materialized, test_sync_trigger_keboola_materialized
- test_api_admin_materialized, test_cli_admin_materialized
- test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e

Cost: BQ uses bigquery_query() (jobs API, view-aware) — works on tables, views, materialized views uniformly. Keboola uses ATTACH+COPY parquet through the DuckDB extension.
75 lines
2.7 KiB
Python
75 lines
2.7 KiB
Python
"""Tests for KeboolaAccess facade."""
|
|
import os
|
|
import pytest
|
|
from connectors.keboola.access import KeboolaAccess
|
|
|
|
|
|
def test_access_session_yields_attached_duckdb(tmp_path, monkeypatch):
    """Check the SQL that KeboolaAccess issues while opening a session.

    ``duckdb.connect`` is replaced with a stub whose connections record
    every statement passed to ``execute``. After the session context has
    been entered and left, the recorded SQL must contain the INSTALL,
    LOAD and ATTACH ... TYPE keboola statements, and the configured
    token must appear in it.
    """
    import duckdb

    recorded_sql = []

    class _StubResult:
        """Canned result: no rows from fetchall, a single zero from fetchone."""

        def fetchall(self):
            return []

        def fetchone(self):
            return (0,)

    class FakeConn:
        """Connection stand-in that records each statement it is given."""

        def execute(self, sql, *args, **kwargs):
            recorded_sql.append(sql)
            return _StubResult()

        def close(self):
            pass

    def _connect_stub(*args, **kwargs):
        # Every connect call, whatever its arguments, gets a fresh fake.
        return FakeConn()

    monkeypatch.setattr(duckdb, "connect", _connect_stub)

    access = KeboolaAccess(
        url="https://connection.keboola.com/",
        token="fake-token-xyz",
    )
    with access.duckdb_session() as session:
        assert session is not None

    # The install + load + attach sequence, plus the token itself (which
    # contains no quote, so it shows up verbatim), must all be present.
    transcript = "\n".join(recorded_sql)
    expected_fragments = (
        "INSTALL keboola",
        "LOAD keboola",
        "ATTACH",
        "TYPE keboola",
        "fake-token-xyz",
    )
    for fragment in expected_fragments:
        assert fragment in transcript
|
|
|
|
|
|
def test_access_escapes_single_quote_in_token(monkeypatch):
    """Guard against a single quote in the token breaking the ATTACH literal.

    SQL injection here is non-trivial because the token is admin-supplied
    at instance config time, but it should be escaped anyway. The test
    swaps ``duckdb.connect`` for a recording stub, opens a session with a
    token containing ``'``, and checks that the ATTACH statement carries
    the quote doubled, per SQL string-literal escaping.
    """
    issued = []

    class FakeResult:
        """Canned result object returned by every ``execute`` call."""

        def fetchall(self):
            return []

        def fetchone(self):
            return (0,)

    class FakeConn:
        """Connection stand-in that records each statement it is given."""

        def execute(self, sql, *args, **kwargs):
            issued.append(sql)
            return FakeResult()

        def close(self):
            pass

    import duckdb

    monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn())

    acc = KeboolaAccess(url="x", token="bad'token")
    # Only the side effect of opening the session matters; the yielded
    # connection itself is not used.
    with acc.duckdb_session():
        pass

    # Use a default so a missing ATTACH fails with a readable assertion
    # message instead of an opaque StopIteration from next().
    attach_sql = next((s for s in issued if "ATTACH" in s), None)
    assert attach_sql is not None, f"no ATTACH statement issued; saw: {issued!r}"
    # Doubled single-quote per SQL string-literal escaping.
    assert "bad''token" in attach_sql
|
|
|
|
|
|
def test_access_real_attach_when_creds_present(tmp_path):
    """Live smoke test, run only when KBC_TEST_URL and KBC_TEST_TOKEN are set.

    Opens a session with the credentials from the environment and checks
    that a database named 'kbc' is listed by ``duckdb_databases()``.
    Skipped when either variable is missing or empty.
    """
    kbc_url = os.getenv("KBC_TEST_URL")
    kbc_token = os.getenv("KBC_TEST_TOKEN")
    if not kbc_url or not kbc_token:
        pytest.skip("Keboola creds not provided")

    access = KeboolaAccess(url=kbc_url, token=kbc_token)
    with access.duckdb_session() as session:
        # A successful ATTACH should make the 'kbc' alias visible among
        # the databases DuckDB reports.
        result = session.execute("SELECT name FROM duckdb_databases()")
        database_names = [record[0] for record in result.fetchall()]
        assert "kbc" in database_names
|