Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors.

Backend (BigQuery + Keboola, schema v20):
- schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19)
- connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state
- connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc
- connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (the Keboola Storage API has a different cost model); the legacy bucket-download path skips query_mode='materialized' rows
- app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query
- app/api/admin.py: RegisterTableRequest accepts source_query; model_validator keeps mode↔source_query↔bucket coherent; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into the GET /server-config payload

Operator + CLI surface:
- da admin register-table --query / --query-mode materialized
- scripts/smoke-test-materialized-bq.sh — end-to-end smoke test for operators

Tests (incl. spike + integration + regression):
- test_db_migration_v20, test_table_registry_source_query
- test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips
- test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds)
- test_sync_trigger_materialized, test_sync_trigger_keboola_materialized
- test_api_admin_materialized, test_cli_admin_materialized
- test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e

Cost: BQ uses bigquery_query() (jobs API, view-aware) — works uniformly on tables, views, and materialized views. Keboola uses ATTACH + COPY to parquet through the DuckDB extension.
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
"""Repository round-trips source_query column for query_mode='materialized'.

Lives alongside the schema-v20 migration: register() now accepts source_query
as an Optional[str] kwarg, and the column flows through SELECT * via
list_all()/get().
"""
|
|
import duckdb
|
|
import pytest
|
|
|
|
from src.db import _ensure_schema
|
|
from src.repositories.table_registry import TableRegistryRepository
|
|
|
|
|
|
@pytest.fixture
def repo(tmp_path):
    """Yield a TableRegistryRepository backed by a fresh on-disk DuckDB file.

    The database file is created under pytest's per-test ``tmp_path`` and
    ``_ensure_schema`` is run on the connection before the repository is
    handed to the test.

    The connection is closed on teardown (and also if schema setup fails),
    so the file handle is released before pytest removes the temp directory.
    """
    conn = duckdb.connect(str(tmp_path / "system.duckdb"))
    try:
        _ensure_schema(conn)
        yield TableRegistryRepository(conn)
    finally:
        # Always release the DuckDB handle, even when the test body raised.
        conn.close()
|
|
|
|
|
|
def test_register_persists_source_query(repo):
    """A materialized registration round-trips source_query through get()."""
    table_id = "orders_90d"
    rolling_window_sql = (
        "SELECT date, SUM(revenue) FROM `prj.ds.orders` "
        "WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) GROUP BY 1"
    )

    repo.register(
        id=table_id,
        name=table_id,
        source_type="bigquery",
        query_mode="materialized",
        source_query=rolling_window_sql,
        sync_schedule="every 6h",
    )

    stored = repo.get(table_id)

    assert stored is not None
    assert stored["query_mode"] == "materialized"
    # A distinctive fragment is enough to show the SQL text was persisted.
    assert "INTERVAL 90 DAY" in stored["source_query"]
    assert stored["sync_schedule"] == "every 6h"
|
|
|
|
|
|
def test_register_omitted_source_query_stays_null(repo):
    """Registering without source_query must store NULL, not an empty string.

    Covers the default (Keboola, query_mode='local') registration path.
    """
    registration = {
        "id": "t1",
        "name": "t1",
        "source_type": "keboola",
        "query_mode": "local",
    }
    repo.register(**registration)

    stored = repo.get("t1")

    assert stored is not None
    assert stored["source_query"] is None
|
|
|
|
|
|
def test_list_all_includes_source_query(repo):
    """Rows returned by list_all() carry the source_query column."""
    repo.register(
        id="m1",
        name="m1",
        source_type="bigquery",
        query_mode="materialized",
        source_query="SELECT 1",
    )

    listed = repo.list_all()

    assert len(listed) == 1
    only_row = listed[0]
    assert only_row["source_query"] == "SELECT 1"
|
|
|
|
|
|
def test_register_updates_source_query_on_conflict(repo):
    """Re-registering the same id must overwrite source_query (admin edit).

    Registers id ``m1`` twice with different SQL and checks that the second
    query is the one returned by get().
    """
    for sql in ("SELECT 1", "SELECT 2"):
        repo.register(
            id="m1",
            name="m1",
            source_type="bigquery",
            query_mode="materialized",
            source_query=sql,
        )

    row = repo.get("m1")

    # Guard before subscripting so a missing row fails as a clear assertion
    # rather than a TypeError on None.
    assert row is not None
    assert row["source_query"] == "SELECT 2"
|
|
|
|
|
|
def test_register_preserves_registered_at_when_supplied(repo):
    """source_query addition must not break the existing registered_at
    preservation contract (admin edits keep the original timestamp).

    Registers id ``t`` twice, passing the same explicit ``registered_at`` both
    times, and checks that the updated row still carries that date.
    """
    from datetime import datetime, timezone

    original = datetime(2026, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
    for sql in ("SELECT 1", "SELECT 2"):
        repo.register(
            id="t",
            name="t",
            source_type="bigquery",
            query_mode="materialized",
            source_query=sql,
            registered_at=original,
        )

    row = repo.get("t")

    # Guard before subscripting so a missing row fails as a clear assertion
    # rather than a TypeError on None.
    assert row is not None
    assert row["source_query"] == "SELECT 2"
    # Don't assert exact equality on naive vs aware (DuckDB strips tz);
    # just confirm the year+month+day didn't slide forward to 'now'.
    stamped = row["registered_at"]
    assert stamped.year == 2026
    assert stamped.month == 1
    assert stamped.day == 1
|