agnes-the-ai-analyst/tests/test_journey_sync_query.py
ZdenekSrotyr 8030a867ec fix(admin-api): keep source_type validator permissive when primary is 'local' (bootstrap)
The strict source_type-availability validator from the prior commit
broke ~12 existing tests that register tables on the default test
instance (where `data_source.type` resolves to 'local' since no
instance.yaml is loaded).

The intent of the validator is to catch *explicit* misconfig:
`type=bigquery` instance + `source_type=keboola` payload with no
`data_source.keboola.*` block. The bootstrap workflow — admin sets up
a fresh instance and registers a few tables before pointing at a real
source — should not be gated here.

Loosen the check: when `get_data_source_type()` returns 'local' (the
fallback when no `data_source.type` is set), skip the rejection. The
explicit mismatch case still 422s because that path resolves
`configured_primary` to a real source type.

Also adds an autouse keboola_instance fixture to test_journey_sync_query.py
which exercises Keboola registrations through the full sync→query
flow — the fixture documents the test's data-source assumption rather
than relying on the bootstrap escape hatch.
2026-05-01 23:09:15 +02:00

172 lines
5.6 KiB
Python

"""J2 — Sync & Query journey tests.
Complete flow: register table → create mock extract → rebuild orchestrator →
query data via API → verify catalog listing.
"""
import pytest
from tests.conftest import create_mock_extract
def _auth(token: str) -> dict:
return {"Authorization": f"Bearer {token}"}
@pytest.fixture(autouse=True)
def _keboola_instance(monkeypatch):
    """Make every test in this module run against a Keboola-typed instance config.

    Replaces ``app.instance_config.load_instance_config`` with a stub that
    returns a config whose ``data_source.type`` is ``'keboola'`` and which
    carries a ``data_source.keboola`` block. Tests here POST
    ``source_type='keboola'`` payloads to the register-table endpoint, so the
    fixture states that data-source assumption explicitly instead of relying
    on whatever the unconfigured default test instance resolves to.

    ``reset_cache()`` is called before and after the test body — presumably
    to drop any instance config cached from the real loader / from the stub
    (NOTE(review): confirm against ``app.instance_config``).
    """
    keboola_settings = {
        "stack_url": "https://connection.keboola.com",
        "project_id": "1234",
        "token_env": "KEBOOLA_STORAGE_TOKEN",
    }
    stub_config = {
        "data_source": {
            "type": "keboola",
            "keboola": keboola_settings,
        },
    }

    def _load_stub_config():
        # Same object on every call, exactly like the original lambda.
        return stub_config

    monkeypatch.setattr(
        "app.instance_config.load_instance_config",
        _load_stub_config,
        raising=False,
    )

    # Imported only after the patch is installed, mirroring the original order.
    from app.instance_config import reset_cache

    reset_cache()
    yield
    reset_cache()
@pytest.mark.journey
class TestSyncAndQuery:
    """Journey tests for the register → extract → rebuild → query flow.

    Every test uses the ``seeded_app`` fixture, from which it reads
    ``client`` (HTTP test client), ``admin_token`` and, where a rebuild is
    needed, ``env["analytics_db"]``. Tests that need data on disk create it
    through the ``mock_extract_factory`` fixture.
    """

    def test_register_create_rebuild_query(self, seeded_app, mock_extract_factory):
        """Full flow: register → mock extract → rebuild → query rows."""
        c = seeded_app["client"]
        t = seeded_app["admin_token"]
        env = seeded_app["env"]

        # Step 1: register table
        resp = c.post(
            "/api/admin/register-table",
            json={
                "name": "orders",
                "source_type": "keboola",
                "bucket": "in.c-crm",
                "source_table": "orders",
                "query_mode": "local",
            },
            headers=_auth(t),
        )
        assert resp.status_code == 201

        # Step 2: create mock extract
        mock_extract_factory(
            "keboola",
            [
                {
                    "name": "orders",
                    "data": [
                        {"id": "1", "product": "Widget", "amount": "100"},
                        {"id": "2", "product": "Gadget", "amount": "200"},
                    ],
                }
            ],
        )

        # Step 3: rebuild orchestrator
        from src.orchestrator import SyncOrchestrator

        result = SyncOrchestrator(analytics_db_path=env["analytics_db"]).rebuild()
        assert "keboola" in result
        assert "orders" in result["keboola"]

        # Step 4: query data
        resp = c.post(
            "/api/query",
            json={"sql": "SELECT * FROM orders ORDER BY id"},
            headers=_auth(t),
        )
        assert resp.status_code == 200
        body = resp.json()
        assert body["row_count"] == 2
        assert "id" in body["columns"]

    def test_catalog_lists_registered_table(self, seeded_app):
        """After registration, table appears in /api/catalog/tables."""
        c = seeded_app["client"]
        t = seeded_app["admin_token"]

        resp = c.post(
            "/api/admin/register-table",
            json={"name": "customers", "source_type": "keboola", "query_mode": "local"},
            headers=_auth(t),
        )
        # Fail here, with the server's explanation, if registration itself was
        # rejected — otherwise the catalog assertion below fails with a
        # misleading "customers not in names" message.
        assert resp.status_code == 201, resp.text

        resp = c.get("/api/catalog/tables", headers=_auth(t))
        assert resp.status_code == 200
        names = {tbl["name"] for tbl in resp.json()["tables"]}
        assert "customers" in names

    def test_query_blocked_keywords(self, seeded_app):
        """DROP and other DDL/dangerous statements are blocked."""
        c = seeded_app["client"]
        t = seeded_app["admin_token"]

        # One DDL statement, one DML write, and one direct file read.
        for bad_sql in [
            "DROP TABLE orders",
            "INSERT INTO orders VALUES (1)",
            "SELECT * FROM read_parquet('/tmp/x.parquet')",
        ]:
            resp = c.post("/api/query", json={"sql": bad_sql}, headers=_auth(t))
            assert resp.status_code == 400, f"Expected 400 for: {bad_sql}"

    def test_manifest_reflects_synced_tables(self, seeded_app, mock_extract_factory):
        """After rebuild, manifest includes synced table with correct row count."""
        c = seeded_app["client"]
        t = seeded_app["admin_token"]
        env = seeded_app["env"]

        mock_extract_factory(
            "keboola",
            [
                {
                    "name": "products",
                    "data": [
                        {"id": "1", "name": "Alpha"},
                        {"id": "2", "name": "Beta"},
                        {"id": "3", "name": "Gamma"},
                    ],
                }
            ],
        )

        from src.orchestrator import SyncOrchestrator

        SyncOrchestrator(analytics_db_path=env["analytics_db"]).rebuild()

        resp = c.get("/api/sync/manifest", headers=_auth(t))
        assert resp.status_code == 200
        tables = resp.json()["tables"]
        assert "products" in tables
        assert tables["products"]["rows"] == 3

    def test_query_empty_result(self, seeded_app, mock_extract_factory):
        """A query whose WHERE clause matches no rows returns an empty result set.

        The table is seeded with a single row (``id = '1'``); the query
        filters on an id that does not exist, so ``row_count`` must be 0
        while the request itself still succeeds.
        """
        c = seeded_app["client"]
        t = seeded_app["admin_token"]
        env = seeded_app["env"]

        mock_extract_factory(
            "keboola",
            [{"name": "empty_table", "data": [{"id": "1", "val": "x"}]}],
        )

        from src.orchestrator import SyncOrchestrator

        SyncOrchestrator(analytics_db_path=env["analytics_db"]).rebuild()

        resp = c.post(
            "/api/query",
            json={"sql": "SELECT * FROM empty_table WHERE id = 'nonexistent'"},
            headers=_auth(t),
        )
        assert resp.status_code == 200
        assert resp.json()["row_count"] == 0