agnes-the-ai-analyst/tests/test_sync_trigger_keboola_materialized.py

"""Scheduler-level test: when a Keboola row has query_mode='materialized',
_run_materialized_pass dispatches to connectors.keboola.extractor.materialize_query
(not BQ's). Existing BQ-materialized rows continue using BqAccess.

Mirrors the unit-style of tests/test_sync_trigger_materialized.py — patches
the inner extractor entry points instead of going through the API layer.
"""
import duckdb
import pytest
from contextlib import contextmanager
from pathlib import Path
from unittest.mock import patch, MagicMock

from src.db import _ensure_schema
from src.repositories.table_registry import TableRegistryRepository
from connectors.bigquery.access import BqAccess, BqProjects


@pytest.fixture
def system_db(tmp_path, monkeypatch):
    """Yield a DuckDB connection to a fresh system database.

    The database file is created at ``tmp_path / "system.duckdb"``,
    ``_ensure_schema`` is run on the connection, and ``DATA_DIR`` is pointed
    at ``tmp_path / "data"`` for the duration of the test. The connection is
    closed on teardown.
    """
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    # pytest does not run the post-yield part of a fixture whose setup raised,
    # so guard setup and yield together: the connection is released even when
    # schema creation or the env patch fails.
    try:
        _ensure_schema(conn)
        monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
        yield conn
    finally:
        conn.close()


@pytest.fixture
def stub_bq():
    """Return a BqAccess built only from local stand-ins.

    The client factory hands back a ``MagicMock`` and each DuckDB session is
    a throwaway in-memory connection that is closed when the session exits.
    """

    def _mock_client(_p):
        return MagicMock()

    @contextmanager
    def _memory_session(_p):
        session = duckdb.connect(":memory:")
        try:
            yield session
        finally:
            session.close()

    projects = BqProjects(billing="t", data="t")
    return BqAccess(
        projects,
        client_factory=_mock_client,
        duckdb_session_factory=_memory_session,
    )


def test_run_materialized_pass_dispatches_keboola_to_keboola_extractor(
    system_db, stub_bq, tmp_path, monkeypatch
):
    """A Keboola row registered with query_mode='materialized' must reach the
    Keboola materialize_query and must not reach the BigQuery one."""
    registry = TableRegistryRepository(system_db)
    registry.register(
        id="orders_recent",
        name="orders_recent",
        source_type="keboola",
        query_mode="materialized",
        source_query='SELECT * FROM kbc."in.c-sales"."orders" WHERE 1=1',
        sync_schedule="every 1m",  # always due
    )

    # Env var + instance.yaml-shaped config values so the Keboola lazy-init
    # has a URL and a token to pick up.
    monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token")
    from app.api import sync as sync_mod

    # Canned answers for the config paths this test provides; everything else
    # falls through to the caller-supplied default.
    canned_values = (
        (("data_source", "keboola", "url"), "https://connection.keboola.com/"),
        (("data_source", "keboola", "token_env"), "KEBOOLA_STORAGE_TOKEN"),
    )
    byte_cap_path = ("data_source", "bigquery", "max_bytes_per_materialize")

    def _fake_get_value(*keys, default=None):
        for config_path, value in canned_values:
            if keys == config_path:
                return value
        if keys == byte_cap_path:
            # A missing default is reported as 0 for the byte cap.
            return 0 if default is None else default
        return default

    # The Keboola materialize call is mocked and writes nothing, so drop a
    # placeholder parquet in place for the hash bookkeeping.
    parquet_dir = tmp_path / "data" / "extracts" / "keboola" / "data"
    parquet_dir.mkdir(parents=True, exist_ok=True)
    parquet_file = parquet_dir / "orders_recent.parquet"
    parquet_file.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")

    bq_called = MagicMock()
    kb_result = {
        "table_id": "orders_recent",
        "rows": 1,
        "bytes": 100,
        "md5": "abc123",
        "path": str(parquet_file),
    }
    kb_called = MagicMock(return_value=kb_result)

    config_patch = patch("app.instance_config.get_value", _fake_get_value)
    bq_patch = patch("connectors.bigquery.extractor.materialize_query", bq_called)
    kb_patch = patch("connectors.keboola.extractor.materialize_query", kb_called)
    with config_patch, bq_patch, kb_patch:
        summary = sync_mod._run_materialized_pass(system_db, stub_bq)

    assert kb_called.called, "Keboola materialize_query was not invoked"
    assert not bq_called.called, (
        "BQ materialize_query was wrongly invoked for a Keboola row"
    )
    materialized_ids = summary["materialized"]
    assert "orders_recent" in materialized_ids


def test_run_materialized_pass_dispatches_bigquery_to_bq_extractor(
    system_db, stub_bq, tmp_path
):
    """Regression guard: a BigQuery materialized row still goes through
    ``app.api.sync._materialize_table`` and never touches the Keboola
    materialize_query."""
    registry = TableRegistryRepository(system_db)
    registry.register(
        id="events_summary",
        name="events_summary",
        source_type="bigquery",
        query_mode="materialized",
        source_query="SELECT date, COUNT(*) FROM `proj.dataset.events` GROUP BY 1",
        sync_schedule="every 1m",
    )

    # Placeholder parquet at the BigQuery extract location for this table id.
    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
    parquet_dir.mkdir(parents=True, exist_ok=True)
    parquet_file = parquet_dir / "events_summary.parquet"
    parquet_file.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")

    bq_result = {
        "rows": 1,
        "size_bytes": 100,
        "query_mode": "materialized",
        "hash": "abc123",
    }
    bq_called = MagicMock(return_value=bq_result)
    kb_called = MagicMock()

    from app.api import sync as sync_mod

    bq_patch = patch("app.api.sync._materialize_table", bq_called)
    kb_patch = patch("connectors.keboola.extractor.materialize_query", kb_called)
    with bq_patch, kb_patch:
        summary = sync_mod._run_materialized_pass(system_db, stub_bq)

    assert bq_called.called
    assert not kb_called.called
    materialized_ids = summary["materialized"]
    assert "events_summary" in materialized_ids