agnes-the-ai-analyst/tests/test_sync_trigger_keboola_materialized.py

"""Scheduler-level test: when a Keboola row has query_mode='materialized',
_run_materialized_pass dispatches to connectors.keboola.extractor.materialize_query
(not BQ's). Existing BQ-materialized rows continue using BqAccess.

Mirrors the unit-style of tests/test_sync_trigger_materialized.py — patches
the inner extractor entry points instead of going through the API layer.
"""
import duckdb
import pytest
from contextlib import contextmanager
from pathlib import Path
from unittest.mock import patch, MagicMock

from src.db import _ensure_schema
from src.repositories.table_registry import TableRegistryRepository
from connectors.bigquery.access import BqAccess, BqProjects


@pytest.fixture
def system_db(tmp_path, monkeypatch):
    db_path = tmp_path / "system.duckdb"
    conn = duckdb.connect(str(db_path))
    _ensure_schema(conn)
    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
    yield conn
    conn.close()


@pytest.fixture
def stub_bq():
    @contextmanager
    def _session(_p):
        conn = duckdb.connect(":memory:")
        try:
            yield conn
        finally:
            conn.close()

    return BqAccess(
        BqProjects(billing="t", data="t"),
        client_factory=lambda _p: MagicMock(),
        duckdb_session_factory=_session,
    )


def test_run_materialized_pass_dispatches_keboola_to_keboola_extractor(
    system_db, stub_bq, tmp_path, monkeypatch
):
    """Keboola row with query_mode='materialized' must invoke the Keboola
    materialize_query, not the BQ one."""
    repo = TableRegistryRepository(system_db)
    repo.register(
        id="orders_recent", name="orders_recent",
        source_type="keboola", query_mode="materialized",
        source_query='SELECT * FROM kbc."in.c-sales"."orders" WHERE 1=1',
        sync_schedule="every 1m",  # always due
    )

    # Provide instance.yaml-shape config + env so the Keboola lazy-init succeeds.
    monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token")
    from app.api import sync as sync_mod

    # Patch get_value to return the keboola URL/token_env.
    def _fake_get_value(*keys, default=None):
        path = keys
        if path == ("data_source", "keboola", "stack_url"):
            return "https://connection.keboola.com/"
        if path == ("data_source", "keboola", "token_env"):
            return "KEBOOLA_STORAGE_TOKEN"
        if path == ("data_source", "bigquery", "max_bytes_per_materialize"):
            return default if default is not None else 0
        return default

    # Pre-create the parquet for hash bookkeeping (kb materialize is patched
    # so it won't write a real one).
    parquet_dir = tmp_path / "data" / "extracts" / "keboola" / "data"
    parquet_dir.mkdir(parents=True, exist_ok=True)
    (parquet_dir / "orders_recent.parquet").write_bytes(
        b"PAR1" + b"\x00" * 16 + b"PAR1"
    )

    bq_called = MagicMock()
    kb_called = MagicMock(return_value={
        "table_id": "orders_recent", "rows": 1, "bytes": 100,
        "md5": "abc123", "path": str(parquet_dir / "orders_recent.parquet"),
    })

    with patch("app.instance_config.get_value", _fake_get_value), \
         patch("connectors.bigquery.extractor.materialize_query", bq_called), \
         patch("connectors.keboola.extractor.materialize_query", kb_called):
        summary = sync_mod._run_materialized_pass(system_db, stub_bq)

    assert kb_called.called, "Keboola materialize_query was not invoked"
    assert not bq_called.called, (
        "BQ materialize_query was wrongly invoked for a Keboola row"
    )
    assert "orders_recent" in summary["materialized"]


def test_run_materialized_pass_dispatches_bigquery_to_bq_extractor(
    system_db, stub_bq, tmp_path
):
    """Regression: BQ-materialized path keeps working unchanged."""
    repo = TableRegistryRepository(system_db)
    repo.register(
        id="events_summary", name="events_summary",
        source_type="bigquery", query_mode="materialized",
        source_query="SELECT date, COUNT(*) FROM `proj.dataset.events` GROUP BY 1",
        sync_schedule="every 1m",
    )

    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
    parquet_dir.mkdir(parents=True, exist_ok=True)
    (parquet_dir / "events_summary.parquet").write_bytes(
        b"PAR1" + b"\x00" * 16 + b"PAR1"
    )

    bq_called = MagicMock(return_value={
        "rows": 1, "size_bytes": 100, "query_mode": "materialized",
        "hash": "abc123",
    })
    kb_called = MagicMock()

    from app.api import sync as sync_mod

    with patch("app.api.sync._materialize_table", bq_called), \
         patch("connectors.keboola.extractor.materialize_query", kb_called):
        summary = sync_mod._run_materialized_pass(system_db, stub_bq)

    assert bq_called.called
    assert not kb_called.called
    assert "events_summary" in summary["materialized"]


def test_run_sync_runs_materialized_pass_on_keboola_only_instance(
    system_db, tmp_path, monkeypatch
):
    """Devin finding 2026-05-01 (BUG_pr-review-job-3fbd31c9_0001):
    on a Keboola-only instance (no data_source.bigquery.project), the
    materialized pass must still run so Keboola materialized rows get
    processed. Pre-fix, _run_sync gated the entire pass behind
    `if bq_project:` and silently skipped Keboola materialized."""
    from app.api import sync as sync_mod

    # Register a Keboola materialized row.
    repo = TableRegistryRepository(system_db)
    repo.register(
        id="kb_aggregated",
        name="kb_aggregated",
        source_type="keboola",
        query_mode="materialized",
        source_query="SELECT 1 AS x",
        registered_by="admin@test",
    )

    # Stub the Keboola materialize entry — verifies dispatch reached it.
    kb_called = MagicMock(return_value={
        "table_id": "kb_aggregated",
        "path": str(tmp_path / "kb_aggregated.parquet"),
        "rows": 1,
        "bytes": 100,
        "md5": "abc",
    })

    # Pretend we're on Keboola-only — empty BQ project. The sentinel
    # BqAccess will be constructed but never invoked because no BQ row
    # is in registry. Patch get_value to mirror Keboola-only config.
    def _fake_get_value(*keys, default=None):
        if keys == ("data_source", "bigquery", "project"):
            return ""  # KEY: no BQ project configured
        if keys == ("data_source", "keboola", "stack_url"):
            return "https://connection.keboola.com/"
        if keys == ("data_source", "keboola", "token_env"):
            return "KEBOOLA_STORAGE_TOKEN"
        if keys == ("data_source", "bigquery", "max_bytes_per_materialize"):
            return 0
        return default

    monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token")
    monkeypatch.setattr("app.instance_config.get_value", _fake_get_value)

    # Pre-create the parquet file so the post-materialize hash bookkeeping
    # in _run_materialized_pass doesn't ENOENT.
    parquet_dir = Path(tmp_path) / "data" / "extracts" / "keboola" / "data"
    parquet_dir.mkdir(parents=True, exist_ok=True)
    (parquet_dir / "kb_aggregated.parquet").write_bytes(
        b"PAR1" + b"\x00" * 16 + b"PAR1"
    )

    with patch("connectors.keboola.extractor.materialize_query", kb_called):
        # Use the public entry point _run_materialized_pass with a
        # sentinel bq (None or a BqAccess that errors on .client()).
        # The Keboola dispatch branch never touches `bq`, so even None works.
        # We construct a minimal BqAccess so the BQ branch (if any row went
        # through it) would surface a typed error per-row.
        @contextmanager
        def _err_session(_p):
            raise RuntimeError("BQ not configured — should not be called for Keboola-only")
            yield  # unreachable

        sentinel_bq = BqAccess(
            BqProjects(billing="", data=""),
            client_factory=lambda _p: (_ for _ in ()).throw(RuntimeError("not configured")),
            duckdb_session_factory=_err_session,
        )

        summary = sync_mod._run_materialized_pass(system_db, sentinel_bq)

    # Critical assertion: Keboola materialize was actually invoked.
    assert kb_called.called, (
        "Keboola materialize_query was not called on Keboola-only instance — "
        "the bq_project gate in _run_sync would have skipped this entirely."
    )
    assert "kb_aggregated" in summary["materialized"]