agnes-the-ai-analyst/tests/test_query_remote_rewrite.py

"""Unit tests for ``_rewrite_user_sql_for_bigquery_query``.

The helper rewrites user SQL referencing query_mode='remote' BigQuery
tables so the entire query ships to BQ via the DuckDB BQ extension's
``bigquery_query(<project>, <sql>)`` UDF — engaging WHERE / SELECT /
LIMIT predicate pushdown instead of falling through to ATTACH-catalog
mode (which opens a Storage Read API session over the whole table).

These tests pin down each conservative-skip rule plus the happy-path
rewrites. Edge cases (CTE shadowing, double-wrap, mixed-source JOIN)
are intentionally explicit so a future refactor doesn't quietly
loosen the guard.
"""
from __future__ import annotations

import pytest


# ---------------------------------------------------------------------------
# Test infrastructure: an in-memory DuckDB seeded with table_registry rows
# matching the shapes the production registry produces. Avoids the full app
# bootstrap path; the rewriter only needs ``conn.execute("SELECT * FROM
# table_registry ...")`` to resolve names.
# ---------------------------------------------------------------------------


@pytest.fixture
def seeded_registry(tmp_path, monkeypatch):
    """Build a fresh ``system.duckdb`` in tmp_path with the schema migrated.

    Returns the open connection so tests can pass it to the rewriter.
    Cleanup is automatic via tmp_path teardown — but we close the
    open singleton handle first so a different DATA_DIR in the next
    test doesn't see the previous tmp's lock.
    """
    from src.db import get_system_db, close_system_db

    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    (tmp_path / "state").mkdir(parents=True, exist_ok=True)
    close_system_db()
    conn = get_system_db()
    yield conn
    close_system_db()


def _register_bq_remote(conn, *, table_id, name, bucket, source_table):
    from src.repositories.table_registry import TableRegistryRepository
    TableRegistryRepository(conn).register(
        id=table_id,
        name=name,
        source_type="bigquery",
        bucket=bucket,
        source_table=source_table,
        query_mode="remote",
    )


def _register_local(conn, *, table_id, name, source_type="keboola"):
    from src.repositories.table_registry import TableRegistryRepository
    TableRegistryRepository(conn).register(
        id=table_id,
        name=name,
        source_type=source_type,
        bucket="bkt",
        source_table=name,
        query_mode="local",
    )


def _set_bq_project(monkeypatch, project="test-prj"):
    """Stub get_bq_access so the rewriter sees a real-looking project ID."""
    from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access
    bq = BqAccess(
        BqProjects(billing=project, data=project),
        client_factory=lambda projects: object(),
    )
    monkeypatch.setattr(
        "app.api.query.get_bq_access",
        lambda: bq,
        raising=False,
    )
    get_bq_access.cache_clear()


# ---------------------------------------------------------------------------
# Happy-path rewrites
# ---------------------------------------------------------------------------


def test_simple_select_where_against_one_bq_table_rewrites(seeded_registry, monkeypatch):
    """Single-table SELECT-WHERE against a registered BQ remote row →
    full SQL wrapped in ``bigquery_query('project', '<rewritten>')``.
    The bare-name reference gets translated to BQ-native backtick form."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        "SELECT count(*) FROM ue WHERE event_date = '2026-01-01'",
        seeded_registry,
    )

    assert did_rewrite is True
    # Outer wrap must be a single bigquery_query() FROM-source.
    assert "bigquery_query(" in rewritten
    assert "test-prj" in rewritten
    # Inner SQL: bare name rewritten to backticked BQ-native path.
    assert "`test-prj.fin.ue`" in rewritten
    # Inner SQL is dollar-quoted (`$bqq_inner$ ... $bqq_inner$`), so
    # single quotes inside the WHERE predicate remain literal — no
    # doubling, no backslash escaping. Verifies the safer embedding form
    # introduced after the code review caught naive single-quote-only
    # escape doubling missing DuckDB backslash sequences.
    assert "$bqq_inner$" in rewritten
    assert "event_date = '2026-01-01'" in rewritten


def test_direct_bq_path_rewrites(seeded_registry, monkeypatch):
    """User wrote the direct ``bq."ds"."tbl"`` form. The rewriter must
    still translate to BQ-native backtick form before wrapping."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        'SELECT * FROM bq."fin"."ue" LIMIT 10',
        seeded_registry,
    )

    assert did_rewrite is True
    assert "bigquery_query(" in rewritten
    assert "`test-prj.fin.ue`" in rewritten
    # Original duckdb-flavor path must NOT remain (it'd parse-fail under BQ).
    assert 'bq."fin"."ue"' not in rewritten


def test_cte_referencing_bq_table_rewrites_inside_cte(seeded_registry, monkeypatch):
    """A WITH clause whose body references a BQ table must rewrite that
    inner reference; the wrapping happens at the top level so BQ sees a
    valid BQ-flavor CTE."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.orders", name="orders",
                        bucket="fin", source_table="orders")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        "WITH x AS (SELECT id FROM orders WHERE total > 0) SELECT count(*) FROM x",
        seeded_registry,
    )
    assert did_rewrite is True
    # Inner reference is rewritten.
    assert "`test-prj.fin.orders`" in rewritten
    # The whole thing is wrapped — bigquery_query is the outermost FROM.
    assert rewritten.lower().count("bigquery_query(") == 1


def test_subquery_referencing_bq_table_rewrites(seeded_registry, monkeypatch):
    """Subquery in FROM position — same handling as a CTE: rewrite the
    inner table reference, wrap the whole at the top."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        "SELECT s.cnt FROM (SELECT count(*) AS cnt FROM ue) s",
        seeded_registry,
    )
    assert did_rewrite is True
    assert "`test-prj.fin.ue`" in rewritten
    assert rewritten.lower().count("bigquery_query(") == 1


def test_multiple_bq_tables_one_project_combine(seeded_registry, monkeypatch):
    """Two registered BQ tables in the same project → single
    ``bigquery_query()`` wraps the whole SQL with both refs rewritten
    inline. No separate parallel calls."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.orders", name="orders",
                        bucket="fin", source_table="orders")
    _register_bq_remote(seeded_registry, table_id="bq.fin.users", name="users",
                        bucket="fin", source_table="users")
    _set_bq_project(monkeypatch, "test-prj")

    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        "SELECT u.id, count(o.id) "
        "FROM users u JOIN orders o ON u.id = o.user_id "
        "GROUP BY u.id",
        seeded_registry,
    )
    assert did_rewrite is True
    # Both rewritten.
    assert "`test-prj.fin.users`" in rewritten
    assert "`test-prj.fin.orders`" in rewritten
    # Single wrap.
    assert rewritten.lower().count("bigquery_query(") == 1


# ---------------------------------------------------------------------------
# Conservative-skip cases
# ---------------------------------------------------------------------------


def test_join_bq_to_local_skips_rewrite(seeded_registry, monkeypatch):
    """A JOIN between a BQ table and a local-mode (Keboola/Jira) table
    is a cross-source query — wrapping it in bigquery_query() would lose
    the local table. The rewriter must fall through to the ATTACH-catalog
    path (slow but correct).
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _register_local(seeded_registry, table_id="kbc.in.local_orders",
                    name="local_orders")
    _set_bq_project(monkeypatch, "test-prj")

    user_sql = (
        "SELECT u.id, lo.total "
        "FROM ue u JOIN local_orders lo ON u.id = lo.user_id"
    )
    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        user_sql, seeded_registry,
    )
    assert did_rewrite is False
    assert rewritten == user_sql  # untouched


def test_no_bq_tables_passes_through(seeded_registry, monkeypatch):
    """User SQL referencing only local-source tables → no rewrite,
    no log spam, original SQL returned."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_local(seeded_registry, table_id="kbc.in.orders", name="orders")
    _set_bq_project(monkeypatch, "test-prj")

    user_sql = "SELECT * FROM orders WHERE id = 1"
    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        user_sql, seeded_registry,
    )
    assert did_rewrite is False
    assert rewritten == user_sql


def test_already_contains_bigquery_query_passes_through(seeded_registry, monkeypatch):
    """User SQL already calls bigquery_query() — never double-wrap.

    Note: the /api/query endpoint blocks ``bigquery_query`` in user SQL
    via the keyword denylist, so this scenario can't reach the rewriter
    in production today. Defensive guard for callers from other paths.
    """
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")
    _set_bq_project(monkeypatch, "test-prj")

    user_sql = (
        "SELECT * FROM bigquery_query('test-prj', 'SELECT * FROM `test-prj.fin.ue`')"
    )
    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        user_sql, seeded_registry,
    )
    assert did_rewrite is False
    assert rewritten == user_sql


def test_unconfigured_bq_project_skips(seeded_registry, monkeypatch):
    """If get_bq_access() is the not-configured sentinel (data=''),
    don't rewrite — there's no project to fill into bigquery_query()."""
    from app.api.query import _rewrite_user_sql_for_bigquery_query
    _register_bq_remote(seeded_registry, table_id="bq.fin.ue", name="ue",
                        bucket="fin", source_table="ue")

    # Override to sentinel (empty data project).
    from connectors.bigquery.access import BqAccess, BqProjects, get_bq_access
    monkeypatch.setattr(
        "app.api.query.get_bq_access",
        lambda: BqAccess(BqProjects(billing="", data="")),
        raising=False,
    )
    get_bq_access.cache_clear()

    user_sql = "SELECT * FROM ue"
    rewritten, did_rewrite = _rewrite_user_sql_for_bigquery_query(
        user_sql, seeded_registry,
    )
    assert did_rewrite is False
    assert rewritten == user_sql


# ---------------------------------------------------------------------------
# Backwards-compat: dry-run helper still available + behaves the same
# ---------------------------------------------------------------------------


def test_existing_dry_run_helper_still_callable():
    """The original ``_rewrite_user_sql_for_bq_dry_run`` is now a thin
    wrapper around the shared core rewriter (Pass 1 + Pass 2). Callers
    that pass an explicit ``project`` argument keep working unchanged.
    """
    from app.api.query import _rewrite_user_sql_for_bq_dry_run

    rewritten = _rewrite_user_sql_for_bq_dry_run(
        sql="SELECT * FROM ue",
        name_lookups=[("ue", "fin", "ue")],
        project="some-prj",
    )
    assert "`some-prj.fin.ue`" in rewritten
    # The dry-run helper does NOT add a bigquery_query() wrapper; that's
    # only the new execution-path helper's job.
    assert "bigquery_query(" not in rewritten


# ---------------------------------------------------------------------------
# End-to-end: the /api/query handler must invoke the rewriter and execute
# the rewritten SQL (not the original) when there's a BQ-remote table.
# ---------------------------------------------------------------------------


def _auth(token: str) -> dict:
    return {"Authorization": f"Bearer {token}"}


def _register_bq_remote_row(name: str, bucket: str, source_table: str) -> None:
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository
    sys_conn = get_system_db()
    try:
        TableRegistryRepository(sys_conn).register(
            id=f"bq.{bucket}.{source_table}",
            name=name,
            source_type="bigquery",
            bucket=bucket,
            source_table=source_table,
            query_mode="remote",
        )
    finally:
        sys_conn.close()


@pytest.fixture
def stub_bq_for_endpoint(monkeypatch):
    """Stub _bq_dry_run_bytes + get_bq_access at the endpoint level so the
    cap-guard sees a real-looking BQ project but doesn't issue real RPCs.
    """
    monkeypatch.setattr(
        "app.api.query._bq_dry_run_bytes",
        lambda *a, **k: 1024,  # tiny — pass cap
        raising=False,
    )

    class _FakeProjects:
        data = "test-data-prj"
        billing = "test-billing-prj"

    class _FakeBqAccess:
        projects = _FakeProjects()

    monkeypatch.setattr(
        "app.api.query.get_bq_access",
        lambda: _FakeBqAccess(),
        raising=False,
    )


def test_endpoint_executes_rewritten_sql_against_analytics(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """The /api/query handler must call ``analytics.execute(rewritten_sql)``
    — NOT the user's original SQL — when a BQ-remote table is referenced.
    Capture what reaches DuckDB and assert the bigquery_query() wrap is
    present.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    # Capture analytics.execute calls. The handler does
    # `analytics = get_analytics_db_readonly(); analytics.execute(sql)`,
    # so we patch the connection factory to return a stub.
    captured = {"sql": None}

    class _StubAnalytics:
        description = [("c0",)]
        def execute(self, sql, *args, **kwargs):
            captured["sql"] = sql
            class _R:
                def fetchmany(self, _n):
                    return []
            return _R()
        def close(self):
            pass

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly",
        lambda: _StubAnalytics(),
        raising=False,
    )

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT count(*) FROM ue WHERE country = 'CZ'"},
        headers=_auth(token),
    )
    assert r.status_code == 200, r.json()
    sent = captured["sql"]
    assert sent is not None, "analytics.execute was never called"
    assert "bigquery_query(" in sent, (
        f"endpoint did not wrap user SQL in bigquery_query(); sent: {sent!r}"
    )
    assert "test-data-prj" in sent
    assert "`test-data-prj.fin.ue`" in sent


def test_endpoint_passes_original_sql_when_no_bq_table(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """For queries that don't touch any BQ-remote registered name, the
    handler must pass the original SQL through unchanged — the
    ATTACH-catalog path handles local-source tables natively and any
    rewrite would be wasted work."""
    captured = {"sql": None}

    class _StubAnalytics:
        description = [("c0",)]
        def execute(self, sql, *args, **kwargs):
            captured["sql"] = sql
            class _R:
                def fetchmany(self, _n):
                    return []
            return _R()
        def close(self):
            pass

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly",
        lambda: _StubAnalytics(),
        raising=False,
    )

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    user_sql = "SELECT 1 AS x"
    r = c.post("/api/query", json={"sql": user_sql}, headers=_auth(token))
    assert r.status_code == 200, r.json()
    assert captured["sql"] == user_sql
    assert "bigquery_query(" not in captured["sql"]


def test_endpoint_wraps_rewritten_sql_with_outer_limit(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Memory-safety regression — when the rewriter fires, the handler
    MUST wrap the bigquery_query() call in an outer ``LIMIT N+1`` so a
    `SELECT *` against a billion-row remote table doesn't materialise the
    full result into the worker before fetchmany applies the cap.
    Code-review #2a fix.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    captured = {"sql": None}

    class _StubAnalytics:
        description = [("c0",)]
        def execute(self, sql, *args, **kwargs):
            captured["sql"] = sql
            class _R:
                def fetchmany(self, _n):
                    return []
            return _R()
        def close(self):
            pass

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly",
        lambda: _StubAnalytics(),
        raising=False,
    )

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT * FROM ue", "limit": 100},
        headers=_auth(token),
    )
    assert r.status_code == 200, r.json()
    sent = captured["sql"]
    # The bigquery_query() wrap is present, AND the whole thing is wrapped
    # again with an outer LIMIT that includes the user-requested cap +1
    # (the +1 is the existing truncation-detection pattern).
    assert "bigquery_query(" in sent
    assert "_bqq_outer" in sent
    assert "LIMIT 101" in sent  # request.limit (100) + 1


def test_endpoint_falls_back_to_original_sql_on_bq_parse_error(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """When the rewritten ``bigquery_query()`` path fails with a parse-
    level error (e.g. user SQL contained DuckDB-only syntax that BQ
    can't parse), the handler MUST retry with the original SQL via the
    ATTACH-catalog path so the user request still succeeds. Code-review
    #4 fix.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    calls = {"sqls": []}

    class _StubAnalytics:
        description = [("c0",)]
        def execute(self, sql, *args, **kwargs):
            calls["sqls"].append(sql)
            # First call (rewritten) raises a BQ-style parse error;
            # second call (original SQL fallback) returns rows.
            if "bigquery_query(" in sql:
                raise RuntimeError(
                    "BinderException: Query execution failed: "
                    "Syntax error: Unexpected token at [1:42]"
                )
            class _R:
                def fetchmany(self, _n):
                    return [(1,)]
            return _R()
        def close(self):
            pass

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly",
        lambda: _StubAnalytics(),
        raising=False,
    )

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        # DuckDB-only ::INT cast — survives identifier rewrite, BQ refuses.
        json={"sql": "SELECT (count(*))::INT FROM ue"},
        headers=_auth(token),
    )
    assert r.status_code == 200, r.json()
    # Two execute calls: 1) rewritten (raised) 2) fallback to original.
    assert len(calls["sqls"]) == 2
    assert "bigquery_query(" in calls["sqls"][0]
    assert calls["sqls"][1] == "SELECT (count(*))::INT FROM ue"


def test_endpoint_does_not_fall_back_on_non_parse_errors(
    seeded_app, stub_bq_for_endpoint, monkeypatch,
):
    """Non-parse-error exceptions from the rewritten path (network,
    quota, forbidden, generic runtime) must propagate, NOT silently
    retry against the legacy path. Otherwise the legacy path would
    just fail again and the user sees a slow + double-failure.
    """
    _register_bq_remote_row("ue", "fin", "ue")

    calls = {"sqls": []}

    class _StubAnalytics:
        description = [("c0",)]
        def execute(self, sql, *args, **kwargs):
            calls["sqls"].append(sql)
            raise RuntimeError("Network unreachable: BQ endpoint timed out")
        def close(self):
            pass

    monkeypatch.setattr(
        "app.api.query.get_analytics_db_readonly",
        lambda: _StubAnalytics(),
        raising=False,
    )

    c = seeded_app["client"]
    token = seeded_app["admin_token"]
    r = c.post(
        "/api/query",
        json={"sql": "SELECT count(*) FROM ue"},
        headers=_auth(token),
    )
    # Generic 400 from the handler's outer except — body will surface
    # the runtime error message; we just need to confirm no fallback.
    assert r.status_code in (400, 500, 502)
    assert len(calls["sqls"]) == 1, "must not retry on non-parse error"