"""Tests for the ``internal`` data source. Coverage: - Per-row RBAC: alice sees only alice's rows; admin sees everyone's. - Catalog: internal tables show up in /api/v2/catalog. - Schema: /api/v2/schema/agnes_sessions returns column metadata. - Cross-source rejection: SQL mixing ``agnes_sessions`` with a registered Keboola/BQ table id is rejected before execution. - Username sanitization: an exotic local-part is rejected, not silently scoped to the wrong rows. """ from __future__ import annotations import pytest from connectors.internal.access import ( INTERNAL_TABLES_BY_ID, InternalAccessError, build_filter_clause, execute_internal_query, find_internal_refs, get_schema, is_internal_table, ) from connectors.internal.registry import ensure_internal_tables_registered from src.db import _ensure_schema, _get_state_dir @pytest.fixture def system_db(tmp_path, monkeypatch): """Stand up a fresh system.duckdb with the v43 schema + registered internal rows. ``get_system_db`` is the singleton consumed by the connector, so we redirect AGNES_DATA_DIR to a per-test path before the helper opens the file.""" data_dir = tmp_path / "agnes_data" (data_dir / "state").mkdir(parents=True) # ``src.db._get_data_dir`` reads ``DATA_DIR`` (not ``AGNES_DATA_DIR``). # Setting it before ``get_system_db()`` runs reroutes the singleton to # the per-test tmp_path. monkeypatch.setenv("DATA_DIR", str(data_dir)) monkeypatch.delenv("STATE_DIR", raising=False) # Force close any pre-existing handle held by an earlier test. from src.db import close_system_db close_system_db() from src.db import get_system_db conn = get_system_db() _ensure_schema(conn) ensure_internal_tables_registered(conn) # Seed a couple of canonical rows for the RBAC checks. conn.execute( "INSERT INTO usage_session_summary " "(session_file, session_id, username, user_id, tool_calls, tool_errors, processor_version) VALUES " "('alice/s1.jsonl', 's-a-1', 'alice', 'alice-uuid', 10, 1, 1)" ) conn.execute( "INSERT INTO usage_session_summary " "(session_file, session_id, username, user_id, tool_calls, tool_errors, processor_version) VALUES " "('bob/s2.jsonl', 's-b-1', 'bob', 'bob-uuid', 20, 0, 1)" ) conn.execute( "INSERT INTO audit_log (id, user_id, action, result) VALUES " "('a-1', 'alice-uuid', 'session.transcript_view', 'success')" ) conn.execute( "INSERT INTO audit_log (id, user_id, action, result) VALUES " "('a-2', 'bob-uuid', 'session.transcript_view', 'success')" ) yield conn close_system_db() # --------------------------------------------------------------------------- # Static helpers — no DB needed # --------------------------------------------------------------------------- def test_is_internal_table_recognises_canonical_ids(): assert is_internal_table("agnes_sessions") assert is_internal_table("agnes_telemetry") assert is_internal_table("agnes_audit") assert not is_internal_table("usage_session_summary") # underlying physical assert not is_internal_table("orders_daily") def test_find_internal_refs_word_boundary(): refs = find_internal_refs("SELECT * FROM agnes_sessions WHERE x = 'foo'") assert refs == ["agnes_sessions"] def test_find_internal_refs_multiple_in_declaration_order(): refs = find_internal_refs("SELECT * FROM agnes_audit JOIN agnes_sessions USING (x)") # Order follows INTERNAL_TABLES declaration, not the SQL order. assert refs == ["agnes_sessions", "agnes_audit"] def test_find_internal_refs_ignores_unrelated(): refs = find_internal_refs("SELECT * FROM orders_daily") assert refs == [] def test_filter_clause_admin_is_empty(): table = INTERNAL_TABLES_BY_ID["agnes_sessions"] assert build_filter_clause(table, {"email": "admin@x", "id": "admin-uuid"}, True) == "" def test_filter_clause_non_admin_scopes_to_user_id(): """agnes_sessions filters on user_id with OR username fallback for pre-backfill rows.""" table = INTERNAL_TABLES_BY_ID["agnes_sessions"] clause = build_filter_clause(table, {"email": "alice@example.com", "id": "alice-uuid"}, False) assert clause == "WHERE (user_id = 'alice-uuid' OR username = 'alice')" def test_filter_clause_non_admin_scopes_audit_to_user_id(): table = INTERNAL_TABLES_BY_ID["agnes_audit"] clause = build_filter_clause( table, {"email": "alice@example.com", "id": "alice-uuid"}, False, ) assert clause == "WHERE user_id = 'alice-uuid'" def test_filter_clause_rejects_unsafe_user_id(): table = INTERNAL_TABLES_BY_ID["agnes_sessions"] with pytest.raises(InternalAccessError): build_filter_clause( table, {"email": "alice@example.com", "id": "'; DROP TABLE--"}, False, ) # --------------------------------------------------------------------------- # End-to-end RBAC — runs against a real (fresh) system.duckdb # --------------------------------------------------------------------------- def test_admin_sees_every_user_session(system_db, tmp_path): db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "admin@x", "id": "admin-uuid"}, is_admin=True, sql="SELECT username FROM agnes_sessions ORDER BY username", ) assert [r[0] for r in rows] == ["alice", "bob"] def test_non_admin_sees_only_own_sessions(system_db): db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "alice@example.com", "id": "alice-uuid"}, is_admin=False, sql="SELECT username, session_id FROM agnes_sessions", ) assert rows == [("alice", "s-a-1")] def test_non_admin_sees_sessions_uploaded_via_api(system_db): """Sessions uploaded via POST /api/upload/sessions are stored under user_id (UUID) by the pipeline. The user_id filter covers both ingestion paths (collector and upload API) because the pipeline resolves the stable user_id for every session at processing time.""" from src.db import get_system_db conn = get_system_db() conn.execute( "INSERT INTO usage_session_summary " "(session_file, session_id, username, user_id, tool_calls, tool_errors, processor_version) VALUES " "('550e8400-uuid/api.jsonl', 's-api-1', '550e8400-uuid', '550e8400-uuid', 5, 0, 1)" ) db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "alice@example.com", "id": "550e8400-uuid"}, is_admin=False, sql="SELECT username, session_id FROM agnes_sessions ORDER BY session_id", ) # alice should see both her collector row (user_id='alice-uuid' from # fixture — won't match '550e8400-uuid') and her upload-API row # (user_id='550e8400-uuid'). assert ("550e8400-uuid", "s-api-1") in rows # bob's rows must NOT appear assert all(r[0] != "bob" for r in rows) def test_email_change_does_not_affect_visibility(system_db): """If alice changes her email from alice@example.com to alice.new@example.com, her sessions should still be visible because the filter uses the stable user_id, not the email.""" db_path = str(_get_state_dir() / "system.duckdb") # Query with a different email but the same user_id. _, rows, _ = execute_internal_query( db_path, {"email": "alice.new@example.com", "id": "alice-uuid"}, is_admin=False, sql="SELECT username, session_id FROM agnes_sessions", ) assert rows == [("alice", "s-a-1")] def test_non_admin_sees_only_own_audit_rows(system_db): db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "bob@example.com", "id": "bob-uuid"}, is_admin=False, sql="SELECT user_id, action FROM agnes_audit", ) assert rows == [("bob-uuid", "session.transcript_view")] def test_admin_can_count_per_user_session_views(system_db): """The motivating admin query: who is looking at whose session transcripts?""" db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "admin@x", "id": "admin-uuid"}, is_admin=True, sql=( "SELECT user_id, COUNT(*) AS n FROM agnes_audit " "WHERE action = 'session.transcript_view' GROUP BY user_id " "ORDER BY user_id" ), ) assert rows == [("alice-uuid", 1), ("bob-uuid", 1)] def test_user_sql_inside_cte_wrapper_still_resolves(system_db): """The CTE wrapper that scopes agnes_* aliases must coexist with user-supplied SELECT shape — including basic aggregations + LIMIT.""" db_path = str(_get_state_dir() / "system.duckdb") _, rows, _ = execute_internal_query( db_path, {"email": "alice@example.com", "id": "alice-uuid"}, is_admin=False, sql="SELECT SUM(tool_calls) AS n FROM agnes_sessions", ) assert rows == [(10,)] def test_schema_returns_underlying_columns(system_db): db_path = str(_get_state_dir() / "system.duckdb") cols = get_schema(db_path, "agnes_sessions") names = {c["name"] for c in cols} # Sample of the documented usage_session_summary columns. assert {"session_file", "session_id", "username", "tool_calls"} <= names def test_get_schema_unknown_table_returns_empty(): assert get_schema("/nonexistent/path", "not_a_real_internal_table") == [] def test_execute_rejects_sql_without_internal_refs(): with pytest.raises(InternalAccessError): execute_internal_query( "/tmp/dummy", {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql="SELECT 1", ) # --------------------------------------------------------------------------- # FastAPI-level wiring tests — catch the two arity bugs that the unit # tests on `execute_internal_query` / `build_filter_clause` missed, # because those test the connector primitives directly and skip the # request handler layer where `is_user_admin(user)` was mis-called. # --------------------------------------------------------------------------- def _seed_internal_via_api(): """``seeded_app`` bypasses the FastAPI lifespan, so the registry seed that puts agnes_sessions / agnes_telemetry / agnes_audit into ``table_registry`` doesn't run. Register them inline so the API routes (`/api/query`, `/api/v2/sample`) can find them.""" from src.db import get_system_db from connectors.internal.registry import ensure_internal_tables_registered ensure_internal_tables_registered(get_system_db()) def test_query_internal_via_api_admin_path(seeded_app, admin_user): """POST /api/query with SQL referencing agnes_sessions returns rows for an admin caller. Catches the regression where app/api/query.py:_run_internal_query called is_user_admin(user) with a dict instead of (user_id, conn) — request blew up with TypeError. """ _seed_internal_via_api() from src.db import get_system_db conn = get_system_db() conn.execute( "INSERT INTO usage_session_summary " "(session_file, session_id, username, processor_version) VALUES " "('alice/api-1.jsonl', 's-api-1', 'alice', 1) " "ON CONFLICT (session_file) DO NOTHING" ) resp = seeded_app["client"].post( "/api/query", json={"sql": "SELECT COUNT(*) AS n FROM agnes_sessions"}, headers=admin_user, ) assert resp.status_code == 200, resp.text body = resp.json() assert body["row_count"] == 1 # Admin sees all rows seeded earlier + this fixture's row. assert int(body["rows"][0][0]) >= 1 def test_sample_internal_via_api_admin_path(seeded_app, admin_user): """GET /api/v2/sample/agnes_sessions returns rows for an admin caller. Catches the regression where v2_sample.py called is_user_admin(user) with a dict instead of (user_id, conn) — request blew up with 500. """ _seed_internal_via_api() resp = seeded_app["client"].get( "/api/v2/sample/agnes_sessions?n=3", headers=admin_user, ) assert resp.status_code == 200, resp.text body = resp.json() assert body["table_id"] == "agnes_sessions" assert body["source"] == "internal" assert isinstance(body["rows"], list) # --------------------------------------------------------------------------- # Negative tests — non-admin must not bypass row filter by referencing # the underlying physical tables or by smuggling the alias into a string # literal to enter the privileged code path. Both vectors flagged in the # round-2 review (PR #278 R2 finding #1). # --------------------------------------------------------------------------- def test_string_literal_alone_does_not_route_internal(system_db): """A SQL with `agnes_sessions` only inside a string literal should not be treated as referencing the internal table — find_internal_refs strips literals first.""" refs = find_internal_refs("SELECT 'agnes_sessions' AS x") assert refs == [] def test_non_admin_cannot_reference_underlying_physical_table(system_db): """If a non-admin's SQL touches an agnes_* alias AND a base table (usage_session_summary etc.) directly, the request is rejected before execution — the CTE wrapper would otherwise only scope the alias, leaking every user's rows.""" db_path = str(_get_state_dir() / "system.duckdb") payloads = ( "SELECT * FROM usage_session_summary WHERE 'agnes_sessions'='agnes_sessions'", "SELECT * FROM agnes_sessions UNION ALL SELECT * FROM audit_log LIMIT 1", # CTE shadow attempt — still has to touch the base table to do anything. "WITH agnes_sessions AS (SELECT * FROM usage_events) SELECT * FROM agnes_sessions", ) for sql in payloads: with pytest.raises(InternalAccessError): execute_internal_query( db_path, {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql=sql, ) def test_admin_unaffected_by_underlying_table_guard(system_db): """Admins can reference underlying physical tables — they're already god-mode through the filter clause, and they have legitimate reasons to read raw rows.""" db_path = str(_get_state_dir() / "system.duckdb") # Admin must still reference at least one agnes_* alias to enter the # internal-query path (the routing rule unchanged), but the base # table guard doesn't apply. _, rows, _ = execute_internal_query( db_path, {"email": "admin@x", "id": "admin-uuid"}, is_admin=True, sql="SELECT (SELECT COUNT(*) FROM usage_session_summary) AS n FROM agnes_sessions LIMIT 1", ) assert rows # admin can join base table inside the wrapper def test_non_admin_cannot_reference_users_table(system_db): """R3 finding: non-admin must not reach any non-agnes_* table in system.duckdb (users / personal_access_tokens / resource_grants / saved views / etc.). The dynamic denylist from information_schema covers all of them; this test pins one critical case (`users`).""" db_path = str(_get_state_dir() / "system.duckdb") with pytest.raises(InternalAccessError): execute_internal_query( db_path, {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql="SELECT * FROM agnes_sessions UNION ALL SELECT email, id, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM users LIMIT 1", ) def test_non_admin_cannot_reference_pat_table(system_db): """PAT table is one of the most sensitive — explicit pin.""" db_path = str(_get_state_dir() / "system.duckdb") with pytest.raises(InternalAccessError): execute_internal_query( db_path, {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql="SELECT * FROM agnes_audit; SELECT * FROM personal_access_tokens", ) def test_non_admin_block_survives_block_comment(system_db): """`/* */` comment-wrapped table name should still be caught — the pre-pass strips comments before the identifier scan.""" db_path = str(_get_state_dir() / "system.duckdb") with pytest.raises(InternalAccessError): execute_internal_query( db_path, {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql="SELECT * FROM agnes_sessions /**/ JOIN /**/ users /**/ ON 1=1", ) def test_non_admin_block_survives_line_comment(system_db): """`--` line-comment shouldn't hide the table name from the scan.""" db_path = str(_get_state_dir() / "system.duckdb") with pytest.raises(InternalAccessError): execute_internal_query( db_path, {"email": "alice@x", "id": "alice-uuid"}, is_admin=False, sql=( "SELECT * FROM agnes_sessions\n" "-- this is a comment but we still touch the table below\n" "UNION ALL SELECT email, id, NULL, NULL, NULL, NULL, NULL, NULL, " "NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM users LIMIT 1" ), )