"""Per-request scoped query execution for the ``internal`` data source. Architecture: open ``system.duckdb`` read-only, build a temporary view per referenced internal table with RBAC applied as a WHERE clause, execute the user's SQL inside that scope, return rows. RBAC model: - Admin (per ``is_user_admin``) sees the table unscoped — no WHERE clause. - Everyone else gets a single-row-filter projection. The filter column is hard-coded per table (``INTERNAL_TABLES``) and the filter value comes from the auth-resolved user object — never from user-supplied SQL. SQL-injection considerations: - The temp-view DDL interpolates a literal string for the filter value. - ``username`` for ``usage_*`` is the local-part of an email; we enforce the same regex used by the session-file path (alnums + ``._-``) before interpolation. - ``user_id`` for ``audit_log`` is a UUID; we validate the regex before interpolation. - The user's SELECT itself is gated by the same SELECT-only validator the ``/api/query`` endpoint uses (denylist of write/DDL keywords + file functions). That validator runs in the API layer. """ from __future__ import annotations import logging import re from dataclasses import dataclass from typing import Any logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Internal-table registry — single source of truth # --------------------------------------------------------------------------- @dataclass(frozen=True) class InternalTable: """One internal table mapping. Fields: registry_id: the value used in ``table_registry.id`` and what analysts type in SQL (``SELECT * FROM ``) source_table: the underlying physical table in ``system.duckdb`` filter_column: the column on ``source_table`` carrying the per-row owner. Used for the non-admin scoping clause. filter_kind: how to resolve the filter value from the auth user dict — ``'username'`` (email local-part) or ``'user_id'`` (UUID). display_name: human-readable name (also goes into ``table_registry.name``) description: short blurb (catalog UI + ``agnes catalog`` output) """ registry_id: str source_table: str filter_column: str filter_kind: str # 'username' | 'user_id' display_name: str description: str legacy_username_column: str | None = None # backward-compat OR fallback INTERNAL_TABLES: tuple[InternalTable, ...] = ( InternalTable( registry_id="agnes_sessions", source_table="usage_session_summary", filter_column="user_id", filter_kind="user_id", display_name="Agnes sessions", description="Claude Code sessions. Also available locally for analysis.", legacy_username_column="username", ), InternalTable( registry_id="agnes_telemetry", source_table="usage_events", filter_column="user_id", filter_kind="user_id", display_name="Agnes telemetry events", description="Tool and skill invocations from Claude Code. Also available locally for analysis.", legacy_username_column="username", ), InternalTable( registry_id="agnes_audit", source_table="audit_log", filter_column="user_id", filter_kind="user_id", display_name="Agnes audit log", description="Server-side actions performed against Agnes. Also available locally for analysis.", ), ) INTERNAL_TABLES_BY_ID: dict[str, InternalTable] = {t.registry_id: t for t in INTERNAL_TABLES} def is_internal_table(table_id: str) -> bool: return table_id in INTERNAL_TABLES_BY_ID # --------------------------------------------------------------------------- # RBAC filter resolution # --------------------------------------------------------------------------- # `+` allowed in both regexes — RFC 5321 local-parts (e.g. alice+test@x) # resolve to filesystem usernames with a `+`, and the session-data-dir # layout already supports the same character class. _USERNAME_RE = re.compile(r"^[A-Za-z0-9._+-]{1,200}$") _USER_ID_RE = re.compile(r"^[A-Za-z0-9._@:+-]{1,200}$") class InternalAccessError(Exception): """Raised when the caller cannot be safely resolved to a filter value or when an internal table is misconfigured.""" def _filter_value(user: dict[str, Any], kind: str) -> str: """Derive the per-row filter value from the authenticated user. For ``username`` we mirror the session-data-dir convention: the local-part of the email. This is the same value the UsageProcessor writes into ``usage_events.username`` / ``usage_session_summary``. For ``user_id`` we use the ``users.id`` UUID directly — same value audit_log writes carry. """ if kind == "username": email = (user or {}).get("email", "") or "" username = email.split("@")[0] if "@" in email else email if not _USERNAME_RE.match(username): raise InternalAccessError(f"user email {email!r} does not yield a safe username for scoping") return username if kind == "user_id": uid = (user or {}).get("id", "") or "" if not _USER_ID_RE.match(uid): raise InternalAccessError(f"user_id {uid!r} fails the safe-identifier check") return uid raise InternalAccessError(f"unknown filter_kind: {kind!r}") def build_filter_clause(table: InternalTable, user: dict[str, Any], is_admin: bool) -> str: """Return the WHERE clause for one internal table. Admins get an empty string (unscoped view). Everyone else get ``WHERE = ''`` where value has been regex-validated. ``agnes_sessions`` and ``agnes_telemetry`` filter primarily on ``user_id`` (stable UUID) but include an OR fallback on ``username`` (email local-part) for rows that pre-date the v45 backfill. Once all rows carry a non-NULL ``user_id`` the ``legacy_username_column`` field can be removed. """ if is_admin: return "" value = _filter_value(user, table.filter_kind) safe = value.replace("'", "''") if table.legacy_username_column: legacy = _filter_value(user, "username").replace("'", "''") return f"WHERE ({table.filter_column} = '{safe}' OR {table.legacy_username_column} = '{legacy}')" return f"WHERE {table.filter_column} = '{safe}'" # --------------------------------------------------------------------------- # Query execution # --------------------------------------------------------------------------- _TABLE_REF_RE = re.compile( r"\b(" + "|".join(re.escape(t.registry_id) for t in INTERNAL_TABLES) + r")\b", re.IGNORECASE, ) # Single-quoted SQL string literals (with `''` escape handling). Stripped # before reference detection so a non-admin can't trigger the internal # privileged code path by smuggling the alias inside a literal. _SQL_STRING_LITERAL_RE = re.compile(r"'(?:''|[^'])*'") # SQL comments — block `/* … */` and `--` line forms. Stripped so a # comment-wrapped table name (`/**/users/**/`) can't slip past the # identifier scan downstream. _SQL_BLOCK_COMMENT_RE = re.compile(r"/\*[\s\S]*?\*/") _SQL_LINE_COMMENT_RE = re.compile(r"--[^\n]*") def _strip_sql_noise(sql: str) -> str: """Strip string literals + block + line comments so the identifier scanners that follow see only structural SQL. Order matters: strip literals first (they can contain comment-looking text), then comments (which can contain literal-looking text). String content is replaced with empty `''` to keep token spacing intact.""" s = _SQL_STRING_LITERAL_RE.sub("''", sql) s = _SQL_BLOCK_COMMENT_RE.sub(" ", s) s = _SQL_LINE_COMMENT_RE.sub(" ", s) return s _INTERNAL_ALIAS_NAMES: frozenset[str] = frozenset(t.registry_id.lower() for t in INTERNAL_TABLES) def _sensitive_table_reference(stripped_sql: str, conn) -> str | None: """Return the first non-allowlisted system.duckdb table name that appears in ``stripped_sql``, or None if clean. Allowlist = the registered ``agnes_*`` internal-table IDs. The denylist is derived dynamically from ``information_schema.tables`` in the system.duckdb main schema, so adding a new sensitive table in a future migration is automatically covered without re-editing this module. ``stripped_sql`` MUST already have string literals and comments stripped (see ``_strip_sql_noise``). Identifier scan is case-insensitive word-boundary; schema-prefixed (`main.users`) and double-quoted (`"users"`) forms both match because the bare name still sits between word boundaries. """ rows = conn.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'").fetchall() for (name,) in rows: if name is None: continue if name.lower() in _INTERNAL_ALIAS_NAMES: continue if re.search(rf"\b{re.escape(name)}\b", stripped_sql, re.IGNORECASE): return name return None def find_internal_refs(sql: str) -> list[str]: """Word-boundary scan of `sql` for the registered internal table IDs. Returns the matched IDs (lowercase, deduped) in declaration order. String literals AND comments are stripped first so a literal / commented mention of `agnes_sessions` doesn't route the request into the privileged internal-query path (review #278 R2 / R3). """ stripped = _strip_sql_noise(sql) found = {m.group(1).lower() for m in _TABLE_REF_RE.finditer(stripped)} # Preserve declaration order so reasoning about the resulting set is stable. return [t.registry_id for t in INTERNAL_TABLES if t.registry_id.lower() in found] def execute_internal_query( system_db_path: str, user: dict[str, Any], is_admin: bool, sql: str, limit: int = 1000, ) -> tuple[list[str], list[tuple], bool]: """Run a SELECT against per-request-scoped internal views. Approach: wrap the user SQL in a CTE prefix that defines one ``agnes_*`` alias per referenced internal table, each scoped to the caller's rows (admin → unscoped). We then ``SELECT * FROM ()`` inside the wrapper. DuckDB resolves the CTE aliases when it parses the user_sql; the caller never sees the real ``usage_*`` tables. Why a CTE wrapper instead of TEMP VIEW or ATTACH: - ``duckdb.connect(path, read_only=True)`` from a fresh handle is rejected when the app's main connection already holds ``system.duckdb`` open RW: "Can't open a connection to same database file with a different configuration than existing connections". - ``ATTACH '' AS sys (READ_ONLY)`` from a :memory: handle is rejected with "Binder Error: Unique file handle conflict — the database file is already attached by …": DuckDB serialises file handles process-wide, so two connections can't reach the same ``.duckdb`` file even at different attach points. - ``TEMP VIEW`` on the shared singleton connection bleeds across concurrent requests (TEMP VIEWS are connection-scoped, and the request-handler pool reuses the same handle). CTE wrap leaves no residual state on the connection and isolates naturally per request. The SQL stays in SELECT space; existing keyword-denylist + sanitised-username defenses still apply. """ refs = find_internal_refs(sql) if not refs: raise InternalAccessError("no internal-table references in SQL") # Lazy import to avoid a hard cycle (src.db imports go via repositories # which then end up importing access in some test paths). from src.db import get_system_db # Non-admins are NOT allowed to reference any system.duckdb table # outside the registered agnes_* aliases. The CTE wrapper only # scopes those aliases; a direct FROM on the base table # (`usage_session_summary`, `audit_log`, `users`, # `personal_access_tokens`, etc.) would bypass row-level RBAC and # leak other users' data. Denylist is derived dynamically from # `information_schema.tables` — every table in system.duckdb that # is NOT one of the agnes_* aliases is sensitive. This is # future-proof: new tables added by later migrations are # automatically covered without re-editing this module. # # Admin path is unaffected — admins have legitimate need to read # raw rows, and the filter clause is empty for them anyway. if not is_admin: stripped = _strip_sql_noise(sql) sensitive = _sensitive_table_reference(stripped, get_system_db()) if sensitive is not None: raise InternalAccessError( f"non-admin SQL cannot reference table {sensitive!r}; query one of the agnes_* aliases instead" ) cte_parts = [] for table_id in refs: table = INTERNAL_TABLES_BY_ID[table_id] where_clause = build_filter_clause(table, user, is_admin) cte_parts.append(f"{table.registry_id} AS (SELECT * FROM {table.source_table} {where_clause})") cte_prefix = "WITH " + ", ".join(cte_parts) wrapped = f"{cte_prefix} SELECT * FROM ({sql}) AS _agnes_user_query" conn = get_system_db() cursor = conn.cursor() try: rows = cursor.execute(wrapped).fetchmany(limit + 1) cols = [d[0] for d in cursor.description] if cursor.description else [] truncated = len(rows) > limit return cols, rows[:limit], truncated finally: try: cursor.close() except Exception: logger.exception("close() failed on internal-query cursor") # --------------------------------------------------------------------------- # Schema introspection — feeds /api/v2/schema/{id} for internal tables # --------------------------------------------------------------------------- def get_schema(system_db_path: str, table_id: str) -> list[dict]: """Return the underlying physical schema for an internal table. Used by ``/api/v2/schema/`` so ``agnes schema `` works against internal sources. Reuses the shared ``system.duckdb`` connection — same rationale as ``execute_internal_query``: opening a parallel handle to the same file is process-wide blocked. The information_schema query is read-only and small. ``system_db_path`` is kept in the signature for API symmetry with the earlier draft, but is unused — the singleton handle already knows the path. """ if table_id not in INTERNAL_TABLES_BY_ID: return [] table = INTERNAL_TABLES_BY_ID[table_id] from src.db import get_system_db cursor = get_system_db().cursor() try: rows = cursor.execute( "SELECT column_name, data_type, is_nullable " "FROM information_schema.columns " "WHERE table_name = ? ORDER BY ordinal_position", [table.source_table], ).fetchall() return [{"name": r[0], "type": r[1], "nullable": r[2] == "YES"} for r in rows] finally: try: cursor.close() except Exception: pass