diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ece257..661c43d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,18 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C primary source and pointing at `/admin/server-config` for enabling a secondary source. `jira` / `local` are exempt — they don't sit under `data_source.*`. Omitted source_type still tolerated for legacy CLI - callers. + callers. Stays permissive when primary is `'local'` (bootstrap workflow + — instance not yet pointed at a real source). +- **query API**: `POST /api/query` now returns a materialize-aware error + when the failed SQL references a table id that's registered with + `query_mode='materialized'` but doesn't yet exist as a master view in + this instance's `analytics.duckdb` (e.g. fresh instance, no scheduler + tick yet). The hint names the table, points at `da sync` / + `POST /api/sync/trigger`, and — when the registry row carries a + bucket+source_table — surfaces the equivalent direct-source query + (`bq."dataset"."table"` or `kbc."bucket"."table"`) so the operator has a + concrete next step. Falls back to DuckDB's raw error for non-materialized + unknowns. ## [0.30.0] — 2026-05-01 diff --git a/app/api/query.py b/app/api/query.py index 4cdf5d7..4312407 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -3,6 +3,7 @@ import os import re from pathlib import Path +from typing import Optional from fastapi import APIRouter, Depends, HTTPException from pydantic import BaseModel @@ -11,6 +12,7 @@ import duckdb from app.auth.dependencies import get_current_user, _get_db from src.db import get_analytics_db_readonly from src.rbac import get_accessible_tables +from src.repositories.table_registry import TableRegistryRepository router = APIRouter(prefix="/api/query", tags=["query"]) @@ -107,6 +109,92 @@ async def execute_query( except HTTPException: raise except Exception as e: - raise HTTPException(status_code=400, detail=f"Query error: {str(e)}") + # If DuckDB raised "Table … does not exist" for a referenced name, + # check whether that name belongs to a registry row in + # `query_mode='materialized'` that hasn't yet been materialized in + # this instance's analytics.duckdb. Materialized rows produce a + # parquet at `${DATA_DIR}/extracts//data/.parquet` but + # the orchestrator is `_meta`-driven and only creates master views + # for connectors that emit `_meta` rows — so on a fresh instance + # (or before the first scheduler tick) the master view doesn't + # exist yet and the operator gets a confusing "table does not + # exist" with no path forward. Surface a materialize-aware hint + # instead of DuckDB's bare error. + msg = str(e) + helpful = _materialized_hint_for_query_error(conn, request.sql, msg) + if helpful: + raise HTTPException(status_code=400, detail=helpful) + raise HTTPException(status_code=400, detail=f"Query error: {msg}") finally: analytics.close() + + +def _materialized_hint_for_query_error( + conn: duckdb.DuckDBPyConnection, sql: str, error_msg: str, +) -> Optional[str]: + """Return a materialize-aware error message if the failed query + references a registry row whose `query_mode='materialized'` and which + has no master view in analytics.duckdb yet, OR ``None`` to fall back + to DuckDB's raw error. + + The detection scans each materialized row's id/name against the SQL + text; a hit means the operator picked a name that exists in the + registry but isn't queryable in this instance. The hint is the same + in both arms of the OR — it tells them what the table needs and what + they can do today (`da sync` or query `bq."dataset"."table"` + directly using the bucket/source_table from the registry row). + """ + # Cheap fast-path — only inspect the registry when DuckDB's error + # actually mentions a missing table. Avoids registry round-trip on + # every parse/cast/permission failure. + el = error_msg.lower() + if "does not exist" not in el and "table with name" not in el: + return None + try: + repo = TableRegistryRepository(conn) + rows = repo.list_all() + except Exception: + # Registry read failed for whatever reason — don't compound the + # error response by hiding the original DuckDB message. + return None + sql_l = sql.lower() + for r in rows: + if (r.get("query_mode") or "") != "materialized": + continue + # Match by id or by name; either could appear in the SQL. + candidates = {r.get("id"), r.get("name")} + for cand in candidates: + if not cand: + continue + cand_l = str(cand).lower() + # Word-boundary-ish check — `\b` doesn't match `.` so + # `bq.dataset.cand` would still hit, which is fine for the + # hint path (the operator is referring to the same table). + if re.search(r"\b" + re.escape(cand_l) + r"\b", sql_l): + return _build_materialized_hint(r) + return None + + +def _build_materialized_hint(row: dict) -> str: + """Format the user-facing hint for a materialized row that's not yet + queryable. Includes the table id, the bucket/source_table when the + row carries them, and concrete operator next steps.""" + tid = row.get("id") or row.get("name") or "" + bucket = row.get("bucket") + source_table = row.get("source_table") + direct_hint = "" + if bucket and source_table: + # BigQuery: `bq."dataset"."table"`; Keboola: `kbc."bucket"."table"`. + # Pick the alias by source_type so the hint is copy-pasteable. + alias = "bq" if (row.get("source_type") or "") == "bigquery" else "kbc" + direct_hint = ( + f' or query the source directly via {alias}."{bucket}".' + f'"{source_table}"' + ) + return ( + f"Table {tid!r} is registered as query_mode='materialized' but is " + f"not yet materialized in this instance's analytics views. Run " + f"`da sync` (or wait for the scheduler tick / hit POST " + f"/api/sync/trigger) to materialize the parquet" + f"{direct_hint}." + ) diff --git a/tests/test_query_materialized_error_message.py b/tests/test_query_materialized_error_message.py new file mode 100644 index 0000000..016ab3a --- /dev/null +++ b/tests/test_query_materialized_error_message.py @@ -0,0 +1,75 @@ +"""POST /api/query for a table id that's registered as +`query_mode='materialized'` but isn't yet a view in `analytics.duckdb` +returns a helpful, materialize-aware error instead of a raw "Table does +not exist" string from DuckDB. + +E2E sub-agent finding 2026-05-01: `da query --remote "SELECT * FROM +e2e2_synced_table LIMIT 5"` on a synced materialized table failed with +DuckDB's bare error message even though the table is in the registry. +The fix improves the surfaced message so the operator sees the +materialize-mode hint without having to decode DuckDB internals. +""" +from __future__ import annotations + +import pytest + +from src.repositories.table_registry import TableRegistryRepository + + +def _auth(token: str) -> dict: + return {"Authorization": f"Bearer {token}"} + + +def test_query_materialized_id_not_in_views_returns_helpful_message(seeded_app): + """An admin querying a materialized id that isn't yet materialized in + the local analytics.duckdb gets a 400 whose detail names the + query_mode and points at `da sync` / direct-BQ-query.""" + from src.db import get_system_db + sys_conn = get_system_db() + try: + TableRegistryRepository(sys_conn).register( + id="not_yet_materialized", + name="not_yet_materialized", + source_type="bigquery", + query_mode="materialized", + source_query='SELECT 1 FROM bq."ds"."t"', + bucket="ds", + source_table="t", + ) + finally: + sys_conn.close() + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={"sql": "SELECT * FROM not_yet_materialized LIMIT 5"}, + headers=_auth(token), + ) + assert r.status_code == 400, r.json() + detail = str(r.json().get("detail", "")) + # Message should name the table and surface the materialize-mode hint. + assert "not_yet_materialized" in detail + assert "materialized" in detail.lower() + # Either a `da sync` hint or a direct-BQ-query hint must appear so the + # operator has a concrete next step. + assert "da sync" in detail or "bq." in detail + + +def test_query_unknown_table_falls_back_to_default_error(seeded_app): + """Sanity: a query for a table that isn't even in the registry still + surfaces DuckDB's error verbatim (no false positive on the new hint + path). RBAC's 403 path takes precedence for non-admin callers; for + admins (no RBAC filter) the table simply doesn't exist as a view, and + the query falls through to DuckDB's "does not exist" message.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/query", + json={"sql": "SELECT * FROM totally_unknown_table"}, + headers=_auth(token), + ) + assert r.status_code == 400, r.json() + detail = str(r.json().get("detail", "")).lower() + # Falls back to the generic query-error path; no materialized hint. + assert "materialized" not in detail