feat(query-api): better error message when --remote query references a materialized-but-not-rebuilt id

E2E sub-agent finding: `da query --remote "SELECT * FROM <id>"` against a
materialized table that hasn't yet been rebuilt in the server's
analytics.duckdb returns a confusing DuckDB "Table does not exist"
message even though the table is in the registry. Materialized rows
produce parquets at `${DATA_DIR}/extracts/<source>/data/<id>.parquet`,
but the orchestrator's master-view creation is `_meta`-driven — fresh
instances or pre-tick states have the registry row without a
corresponding view, so analysts hit the bare "does not exist" with no
path forward.

Improve the error rendering in `app/api/query.py:execute_query`. When
DuckDB raises a "table does not exist" error, scan the registry for any
`query_mode='materialized'` row whose id or name appears in the failed
SQL. On a hit, return a 400 whose detail names the table, explains the
materialize state, and offers two concrete next steps:

1. Run `da sync` (or wait for the scheduler tick / hit
   POST /api/sync/trigger) to materialize the parquet, OR
2. Query the source directly via the catalog alias when the registry row
   carries bucket+source_table (e.g. `bq."dataset"."table"` for BigQuery,
   `kbc."bucket"."table"` for Keboola).

Detection is bounded — the registry round-trip only fires when DuckDB's
error mentions a missing table, so happy-path queries pay no cost.
Non-materialized unknowns fall through to DuckDB's raw error.

2 new tests: materialized id surfaces the hint with the bucket+source_table
payload; unknown table falls back to the generic error path with no false
positive on the new hint.
This commit is contained in:
ZdenekSrotyr 2026-05-01 23:09:52 +02:00
parent 8030a867ec
commit dc03837a7b
3 changed files with 176 additions and 2 deletions

View file

@ -46,7 +46,18 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
primary source and pointing at `/admin/server-config` for enabling a
secondary source. `jira` / `local` are exempt — they don't sit under
`data_source.*`. Omitted source_type still tolerated for legacy CLI
callers.
callers. Stays permissive when primary is `'local'` (bootstrap workflow
— instance not yet pointed at a real source).
- **query API**: `POST /api/query` now returns a materialize-aware error
when the failed SQL references a table id that's registered with
`query_mode='materialized'` but doesn't yet exist as a master view in
this instance's `analytics.duckdb` (e.g. fresh instance, no scheduler
tick yet). The hint names the table, points at `da sync` /
`POST /api/sync/trigger`, and — when the registry row carries a
bucket+source_table — surfaces the equivalent direct-source query
(`bq."dataset"."table"` or `kbc."bucket"."table"`) so the operator has a
concrete next step. Falls back to DuckDB's raw error for non-materialized
unknowns.
## [0.30.0] — 2026-05-01

View file

@ -3,6 +3,7 @@
import os
import re
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
@ -11,6 +12,7 @@ import duckdb
from app.auth.dependencies import get_current_user, _get_db
from src.db import get_analytics_db_readonly
from src.rbac import get_accessible_tables
from src.repositories.table_registry import TableRegistryRepository
router = APIRouter(prefix="/api/query", tags=["query"])
@ -107,6 +109,92 @@ async def execute_query(
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=400, detail=f"Query error: {str(e)}")
# If DuckDB raised "Table … does not exist" for a referenced name,
# check whether that name belongs to a registry row in
# `query_mode='materialized'` that hasn't yet been materialized in
# this instance's analytics.duckdb. Materialized rows produce a
# parquet at `${DATA_DIR}/extracts/<source>/data/<id>.parquet` but
# the orchestrator is `_meta`-driven and only creates master views
# for connectors that emit `_meta` rows — so on a fresh instance
# (or before the first scheduler tick) the master view doesn't
# exist yet and the operator gets a confusing "table does not
# exist" with no path forward. Surface a materialize-aware hint
# instead of DuckDB's bare error.
msg = str(e)
helpful = _materialized_hint_for_query_error(conn, request.sql, msg)
if helpful:
raise HTTPException(status_code=400, detail=helpful)
raise HTTPException(status_code=400, detail=f"Query error: {msg}")
finally:
analytics.close()
def _materialized_hint_for_query_error(
conn: duckdb.DuckDBPyConnection, sql: str, error_msg: str,
) -> Optional[str]:
"""Return a materialize-aware error message if the failed query
references a registry row whose `query_mode='materialized'` and which
has no master view in analytics.duckdb yet, OR ``None`` to fall back
to DuckDB's raw error.
The detection scans each materialized row's id/name against the SQL
text; a hit means the operator picked a name that exists in the
registry but isn't queryable in this instance. The hint is the same
in both arms of the OR it tells them what the table needs and what
they can do today (`da sync` or query `bq."dataset"."table"`
directly using the bucket/source_table from the registry row).
"""
# Cheap fast-path — only inspect the registry when DuckDB's error
# actually mentions a missing table. Avoids registry round-trip on
# every parse/cast/permission failure.
el = error_msg.lower()
if "does not exist" not in el and "table with name" not in el:
return None
try:
repo = TableRegistryRepository(conn)
rows = repo.list_all()
except Exception:
# Registry read failed for whatever reason — don't compound the
# error response by hiding the original DuckDB message.
return None
sql_l = sql.lower()
for r in rows:
if (r.get("query_mode") or "") != "materialized":
continue
# Match by id or by name; either could appear in the SQL.
candidates = {r.get("id"), r.get("name")}
for cand in candidates:
if not cand:
continue
cand_l = str(cand).lower()
# Word-boundary-ish check — `\b` doesn't match `.` so
# `bq.dataset.cand` would still hit, which is fine for the
# hint path (the operator is referring to the same table).
if re.search(r"\b" + re.escape(cand_l) + r"\b", sql_l):
return _build_materialized_hint(r)
return None
def _build_materialized_hint(row: dict) -> str:
"""Format the user-facing hint for a materialized row that's not yet
queryable. Includes the table id, the bucket/source_table when the
row carries them, and concrete operator next steps."""
tid = row.get("id") or row.get("name") or "<unknown>"
bucket = row.get("bucket")
source_table = row.get("source_table")
direct_hint = ""
if bucket and source_table:
# BigQuery: `bq."dataset"."table"`; Keboola: `kbc."bucket"."table"`.
# Pick the alias by source_type so the hint is copy-pasteable.
alias = "bq" if (row.get("source_type") or "") == "bigquery" else "kbc"
direct_hint = (
f' or query the source directly via {alias}."{bucket}".'
f'"{source_table}"'
)
return (
f"Table {tid!r} is registered as query_mode='materialized' but is "
f"not yet materialized in this instance's analytics views. Run "
f"`da sync` (or wait for the scheduler tick / hit POST "
f"/api/sync/trigger) to materialize the parquet"
f"{direct_hint}."
)

View file

@ -0,0 +1,75 @@
"""POST /api/query for a table id that's registered as
`query_mode='materialized'` but isn't yet a view in `analytics.duckdb`
returns a helpful, materialize-aware error instead of a raw "Table does
not exist" string from DuckDB.
E2E sub-agent finding 2026-05-01: `da query --remote "SELECT * FROM
e2e2_synced_table LIMIT 5"` on a synced materialized table failed with
DuckDB's bare error message even though the table is in the registry.
The fix improves the surfaced message so the operator sees the
materialize-mode hint without having to decode DuckDB internals.
"""
from __future__ import annotations
import pytest
from src.repositories.table_registry import TableRegistryRepository
def _auth(token: str) -> dict:
return {"Authorization": f"Bearer {token}"}
def test_query_materialized_id_not_in_views_returns_helpful_message(seeded_app):
"""An admin querying a materialized id that isn't yet materialized in
the local analytics.duckdb gets a 400 whose detail names the
query_mode and points at `da sync` / direct-BQ-query."""
from src.db import get_system_db
sys_conn = get_system_db()
try:
TableRegistryRepository(sys_conn).register(
id="not_yet_materialized",
name="not_yet_materialized",
source_type="bigquery",
query_mode="materialized",
source_query='SELECT 1 FROM bq."ds"."t"',
bucket="ds",
source_table="t",
)
finally:
sys_conn.close()
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={"sql": "SELECT * FROM not_yet_materialized LIMIT 5"},
headers=_auth(token),
)
assert r.status_code == 400, r.json()
detail = str(r.json().get("detail", ""))
# Message should name the table and surface the materialize-mode hint.
assert "not_yet_materialized" in detail
assert "materialized" in detail.lower()
# Either a `da sync` hint or a direct-BQ-query hint must appear so the
# operator has a concrete next step.
assert "da sync" in detail or "bq." in detail
def test_query_unknown_table_falls_back_to_default_error(seeded_app):
"""Sanity: a query for a table that isn't even in the registry still
surfaces DuckDB's error verbatim (no false positive on the new hint
path). RBAC's 403 path takes precedence for non-admin callers; for
admins (no RBAC filter) the table simply doesn't exist as a view, and
the query falls through to DuckDB's "does not exist" message."""
c = seeded_app["client"]
token = seeded_app["admin_token"]
r = c.post(
"/api/query",
json={"sql": "SELECT * FROM totally_unknown_table"},
headers=_auth(token),
)
assert r.status_code == 400, r.json()
detail = str(r.json().get("detail", "")).lower()
# Falls back to the generic query-error path; no materialized hint.
assert "materialized" not in detail