feat(query-api): better error message when --remote query references a materialized-but-not-rebuilt id
E2E sub-agent finding: `da query --remote "SELECT * FROM <id>"` against a
materialized table that hasn't yet been rebuilt in the server's
analytics.duckdb returns a confusing DuckDB "Table does not exist"
message even though the table is in the registry. Materialized rows
produce parquets at `${DATA_DIR}/extracts/<source>/data/<id>.parquet`,
but the orchestrator's master-view creation is `_meta`-driven — fresh
instances or pre-tick states have the registry row without a
corresponding view, so analysts hit the bare "does not exist" with no
path forward.
Improve the error rendering in `app/api/query.py:execute_query`. When
DuckDB raises a "table does not exist" error, scan the registry for any
`query_mode='materialized'` row whose id or name appears in the failed
SQL. On a hit, return a 400 whose detail names the table, explains the
materialize state, and offers two concrete next steps:
1. Run `da sync` (or wait for the scheduler tick / hit
POST /api/sync/trigger) to materialize the parquet, OR
2. Query the source directly via the catalog alias when the registry row
carries bucket+source_table (e.g. `bq."dataset"."table"` for BigQuery,
`kbc."bucket"."table"` for Keboola).
Detection is bounded — the registry round-trip only fires when DuckDB's
error mentions a missing table, so happy-path queries pay no cost.
Non-materialized unknowns fall through to DuckDB's raw error.
2 new tests: materialized id surfaces the hint with the bucket+source_table
payload; unknown table falls back to the generic error path with no false
positive on the new hint.
This commit is contained in:
parent
8030a867ec
commit
dc03837a7b
3 changed files with 176 additions and 2 deletions
13
CHANGELOG.md
13
CHANGELOG.md
|
|
@ -46,7 +46,18 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
||||||
primary source and pointing at `/admin/server-config` for enabling a
|
primary source and pointing at `/admin/server-config` for enabling a
|
||||||
secondary source. `jira` / `local` are exempt — they don't sit under
|
secondary source. `jira` / `local` are exempt — they don't sit under
|
||||||
`data_source.*`. Omitted source_type still tolerated for legacy CLI
|
`data_source.*`. Omitted source_type still tolerated for legacy CLI
|
||||||
callers.
|
callers. Stays permissive when primary is `'local'` (bootstrap workflow
|
||||||
|
— instance not yet pointed at a real source).
|
||||||
|
- **query API**: `POST /api/query` now returns a materialize-aware error
|
||||||
|
when the failed SQL references a table id that's registered with
|
||||||
|
`query_mode='materialized'` but doesn't yet exist as a master view in
|
||||||
|
this instance's `analytics.duckdb` (e.g. fresh instance, no scheduler
|
||||||
|
tick yet). The hint names the table, points at `da sync` /
|
||||||
|
`POST /api/sync/trigger`, and — when the registry row carries a
|
||||||
|
bucket+source_table — surfaces the equivalent direct-source query
|
||||||
|
(`bq."dataset"."table"` or `kbc."bucket"."table"`) so the operator has a
|
||||||
|
concrete next step. Falls back to DuckDB's raw error for non-materialized
|
||||||
|
unknowns.
|
||||||
|
|
||||||
## [0.30.0] — 2026-05-01
|
## [0.30.0] — 2026-05-01
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
@ -11,6 +12,7 @@ import duckdb
|
||||||
from app.auth.dependencies import get_current_user, _get_db
|
from app.auth.dependencies import get_current_user, _get_db
|
||||||
from src.db import get_analytics_db_readonly
|
from src.db import get_analytics_db_readonly
|
||||||
from src.rbac import get_accessible_tables
|
from src.rbac import get_accessible_tables
|
||||||
|
from src.repositories.table_registry import TableRegistryRepository
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/query", tags=["query"])
|
router = APIRouter(prefix="/api/query", tags=["query"])
|
||||||
|
|
||||||
|
|
@ -107,6 +109,92 @@ async def execute_query(
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=f"Query error: {str(e)}")
|
# If DuckDB raised "Table … does not exist" for a referenced name,
|
||||||
|
# check whether that name belongs to a registry row in
|
||||||
|
# `query_mode='materialized'` that hasn't yet been materialized in
|
||||||
|
# this instance's analytics.duckdb. Materialized rows produce a
|
||||||
|
# parquet at `${DATA_DIR}/extracts/<source>/data/<id>.parquet` but
|
||||||
|
# the orchestrator is `_meta`-driven and only creates master views
|
||||||
|
# for connectors that emit `_meta` rows — so on a fresh instance
|
||||||
|
# (or before the first scheduler tick) the master view doesn't
|
||||||
|
# exist yet and the operator gets a confusing "table does not
|
||||||
|
# exist" with no path forward. Surface a materialize-aware hint
|
||||||
|
# instead of DuckDB's bare error.
|
||||||
|
msg = str(e)
|
||||||
|
helpful = _materialized_hint_for_query_error(conn, request.sql, msg)
|
||||||
|
if helpful:
|
||||||
|
raise HTTPException(status_code=400, detail=helpful)
|
||||||
|
raise HTTPException(status_code=400, detail=f"Query error: {msg}")
|
||||||
finally:
|
finally:
|
||||||
analytics.close()
|
analytics.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _materialized_hint_for_query_error(
|
||||||
|
conn: duckdb.DuckDBPyConnection, sql: str, error_msg: str,
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Return a materialize-aware error message if the failed query
|
||||||
|
references a registry row whose `query_mode='materialized'` and which
|
||||||
|
has no master view in analytics.duckdb yet, OR ``None`` to fall back
|
||||||
|
to DuckDB's raw error.
|
||||||
|
|
||||||
|
The detection scans each materialized row's id/name against the SQL
|
||||||
|
text; a hit means the operator picked a name that exists in the
|
||||||
|
registry but isn't queryable in this instance. The hint is the same
|
||||||
|
in both arms of the OR — it tells them what the table needs and what
|
||||||
|
they can do today (`da sync` or query `bq."dataset"."table"`
|
||||||
|
directly using the bucket/source_table from the registry row).
|
||||||
|
"""
|
||||||
|
# Cheap fast-path — only inspect the registry when DuckDB's error
|
||||||
|
# actually mentions a missing table. Avoids registry round-trip on
|
||||||
|
# every parse/cast/permission failure.
|
||||||
|
el = error_msg.lower()
|
||||||
|
if "does not exist" not in el and "table with name" not in el:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
repo = TableRegistryRepository(conn)
|
||||||
|
rows = repo.list_all()
|
||||||
|
except Exception:
|
||||||
|
# Registry read failed for whatever reason — don't compound the
|
||||||
|
# error response by hiding the original DuckDB message.
|
||||||
|
return None
|
||||||
|
sql_l = sql.lower()
|
||||||
|
for r in rows:
|
||||||
|
if (r.get("query_mode") or "") != "materialized":
|
||||||
|
continue
|
||||||
|
# Match by id or by name; either could appear in the SQL.
|
||||||
|
candidates = {r.get("id"), r.get("name")}
|
||||||
|
for cand in candidates:
|
||||||
|
if not cand:
|
||||||
|
continue
|
||||||
|
cand_l = str(cand).lower()
|
||||||
|
# Word-boundary-ish check — `\b` doesn't match `.` so
|
||||||
|
# `bq.dataset.cand` would still hit, which is fine for the
|
||||||
|
# hint path (the operator is referring to the same table).
|
||||||
|
if re.search(r"\b" + re.escape(cand_l) + r"\b", sql_l):
|
||||||
|
return _build_materialized_hint(r)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _build_materialized_hint(row: dict) -> str:
|
||||||
|
"""Format the user-facing hint for a materialized row that's not yet
|
||||||
|
queryable. Includes the table id, the bucket/source_table when the
|
||||||
|
row carries them, and concrete operator next steps."""
|
||||||
|
tid = row.get("id") or row.get("name") or "<unknown>"
|
||||||
|
bucket = row.get("bucket")
|
||||||
|
source_table = row.get("source_table")
|
||||||
|
direct_hint = ""
|
||||||
|
if bucket and source_table:
|
||||||
|
# BigQuery: `bq."dataset"."table"`; Keboola: `kbc."bucket"."table"`.
|
||||||
|
# Pick the alias by source_type so the hint is copy-pasteable.
|
||||||
|
alias = "bq" if (row.get("source_type") or "") == "bigquery" else "kbc"
|
||||||
|
direct_hint = (
|
||||||
|
f' or query the source directly via {alias}."{bucket}".'
|
||||||
|
f'"{source_table}"'
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
f"Table {tid!r} is registered as query_mode='materialized' but is "
|
||||||
|
f"not yet materialized in this instance's analytics views. Run "
|
||||||
|
f"`da sync` (or wait for the scheduler tick / hit POST "
|
||||||
|
f"/api/sync/trigger) to materialize the parquet"
|
||||||
|
f"{direct_hint}."
|
||||||
|
)
|
||||||
|
|
|
||||||
75
tests/test_query_materialized_error_message.py
Normal file
75
tests/test_query_materialized_error_message.py
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
"""POST /api/query for a table id that's registered as
|
||||||
|
`query_mode='materialized'` but isn't yet a view in `analytics.duckdb`
|
||||||
|
returns a helpful, materialize-aware error instead of a raw "Table does
|
||||||
|
not exist" string from DuckDB.
|
||||||
|
|
||||||
|
E2E sub-agent finding 2026-05-01: `da query --remote "SELECT * FROM
|
||||||
|
e2e2_synced_table LIMIT 5"` on a synced materialized table failed with
|
||||||
|
DuckDB's bare error message even though the table is in the registry.
|
||||||
|
The fix improves the surfaced message so the operator sees the
|
||||||
|
materialize-mode hint without having to decode DuckDB internals.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.repositories.table_registry import TableRegistryRepository
|
||||||
|
|
||||||
|
|
||||||
|
def _auth(token: str) -> dict:
|
||||||
|
return {"Authorization": f"Bearer {token}"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_materialized_id_not_in_views_returns_helpful_message(seeded_app):
|
||||||
|
"""An admin querying a materialized id that isn't yet materialized in
|
||||||
|
the local analytics.duckdb gets a 400 whose detail names the
|
||||||
|
query_mode and points at `da sync` / direct-BQ-query."""
|
||||||
|
from src.db import get_system_db
|
||||||
|
sys_conn = get_system_db()
|
||||||
|
try:
|
||||||
|
TableRegistryRepository(sys_conn).register(
|
||||||
|
id="not_yet_materialized",
|
||||||
|
name="not_yet_materialized",
|
||||||
|
source_type="bigquery",
|
||||||
|
query_mode="materialized",
|
||||||
|
source_query='SELECT 1 FROM bq."ds"."t"',
|
||||||
|
bucket="ds",
|
||||||
|
source_table="t",
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
sys_conn.close()
|
||||||
|
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={"sql": "SELECT * FROM not_yet_materialized LIMIT 5"},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
assert r.status_code == 400, r.json()
|
||||||
|
detail = str(r.json().get("detail", ""))
|
||||||
|
# Message should name the table and surface the materialize-mode hint.
|
||||||
|
assert "not_yet_materialized" in detail
|
||||||
|
assert "materialized" in detail.lower()
|
||||||
|
# Either a `da sync` hint or a direct-BQ-query hint must appear so the
|
||||||
|
# operator has a concrete next step.
|
||||||
|
assert "da sync" in detail or "bq." in detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_unknown_table_falls_back_to_default_error(seeded_app):
|
||||||
|
"""Sanity: a query for a table that isn't even in the registry still
|
||||||
|
surfaces DuckDB's error verbatim (no false positive on the new hint
|
||||||
|
path). RBAC's 403 path takes precedence for non-admin callers; for
|
||||||
|
admins (no RBAC filter) the table simply doesn't exist as a view, and
|
||||||
|
the query falls through to DuckDB's "does not exist" message."""
|
||||||
|
c = seeded_app["client"]
|
||||||
|
token = seeded_app["admin_token"]
|
||||||
|
r = c.post(
|
||||||
|
"/api/query",
|
||||||
|
json={"sql": "SELECT * FROM totally_unknown_table"},
|
||||||
|
headers=_auth(token),
|
||||||
|
)
|
||||||
|
assert r.status_code == 400, r.json()
|
||||||
|
detail = str(r.json().get("detail", "")).lower()
|
||||||
|
# Falls back to the generic query-error path; no materialized hint.
|
||||||
|
assert "materialized" not in detail
|
||||||
Loading…
Reference in a new issue