From 5458ccc41b467b927a3788ec360c183438c12cb5 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com> Date: Tue, 12 May 2026 20:32:29 +0200 Subject: [PATCH] hygiene: BQ error hint dispatch + catalog ENTITY column (#274) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two analyst-UX papercuts surfaced by the v0.53.4 onboarding smoke test. 1) /api/query remote_estimate_failed hint now branches on the BigQuery error class instead of always claiming a column doesn't exist. The previous hardcoded "Most often this means a column referenced … doesn't exist" misled analysts whenever BigQuery actually rejected on syntax — concretely, `SELECT COUNT(*) AS rows FROM …` fails with `Syntax error: Unexpected keyword ROWS at [1:20]` (`rows` is a BQ reserved word) and the hint pointed at non-existent columns. New _hint_for_bq_bad_request() helper dispatches: - "Syntax error" / "Unexpected keyword" → reserved-keyword alias hint with `AS row_count` workaround - "Unrecognized name" / "not found inside" → `agnes schema ` - "Table not found" → `agnes catalog` - fallback → enumerate all three 4 unit tests in TestHintForBqBadRequest pin each branch. Existing guardrail tests (test_fallback_fails_fast_on_pure_duckdb_syntax, test_remote_estimate_failed_surfaces_first_error_when_attempts_differ) continue to pass — both hint substrings they assert on still appear in the relevant branches. 2) `agnes catalog` replaces the FLAVOR column with ENTITY. FLAVOR rendered t['sql_flavor'] which duplicated SOURCE for any catalog dominated by one source type — analysts saw `SOURCE=bigquery FLAVOR=bigquery` on every row. ENTITY instead surfaces the upstream BigQuery entity_type (BASE TABLE / VIEW / MATERIALIZED_VIEW) for remote rows; non-remote rows render `-`. The distinction matters operationally: views don't support predicate pushdown, so `agnes query --remote` against a view trips the cost guardrail where the same query against a BASE TABLE pushes down cleanly. The entity_type field has been in the v2 catalog response since 0.51.0; this PR just stops hiding it behind a column header that conveyed no information. JSON output (`agnes catalog --json`) is unchanged — only the human- readable column changed. No DB migration; no API change. Verified: 4161 tests pass locally; 25 in test_api_query_guardrail.py green; the 4 new TestHintForBqBadRequest cases pin each branch. --- CHANGELOG.md | 8 +++++ app/api/query.py | 60 ++++++++++++++++++++++++++----- cli/commands/catalog.py | 13 +++++-- tests/test_api_query_guardrail.py | 51 ++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b926e6..17edf92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,14 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +### Changed + +- **`agnes catalog` replaces the `FLAVOR` column with `ENTITY`.** The old `FLAVOR` column rendered `t['sql_flavor']` (`bigquery`/`duckdb`) which duplicated `SOURCE` for any catalog dominated by one source type — analysts saw `SOURCE=bigquery FLAVOR=bigquery` on every row and the column carried zero information. `ENTITY` instead renders the upstream BigQuery `entity_type` (`BASE TABLE` / `VIEW` / `MATERIALIZED_VIEW`) for remote rows, surfacing the distinction that actually changes how the analyst should query: views don't support predicate pushdown, so `agnes query --remote` against a view trips the cost guardrail where the same query against a BASE TABLE pushes down cleanly. Non-remote rows (`local`/`materialized`) render `-` since the distinction doesn't apply. JSON output (`agnes catalog --json`) is unchanged — `entity_type` was already in the v2 catalog response since 0.51.0; only the human-readable column changed. + +### Fixed + +- **`/api/query` `remote_estimate_failed` hint now branches on the BigQuery error class** instead of always claiming a column doesn't exist. The previous hardcoded "Most often this means a column referenced … doesn't exist" misled analysts whenever BigQuery actually rejected on syntax (e.g. `SELECT COUNT(*) AS rows` — `rows` is reserved, BQ returns `Syntax error: Unexpected keyword ROWS at [1:20]`, the previous hint pointed at non-existent columns). Branching: syntax errors get a hint about reserved-keyword aliases (with the `AS \`rows\`` / `AS row_count` workaround); `Unrecognized name` / `not found inside` still points at `agnes schema `; `Table not found` points at `agnes catalog`; the fallback hint enumerates all three causes for the analyst to triage. + ## [0.53.4] — 2026-05-12 ### Fixed diff --git a/app/api/query.py b/app/api/query.py index c8a17c6..b4695c5 100644 --- a/app/api/query.py +++ b/app/api/query.py @@ -75,6 +75,47 @@ def _looks_like_bq_rewrite_parse_error(exc: BaseException) -> bool: msg = str(exc) return any(pat in msg for pat in _BQ_REWRITE_PARSE_ERROR_PATTERNS) + +def _hint_for_bq_bad_request(message: str) -> str: + """Pick the most useful one-line hint for a BigQuery `bad_request` + error message. The default "column doesn't exist" hint is correct + for ~half of BQ rejections (`Unrecognized name: foo`, + `Field foo not found in record`) but actively misleading when BQ + actually rejected on syntax (`Syntax error: Unexpected keyword + ROWS at [1:20]` — reserved-keyword alias without quoting, + extremely common because `rows` / `range` / `groups` / `window` + are all reserved). Branch on the BQ message to pick the right hint + rather than always blaming columns.""" + msg = message.lower() + if "unexpected keyword" in msg or "syntax error" in msg: + return ( + "BigQuery rejected this on SQL syntax. Most often this is a " + "reserved-keyword identifier used unquoted — e.g. " + "`SELECT COUNT(*) AS rows` fails because `rows` is reserved. " + "Backtick the alias (`AS \\`rows\\``) or rename it " + "(`AS row_count`). For other syntax errors, see the " + "`underlying` field below — it carries BigQuery's own " + "diagnostic with the error position." + ) + if "unrecognized name" in msg or "not found inside" in msg or "field name" in msg: + return ( + "BigQuery rejected this because a column referenced in " + "WHERE/SELECT/etc doesn't exist on the table. Verify with " + "`agnes schema `." + ) + if "table not found" in msg or "not found:" in msg: + return ( + "BigQuery rejected this because the table reference doesn't " + "exist. Use a registered table id from `agnes catalog`, or " + "write a full backtick path like `` `..` ``." + ) + return ( + "BigQuery rejected this query during cost estimation. See the " + "`underlying` field for BigQuery's own diagnostic; common causes " + "are missing columns (verify with `agnes schema `), " + "reserved-keyword aliases, or unregistered table paths." + ) + # Issue #160 §4.3.1 — direct `bq..` references in user # SQL. Catalog token accepts both `bq` (the unquoted DuckDB-style name) and # `"bq"` (quoted identifier). DuckDB resolves both to the same ATTACHed @@ -1127,14 +1168,17 @@ def _bq_quota_and_cap_guard( "BigQuery rejected this query during cost " "estimation." ), - "hint": ( - "Most often this means a column referenced " - "in WHERE/SELECT/etc doesn't exist on the " - "table — verify with `agnes schema `. " - "Otherwise: use a registered table name from " - "`agnes catalog`, or write BQ-native SQL " - "with full backtick paths." - ), + # Branch the hint on the actual BQ error class — + # syntax errors (e.g. reserved-keyword aliases like + # `AS rows`) deserve a different pointer than + # column-not-found, which deserves a different one + # than table-not-found. Pre-#NNN this was a single + # hardcoded "column referenced doesn't exist" hint + # that misled analysts whenever BQ actually rejected + # on syntax. The first attempt's diagnostic + # (rewritten SQL — has the real BQ position info) + # is the more informative one to dispatch on. + "hint": _hint_for_bq_bad_request(exc.message), # Surface the FIRST attempt's diagnostic (rewritten # SQL — has the real "Unrecognized name" / syntax # info). Second attempt for catalog-id-only SQL diff --git a/cli/commands/catalog.py b/cli/commands/catalog.py index 37fbc6f..11deb16 100644 --- a/cli/commands/catalog.py +++ b/cli/commands/catalog.py @@ -51,12 +51,19 @@ def catalog( if json: typer.echo(json_lib.dumps(data, indent=2)) return - # Human-readable table - typer.echo(f"{'ID':30s} {'SOURCE':10s} {'MODE':8s} {'FLAVOR':10s} NAME") + # Human-readable table. + # ENTITY column shows the upstream entity_type for remote BigQuery rows + # (BASE TABLE / VIEW / MATERIALIZED_VIEW) — matters because views don't + # support predicate pushdown, so an analyst should reach for `agnes + # snapshot create` rather than `agnes query --remote` on a view. + # For non-remote rows (local / materialized) entity_type is NULL upstream + # and we render a dash — those modes don't have an analogous distinction. + typer.echo(f"{'ID':30s} {'SOURCE':10s} {'MODE':8s} {'ENTITY':18s} NAME") for t in data.get("tables", []): + entity = t.get("entity_type") or "-" typer.echo( f"{t['id']:30s} {t['source_type']:10s} {t['query_mode']:8s} " - f"{t['sql_flavor']:10s} {t.get('name', '')}" + f"{entity:18s} {t.get('name', '')}" ) diff --git a/tests/test_api_query_guardrail.py b/tests/test_api_query_guardrail.py index d5c6101..3361d9c 100644 --- a/tests/test_api_query_guardrail.py +++ b/tests/test_api_query_guardrail.py @@ -831,3 +831,54 @@ def test_full_backtick_path_inside_string_literal_not_gated( "bq_path_not_registered", "bq_path_cross_project", ), detail + + +class TestHintForBqBadRequest: + """The `remote_estimate_failed` hint must branch on the BQ error class. + + Pre-#NNN every BQ rejection got the same "column referenced doesn't + exist" hint, which actively misled analysts whenever BQ actually + rejected on syntax (e.g. `SELECT COUNT(*) AS rows` — `rows` is + reserved). The dispatch helper picks the most useful one-line hint + based on what BigQuery actually said. + """ + + def test_syntax_error_hint_calls_out_reserved_keyword_alias(self): + from app.api.query import _hint_for_bq_bad_request + + # Sub-agent-reported actual case from the v0.53.4 smoke test + hint = _hint_for_bq_bad_request( + "Syntax error: Unexpected keyword ROWS at [1:20]" + ) + assert "syntax" in hint.lower() + assert "reserved" in hint.lower() or "rows" in hint.lower() + # Must NOT lead with the misleading column-not-found hint + assert "column referenced" not in hint.lower() + + def test_unrecognized_name_hint_points_at_agnes_schema(self): + from app.api.query import _hint_for_bq_bad_request + + hint = _hint_for_bq_bad_request( + "Unrecognized name: authorize_date at [1:88]" + ) + assert "agnes schema" in hint.lower() + assert "doesn't exist" in hint.lower() or "column" in hint.lower() + + def test_table_not_found_hint_points_at_agnes_catalog(self): + from app.api.query import _hint_for_bq_bad_request + + hint = _hint_for_bq_bad_request( + "Table not found: my-project.dataset.tbl" + ) + assert "agnes catalog" in hint.lower() + + def test_unknown_error_falls_back_to_generic_hint(self): + from app.api.query import _hint_for_bq_bad_request + + hint = _hint_for_bq_bad_request( + "Some unfamiliar BigQuery diagnostic we don't classify yet" + ) + # Generic hint mentions all three common causes so the analyst + # has somewhere to start + assert "schema" in hint.lower() + assert "underlying" in hint.lower()