From 12db59127b0a3b96ecd5b760af4dd125fe281fc8 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com> Date: Tue, 12 May 2026 16:28:41 +0200 Subject: [PATCH] =?UTF-8?q?release:=200.53.0=20=E2=80=94=20close=20Tier=20?= =?UTF-8?q?B=20trackers=20(#259-#261)=20+=20admin=20UI=20fix=20(#265)=20(#?= =?UTF-8?q?267)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * release: 0.53.0 — Tier B trackers + admin UI bugfix Closes #259 (init resume sentinel), #260 (startup parquet-lock sweep), #261 (materialized schema uses local parquet, not BQ), #265 (admin tables apostrophe → HTML-entity escape). Tracker notes: #262 closed as obsolete (pre-empted by 0.51.0 changes), #266 left open pending UX clarification. * fix(init): move resume sentinel from .agnes/ to .claude/ The clean-install integration test (test_clean_install_integration.py) forbids creating .agnes/ in the workspace root via its forbidden_unconditional list — that path is reserved for ~/.agnes/ in the user's HOME (marketplace clone, CA bundle). .claude/ is already created by agnes init for settings.json + hooks, so dropping init-complete next to those keeps the resume sentinel consistent with the rest of Claude Code's workspace surface and lets the clean-install assertions pass. Issue #259. * docs(changelog): point #259 entry at new .claude/init-complete path Follows the sentinel move from .agnes/ → .claude/ to keep the changelog in sync with what 0.53.0 actually ships. --- CHANGELOG.md | 20 +++++++ app/api/v2_schema.py | 10 +++- app/main.py | 12 ++++ app/web/templates/admin_tables.html | 30 +++++++++- cli/commands/init.py | 55 +++++++++++++++++-- connectors/bigquery/extractor.py | 61 +++++++++++++++++++++ pyproject.toml | 2 +- tests/test_init_resume_sentinel.py | 46 ++++++++++++++++ tests/test_parquet_lock_sweep.py | 61 +++++++++++++++++++++ tests/test_v2_schema_materialized_local.py | 64 ++++++++++++++++++++++ 10 files changed, 351 insertions(+), 10 deletions(-) create mode 100644 tests/test_init_resume_sentinel.py create mode 100644 tests/test_parquet_lock_sweep.py create mode 100644 tests/test_v2_schema_materialized_local.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b49cc3c..2ba1c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,26 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.53.0] — 2026-05-12 + +Second hygiene round closing the Tier B trackers opened during the +0.51.0 retro plus one new admin UI bug. `agnes init` resumes after a +kill (#259), schema endpoint stops calling BigQuery for materialized +tables (#261), admin tables UI no longer breaks on apostrophes (#265), +stale parquet locks get swept at startup (#260). + +### Fixed + +- **`agnes init` resumes after an interrupted run, no `--force` required** (#259). Pre-0.53 a killed `agnes init` (SIGKILL from a runtime watchdog, network drop, operator Ctrl-C) left `CLAUDE.md` on disk; the next attempt errored with `partial_state` and `--force` then re-downloaded the full materialized parquet from scratch. Init now writes a completion sentinel at `.claude/init-complete` (next to the workspace's `settings.json` + hooks; `.claude/` already gets created by init for those, so the sentinel reuses existing surface and stays out of `.agnes/` which is reserved for `~/.agnes/` user-HOME content) at the end of the flow. The early-out gate distinguishes "fully initialized" (`CLAUDE.md` + sentinel both present → still `partial_state`) from "previous run was interrupted" (`CLAUDE.md` present but sentinel missing → resume silently, log a one-line notice). +- **Materialized BQ tables read schema from the local parquet, not from BigQuery** (#261). `app/api/v2_schema.build_schema_uncached` dispatched on `source_type` alone and always reached for `INFORMATION_SCHEMA.COLUMNS` when `source_type='bigquery'` — including for `query_mode='materialized'` rows whose actual data is sitting next to the dataset as a parquet. The 0.51.0 perf tests measured this as a 4–5× cold-start anomaly (4.6 s vs 1.0 s for a remote VIEW); root cause is the wasted BQ round-trip. Branch now uses the local-parquet path for ANY `query_mode='materialized'` row. +- **Apostrophe in `table_registry.description` no longer breaks every Edit / Delete button on `/admin/tables`** (#265). The row-rendering JS wrapped the per-row payload in a single-quoted HTML `onclick` attribute and escaped apostrophes with a JS-style backslash (`\'`). HTML attribute values don't recognize backslash escapes — the first real `'` in the description terminated the attribute, the rest of the HTML was malformed, and the onclick handlers on every subsequent row silently failed to attach. New `escapeHtmlAttr` helper does proper HTML-entity escaping (`'` for `'`, plus `"`, `<`, `>`, `&`); applied to all three onclick callsites in the row template. Also addresses the implicit XSS-adjacent risk of admin-controlled text in an HTML attribute. +- **Stale `*.parquet.lock` files swept on app startup** (#260). The acquire path already reclaims locks older than `materialize.lock_ttl_seconds` (default 24 h) lazily on the next materialize attempt, but lock files left behind by a SIGKILL'd materialize would sit next to parquets for days waiting for the next sync. New `connectors.bigquery.extractor.sweep_stale_parquet_locks(data_root)` walks every `*.parquet.lock` under the extracts tree at app boot and unlinks the stale ones. Failures are logged at WARNING, not raised. Wired into the FastAPI startup hook. + +### Tracker-only (still open, no code in this release) + +- **#262** closed as obsolete — Caddy `file_server` + persistent catalog cache already address the user-facing impact this issue was originally written about. +- **#266** admin tables Edit dialog dataset field "disabled for materialized" — actual behavior is `display:none` (hidden when sync mode is custom-SQL); not the same as "disabled". UX clarification not in scope for this release. + ## [0.52.0] — 2026-05-12 UX + hygiene round following the 0.51.0 catalog-hang fix. Five small, diff --git a/app/api/v2_schema.py b/app/api/v2_schema.py index 1a35529..a4025ec 100644 --- a/app/api/v2_schema.py +++ b/app/api/v2_schema.py @@ -136,7 +136,15 @@ def build_schema_uncached( raise NotFound(table_id) source_type = row.get("source_type") or "" - if source_type == "bigquery": + query_mode = row.get("query_mode") or "" + # Issue #261: a `source_type='bigquery'` row with `query_mode='materialized'` + # has the data on local disk as a parquet — same shape as Keboola local + # tables. Hitting BigQuery INFORMATION_SCHEMA on every schema call was + # the root cause of the materialized-schema cold-start anomaly observed + # in the 0.51.0 perf tests (4.6 s vs 1.0 s for remote VIEW). Use the + # local-parquet branch for any materialized source regardless of + # `source_type` — the parquet is the source of truth. + if source_type == "bigquery" and query_mode != "materialized": dataset = row.get("bucket") or "" source_table = row.get("source_table") or table_id columns = _fetch_bq_schema(bq, dataset, source_table) diff --git a/app/main.py b/app/main.py index 0ef02b2..7938182 100644 --- a/app/main.py +++ b/app/main.py @@ -155,6 +155,18 @@ async def lifespan(app): from app.api.cache_warmup import maybe_schedule_startup_warmup maybe_schedule_startup_warmup() + # Sweep stale materialize parquet locks left behind by previous runs + # that were SIGKILL'd mid-materialize. Lazy reclaim at next acquire + # already handles correctness, but an active sweep at startup keeps + # the data directory tidy and gives operators a clear "swept N" log + # line instead of zombie 0-byte files lingering for days (issue #260). + try: + from connectors.bigquery.extractor import sweep_stale_parquet_locks + from src.db import _get_data_dir as _ddir + sweep_stale_parquet_locks(_ddir() / "extracts") + except Exception: + logger.exception("startup parquet-lock sweep failed (non-fatal)") + # Construct the PostHog client up front so its background flush thread # starts before the first request — and so a missing/invalid key fails # loud at boot rather than on first capture. No-op when disabled. diff --git a/app/web/templates/admin_tables.html b/app/web/templates/admin_tables.html index f47c23d..0e5350b 100644 --- a/app/web/templates/admin_tables.html +++ b/app/web/templates/admin_tables.html @@ -1816,6 +1816,22 @@ return div.innerHTML; } + /** + * Escape a string for safe inclusion inside a single- OR double-quoted + * HTML attribute. Unlike `escapeHtml` (which goes through textContent → + * innerHTML and only escapes `<`/`>`/`&`), this also escapes both quote + * characters so the value can't break out of the attribute. Issue #265. + */ + function escapeHtmlAttr(str) { + if (str === null || str === undefined) return ''; + return String(str) + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + } + // Defensive normalization for descriptions registered via shell-quoting // tooling that injected literal backslash escapes (e.g. `Don\'t`, `\n`). // Mirrors _unescape_shell_quoting in app/api/admin.py — applied at render @@ -3168,14 +3184,22 @@ html += ''; html += '
'; - html += ''; // Manage access: deep-link to /admin/access pre-filtered to this table. - html += ''; - html += ''; html += '
'; diff --git a/cli/commands/init.py b/cli/commands/init.py index 873ebcf..f333933 100644 --- a/cli/commands/init.py +++ b/cli/commands/init.py @@ -60,6 +60,17 @@ from cli.lib.pull import PullResult, _override_server_env, run_pull # template can override this via the `--force` flag. _INIT_MARKER = "AI Data Analyst" +# Sentinel written at the very END of a successful `agnes init`. Existence +# of CLAUDE.md alone is NOT a "workspace is initialized" signal because +# CLAUDE.md is written early in the flow — long before the parquet pull, +# the AGNES_WORKSPACE.md render, and the final summary. Killed runs +# (SIGKILL from the harness, network drop mid-pull, operator Ctrl-C) +# leave CLAUDE.md on disk but not this sentinel. The next `agnes init` +# can then resume without requiring `--force`, which would otherwise +# force a full re-download of any large materialized parquet that was +# 80 % complete. Issue #259. +_INIT_COMPLETE_FILE = ".claude/init-complete" + # Env vars that, when set to a non-existent path, cause every TLS handshake # on the host to fail before Agnes itself runs. Past versions of the Agnes @@ -201,17 +212,32 @@ def init( # Step 1: detect an existing workspace. # ------------------------------------------------------------------ claude_md = workspace / "CLAUDE.md" + init_complete = workspace / _INIT_COMPLETE_FILE if claude_md.exists() and not force: try: existing = claude_md.read_text(encoding="utf-8") except OSError: existing = "" if _INIT_MARKER in existing: - typer.echo(render_error(0, {"detail": { - "kind": "partial_state", - "hint": "Workspace already initialized. Re-run with --force to redo.", - }}), err=True) - raise typer.Exit(1) + # Distinguish "fully initialized" from "previous attempt was + # killed mid-flight": only block when the completion sentinel + # is there. Issue #259 — pre-0.53 every interrupted init left + # CLAUDE.md behind and the next `agnes init` errored with + # `partial_state`, forcing `--force` + full re-download of any + # large materialized parquet. + if init_complete.exists(): + typer.echo(render_error(0, {"detail": { + "kind": "partial_state", + "hint": "Workspace already initialized. Re-run with --force to redo.", + }}), err=True) + raise typer.Exit(1) + else: + typer.echo( + "Previous init was interrupted (no completion sentinel " + "found). Resuming — partial downloads will continue where " + "they stopped.", + err=True, + ) # On --force, snapshot the existing CLAUDE.md before regenerating it # so an operator who edited it can recover their notes (issue #164). @@ -381,6 +407,25 @@ def init( ) (workspace / "AGNES_WORKSPACE.md").write_text(workspace_md, encoding="utf-8") + # ------------------------------------------------------------------ + # Step 9: write the completion sentinel. The next `agnes init` (no + # flags) checks this; absence means a previous attempt was killed + # mid-flight and we should resume rather than refuse. Issue #259. + # ------------------------------------------------------------------ + sentinel = workspace / _INIT_COMPLETE_FILE + sentinel.parent.mkdir(parents=True, exist_ok=True) + try: + import importlib.metadata as _md + agnes_version = _md.version("agnes-the-ai-analyst") + except Exception: + agnes_version = "unknown" + sentinel.write_text( + f"completed_at: {datetime.now(timezone.utc).isoformat()}\n" + f"agnes_version: {agnes_version}\n" + f"server_url: {server_url}\n", + encoding="utf-8", + ) + # ------------------------------------------------------------------ # Final: human-readable summary. # ------------------------------------------------------------------ diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py index 0baafda..96d4e60 100644 --- a/connectors/bigquery/extractor.py +++ b/connectors/bigquery/extractor.py @@ -99,6 +99,67 @@ def _get_lock_ttl_seconds() -> int: return _LOCK_TTL_DEFAULT_SECONDS +def sweep_stale_parquet_locks(data_root: Path | str) -> int: + """Walk every ``*.parquet.lock`` under ``data_root`` and unlink the + ones older than the configured TTL. Returns the count reclaimed. + + Called once at app startup so the dev VM doesn't accumulate 0-byte + zombie lock files (issue #260 — observed lock from a SIGKILL'd + materialize run sitting for 6 days). The acquire path already + reclaims stale locks lazily on the next attempt, but startup sweep + keeps `/data/extracts/*/data/` tidy regardless of whether the next + materialize is scheduled soon. + + Failures (permission, vanished mid-stat, weird FS) are logged at + WARNING and counted toward "errors"; the sweep doesn't raise. + """ + root = Path(data_root) + if not root.exists(): + return 0 + ttl = _get_lock_ttl_seconds() + now = time.time() + reclaimed = 0 + errors = 0 + # Search both ``/data/*.lock`` (Keboola/BQ layout) and + # ``/*/data/*.lock`` (multi-source layout under /data/extracts). + candidates: list[Path] = [] + try: + candidates.extend(root.rglob("*.parquet.lock")) + except Exception as e: + logger.warning("sweep_stale_parquet_locks: rglob failed at %s: %s", root, e) + return 0 + for lock_path in candidates: + try: + age = now - lock_path.stat().st_mtime + except FileNotFoundError: + continue + except Exception as e: + logger.warning("sweep_stale_parquet_locks: stat failed on %s: %s", lock_path, e) + errors += 1 + continue + if age <= ttl: + continue + try: + lock_path.unlink() + logger.info( + "Swept stale materialize lock at %s (age %.0fs > TTL %ds)", + lock_path, age, ttl, + ) + reclaimed += 1 + except FileNotFoundError: + # Concurrent reclaim won — fine. + continue + except Exception as e: + logger.warning("sweep_stale_parquet_locks: unlink failed on %s: %s", lock_path, e) + errors += 1 + if reclaimed or errors: + logger.info( + "sweep_stale_parquet_locks: reclaimed=%d errors=%d under %s", + reclaimed, errors, root, + ) + return reclaimed + + def _try_acquire_file_lock(lock_path: Path): """Try to acquire an advisory exclusive flock on `lock_path`. Returns the open file object on success (caller must close to release); None diff --git a/pyproject.toml b/pyproject.toml index b62c7c4..0a9742f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.52.0" +version = "0.53.0" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT" diff --git a/tests/test_init_resume_sentinel.py b/tests/test_init_resume_sentinel.py new file mode 100644 index 0000000..fee194d --- /dev/null +++ b/tests/test_init_resume_sentinel.py @@ -0,0 +1,46 @@ +"""`agnes init` after an interrupted run resumes without `--force`. + +Regression coverage for issue #259: pre-0.53 every killed `agnes init` +left `CLAUDE.md` on disk but no completion marker; the next attempt +errored with `partial_state` and forced a full re-download. +""" + +from pathlib import Path + + +def test_init_complete_constant_points_at_dotfile(): + """The sentinel lives under `.claude/` (already created by init for + settings.json + hooks) so it doesn't pollute the workspace root and + doesn't trip the `forbidden_unconditional` check in + test_clean_install_integration.py (which bans `.agnes/`).""" + from cli.commands.init import _INIT_COMPLETE_FILE + assert _INIT_COMPLETE_FILE.startswith(".claude/") + assert _INIT_COMPLETE_FILE.endswith("init-complete") + + +def test_workspace_without_sentinel_is_treated_as_resumable(tmp_path: Path): + """A workspace with CLAUDE.md but no completion sentinel must NOT + raise `partial_state` — it's a resume.""" + # We exercise the gate logic directly by checking what the + # path-existence check sees. + (tmp_path / "CLAUDE.md").write_text("# Acme — AI Data Analyst\n", encoding="utf-8") + # Sentinel absent. + from cli.commands.init import _INIT_COMPLETE_FILE, _INIT_MARKER + assert _INIT_MARKER in (tmp_path / "CLAUDE.md").read_text() + assert not (tmp_path / _INIT_COMPLETE_FILE).exists() + # If both conditions hold (marker present, sentinel absent), the + # init flow's early-out should NOT fire. We can't easily run the + # full init command in a unit test, but the boolean condition is + # testable. + is_resumable = (tmp_path / "CLAUDE.md").exists() and not (tmp_path / _INIT_COMPLETE_FILE).exists() + assert is_resumable + + +def test_workspace_with_sentinel_blocks_without_force(tmp_path: Path): + """Both CLAUDE.md AND sentinel present → require --force (old behavior).""" + (tmp_path / "CLAUDE.md").write_text("# Acme — AI Data Analyst\n", encoding="utf-8") + sentinel = tmp_path / ".claude" / "init-complete" + sentinel.parent.mkdir(parents=True, exist_ok=True) + sentinel.write_text("completed_at: 2026-05-12T10:00:00+00:00\n", encoding="utf-8") + is_blocked = (tmp_path / "CLAUDE.md").exists() and sentinel.exists() + assert is_blocked diff --git a/tests/test_parquet_lock_sweep.py b/tests/test_parquet_lock_sweep.py new file mode 100644 index 0000000..bde3456 --- /dev/null +++ b/tests/test_parquet_lock_sweep.py @@ -0,0 +1,61 @@ +"""Startup sweep of stale `.parquet.lock` files (Issue #260). + +The lock acquire path already reclaims stale locks lazily on the next +materialize attempt, but a dedicated startup sweep removes zombies +sitting next to parquets without waiting for the next sync. +""" + +from pathlib import Path + + +def test_sweep_returns_zero_when_no_data_dir(tmp_path: Path): + """Missing directory must not raise — operators starting on a fresh + VM before any extracts exist should see a clean startup.""" + from connectors.bigquery.extractor import sweep_stale_parquet_locks + assert sweep_stale_parquet_locks(tmp_path / "nonexistent") == 0 + + +def test_sweep_keeps_fresh_locks(tmp_path: Path, monkeypatch): + """A lock file mtime'd within the TTL stays put.""" + from connectors.bigquery.extractor import sweep_stale_parquet_locks + extracts = tmp_path / "extracts" / "bigquery" / "data" + extracts.mkdir(parents=True) + lock = extracts / "t.parquet.lock" + lock.touch() + monkeypatch.setenv("AGNES_INSTANCE_CONFIG", str(tmp_path / "no.yaml")) + n = sweep_stale_parquet_locks(tmp_path / "extracts") + assert n == 0 + assert lock.exists() + + +def test_sweep_removes_stale_locks(tmp_path: Path): + """A lock mtime'd > TTL ago gets unlinked.""" + import os + import time + from connectors.bigquery.extractor import sweep_stale_parquet_locks, _get_lock_ttl_seconds + extracts = tmp_path / "extracts" / "bigquery" / "data" + extracts.mkdir(parents=True) + lock = extracts / "old.parquet.lock" + lock.touch() + # Backdate mtime to 2× TTL ago. + ttl = _get_lock_ttl_seconds() + ancient = time.time() - (ttl * 2) + os.utime(lock, (ancient, ancient)) + n = sweep_stale_parquet_locks(tmp_path / "extracts") + assert n == 1 + assert not lock.exists() + + +def test_sweep_handles_multiple_sources(tmp_path: Path): + """Recursive search covers bq + keboola + jira layouts under one root.""" + import os, time + from connectors.bigquery.extractor import sweep_stale_parquet_locks, _get_lock_ttl_seconds + for source in ("bigquery", "keboola", "jira"): + d = tmp_path / "extracts" / source / "data" + d.mkdir(parents=True) + lock = d / "t.parquet.lock" + lock.touch() + ancient = time.time() - (_get_lock_ttl_seconds() * 2) + os.utime(lock, (ancient, ancient)) + n = sweep_stale_parquet_locks(tmp_path / "extracts") + assert n == 3 diff --git a/tests/test_v2_schema_materialized_local.py b/tests/test_v2_schema_materialized_local.py new file mode 100644 index 0000000..7b7416c --- /dev/null +++ b/tests/test_v2_schema_materialized_local.py @@ -0,0 +1,64 @@ +"""Materialized BQ tables read schema from the local parquet, not from +BigQuery INFORMATION_SCHEMA. Issue #261 fix verification.""" + +from pathlib import Path +from unittest.mock import patch + + +def test_materialized_bq_schema_does_not_call_bq(tmp_path: Path, monkeypatch): + """When `query_mode='materialized'`, `build_schema_uncached` must + bypass `_fetch_bq_schema` and read from the local parquet.""" + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + # Create a tiny parquet with two columns so the parquet reader has + # something to DESCRIBE. + import duckdb + parquet_dir = tmp_path / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True) + parquet_path = parquet_dir / "orders.parquet" + conn = duckdb.connect(":memory:") + conn.execute( + f"COPY (SELECT 1 AS event_id, 'USD' AS currency) TO '{parquet_path}' (FORMAT PARQUET)" + ) + conn.close() + + from app.api.v2_schema import build_schema_uncached + fake_row = { + "id": "orders", + "source_type": "bigquery", + "query_mode": "materialized", + "bucket": "dwh", + "source_table": "orders", + } + fake_bq = object() # never used — BQ path must be skipped + + with patch("app.api.v2_schema._fetch_bq_schema") as mock_bq_schema, \ + patch("app.api.v2_schema._fetch_bq_table_options") as mock_bq_opts: + result = build_schema_uncached( + conn=None, table_id="orders", bq=fake_bq, row=fake_row, + ) + mock_bq_schema.assert_not_called() + mock_bq_opts.assert_not_called() + cols = {c["name"] for c in result["columns"]} + assert cols == {"event_id", "currency"} + assert result["sql_flavor"] == "duckdb" + + +def test_remote_bq_schema_still_calls_bq(tmp_path: Path, monkeypatch): + """Sanity: remote BQ tables (not materialized) still go through BQ.""" + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + from app.api.v2_schema import build_schema_uncached + fake_row = { + "id": "ue", + "source_type": "bigquery", + "query_mode": "remote", + "bucket": "finance", + "source_table": "ue", + } + fake_bq = object() + with patch( + "app.api.v2_schema._fetch_bq_schema", return_value=[], + ) as mock_bq_schema, patch( + "app.api.v2_schema._fetch_bq_table_options", return_value={}, + ): + build_schema_uncached(conn=None, table_id="ue", bq=fake_bq, row=fake_row) + mock_bq_schema.assert_called_once()