agnes-the-ai-analyst/cli/commands/fetch.py
ZdenekSrotyr 1563b05f2e refactor(cli): hard-cutover env vars + config dir to AGNES_*
Task 0.5 of clean-analyst-bootstrap. Greenfield rewrite — no fallback,
no aliases. Existing dev environments lose their cached PAT and must
re-authenticate.

Env var renames (hard cutover):
- DA_CONFIG_DIR    -> AGNES_CONFIG_DIR
- DA_SERVER        -> AGNES_SERVER
- DA_SERVER_URL    -> AGNES_SERVER_URL  (test-only stale ref, not in spec)
- DA_NO_UPDATE_CHECK -> AGNES_NO_UPDATE_CHECK
- DA_LOCAL_DIR     -> AGNES_LOCAL_DIR
- DA_TOKEN         -> AGNES_TOKEN
- DA_STREAM_RETRIES -> AGNES_STREAM_RETRIES

Config dir rename: ~/.config/da/ -> ~/.config/agnes/ (across code,
comments, docstrings, error messages, install templates, dev scripts).

Stale `da X` references in CLI source (and adjacent app/, tests/):
swept docstrings, comments, help text, and error messages where the
verb survives the rewrite (init, pull, push, catalog, status, diagnose,
auth, admin, skills, query, schema, describe, explore, disk-info,
snapshot, login, logout, whoami, server, setup) and replaced `da X`
with `agnes X`. Intentionally kept `da sync`, `da fetch`, `da analyst`,
`da metrics` — those verbs are removed in later tasks; the legacy
strings will be detected by `_LEGACY_STRINGS` (added in Task 2).

Test fixes:
- TestCLIVersion now asserts output starts with `agnes ` (was `da `).

Test results: 2675 passed, 25 skipped (full pytest run, excluding 9
pre-existing test_db.py / test_user_management.py / test_e2e_extract.py
/ test_cli_binary_rename.py failures unrelated to this rename).
2026-05-04 16:35:44 +02:00

184 lines
7.3 KiB
Python

"""`da fetch` — materialize a filtered subset of a remote table locally (spec §4.2)."""
from __future__ import annotations
import hashlib
import json
import os
from datetime import datetime, timezone
from pathlib import Path
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
import typer
from cli.snapshot_meta import (
SnapshotMeta, write_meta, read_meta, snapshot_lock,
)
from cli.v2_client import api_post_json, api_post_arrow, V2ClientError
fetch_app = typer.Typer(
help="Fetch a filtered subset of a remote table locally",
context_settings={"allow_interspersed_args": True},
)
def _local_dir() -> Path:
return Path(os.environ.get("AGNES_LOCAL_DIR", ".")).resolve()
def _print_estimate(d: dict) -> None:
# `dict.get(k, default)` returns `default` only when k is missing; if k
# maps to None (server returns None for non-BQ tables) the default doesn't
# kick in. `or 0` covers both cases.
typer.echo(f" estimated_scan_bytes: {(d.get('estimated_scan_bytes') or 0):>15,} bytes")
typer.echo(f" estimated_result_rows: {(d.get('estimated_result_rows') or 0):>15,}")
typer.echo(f" estimated_result_bytes: {(d.get('estimated_result_bytes') or 0):>15,} bytes")
typer.echo(f" bq_cost_estimate_usd: $ {(d.get('bq_cost_estimate_usd') or 0):.4f}")
@fetch_app.callback(invoke_without_command=True)
def fetch(
ctx: typer.Context,
table_id: str = typer.Argument(...),
select: str = typer.Option(None, "--select", help="Comma-separated column list"),
where: str = typer.Option(None, "--where", help="WHERE predicate (BQ flavor for remote tables)"),
limit: int = typer.Option(None, "--limit"),
order_by: str = typer.Option(None, "--order-by", help="Comma-separated"),
as_name: str = typer.Option(None, "--as", help="Local snapshot name (default: <table_id>)"),
estimate: bool = typer.Option(False, "--estimate", help="Run dry-run only, do not fetch"),
no_estimate: bool = typer.Option(False, "--no-estimate", help="Skip the pre-fetch estimate"),
force: bool = typer.Option(False, "--force", help="Overwrite existing snapshot of the same name"),
):
"""Fetch a filtered subset of a remote table locally."""
if ctx.invoked_subcommand is not None:
return
name = as_name or table_id
# Snapshot name lands in DuckDB CREATE VIEW as a quoted identifier; a `"`
# in the name would break out and enable arbitrary SQL execution against
# the user's local analytics.duckdb. Validate up-front with the same
# regex used elsewhere for safe identifiers.
import re as _re
if not _re.match(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$", name):
typer.echo(
f"Error: snapshot name {name!r} is not a safe identifier. "
f"Use letters, digits, and underscores; must start with a letter "
f"or underscore; max 64 characters.",
err=True,
)
raise typer.Exit(2)
snap_dir = _local_dir() / "user" / "snapshots"
snap_dir.mkdir(parents=True, exist_ok=True)
# Build request
req = {"table_id": table_id}
if select:
req["select"] = [c.strip() for c in select.split(",") if c.strip()]
if where:
req["where"] = where
if limit:
req["limit"] = int(limit)
if order_by:
req["order_by"] = [c.strip() for c in order_by.split(",") if c.strip()]
# Estimate (always shown unless --no-estimate). The `--estimate` early
# exit is OUTSIDE this block — `--estimate` is a cost-safety mechanism
# ("dry-run only, do not fetch") whose guarantee must hold even when
# the user also passes `--no-estimate` (silly combo; treat as dry-run
# because the fetch-blocking semantics dominate).
est = None
if not no_estimate:
try:
est = api_post_json("/api/v2/scan/estimate", req)
except V2ClientError as e:
typer.echo(f"Error: estimate failed: {e}", err=True)
raise typer.Exit(_exit_code_for(e))
typer.echo(f"Estimate for {table_id}:")
_print_estimate(est)
if estimate:
return
# Cheap existence pre-check (outside the lock) so we don't waste a BQ
# scan on an obviously-redundant fetch. Authoritative re-check happens
# under the lock below — necessary because between this check and the
# write a concurrent `da fetch --as same_name` could create the file.
if not force and read_meta(snap_dir, name) is not None:
existing = read_meta(snap_dir, name)
typer.echo(
f"Error: snapshot {name!r} already exists "
f"(fetched {existing.fetched_at}, {existing.rows:,} rows). "
f"Pass --force to overwrite, or 'agnes snapshot refresh {name}' to update in place.",
err=True,
)
raise typer.Exit(6)
# Fetch
try:
table = api_post_arrow("/api/v2/scan", req)
except V2ClientError as e:
typer.echo(f"Error: fetch failed: {e}", err=True)
raise typer.Exit(_exit_code_for(e))
# Install under flock — re-check existence here to close the TOCTOU
# window between the early check above and this write.
parquet_path = snap_dir / f"{name}.parquet"
with snapshot_lock(snap_dir):
if not force and read_meta(snap_dir, name) is not None:
existing = read_meta(snap_dir, name)
typer.echo(
f"Error: snapshot {name!r} was created by a concurrent "
f"`da fetch` (fetched {existing.fetched_at}, "
f"{existing.rows:,} rows). Pass --force to overwrite.",
err=True,
)
raise typer.Exit(6)
pq.write_table(table, parquet_path)
# Register view in user analytics.duckdb
local_db = _local_dir() / "user" / "duckdb" / "analytics.duckdb"
local_db.parent.mkdir(parents=True, exist_ok=True)
conn = duckdb.connect(str(local_db))
try:
safe_path = str(parquet_path).replace("'", "''")
conn.execute(
f"CREATE OR REPLACE VIEW \"{name}\" AS SELECT * FROM read_parquet('{safe_path}')"
)
finally:
conn.close()
# Compute hash + write meta
result_hash = hashlib.md5(parquet_path.read_bytes()[:1_000_000]).hexdigest()
now = datetime.now(timezone.utc).isoformat()
meta = SnapshotMeta(
name=name, table_id=table_id,
select=req.get("select"), where=req.get("where"),
limit=req.get("limit"), order_by=req.get("order_by"),
fetched_at=now, effective_as_of=now,
rows=int(table.num_rows),
bytes_local=parquet_path.stat().st_size,
estimated_scan_bytes_at_fetch=int(est.get("estimated_scan_bytes", 0)) if est is not None else 0,
result_hash_md5=result_hash,
)
write_meta(snap_dir, meta)
typer.echo(f"Fetched {table.num_rows:,} rows -> {name}")
def _exit_code_for(e: V2ClientError) -> int:
if e.status_code == 400:
# Inspect body for 'kind'
body = e.body if isinstance(e.body, dict) else {}
if body.get("error") == "validator_rejected":
return 2
return 2
if e.status_code == 401:
return 7
if e.status_code == 403:
return 8
if e.status_code == 404:
return 8 # treat unknown table as RBAC-equivalent
if e.status_code == 429:
return 3
if e.status_code >= 500:
return 5
return 9