agnes-the-ai-analyst/pyproject.toml
ZdenekSrotyr 4751094e1c
fix(keboola): per-table fallback to legacy Storage-API client (#183)
* fix(keboola): per-table fallback to legacy Storage-API client

The DuckDB Keboola extension's per-table COPY fails with
`Schema '..."in.c-..."' does not exist or not authorized` on
projects whose Snowflake backend doesn't expose bucket schemas
to the storage-token-derived QueryService role
(keboola/duckdb-extension#17). ATTACH itself succeeds, so the
existing extension-level fallback in `_try_attach_extension`
never triggers — the table is just marked failed.

- Promote `kbcstorage>=0.9.0` from optional to core dep so the
  legacy client import in `_extract_via_legacy` doesn't crash
  default installs with `ModuleNotFoundError`.
- Wrap `_extract_via_extension` in a per-table try/except so a
  scan failure retries via `_extract_via_legacy` instead of
  recording `tables_failed` and moving on.

Slower than the extension path, but produces correct parquets
on affected projects while the upstream extension fix lands.

* test(keboola): cover per-table extension→legacy fallback

Two existing tests mocked _extract_via_extension to throw and asserted
the original message survived in result["errors"]. With per-table
fallback, the new flow retries via _extract_via_legacy — which on the
mock URLs would throw a different (404 / DNS-fail) error, replacing the
asserted message.

- Mock _extract_via_legacy alongside _extract_via_extension in
  test_network_timeout_during_extraction +
  test_partial_failure_continues +
  test_all_tables_fail_returns_full_failure_stats so the assertion
  observes the final propagated error from the fallback chain.
- Add test_extension_per_table_failure_falls_back_to_legacy that
  exercises the new behavior directly: extension scan fails with the
  QueryService schema-not-authorized message
  (keboola/duckdb-extension#17), legacy succeeds, parquet ends up
  queryable.
2026-05-05 15:47:44 +02:00

120 lines
4.4 KiB
TOML

[project]
name = "agnes-the-ai-analyst"
version = "0.35.1"
description = "Agnes — AI Data Analyst platform for AI analytical systems"
requires-python = ">=3.11,<3.14"
license = "MIT"
readme = "README.md"
dependencies = [
# Core database
"duckdb>=0.9.0",
# Web framework (FastAPI)
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"python-multipart>=0.0.26",
"jinja2>=3.1.0",
"starlette>=0.41.0",
# Authentication
"PyJWT>=2.8.0",
"itsdangerous>=2.1.0",
"authlib>=1.6.11",
"argon2-cffi>=23.1.0",
# HTTP client
"httpx>=0.27.0",
# CLI
"typer>=0.12.0",
"rich>=13.0.0",
# Configuration
"python-dotenv>=1.0.0",
"pyyaml>=6.0",
# Data processing
"pandas>=2.0.0",
"pyarrow>=12.0.0",
"pytz>=2024.1",
# SQL parsing — server-side WHERE validator for /api/v2/scan (app/api/where_validator.py)
# Minimum 30.x — older versions had walk() yielding (node, parent, key)
# tuples instead of expression nodes, which would silently bypass the
# WHERE-validator structural checks (isinstance(tuple, exp.Subquery)
# is always False). 30.x yields nodes directly.
"sqlglot>=30.0.0",
# Data source connectors
"google-cloud-bigquery>=3.0.0",
"google-cloud-bigquery-storage>=2.0.0",
# Google Workspace Cloud Identity / Admin SDK (Workspace group membership sync)
"google-api-python-client>=2.0.0",
# Profiler visualizations
"matplotlib>=3.8.0",
"numpy>=1.24.0",
# Claude Code marketplace endpoint — pure-Python git server mounted in FastAPI
"dulwich>=0.22.0",
"a2wsgi>=1.10.0",
# In-process TTL cache for marketplace etag (transitively present via
# google-auth, declared explicitly here because we depend on it directly).
"cachetools>=5.3.0",
# Per-IP rate limiting on auth endpoints (#45). In-process counters by
# default — fine for single-replica deploys. Multi-replica rollouts can
# swap the storage backend via slowapi's `storage_uri` (Redis, Memcached).
"slowapi>=0.1.9",
# LLM provider SDKs — core (not dev) because connectors/llm/*_provider.py
# is imported by services/{corporate_memory, verification_detector} which
# the scheduler drives in production. Promoted from [dev] in #176 to fix
# ModuleNotFoundError boot loops on default Compose deploys.
"anthropic>=0.30.0",
"openai>=1.30.0",
# Legacy Keboola Storage API client. The primary Keboola path is the
# DuckDB community extension (`connectors/keboola/access.py`,
# `connectors/keboola/extractor.py`), but it routes scans through
# Keboola QueryService — and on projects whose Snowflake backend
# doesn't expose bucket schemas to the storage-token-derived role
# the extension fails with `Schema '..."in.c-..."' does not exist
# or not authorized` while the same token reads fine via the
# `/v2/storage/tables/{id}/data-preview` REST endpoint
# (keboola/duckdb-extension#17). The extractor has had a `kbcstorage`
# fallback path since the extension landed
# (`connectors/keboola/extractor.py:_extract_via_legacy`); making
# the dep core means that fallback is actually importable in the
# default install instead of crashing with `ModuleNotFoundError`.
"kbcstorage>=0.9.0",
]
[project.optional-dependencies]
dev = [
"pytest>=9.0.0",
"pytest-timeout>=2.0.0",
"pytest-xdist>=3.0.0",
"faker>=24.0.0",
# jsonschema validates the corporate-memory extraction-tool golden fixtures
# under tests/test_corporate_memory_v1.py (extraction.json, correction.json,
# confidence_calibration.json). Production code does not depend on it.
"jsonschema>=4.0.0",
# FastAPI debug toolbar — gated behind DEBUG=1 env var in app/main.py.
# Provides per-request panels (headers, routes, timer, profiling, etc.)
# for local development. Never loaded in production (no DEBUG=1 there).
"fastapi-debug-toolbar>=0.6.3",
]
[project.scripts]
agnes = "cli.main:app"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["app", "src", "connectors", "cli", "services", "config"]
[tool.ruff]
line-length = 120
target-version = "py313"
[tool.uv]
dev-dependencies = [
"pytest>=9.0.0",
"pytest-timeout>=2.0.0",
"pytest-xdist>=3.0.0",
"faker>=24.0.0",
"anthropic>=0.30.0",
"openai>=1.30.0",
"fastapi-debug-toolbar>=0.6.3",
]