Since 0.47.0 GET /api/v2/catalog enriched each remote BigQuery row by fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB BigQuery extension *inside the request*. On cold caches that fanned out to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned / view-backed tables — and reliably blew the CLI's 30 s httpx ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck inside connectors/bigquery/metadata._fetch_via_legacy_tables. Refactor: enrichment is read exclusively from a new persistent bq_metadata_cache DuckDB table (schema v40), populated by a scheduler-driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default 4 h). Cold catalog response on a fresh container is now tens of milliseconds with metadata_freshness=never_fetched for unwarmed rows. New surface: - POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full) - POST /api/v2/metadata-cache/refresh?table=<id> (admin, single) - GET /api/v2/metadata-cache/status (auth, non-admin) - metadata_freshness field per catalog row Removed (internal API): v2_catalog._size_hint_for_row, _resolve_remote_metadata, _metadata_provider_for, _build_metadata_request, _materialized_size_hint, in-memory _metadata_cache. Response shape unchanged for external consumers. 991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder, test_cli_binary_rename) unrelated to this change.
142 lines
5.5 KiB
TOML
142 lines
5.5 KiB
TOML
[project]
name = "agnes-the-ai-analyst"
version = "0.50.0"
description = "Agnes — AI Data Analyst platform for AI analytical systems"
requires-python = ">=3.11,<3.14"
license = "MIT"
readme = "README.md"

dependencies = [
    # Core database
    "duckdb>=0.9.0",
    # Web framework (FastAPI)
    "fastapi>=0.115.0",
    "uvicorn[standard]>=0.32.0",
    "python-multipart>=0.0.27",
    "jinja2>=3.1.0",
    "starlette>=0.41.0",
    # Authentication
    "PyJWT>=2.8.0",
    "itsdangerous>=2.1.0",
    "authlib>=1.6.11",
    "argon2-cffi>=23.1.0",
    # HTTP client. `h2` enables HTTP/2 multiplexing for the persistent
    # CLI client used by `agnes pull` (one TCP connection serves N
    # concurrent parquet streams + range chunks). `cli/client.py`
    # gracefully falls back to HTTP/1.1 if h2 is missing, so this
    # extra is for performance, not correctness.
    "httpx>=0.27.0",
    "h2>=4.1.0",
    # CLI
    "typer>=0.12.0",
    "rich>=13.0.0",
    # Configuration
    "python-dotenv>=1.0.0",
    "pyyaml>=6.0",
    # Data processing
    "pandas>=2.0.0",
    "pyarrow>=12.0.0",
    "pytz>=2024.1",
    # SQL parsing — server-side WHERE validator for /api/v2/scan (app/api/where_validator.py)
    # Minimum 30.x — older versions had walk() yielding (node, parent, key)
    # tuples instead of expression nodes, which would silently bypass the
    # WHERE-validator structural checks (isinstance(tuple, exp.Subquery)
    # is always False). 30.x yields nodes directly.
    "sqlglot>=30.0.0",
    # Data source connectors
    "google-cloud-bigquery>=3.0.0",
    "google-cloud-bigquery-storage>=2.0.0",
    # Google Workspace Cloud Identity / Admin SDK (Workspace group membership sync)
    "google-api-python-client>=2.0.0",
    # Profiler visualizations
    "matplotlib>=3.8.0",
    "numpy>=1.24.0",
    # Claude Code marketplace endpoint — pure-Python git server mounted in FastAPI
    "dulwich>=0.22.0",
    "a2wsgi>=1.10.0",
    # In-process TTL cache for marketplace etag (transitively present via
    # google-auth, declared explicitly here because we depend on it directly).
    "cachetools>=5.3.0",
    # Per-IP rate limiting on auth endpoints (#45). In-process counters by
    # default — fine for single-replica deploys. Multi-replica rollouts can
    # swap the storage backend via slowapi's `storage_uri` (Redis, Memcached).
    "slowapi>=0.1.9",
    # LLM provider SDKs — core (not dev) because connectors/llm/*_provider.py
    # is imported by services/{corporate_memory, verification_detector} which
    # the scheduler drives in production. Promoted from [dev] in #176 to fix
    # ModuleNotFoundError boot loops on default Compose deploys.
    "anthropic>=0.30.0",
    "openai>=1.30.0",
    # Keboola Storage API SDK — used by:
    #   - `connectors/keboola/client.py` for admin-side bucket / table list
    #     (consumed from `app/api/admin.py` discover-and-register, table
    #     metadata refresh).
    # Extraction itself uses the lightweight `connectors/keboola/storage_api.py`
    # module (export-async + signed-URL download) which talks to Storage API
    # directly via `requests` — no SDK dependency on the data-path side. The
    # SDK stays for the metadata reads.
    "kbcstorage>=0.9.0",
    # Server-Sent Events support for Starlette/FastAPI responses.
    # NOTE(review): not part of the Keboola group above — confirm which
    # endpoints stream via SSE and document them here.
    "sse-starlette>=2.0",
    # Optional observability — pure-Python, no compilation. Lazily initialized
    # in src/observability/posthog_client.py and only emits events when
    # POSTHOG_API_KEY is set in the environment. With the key unset the
    # integration is fully off (no network, no init). See docs/observability.md.
    "posthog>=3.7.0",
    # Rust-backed (ammonia) HTML sanitizer for admin-edited rich content
    # (news intro + body). Allowlist-based with per-tag attribute scoping;
    # closes the bypass shapes the legacy regex sanitizer in
    # src/welcome_template.py was vulnerable to. Pre-built wheels published
    # for all supported (mac/linux/windows × arm64/x86_64) targets.
    "nh3>=0.2",
    # Cross-platform advisory file locking for the `agnes push` single-instance
    # guard. Wraps fcntl.flock on POSIX and msvcrt.locking on Windows behind
    # a uniform API; OS releases the lock automatically on process exit (no
    # stale-lock detection required). Used by cli/lib/push_lock.py.
    "filelock>=3.13,<4",
]

[project.optional-dependencies]
observability = [
    # Already in base dependencies — listed here so operators who want to
    # be explicit can `pip install -e ".[observability]"` and signal intent.
    "posthog>=3.7.0",
]
dev = [
    "pytest>=9.0.0",
    "pytest-timeout>=2.0.0",
    "pytest-xdist>=3.0.0",
    "faker>=24.0.0",
    # jsonschema validates the corporate-memory extraction-tool golden fixtures
    # under tests/test_corporate_memory_v1.py (extraction.json, correction.json,
    # confidence_calibration.json). Production code does not depend on it.
    "jsonschema>=4.0.0",
    # FastAPI debug toolbar — gated behind DEBUG=1 env var in app/main.py.
    # Provides per-request panels (headers, routes, timer, profiling, etc.)
    # for local development. Never loaded in production (no DEBUG=1 there).
    "fastapi-debug-toolbar>=0.6.3",
]

[project.scripts]
# Console entry point: installs an `agnes` command that calls `main` in cli/main.py.
agnes = "cli.main:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
# Top-level packages shipped in the wheel. A new top-level package must be
# added here or it will be missing from the built distribution.
packages = ["app", "src", "connectors", "cli", "services", "config"]

[tool.ruff]
line-length = 120
# Ruff's target-version is the *minimum* Python version the code must run on,
# so it has to match the lower bound of project.requires-python (>=3.11).
# Targeting a newer version lets 3.12+/3.13-only syntax and rewrites pass
# lint and then fail on the oldest supported interpreter.
target-version = "py311"

[tool.uv]
# Development dependencies installed by `uv sync`. Keep this list in step with
# the `dev` extra under [project.optional-dependencies].
# NOTE(review): uv documents `tool.uv.dev-dependencies` as the legacy spelling
# of a `dev` entry under [dependency-groups] (PEP 735) — consider migrating.
dev-dependencies = [
    "pytest>=9.0.0",
    "pytest-timeout>=2.0.0",
    "pytest-xdist>=3.0.0",
    "faker>=24.0.0",
    # Required by the corporate-memory golden-fixture tests; it was present in
    # the `dev` extra but missing here, so uv-managed environments lacked it.
    "jsonschema>=4.0.0",
    # Also declared in [project].dependencies, so these two entries are
    # redundant; kept with identical constraints to avoid changing resolution.
    "anthropic>=0.30.0",
    "openai>=1.30.0",
    "fastapi-debug-toolbar>=0.6.3",
]