E2E test on a real BQ deploy showed every verification-extraction call fails with HTTP 400 invalid_request_error: "output_config.format.schema: For 'object' type, 'additionalProperties' must be explicitly set to false". The Anthropic structured-output API now requires the field on every object node in the json_schema. Fix: connectors/llm/anthropic_provider.py wraps the caller-supplied schema through a recursive _strict_json_schema() walker that adds the field where missing (preserving any explicit override), then passes the strict variant to the API. Six unit tests in TestStrictJsonSchema pin the recursion across nested objects, array items, and the no-mutation invariant. Adds /admin/scheduler-runs — a read-only admin page that surfaces the last 200 audit-log entries from scheduler-driven actions. New AuditRepository.query_actions(actions, limit) helper, new admin nav entry. Failed scheduler ticks (HTTP 401, network errors) don't reach the audit_log; the page calls that out with a hint to set SCHEDULER_API_TOKEN if no rows show up.
169 lines
5.7 KiB
Python
169 lines
5.7 KiB
Python
"""Anthropic provider for structured JSON extraction.
|
|
|
|
Uses the Anthropic API with native structured output (json_schema)
|
|
for reliable JSON extraction. Includes retry logic for transient errors.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
|
|
import anthropic
|
|
|
|
from .exceptions import (
|
|
LLMAuthError,
|
|
LLMFormatError,
|
|
LLMRateLimitError,
|
|
LLMRefusalError,
|
|
LLMTimeoutError,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Retry configuration
|
|
MAX_RETRIES = 3
|
|
INITIAL_BACKOFF_SECONDS = 2
|
|
BACKOFF_MULTIPLIER = 2
|
|
|
|
|
|
def _strict_json_schema(schema):
|
|
"""Return a copy of the schema with additionalProperties=False on every object type.
|
|
|
|
The Anthropic structured-output API rejects schemas where a `{"type": "object"}` node
|
|
omits `additionalProperties` (HTTP 400 invalid_request_error). We walk the schema
|
|
recursively and force the field where missing.
|
|
"""
|
|
if isinstance(schema, dict):
|
|
out = {k: _strict_json_schema(v) for k, v in schema.items()}
|
|
if out.get("type") == "object" and "additionalProperties" not in out:
|
|
out["additionalProperties"] = False
|
|
return out
|
|
if isinstance(schema, list):
|
|
return [_strict_json_schema(item) for item in schema]
|
|
return schema
|
|
|
|
|
|
class AnthropicExtractor:
|
|
"""Structured JSON extractor using the Anthropic API.
|
|
|
|
Uses output_config with json_schema format for structured output.
|
|
Retries transient errors (rate limit, timeout, connection) with
|
|
exponential backoff.
|
|
"""
|
|
|
|
def __init__(self, api_key: str, model: str) -> None:
|
|
"""Initialize the Anthropic extractor.
|
|
|
|
Args:
|
|
api_key: Anthropic API key.
|
|
model: Model identifier (e.g., "claude-haiku-4-5-20251001").
|
|
"""
|
|
self._client = anthropic.Anthropic(api_key=api_key)
|
|
self._model = model
|
|
|
|
def extract_json(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int,
|
|
json_schema: dict,
|
|
schema_name: str,
|
|
) -> dict:
|
|
"""Extract structured JSON using the Anthropic API.
|
|
|
|
Args:
|
|
prompt: The extraction prompt to send to the model.
|
|
max_tokens: Maximum tokens in the response.
|
|
json_schema: JSON Schema that the response must conform to.
|
|
schema_name: Human-readable name for the schema.
|
|
|
|
Returns:
|
|
Parsed JSON dictionary conforming to the provided schema.
|
|
|
|
Raises:
|
|
LLMAuthError: Invalid API key.
|
|
LLMRateLimitError: Rate limited after all retries.
|
|
LLMTimeoutError: Timeout/connection error after all retries.
|
|
LLMFormatError: Response is not valid JSON.
|
|
LLMRefusalError: Model refused to respond.
|
|
"""
|
|
last_exception: Exception | None = None
|
|
|
|
for attempt in range(1, MAX_RETRIES + 1):
|
|
try:
|
|
return self._attempt_extraction(
|
|
prompt, max_tokens, json_schema, schema_name, attempt,
|
|
)
|
|
except LLMAuthError:
|
|
raise
|
|
except LLMRefusalError:
|
|
raise
|
|
except (LLMRateLimitError, LLMTimeoutError) as e:
|
|
last_exception = e
|
|
if attempt < MAX_RETRIES:
|
|
delay = INITIAL_BACKOFF_SECONDS * (BACKOFF_MULTIPLIER ** (attempt - 1))
|
|
logger.warning(
|
|
"Transient error on attempt %d/%d for model %s, "
|
|
"retrying in %ds: %s",
|
|
attempt, MAX_RETRIES, self._model, delay,
|
|
type(e).__name__,
|
|
)
|
|
time.sleep(delay)
|
|
|
|
raise last_exception # type: ignore[misc]
|
|
|
|
def _attempt_extraction(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int,
|
|
json_schema: dict,
|
|
schema_name: str,
|
|
attempt: int,
|
|
) -> dict:
|
|
"""Single extraction attempt against the Anthropic API."""
|
|
logger.info(
|
|
"Anthropic extraction attempt %d/%d, model=%s, schema=%s",
|
|
attempt, MAX_RETRIES, self._model, schema_name,
|
|
)
|
|
|
|
try:
|
|
response = self._client.messages.create(
|
|
model=self._model,
|
|
max_tokens=max_tokens,
|
|
messages=[{"role": "user", "content": prompt}],
|
|
output_config={
|
|
"format": {
|
|
"type": "json_schema",
|
|
"schema": _strict_json_schema(json_schema),
|
|
},
|
|
},
|
|
)
|
|
except anthropic.AuthenticationError as e:
|
|
raise LLMAuthError("Anthropic authentication failed (check API key)") from e
|
|
except anthropic.RateLimitError as e:
|
|
raise LLMRateLimitError("Anthropic rate limited") from e
|
|
except (anthropic.APITimeoutError, anthropic.APIConnectionError) as e:
|
|
raise LLMTimeoutError(
|
|
f"Anthropic connection error ({type(e).__name__})"
|
|
) from e
|
|
|
|
# Check for truncation - raise and let outer retry loop handle it
|
|
if response.stop_reason == "max_tokens":
|
|
raise LLMFormatError(
|
|
f"Response truncated (max_tokens) for schema {schema_name}"
|
|
)
|
|
|
|
# Check for refusal
|
|
if response.stop_reason == "end_turn" and not response.content:
|
|
raise LLMRefusalError(
|
|
f"Model refused to generate response for schema {schema_name}"
|
|
)
|
|
|
|
# Parse JSON from response
|
|
try:
|
|
text = response.content[0].text
|
|
return json.loads(text)
|
|
except (json.JSONDecodeError, IndexError, AttributeError) as e:
|
|
raise LLMFormatError(
|
|
f"Failed to parse Anthropic response as JSON for "
|
|
f"schema {schema_name} ({type(e).__name__})"
|
|
) from e
|