fix(setup): seed default ai: block + env-var fallback (#176)

POST /api/admin/configure now writes a default ai: block into the
instance.yaml overlay when the request leaves it untouched and either
ANTHROPIC_API_KEY or LLM_API_KEY is set in the environment. The block
references the env var via ${VAR} syntax — secrets never land in YAML.

connectors.llm.factory grows create_extractor_from_env_or_config which
falls back to ANTHROPIC_API_KEY / LLM_API_KEY when ai_config is empty
and raises a clear ValueError when neither is available. Both
services/corporate_memory and services/verification_detector switch to
the new helper, replacing the old 'silently skip when ai: missing'
path that was the silent-failure root cause.

Tests:
- tests/test_setup_ai_block.py — overlay seeding contract.
- tests/test_llm_provider_env_fallback.py — fallback + fail-fast.
This commit is contained in:
ZdenekSrotyr 2026-05-04 23:55:19 +02:00
parent d2104555c6
commit bbb04ac041
7 changed files with 283 additions and 20 deletions

View file

@ -2626,6 +2626,29 @@ async def configure_instance(
"location": request.bigquery_location or "us", "location": request.bigquery_location or "us",
} }
# Seed an ai: block on first-time setup so LLM-driven services
# (corporate_memory, verification_detector) can boot without manual
# YAML editing. Only inserts when the overlay has no ai: yet AND an
# appropriate env var is present — never overwrites operator config,
# never writes a placeholder block (#176).
if "ai" not in overlay:
anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
llm_key = os.environ.get("LLM_API_KEY", "").strip()
if anthropic_key:
overlay["ai"] = {
"provider": "anthropic",
"api_key": "${ANTHROPIC_API_KEY}",
"model": "claude-haiku-4-5-20251001",
"structured_output": "auto",
}
elif llm_key:
overlay["ai"] = {
"provider": "anthropic",
"api_key": "${LLM_API_KEY}",
"model": "claude-haiku-4-5-20251001",
"structured_output": "auto",
}
# Atomic write to writable data volume — same tmp + os.replace pattern # Atomic write to writable data volume — same tmp + os.replace pattern
# as the server-config editor so a concurrent save can't tear the file. # as the server-config editor so a concurrent save can't tear the file.
config_path.parent.mkdir(parents=True, exist_ok=True) config_path.parent.mkdir(parents=True, exist_ok=True)

View file

@ -6,6 +6,10 @@ providers with automatic fallback strategies for structured output.
""" """
from .base import StructuredExtractor from .base import StructuredExtractor
from .factory import create_extractor from .factory import create_extractor, create_extractor_from_env_or_config
__all__ = ["StructuredExtractor", "create_extractor"] __all__ = [
"StructuredExtractor",
"create_extractor",
"create_extractor_from_env_or_config",
]

View file

@ -5,6 +5,7 @@ and creates the appropriate StructuredExtractor implementation.
""" """
import logging import logging
import os
from urllib.parse import urlparse from urllib.parse import urlparse
from .anthropic_provider import AnthropicExtractor from .anthropic_provider import AnthropicExtractor
@ -120,6 +121,48 @@ def create_extractor(ai_config: dict) -> StructuredExtractor:
) )
def create_extractor_from_env_or_config(
ai_config: dict | None,
) -> StructuredExtractor:
"""Build an extractor from config, falling back to env vars.
Resolution order (#176):
1. ``ai_config`` is a non-empty dict delegate to :func:`create_extractor`.
2. ``ANTHROPIC_API_KEY`` set AnthropicExtractor with the default model.
3. ``LLM_API_KEY`` set without a base_url AnthropicExtractor (the proxy
case typically also wires a base_url, in which case the operator should
use the explicit ai: block; this fallback is a best-effort convenience).
4. Otherwise raise ``ValueError`` with a clear actionable message never
silently exit, never return ``None``. The previous "skip when ai: is
missing" behavior was the silent-failure root cause in #176.
"""
if ai_config:
return create_extractor(ai_config)
anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
llm_key = os.environ.get("LLM_API_KEY", "").strip()
if anthropic_key:
logger.info(
"No ai: block in instance.yaml; falling back to ANTHROPIC_API_KEY env var"
)
return AnthropicExtractor(api_key=anthropic_key, model=DEFAULT_MODEL)
if llm_key:
logger.info(
"No ai: block in instance.yaml; falling back to LLM_API_KEY env var"
)
return AnthropicExtractor(api_key=llm_key, model=DEFAULT_MODEL)
raise ValueError(
"LLM not configured. Add an ai: block to instance.yaml (see "
"config/instance.yaml.example) OR set ANTHROPIC_API_KEY / LLM_API_KEY "
"in the environment. The corporate-memory and verification-detector "
"services cannot run without one of these."
)
def _validate_api_key(api_key: str) -> None: def _validate_api_key(api_key: str) -> None:
"""Validate that an API key is present and non-empty. """Validate that an API key is present and non-empty.

View file

@ -416,17 +416,20 @@ def collect_all(dry_run: bool = False) -> dict:
stats["skipped"] = True stats["skipped"] = True
return stats return stats
# Step 2: Initialize AI extractor # Step 2: Initialize AI extractor.
# Fail-fast (#176): no silent skip on missing ai: block. The factory
# falls back to ANTHROPIC_API_KEY / LLM_API_KEY env vars and raises a
# clear ValueError if neither config nor env is available.
try: try:
from config.loader import load_instance_config from config.loader import load_instance_config
from connectors.llm import create_extractor_from_env_or_config
instance_config = load_instance_config() try:
ai_config = instance_config.get("ai") instance_config = load_instance_config()
if not ai_config: except (ValueError, FileNotFoundError):
logger.warning("No ai: section in instance.yaml, skipping catalog refresh") instance_config = {}
stats["skipped"] = True ai_config = instance_config.get("ai") if instance_config else None
return stats extractor = create_extractor_from_env_or_config(ai_config)
extractor = create_extractor(ai_config)
except (ValueError, FileNotFoundError) as e: except (ValueError, FileNotFoundError) as e:
stats["errors"].append(str(e)) stats["errors"].append(str(e))
logger.error("Failed to initialize AI extractor: %s", e) logger.error("Failed to initialize AI extractor: %s", e)

View file

@ -49,22 +49,25 @@ def main() -> None:
setup_logging(__name__, level="DEBUG" if args.verbose else "INFO") setup_logging(__name__, level="DEBUG" if args.verbose else "INFO")
# Load AI config lazily (same pattern as corporate memory collector) # Load AI config; fail fast on missing config + env (#176).
from connectors.llm import create_extractor_from_env_or_config
try: try:
from config.loader import load_instance_config from config.loader import load_instance_config
config = load_instance_config() try:
ai_config = config.get("ai") config = load_instance_config()
if not ai_config: except (ValueError, FileNotFoundError):
logger.error("No ai: section in instance.yaml, cannot run verification detector") config = {}
sys.exit(1) ai_config = config.get("ai") if config else None
extractor = create_extractor_from_env_or_config(ai_config)
except (ValueError, FileNotFoundError) as e: except (ValueError, FileNotFoundError) as e:
logger.error("Failed to load config: %s", e) logger.error(
"Failed to initialize verification detector: %s. "
"Configure ai: in instance.yaml or set ANTHROPIC_API_KEY / LLM_API_KEY.",
e,
)
sys.exit(1) sys.exit(1)
from connectors.llm import create_extractor
extractor = create_extractor(ai_config)
conn = get_system_db() conn = get_system_db()
if args.reset: if args.reset:

View file

@ -0,0 +1,80 @@
"""LLM provider env-var fallback + fail-fast behavior.
When no ai: block is configured, `connectors.llm.factory.create_extractor`
must:
1. Build an extractor from `ANTHROPIC_API_KEY` / `LLM_API_KEY` if either
env var is set (so an operator who only edited .env still gets a
working pipeline).
2. Fail fast with a clear error if neither config nor env is available.
No silent skip.
Closes one of five defects in #176.
"""
from __future__ import annotations
from unittest.mock import patch
import pytest
from connectors.llm.factory import create_extractor, create_extractor_from_env_or_config
class TestEnvFallback:
"""Mocks the AnthropicExtractor constructor so the tests don't try to
open a live SDK client (which loads system SSL certs at __init__ time
and is unhappy on machines with corporate CA-bundle env vars pointing
at a non-existent file). The test surface is the factory routing logic,
not the SDK wiring that's covered by tests/test_llm_providers_full.py.
"""
def test_anthropic_env_fallback_builds_extractor(self, monkeypatch):
monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-from-env-aaa")
monkeypatch.delenv("LLM_API_KEY", raising=False)
with patch("connectors.llm.factory.AnthropicExtractor") as mock_cls:
ex = create_extractor_from_env_or_config(ai_config=None)
assert ex is mock_cls.return_value
kwargs = mock_cls.call_args.kwargs
assert kwargs["api_key"] == "sk-ant-from-env-aaa"
def test_llm_api_key_env_fallback_builds_extractor(self, monkeypatch):
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.setenv("LLM_API_KEY", "sk-proxy-from-env-bbb")
with patch("connectors.llm.factory.AnthropicExtractor") as mock_cls:
ex = create_extractor_from_env_or_config(ai_config=None)
assert ex is mock_cls.return_value
kwargs = mock_cls.call_args.kwargs
assert kwargs["api_key"] == "sk-proxy-from-env-bbb"
def test_no_config_no_env_fails_fast(self, monkeypatch):
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.delenv("LLM_API_KEY", raising=False)
with pytest.raises(ValueError) as excinfo:
create_extractor_from_env_or_config(ai_config=None)
msg = str(excinfo.value)
# Error must mention BOTH config + env paths so operators know how to fix it.
assert "instance.yaml" in msg or "ai:" in msg
assert "ANTHROPIC_API_KEY" in msg
def test_explicit_ai_config_wins_over_env(self, monkeypatch):
monkeypatch.setenv("ANTHROPIC_API_KEY", "ignored-because-config-wins")
ai_cfg = {
"provider": "anthropic",
"api_key": "sk-ant-from-cfg-zzz",
"model": "claude-haiku-4-5-20251001",
}
with patch("connectors.llm.factory.AnthropicExtractor") as mock_cls:
ex = create_extractor_from_env_or_config(ai_config=ai_cfg)
assert ex is mock_cls.return_value
kwargs = mock_cls.call_args.kwargs
assert kwargs["api_key"] == "sk-ant-from-cfg-zzz"
def test_empty_dict_falls_through_to_env(self, monkeypatch):
"""ai: {} is treated the same as no block — fall through to env vars."""
monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-from-env-ccc")
with patch("connectors.llm.factory.AnthropicExtractor") as mock_cls:
ex = create_extractor_from_env_or_config(ai_config={})
assert ex is mock_cls.return_value
kwargs = mock_cls.call_args.kwargs
assert kwargs["api_key"] == "sk-ant-from-env-ccc"

View file

@ -0,0 +1,107 @@
"""Tests for /api/admin/configure writing a default ai: block.
First-time setup must seed an ai: section in the instance.yaml overlay so
LLM-driven services (corporate_memory, verification_detector) can boot
without a manual edit. Closes one of five defects in #176.
"""
from __future__ import annotations
import yaml
from unittest.mock import patch
def _auth(token: str) -> dict:
return {"Authorization": f"Bearer {token}"}
def _read_overlay(env: dict) -> dict:
overlay_path = env["data_dir"] / "state" / "instance.yaml"
if not overlay_path.exists():
return {}
return yaml.safe_load(overlay_path.read_text()) or {}
class TestConfigureSeedsAiBlock:
def test_configure_seeds_ai_block_when_anthropic_api_key_is_set(self, seeded_app, monkeypatch):
"""ANTHROPIC_API_KEY in the env → overlay gets an ai: block referencing it."""
monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test-keyvalue")
c = seeded_app["client"]
token = seeded_app["admin_token"]
resp = c.post(
"/api/admin/configure",
json={"data_source": "local"},
headers=_auth(token),
)
assert resp.status_code == 200
overlay = _read_overlay(seeded_app["env"])
assert "ai" in overlay, "configure must seed ai: block when ANTHROPIC_API_KEY is set"
ai = overlay["ai"]
assert ai.get("provider") == "anthropic"
# The overlay stores the env-var reference (${ANTHROPIC_API_KEY}), not
# the raw secret — secrets belong in .env_overlay only.
assert ai.get("api_key") == "${ANTHROPIC_API_KEY}"
assert "model" in ai
def test_configure_seeds_ai_block_when_llm_api_key_is_set(self, seeded_app, monkeypatch):
"""LLM_API_KEY (proxy/openai_compat fallback) is also acceptable."""
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.setenv("LLM_API_KEY", "sk-proxy-keyvalue")
c = seeded_app["client"]
token = seeded_app["admin_token"]
resp = c.post(
"/api/admin/configure",
json={"data_source": "local"},
headers=_auth(token),
)
assert resp.status_code == 200
overlay = _read_overlay(seeded_app["env"])
assert "ai" in overlay
# The fallback uses ${LLM_API_KEY} — same env-var-reference pattern.
assert overlay["ai"].get("api_key") == "${LLM_API_KEY}"
def test_configure_does_not_seed_ai_when_no_key_in_env(self, seeded_app, monkeypatch):
"""No env keys → no ai block written. Operator must add manually.
We deliberately do not write a placeholder block: the LLM services
fail-fast on a missing block and the operator gets a clear error.
"""
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.delenv("LLM_API_KEY", raising=False)
c = seeded_app["client"]
token = seeded_app["admin_token"]
resp = c.post(
"/api/admin/configure",
json={"data_source": "local"},
headers=_auth(token),
)
assert resp.status_code == 200
overlay = _read_overlay(seeded_app["env"])
assert "ai" not in overlay
def test_configure_preserves_existing_ai_block(self, seeded_app, monkeypatch):
"""If overlay already has ai: section, configure must not overwrite it."""
monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-from-env")
# Pre-populate the overlay with a custom ai block.
overlay_path = seeded_app["env"]["data_dir"] / "state" / "instance.yaml"
overlay_path.parent.mkdir(parents=True, exist_ok=True)
overlay_path.write_text(yaml.dump({
"ai": {
"provider": "openai_compat",
"api_key": "${LLM_API_KEY}",
"base_url": "https://litellm.example.com",
"model": "claude-haiku-4-5-20251001",
}
}))
c = seeded_app["client"]
token = seeded_app["admin_token"]
resp = c.post(
"/api/admin/configure",
json={"data_source": "local"},
headers=_auth(token),
)
assert resp.status_code == 200
overlay = _read_overlay(seeded_app["env"])
assert overlay["ai"]["provider"] == "openai_compat"
assert overlay["ai"]["base_url"] == "https://litellm.example.com"