1. instance.yaml overlay path now matches read site under STATE_DIR.
Three sites updated:
- app/api/admin.py:1005 (server-config endpoint writer)
- app/api/admin.py:2610 (configure endpoint writer)
- app/instance_config.py:106 (overlay reader)
All three now go through _state_dir() so under flat-mount layout
(STATE_DIR=/data-state) the irreplaceable instance.yaml overlay
lands on the state disk (sdc) instead of the regenerable data
disk (sdb). Without this fix, .env_overlay correctly went to the
state disk while instance.yaml went to the data disk — config
would be lost if an operator wiped sdb.
2. Strip customer-specific tokens from OSS repo per CLAUDE.md
vendor-agnostic rule:
- docker-compose.host-mount.yml: 'a deployer (Groupon FoundryAI)'
→ 'a deployer in production'
- docker-compose.flat-mount.yml: 'caused 2026-05-05 in the
Groupon FoundryAI deployment' → generic 'production failure
mode'
- docs/state-dir.md: rewrote the incident reference to describe
the failure mode abstractly without naming the deployment;
updated the recommendation table to say 'shadow-mount class'
instead of dating the specific incident.
3. Updated docs/state-dir.md 'What reads STATE_DIR' to list all
read/write sites including the three migrated in this round
(admin.py, instance_config.py, marketplaces.py).
ANALYSIS finding (tls-rotate.sh hardcoded host-mount.yml) deferred
— same operator-side class as auto-upgrade.sh hardcoded host-mount,
documented limitation per the PR body.
181 lines
6.9 KiB
Python
181 lines
6.9 KiB
Python
"""Instance configuration — loads instance.yaml and exposes to FastAPI."""
|
|
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_instance_config: Optional[dict] = None
|
|
|
|
|
|
def reset_cache() -> None:
    """Forget the cached instance.yaml so the next ``load_instance_config``
    call reads it from disk again.

    This is the public way to invalidate the module-private cache; it is
    used by `/api/admin/server-config` after a save.

    The ``connectors.bigquery.access.get_bq_access`` cache is cleared as
    well. That function is wrapped in `@functools.cache`, so without the
    explicit clear the v2 endpoints would keep the BigQuery project IDs
    seen on the first call until the container restarts, even after an
    admin saved a new `instance.yaml` (Devin ANALYSIS_0004 on PR #138).
    The import is done lazily inside the function so this module can still
    be used where the connectors package is not importable (e.g. unit
    tests of instance_config in isolation).
    """
    global _instance_config
    _instance_config = None

    try:
        from connectors.bigquery.access import get_bq_access as _cached_bq_access

        _cached_bq_access.cache_clear()
    except Exception:
        # Best-effort by design: the connectors module may not be loaded
        # yet, or the BQ dependencies may be missing — neither is an error.
        return
|
|
|
|
|
|
def _deep_merge(base: dict, patch: dict) -> dict:
|
|
"""Deep-merge `patch` into `base`, returning a new dict.
|
|
|
|
Dict-into-dict recurses; everything else (scalars, lists, None) is
|
|
replaced wholesale. Used so the writable overlay can hold only the
|
|
sections an operator has touched, while everything else flows from
|
|
the static file unchanged. Same semantics as the helper in
|
|
`/api/admin/server-config`'s POST handler.
|
|
"""
|
|
out = dict(base)
|
|
for key, value in patch.items():
|
|
if isinstance(value, dict) and isinstance(out.get(key), dict):
|
|
out[key] = _deep_merge(out[key], value)
|
|
else:
|
|
out[key] = value
|
|
return out
|
|
|
|
|
|
def load_instance_config() -> dict:
    """Load instance.yaml as a deep-merge of the static file and the
    writable overlay.

    Resolution:
      1. Static base: ``CONFIG_DIR/instance.yaml`` via ``config.loader``
         (the source of truth for sections the editor doesn't expose —
         ``datasets``, ``corporate_memory``, ``openmetadata``, etc.).
      2. Overlay patch: ``instance.yaml`` inside the directory returned by
         ``app.secrets._state_dir()`` — e.g. ``/data-state/instance.yaml``
         under the flat-mount layout (``STATE_DIR=/data-state``). Written
         by ``/api/admin/configure`` and ``/api/admin/server-config``;
         contains only the sections those endpoints accept.
      3. Overlay wins per-leaf via deep-merge — operator edits persist,
         static-only sections still flow through.

    The merged result is cached in the module-level ``_instance_config``;
    subsequent calls return the cached dict until that global is reset to
    ``None``.

    Pre-2026-04-28 this function returned the overlay verbatim when it
    existed and only fell back to static when it didn't. That was a
    silent footgun: the moment someone saved any section through the
    new editor (which writes a narrow overlay by design), every
    consumer of static-only sections (corporate memory page, dataset
    list, OpenMetadata client) saw empty defaults. See PR #107.

    Returns:
        The merged configuration dict. If the overlay is absent or fails
        to load, the static base alone is returned; if the static base
        fails to load as well, the base is an empty dict.
    """
    global _instance_config
    if _instance_config is not None:
        return _instance_config

    import yaml

    # Static base — strict validation lives in config.loader.
    base: dict = {}
    try:
        from config.loader import load_instance_config as _load

        base = _load() or {}
        logger.info("Loaded instance.yaml base from config/")
    except Exception as e:
        # Lazy %-formatting, consistent with the other log calls below.
        logger.warning("Could not load static instance.yaml: %s", e)

    # Overlay patch from the writable volume. Best-effort — a corrupt
    # overlay shouldn't take the app offline (we'd rather serve stale/base
    # config than 500 every request), but log loudly with a traceback so
    # the corruption surfaces in the operator's logs immediately. The
    # write-side endpoints (POST /api/admin/server-config and /configure)
    # refuse to overwrite a corrupt overlay with HTTP 500, so an admin
    # noticing the saves break is the second line of defence.
    #
    # ${ENV_VAR} interpolation: ``config.loader.load_instance_config`` runs
    # the static base through ``_resolve_env_refs`` already, but raw
    # ``yaml.safe_load`` here would leave overlay strings like
    # ``${ANTHROPIC_API_KEY}`` as literal placeholders. /api/admin/configure
    # writes exactly that string into the seeded ai: block (#176), so we
    # mirror the resolver here before the deep-merge — without it, the
    # LLM factory receives the literal placeholder and rejects it as an
    # invalid api key (#179 review fix).
    #
    # Resolve via _state_dir() so the path matches the writer in
    # app/api/admin.py — under the flat-mount layout (STATE_DIR=/data-state)
    # both the configure-endpoint and the server-config-endpoint write
    # ``/data-state/instance.yaml``; reading from ``/data/state/...`` here
    # would silently load stale config from the regenerable data disk.
    from app.secrets import _state_dir

    overlay_path = _state_dir() / "instance.yaml"
    if overlay_path.exists():
        try:
            # Decode explicitly as UTF-8 rather than relying on the
            # process locale's default encoding.
            overlay = yaml.safe_load(overlay_path.read_text(encoding="utf-8")) or {}
            from config.loader import _resolve_env_refs

            overlay = _resolve_env_refs(overlay)
            base = _deep_merge(base, overlay)
            logger.info("Merged overlay from %s", overlay_path)
        except Exception:
            logger.exception(
                "instance.yaml overlay at %s is corrupt — falling back to "
                "static base config; saves through the editor will refuse "
                "until the file is repaired", overlay_path,
            )

    _instance_config = base
    return _instance_config
|
|
|
|
|
|
def get_value(*keys, default=None) -> Any:
    """Look up a nested value in the instance config by a path of keys.

    Each key descends one level into the loaded config. ``default`` is
    returned as soon as the current node is not a dict or the looked-up
    value is ``None`` (which includes a missing key). With no keys the
    whole config dict is returned.
    """
    node = load_instance_config()
    for segment in keys:
        if not isinstance(node, dict):
            return default
        node = node.get(segment)
        if node is None:
            return default
    return node
|
|
|
|
|
|
def get_data_source_type() -> str:
    """Return the data source type.

    The ``DATA_SOURCE`` environment variable takes precedence when it is
    set; otherwise the ``data_source.type`` config value is used, which
    itself defaults to ``"local"``.
    """
    configured = get_value("data_source", "type", default="local")
    return os.environ.get("DATA_SOURCE", configured)
|
|
|
|
|
|
def get_instance_name() -> str:
    """Return the ``instance.name`` config value, or ``"AI Data Analyst"`` when unset."""
    name = get_value("instance", "name", default="AI Data Analyst")
    return name
|
|
|
|
|
|
def get_instance_subtitle() -> str:
    """Return the ``instance.subtitle`` config value, or an empty string when unset."""
    subtitle = get_value("instance", "subtitle", default="")
    return subtitle
|
|
|
|
|
|
def get_sync_interval() -> str:
    """Return the human-readable refresh cadence shown in the analyst
    welcome prompt.

    Read from ``instance.sync_interval``; defaults to ``"1 hour"``.
    """
    cadence = get_value("instance", "sync_interval", default="1 hour")
    return cadence
|
|
|
|
|
|
def get_allowed_domains() -> list:
    """Return the allowed auth domains as a list of strings.

    ``auth.allowed_domain`` is read as a comma-separated string; each
    entry is stripped of surrounding whitespace and blank entries are
    dropped. An unset or empty value yields an empty list.
    """
    raw = get_value("auth", "allowed_domain", default="")
    if not raw:
        return []
    stripped = (piece.strip() for piece in raw.split(","))
    return [entry for entry in stripped if entry]
|
|
|
|
|
|
def get_datasets() -> dict:
    """Return the ``datasets`` section of the instance config, or ``{}`` when unset."""
    datasets = get_value("datasets", default={})
    return datasets
|
|
|
|
|
|
def get_theme() -> dict:
    """Return the ``theme`` section of the instance config, or ``{}`` when unset."""
    theme = get_value("theme", default={})
    return theme
|
|
|
|
|
|
def get_auth_config() -> dict:
    """Return the ``auth`` section of the instance config, or ``{}`` when unset."""
    auth_section = get_value("auth", default={})
    return auth_section
|
|
|
|
|
|
def get_corporate_memory_config() -> dict:
    """Return the ``corporate_memory`` section of the instance config, or ``{}`` when unset."""
    memory_section = get_value("corporate_memory", default={})
    return memory_section
|