agnes-the-ai-analyst/app/api/admin.py

"""Admin endpoints — table discovery, registry management, instance configuration.

Every gate on this router uses ``require_admin`` from ``app.auth.access``,
which checks Admin user_group membership for both OAuth session and PAT
callers via the same ``_user_group_ids`` lookup.
"""

import logging
import os
import threading
import uuid
from pathlib import Path

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
from pydantic import BaseModel, Field, field_validator, model_validator
from typing import Optional, List, Dict, Any
import duckdb

from app.auth.access import require_admin
from app.auth.dependencies import _get_db
from src.repositories.table_registry import TableRegistryRepository
from src.repositories.audit import AuditRepository
from src.identifier_validation import (
    is_safe_identifier as _is_safe_identifier,
    is_safe_quoted_identifier as _is_safe_quoted_identifier,
)
from src.sql_safe import is_safe_project_id as _is_safe_project_id
from src.scheduler import is_valid_schedule
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/admin", tags=["admin"])

# Serializes the read-modify-write of state/instance.yaml across the two
# endpoints that mutate the overlay (POST /server-config and POST /configure).
# Without it, two admins saving concurrently would each read the same overlay
# snapshot, merge their disjoint patches, and the second os.replace would silently
# drop the first patch. Single-process FastAPI workers; multi-worker deployments
# would need an OS-level file lock — documented limitation.
_overlay_write_lock = threading.Lock()

# Per-processor advisory locks for /api/admin/run-session-processor.
# Two trigger paths exist for the same processor (scheduler tick + manual
# admin POST). Without serialization, overlapping runs would re-process the
# same /data/user_sessions/* set, double-call the LLM, and pile up duplicate
# `verification_evidence` rows — the dedup short-circuit in
# VerificationProcessor only catches the create+contradiction branches, not
# create_evidence (per ADR Decision 3, which expects evidence to accumulate
# per distinct verification event). Lock is non-blocking → second caller
# gets 409 Conflict so the operator sees what happened instead of stacking
# behind a long-running tick.
_processor_run_locks: dict[str, threading.Lock] = {}
_processor_run_locks_mutex = threading.Lock()


def _get_processor_run_lock(name: str) -> threading.Lock:
    """Per-name lock factory; the registry mutex guards dict insertion so
    two threads simultaneously asking for a never-seen processor don't
    each install their own lock instance."""
    with _processor_run_locks_mutex:
        if name not in _processor_run_locks:
            _processor_run_locks[name] = threading.Lock()
        return _processor_run_locks[name]


# SSRF protection: reject private/internal URLs for keboola_url
import ipaddress as _ipaddress
import socket as _socket
from urllib.parse import urlparse as _urlparse


def _validate_url_not_private(url: str, field_name: str = "url") -> None:
    """Raise 400 if the URL host points to a private/reserved network.

    Uses DNS resolution + ipaddress checks instead of hostname regex,
    which correctly handles all IPv4/IPv6 addresses including abbreviated
    forms (fe80::1, ::1, etc.) and DNS rebinding (resolves at check time).
    """
    try:
        parsed = _urlparse(url)
    except Exception:
        raise HTTPException(status_code=400, detail=f"Invalid {field_name}: not a valid URL")
    host = parsed.hostname or ""
    if not host:
        raise HTTPException(status_code=400, detail=f"Invalid {field_name}: missing hostname")

    # Reject well-known dangerous hostnames before DNS resolution
    if host.lower() in ("localhost", "localhost.localdomain"):
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {field_name}: must not point to a private or reserved network",
        )

    # Resolve hostname to IP addresses and check each one
    try:
        addrinfos = _socket.getaddrinfo(host, None, proto=_socket.IPPROTO_TCP)
    except Exception:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {field_name}: could not resolve hostname",
        )

    for family, _type, _proto, _canonname, sockaddr in addrinfos:
        ip_str = sockaddr[0]
        try:
            ip = _ipaddress.ip_address(ip_str)
        except ValueError:
            continue
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid {field_name}: must not point to a private or reserved network",
            )


def _unescape_shell_quoting(s: str | None) -> str | None:
    """Defensive normalization for descriptions arriving via shell-quoting tooling.

    Some operators register tables with bash/curl invocations whose quoting
    injects literal backslash escapes into the payload (e.g. ``Don\\'t`` or
    embedded ``\\n`` instead of real newlines). The backend would otherwise
    persist those bytes verbatim and the UI would render them verbatim too.
    Mirrored in JS as ``unescapeShellQuoting`` in
    ``app/web/templates/admin_tables.html`` for already-stored rows.
    """
    if not s:
        return s
    # Order matters: protect real backslashes first.
    SENTINEL = "\x00"
    return (
        s.replace("\\\\", SENTINEL)
         .replace("\\n", "\n")
         .replace("\\r", "\r")
         .replace("\\t", "\t")
         .replace("\\'", "'")
         .replace('\\"', '"')
         .replace(SENTINEL, "\\")
    )


def _normalize_primary_key(v):
    """Coerce a string primary_key to ``[v]`` for backward compatibility.

    The 0.14.0 contract is ``Optional[List[str]]`` so composite primary keys
    (e.g. session-grain tables keyed on ``(session_id, event_date)``) round-
    trip cleanly. Pre-0.14.0 callers sent a single string; Pydantic v2
    refuses to coerce, so without this validator a CLI script posting
    ``"primary_key": "session_id"`` would now hit a 422. Wrap a bare string
    in a one-element list so old and new callers both work.
    """
    if v is None:
        return v
    if isinstance(v, str):
        return [v]
    return v


# Patches to these section paths must pass _validate_url_not_private. The
# tuple is `(section, *intermediate_keys, leaf_key)` — same SSRF gate the
# /configure wizard applies to keboola_url, so an admin can't sneak
# http://169.254.169.254/ in via the server-config editor's data_source patch.
#
# Intentionally NOT included: ``("ai", "base_url")``. The openai_compat
# provider legitimately points at internal services (LiteLLM proxy on a
# private network, on-cluster vLLM endpoint, etc.) — see
# config/instance.yaml.example "LiteLLM proxy" example. SSRF blocking
# would break those valid setups. Operators with stricter posture should
# enforce the constraint upstream (firewall / egress proxy allowlist).
# Devin ANALYSIS_0001 on PR #141 5f649a4 review.
_URL_BEARING_FIELDS: tuple[tuple[str, ...], ...] = (
    ("data_source", "keboola", "stack_url"),
    ("marketplace", "curators_url"),
)


def _validate_urls_in_patch(sections: Dict[str, Dict[str, Any]]) -> None:
    """Apply SSRF protection to every URL-bearing field present in the patch.

    Walks each registered ``(section, *path, leaf)`` against the incoming
    patch and runs ``_validate_url_not_private`` on any string value found.
    Missing intermediate keys / non-dict nodes are silently skipped — the
    patch hasn't touched that field, no validation needed.
    """
    for path in _URL_BEARING_FIELDS:
        section = path[0]
        if section not in sections:
            continue
        node: Any = sections[section]
        for key in path[1:-1]:
            if not isinstance(node, dict) or key not in node:
                node = None
                break
            node = node[key]
        if isinstance(node, dict):
            value = node.get(path[-1])
            if isinstance(value, str) and value:
                _validate_url_not_private(value, field_name=".".join(path))


_LOCK_TTL_MIN = 60
_LOCK_TTL_MAX = 7 * 24 * 3600  # 604800 — one week


def _validate_materialize_section(sections: Dict[str, Dict[str, Any]]) -> None:
    """Validate the materialize section patch when present.

    Checks field-level constraints that the Pydantic envelope can't enforce
    (it only validates the outer shape, not nested leaf values).
    """
    mat = sections.get("materialize")
    if not isinstance(mat, dict):
        return
    ttl = mat.get("lock_ttl_seconds")
    if ttl is None:
        return
    if not isinstance(ttl, int) or isinstance(ttl, bool):
        raise HTTPException(
            status_code=422,
            detail="materialize.lock_ttl_seconds must be an integer",
        )
    if ttl < _LOCK_TTL_MIN or ttl > _LOCK_TTL_MAX:
        raise HTTPException(
            status_code=422,
            detail=(
                f"materialize.lock_ttl_seconds must be between "
                f"{_LOCK_TTL_MIN} and {_LOCK_TTL_MAX} "
                f"(got {ttl})"
            ),
        )


# --- Server-config (instance.yaml) editor -----------------------------------
#
# The /admin/server-config UI POSTs a partial dict here keyed by section
# (instance, data_source, email, telegram, jira, theme, server, auth) with
# the field values to merge into instance.yaml. Each save:
#   1. Loads the current instance.yaml (writable overlay first, then static).
#   2. Deep-merges the patch on top.
#   3. Writes to DATA_DIR/state/instance.yaml (the writable overlay).
#   4. Writes one audit_log entry tagged `instance_config.update` containing
#      a sanitized diff (secret-looking keys are masked).
# Hot-reload is OUT OF SCOPE for #91 — the response carries
# `restart_required=True` so the UI can show the banner.

# Sections an admin can mutate. Keep the list explicit so a typo'd section
# in the request body is rejected loudly instead of being silently merged
# into the YAML root and confusing future loads.
_EDITABLE_SECTIONS: tuple[str, ...] = (
    "instance",
    "data_source",
    "email",
    "telegram",
    "jira",
    "theme",
    "server",
    "auth",
    "ai",
    "openmetadata",
    "desktop",
    "corporate_memory",
    "materialize",
    "guardrails",
    "marketplace",
)

# "Danger-zone" sections — flipping these can lock operators out (auth.*) or
# break OAuth callbacks (server.hostname/host). The UI shows a confirmation
# dialog before submitting them. The API accepts them; this list exists so
# the audit entry can flag the change as high-risk and the UI can surface
# the right warning copy.
_DANGER_SECTIONS: tuple[str, ...] = ("auth", "server")

# Known-but-optional config fields per section. The /admin/server-config UI
# uses this registry alongside the YAML payload to render fields the operator
# might want to set even though they're not currently in instance.yaml.
#
# Schema per field:
#   {
#     "kind": "string" | "secret" | "bool" | "int" | "select" | "object" | "array",
#     "default": <type-appropriate default>  (optional)
#     "hint": "<one-line operator-facing help>"
#     "options": [...]              (only for kind="select")
#     "fields": {<name>: <fieldspec>}  (only for kind="object", nested fields)
#     "item_kind": "string" | ...   (only for kind="array", element type)
#     "required": bool             (defaults False; UI marks the label)
#   }
#
# Subagents 2-4 will populate the bodies. The registry enables the UI to
# render missing-but-known fields with placeholders + hints rather than
# forcing the operator to discover them via the JSON-patch textarea or
# hitting a runtime error first. The smoke fixture below
# (data_source.bigquery.billing_project) proves the renderer wiring works
# end-to-end so subagents 2-4 only have to add registry entries — they
# don't need to touch admin_server_config.html.
_KNOWN_FIELDS: dict[str, dict[str, dict]] = {
    "instance": {
        # UI theme — flips `<html data-theme="...">` so the
        # design-system tokens (`--ds-*`) switch palettes via CSS
        # without any markup change. Resolved by
        # `app/instance_config.py::get_instance_theme()`.
        "theme": {
            "kind": "select",
            "options": ["blue", "navy"],
            "default": "blue",
            "hint": (
                "Page-hero colour scheme. `blue` (default) uses the "
                "brand-blue hero + blue CTAs. `navy` opts into the "
                "darker palette with the dark navy hero gradient + "
                "mint-green CTAs and eyebrow accents."
            ),
        },
        # Operator-injected HTML/JS blocks rendered into base.html.
        # `kind: array` renders as a JSON textarea in the admin UI
        # (per admin_server_config.html:702-708 — arrays fall back to
        # the JSON path); the hint documents the per-item shape so the
        # operator knows what to paste. Resolved by
        # `app/instance_config.py::get_custom_scripts()`.
        "custom_scripts": {
            "kind": "array",
            "hint": (
                "Operator-injected HTML/JS blocks rendered into base.html. "
                "Each entry: {name: str, enabled: bool, placement: "
                "head_start|head_end|body_end, html: str}. Used for feedback "
                "widgets (Marker.io), analytics (GTM, PostHog), error capture "
                "(Sentry). Rendered with | safe — admin trust boundary. Review "
                "third-party widget privacy posture before enabling (most "
                "capture session data). Restart required after save."
            ),
        },
        # Operator-authored Support HTML rendered inside the welcome
        # hero on /home, below the operator-owned Overview footnotes.
        # Resolved by `app/instance_config.py::get_instance_support()`.
        # Typical content: a one-line invitation pointing at a chat
        # space, mailing list, or internal runbook. Empty value =
        # block hidden (OSS stays vendor-neutral).
        "support": {
            "kind": "string",
            "hint": (
                "HTML body rendered inside the welcome hero's Support "
                "block on /home (mint-accent panel below the Overview "
                "footnotes). Typically a one-line invitation linking to "
                "a chat space, mailing list, or runbook — e.g. "
                "'<p><strong>Need help?</strong> Drop into our "
                "<a href=\"https://chat.example.com/room/xxx\">Support</a> "
                "chat space.</p>'. Rendered with | safe — admin trust "
                "boundary (link target is operator-controlled). Empty "
                "value hides the block."
            ),
        },
    },
    "data_source": {
        "bigquery": {
            "kind": "object",
            "hint": "BigQuery connection knobs (read more in docs/DEPLOYMENT.md)",
            "fields": {
                "billing_project": {
                    "kind": "string",
                    "hint": (
                        "GCP project to bill BQ jobs against. Set when SA can read "
                        "the data project but cannot bill there (e.g. shared read-only "
                        "data project). Defaults to data_source.bigquery.project. "
                        "Mismatch → 403 USER_PROJECT_DENIED on every BQ call."
                    ),
                    # Issue #160 §4.7.5: when this field is empty in the
                    # admin form, the JS template shows "(defaults to <project>)"
                    # as placeholder text — surfacing the access.py:339-340
                    # fallback rule directly in the UI without the operator
                    # having to read source. Path is walked against the
                    # `original` config payload from GET /api/admin/server-config.
                    "placeholder_from": ["data_source", "bigquery", "project"],
                },
                "max_bytes_per_materialize": {
                    "kind": "int",
                    "default": 10737418240,
                    "hint": (
                        "Cost guardrail for query_mode='materialized' BQ scans (dry-run "
                        "check before running). Bytes processed; exceeds → registration "
                        "or sync rejected. 0 disables the gate. Default 10737418240 = 10 GiB."
                    ),
                },
                "bq_max_scan_bytes": {
                    "kind": "int",
                    "default": 5368709120,
                    "hint": (
                        "Cost guardrail for `agnes query --remote` against query_mode='remote' "
                        "BQ rows (dry-run check on the underlying SELECT before execute). "
                        "Bytes processed; exceeds → 400 remote_scan_too_large with a "
                        "`agnes snapshot create` suggestion. 0 disables the gate. Default 5368709120 = 5 GiB."
                    ),
                },
                "query_timeout_ms": {
                    "kind": "int",
                    "default": 600000,
                    "hint": (
                        "DuckDB BigQuery extension query timeout (milliseconds). Applied "
                        "via `SET bq_query_timeout_ms` after every `LOAD bigquery` on "
                        "every BQ-touching DuckDB session (orchestrator remote-view "
                        "ATTACH, BqAccess factory, standalone extractor). Extension "
                        "default is 90 000 ms = 90 s, which is too tight for analyst "
                        "queries against view-backed datasets — bumped to 600 000 ms = "
                        "10 min by default. Set 0 to fall through to the extension "
                        "default. Note: the underlying BQ jobs.query RPC caps the wait "
                        "at ~200 s per call; the extension polls on top, so the "
                        "effective ceiling is this value but each poll round-trip is "
                        "~200 s. DuckDB itself emits a warning when this is set above "
                        "~200 s — that warning is informational, not an error."
                    ),
                },
            },
        },
        "keboola": {
            "kind": "object",
            "hint": "Keboola Storage API connection",
            "fields": {
                "stack_url": {
                    "kind": "string",
                    "hint": (
                        "e.g. https://connection.keboola.com (instance-specific stack URL). "
                        "Validated against private-IP allowlist on save (SSRF guard)."
                    ),
                },
                "project_id": {
                    "kind": "string",
                    "hint": "Keboola project ID (numeric, but kept as string in YAML).",
                },
            },
        },
    },
    "email": {
        # SMTP fields render via the populated path (always set when email
        # is enabled); no commonly-missing optional knobs at this layer.
    },
    "telegram": {
        # Rarely missing; leave empty.
    },
    "jira": {
        # Webhook + REST credentials always present when Jira is configured.
    },
    "theme": {
        # Cosmetic only; rarely missing.
    },
    "server": {
        # TLS / hostname knobs are mostly env-side; nothing to surface here.
    },
    "auth": {
        "allowed_domain": {
            "kind": "string",
            "hint": (
                "Comma-separated list of allowed sign-in email domains (e.g. "
                "'acme.com,acme-internal.com'). Single domain works too. Empty → no "
                "domain restriction (any verified Google identity can sign in)."
            ),
        },
    },
    "ai": {
        "base_url": {
            "kind": "string",
            "hint": (
                "Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM, etc.). "
                "Ignored when provider='anthropic'. Examples: https://litellm.example.com, "
                "https://openrouter.ai/api/v1."
            ),
        },
        "structured_output": {
            "kind": "select",
            "options": ["strict", "json", "auto"],
            "default": "auto",
            "hint": (
                "JSON-schema enforcement strategy. strict=Layer 1 only "
                "(Anthropic/OpenAI native, fail otherwise). json=Layer 1 + Layer 2 "
                "fallback. auto=all three layers including prompt-based JSON (most "
                "compatible, least strict)."
            ),
        },
    },
    "openmetadata": {
        "url": {
            "kind": "string",
            "hint": "Base URL of your OpenMetadata server (e.g. https://catalog.example.com).",
        },
        "token": {
            "kind": "secret",
            "hint": (
                "JWT bearer token. Use ${OPENMETADATA_TOKEN} env-var reference "
                "(don't paste secret directly)."
            ),
        },
        "cache_ttl_seconds": {
            "kind": "int",
            "default": 3600,
            "hint": "How long to cache catalog responses in-process. Default 3600s (1h).",
        },
        "verify_ssl": {
            "kind": "bool",
            "default": True,
            "hint": (
                "TLS verification. Default true. Set false ONLY for internal CAs / "
                "self-signed certs — sends the JWT over an unverified channel."
            ),
        },
    },
    "desktop": {
        "jwt_issuer": {
            "kind": "string",
            "default": "data-analyst",
            "hint": "JWT iss claim. Match what the desktop app verifies.",
        },
        "jwt_secret": {
            "kind": "secret",
            "hint": "JWT signing secret. Use ${DESKTOP_JWT_SECRET} env-var reference.",
        },
        "url_scheme": {
            "kind": "string",
            "default": "data-analyst",
            "hint": "Custom URL scheme registered by the desktop app (data-analyst://...).",
        },
    },
    # corporate_memory governance — optional. When the section is missing
    # from instance.yaml the system runs in legacy democratic-wiki mode
    # (no admin review). Schema mirrors config/instance.yaml.example
    # lines 224-317; renderer handles arbitrary depth + arrays + maps.
    "corporate_memory": {
        "distribution_mode": {
            "kind": "select",
            "options": ["mandatory_only", "admin_curated", "hybrid"],
            "default": "hybrid",
            "hint": (
                "How knowledge reaches users. mandatory_only = admin-only; "
                "admin_curated = admin + user voting as feedback; "
                "hybrid = default (mandatory from admin + optional from user voting)."
            ),
        },
        "approval_mode": {
            "kind": "select",
            "options": ["review_queue", "auto_publish", "threshold"],
            "default": "review_queue",
            "hint": (
                "How AI-extracted items enter the system. review_queue = admin "
                "approval required (default); auto_publish = live immediately; "
                "threshold = high-confidence auto, low-confidence to queue."
            ),
        },
        "review_period_months": {
            "kind": "int",
            "default": 6,
            "hint": "How often approved/mandatory items are flagged for re-review (months).",
        },
        "notify_on_new_items": {
            "kind": "bool",
            "default": True,
            "hint": "Notify km_admins when new pending items arrive.",
        },
        "sources": {
            "kind": "object",
            "hint": (
                "Knowledge-source ingestion. Each source has its own enabled "
                "flag + base confidence."
            ),
            "fields": {
                "claude_local_md": {
                    "kind": "object",
                    "fields": {
                        "enabled": {"kind": "bool", "default": True},
                        "confidence_base": {
                            "kind": "float",
                            "default": 0.50,
                            "hint": "Confidence assigned to extractions from CLAUDE.local.md (0-1).",
                        },
                    },
                },
                "session_transcripts": {
                    "kind": "object",
                    "fields": {
                        "enabled": {"kind": "bool", "default": True},
                        "confidence_base": {"kind": "float", "default": 0.60},
                        "max_turns_per_session": {
                            "kind": "int",
                            "default": 100,
                            "hint": "Truncate transcripts longer than this many turns.",
                        },
                        "detection_types": {
                            "kind": "array",
                            "item_kind": "string",
                            "default": [
                                "correction",
                                "confirmation",
                                "unprompted_definition",
                            ],
                            "hint": (
                                "Which extraction patterns to detect. Each entry "
                                "is a detection-type tag."
                            ),
                        },
                    },
                },
            },
        },
        "extraction": {
            "kind": "object",
            "fields": {
                "model": {
                    "kind": "string",
                    "default": "claude-haiku-4-5-20251001",
                    "hint": "LLM used to extract knowledge. Override for cost or quality.",
                },
                "sensitivity_check": {"kind": "bool", "default": True},
                "contradiction_check": {"kind": "bool", "default": True},
            },
        },
        "confidence": {
            "kind": "object",
            "hint": "Confidence scoring + decay rules.",
            "fields": {
                "base": {
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "float",
                    "default": {
                        "user_verification.correction": 0.90,
                        "user_verification.unprompted_definition": 0.90,
                        "user_verification.confirmation": 0.60,
                        "admin_mandate": 1.00,
                        "claude_local_md": 0.50,
                        "session_transcript": 0.50,
                    },
                    "hint": (
                        "Base score per source/detection. Keys are 'source_type' "
                        "or 'source_type.detection_type' (the dot is data, not "
                        "nesting)."
                    ),
                },
                "modifiers": {
                    # map<string, map<string, float>>. The renderer's structured
                    # editor for "map of objects with declared subfields" is a
                    # TODO (see admin_server_config.html); for now this falls
                    # back to a JSON textarea — admins editing it see the
                    # schema doc inline via the hint.
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "object",
                    "value_fields": {},  # signals the JSON-textarea fallback
                    "hint": (
                        "Per-key modifier step sizes applied to base when "
                        "optional signals are present (3-level dotted paths). "
                        "Edit as a JSON object — outer keys mirror confidence.base "
                        "keys; inner objects map signal name to bonus float."
                    ),
                },
                "decay": {
                    "kind": "object",
                    "fields": {
                        "mode": {
                            "kind": "select",
                            "options": ["linear", "exponential"],
                            "default": "exponential",
                        },
                        "half_life_months": {
                            "kind": "int",
                            "default": 12,
                            "hint": "Used when mode=exponential.",
                        },
                        "decay_rate_monthly": {
                            "kind": "float",
                            "default": 0.02,
                            "hint": "Used when mode=linear.",
                        },
                        "floor": {
                            "kind": "map",
                            "key_kind": "string",
                            "value_kind": "float",
                            "default": {
                                "admin_mandate": 0.50,
                                "user_verification": 0.40,
                                "default": 0.0,
                            },
                            "hint": (
                                "Per-source minimum confidence — items never decay "
                                "below this floor."
                            ),
                        },
                    },
                },
            },
        },
        "contradiction_detection": {
            "kind": "object",
            "fields": {
                "enabled": {"kind": "bool", "default": True},
                "max_candidates": {
                    "kind": "int",
                    "default": 10,
                    "hint": "Max contradiction candidates to evaluate per new item.",
                },
            },
        },
        "entity_resolution": {
            "kind": "object",
            "fields": {
                "enabled": {"kind": "bool", "default": True},
                "entities": {
                    "kind": "map",
                    "key_kind": "string",
                    "value_kind": "array",
                    "value_item_kind": "string",
                    "default": {
                        "metrics": ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"],
                        "products": ["Platform", "API", "Dashboard"],
                    },
                    "hint": (
                        "Domain-entity vocabulary. Key = domain category; value = "
                        "canonical names list."
                    ),
                },
            },
        },
        "domain_owners": {
            "kind": "map",
            "key_kind": "string",
            "value_kind": "array",
            "value_item_kind": "string",
            "hint": (
                "Per-domain admin emails. Key = domain name; value = email list."
            ),
        },
        "domains": {
            "kind": "array",
            "item_kind": "string",
            "default": [
                "finance",
                "engineering",
                "product",
                "data",
                "operations",
                "infrastructure",
            ],
            "hint": (
                "Knowledge domains analysts can target. Each must match a key "
                "in domain_owners."
            ),
        },
    },
    # materialize — file-lock TTL for the concurrent-materialize safety net.
    # A single field; more knobs may follow as the feature matures.
    "materialize": {
        "lock_ttl_seconds": {
            "kind": "int",
            "default": 86400,
            "hint": (
                "How long (seconds) before a stale materialize lock file is "
                "reclaimed. The lock is a .parquet.lock sibling file; if the "
                "holder process is hard-killed, the next attempt reclaims the "
                "lock once the file's mtime is older than this TTL. "
                "Default 86400 (24 h). Min 60, max 604800 (7 days). "
                "Lower only if you know materializes never exceed the new value "
                "and your host regularly hard-kills processes."
            ),
        },
    },
    "guardrails": {
        "min_description_chars": {
            "kind": "int",
            "default": 60,
            "hint": (
                "Minimum character floor for skill / agent / plugin "
                "descriptions on flea-market uploads (the inline content "
                "guardrail). Real-world Claude skill descriptions cluster "
                "150–220 chars; the default 60 is the bottom of the bar "
                "to catch placeholders. Bump to 100+ to push submitters "
                "closer to the ecosystem norm. Min 1."
            ),
        },
        "min_command_description_chars": {
            "kind": "int",
            "default": 25,
            "hint": (
                "Minimum character floor for slash-command descriptions. "
                "Tighter than skills because commands are one-verb "
                "actions (\"run tests\", \"format code\"). Default 25. Min 1."
            ),
        },
        "min_distinct_words": {
            "kind": "int",
            "default": 5,
            "hint": (
                "Minimum number of DISTINCT words in any description "
                "string. Defends against padding-only descriptions like "
                "\"description description description\" that hit the "
                "character count but say nothing. Default 5. Min 1."
            ),
        },
        "min_body_chars": {
            "kind": "int",
            "default": 200,
            "hint": (
                "Minimum body-content floor for skill / agent files "
                "(the markdown after the YAML frontmatter). Real skill "
                "bodies run 500–2000 chars; the default 200 is a "
                "\"one paragraph\" floor that catches stubs. Min 1."
            ),
        },
        "enabled": {
            "kind": "bool",
            "default": True,
            "hint": (
                "Master kill-switch for the LLM guardrail tier. When "
                "False (or when ANTHROPIC_API_KEY / LLM_API_KEY is "
                "absent), uploads still run the inline mechanical "
                "checks but skip the LLM security + content-quality "
                "review and auto-approve. Default True."
            ),
        },
        "review_model": {
            "kind": "select",
            "default": "haiku",
            "options": ["haiku", "sonnet", "opus"],
            "hint": (
                "Anthropic model tier for the LLM security + content "
                "review. Haiku is the cheapest and fastest; Sonnet / "
                "Opus catch subtler prompt-injection + vague descriptions "
                "at proportionally higher per-upload cost."
            ),
        },
        "blocked_quota_per_day": {
            "kind": "int",
            "default": 50,
            "hint": (
                "Per-submitter cap on `blocked_llm` + `review_error` "
                "rows in the trailing 24h. Bounds the worst case where "
                "a bot loops on bundles that survive inline checks but "
                "trip the async LLM reviewer. Inline failures are "
                "hard-rejected upstream (no row, not counted). 0 "
                "disables the quota. Default 50."
            ),
        },
        "blocked_bundle_ttl_days": {
            "kind": "int",
            "default": 30,
            "hint": (
                "How many days to keep a blocked bundle's bytes on disk. "
                "The submission row + sha256 + size always survive; only "
                "the bytes get removed. 0 disables the purge entirely. "
                "Default 30."
            ),
        },
        "stuck_review_grace_seconds": {
            "kind": "int",
            "default": 1800,
            "hint": (
                "How long a submission may stay at `status='pending_llm'` "
                "before the reaper flips it to `review_error`. Default "
                "1800 (30 min) comfortably exceeds Sonnet / Opus p99 "
                "wall time. 0 disables the reaper."
            ),
        },
    },
    "marketplace": {
        "curators_url": {
            "kind": "string",
            "hint": (
                "URL the 'See all curators →' link on /marketplace points to "
                "(e.g. an internal wiki page listing curators accountable for "
                "the curated marketplace). Empty → the link is hidden. "
                "Validated against private-IP allowlist on save (SSRF guard)."
            ),
        },
    },
}

# Keys whose values must be redacted from the audit diff. We match
# substring (case-insensitive) so `client_secret`, `api_token`,
# `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all
# get masked even when nested.
_SECRET_KEY_PATTERNS: tuple[str, ...] = (
    "secret",
    "token",
    "password",
    "api_key",
)


def _is_secret_key(key: str) -> bool:
    """True if a config key holds a credential and should be masked in audit logs."""
    k = key.lower()
    return any(pat in k for pat in _SECRET_KEY_PATTERNS)


def _mask(value: Any) -> str:
    """Replacement value used in the audit diff for secret fields.

    We deliberately do NOT preserve length or any hint about the secret —
    the diff is read by other admins, and there's no operator value to
    leaking "the new SMTP password is 16 chars". `***` is enough to show
    that the field changed without exposing it.
    """
    if value in (None, ""):
        return "<empty>"
    return "***"


# Sentinel values produced by `_mask`. Any patch leaf that arrives at a
# secret-keyed slot still bearing one of these strings means the caller
# round-tripped the GET payload (which redacts secret-keyed children inside
# nested objects) without changing the value — `_strip_redacted_sentinels`
# drops the leaf so deep-merge preserves whatever the overlay already had,
# rather than persisting the placeholder on top of the real secret.
_REDACTED_SENTINELS: frozenset = frozenset({"***", "<empty>"})


def _strip_redacted_sentinels(value: Any, key_hint: str = "") -> Any:
    """Recursively drop secret-keyed leaves whose value is a redaction sentinel.

    Symmetric with `_redact`: the GET handler masks secret-keyed children
    inside nested objects so the form never shows cleartext, and this
    function is the write-side counterpart that ensures the placeholder
    doesn't make a round-trip back into the overlay. Defense-in-depth
    alongside the client-side `scrubRedactedSecrets` in
    `admin_server_config.html` — an API caller (CLI / script) that forgets
    to scrub still can't corrupt secrets via this endpoint.
    """
    if isinstance(value, dict):
        out: Dict[str, Any] = {}
        for k, v in value.items():
            if _is_secret_key(k) and isinstance(v, str) and v in _REDACTED_SENTINELS:
                continue
            out[k] = _strip_redacted_sentinels(v, k)
        return out
    if isinstance(value, list):
        return [_strip_redacted_sentinels(item, key_hint) for item in value]
    return value


def _redact(value: Any, key_hint: str = "") -> Any:
    """Recursively mask secret-looking fields in a config subtree.

    `key_hint` is the parent key — used so a string value like
    ``"${KEBOOLA_TOKEN}"`` under ``token_env`` is masked even though the
    value itself isn't a credential, because the key signals it points at
    one.
    """
    if isinstance(value, dict):
        return {k: (_mask(v) if _is_secret_key(k) else _redact(v, k)) for k, v in value.items()}
    if isinstance(value, list):
        return [_redact(item, key_hint) for item in value]
    if key_hint and _is_secret_key(key_hint):
        return _mask(value)
    return value


def _diff_dicts(before: dict, after: dict, path: str = "") -> List[Dict[str, Any]]:
    """Flat list of changed fields between two dicts.

    Output: [{"path": "email.smtp_host", "before": "...", "after": "..."}].
    Diff is computed on RAW values, then each row's `before`/`after` is
    masked via `_mask` when the leaf key matches `_is_secret_key` — pre-
    masking the inputs would collapse a secret rotation (e.g. password A
    → password B) into "no diff" because both sides redact to ``"***"``,
    and the audit log would then silently fail to record one of the most
    security-relevant changes. Compare raw, redact when emitting.

    Recurses into a dict on either side (treating the missing side as
    `{}`) so adding a brand-new section reports per-field paths
    (`email.smtp_host`) rather than a single opaque `email` blob — that
    keeps the audit row useful when an admin populates a section for the
    first time.
    """
    changes: List[Dict[str, Any]] = []
    keys = set(before.keys()) | set(after.keys())
    for key in sorted(keys):
        new_path = f"{path}.{key}" if path else key
        b_val = before.get(key)
        a_val = after.get(key)
        b_is_dict = isinstance(b_val, dict)
        a_is_dict = isinstance(a_val, dict)
        # Dict-vs-dict (or dict-vs-None) → recurse for per-field paths.
        if b_is_dict and a_is_dict:
            changes.extend(_diff_dicts(b_val, a_val, new_path))
        elif b_is_dict and a_val is None:
            changes.extend(_diff_dicts(b_val, {}, new_path))
        elif a_is_dict and b_val is None:
            changes.extend(_diff_dicts({}, a_val, new_path))
        # Dict↔scalar shape change is recorded as a single replacement at
        # the parent path. Recursing with `{}` would lose the scalar side
        # entirely (admin sets `keboola: {…}` to `keboola: "disabled"` —
        # auditor would see members removed but never the new value).
        # The dict side may itself contain secret-keyed children (e.g.
        # `keboola: {token_env: ${KEBOOLA_TOKEN}}` resolved to cleartext);
        # `_redact` masks those children even when the parent key isn't
        # secret-named, so the audit log doesn't leak ${ENV_VAR}-resolved
        # values when a section is replaced wholesale.
        elif b_is_dict != a_is_dict:
            if _is_secret_key(key):
                changes.append({
                    "path": new_path,
                    "before": _mask(b_val),
                    "after": _mask(a_val),
                })
            else:
                changes.append({
                    "path": new_path,
                    "before": _redact(b_val, key) if b_is_dict else b_val,
                    "after": _redact(a_val, key) if a_is_dict else a_val,
                })
        elif b_val != a_val:
            if _is_secret_key(key):
                changes.append({
                    "path": new_path,
                    "before": _mask(b_val),
                    "after": _mask(a_val),
                })
            else:
                changes.append({"path": new_path, "before": b_val, "after": a_val})
    return changes


def _deep_merge(base: dict, patch: dict) -> dict:
    """Merge `patch` into `base` recursively, returning a new dict.

    Patch values overwrite base values. Dict-into-dict recurses; everything
    else (lists, scalars, None) is replaced wholesale — admin sets
    ``email: {smtp_port: 465}`` and we don't try to re-merge nested ports.
    """
    out = dict(base)
    for key, value in patch.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = _deep_merge(out[key], value)
        else:
            out[key] = value
    return out


def _load_current_instance_yaml() -> dict:
    """Return the editor's view of instance.yaml — deep-merge of static +
    overlay via ``app.instance_config.load_instance_config``.

    Readers (GET /server-config) hit the cache and trust that writers
    invalidate. Writers must call ``reset_cache()`` explicitly *before*
    the read so they see the latest disk state in the read-modify-write
    sequence. The shared helper is the authoritative source so the editor
    never sees a different view than the rest of the running app.
    """
    from app.instance_config import load_instance_config
    return load_instance_config()


def _public_view(config: dict) -> dict:
    """Return a config dict safe to render in the admin UI form.

    Deep-copies and redacts secret-looking fields so an admin can see
    *which* fields are populated without the cleartext leaking into the
    rendered HTML / browser DevTools.
    """
    import copy
    return _redact(copy.deepcopy(config))


class ServerConfigUpdateRequest(BaseModel):
    """Patch payload for POST /api/admin/server-config.

    Only the sections listed in `_EDITABLE_SECTIONS` are accepted; anything
    else is rejected with 400. `confirm_danger` must be true if the patch
    touches any danger-zone section (auth.*, server.*).
    """
    sections: Dict[str, Dict[str, Any]] = Field(
        default_factory=dict,
        description="Per-section patch dict (e.g. {'instance': {'name': 'X'}})",
    )
    confirm_danger: bool = Field(
        default=False,
        description="Must be true to apply changes touching auth.* or server.*",
    )


# Optional BQ fields whose runtime defaults are documented but which used to
# be invisible in the editor when YAML omitted them. The data_source.bigquery
# subtree renders as a JSON textarea; a key that's absent from the GET
# payload literally cannot appear in the form for the operator to edit. We
# surface them with their documented defaults so the UI always shows them as
# editable knobs — see Phase J of the admin-tables-cleanup work.
#
#   - billing_project: defaults to data project; explicit value needed when
#     the SA can read the data project but not bill against it.
#   - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'`
#     (default 10 GiB; 0 disables; null falls through to the default).
_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = {
    # `billing_project` intentionally NOT seeded here. The empty-string
    # default would inject `billing_project: ""` into every GET payload,
    # which makes the JS `isUnset = (value === undefined)` check evaluate
    # False — and the `(defaults to <project>)` placeholder feature
    # (#160 §4.7.5) would never render. Leaving it absent keeps the
    # field in the unset rendering path so placeholder_from fires.
    # Devin Review iter #3 on PR #168.
    "max_bytes_per_materialize": 10737418240,
    "bq_max_scan_bytes": 5368709120,
}


def _ensure_bq_optional_fields(sections: Dict[str, Any]) -> None:
    """In-place: add missing BQ optional fields to data_source.bigquery so the
    UI's JSON-textarea renders them as editable keys. Existing values are
    preserved — only absent keys are populated with their documented default.
    """
    ds = sections.get("data_source")
    if not isinstance(ds, dict):
        return
    bq = ds.get("bigquery")
    if not isinstance(bq, dict):
        # No BQ subsection — leave alone. Non-BQ instances don't need these
        # knobs, and creating an empty bigquery dict would be misleading.
        return
    for key, default in _BQ_OPTIONAL_FIELD_DEFAULTS.items():
        bq.setdefault(key, default)


@router.get("/server-config")
async def get_server_config(
    user: dict = Depends(require_admin),
):
    """Return the current instance.yaml with secrets redacted.

    Used by the /admin/server-config UI to prefill its form. The redacted
    payload mirrors the actual file shape, so the UI doesn't need to know
    the schema — it iterates over the editable sections and renders the
    fields it finds. Empty sections still show in the response so the form
    knows to render their headers.
    """
    config = _load_current_instance_yaml()
    redacted = _public_view(config)
    # Surface every editable section so the UI renders them even when the
    # file omits them — operator can populate from scratch without manual
    # JSON edits.
    sections = {section: redacted.get(section, {}) for section in _EDITABLE_SECTIONS}
    # Always surface the optional BQ knobs so the operator sees them in the
    # UI's JSON editor instead of having to know they exist (Phase J).
    _ensure_bq_optional_fields(sections)
    return {
        "sections": sections,
        "editable_sections": list(_EDITABLE_SECTIONS),
        "danger_sections": list(_DANGER_SECTIONS),
        "secret_key_patterns": list(_SECRET_KEY_PATTERNS),
        # Known-but-optional fields per section so the UI can render
        # placeholders for fields the operator hasn't set yet (Phase J).
        # Subagents 2-4 populate the bodies; the renderer ships now so the
        # mechanism is wired end-to-end and adding entries is purely a
        # data-edit in `_KNOWN_FIELDS` above.
        "known_fields": _KNOWN_FIELDS,
    }


@router.post("/server-config")
async def update_server_config(
    request: ServerConfigUpdateRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Patch instance.yaml from the /admin/server-config editor.

    Accepts a partial patch keyed by section. Validates sections, refuses
    danger-zone edits without explicit confirmation, deep-merges into the
    current overlay, writes the file, and emits one audit entry per save
    with a sanitized diff. Returns ``restart_required=true`` so the UI can
    show the restart banner — hot-reload is a separate issue (see #91 Out
    of scope).
    """
    import yaml

    if not request.sections:
        raise HTTPException(status_code=422, detail="sections cannot be empty")

    # Reject unknown sections loudly. Without this, a typo like "thmee"
    # would silently land in the YAML root and the operator wouldn't see
    # their colour change apply.
    unknown = sorted(set(request.sections.keys()) - set(_EDITABLE_SECTIONS))
    if unknown:
        raise HTTPException(
            status_code=400,
            detail=f"unknown section(s): {', '.join(unknown)}. "
                   f"Editable: {', '.join(_EDITABLE_SECTIONS)}",
        )

    # Danger-zone gate. The UI shows a confirmation dialog before posting
    # with confirm_danger=true; an API caller (CLI/script) has to pass it
    # explicitly so they can't fat-finger a hostname change.
    danger_touched = sorted(set(request.sections.keys()) & set(_DANGER_SECTIONS))
    if danger_touched and not request.confirm_danger:
        raise HTTPException(
            status_code=400,
            detail=f"section(s) {', '.join(danger_touched)} require confirm_danger=true",
        )

    # SSRF protection — same gate the /configure wizard applies to
    # keboola_url, but here it covers any URL-bearing field reachable via
    # the per-section patch (e.g. data_source.keboola.stack_url).
    _validate_urls_in_patch(request.sections)

    # Field-level constraints for sections whose values have documented ranges.
    _validate_materialize_section(request.sections)

    # Defense-in-depth: scrub redaction sentinels (`***` / `<empty>`) out of
    # secret-keyed leaves in the patch before they reach the deep-merge.
    # The client form does the same scrub, but an API caller round-tripping
    # the GET payload could otherwise overwrite real overlay secrets with
    # the placeholder shown in the form.
    scrubbed_sections: Dict[str, Dict[str, Any]] = {
        section: _strip_redacted_sentinels(patch, section)
        for section, patch in request.sections.items()
    }

    # Serialize read-modify-write across concurrent admin saves. Without the
    # lock, two saves would each read the same overlay snapshot, merge their
    # disjoint patches, and the second os.replace would silently drop the
    # first patch. The lock spans the cache-invalidate → load → merge →
    # atomic-write sequence; the audit log sits outside since it operates on
    # local snapshots.
    from app.instance_config import reset_cache
    from app.secrets import _state_dir
    config_path = _state_dir() / "instance.yaml"
    config_path.parent.mkdir(parents=True, exist_ok=True)

    with _overlay_write_lock:
        # Drop the in-process cache so we read the latest on-disk state,
        # including any update that landed from a concurrent caller before
        # we acquired the lock.
        reset_cache()
        before = _load_current_instance_yaml()

        # Deep merge — section-by-section so we never accidentally delete a
        # sibling section the patch didn't touch. Use the redaction-scrubbed
        # patch so a round-tripped GET payload can't overwrite real secrets
        # with the `***` placeholder.
        after = dict(before)
        for section, patch in scrubbed_sections.items():
            if not isinstance(patch, dict):
                raise HTTPException(
                    status_code=422,
                    detail=f"section '{section}' must be an object, got {type(patch).__name__}",
                )
            if isinstance(after.get(section), dict):
                after[section] = _deep_merge(after[section], patch)
            else:
                after[section] = patch

        # Write only the sections the user actually patched in this request.
        # Two reasons:
        #   1. Persisting the full merged config (or every editable section)
        #      would snapshot non-editable static sections into the overlay,
        #      shadowing later operator updates to those sections in the
        #      static file (`_load_current_instance_yaml` merges static + overlay,
        #      overlay wins per leaf).
        #   2. The merged config has `${ENV_VAR}` placeholders RESOLVED to the
        #      runtime values by config.loader. Writing every editable section
        #      back would persist real cleartext secrets where the static file
        #      had only env-var references — turning `smtp_password:
        #      ${SMTP_PASSWORD}` into `smtp_password: hunter2` in the overlay.
        # By writing only the sections in `request.sections` we keep both the
        # static-evolution and the env-var-placeholder properties intact.
        overlay_payload: Dict[str, Any] = {}
        if config_path.exists():
            try:
                overlay_payload = yaml.safe_load(config_path.read_text()) or {}
            except Exception as e:
                # A corrupt overlay used to be silently replaced — that masked
                # disk corruption / partial writes / hand-edits and dropped
                # every previously-saved section on the next save. Refuse and
                # surface so the operator can investigate.
                logger.exception("server-config: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                           "back up and remove the file, or fix it by hand",
                ) from e
        for section, patch in scrubbed_sections.items():
            if section not in _EDITABLE_SECTIONS:
                continue
            # Deep-merge the patch into the existing overlay slot (or static-
            # backed `before` if overlay had nothing for this section). This
            # preserves any unrelated keys the operator didn't touch in this
            # request — e.g. patching `email.smtp_host` doesn't blow away the
            # `email.smtp_password: ${SMTP_PASSWORD}` reference.
            existing = overlay_payload.get(section)
            if not isinstance(existing, dict):
                existing = {}
            overlay_payload[section] = _deep_merge(existing, patch)

        # Atomic via tmp + os.replace so two concurrent admin saves can't
        # interleave bytes and produce corrupt YAML (especially harmful since
        # auth.* is editable here — half-written file → operator lockout).
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(yaml.dump(overlay_payload, default_flow_style=False, sort_keys=False))
        os.replace(tmp_path, config_path)
        logger.info("server-config: wrote %d section(s) to %s",
                    len(request.sections), config_path)

        # Invalidate cached instance config so subsequent reads pick up the
        # change. Hot-reload of running modules (auth providers, SMTP client)
        # is out of scope — the restart banner tells the operator to bounce.
        reset_cache()

    # Audit entry — diff is computed on RAW values then `_diff_dicts`
    # redacts each row whose leaf key matches `_is_secret_key`. Pre-
    # masking the inputs would collapse a secret rotation into "no
    # diff" because both sides redact to ``***``, hiding the most
    # security-relevant changes from the audit log. We log even if no
    # fields changed so the operator's intent (touched the page, hit
    # save) is auditable.
    diff = _diff_dicts(before, after)
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="instance_config.update",
        resource="instance.yaml",
        params={
            "sections": sorted(request.sections.keys()),
            "danger_sections": danger_touched,
            "diff": diff,
            "diff_count": len(diff),
        },
    )

    return {
        "status": "ok",
        "restart_required": True,
        "sections_updated": sorted(request.sections.keys()),
        "diff_count": len(diff),
    }


# --- End server-config editor -----------------------------------------------


# Source types accepted by /api/admin/register-table. Anything else is
# rejected with 422 — keeps a typo'd source_type from silently landing in
# table_registry (where it would later confuse the orchestrator scan).
_VALID_SOURCE_TYPES: tuple[str, ...] = ("keboola", "bigquery", "jira", "local")

# Explicit allowlist of audit-payload keys whose values are credentials and
# must be masked. Substring-scan + ad-hoc whitelist (the previous shape) is
# fragile in two ways:
#   1. False positive: legit fields like `primary_key` get masked because
#      they contain "key" — we then need a whitelist exception, which has
#      to be kept in sync as new fields are added.
#   2. False negative: a future field like `primary_key_hash` *would* be
#      masked (defensible) but `not_actually_a_token` ALSO matches "token"
#      and gets masked unnecessarily; conversely, a brand-new credential
#      field that doesn't contain one of the patterns (`auth_material`,
#      `bearer`) silently leaks.
# Allowlist puts the burden on the developer adding a new secret-bearing
# field: they must add the literal key name here, which forces a code-
# review touch on the audit path. Audit the current Pydantic models
# (RegisterTableRequest / UpdateTableRequest / ConfigureRequest /
# ServerConfigUpdateRequest) when extending — the registry payloads don't
# currently carry credentials, but ConfigureRequest does (`keboola_token`)
# and could be routed through this sanitizer in the future.
_SECRET_FIELDS: frozenset = frozenset({
    # ConfigureRequest — POST /api/admin/configure carries Keboola creds.
    "keboola_token",
    # Generic names that have appeared in earlier iterations of admin
    # request bodies and could resurface — keep them masked defensively.
    "api_token",
    "auth_token",
    "bot_token",
    "client_secret",
    "google_client_secret",
    "google_oauth_client_secret",
    "password",
    "smtp_password",
    "webapp_secret_key",
    "bot_secret",
    # Marketplace PATs (private repos) — see src/marketplace.py.
    "marketplace_token",
    "marketplace_pat",
})


def _sanitize_for_audit(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Mask credential-bearing fields in a request payload before audit_log.

    Uses an explicit `_SECRET_FIELDS` allowlist (case-insensitive) instead
    of substring matching. The trade-off is that adding a new secret field
    requires updating the set — but that's the *point*: the test suite
    asserts `not_actually_a_token` does NOT get masked, so a substring-
    based regression would surface immediately, and a missing entry for a
    real new credential gets caught at code review of the audit path.
    """
    out: Dict[str, Any] = {}
    for k, v in payload.items():
        if k.lower() in _SECRET_FIELDS:
            out[k] = "***" if v not in (None, "") else "<empty>"
        else:
            out[k] = v
    return out


# Both the BigQuery and Keboola materialize paths funnel `source_query`
# through DuckDB (BQ via the bigquery extension's COPY translation, Keboola
# via an ATTACH'd extension and a direct COPY). DuckDB uses double quotes
# for quoted identifiers — backticks are a BigQuery-native syntactic form
# DuckDB's parser does not honor, so a backtick-quoted source_query either
# parse-errors at COPY time or silently scans nothing. Surfaced from the
# field validator on RegisterTableRequest AND the merged-record path in
# `update_table` so neither route can persist a backtick query.
_BACKTICK_REJECTION_MESSAGE = (
    "source_query uses BigQuery-native backtick identifiers (e.g. "
    "`project.dataset.table`), but the materialize path runs the SQL "
    "through DuckDB's BigQuery extension which uses DuckDB-flavor "
    "identifiers. Rewrite to DuckDB syntax: bq.\"dataset\".\"table\" "
    "(with the attached catalog alias `bq` plus double-quoted dataset/"
    "table). The instance is configured with the data project, so you "
    "don't need to repeat it in the FROM clause."
)


class RegisterTableRequest(BaseModel):
    name: str
    folder: Optional[str] = None
    sync_strategy: str = Field(
        default="full_refresh",
        description=(
            "Per-table extraction strategy. v26+: drives the Keboola "
            "extractor's dispatcher in connectors/keboola/extractor.py. "
            "Allowed values: 'full_refresh' (default; full table dump on "
            "each sync), 'incremental' (Storage API changedSince + "
            "primary-key dedup merge), 'partitioned' (per-partition "
            "parquet files keyed by partition_by column, per-partition "
            "merge for daily updates, chunked initial load). "
            "Pre-v26 this field was inert; existing rows default to "
            "'full_refresh' so behavior is unchanged unless an admin "
            "opts a table in to incremental/partitioned."
        ),
    )
    # Composite primary keys are real (session-grain MSA tables key on
    # `(session_id, event_date)`, browse rows on more). The frontend sends +
    # reads this as a list; backend stores it JSON-serialized in VARCHAR.
    # A bare string is accepted for backward compat — see _normalize_primary_key.
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
    # Backs query_mode='materialized'. Stored verbatim in
    # table_registry.source_query (schema v20); the trigger pass runs it
    # through the DuckDB BQ extension via BqAccess and writes the result
    # to /data/extracts/bigquery/data/<id>.parquet.
    source_query: Optional[str] = None
    query_mode: str = "local"
    sync_schedule: Optional[str] = None
    profile_after_sync: bool = Field(
        default=True,
        deprecated=True,
        description=(
            "DEPRECATED: not consumed by the runtime (Agent 1 finding "
            "2026-05-01). Profiler runs unconditionally on every synced "
            "table; this flag has no effect. Field stays for back-compat."
        ),
    )
    # v26 — Keboola sync-strategy support fields. All optional; meaningful
    # only when paired with the matching sync_strategy. Per-strategy
    # required-field rules + conflict policy enforced in the model_validator
    # below.
    incremental_window_days: Optional[int] = None
    max_history_days: Optional[int] = None
    incremental_column: Optional[str] = None
    where_filters: Optional[List[Dict[str, Any]]] = None
    partition_by: Optional[str] = None
    partition_granularity: Optional[str] = None
    initial_load_chunk_days: Optional[int] = None
    # v51 — fully-qualified BigQuery path. When set on a BigQuery row,
    # the extractor uses ``project.dataset.table`` from this field instead
    # of constructing the path from ``bucket`` + ``source_table`` against
    # the globally-attached project. Decouples UX/RBAC ``bucket`` label
    # from physical BQ dataset (issue #343). Format ``project.dataset.table``;
    # validated by ``connectors.bigquery.extractor.parse_bq_fqn``.
    bq_fqn: Optional[str] = Field(
        default=None,
        description=(
            "Fully-qualified BigQuery path (``project.dataset.table``). "
            "Only applies to source_type='bigquery'. When set, overrides "
            "the legacy bucket+source_table path construction. Use this "
            "to register a table whose BQ dataset name differs from the "
            "Agnes ``bucket`` label (issue #343)."
        ),
    )

    @model_validator(mode="after")
    def _check_mode_query_coherence(self):
        """Enforce query_mode ↔ source_query invariants up front so an admin
        can't persist a remote/local row carrying an orphan source_query.

        For BigQuery materialized rows, an empty source_query is allowed here
        because _validate_bigquery_register_payload generates it from
        bucket+source_table after this validator runs. For all other source
        types (e.g. Keboola), source_query is still required for materialized.
        """
        sq = (self.source_query or "").strip() or None
        if self.query_mode != "materialized" and sq:
            raise ValueError(
                "source_query is only valid when query_mode='materialized'"
            )
        # BigQuery materialized auto-generates a full-table-dump SQL from
        # `bucket`+`source_table` when source_query is omitted (see
        # `register_table` BQ branch). Keboola materialized: a NULL
        # source_query means "full-table export via Storage API
        # export-async" — no SQL needed (the API takes a structured
        # filter, see `connectors/keboola/storage_api.py:ExportFilter`).
        # Other source_types (e.g. jira) don't support materialized mode
        # and require an explicit source_query if the operator opts in.
        if (
            self.query_mode == "materialized"
            and not sq
            and self.source_type not in ("bigquery", "keboola")
        ):
            raise ValueError(
                f"query_mode='materialized' for source_type='{self.source_type}' "
                "requires a non-empty source_query"
            )
        # Backtick guard stays for non-materialized rows (DuckDB-flavor SQL
        # contract); materialized SQL is BigQuery-native and MUST allow
        # backticks for dashed identifiers (e.g. `prj-org.dataset.table`).
        if self.query_mode != "materialized" and sq and "`" in sq:
            raise ValueError(_BACKTICK_REJECTION_MESSAGE)
        # Normalise: stash the trimmed-or-None form so the persisted column
        # never carries surrounding whitespace or empty-string sentinels.
        self.source_query = sq
        return self

    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        return _normalize_primary_key(v)

    @field_validator("description", mode="before")
    @classmethod
    def _normalize_description(cls, v):
        # Defensive normalization for descriptions arriving via shell-quoting
        # tooling that injects literal backslash escapes (e.g. `Don\'t`, `\n`).
        return _unescape_shell_quoting(v)

    @field_validator("source_type", mode="before")
    @classmethod
    def _validate_source_type(cls, v):
        # None is tolerated for backward compat with old CLI scripts that
        # didn't set a source_type; the route resolves it later. Anything
        # else must be in the canonical list.
        if v in (None, ""):
            return v
        if v not in _VALID_SOURCE_TYPES:
            raise ValueError(
                f"source_type must be one of {sorted(_VALID_SOURCE_TYPES)}, got {v!r}"
            )
        return v

    @field_validator("sync_schedule", mode="before")
    @classmethod
    def _validate_sync_schedule(cls, v):
        # None / "" → no schedule, accepted.
        # Any non-empty string (including pure whitespace) must parse as a
        # valid schedule — otherwise it would be persisted and silently
        # ignored by the runtime evaluator.
        if v in (None, ""):
            return v
        if not is_valid_schedule(v):
            raise ValueError(
                f"sync_schedule must be 'every Nm' / 'every Nh' / "
                f"'daily HH:MM[,HH:MM,...]', got {v!r}"
            )
        return v

    @field_validator("sync_strategy", mode="before")
    @classmethod
    def _validate_sync_strategy(cls, v):
        """v26: enforce the strategy enum. NULL/empty → 'full_refresh' default.

        Pre-v26 the column accepted any string (catalog/profiler metadata
        only). Now the extractor dispatches off this value, so unknown
        strings would silently fall through to the default branch and
        confuse operators.
        """
        if v in (None, ""):
            return "full_refresh"
        allowed = {"full_refresh", "incremental", "partitioned"}
        if v not in allowed:
            raise ValueError(
                f"sync_strategy must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("partition_granularity", mode="before")
    @classmethod
    def _validate_partition_granularity(cls, v):
        if v in (None, ""):
            return v
        allowed = {"day", "month", "year"}
        if v not in allowed:
            raise ValueError(
                f"partition_granularity must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("where_filters", mode="before")
    @classmethod
    def _validate_where_filters(cls, v):
        """Validate filter shape via parse_filters from the keboola module.

        Accepts None / empty list, a JSON string, or a pre-parsed list.
        Returns the canonical list-of-dicts form for storage. Raises
        ValueError(InvalidFilterError message) on malformed shape so
        FastAPI returns 422 with a useful body. Placeholders are NOT
        resolved here — they're resolved at sync time so a misspelled
        token is caught when the next sync runs (admin can register a
        rolling-window filter today and the sync next month uses the
        same filter shape with a fresh date)."""
        if v in (None, "", []):
            return None
        from connectors.keboola.where_filters import parse_filters, InvalidFilterError
        try:
            return parse_filters(v)
        except InvalidFilterError as e:
            raise ValueError(str(e))

    @model_validator(mode="after")
    def _check_strategy_invariants(self):
        """v27 conflict policy + per-strategy required-field rules.

        Reject combinations that are silently broken at the extractor
        layer rather than letting the row land in the registry and
        confuse operators when the next sync misbehaves.

        - partitioned ⇒ partition_by required, query_mode='local' only.
          partition_granularity defaults to 'month' if omitted.
        - incremental + where_filters → 400. changedSince already does
          temporal filtering; layering server-side row filters on top is
          not supported by the extractor (legacy repo silently drops
          filters in this combination — match the rejection here).
        - partitioned + where_filters → 400. extract_partitioned does
          not thread where_filters through to its chunked downloads;
          accepting the pair would persist a filter that gets silently
          ignored at sync time (Devin Review concern). Reject explicitly
          until threading lands.
        - query_mode='remote' + where_filters → 400. _extract_via_extension
          (the remote/extension path) doesn't take a filters argument;
          accepting would silently drop them.
        """
        if self.sync_strategy == "partitioned":
            if not self.partition_by:
                raise ValueError(
                    "sync_strategy='partitioned' requires partition_by to be set"
                )
            if self.query_mode == "remote":
                raise ValueError(
                    "sync_strategy='partitioned' is incompatible with query_mode='remote' "
                    "— partitioned writes per-partition parquet files locally"
                )
            if self.where_filters:
                raise ValueError(
                    "sync_strategy='partitioned' is incompatible with where_filters "
                    "in v27 — extract_partitioned does not thread where_filters "
                    "through its chunked downloads; the filter would be silently "
                    "ignored. Use 'full_refresh' for filter+full-overwrite, or "
                    "wait for partitioned + where_filters wiring in a future PR."
                )
            if not self.partition_granularity:
                self.partition_granularity = "month"

        if self.sync_strategy == "incremental" and self.where_filters:
            raise ValueError(
                "sync_strategy='incremental' is incompatible with where_filters "
                "— changedSince already filters temporally; layering whereFilters "
                "on top is silently dropped by the extractor (use 'full_refresh' "
                "for filter+full-overwrite)"
            )

        # query_mode='remote' + where_filters: the DuckDB Keboola extension
        # path does not consume whereFilters. Accepting would silently drop
        # them at sync time. Caller must use query_mode='local' (Direct
        # extract) to apply filters.
        if self.query_mode == "remote" and self.where_filters:
            raise ValueError(
                "query_mode='remote' is incompatible with where_filters "
                "— the DuckDB Keboola extension does not expose whereFilters. "
                "Use query_mode='local' (Direct extract) to apply server-side "
                "row filters."
            )

        return self


def _generate_materialized_source_query(
    bucket: str, source_table: str, project_id: str,
) -> str:
    """Build the canonical full-table-dump source_query for a materialized
    BQ row when admin only supplies dataset + table. The result is
    BigQuery-native SQL — wrapped at materialize time into
    bigquery_query(...) by connectors.bigquery.extractor.materialize_query."""
    if not _is_safe_quoted_identifier(bucket):
        raise HTTPException(
            status_code=400,
            detail=f"bigquery: dataset {bucket!r} is unsafe",
        )
    if not _is_safe_quoted_identifier(source_table):
        raise HTTPException(
            status_code=400,
            detail=f"bigquery: source_table {source_table!r} is unsafe",
        )
    if not _is_safe_project_id(project_id):
        raise HTTPException(
            status_code=400,
            detail=f"bigquery: data_source.bigquery.project {project_id!r} is malformed",
        )
    return f"SELECT * FROM `{project_id}.{bucket}.{source_table}`"


def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
    """Enforce BQ-specific shape on a register/precheck request.

    Two BQ paths:

    - ``query_mode='materialized'`` — admin-registered SQL writes a parquet on
      schedule. Requires ``source_query``; ``bucket`` / ``source_table`` are
      not used (the SQL inlines the references). Doesn't force any field; the
      Pydantic ``model_validator`` already gated the query/mode coherence.

    - ``query_mode='remote'`` (or default) — remote view over a single BQ
      table. Requires ``bucket`` (BQ dataset) + ``source_table``. Mutates
      the model: forces ``query_mode='remote'`` and ``profile_after_sync=False``
      (per Decision 7 in #108) so a caller can't accidentally enqueue a
      parquet profiling pass for a remote view that has no local file.

    Raises HTTPException(422) for missing required fields and
    HTTPException(400) for unsafe identifiers / bogus project_id.
    """
    if req.query_mode == "materialized":
        # Materialized BQ rows: the SQL body replaces dataset+table refs.
        # source_query may be empty if admin supplied bucket+source_table —
        # in that case the server generates a full-table-dump SQL below.
        raw_name = req.name or ""
        if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
            raise HTTPException(
                status_code=400,
                detail=(
                    f"bigquery: view name {raw_name!r} is unsafe — must match "
                    f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) "
                    "with no leading/trailing whitespace"
                ),
            )
        from app.instance_config import get_value
        project_id = get_value("data_source", "bigquery", "project", default="") or ""
        if not project_id:
            raise HTTPException(
                status_code=400,
                detail=(
                    "bigquery: data_source.bigquery.project is not set in "
                    "instance.yaml; configure it via /admin/server-config or "
                    "/api/admin/configure first"
                ),
            )
        if not _is_safe_project_id(project_id):
            raise HTTPException(
                status_code=400,
                detail=(
                    f"bigquery: data_source.bigquery.project {project_id!r} "
                    "is malformed — must match GCP project_id grammar "
                    "^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
                ),
            )

        if not (req.source_query and req.source_query.strip()):
            # Server-generate from bucket+source_table. Trivial full-table
            # dump path; admin only sets dataset+table and the server
            # builds BQ-native SQL from instance.yaml's configured project.
            if not (req.bucket and req.source_table):
                raise HTTPException(
                    status_code=422,
                    detail=(
                        "bigquery materialized requires either source_query "
                        "(custom SQL) or bucket+source_table (server-generates "
                        "the full-table-dump SQL)"
                    ),
                )
            req.source_query = _generate_materialized_source_query(
                req.bucket, req.source_table, project_id,
            )

        # Phase C: profile_after_sync is now inert (Pydantic field marked
        # deprecated; not read by app/api/sync.py:410-438). The runtime
        # profiles every synced table unconditionally, so we no longer
        # force-set this here as a "signal."
        return

    if not req.bucket or not req.bucket.strip():
        raise HTTPException(
            status_code=422,
            detail="bigquery: 'bucket' (BQ dataset) is required",
        )
    if not req.source_table or not req.source_table.strip():
        raise HTTPException(
            status_code=422,
            detail="bigquery: 'source_table' is required",
        )
    # No wildcard / sharded BQ tables in M1 (Decision 8).
    if "*" in (req.source_table or "") or "*" in (req.bucket or ""):
        raise HTTPException(
            status_code=400,
            detail="bigquery: wildcard / sharded tables are not supported (see #108 M3+)",
        )
    # Strict identifier on the DuckDB view name. CRITICAL: validate the RAW
    # name (the value that ``register_table`` actually persists to
    # ``table_registry.name`` and which the BQ extractor reads back as the
    # DuckDB view name at next rebuild). Earlier revisions normalized first
    # (``strip().lower().replace(" ", "_")``) and then checked, which let
    # names like ``"my table"`` pass here, get stored verbatim, and then
    # blow up inside ``_init_extract`` at view-create time — defeating the
    # whole point of fast-fail-at-register. We do NOT silently rewrite the
    # operator's name; if they typed ``"my table"``, return 400 with a
    # clear message and let them retype with a corrected name.
    raw_name = req.name or ""
    if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: view name {raw_name!r} is unsafe — must match "
                f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) "
                "with no leading/trailing whitespace"
            ),
        )
    # Same fast-fail rule as ``raw_name`` above: validate the RAW value the
    # caller sent, not a stripped form. ``register_table`` persists ``bucket``
    # / ``source_table`` verbatim, and the BQ extractor splices them straight
    # into the ``ATTACH … AS bq_<bucket>`` and view DDL at next rebuild — so a
    # value with leading/trailing whitespace passes validation here, gets
    # stored as-is, and explodes inside DuckDB at view-create time. Surface
    # the offending raw value in the 400 detail and let the operator retype.
    raw_bucket = req.bucket
    if raw_bucket.strip() != raw_bucket or not _is_safe_quoted_identifier(raw_bucket):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: dataset {raw_bucket!r} is unsafe (only [A-Za-z0-9_.-] "
                "allowed, no leading/trailing whitespace)"
            ),
        )
    raw_source_table = req.source_table
    if raw_source_table.strip() != raw_source_table or not _is_safe_quoted_identifier(raw_source_table):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: source_table {raw_source_table!r} is unsafe (only "
                "[A-Za-z0-9_.-] allowed, no leading/trailing whitespace)"
            ),
        )
    # Pull project from instance.yaml — single-project model in M1
    # (Decision: no per-table project field). Validate the format here so
    # we surface a config issue at registration rather than at first
    # rebuild, where the operator no longer has a request to look at.
    from app.instance_config import get_value
    project_id = get_value("data_source", "bigquery", "project", default="")
    if not project_id:
        raise HTTPException(
            status_code=400,
            detail=(
                "bigquery: data_source.bigquery.project is not set in instance.yaml; "
                "configure it via /admin/server-config or /api/admin/configure first"
            ),
        )
    if not _is_safe_project_id(project_id):
        raise HTTPException(
            status_code=400,
            detail=(
                f"bigquery: data_source.bigquery.project {project_id!r} is malformed — "
                "must match GCP project_id grammar ^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
            ),
        )
    # Force the BQ-required mode (Decision 7). The orchestrator and
    # extractor both assume remote; persisting `local` here would later create
    # a profiling job against a non-existent parquet file.
    # Phase C: profile_after_sync is now inert (deprecated, not read by the
    # runtime); no longer force-set here.
    req.query_mode = "remote"


# Source types that don't depend on a `data_source.<name>.*` block — they
# get their data through a different ingestion path (e.g. Jira via
# webhooks). Registrations against these types are allowed regardless of
# the configured primary `data_source.type`.
_SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE: frozenset[str] = frozenset({
    "jira",
    "local",
})


def _validate_source_type_configured(source_type: Optional[str]) -> None:
    """Refuse register-table requests whose ``source_type`` isn't actually
    configured on this instance.

    Pre-fix the route happily persisted e.g. ``source_type='keboola'`` on a
    BQ-only instance — the row landed in the registry but the scheduler had
    no Keboola URL/token to ATTACH against, so it silently never synced.
    No upfront error, no operator-visible signal until they noticed the
    table was missing from `agnes catalog`.

    A source_type is considered configured when:

    - it matches the instance's primary ``data_source.type``, OR
    - a non-empty ``data_source.<source_type>`` block exists in the
      effective `instance.yaml` (multi-source instances), OR
    - it's in the small allowlist of types that don't sit under
      `data_source.*` at all (Jira, local — see
      ``_SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE``).

    Special case: when the configured primary is ``'local'`` (the default
    when an instance is freshly bootstrapped and no `data_source.type` has
    been set yet), the validator stays permissive — refusing registrations
    here would block the first-time-setup workflow where the operator
    registers a few tables against a not-yet-fully-configured instance.
    The misconfiguration that this validator targets is the *explicit
    mismatch*: `type=bigquery` instance + `source_type=keboola` payload
    with no `data_source.keboola.*` block. That case still 422s.

    A bare/None source_type is tolerated for backward compat with legacy
    CLI scripts; the route resolves it later against
    ``get_data_source_type()``.
    """
    if not source_type:
        return
    if source_type in _SOURCE_TYPES_INDEPENDENT_OF_DATA_SOURCE:
        return

    from app.instance_config import get_data_source_type, get_value

    configured_primary = get_data_source_type()
    if source_type == configured_primary:
        return

    # Multi-source: accept if a non-empty `data_source.<source_type>` block
    # exists. Empty dict / None / "" all count as "not configured".
    secondary_block = get_value("data_source", source_type, default=None)
    if secondary_block:
        # Truthy non-empty dict / mapping / scalar — treat as configured.
        return

    # Bootstrap-friendliness: a primary of 'local' means the instance hasn't
    # been pointed at a real source yet (or has been deliberately set to
    # local-only). Don't gate registrations in that state — the operator is
    # likely in the middle of first-time setup and will fill in the config
    # next. The check still fires when primary is an actual source type
    # (bigquery / keboola) and the requested source_type doesn't match
    # AND has no secondary block.
    if configured_primary == "local":
        return

    raise HTTPException(
        status_code=422,
        detail=(
            f"source_type={source_type!r} is not configured on this instance. "
            f"The configured data source is {configured_primary!r}. To enable "
            f"a secondary source, set data_source.{source_type}.* fields in "
            "instance.yaml or via /admin/server-config."
        ),
    )


class UpdateTableRequest(BaseModel):
    name: Optional[str] = None
    sync_strategy: Optional[str] = Field(
        default=None,
        description=(
            "v26+: drives the Keboola extractor dispatcher. PUT-shape "
            "requires a value if sent. See RegisterTableRequest.sync_strategy."
        ),
    )
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
    source_query: Optional[str] = None
    query_mode: Optional[str] = None
    sync_schedule: Optional[str] = None
    profile_after_sync: Optional[bool] = Field(
        default=None,
        deprecated=True,
        description=(
            "DEPRECATED: not consumed by the runtime. See "
            "RegisterTableRequest.profile_after_sync."
        ),
    )
    # v26 — same fields as RegisterTableRequest, all optional. The PUT
    # handler overlays the body on the existing row and re-runs the
    # synthetic RegisterTableRequest validator on the merged record, so
    # cross-field invariants are checked against the post-update state.
    incremental_window_days: Optional[int] = None
    max_history_days: Optional[int] = None
    incremental_column: Optional[str] = None
    where_filters: Optional[List[Dict[str, Any]]] = None
    partition_by: Optional[str] = None
    partition_granularity: Optional[str] = None
    initial_load_chunk_days: Optional[int] = None
    # v51 — see RegisterTableRequest.bq_fqn. PUT lets an admin add or
    # clear bq_fqn on an existing row (cleared via explicit `null`,
    # per the PUT shape contract documented on the handler below).
    bq_fqn: Optional[str] = None

    @field_validator("sync_strategy", mode="before")
    @classmethod
    def _validate_sync_strategy(cls, v):
        if v in (None, ""):
            return v
        allowed = {"full_refresh", "incremental", "partitioned"}
        if v not in allowed:
            raise ValueError(
                f"sync_strategy must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("partition_granularity", mode="before")
    @classmethod
    def _validate_partition_granularity(cls, v):
        if v in (None, ""):
            return v
        allowed = {"day", "month", "year"}
        if v not in allowed:
            raise ValueError(
                f"partition_granularity must be one of {sorted(allowed)}, got {v!r}"
            )
        return v

    @field_validator("where_filters", mode="before")
    @classmethod
    def _validate_where_filters(cls, v):
        if v in (None, "", []):
            return None
        from connectors.keboola.where_filters import parse_filters, InvalidFilterError
        try:
            return parse_filters(v)
        except InvalidFilterError as e:
            raise ValueError(str(e))

    @model_validator(mode="after")
    def _check_mode_query_coherence(self):
        """PUT semantics — only the fields explicitly in the body are
        validated. The body is overlaid on the existing row at the handler
        level (see ``update_table``), so omitted fields keep their stored
        values and the synthetic ``RegisterTableRequest`` constructed against
        the merged record runs the strict cross-field check before persist.

        The only invariants enforceable from the PUT body alone:

        - explicit ``source_query='SELECT ...'`` paired with ``query_mode``
          that isn't materialized → coherent reject (the SQL would be dead);
        - explicit ``source_query='SELECT ...'`` without any ``query_mode``
          in the body → reject; the operator must commit to materialized;
        - explicit empty/whitespace ``source_query=''`` paired with
          ``query_mode='materialized'`` → reject (operator clearly
          mistyped — they sent the field).

        Pre-fix this validator also rejected ``{"query_mode": "materialized",
        "sync_schedule": "every 12h"}`` because ``source_query`` was None
        — but that's the canonical "edit the schedule on a materialized
        row" use-case from the Edit modal, which always sends
        ``query_mode`` to indicate intent. Devin BUG_0002 on PR #148
        commit 2219255.
        """
        if self.query_mode is None and self.source_query is None:
            return self

        sq_raw = self.source_query
        sq = (sq_raw or "").strip() or None

        # Operator explicitly sent source_query as empty/whitespace while
        # claiming materialized — typo / bad form data, reject.
        if (
            self.query_mode == "materialized"
            and sq_raw is not None
            and not sq
        ):
            raise ValueError(
                "query_mode='materialized' requires a non-empty source_query"
            )

        # source_query only makes sense with materialized mode. Allow None
        # (omitted) to flow through; only reject when explicitly set with
        # the wrong mode.
        if (
            self.query_mode is not None
            and self.query_mode != "materialized"
            and sq
        ):
            raise ValueError(
                "source_query is only valid when query_mode='materialized'"
            )
        if self.query_mode is None and sq:
            raise ValueError(
                "source_query requires query_mode='materialized' to be set "
                "in the same request"
            )

        # Normalise: drop whitespace-only strings to None so the persisted
        # column is clean. Don't touch when source_query was None to begin
        # with — that signals "PUT didn't touch this field, keep existing".
        if sq_raw is not None:
            self.source_query = sq
        return self

    @field_validator("primary_key", mode="before")
    @classmethod
    def _coerce_primary_key(cls, v):
        return _normalize_primary_key(v)

    @field_validator("description", mode="before")
    @classmethod
    def _normalize_description(cls, v):
        # Defensive normalization for descriptions arriving via shell-quoting
        # tooling that injects literal backslash escapes (e.g. `Don\'t`, `\n`).
        return _unescape_shell_quoting(v)

    # Duplicated from RegisterTableRequest — Pydantic v2 validators don't
    # inherit cleanly across unrelated BaseModel classes; a shared mixin
    # would be overkill for two fields.
    @field_validator("sync_schedule", mode="before")
    @classmethod
    def _validate_sync_schedule(cls, v):
        # None / "" → no schedule, accepted.
        # Any non-empty string (including pure whitespace) must parse as a
        # valid schedule — otherwise it would be persisted and silently
        # ignored by the runtime evaluator.
        if v in (None, ""):
            return v
        if not is_valid_schedule(v):
            raise ValueError(
                f"sync_schedule must be 'every Nm' / 'every Nh' / "
                f"'daily HH:MM[,HH:MM,...]', got {v!r}"
            )
        return v


class ConfigureRequest(BaseModel):
    data_source: str  # "keboola" | "bigquery" | "local"
    keboola_token: Optional[str] = None
    keboola_url: Optional[str] = None
    bigquery_project: Optional[str] = None
    bigquery_location: Optional[str] = None
    instance_name: Optional[str] = None
    allowed_domain: Optional[str] = None


@router.get("/discover-tables")
async def discover_tables(
    user: dict = Depends(require_admin),
    dataset: Optional[str] = None,
):
    """Discover available tables from the configured data source.

    For ``data_source.type='keboola'`` returns the full Storage API table
    list (single round-trip). For ``data_source.type='bigquery'``:

    - Without ``dataset``: list datasets in the configured project.
    - With ``dataset=name``: list tables (BASE TABLE + VIEW) in that dataset.

    Two-step shape avoids paying the per-dataset list_tables cost up-front
    on projects with hundreds of datasets — the UI populates the dataset
    dropdown first, then fetches tables only for the selected dataset.
    """
    try:
        from app.instance_config import get_data_source_type
        source_type = get_data_source_type()

        if source_type == "keboola":
            from connectors.keboola.client import KeboolaClient
            from app.instance_config import get_value
            url = get_value("data_source", "keboola", "stack_url", default="")
            token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
            token = os.environ.get(token_env, "") if token_env else ""
            if not token:
                token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}

        if source_type == "bigquery":
            return _discover_bigquery(dataset=dataset)

        return {
            "tables": [],
            "count": 0,
            "source": source_type,
            "error": f"Discovery not implemented for source_type={source_type!r}",
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Discovery failed: {e}")


def _discover_bigquery(dataset: Optional[str]) -> Dict[str, Any]:
    """List BQ datasets (when ``dataset`` is None) or tables-in-dataset.

    Routes through ``BqAccess.client()`` so config / auth / error
    translation matches the rest of the BQ surface (#138 facade). Returns
    the same shape as the Keboola path so the UI doesn't have to branch.
    """
    from connectors.bigquery.access import (
        get_bq_access,
        BqAccessError,
        translate_bq_error,
    )

    try:
        bq = get_bq_access()
        client = bq.client()
    except BqAccessError as e:
        raise HTTPException(
            status_code=BqAccessError.HTTP_STATUS.get(e.kind, 500),
            detail={"error": e.message, "kind": e.kind, "details": e.details},
        )

    try:
        if dataset is None:
            datasets = []
            for ds in client.list_datasets():
                datasets.append({
                    "dataset_id": ds.dataset_id,
                    "full_id": f"{ds.project}.{ds.dataset_id}",
                })
            return {
                "datasets": sorted(datasets, key=lambda d: d["dataset_id"]),
                "count": len(datasets),
                "source": "bigquery",
            }

        # List tables in the named dataset. `list_tables` returns
        # `TableListItem` with `table_id` + `table_type` ('TABLE', 'VIEW',
        # 'MATERIALIZED_VIEW', 'EXTERNAL', 'SNAPSHOT'). UI maps TABLE → Type
        # selector "table" and VIEW/MATERIALIZED_VIEW → "view"; the rest are
        # passed through with their raw type so the operator can decide.
        tables = []
        for t in client.list_tables(dataset):
            tables.append({
                "table_id": t.table_id,
                "table_type": t.table_type,
                "full_id": f"{t.project}.{t.dataset_id}.{t.table_id}",
            })
        return {
            "tables": sorted(tables, key=lambda t: t["table_id"]),
            "count": len(tables),
            "source": "bigquery",
            "dataset": dataset,
        }
    except Exception as e:
        # `translate_bq_error` re-raises non-Google exceptions unchanged,
        # so wrap in HTTPException to keep the JSON-shape contract.
        try:
            err = translate_bq_error(e, bq.projects, bad_request_status="upstream_error")
        except Exception:
            raise HTTPException(status_code=502, detail=f"BQ discovery failed: {e}")
        raise HTTPException(
            status_code=BqAccessError.HTTP_STATUS.get(err.kind, 502),
            detail={"error": err.message, "kind": err.kind, "details": err.details},
        )


@router.get("/registry")
async def list_registry(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Get full table registry.

    Each table row is enriched with `last_sync_error` from sync_state so
    operators can see WHY a row isn't materializing without trawling
    scheduler logs. None for rows that have never errored or have already
    recovered (status='ok'); the per-row error message string otherwise.
    """
    repo = TableRegistryRepository(conn)
    tables = repo.list_all()

    # Single batched read of sync_state errors — avoid N+1 GETs against
    # `sync_state` for large registries. The sync_state row is keyed on
    # `table_id` which mirrors `table_registry.name` (see comment in
    # _run_materialized_pass / _build_manifest_for_user about name vs id).
    error_by_name: Dict[str, Optional[str]] = {}
    try:
        rows = conn.execute(
            "SELECT table_id, error FROM sync_state "
            "WHERE status = 'error' AND error IS NOT NULL AND error <> ''"
        ).fetchall()
        error_by_name = {r[0]: r[1] for r in rows}
    except Exception:
        # Defensive: if sync_state is unreadable for any reason, the
        # registry response still serializes — operators just lose the
        # last_sync_error column on this call.
        logger.exception("Failed to read sync_state errors for registry")

    for t in tables:
        # Sync_state.table_id == table_registry.name by convention.
        t["last_sync_error"] = error_by_name.get(t.get("name"))

    return {"tables": tables, "count": len(tables)}


# Wall-clock budget for the synchronous BQ materialization that runs after
# a successful BQ register. If the rebuild + view creation exceeds this,
# we hand the rest off to BackgroundTasks and return 202. 5s matches the
# UX contract in #108 ("Queryable as <view> within seconds") — long enough
# to cover a healthy GCE round-trip, short enough that a hung GCE call
# doesn't park the request handler.
_BQ_SYNC_REGISTER_TIMEOUT_S: float = 5.0


def _materialize_bigquery_extract() -> Dict[str, Any]:
    """Re-build the BigQuery extract.duckdb + master views.

    Wrapper used by both the synchronous (in-band) and async (BackgroundTask)
    code paths after a BQ register/update/delete. Imports kept inside the
    function so non-BQ instances don't pay the import cost on app start.

    Opens a FRESH system DB connection rather than reusing the request-scoped
    one. The request handler closes its connection in a `finally` after the
    response, but BackgroundTask + the timeout-fallback daemon thread can
    both outlive that close — they would then operate on a closed handle (or
    one being torn down concurrently). A fresh handle is cheap (DuckDB is an
    embedded engine) and isolates the worker's lifetime from the request's.

    Returns the rebuild result dict (``{"errors": [...], "tables_registered":
    N, ...}``) so the synchronous caller can propagate failures to the
    operator. Background-task callers ignore the return value, but the loud
    log inside ``_run_bigquery_materialize_with_timeout`` covers that path.
    """
    from connectors.bigquery import extractor as _bq_extractor
    from src.db import get_system_db
    from src.orchestrator import SyncOrchestrator

    fresh_conn = get_system_db()
    try:
        result = _bq_extractor.rebuild_from_registry(conn=fresh_conn)
        SyncOrchestrator().rebuild()
        return result or {}
    finally:
        try:
            fresh_conn.close()
        except Exception:
            pass


def _materialize_bigquery_extract_bg() -> None:
    """BackgroundTask wrapper around `_materialize_bigquery_extract`.

    BackgroundTasks discard return values, but `rebuild_from_registry` can
    surface auth / config / identifier errors via the ``errors`` list. Log
    those at ERROR level so the failure is loud in the operator's logs even
    though the 202 response can't carry the detail (Decision 3 in #108: a
    202 is documented as "accepted, may not be queryable yet" — we don't
    block on it but we shouldn't swallow it either).
    """
    try:
        result = _materialize_bigquery_extract()
    except Exception:
        logger.exception("BQ post-register background materialize crashed")
        return
    errors = (result or {}).get("errors") or []
    if errors:
        logger.error(
            "BQ post-register background materialize completed with %d error(s): %s",
            len(errors), errors,
        )


def _run_bigquery_materialize_with_timeout(
    background: BackgroundTasks,
) -> Dict[str, Any]:
    """Try to materialize synchronously within the wall-clock budget.

    Returns a dict with:
      - ``status`` ∈ {"ok", "errors", "timeout"} — caller maps to HTTP code
      - ``errors``: list of {table, error} surfaced by ``rebuild_from_registry``
        (only present on ``status="errors"``)

    Mapping by caller (`register_table`):
      - "ok"       → 200 (synchronous success)
      - "errors"   → 500 (rebuild ran but reported errors — propagate so
                     the operator knows the registry row exists but the
                     view wasn't created)
      - "timeout"  → 202 (rebuild still running on a BackgroundTask)

    The synchronous worker runs on a daemon thread (so a hung GCE call
    can't park the request) that opens its OWN system DB connection (see
    `_materialize_bigquery_extract`). Even though FastAPI now invokes the
    sync route in a threadpool — and `done.wait()` no longer blocks the
    event loop — we still off-load to a daemon so the wait is bounded
    even if `rebuild_from_registry` ignores its own timeouts.
    """
    import threading

    done = threading.Event()
    err_holder: Dict[str, Any] = {}
    result_holder: Dict[str, Any] = {}

    def _worker():
        try:
            result_holder["result"] = _materialize_bigquery_extract()
        except Exception as e:  # pragma: no cover — logged below
            err_holder["error"] = e
        finally:
            done.set()

    t = threading.Thread(target=_worker, daemon=True, name="bq-register-rebuild")
    t.start()
    finished = done.wait(_BQ_SYNC_REGISTER_TIMEOUT_S)

    if finished:
        if "error" in err_holder:
            # Worker finished within the wall-clock budget but raised. This
            # is a HARD ERROR, not a timeout — surface it as such so the
            # operator gets the actual exception in the 500 body instead
            # of a misleading 202 + "still working in the background".
            # Earlier revisions returned ``{"status": "timeout"}`` here,
            # which the register handler then mapped to 202 + a retry
            # BackgroundTask; that hid the real failure for `_BQ_SYNC_
            # REGISTER_TIMEOUT_S` seconds before the BG retry surfaced
            # the same exception in the logs.
            exc = err_holder["error"]
            logger.error(
                "BQ post-register rebuild raised within budget: %r",
                exc,
            )
            return {
                "status": "errors",
                "errors": [{"error": f"{type(exc).__name__}: {exc}"}],
            }
        # Synchronous worker finished cleanly — but check whether
        # `rebuild_from_registry` itself surfaced any errors (auth fail,
        # missing project from the overlay, unsafe identifier slipping the
        # validator, etc.). Without this, those errors got silently logged
        # and the API claimed success.
        result = result_holder.get("result") or {}
        errors = result.get("errors") or []
        if errors:
            logger.error(
                "BQ post-register rebuild reported %d error(s): %s",
                len(errors), errors,
            )
            return {"status": "errors", "errors": errors}
        return {"status": "ok"}

    # Timed out — let the worker keep running on its thread (already daemon)
    # and also schedule a BackgroundTask so the orchestrator gets called via
    # the supported FastAPI path. `_INIT_EXTRACT_LOCK` in the BQ extractor
    # serializes the two file-swap calls so the slow daemon thread and the
    # background task can't tear `extract.duckdb`; the orchestrator's own
    # `_rebuild_lock` protects the master-view rebuild step downstream.
    logger.info(
        "BQ post-register rebuild exceeded %ss budget — handing off to BackgroundTask",
        _BQ_SYNC_REGISTER_TIMEOUT_S,
    )
    background.add_task(_materialize_bigquery_extract_bg)
    return {"status": "timeout"}


@router.post(
    "/register-table",
    responses={
        200: {"description": "BigQuery row registered + materialized synchronously"},
        201: {"description": "Non-BigQuery row registered (no post-insert materialize)"},
        202: {"description": "BigQuery row registered; materialize continues in background"},
        409: {"description": "Table id or view name already in use"},
        500: {"description": "BigQuery row registered but post-insert rebuild failed"},
    },
)
def register_table(
    request: RegisterTableRequest,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Register a new table in the system.

    Behavior by source_type:
    - **bigquery**: validates BQ-specific shape (dataset / source_table /
      identifier safety / project_id format), forces query_mode='remote' and
      profile_after_sync=False, then synchronously rebuilds extract.duckdb +
      master views with a wall-clock budget. Returns 200 with the view name
      on success, 202 on budget overrun (rebuild continues in a
      BackgroundTask), or 500 if the synchronous rebuild ran but reported
      an error (e.g. auth failure, missing project, unsafe identifier).
    - other source types: insert-only, no post-register hook. Returns 201.

    Defined as a plain ``def`` (not ``async def``) so FastAPI runs it in a
    threadpool — the synchronous-materialize path waits on
    ``threading.Event.wait()``, which would otherwise block the asyncio
    event loop and stall every other request for up to ``_BQ_SYNC_REGISTER_
    TIMEOUT_S``. ``Depends(...)``, ``BackgroundTasks``, and
    ``JSONResponse`` all work the same in sync handlers; the rest of the
    admin module mixes both styles already.

    The route does NOT carry a default ``status_code`` — each branch returns
    its own JSONResponse with the right code. A blanket ``status_code=201``
    on the decorator would mislead OpenAPI consumers about the BQ branch.

    Always: 409 on view-name collision against the existing registry, audit
    log entry on success.
    """
    from fastapi.responses import JSONResponse
    if not request.name or not request.name.strip():
        raise HTTPException(status_code=422, detail="Table name cannot be empty")
    repo = TableRegistryRepository(conn)
    table_id = request.name.strip().lower().replace(" ", "_")

    if repo.get(table_id):
        raise HTTPException(status_code=409, detail=f"Table '{table_id}' already registered")

    # View-name collision pre-check — distinct from id collision above.
    # `id` is derived from `name`, but two callers could legally pick
    # different display names that lower-case + slugify to the same view
    # (e.g. "Orders v2" + "orders_v2"); the strict view-name uniqueness
    # check stops that here, before the orchestrator surfaces it as a
    # silent overwrite at next rebuild.
    existing_by_name = next(
        (r for r in repo.list_all() if (r.get("name") or "") == request.name),
        None,
    )
    if existing_by_name is not None:
        raise HTTPException(
            status_code=409,
            detail=f"View name '{request.name}' is already in use by table id '{existing_by_name.get('id')}'",
        )

    # Refuse rows whose source_type isn't actually configured — pre-fix the
    # row landed in the registry but never synced because there was no
    # Keboola URL/token (or BQ project) to ATTACH against. Surfaces the
    # misconfig at registration time so the operator sees the gap before
    # they wonder why `agnes catalog` is missing the table.
    _validate_source_type_configured(request.source_type)

    # BQ rows go through the extra validation + post-insert materialization
    # contract from issue #108. Other source types keep the legacy insert-only
    # flow — Keboola materialization happens via the scheduled sync, Jira via
    # webhook, local via a manual extractor run.
    is_bigquery = request.source_type == "bigquery"
    if is_bigquery:
        _validate_bigquery_register_payload(request)

    # Phase C: profile_after_sync is no longer passed — the field is
    # deprecated and inert at the runtime layer. The DB column keeps its
    # schema default; the registry response no longer reflects request
    # values for this flag.
    # v51 — validate bq_fqn upfront. The extractor would catch a malformed
    # value at next rebuild and skip the row, but failing at register time
    # gives the admin a clean 422 with the specific complaint instead of
    # a silent "table registered but never materialized" state.
    if request.bq_fqn is not None and request.source_type != "bigquery":
        raise HTTPException(
            status_code=422,
            detail="bq_fqn only applies to source_type='bigquery'",
        )
    if request.bq_fqn is not None:
        from connectors.bigquery.extractor import parse_bq_fqn
        try:
            parse_bq_fqn(request.bq_fqn)
        except ValueError as e:
            raise HTTPException(status_code=422, detail=str(e))

    repo.register(
        id=table_id,
        name=request.name,
        folder=request.folder,
        sync_strategy=request.sync_strategy,
        primary_key=request.primary_key,
        description=request.description,
        registered_by=user.get("email"),
        source_type=request.source_type,
        bucket=request.bucket,
        source_table=request.source_table,
        source_query=request.source_query,
        query_mode=request.query_mode,
        sync_schedule=request.sync_schedule,
        # v26 sync-strategy support fields. None for non-Keboola or
        # full_refresh tables; persisted as NULL.
        incremental_window_days=request.incremental_window_days,
        max_history_days=request.max_history_days,
        incremental_column=request.incremental_column,
        where_filters=request.where_filters,
        partition_by=request.partition_by,
        partition_granularity=request.partition_granularity,
        initial_load_chunk_days=request.initial_load_chunk_days,
        bq_fqn=request.bq_fqn,
    )

    # Audit entry — masked params; description kept raw (it's documentation).
    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="register_table",
        resource=table_id,
        params=_sanitize_for_audit(request.model_dump()),
    )

    from app.api.v2_catalog import invalidate_for_table
    invalidate_for_table(table_id)

    if not is_bigquery:
        # Keboola / Jira / local rows are insert-only here. 201 Created — the
        # decorator no longer carries a default status, so each branch is
        # explicit about its code (BQ branch overrides via JSONResponse).
        return JSONResponse(
            status_code=201,
            content={"id": table_id, "name": request.name, "status": "registered"},
        )

    if request.query_mode == "materialized":
        # Materialized BQ rows are picked up by the trigger pass on the next
        # scheduled tick (or via POST /api/sync/trigger). No synchronous
        # rebuild — the COPY can scan multi-GB and would block the request.
        return JSONResponse(
            status_code=201,
            content={
                "id": table_id,
                "name": request.name,
                "status": "registered",
                "view_name": table_id,
                "message": (
                    "Materialized — parquet will be written on the next sync "
                    "tick. Trigger now via POST /api/sync/trigger."
                ),
            },
        )

    # BQ post-register: rebuild extract + master views, with timeout fallback.
    # Decision 1: 200 on synchronous success, 202 on timeout, 500 if the
    # synchronous rebuild surfaced errors. Distinct from the 201 Keboola
    # path above, so the BQ branch builds its own response.
    outcome = _run_bigquery_materialize_with_timeout(background)
    status = outcome.get("status")
    if status == "ok":
        return JSONResponse(
            status_code=200,
            content={
                "id": table_id,
                "name": request.name,
                "status": "ok",
                "view_name": table_id,
            },
        )
    if status == "errors":
        # Registry insert succeeded but the post-insert rebuild reported
        # errors — the row is in the registry but the master view was NOT
        # created. Surface the failure verbatim so the operator can fix
        # the underlying config (typically a missing
        # `data_source.bigquery.project` in the overlay or auth that lacks
        # bigquery.metadata.get on the dataset). The row stays in the
        # registry; a re-run after fixing the config picks up the existing
        # row and creates the view on the next register/update or
        # scheduler tick.
        return JSONResponse(
            status_code=500,
            content={
                "id": table_id,
                "name": request.name,
                "status": "rebuild_failed",
                "view_name": table_id,
                "errors": outcome.get("errors") or [],
                "message": (
                    "Registry row created but post-insert rebuild failed; "
                    "view is not queryable. See `errors` for details."
                ),
            },
        )
    # Default: timeout — rebuild continues on a BackgroundTask.
    return JSONResponse(
        status_code=202,
        content={
            "id": table_id,
            "name": request.name,
            "status": "accepted",
            "view_name": table_id,
            "message": "Registration accepted; materializing in background",
        },
    )


class PrecheckResponse(BaseModel):
    """Response model for /api/admin/register-table/precheck.

    Documented here so OpenAPI consumers know what to expect; the route
    returns a plain dict for backwards compatibility with the rest of the
    admin API which doesn't use response_model.
    """
    ok: bool
    table: Dict[str, Any]


@router.post("/register-table/precheck")
def register_table_precheck(
    request: RegisterTableRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Validate a register-table payload + (BQ only) confirm the source table exists.

    No DB write. Used by the UI to surface row count + size + column count
    in the modal before the operator clicks Register, and by the CLI's
    ``--dry-run`` to print what *would* be registered without touching
    state. Identical Pydantic validation to register-table; for BQ rows we
    additionally make a ``bigquery.Client(project).get_table(...)`` call
    and surface the GCP error verbatim.

    Defined as a plain ``def`` (not ``async def``) so FastAPI runs it in a
    threadpool — the BQ branch makes synchronous ``bigquery.Client(...)``
    /``client.get_table(...)`` calls, which would otherwise block the
    asyncio event loop and stall every other request for the duration of
    the GCE round-trip. Mirrors the same conversion done for
    ``register_table`` (see comment on that route). ``Depends(...)`` works
    identically in sync handlers.
    """
    if not request.name or not request.name.strip():
        raise HTTPException(status_code=422, detail="Table name cannot be empty")

    if request.source_type != "bigquery":
        # M1 only adds BQ-specific precheck. Other source types get a
        # validation-only response so the CLI / UI can rely on the same
        # endpoint shape across types.
        return {
            "ok": True,
            "table": {
                "name": request.name,
                "source_type": request.source_type,
                "bucket": request.bucket,
                "source_table": request.source_table,
                "rows": None,
                "size_bytes": None,
                "columns": [],
                "note": "precheck for non-bigquery sources is validation-only in M1",
            },
        }

    # BQ-specific shape validation (forces query_mode/profile_after_sync,
    # checks identifier safety, validates project_id from instance.yaml).
    _validate_bigquery_register_payload(request)

    # Materialized BQ rows have no `dataset.source_table` to round-trip —
    # the SQL body is the contract. Skip the BQ-jobs-API call and return a
    # validation-only precheck so the CLI's `--dry-run --query-mode
    # materialized` path doesn't crash on an empty fully-qualified name.
    if request.query_mode == "materialized":
        return {
            "ok": True,
            "table": {
                "name": request.name,
                "source_type": "bigquery",
                "query_mode": "materialized",
                "source_query": request.source_query,
                "rows": None,
                "size_bytes": None,
                "columns": [],
                "note": (
                    "materialized precheck is validation-only — the SQL is "
                    "evaluated for cost on each scheduled materialize tick"
                ),
            },
        }

    # Round-trip the BQ jobs API to confirm the table exists and the SA can
    # see it. Imports kept local to avoid pulling google-cloud-bigquery into
    # the import chain on non-BQ instances.
    try:
        from google.cloud import bigquery  # noqa: PLC0415
        from google.api_core import exceptions as google_exc  # noqa: PLC0415
    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail=(
                "google-cloud-bigquery not installed; install the bigquery "
                f"extras to use BQ precheck ({e})"
            ),
        ) from e

    from app.instance_config import get_value
    project_id = get_value("data_source", "bigquery", "project", default="")
    dataset = (request.bucket or "").strip()
    source_table = (request.source_table or "").strip()
    fq = f"{project_id}.{dataset}.{source_table}"

    try:
        client = bigquery.Client(project=project_id)
        bq_table = client.get_table(fq)
    except google_exc.NotFound as e:
        raise HTTPException(status_code=404, detail=f"BigQuery table not found: {fq} ({e})") from e
    except google_exc.Forbidden as e:
        raise HTTPException(
            status_code=403,
            detail=(
                f"BigQuery access denied for {fq}: {e}. "
                "Service account needs bigquery.metadata.get on the dataset."
            ),
        ) from e
    except Exception as e:
        # Auth errors, transient 5xx, malformed table refs — surface as 400
        # so the operator gets the GCP error verbatim and can fix their
        # config without us guessing the right HTTP code.
        raise HTTPException(status_code=400, detail=f"BigQuery precheck failed for {fq}: {e}") from e

    columns = [
        {"name": f.name, "type": f.field_type}
        for f in (bq_table.schema or [])
    ]
    return {
        "ok": True,
        "table": {
            "name": request.name,
            "source_type": "bigquery",
            "bucket": dataset,
            "source_table": source_table,
            "project_id": project_id,
            "rows": int(bq_table.num_rows or 0),
            "size_bytes": int(bq_table.num_bytes or 0),
            "columns": columns,
            "column_count": len(columns),
        },
    }


@router.put("/registry/{table_id}")
async def update_table(
    table_id: str,
    request: UpdateTableRequest,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Update a registered table's configuration.

    For BQ rows, schedules a background rebuild so the master view picks
    up changes (e.g. a renamed dataset) without waiting for the next
    scheduled sync.
    """
    repo = TableRegistryRepository(conn)
    existing = repo.get(table_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Table not found")

    # `exclude_unset=True` honors the PUT-shape distinction between
    # "field omitted from body" (keep existing) vs "field sent as null"
    # (clear to NULL). Pre-v26 the handler used `model_dump()` filtered by
    # `if v is not None`, which collapsed both cases to "omitted" — meaning
    # an admin couldn't clear a field via PUT. v26 needs the clear path so
    # the Edit modal can switch a partitioned row back to full_refresh and
    # have the stale partition_by / partition_granularity / max_history_days
    # actually go away (without this fix, those fields linger and either
    # confuse the dispatcher or trip the v26 conflict-policy validator on
    # the next edit).
    #
    # Contract change (Devin Review finding 0001): callers that previously
    # sent explicit `null` to mean "no-op, keep existing" will now have the
    # field cleared. In practice this is fine — the only known caller is
    # the Edit modal, which pre-populates form fields from the existing row
    # and JSON-encodes the populated (non-null) value back. CLI register-table
    # only POSTs new rows, never PUTs nulls. If a future client needs the
    # old "null = no-op" semantics for some field, it should omit the field
    # from the body instead of sending null — that's the canonical PUT shape.
    updates = request.model_dump(exclude_unset=True)
    # Run BQ-shape validation BEFORE persisting whenever the merged record
    # would be a bigquery row (existing was BQ, or the patch flips it to BQ,
    # or the patch touches BQ-relevant fields on an already-BQ row). Without
    # this gate, an admin could PUT `bucket="evil\"; DROP --"` onto a BQ
    # row and the next rebuild would silently fail at view-create time —
    # surface the bad shape at PUT time instead.
    if updates:
        # Preserve the original `registered_at` across PUTs — `repo.register`
        # now accepts it as an optional kwarg; without this the upsert would
        # stamp a fresh `now()` on every edit (issue #130).
        merged = dict(existing)
        merged.update(updates)
        merged.pop("id", None)  # avoid duplicate id kwarg

        # v52 + v56: per-table docs fields (sample_questions /
        # things_to_know / pairs_well_with + grain / platforms /
        # partition_col / history / gotchas) live on table_registry
        # but have their own PATCH /registry/{id}/docs endpoint.
        # ``repo.register()`` doesn't know them; stripping here keeps
        # the read-modify-write loop the PUT handler relies on
        # (existing → merged → register) from blowing up with
        # TypeError when the docs columns are populated.
        for _docs_key in (
            "sample_questions", "things_to_know", "pairs_well_with",
            "grain", "platforms", "partition_col", "history", "gotchas",
        ):
            merged.pop(_docs_key, None)

        # When switching the merged record away from materialized mode, drop
        # the stale source_query — the request validator can't clear it via
        # the `if v is not None` filter above. Without this, a remote/local
        # row would carry an orphan source_query in the registry.
        if merged.get("query_mode") != "materialized":
            merged["source_query"] = None

        # Cross-source coherence: query_mode='materialized' requires a
        # non-empty source_query for ALL source types, not just BigQuery.
        # BQ rows without source_query can be server-generated from
        # bucket+source_table (handled by _validate_bigquery_register_payload
        # via the synthetic RegisterTableRequest below). Non-BQ rows (e.g.
        # Keboola) still require an explicit source_query at PUT time.
        if merged.get("query_mode") == "materialized":
            sq = merged.get("source_query")
            if not sq or not str(sq).strip():
                # BQ rows: let _validate_bigquery_register_payload generate
                # source_query from bucket+source_table (falls through below).
                # Non-BQ rows: no server-generate fallback; raise 422.
                if merged.get("source_type") != "bigquery":
                    raise HTTPException(
                        status_code=422,
                        detail=(
                            "query_mode='materialized' requires a non-empty "
                            "source_query. To revert to a non-materialized mode, "
                            "PATCH query_mode='local' (Keboola) or 'remote' "
                            "(BigQuery) and the stale source_query is cleared "
                            "automatically."
                        ),
                    )
            # Backtick guard removed for materialized rows: the Task 2 wrapping
            # path (connectors.bigquery.extractor.materialize_query) now runs
            # admin SQL through the BQ jobs API using BQ-native syntax, which
            # requires backticks for dashed project/dataset identifiers.
            # Non-materialized rows still reject backticks in the model validator.

        if merged.get("source_type") == "bigquery":
            # Reuse the register-time validator. It mutates the request to
            # force query_mode='remote' / profile_after_sync=False (or to
            # leave a materialized row alone) — apply the same coercion to
            # `merged` so the persisted row matches.
            synthetic = RegisterTableRequest(
                name=merged.get("name") or table_id,
                bucket=merged.get("bucket"),
                source_table=merged.get("source_table"),
                source_query=merged.get("source_query"),
                source_type="bigquery",
                query_mode=merged.get("query_mode") or "remote",
                profile_after_sync=bool(merged.get("profile_after_sync") or False),
                primary_key=merged.get("primary_key"),
                description=merged.get("description"),
                folder=merged.get("folder"),
                sync_strategy=merged.get("sync_strategy") or "full_refresh",
                sync_schedule=merged.get("sync_schedule"),
            )
            _validate_bigquery_register_payload(synthetic)
            merged["query_mode"] = synthetic.query_mode
            merged["profile_after_sync"] = synthetic.profile_after_sync
            merged["source_query"] = synthetic.source_query

            # v51 — same bq_fqn validation as register-table. PUT can both
            # add a fresh bq_fqn or update an existing one; in either case
            # malformed values should reject at PUT time, not silently
            # land in the DB and break the next rebuild.
            if merged.get("bq_fqn"):
                from connectors.bigquery.extractor import parse_bq_fqn
                try:
                    parse_bq_fqn(merged["bq_fqn"])
                except ValueError as e:
                    raise HTTPException(status_code=422, detail=str(e))
        else:
            # Non-BQ row carrying bq_fqn is nonsensical — reject the same
            # way register-table does.
            if merged.get("bq_fqn"):
                raise HTTPException(
                    status_code=422,
                    detail="bq_fqn only applies to source_type='bigquery'",
                )

        repo.register(id=table_id, **merged)

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="update_table",
        resource=table_id,
        params=_sanitize_for_audit({"updated_fields": sorted(updates.keys()), **updates}),
    )

    # If we updated a BQ row (or one that's now BQ), refresh the extract in
    # the background so the view picks up renames / column-list changes.
    # Use the BG wrapper so any rebuild errors are logged at ERROR level
    # instead of being silently dropped by BackgroundTasks (which discards
    # return values).
    after = repo.get(table_id) or {}
    if after.get("source_type") == "bigquery":
        background.add_task(_materialize_bigquery_extract_bg)

    from app.api.v2_catalog import invalidate_for_table
    invalidate_for_table(table_id)

    return {"id": table_id, "updated": list(updates.keys())}


class _GotchaItem(BaseModel):
    """v56: a single gotcha entry. ``key=True`` marks the first one as
    the "Key gotcha" rendered distinctly by the package detail page."""
    key: bool = False
    body: str


class TableDocsRequest(BaseModel):
    """Per-table docs surface — v52 (sample_questions / things_to_know /
    pairs_well_with) extended in v56 with structured fields (grain /
    platforms / partition_col / history / gotchas) for the
    /catalog/p/<slug> package detail page rewrite.

    All fields optional. Sending `[]` for a list clears it; sending
    `""` for a scalar clears it; omitting leaves it untouched
    (Optional-is-no-op contract).
    """
    # v52 fields.
    sample_questions: Optional[List[str]] = None
    things_to_know: Optional[str] = None
    pairs_well_with: Optional[List[str]] = None
    # v56 fields.
    grain: Optional[str] = None
    platforms: Optional[List[str]] = None
    partition_col: Optional[str] = None
    history: Optional[str] = None
    gotchas: Optional[List[_GotchaItem]] = None

    @field_validator("platforms")
    @classmethod
    def _check_platforms(cls, v: Optional[List[str]]) -> Optional[List[str]]:
        if v is None:
            return None
        if len(v) > 8:
            raise ValueError("platforms: max 8 entries")
        return v

    @field_validator("gotchas")
    @classmethod
    def _check_gotchas(cls, v):
        if v is None:
            return None
        if len(v) > 8:
            raise ValueError("gotchas: max 8 entries")
        return v


@router.patch("/registry/{table_id}/docs")
async def update_table_docs(
    table_id: str,
    payload: TableDocsRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Write the admin-authored per-table docs read by /catalog/t/<id>
    and (for the v56 structured fields) by the per-table extended
    section on /catalog/p/<slug>. Separated from PUT /registry/{id} so
    admins can flip these fields without re-submitting the whole big
    registration payload."""
    repo = TableRegistryRepository(conn)
    if not repo.get(table_id):
        raise HTTPException(status_code=404, detail="table_not_found")
    # Empty-string ``things_to_know`` clears; explicit `[]` clears lists.
    clear_things = payload.things_to_know == ""
    clear_questions = payload.sample_questions == []
    clear_pairs = payload.pairs_well_with == []
    # v56 ``gotchas`` Pydantic models → list of dicts for the repo (JSON
    # serializer handles plain dicts; we'd lose the validator if we
    # passed _GotchaItem instances through).
    gotchas_payload = (
        [g.model_dump() for g in payload.gotchas]
        if payload.gotchas is not None
        else None
    )
    repo.update_docs(
        table_id,
        sample_questions=(
            None if clear_questions else payload.sample_questions
        ),
        things_to_know=(None if clear_things else payload.things_to_know),
        pairs_well_with=(
            None if clear_pairs else payload.pairs_well_with
        ),
        clear_sample_questions=clear_questions,
        clear_things_to_know=clear_things,
        clear_pairs_well_with=clear_pairs,
        # v56 — same Optional-is-no-op contract.
        grain=payload.grain,
        platforms=payload.platforms,
        partition_col=payload.partition_col,
        history=payload.history,
        gotchas=gotchas_payload,
    )
    # Echo the fresh state so the admin client can re-render without a
    # second GET. Lets the test suite (and the eventual admin UI) inspect
    # what landed in DB.
    fresh = repo.get(table_id) or {}
    return {
        "id": table_id,
        "sample_questions": fresh.get("sample_questions") or [],
        "things_to_know": fresh.get("things_to_know"),
        "pairs_well_with": fresh.get("pairs_well_with") or [],
        "grain": fresh.get("grain"),
        "platforms": fresh.get("platforms") or [],
        "partition_col": fresh.get("partition_col"),
        "history": fresh.get("history"),
        "gotchas": fresh.get("gotchas") or [],
    }


@router.delete("/registry/{table_id}", status_code=204)
async def unregister_table(
    table_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Unregister a table from the system.

    For BQ rows, schedules a background rebuild so the dropped row's
    master view is removed from analytics.duckdb (rather than hanging
    around until the next scheduled sync).

    For materialized rows, also removes the canonical parquet at
    `${DATA_DIR}/extracts/<source_type>/data/<id>.parquet` and clears
    the matching `sync_state` row. Without these two cleanups, the
    manifest endpoint kept advertising the dropped table to `agnes pull`
    (sync_state-driven) and the orchestrator's next rebuild could
    resurrect a master view from the leftover parquet (E2E sub-agent
    finding 2026-05-01).
    """
    repo = TableRegistryRepository(conn)
    existing = repo.get(table_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Table not found")

    was_bigquery = existing.get("source_type") == "bigquery"
    was_materialized = existing.get("query_mode") == "materialized"
    source_type = existing.get("source_type") or ""
    name = existing.get("name") or table_id

    repo.unregister(table_id)

    # Drop the canonical parquet for materialized rows. Path layout:
    # `${DATA_DIR}/extracts/<source_type>/data/<name>.parquet` — the
    # filename is keyed by `table_registry.name` (matches sync_state
    # bookkeeping convention; see _run_materialized_pass + the manifest
    # builder for the same name-keyed lookup). Defensively remove the
    # `.parquet.tmp` sibling too in case a prior materialize crashed
    # mid-COPY. Failure to remove (file missing, permission error) is
    # logged but doesn't fail the DELETE — the registry row is already
    # gone, and the orphan parquet will not produce a master view at
    # next rebuild because the orchestrator's _meta-driven scan never
    # picks up bare parquet files.
    if was_materialized and source_type in ("bigquery", "keboola"):
        try:
            data_dir = Path(os.environ.get("DATA_DIR", "./data"))
            base = data_dir / "extracts" / source_type / "data"
            for candidate in (
                base / f"{name}.parquet",
                base / f"{name}.parquet.tmp",
            ):
                if candidate.exists():
                    candidate.unlink()
                    logger.info(
                        "Removed materialized parquet for unregistered table %s: %s",
                        table_id, candidate,
                    )
        except Exception as e:
            logger.warning(
                "Failed to remove materialized parquet for %s: %s — registry row is "
                "still dropped; clean up the file manually if it lingers",
                table_id, e,
            )

    # Clear sync_state for any source/mode (a row that was synced at any
    # point — local/materialized — has a sync_state entry that the manifest
    # serves regardless of registry state). Pre-fix, the manifest still
    # advertised the dropped table to `agnes pull` because sync_state was
    # never cleaned up, and analysts kept getting it through the manifest.
    try:
        conn.execute("DELETE FROM sync_state WHERE table_id = ?", [name])
        conn.execute("DELETE FROM sync_history WHERE table_id = ?", [name])
    except Exception as e:
        logger.warning(
            "Failed to clear sync_state for unregistered table %s: %s — "
            "manifest may still advertise the dropped row to agnes pull",
            table_id, e,
        )

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="unregister_table",
        resource=table_id,
        params=_sanitize_for_audit({
            "name": existing.get("name"),
            "source_type": existing.get("source_type"),
            "bucket": existing.get("bucket"),
            "source_table": existing.get("source_table"),
        }),
    )

    from app.api.v2_catalog import invalidate_for_table
    invalidate_for_table(table_id)

    if was_bigquery:
        background.add_task(_materialize_bigquery_extract_bg)


@router.post("/configure")
async def configure_instance(
    request: ConfigureRequest,
    user: dict = Depends(require_admin),
):
    """Configure data source and instance settings via API.

    Writes config to instance.yaml and persists secrets to .env_overlay.
    AI agents and the /setup wizard use this instead of manual file editing.
    """
    import yaml

    if request.data_source not in ("keboola", "bigquery", "local"):
        raise HTTPException(status_code=400, detail="data_source must be 'keboola', 'bigquery', or 'local'")

    # Validate credentials if provided
    if request.data_source == "keboola":
        if not request.keboola_token or not request.keboola_url:
            raise HTTPException(status_code=400, detail="keboola_token and keboola_url are required for Keboola data source")
        _validate_url_not_private(request.keboola_url, field_name="keboola_url")
        try:
            from connectors.keboola.client import KeboolaClient
            client = KeboolaClient(token=request.keboola_token, url=request.keboola_url)
            client.test_connection()
        except Exception as e:
            logger.error("Keboola connection validation failed: %s", e)
            raise HTTPException(status_code=400, detail="Keboola connection failed. Check your token and URL.")

    elif request.data_source == "bigquery":
        if not request.bigquery_project:
            raise HTTPException(status_code=400, detail="bigquery_project is required for BigQuery data source")

    # Write instance.yaml to DATA_DIR/state/ (writable Docker volume),
    # NOT to CONFIG_DIR which is mounted read-only in Docker.
    #
    # Narrow-overlay write strategy — must match `/api/admin/server-config`:
    # 1. Read overlay verbatim (do NOT fall back to static). Falling back
    #    would copy env-resolved cleartext secrets from the merged static
    #    file back into the overlay (e.g. `smtp_password: ${SMTP_PASSWORD}`
    #    → `smtp_password: hunter2`). The wizard only ever sets
    #    `instance`, `auth`, `data_source` here, so other sections must
    #    flow from the static file via `load_instance_config`'s deep-merge
    #    — they don't belong in the overlay at all.
    # 2. Patch only the sections this endpoint touches.
    # 3. Write the narrow overlay back atomically (tmp + os.replace).
    from app.secrets import _state_dir
    config_path = _state_dir() / "instance.yaml"

    # Same serialization + corrupt-overlay handling as POST /server-config.
    with _overlay_write_lock:
        overlay: dict = {}
        if config_path.exists():
            try:
                overlay = yaml.safe_load(config_path.read_text()) or {}
            except Exception as e:
                logger.exception("configure: refusing to overwrite corrupt overlay at %s", config_path)
                raise HTTPException(
                    status_code=500,
                    detail=f"refusing to overwrite corrupt overlay at {config_path} ({e}); "
                           "back up and remove the file, or fix it by hand",
                ) from e

        # Merge instance settings into the overlay only — never seed from the
        # env-resolved merged config.
        if request.instance_name:
            overlay.setdefault("instance", {})["name"] = request.instance_name

        if request.allowed_domain:
            overlay.setdefault("auth", {})["allowed_domain"] = request.allowed_domain

        # data_source is fully owned by this endpoint — replace wholesale.
        overlay["data_source"] = {"type": request.data_source}
        if request.data_source == "keboola":
            overlay["data_source"]["keboola"] = {
                "stack_url": request.keboola_url,
                "token_env": "KEBOOLA_STORAGE_TOKEN",
            }
        elif request.data_source == "bigquery":
            overlay["data_source"]["bigquery"] = {
                "project": request.bigquery_project,
                "location": request.bigquery_location or "us",
            }

        # Seed an ai: block on first-time setup so LLM-driven services
        # (corporate_memory, verification_detector) can boot without manual
        # YAML editing. Only inserts when the overlay has no ai: yet AND an
        # appropriate env var is present — never overwrites operator config,
        # never writes a placeholder block (#176).
        if "ai" not in overlay:
            anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
            llm_key = os.environ.get("LLM_API_KEY", "").strip()
            if anthropic_key:
                overlay["ai"] = {
                    "provider": "anthropic",
                    "api_key": "${ANTHROPIC_API_KEY}",
                    "model": "claude-haiku-4-5-20251001",
                    "structured_output": "auto",
                }
            elif llm_key:
                overlay["ai"] = {
                    "provider": "anthropic",
                    "api_key": "${LLM_API_KEY}",
                    "model": "claude-haiku-4-5-20251001",
                    "structured_output": "auto",
                }

        # Atomic write to writable data volume — same tmp + os.replace pattern
        # as the server-config editor so a concurrent save can't tear the file.
        config_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = config_path.with_suffix(config_path.suffix + ".tmp")
        tmp_path.write_text(yaml.dump(overlay, default_flow_style=False, sort_keys=False))
        os.replace(tmp_path, config_path)
        logger.info("Wrote instance config to %s", config_path)

    # Persist secrets to .env_overlay (in data volume, never in git)
    secrets_to_persist = {}
    if request.keboola_token:
        secrets_to_persist["KEBOOLA_STORAGE_TOKEN"] = request.keboola_token
    if request.keboola_url:
        secrets_to_persist["KEBOOLA_STACK_URL"] = request.keboola_url

    if secrets_to_persist:
        # Resolve via _state_dir() so the path matches app/main.py's
        # startup-time read of the same overlay. Without this, an operator
        # on the flat-mount layout (STATE_DIR=/data-state) would write
        # secrets to /data/state/.env_overlay here while the app reads
        # from /data-state/.env_overlay — silent loss on next restart.
        from app.secrets import _state_dir
        overlay_path = _state_dir() / ".env_overlay"
        overlay_path.parent.mkdir(parents=True, exist_ok=True)

        # Merge with existing overlay
        existing_overlay = {}
        if overlay_path.exists():
            for line in overlay_path.read_text().splitlines():
                if "=" in line and not line.startswith("#"):
                    k, v = line.split("=", 1)
                    existing_overlay[k.strip()] = v.strip()
        existing_overlay.update(secrets_to_persist)

        overlay_path.write_text(
            "\n".join(f"{k}={v}" for k, v in existing_overlay.items()) + "\n"
        )
        try:
            overlay_path.chmod(0o600)
        except OSError:
            pass
        logger.info("Persisted %d secrets to .env_overlay", len(secrets_to_persist))

        # Inject into current process environment
        for k, v in secrets_to_persist.items():
            os.environ[k] = v

    # Invalidate cached instance config so next read picks up changes.
    # Use the public helper (matches `/api/admin/server-config`); reaching
    # into the private global silently breaks if the cache layout changes.
    from app.instance_config import reset_cache
    reset_cache()

    return {
        "status": "ok",
        "data_source": request.data_source,
        "connection": "verified" if request.data_source != "local" else "local",
    }


def _split_keboola_table_id(full_id: str, fallback_name: str = "") -> tuple[str, str]:
    """Split a Keboola table id into ``(bucket, source_table)``.

    Keboola convention: ``<stage>.<bucket-id>.<table>`` where stage ∈
    ``{in, out, sys}`` and bucket-id typically starts with ``c-``
    (e.g. ``in.c-finance.orders``). Storage API export-async needs the
    FULL ``<stage>.<bucket-id>`` as the bucket arg — a stripped
    ``c-finance`` 404s. The 2-segment fallback covers id strings
    without the stage prefix; the 0/1-segment path returns empty
    bucket and uses ``fallback_name`` as the table name so the row
    fails loud at sync time rather than silently registering with
    no source coordinates.
    """
    parts = (full_id or "").strip().split(".")
    if len(parts) >= 3:
        return ".".join(parts[:-1]), parts[-1]
    if len(parts) == 2:
        return parts[0], parts[1]
    return "", fallback_name or full_id


def _build_keboola_discovery_plan(
    conn: duckdb.DuckDBPyConnection, discovered: list[dict],
) -> dict:
    """Inspect ``discovered`` (output of ``KeboolaClient.discover_all_tables``)
    against the live registry and bucket every entry into one of:

      - ``new``: not in registry, will be inserted.
      - ``existing_match``: row already in registry under the same id
        AND its ``(bucket, source_table)`` matches what discovery would
        write — no-op, nothing to do.
      - ``existing_drift``: a row in the registry conflicts with what
        discovery would write. Two flavours, both surfaced for operator
        visibility but **never overwritten**:

          1. Same registry id, different ``(bucket, source_table)`` —
             admin corrected the coordinates inline (rarer).
          2. Different registry id but the discovered ``name`` clashes
             with an existing row's ``name`` (case-insensitive). Real
             example: registry has ``id='kbc_job', name='kbc_job',
             bucket='in.c-kbc_telemetry'``; Keboola exposes the same
             logical table at id ``in.c-keboola-storage.job`` (which
             slugs to a different ``table_id``). Without this
             check, auto-discovery would insert a duplicate ``kbc_job``
             whose Storage API export-async 404s.

      - ``invalid``: id couldn't produce a usable ``table_id`` slug.

    Each bucket carries the exact rows; the API endpoint composes a
    summary + (optionally) executes. Pre-fix, this logic was inlined
    in ``_discover_and_register_tables`` and there was no way to see
    what would change without writing.
    """
    repo = TableRegistryRepository(conn)
    # Pre-load all keboola rows once so the name-collision lookup
    # below is O(1) per discovered entry. Falls back to per-id
    # `repo.get(...)` calls when list_all isn't available — keeps
    # the single-row test stubs working without forcing them to
    # implement list_all.
    try:
        all_rows = [r for r in repo.list_all() if r.get("source_type") == "keboola"]
    except AttributeError:
        all_rows = []
    by_name: dict[str, dict] = {
        (r.get("name") or "").strip().lower(): r for r in all_rows
    }

    plan = {"new": [], "existing_match": [], "existing_drift": [], "invalid": []}
    for table in discovered:
        full_id = (table.get("id") or "").strip()
        # Slug used as the registry primary key. Lowercase, dots/spaces
        # → underscores. Stable across discovery runs.
        table_id = full_id.lower().replace(".", "_").replace(" ", "_")
        if not table_id:
            plan["invalid"].append({
                "table_id": "",
                "full_id": full_id,
                "reason": "empty id from discovery payload",
            })
            continue

        # Prefer Keboola's authoritative `bucket_id` (separate field in
        # the API response, normalised by `discover_all_tables`) over
        # parsing the full id string. Fall back to the parser when
        # the API didn't return bucket_id (older fallback path inside
        # discover_all_tables).
        bucket = (table.get("bucket_id") or "").strip()
        name = (table.get("name") or "").strip()
        source_table = name
        if not bucket or not source_table:
            bucket, source_table = _split_keboola_table_id(full_id, source_table)

        entry = {
            "table_id": table_id,
            "name": table.get("name", table_id),
            "full_id": full_id,
            "bucket": bucket,
            "source_table": source_table,
        }

        existing = repo.get(table_id)
        if existing is not None:
            ex_bucket = existing.get("bucket") or ""
            ex_source_table = existing.get("source_table") or ""
            if ex_bucket == bucket and ex_source_table == source_table:
                plan["existing_match"].append(entry)
            else:
                plan["existing_drift"].append({
                    **entry,
                    "registry_bucket": ex_bucket,
                    "registry_source_table": ex_source_table,
                    "registry_id": existing.get("id"),
                    "drift_kind": "same_id_diff_coords",
                })
            continue

        # No row at this id. Look for a name collision (admin
        # registered the same logical table under a different id).
        name_match = by_name.get(name.lower()) if name else None
        if name_match is not None:
            plan["existing_drift"].append({
                **entry,
                "registry_bucket": name_match.get("bucket") or "",
                "registry_source_table": name_match.get("source_table") or "",
                "registry_id": name_match.get("id"),
                "drift_kind": "name_collision",
            })
            continue

        plan["new"].append(entry)
    return plan


def _discover_and_register_tables(
    conn: duckdb.DuckDBPyConnection,
    user_email: str,
    *,
    dry_run: bool = False,
) -> dict:
    """Discover tables from configured source and register them.

    Behavior:
      - Only the configured source type ``keboola`` is supported here
        (BigQuery uses a different discovery endpoint).
      - Already-registered rows are NEVER overwritten. The plan
        classifies them as ``existing_match`` (no-op, registry agrees
        with discovery) or ``existing_drift`` (admin edited the
        coordinates; left alone, surfaced in the response so the
        operator sees the divergence).
      - ``dry_run=True`` returns the plan without writing anything —
        useful for auditing before a re-discovery on a registry that
        already has admin overrides.
    """
    from app.instance_config import get_data_source_type, get_value

    source_type = get_data_source_type()
    if source_type != "keboola":
        return {
            "registered": 0,
            "skipped": 0,
            "errors": 0,
            "drifted": 0,
            "tables": [],
            "source": source_type,
            "dry_run": dry_run,
        }

    from connectors.keboola.client import KeboolaClient
    # Read from data_source.keboola (matches what /api/admin/configure writes)
    url = get_value("data_source", "keboola", "stack_url", default="")
    token_env = get_value("data_source", "keboola", "token_env", default="KEBOOLA_STORAGE_TOKEN")
    token = os.environ.get(token_env, "") if token_env else ""
    if not token:
        token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")

    client = KeboolaClient(token=token, url=url)
    discovered = client.discover_all_tables()

    plan = _build_keboola_discovery_plan(conn, discovered)
    drift_summary = [
        {
            "table_id": e["table_id"],
            "discovery": {"bucket": e["bucket"], "source_table": e["source_table"]},
            "registry":  {"bucket": e["registry_bucket"],
                           "source_table": e["registry_source_table"]},
        }
        for e in plan["existing_drift"]
    ]

    if dry_run:
        return {
            "registered": 0,
            "skipped": len(plan["existing_match"]),
            "errors": len(plan["invalid"]),
            "drifted": len(plan["existing_drift"]),
            "tables": [e["table_id"] for e in plan["new"]],
            "would_register": [e["table_id"] for e in plan["new"]],
            "drift": drift_summary,
            "invalid": plan["invalid"],
            "source": "keboola",
            "dry_run": True,
        }

    repo = TableRegistryRepository(conn)
    registered = 0
    errors = 0
    table_names = []

    for entry in plan["new"]:
        try:
            repo.register(
                id=entry["table_id"],
                name=entry["name"],
                source_type="keboola",
                bucket=entry["bucket"],
                source_table=entry["source_table"],
                # Keboola goes through Storage API export-async via the
                # materialized path (NULL source_query = full table). The
                # legacy `local` mode for Keboola was retired in v26 and
                # would no-op here anyway.
                query_mode="materialized",
                registered_by=user_email,
                description=f"Auto-discovered from Keboola: {entry['full_id']}",
            )
            registered += 1
            table_names.append(entry["table_id"])
        except Exception as e:
            logger.warning("Failed to register %s: %s", entry["table_id"], e)
            errors += 1

    if plan["existing_drift"]:
        logger.warning(
            "Auto-discover skipped %d row(s) where the admin-edited "
            "bucket/source_table differs from discovery — preserving "
            "the admin values. Run with dry_run=True to see the deltas.",
            len(plan["existing_drift"]),
        )

    return {
        "registered": registered,
        "skipped": len(plan["existing_match"]),
        "errors": errors + len(plan["invalid"]),
        "drifted": len(plan["existing_drift"]),
        "tables": table_names,
        "drift": drift_summary,
        "invalid": plan["invalid"],
        "source": "keboola",
        "dry_run": False,
    }


@router.post("/discover-and-register")
async def discover_and_register(
    dry_run: bool = False,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Discover tables from configured source and auto-register them.

    Combines discover-tables + register-table into one call. Already-
    registered rows are NEVER overwritten — admin edits to bucket /
    source_table win. The response surfaces a ``drift`` array listing
    any rows where discovery would have written different coordinates
    than what's in the registry, so operators can audit divergence
    after a Keboola-side bucket rename / table move.

    Query params:
      - ``dry_run=true`` returns the plan without writing anything.
        Lists ``would_register``, ``drift``, and ``invalid`` so an
        operator can decide whether to proceed (or, in the drift case,
        which side they want to fix).

    Used by /setup wizard and AI agents.
    """
    try:
        result = _discover_and_register_tables(
            conn, user.get("email", "admin"), dry_run=dry_run,
        )
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Discovery and registration failed: {e}")


# ---------------------------------------------------------------------------
# Scheduler-driven LLM pipeline endpoints (#176)
#
# The scheduler container drives these via HTTP rather than running them
# in-process — same reasoning as the existing /api/marketplaces/sync-all
# job: DuckDB allows only one writer per file across processes, and the
# app keeps a long-lived handle on system.duckdb. Routing through the app
# inherits the existing connection without contention.
#
# Each endpoint is `def` (sync), so FastAPI runs it in a thread pool —
# the underlying jobs do blocking I/O (LLM calls, DuckDB writes,
# filesystem scans). Running on the asyncio thread would block health
# checks for the duration of a job.
# ---------------------------------------------------------------------------


@router.post("/run-session-collector")
def run_session_collector(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the session-collector job from the scheduler.

    Walks /home/*/user/sessions/*.jsonl and copies new files into
    /data/user_sessions/<user>/. Idempotent — already-collected files
    are skipped.
    """
    from services.session_collector import collector

    # Call run() not main(): main() does argparse.parse_args() which would
    # try to parse uvicorn's sys.argv and SystemExit(2) the worker.
    rc: int = 1
    stats: dict = {}
    job_error: Optional[Exception] = None
    try:
        rc, stats = collector.run(dry_run=False, verbose=False)
    except Exception as e:
        # Mirror run_verification_detector / run_corporate_memory
        # (#179 review): capture any unhandled error so audit_log +
        # /admin/scheduler-runs reflect the failure. Re-raised below
        # after audit. Filesystem permission, OSError on /home walking,
        # etc. are realistic failure modes worth surfacing.
        job_error = e

    audit_params: dict = {"rc": rc, **stats}
    if job_error is not None:
        audit_params["unhandled_error"] = f"{type(job_error).__name__}: {job_error}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_session_collector",
        resource="job:session-collector",
        params=audit_params,
    )

    if job_error is not None:
        raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])

    return {"ok": rc == 0, "details": {"rc": rc, **stats}}


@router.post("/run-session-processor")
def run_session_processor(
    processor: str = Query(..., description="Processor name (e.g. 'verification', 'usage')"),
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger one session-pipeline processor against /data/user_sessions/*.

    Replaces the per-processor /run-* endpoints with a single parametrized
    entry. The scheduler invokes this once per registered processor on its
    own cadence; processors are independent (one slow / failing processor
    can't block any other).

    Returns 400 if `processor` is unknown. The verification processor
    requires an LLM extractor — if the instance has no ai: config and no
    ANTHROPIC_API_KEY / LLM_API_KEY, it won't appear in the registry and
    the call returns 400 the same as a misspelled name.
    """
    from services.session_pipeline.runner import run_processor as _run_processor
    from services.session_processors import get_processor, list_processor_names
    from src.db import get_system_db

    proc = get_processor(processor)
    if proc is None:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unknown processor '{processor}'. "
                f"Known: {', '.join(list_processor_names())}"
            ),
        )

    # Reject overlapping invocations of the same processor (PR #232 review).
    # See `_get_processor_run_lock` docstring for why this matters
    # (verification_evidence row duplication on race).
    proc_lock = _get_processor_run_lock(processor)
    if not proc_lock.acquire(blocking=False):
        raise HTTPException(
            status_code=409,
            detail=f"Processor '{processor}' is already running",
        )

    job_conn = get_system_db()
    stats: dict = {}
    job_error: Optional[Exception] = None
    try:
        stats = _run_processor(job_conn, proc)
        # Rebuild daily rollups after a successful usage run so the
        # marketplace / admin dashboards see fresh aggregates. Runs on the
        # same connection while it's still open; incremental (last-7-days)
        # so it's cheap. Kept here (not in runner.py) to stay
        # processor-agnostic at the framework level.
        if processor == "usage" and stats.get("errors", 0) == 0:
            from services.session_processors.usage_lib import rebuild_rollups
            try:
                rebuild_rollups(job_conn)
            except Exception as rollup_exc:
                logger.warning("usage rollup rebuild failed: %s", rollup_exc)
    except Exception as e:
        # Capture and re-raise after audit so an unhandled runner error
        # (DuckDB lock, network blip, unexpected SDK type) still leaves a
        # row in audit_log — the /admin/scheduler-runs page is the
        # operator's only signal beyond docker logs.
        job_error = e
    finally:
        try:
            job_conn.close()
        except Exception:
            pass
        # Always release, even if the runner raised. A leaked lock would
        # wedge the processor permanently until process restart.
        proc_lock.release()

    audit_params: dict = {
        "processor": processor,
        "scanned": stats.get("scanned", 0),
        "processed": stats.get("processed", 0),
        "skipped": stats.get("skipped", 0),
        "errors": stats.get("errors", 0),
        "items_extracted": stats.get("items_extracted", 0),
    }
    if job_error is not None:
        audit_params["unhandled_error"] = f"{type(job_error).__name__}: {job_error}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action=f"run_session_processor:{processor}",
        resource=f"job:session-processor:{processor}",
        params=audit_params,
    )

    if job_error is not None:
        raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])

    return {"ok": stats.get("errors", 0) == 0, "details": stats}


@router.post("/run-corporate-memory")
def run_corporate_memory(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the corporate-memory catalog refresh from the scheduler.

    Reads all CLAUDE.local.md files, sends them through the LLM with the
    existing catalog, and writes an updated catalog to knowledge.json.
    """
    from services.corporate_memory.collector import collect_all

    # Fail-fast (#176): collect_all raises ValueError when no ai: block AND
    # no env keys are present. Surface the actionable factory message in a
    # 500 instead of letting it crash the request anonymously.
    stats: dict = {}
    job_error: Optional[Exception] = None
    try:
        stats = collect_all(dry_run=False)
    except ValueError as e:
        # Already-translated misconfiguration → 500 with actionable message
        # but no audit row (the request never reached the LLM stage).
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        # Mirror run_verification_detector (#179 review): capture any other
        # unhandled error so audit_log + /admin/scheduler-runs reflect the
        # failure. Re-raised below after audit.
        job_error = e

    audit_params: dict = {
        "items_new": stats.get("items_new", 0),
        "items_filtered": stats.get("items_filtered", 0),
        "errors": len(stats.get("errors", [])),
        "skipped": stats.get("skipped", False),
    }
    if job_error is not None:
        audit_params["unhandled_error"] = f"{type(job_error).__name__}: {job_error}"

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_corporate_memory",
        resource="job:corporate-memory",
        params=audit_params,
    )

    if job_error is not None:
        raise HTTPException(status_code=500, detail=audit_params["unhandled_error"])

    return {"ok": not stats.get("errors"), "details": stats}


# ---------------------------------------------------------------------------
# Flea-market guardrails — admin endpoints
#
# Backs /admin/store/submissions (the human triage page) and the override /
# retry / delete-submission action buttons. Every action here writes an
# audit_log row so the trail of "who force-published what, and why" is
# permanent — same governance posture as the corporate-memory + scheduler
# runs surfaces.
# ---------------------------------------------------------------------------

import shutil as _shutil


@router.get("/store/submissions")
async def admin_list_store_submissions(
    status: Optional[str] = None,
    submitter: Optional[str] = None,
    type: Optional[str] = None,  # noqa: A002 — FastAPI query-param name
    name: Optional[str] = None,
    version: Optional[str] = None,
    sort: Optional[str] = None,
    order: Optional[str] = None,
    limit: int = 100,
    skip: int = 0,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """List flea-market guardrail submissions newest-first.

    All filters AND together. ``status`` is comma-separated
    (e.g. ``blocked_inline,blocked_llm``). ``submitter`` matches
    ``submitter_id`` exactly. ``type`` is one of ``skill`` / ``agent`` /
    ``plugin``. ``name`` and ``version`` are case-insensitive substrings.
    ``limit`` clamped to [1, 500].
    """
    from src.repositories.store_submissions import StoreSubmissionsRepository

    statuses = None
    if status:
        statuses = [s.strip() for s in status.split(",") if s.strip()]
    if type and type not in {"skill", "agent", "plugin"}:
        raise HTTPException(status_code=400, detail="invalid_type")
    limit = max(1, min(int(limit), 500))
    skip = max(0, int(skip))

    # v36+ chip routing: 'archived' / 'deleted' tokens in ?status=
    # are LIFECYCLE filters, not verdict filters. The repo handles the
    # JOIN-on-entity logic for archived; submission terminal marker
    # for deleted. Verdict tokens (approved, blocked_*, pending_*,
    # overridden, review_error) pass through unchanged.
    lifecycle = None
    if statuses == ["archived"]:
        lifecycle = "archived"
        statuses = None
    elif statuses == ["deleted"]:
        lifecycle = "deleted"
        statuses = None

    try:
        items, total = StoreSubmissionsRepository(conn).list_for_admin(
            status=statuses,
            submitter_id=submitter or None,
            type_=type or None,
            name_substr=name or None,
            version_substr=version or None,
            sort_by=sort or None,
            sort_order=order or None,
            lifecycle=lifecycle,
            limit=limit, skip=skip,
        )
    except ValueError as e:
        # Sort key whitelist rejection (#23) — surface as 400 so the UI
        # can show the operator a meaningful message instead of 500.
        msg = str(e)
        if msg.startswith("invalid_sort_key"):
            raise HTTPException(status_code=400, detail="invalid_sort_key")
        raise
    return {"items": items, "total": total, "limit": limit, "skip": skip}


@router.get("/store/submissions/{submission_id}")
async def admin_get_store_submission(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    from src.repositories.store_submissions import StoreSubmissionsRepository

    sub = StoreSubmissionsRepository(conn).get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    return sub


class _OverrideRequest(BaseModel):
    reason: str = Field(..., min_length=4, max_length=2000)


@router.post("/store/submissions/{submission_id}/override")
async def admin_override_store_submission(
    submission_id: str,
    body: _OverrideRequest,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Force-publish a previously-blocked submission.

    Flips the submission to ``status='overridden'`` and the linked
    store_entities row to ``visibility_status='approved'``. Audit row
    captures who, why, and the verdict that was overridden so the next
    time this submission shows up, the trail is intact.
    """
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository

    subs = StoreSubmissionsRepository(conn)
    sub = subs.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    if sub["status"] not in {"blocked_inline", "blocked_llm", "review_error", "pending_llm"}:
        raise HTTPException(
            status_code=409,
            detail=f"cannot_override_status:{sub['status']}",
        )

    entity_id = sub.get("entity_id")
    if not entity_id:
        # v30+ ought to always carry entity_id. Legacy rows from the
        # pre-v30 inline-rollback design land here — refuse with a
        # message that points at the only path forward (Delete +
        # ask submitter to re-upload).
        raise HTTPException(
            status_code=409,
            detail="cannot_override_legacy_without_entity",
        )

    subs.set_override(submission_id, admin_user_id=user["id"], reason=body.reason)
    ents_repo = StoreEntitiesRepository(conn)
    ents_repo.set_visibility(entity_id, "approved")

    # Mirror the runner's deferred-promotion path. An override on a
    # v2+ edit/restore must promote the overridden version + swap the
    # on-disk live bundle, otherwise the entity stays at the prior
    # approved version and installers keep receiving stale bytes the
    # admin just told us to replace. For an initial v1 submission
    # (no prior approved) the version_no already matches — the loop
    # just no-ops and we skip promotion harmlessly.
    entity_row = ents_repo.get(entity_id) or {}
    promoted_to: Optional[int] = None
    # Look up THIS submission's version entry by submission_id, NOT
    # by hash. Hash-based lookup breaks when the user re-uploads
    # byte-identical bundles (e.g. v2 same content as v1): the loop
    # picks the FIRST history entry with that hash (always v1, n=1),
    # so target_version_no lands at 1 instead of the actual new
    # entry's n. The forward-only `target > current` guard then
    # skips the promote, leaving the entity stuck at v1. Surfaced
    # live on a development deployment.
    from app.api.store import _version_no_for_submission
    target_version_no: Optional[int] = _version_no_for_submission(
        entity_row, submission_id,
    )
    # Forward-only: refuse to promote backwards. An admin overriding a
    # stale v2 submission when v3 is already approved + live must NOT
    # demote the live bundle back to v2's bytes. Override flips the
    # row's status + visibility regardless; only the version-promote
    # is gated. Forward (target > current) is the only motion the
    # publish-gate model is designed to express.
    if (target_version_no is not None
            and target_version_no > int(entity_row.get("version_no") or 0)):
        # Atomic helper: swap live bundle first, then update the DB.
        # Eliminates the "DB promoted but live still on prior bytes"
        # window. If the helper returns None (source missing / swap
        # failed) the row's status + visibility are still flipped
        # above — admin can re-trigger via /rescan once the bundle
        # is recovered.
        from app.api.store import promote_to_version
        promoted_to = promote_to_version(
            entity_id, target_version_no, ents_repo,
        )
        if promoted_to is not None:
            # Re-read after promotion so attribution picks up the
            # new version's name/type if a rename was bundled in.
            entity_row = ents_repo.get(entity_id) or entity_row

    # v46: attribution lookup is live — the next UsageProcessor tick
    # preloads the newly-approved entity by name.

    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.overridden",
        resource=f"store_submission:{submission_id}",
        params={
            "entity_id": entity_id,
            "reason": body.reason,
            "prior_status": sub["status"],
            "prior_findings": sub.get("llm_findings"),
            "prior_inline": sub.get("inline_checks"),
            "promoted_to_version_no": promoted_to,
        },
        result="ok",
    )
    return {"ok": True, "submission_id": submission_id, "entity_id": entity_id}


@router.post("/store/submissions/{submission_id}/rescan")
async def admin_rescan_store_submission(
    submission_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Re-run **all** guardrail checks (inline + LLM) against the current
    bundle.

    Different from ``/retry``: rescan starts from scratch (re-runs the
    deterministic inline checks too) and is allowed regardless of
    current status. Use when check rules have changed and a previously-
    approved entity might now fail (or vice versa).

    Effects:
      * inline checks run sync; verdict written to ``inline_checks``
      * on inline fail → ``status='blocked_inline'``, entity hidden
      * on inline pass → ``status='pending_llm'``, LLM call scheduled,
        entity visibility flipped to ``pending`` until verdict lands
      * audit_log entry recorded for both outcomes — admin sees the
        rescan in the detail-page activity timeline
      * audit row recorded

    Requires the bundle to still be on disk. Inline-blocked submissions
    whose bundle was rolled back (no ``entity_id``) cannot be rescanned —
    nothing to scan.
    """
    from app.api.store import (
        _plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )
    from src.db import get_system_db
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.store_guardrails import run_inline_checks
    from src.store_guardrails.runner import (
        default_api_key_loader,
        default_model_loader,
        run_llm_review,
    )
    from app.instance_config import (
        get_guardrails_enabled,
        get_guardrails_llm_provider_ready,
    )

    subs = StoreSubmissionsRepository(conn)
    sub = subs.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(status_code=409, detail="cannot_rescan_without_entity")

    ents = StoreEntitiesRepository(conn)
    entity = ents.get(entity_id)
    # Rescan the bundle this submission represents — not live. See the
    # equivalent fix in /retry for the full reasoning. Same fall-back
    # to live for legacy rows that never seeded a versions/v<N>/plugin/.
    target_n = _version_no_for_submission(entity or {}, submission_id)
    if target_n is not None:
        plugin_dir = _submission_plugin_dir(entity_id, target_n)
        if not plugin_dir.exists():
            plugin_dir = _plugin_dir(entity_id)
    else:
        plugin_dir = _plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    description = (entity or {}).get("description")

    inline = run_inline_checks(
        plugin_dir, type_=sub["type"], description=description,
    )

    if not inline.passed:
        # Re-failed inline. Hide the entity (was approved or pending);
        # admin can either fix the bundle (PUT to recreate) or override.
        subs.conn.execute(
            "UPDATE store_submissions SET inline_checks = ?, llm_findings = NULL, "
            "status = 'blocked_inline', updated_at = current_timestamp "
            "WHERE id = ?",
            [__import__("json").dumps(inline.to_response_dict()), submission_id],
        )
        ents.set_visibility(entity_id, "hidden")
        AuditRepository(conn).log(
            user_id=user["id"],
            action="store.submission.rescan",
            resource=f"store_submission:{submission_id}",
            params={"entity_id": entity_id, "outcome": "blocked_inline"},
        )
        return {"ok": True, "submission_id": submission_id, "status": "blocked_inline"}

    # Inline passes. Three-state matrix:
    #   - intent False           → auto-approve (operator opt-out)
    #   - intent True + ready    → pending_llm, schedule LLM
    #   - intent True + not-ready → pending_llm, DO NOT schedule (admin
    #     retries from the same endpoint after providing credentials)
    guardrails_enabled = get_guardrails_enabled()
    provider_ready = get_guardrails_llm_provider_ready()
    hold_for_review = guardrails_enabled
    schedule_async_llm = guardrails_enabled and provider_ready
    guardrails_on = hold_for_review  # retained for audit-log compat
    new_status = "pending_llm" if hold_for_review else "approved"
    subs.conn.execute(
        "UPDATE store_submissions SET inline_checks = ?, llm_findings = NULL, "
        "status = ?, updated_at = current_timestamp "
        "WHERE id = ?",
        [__import__("json").dumps(inline.to_response_dict()), new_status, submission_id],
    )
    if hold_for_review:
        ents.set_visibility(entity_id, "pending")
    else:
        ents.set_visibility(entity_id, "approved")
        # Guardrails explicitly disabled — immediately live. Promote
        # the rescanned submission's version forward (same atomic
        # helper the create / update / restore inline-promote paths
        # use). Pre-fix this branch flipped visibility but never
        # called promote_to_version, so a rescan that re-approved a
        # non-current v2+ left the entity stuck at the prior version.
        # Surfaced by adversarial review of PR #330.
        from app.api.store import promote_to_version
        entity_row = ents.get(entity_id) or {}
        if target_n is not None and target_n > int(entity_row.get("version_no") or 0):
            promote_to_version(entity_id, target_n, ents)
        # v46: attribution lookup is live — no explicit refresh needed.
    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.rescan",
        resource=f"store_submission:{submission_id}",
        params={"entity_id": entity_id, "outcome": new_status,
                "guardrails_enabled": guardrails_on,
                "provider_ready": provider_ready},
    )
    if schedule_async_llm:
        background.add_task(
            run_llm_review,
            submission_id,
            plugin_dir=plugin_dir,
            conn_factory=get_system_db,
            api_key_loader=default_api_key_loader,
            model_loader=default_model_loader,
        )
    return {"ok": True, "submission_id": submission_id, "status": new_status}


@router.post("/store/submissions/{submission_id}/retry")
async def admin_retry_store_submission(
    submission_id: str,
    background: BackgroundTasks,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Re-queue the LLM review for a submission.

    Eligible statuses:
      * ``review_error`` — LLM call failed, admin retrying after the
        underlying issue (rate limit, timeout, transient outage) clears.
      * ``blocked_llm`` — admin disagrees with the prior verdict; rerun
        from a clean slate (review rules may have shifted since).
      * ``pending_llm`` — submission was held when the LLM provider had
        no credentials in env (fail-CLOSED matrix: intent True + not
        ready). Admin sets the key and re-fires from here.

    Only valid when the original submission's plugin tree is still on
    disk — for inline-blocked rows the bundle was deleted at POST time.
    """
    from app.api.store import (
        _plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )
    from src.db import get_system_db
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.store_guardrails.runner import (
        default_api_key_loader,
        default_model_loader,
        run_llm_review,
    )

    subs = StoreSubmissionsRepository(conn)
    sub = subs.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    if sub["status"] not in {"review_error", "blocked_llm", "pending_llm"}:
        raise HTTPException(
            status_code=409, detail=f"cannot_retry_status:{sub['status']}",
        )
    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(
            status_code=409, detail="cannot_retry_without_entity",
        )

    # Review the STAGED version's bytes — not live. For a v2+ edit
    # held at pending_llm or blocked_llm, live `plugin/` still holds
    # the prior approved version. Reviewing live would produce a
    # verdict against the wrong bytes; the runner's hash-match
    # promotion would then advance the entity to staged bytes that
    # were never actually reviewed.
    ent = StoreEntitiesRepository(conn).get(entity_id) or {}
    target_n = _version_no_for_submission(ent, submission_id)
    if target_n is not None:
        plugin_dir = _submission_plugin_dir(entity_id, target_n)
        # Fall back to live for legacy pre-v37 rows where the version
        # dir was never seeded.
        if not plugin_dir.exists():
            plugin_dir = _plugin_dir(entity_id)
    else:
        plugin_dir = _plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    subs.update_status(submission_id, status="pending_llm")
    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.retry",
        resource=f"store_submission:{submission_id}",
        params={"entity_id": entity_id},
    )
    background.add_task(
        run_llm_review,
        submission_id,
        plugin_dir=plugin_dir,
        conn_factory=get_system_db,
        api_key_loader=default_api_key_loader,
        model_loader=default_model_loader,
    )
    return {"ok": True, "submission_id": submission_id, "status": "pending_llm"}


@router.delete("/store/submissions/{submission_id}", status_code=204)
async def admin_delete_store_submission(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Hard-delete a submission record + its linked bundle (if any).

    Use this for spam / accidental uploads after override-publish is the
    wrong call. The audit_log row preserves what was deleted in case
    triage needs the evidence trail later.
    """
    from app.api.store import _entity_dir
    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository
    from src.repositories.user_store_installs import UserStoreInstallsRepository

    subs = StoreSubmissionsRepository(conn)
    sub = subs.get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")

    entity_id = sub.get("entity_id")
    if entity_id:
        UserStoreInstallsRepository(conn).delete_all_for_entity(entity_id)
        StoreEntitiesRepository(conn).delete(entity_id)
        _shutil.rmtree(_entity_dir(entity_id), ignore_errors=True)
    conn.execute("DELETE FROM store_submissions WHERE id = ?", [submission_id])

    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.deleted",
        resource=f"store_submission:{submission_id}",
        params={
            "entity_id": entity_id,
            "submitter_id": sub.get("submitter_id"),
            "name": sub.get("name"),
            "status": sub.get("status"),
        },
    )


# ---------------------------------------------------------------------------
# v30: download blocked bundle for forensic inspection
# ---------------------------------------------------------------------------

from fastapi.responses import StreamingResponse


@router.get("/store/submissions/{submission_id}/bundle.zip")
async def admin_download_store_submission_bundle(
    submission_id: str,
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Stream the on-disk bundle as a fresh ZIP for admin inspection.

    Required by the forensic use case: admin needs to inspect what a
    submitter actually tried to upload (not just the verdict). Bundle
    must still be on disk — TTL purge nulls ``entity_id`` and removes
    the directory, in which case this returns 410.
    """
    import io as _io
    import zipfile as _zipfile
    from pathlib import Path as _P
    from app.api.store import (
        _plugin_dir as _sp_plugin_dir,
        _submission_plugin_dir,
        _version_no_for_submission,
    )

    from src.repositories.store_entities import StoreEntitiesRepository
    from src.repositories.store_submissions import StoreSubmissionsRepository

    sub = StoreSubmissionsRepository(conn).get(submission_id)
    if sub is None:
        raise HTTPException(status_code=404, detail="submission_not_found")
    entity_id = sub.get("entity_id")
    if not entity_id:
        raise HTTPException(status_code=410, detail="bundle_purged_or_missing")

    # Resolve the STAGED bundle this submission represents, not live.
    # Under deferred promotion, live `plugin/` holds the prior approved
    # version — so for a blocked v2 row, live shows v1's safe bytes
    # while the staged v2 bytes (the actual risky upload the admin is
    # reviewing) sit in `versions/v2/plugin/`. Falls back to live for
    # legacy rows that never seeded a versions/ dir.
    ent = StoreEntitiesRepository(conn).get(entity_id) or {}
    target_n = _version_no_for_submission(ent, submission_id)
    if target_n is not None:
        plugin_dir = _submission_plugin_dir(entity_id, target_n)
        if not plugin_dir.exists():
            plugin_dir = _sp_plugin_dir(entity_id)
    else:
        plugin_dir = _sp_plugin_dir(entity_id)
    if not plugin_dir.exists():
        raise HTTPException(status_code=410, detail="bundle_missing")

    AuditRepository(conn).log(
        user_id=user["id"],
        action="store.submission.bundle_downloaded",
        resource=f"store_submission:{submission_id}",
        params={"entity_id": entity_id, "name": sub.get("name")},
    )

    buf = _io.BytesIO()
    with _zipfile.ZipFile(buf, "w", _zipfile.ZIP_DEFLATED) as zf:
        for f in sorted(_P(plugin_dir).rglob("*")):
            if not f.is_file():
                continue
            arcname = f.relative_to(plugin_dir).as_posix()
            zf.write(f, arcname)
    buf.seek(0)

    safe_name = (sub.get("name") or "bundle").replace("/", "_")
    filename = f"{safe_name}-{submission_id[:8]}.zip"
    return StreamingResponse(
        buf,
        media_type="application/zip",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )


# ---------------------------------------------------------------------------
# v30: scheduled TTL purge of blocked bundle bytes
# ---------------------------------------------------------------------------


@router.post("/run-blocked-purge")
async def run_blocked_purge(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the TTL purge of blocked bundle bytes.

    Wraps :func:`src.store_guardrails.purge.purge_blocked_bundles`. The
    scheduler service hits this endpoint daily (under
    ``SCHEDULER_API_TOKEN`` like the corporate-memory + verification
    jobs); admins can also run it on demand from the UI.
    """
    from app.instance_config import get_guardrails_blocked_bundle_ttl_days
    from src.store_guardrails.purge import purge_blocked_bundles

    ttl = get_guardrails_blocked_bundle_ttl_days()
    result = purge_blocked_bundles(conn, ttl_days=ttl)

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_blocked_purge",
        resource="job:store-blocked-purge",
        params={"ttl_days": ttl, "purged": result.get("purged", 0),
                "skipped": result.get("skipped", False)},
    )
    return {"ok": True, "details": result}


@router.post("/run-reap-stuck-reviews")
async def run_reap_stuck_reviews(
    user: dict = Depends(require_admin),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Trigger the stuck-review reaper.

    Wraps :func:`src.store_guardrails.reaper.reap_stuck_llm_reviews`.
    The scheduler hits this every 15 minutes; admins can run it on
    demand if a worker crash is suspected. Flips any
    ``status='pending_llm'`` row older than the configured grace to
    ``review_error`` so the queue stops growing indefinitely.
    """
    from app.instance_config import get_guardrails_stuck_review_grace_seconds
    from src.store_guardrails.reaper import reap_stuck_llm_reviews

    grace = get_guardrails_stuck_review_grace_seconds()
    result = reap_stuck_llm_reviews(conn, grace_seconds=grace)

    AuditRepository(conn).log(
        user_id=user.get("id"),
        action="run_reap_stuck_reviews",
        resource="job:store-reap-stuck-reviews",
        params={"grace_seconds": grace,
                "reaped": result.get("reaped", 0),
                "skipped": result.get("skipped", False)},
    )
    return {"ok": True, "details": result}