agnes-the-ai-analyst/tests/test_admin_bq_register.py

"""Tests for BigQuery table registration via admin API + UI + CLI.

Covers issue #108 Milestone 1:
- /api/admin/register-table validation matrix for BQ rows
- /api/admin/register-table/precheck happy + sad paths (mocked
  google.cloud.bigquery.Client)
- View-name collision detection (409 distinct from id collision)
- Audit log entries on register/update/unregister with secret masking
- Sync wiring: register-then-list round-trip exercises
  bigquery.extractor.rebuild_from_registry + SyncOrchestrator.rebuild
- Admin UI: /admin/tables renders BQ vs Keboola fields based on
  data_source.type
- CLI: da admin register-table --dry-run hits /precheck
"""

import json
from unittest.mock import MagicMock, patch

import pytest
from pydantic import ValidationError

from app.api.admin import RegisterTableRequest, UpdateTableRequest


def _auth(token):
    return {"Authorization": f"Bearer {token}"}


def _bq_payload(**overrides):
    """Minimal valid BQ register payload, override with kwargs per test."""
    p = {
        "name": "orders",
        "source_type": "bigquery",
        "bucket": "analytics",
        "source_table": "orders",
        "query_mode": "remote",
    }
    p.update(overrides)
    return p


@pytest.fixture
def bq_instance(monkeypatch):
    """Force instance.yaml to look like a BigQuery deployment for the
    duration of one test. Patches the cached load_instance_config so
    /admin/server-config reads / get_value('data_source.bigquery.project')
    return what we want, without touching the on-disk instance.yaml."""
    fake_cfg = {
        "data_source": {
            "type": "bigquery",
            "bigquery": {
                "project": "my-test-project",
                "location": "us",
            },
        },
    }
    # Patch every read path we know consumers use, plus reset_cache.
    monkeypatch.setattr(
        "app.instance_config.load_instance_config",
        lambda: fake_cfg,
        raising=False,
    )
    # get_value walks the merged dict; load is the source, so the patch
    # above is enough — but reset cache to avoid a stale read poisoning
    # the test.
    from app.instance_config import reset_cache
    reset_cache()
    yield fake_cfg
    reset_cache()


@pytest.fixture
def stub_bq_extractor(monkeypatch):
    """Replace rebuild_from_registry + SyncOrchestrator.rebuild with mocks
    so the API's post-register materialize doesn't try to hit real BQ."""
    rebuild_mock = MagicMock(return_value={
        "project_id": "my-test-project",
        "tables_registered": 1,
        "errors": [],
        "skipped": False,
    })
    monkeypatch.setattr(
        "connectors.bigquery.extractor.rebuild_from_registry",
        rebuild_mock,
    )
    orch_mock = MagicMock()
    monkeypatch.setattr(
        "src.orchestrator.SyncOrchestrator",
        lambda *a, **kw: orch_mock,
    )
    return {"rebuild": rebuild_mock, "orchestrator": orch_mock}


# --- API: register-table for BigQuery ----------------------------------------


class TestBigQueryRegisterValidation:
    def test_missing_bucket_returns_422(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(bucket=""),
            headers=_auth(token),
        )
        assert resp.status_code == 422
        assert "bucket" in resp.json()["detail"].lower()

    def test_missing_source_table_returns_422(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(source_table=""),
            headers=_auth(token),
        )
        assert resp.status_code == 422
        assert "source_table" in resp.json()["detail"].lower()

    def test_unsafe_view_name_returns_400(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        # `name` becomes the DuckDB view name (after lower+slug). A bare
        # hyphen is fine in BQ but not in a DuckDB strict identifier — must
        # fail at register time, not at first rebuild.
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="orders-2026"),
            headers=_auth(token),
        )
        assert resp.status_code == 400
        assert "view name" in resp.json()["detail"].lower()

    def test_unsafe_dataset_returns_400(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(bucket='evil"dataset'),
            headers=_auth(token),
        )
        assert resp.status_code == 400

    def test_unsafe_source_table_returns_400(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(source_table='orders;DROP'),
            headers=_auth(token),
        )
        assert resp.status_code == 400

    def test_wildcard_source_table_returns_400(self, seeded_app, bq_instance, stub_bq_extractor):
        """Wildcard / sharded BQ tables are deferred to M3 (Decision 8)."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(source_table="events_*"),
            headers=_auth(token),
        )
        assert resp.status_code == 400
        assert "wildcard" in resp.json()["detail"].lower()

    def test_invalid_source_type_returns_422(self, seeded_app):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json={"name": "x", "source_type": "snowflake"},
            headers=_auth(token),
        )
        assert resp.status_code == 422

    def test_missing_project_in_yaml_returns_400(self, seeded_app, monkeypatch, stub_bq_extractor):
        """If data_source.bigquery.project isn't set, the BQ branch must
        refuse to register — we'd hit the missing-project error at first
        rebuild anyway, but registering a row that can never materialize
        is an operator footgun."""
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {"data_source": {"type": "bigquery", "bigquery": {}}},
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            token = seeded_app["admin_token"]
            resp = c.post(
                "/api/admin/register-table",
                json=_bq_payload(),
                headers=_auth(token),
            )
            assert resp.status_code == 400
            assert "project" in resp.json()["detail"].lower()
        finally:
            reset_cache()

    def test_malformed_project_id_returns_400(self, seeded_app, monkeypatch, stub_bq_extractor):
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "Bad Project With Spaces"},
                }
            },
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            token = seeded_app["admin_token"]
            resp = c.post(
                "/api/admin/register-table",
                json=_bq_payload(),
                headers=_auth(token),
            )
            assert resp.status_code == 400
            assert "malformed" in resp.json()["detail"].lower() or "grammar" in resp.json()["detail"].lower()
        finally:
            reset_cache()


class TestBigQueryRegisterCoercion:
    """The server must force query_mode='remote' and profile_after_sync=False
    on BQ rows (Decision 7) — even if the caller posts the wrong values."""

    def test_query_mode_forced_to_remote(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(query_mode="local", profile_after_sync=True),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202), resp.text

        # Read it back and confirm the registry has the forced values, not
        # the caller-supplied ones.
        resp = c.get("/api/admin/registry", headers=_auth(token))
        row = next(t for t in resp.json()["tables"] if t["name"] == "orders")
        assert row["query_mode"] == "remote"
        assert row["profile_after_sync"] is False


class TestBigQueryRegisterCollision:
    def test_id_collision_returns_409(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post("/api/admin/register-table", json=_bq_payload(), headers=_auth(token))
        assert resp.status_code in (200, 202)

        resp = c.post("/api/admin/register-table", json=_bq_payload(), headers=_auth(token))
        assert resp.status_code == 409
        assert "already" in resp.json()["detail"].lower()

    def test_view_name_collision_returns_409(self, seeded_app, bq_instance, stub_bq_extractor):
        """Two different display names that slugify to the same id is the
        id-collision case above. View-name collision is for two callers
        who pick the SAME display name `name` — same view, different rows.
        Pre-fix the second call would silently win at next rebuild
        (orchestrator picks the row whose extract was attached last)."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="orders", bucket="ds_a"),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202)

        # Same `name` (== view_name) — must 409 even though id derivation
        # would also collide; the pre-check is independent.
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="orders", bucket="ds_b", source_table="orders2"),
            headers=_auth(token),
        )
        assert resp.status_code == 409


class TestBigQueryRegisterAuth:
    def test_register_requires_admin(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["analyst_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(),
            headers=_auth(token),
        )
        assert resp.status_code == 403

    def test_register_requires_auth(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        resp = c.post("/api/admin/register-table", json=_bq_payload())
        assert resp.status_code == 401


class TestBigQueryRegisterMaterialize:
    """The server must call rebuild_from_registry + SyncOrchestrator.rebuild
    after a successful BQ register (Decision 1). Verify by stubbing both
    and asserting they fired."""

    def test_register_invokes_rebuild_and_orchestrator(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202), resp.text
        # Either the synchronous path or the BackgroundTask path; both must
        # fire. BackgroundTasks run after the response in TestClient, which
        # blocks until completion.
        assert stub_bq_extractor["rebuild"].called, "rebuild_from_registry not called"
        assert stub_bq_extractor["orchestrator"].rebuild.called, "orchestrator.rebuild not called"

    def test_register_returns_200_with_view_name_on_sync_success(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(),
            headers=_auth(token),
        )
        # In tests the materialize is fast enough to land synchronously.
        assert resp.status_code == 200, resp.text
        body = resp.json()
        assert body["status"] == "ok"
        assert body["view_name"] == "orders"


# --- API: precheck endpoint --------------------------------------------------


class _FakeBQTable:
    """Stand-in for google.cloud.bigquery.Table — only the attributes the
    precheck route reads."""

    def __init__(self, num_rows=1234, num_bytes=99999, schema=None):
        self.num_rows = num_rows
        self.num_bytes = num_bytes
        self.schema = schema or [
            MagicMock(name="id", field_type="INT64"),
            MagicMock(name="created_at", field_type="TIMESTAMP"),
        ]
        # Configure name attribute on each schema entry — MagicMock(name=…) is
        # the *mock's* name, not an attribute, so we set it explicitly.
        names = ["id", "created_at"]
        for col, n in zip(self.schema, names):
            col.name = n


class TestBigQueryPrecheck:
    def test_precheck_happy_path(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]

        fake_client = MagicMock()
        fake_client.get_table.return_value = _FakeBQTable()
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(),
                headers=_auth(token),
            )
        assert resp.status_code == 200, resp.text
        body = resp.json()
        assert body["ok"] is True
        t = body["table"]
        assert t["rows"] == 1234
        assert t["size_bytes"] == 99999
        assert t["column_count"] == 2
        names = [c["name"] for c in t["columns"]]
        assert names == ["id", "created_at"]
        assert t["project_id"] == "my-test-project"

    def test_precheck_not_found_returns_404(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        from google.api_core import exceptions as google_exc
        fake_client = MagicMock()
        fake_client.get_table.side_effect = google_exc.NotFound("missing")
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(),
                headers=_auth(token),
            )
        assert resp.status_code == 404
        assert "not found" in resp.json()["detail"].lower()

    def test_precheck_forbidden_returns_403(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        from google.api_core import exceptions as google_exc
        fake_client = MagicMock()
        fake_client.get_table.side_effect = google_exc.Forbidden("nope")
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(),
                headers=_auth(token),
            )
        assert resp.status_code == 403
        assert "metadata.get" in resp.json()["detail"]

    def test_precheck_other_error_returns_400(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        fake_client = MagicMock()
        fake_client.get_table.side_effect = RuntimeError("auth failed")
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(),
                headers=_auth(token),
            )
        assert resp.status_code == 400

    def test_precheck_no_db_write(self, seeded_app, bq_instance):
        """Precheck must not touch table_registry — operator inspects the
        result, decides whether to commit, then calls register-table."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]

        fake_client = MagicMock()
        fake_client.get_table.return_value = _FakeBQTable()
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(name="precheck_only"),
                headers=_auth(token),
            )

        resp = c.get("/api/admin/registry", headers=_auth(token))
        names = [t["name"] for t in resp.json()["tables"]]
        assert "precheck_only" not in names

    def test_precheck_validates_before_calling_bq(self, seeded_app, bq_instance):
        """Validation runs before the BQ round-trip — bogus identifiers
        must not result in a real BQ call."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(source_table="bad;name"),
                headers=_auth(token),
            )
        assert resp.status_code == 400
        cls.assert_not_called()

    def test_precheck_requires_admin(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["analyst_token"]
        resp = c.post(
            "/api/admin/register-table/precheck",
            json=_bq_payload(),
            headers=_auth(token),
        )
        assert resp.status_code == 403

    def test_precheck_keboola_skips_bq_roundtrip(self, seeded_app):
        """Non-BQ source types get validation-only precheck — no GCP call."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json={
                    "name": "kb_orders",
                    "source_type": "keboola",
                    "bucket": "in.c-crm",
                    "source_table": "orders",
                },
                headers=_auth(token),
            )
        assert resp.status_code == 200
        cls.assert_not_called()
        body = resp.json()
        assert body["ok"] is True
        # M1 documents this as validation-only via the response note.
        assert "validation-only" in body["table"].get("note", "")


# --- API: audit log entries ---------------------------------------------------


class TestRegistryAuditLog:
    """Decision 4: every registry mutation writes an audit_log row."""

    def _list_audit(self, conn, action):
        from src.repositories.audit import AuditRepository
        return AuditRepository(conn).query(action=action, limit=10)

    def test_register_keboola_writes_audit_entry(self, seeded_app):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json={"name": "kb_aud", "source_type": "keboola", "bucket": "in.c-crm"},
            headers=_auth(token),
        )
        assert resp.status_code == 201

        from src.db import get_system_db
        conn = get_system_db()
        try:
            rows = self._list_audit(conn, "register_table")
        finally:
            conn.close()
        assert any(r["resource"] == "kb_aud" for r in rows), \
            f"register_table audit entry not found in {rows}"

    def test_register_bq_writes_audit_entry(self, seeded_app, bq_instance, stub_bq_extractor):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        c.post("/api/admin/register-table", json=_bq_payload(name="bq_aud"), headers=_auth(token))

        from src.db import get_system_db
        conn = get_system_db()
        try:
            rows = self._list_audit(conn, "register_table")
        finally:
            conn.close()
        match = [r for r in rows if r["resource"] == "bq_aud"]
        assert match, f"register_table audit entry not found for bq_aud: {rows}"
        params = json.loads(match[0]["params"])
        assert params["source_type"] == "bigquery"
        assert params["bucket"] == "analytics"

    def test_audit_masks_secret_keyed_fields(self, seeded_app):
        """Even though the registry payload doesn't normally carry secrets,
        the sanitizer must mask any secret-looking key. Confirm by posting
        a synthetic field — the API ignores unknown fields, but the audit
        path runs `model_dump` so we can't test via the wire. Instead test
        the helper directly."""
        from app.api.admin import _sanitize_for_audit
        out = _sanitize_for_audit({
            "name": "x",
            "api_token": "hunter2",
            "bot_secret": "abc",
            "primary_key": ["id"],
            "description": "raw description stays raw",
            "password": "p",
        })
        assert out["name"] == "x"
        assert out["api_token"] == "***"
        assert out["bot_secret"] == "***"
        assert out["password"] == "***"
        assert out["primary_key"] == ["id"]  # whitelisted
        assert out["description"] == "raw description stays raw"

    def test_update_writes_audit_entry(self, seeded_app):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        c.post(
            "/api/admin/register-table",
            json={"name": "kb_upd", "source_type": "keboola"},
            headers=_auth(token),
        )
        resp = c.put(
            "/api/admin/registry/kb_upd",
            json={"description": "updated"},
            headers=_auth(token),
        )
        assert resp.status_code == 200, resp.text

        from src.db import get_system_db
        conn = get_system_db()
        try:
            rows = self._list_audit(conn, "update_table")
        finally:
            conn.close()
        assert any(r["resource"] == "kb_upd" for r in rows)

    def test_unregister_writes_audit_entry(self, seeded_app):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        c.post(
            "/api/admin/register-table",
            json={"name": "kb_del", "source_type": "keboola"},
            headers=_auth(token),
        )
        resp = c.delete("/api/admin/registry/kb_del", headers=_auth(token))
        assert resp.status_code == 204

        from src.db import get_system_db
        conn = get_system_db()
        try:
            rows = self._list_audit(conn, "unregister_table")
        finally:
            conn.close()
        assert any(r["resource"] == "kb_del" for r in rows)


# --- bigquery.extractor.rebuild_from_registry --------------------------------


class TestRebuildFromRegistry:
    def test_returns_skipped_when_no_bq_rows(self, e2e_env, monkeypatch):
        """No BigQuery rows in registry → skipped=True, no extract written."""
        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "ok-project"},
                }
            },
        )
        # Empty registry — get_system_db returns the test DB, fresh.
        from connectors.bigquery import extractor as bq
        fake_init = MagicMock()
        monkeypatch.setattr(bq, "init_extract", fake_init)

        result = bq.rebuild_from_registry()

        assert result["skipped"] is True
        assert result["tables_registered"] == 0
        fake_init.assert_not_called()

    def test_calls_init_extract_with_registry_rows(self, e2e_env, monkeypatch):
        from connectors.bigquery import extractor as bq
        from src.db import get_system_db
        from src.repositories.table_registry import TableRegistryRepository

        # Seed one BQ row.
        conn = get_system_db()
        try:
            TableRegistryRepository(conn).register(
                id="orders",
                name="orders",
                source_type="bigquery",
                bucket="analytics",
                source_table="orders",
                query_mode="remote",
                profile_after_sync=False,
            )
        finally:
            conn.close()

        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "ok-project"},
                }
            },
        )
        fake_init = MagicMock(return_value={"tables_registered": 1, "errors": []})
        monkeypatch.setattr(bq, "init_extract", fake_init)

        result = bq.rebuild_from_registry()

        assert result["skipped"] is False
        assert result["project_id"] == "ok-project"
        fake_init.assert_called_once()
        args, kwargs = fake_init.call_args
        # init_extract(output_dir, project_id, table_configs)
        assert args[1] == "ok-project"
        names = [t["name"] for t in args[2]]
        assert "orders" in names

    def test_missing_project_returns_error(self, e2e_env, monkeypatch):
        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {"data_source": {"type": "bigquery", "bigquery": {}}},
        )
        from connectors.bigquery import extractor as bq
        result = bq.rebuild_from_registry()
        assert result["project_id"] == ""
        assert result["errors"]
        assert "project" in result["errors"][0]["error"].lower()


# --- UI: /admin/tables renders BQ vs Keboola fields --------------------------


class TestAdminTablesUI:
    def test_renders_bq_fields_when_data_source_bigquery(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        c.cookies.set("access_token", seeded_app["admin_token"])
        try:
            resp = c.get("/admin/tables", headers={"Accept": "text/html"})
        finally:
            c.cookies.clear()
        assert resp.status_code == 200, resp.text
        body = resp.text
        # Modal carries the source type so the JS can branch.
        assert 'data-source-type="bigquery"' in body
        # BQ-only inputs.
        assert 'id="bqDataset"' in body
        assert 'id="bqSourceTable"' in body
        assert 'id="bqViewName"' in body
        assert 'id="bqSyncSchedule"' in body
        # Inline hint about scheduler-not-yet-wired (Decision 3).
        assert "scheduler" in body.lower()
        # BQ-specific panel (no discovery for BQ in M1).
        assert 'data-test="bq-register-panel"' in body
        # Keboola-only inputs must NOT be present.
        assert 'id="regTableId"' not in body
        assert 'id="regBucket"' not in body

    def test_renders_keboola_fields_when_data_source_keboola(self, seeded_app, monkeypatch):
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {"data_source": {"type": "keboola"}},
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            c.cookies.set("access_token", seeded_app["admin_token"])
            try:
                resp = c.get("/admin/tables", headers={"Accept": "text/html"})
            finally:
                c.cookies.clear()
            assert resp.status_code == 200
            body = resp.text
            assert 'data-source-type="keboola"' in body
            # Keboola path — discovery panel + Keboola inputs.
            assert 'id="discoveryResults"' in body
            assert 'id="regBucket"' in body
            assert 'id="regTableName"' in body
            # BQ-only inputs MUST NOT be present.
            assert 'id="bqDataset"' not in body
        finally:
            reset_cache()

    def test_admin_tables_requires_admin(self, seeded_app):
        c = seeded_app["client"]
        c.cookies.set("access_token", seeded_app["analyst_token"])
        try:
            resp = c.get("/admin/tables", follow_redirects=False)
        finally:
            c.cookies.clear()
        assert resp.status_code in (302, 401, 403)


# --- CLI: da admin register-table --dry-run ----------------------------------


class TestCliRegisterTableDryRun:
    def _resp(self, status_code=200, json_data=None, text=""):
        r = MagicMock()
        r.status_code = status_code
        r.json.return_value = json_data if json_data is not None else {}
        r.text = text
        return r

    def test_dry_run_calls_precheck_endpoint(self, monkeypatch, tmp_path):
        from typer.testing import CliRunner
        from cli.main import app
        runner = CliRunner()

        captured = {}

        def fake_post(path, json=None, **kwargs):
            captured["path"] = path
            captured["payload"] = json
            return self._resp(
                200,
                {
                    "ok": True,
                    "table": {
                        "name": "orders",
                        "source_type": "bigquery",
                        "bucket": "analytics",
                        "source_table": "orders",
                        "project_id": "my-test-project",
                        "rows": 100,
                        "size_bytes": 4096,
                        "columns": [
                            {"name": "id", "type": "INT64"},
                            {"name": "created_at", "type": "TIMESTAMP"},
                        ],
                        "column_count": 2,
                    },
                },
            )

        monkeypatch.setenv("DA_CONFIG_DIR", str(tmp_path))
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        with patch("cli.commands.admin.api_post", side_effect=fake_post):
            result = runner.invoke(app, [
                "admin", "register-table", "orders",
                "--source-type", "bigquery",
                "--bucket", "analytics",
                "--source-table", "orders",
                "--dry-run",
            ])
        assert result.exit_code == 0, result.output
        assert captured["path"] == "/api/admin/register-table/precheck"
        # No DB write happened (we only mocked api_post).
        assert "DRY RUN" in result.output
        assert "rows:" in result.output
        assert "id" in result.output
        assert "created_at" in result.output

    def test_dry_run_failure_exits_nonzero(self, monkeypatch, tmp_path):
        from typer.testing import CliRunner
        from cli.main import app
        runner = CliRunner()

        monkeypatch.setenv("DA_CONFIG_DIR", str(tmp_path))
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        with patch(
            "cli.commands.admin.api_post",
            return_value=self._resp(404, {"detail": "BigQuery table not found"}, "404"),
        ):
            result = runner.invoke(app, [
                "admin", "register-table", "missing",
                "--source-type", "bigquery",
                "--bucket", "analytics",
                "--source-table", "missing",
                "--dry-run",
            ])
        assert result.exit_code == 1
        assert "not found" in result.output.lower()

    def test_register_without_dry_run_still_works(self, monkeypatch, tmp_path):
        """Backwards compat — the existing flag set unchanged."""
        from typer.testing import CliRunner
        from cli.main import app
        runner = CliRunner()

        monkeypatch.setenv("DA_CONFIG_DIR", str(tmp_path))
        monkeypatch.setenv("DATA_DIR", str(tmp_path))

        captured = {}

        def fake_post(path, json=None, **kwargs):
            captured["path"] = path
            return self._resp(201, {"id": "x", "name": "x", "status": "registered"})

        with patch("cli.commands.admin.api_post", side_effect=fake_post):
            result = runner.invoke(app, [
                "admin", "register-table", "orders",
                "--source-type", "keboola",
                "--bucket", "in.c-crm",
            ])
        assert result.exit_code == 0
        assert captured["path"] == "/api/admin/register-table"

    def test_register_handles_202_response(self, monkeypatch, tmp_path):
        """BQ register can return 202 when materialize exceeds the budget."""
        from typer.testing import CliRunner
        from cli.main import app
        runner = CliRunner()

        monkeypatch.setenv("DA_CONFIG_DIR", str(tmp_path))
        monkeypatch.setenv("DATA_DIR", str(tmp_path))

        with patch(
            "cli.commands.admin.api_post",
            return_value=self._resp(202, {"id": "x", "name": "x", "status": "accepted", "view_name": "x"}),
        ):
            result = runner.invoke(app, [
                "admin", "register-table", "orders",
                "--source-type", "bigquery",
                "--bucket", "analytics",
                "--source-table", "orders",
            ])
        assert result.exit_code == 0
        assert "background" in result.output.lower()


# --- Review fixes for #108 M1 ------------------------------------------------


class TestKeboolaRegisterStatusCode:
    """Status-code contract: the route no longer carries `status_code=201` on
    its decorator — each branch returns its own. Keboola (non-BQ) must still
    explicitly return 201 with the registered-row body."""

    def test_keboola_register_returns_201(self, seeded_app):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json={
                "name": "kb_status",
                "source_type": "keboola",
                "bucket": "in.c-crm",
                "source_table": "orders",
                "query_mode": "local",
            },
            headers=_auth(token),
        )
        assert resp.status_code == 201, resp.text
        body = resp.json()
        assert body["id"] == "kb_status"
        assert body["status"] == "registered"


class TestUpdateTableBigQueryValidation:
    """PUT /api/admin/registry/{id} must run the BQ-shape validator whenever
    the merged record would be a BQ row, including the case where the patch
    flips source_type from keboola → bigquery (review IMPORTANT-4)."""

    def test_put_keboola_row_to_bq_with_bad_project_returns_4xx(
        self, seeded_app, monkeypatch,
    ):
        from app.instance_config import reset_cache
        # Set a malformed project_id in instance.yaml so the BQ validator
        # rejects the merged row at PUT time.
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "Bad Project With Spaces"},
                }
            },
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            token = seeded_app["admin_token"]
            # Seed a Keboola row first.
            resp = c.post(
                "/api/admin/register-table",
                json={
                    "name": "rev4",
                    "source_type": "keboola",
                    "bucket": "in.c-crm",
                    "source_table": "rev4",
                    "query_mode": "local",
                },
                headers=_auth(token),
            )
            assert resp.status_code == 201

            # Now PATCH it to bigquery — must run BQ validation and 4xx
            # because the project_id is bogus.
            resp = c.put(
                "/api/admin/registry/rev4",
                json={
                    "source_type": "bigquery",
                    "bucket": "analytics",
                    "source_table": "rev4",
                },
                headers=_auth(token),
            )
            assert resp.status_code in (400, 422), resp.text
        finally:
            reset_cache()

    def test_put_existing_bq_row_with_bad_bucket_returns_400(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        """An admin PATCH that mutates `bucket` on an existing BQ row to an
        unsafe identifier must be rejected before the registry write."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        # Register a BQ row.
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="rev4_bq"),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202), resp.text

        # PATCH bucket to an unsafe identifier — must 400.
        resp = c.put(
            "/api/admin/registry/rev4_bq",
            json={"bucket": 'evil";DROP'},
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text

    def test_put_preserves_registered_at_across_edits(self, seeded_app):
        """Issue #130 — PUT /api/admin/registry/{id} must NOT reset
        registered_at on every edit. The original timestamp from the initial
        register call must survive subsequent PUTs."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        # Initial registration.
        resp = c.post(
            "/api/admin/register-table",
            json={
                "name": "preserve_ts",
                "source_type": "keboola",
                "bucket": "in.c-crm",
                "source_table": "preserve_ts",
                "query_mode": "local",
            },
            headers=_auth(token),
        )
        assert resp.status_code == 201, resp.text
        # Read the timestamp the registry actually stored.
        listing = c.get("/api/admin/registry", headers=_auth(token)).json()
        original_ts = next(
            r for r in listing["tables"] if r["id"] == "preserve_ts"
        )["registered_at"]
        assert original_ts  # not None / empty
        # Edit the row — PUT a description change.
        resp = c.put(
            "/api/admin/registry/preserve_ts",
            json={"description": "edited"},
            headers=_auth(token),
        )
        assert resp.status_code == 200, resp.text
        # Re-read; registered_at must still match the original.
        listing2 = c.get("/api/admin/registry", headers=_auth(token)).json()
        post_edit_ts = next(
            r for r in listing2["tables"] if r["id"] == "preserve_ts"
        )["registered_at"]
        assert post_edit_ts == original_ts, (
            f"registered_at changed across PUT: was {original_ts!r}, "
            f"now {post_edit_ts!r}"
        )


class TestAuditAllowlistMasking:
    """Review IMPORTANT-5: explicit allowlist instead of substring scan.

    Asserts that:
      - field names containing 'token'/'key'/'secret' as substrings are NOT
        masked unless they're in the explicit allowlist; and
      - known-secret fields IN the allowlist are still masked.
    """

    def test_substring_match_does_not_mask_unknown_fields(self):
        from app.api.admin import _sanitize_for_audit
        out = _sanitize_for_audit({
            # All of these would have been masked by the old substring
            # scan but should now flow through cleartext — they aren't
            # actual credentials.
            "not_actually_a_token": "literal value",
            "primary_key": ["id"],
            "primary_key_hash": "deadbeef",
            "passwordless": "no creds here",
            "secretly_an_int": 42,
        })
        assert out["not_actually_a_token"] == "literal value"
        assert out["primary_key"] == ["id"]
        assert out["primary_key_hash"] == "deadbeef"
        assert out["passwordless"] == "no creds here"
        assert out["secretly_an_int"] == 42

    def test_allowlisted_secret_fields_are_masked(self):
        from app.api.admin import _sanitize_for_audit
        out = _sanitize_for_audit({
            "keboola_token": "kbc-1234",
            "client_secret": "abc",
            "smtp_password": "p",
            "bot_token": "tg-1",
            "name": "kept-raw",
        })
        assert out["keboola_token"] == "***"
        assert out["client_secret"] == "***"
        assert out["smtp_password"] == "***"
        assert out["bot_token"] == "***"
        assert out["name"] == "kept-raw"

    def test_empty_secret_fields_are_marked_empty(self):
        from app.api.admin import _sanitize_for_audit
        out = _sanitize_for_audit({"keboola_token": "", "client_secret": None})
        assert out["keboola_token"] == "<empty>"
        assert out["client_secret"] == "<empty>"


class TestBigQueryInitExtractLockSerialization:
    """Review IMPORTANT-2: two concurrent calls to `init_extract` (the
    file-swap path) must serialize cleanly under `_INIT_EXTRACT_LOCK`. We
    verify the lock by stubbing the heavy GCE round-trip and asserting that
    only one worker is inside the locked body at a time."""

    def test_concurrent_init_extract_serializes(self, tmp_path, monkeypatch):
        import threading
        import time

        from connectors.bigquery import extractor as bq

        # Track concurrent entries into the locked body. If the lock works,
        # `inside` is never > 1.
        inside = {"current": 0, "peak": 0}
        lock = threading.Lock()

        def fake_locked(output_dir, project_id, table_configs):
            with lock:
                inside["current"] += 1
                inside["peak"] = max(inside["peak"], inside["current"])
            try:
                # Hold the lock long enough that a parallel call has time to
                # block on `_INIT_EXTRACT_LOCK` if serialization works, or
                # race past it (and bump `peak` to 2) if it doesn't.
                time.sleep(0.05)
                return {"tables_registered": 0, "errors": []}
            finally:
                with lock:
                    inside["current"] -= 1

        monkeypatch.setattr(bq, "_init_extract_locked", fake_locked)

        results = []

        def call():
            results.append(
                bq.init_extract(str(tmp_path / "extr"), "ok-project", [])
            )

        threads = [threading.Thread(target=call) for _ in range(3)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert len(results) == 3
        assert inside["peak"] == 1, (
            f"_INIT_EXTRACT_LOCK did not serialize concurrent callers — "
            f"peak concurrency was {inside['peak']}"
        )


class TestBigQueryRegisterFreshConnection:
    """Review BLOCKER-1: the worker must not capture the request-scoped
    DuckDB connection. Confirm by asserting the worker calls `get_system_db`
    (fresh handle) and the request connection is NEVER passed through.
    """

    def test_worker_opens_fresh_connection(
        self, seeded_app, bq_instance, stub_bq_extractor, monkeypatch,
    ):
        from src import db as _db

        opens = {"count": 0}
        original_get = _db.get_system_db

        def counting_get_system_db():
            opens["count"] += 1
            return original_get()

        monkeypatch.setattr("src.db.get_system_db", counting_get_system_db)
        # The admin module imports `get_system_db` via `from src.db import …`
        # inside the worker function, so patching `src.db.get_system_db` is
        # sufficient — but also patch any cached binding for safety.
        import app.api.admin as admin_mod
        if hasattr(admin_mod, "get_system_db"):
            monkeypatch.setattr(admin_mod, "get_system_db", counting_get_system_db, raising=False)

        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="fresh_conn"),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202), resp.text
        # The worker opens at least one fresh connection (via get_system_db).
        # Other parts of the request also use get_system_db (auth gate, repo
        # lookup), so we just assert that the worker contributed at least one
        # extra open. Stronger guarantee: the rebuild stub was invoked.
        assert stub_bq_extractor["rebuild"].called
        # And the connection passed to rebuild_from_registry must NOT be the
        # same one the request handler held — assert it's not None and was
        # opened in the worker (we can't compare object identity without
        # threading the request conn through, but a separate handle implies
        # the worker did its own open).
        passed_conn = stub_bq_extractor["rebuild"].call_args.kwargs.get("conn")
        assert passed_conn is not None, (
            "rebuild_from_registry should receive a fresh worker-opened conn"
        )

    def test_worker_runs_after_request_returns(
        self, seeded_app, bq_instance, monkeypatch,
    ):
        """Force the synchronous budget to expire so the BackgroundTask path
        runs after the request connection is closed. The worker must still
        complete because it opens its own connection."""
        from unittest.mock import MagicMock
        import time

        # Replace SyncOrchestrator with a fast no-op so we can observe the
        # rebuild_from_registry call after the response.
        orch_mock = MagicMock()
        monkeypatch.setattr(
            "src.orchestrator.SyncOrchestrator",
            lambda *a, **kw: orch_mock,
        )

        # Stub rebuild_from_registry to take longer than the budget so the
        # synchronous path times out and BackgroundTask kicks in.
        slow_rebuild = MagicMock()

        def slow_call(conn=None, output_dir=None):
            time.sleep(0.2)
            return {
                "project_id": "my-test-project",
                "tables_registered": 1,
                "errors": [],
                "skipped": False,
            }

        slow_rebuild.side_effect = slow_call
        monkeypatch.setattr(
            "connectors.bigquery.extractor.rebuild_from_registry",
            slow_rebuild,
        )

        # Tighten the budget so the test is fast.
        monkeypatch.setattr(
            "app.api.admin._BQ_SYNC_REGISTER_TIMEOUT_S", 0.05, raising=False,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="fresh_after"),
            headers=_auth(token),
        )
        # 202 (timeout) is the expected path; 200 is acceptable if the box is
        # slow enough that BackgroundTask runs synchronously inside TestClient.
        assert resp.status_code in (200, 202), resp.text
        # Wait for the BackgroundTask to drain. TestClient already does this
        # synchronously for tasks, but the timeout-fallback also spawned a
        # daemon thread. Give both up to 1s to settle.
        deadline = time.time() + 1.0
        while time.time() < deadline and slow_rebuild.call_count < 1:
            time.sleep(0.01)
        assert slow_rebuild.called, (
            "rebuild_from_registry should run after request returns "
            "(via BackgroundTask + daemon fallback)"
        )


# --- Devin review fixes (PR #119) -------------------------------------------


class TestRegisterTableHandlerIsSync:
    """Review BLOCKER 1: register_table must NOT be `async def`. The
    synchronous-materialize path waits on `threading.Event.wait()` which
    would otherwise block the asyncio event loop and stall every other
    request for up to `_BQ_SYNC_REGISTER_TIMEOUT_S`. FastAPI runs plain
    `def` handlers in a threadpool so the wait is harmless there.
    """

    def test_handler_is_not_a_coroutine(self):
        import inspect
        from app.api.admin import register_table
        assert not inspect.iscoroutinefunction(register_table), (
            "register_table must be a sync def — see review BLOCKER 1 in #119. "
            "An async handler that blocks on threading.Event.wait() parks the "
            "asyncio event loop for the entire timeout budget."
        )

    def test_event_loop_not_blocked_by_slow_register(
        self, seeded_app, bq_instance, monkeypatch,
    ):
        """A slow BQ register must not stall a parallel request.

        We force the synchronous materialize past its budget by stubbing
        `_run_bigquery_materialize_with_timeout` to spin for ~0.3s, then
        fire two requests "in parallel" (via two threads, since TestClient
        is sync) and assert both finish within a reasonable wall clock.
        If the handler were async + blocking, the second request would
        wait for the first to finish.
        """
        import threading
        import time

        # Stub the materialize helper so the test doesn't need real BQ.
        # `_run_bigquery_materialize_with_timeout` is what the handler
        # waits on; make it sleep, then return ok.
        def _slow(background):
            time.sleep(0.3)
            return {"status": "ok"}

        monkeypatch.setattr(
            "app.api.admin._run_bigquery_materialize_with_timeout",
            _slow,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]

        results = {}

        def fire_register(idx):
            t0 = time.time()
            r = c.post(
                "/api/admin/register-table",
                json=_bq_payload(name=f"par_{idx}"),
                headers=_auth(token),
            )
            results[idx] = (r.status_code, time.time() - t0)

        threads = [
            threading.Thread(target=fire_register, args=(i,)) for i in range(2)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Both calls must succeed. The exact wall clock depends on the
        # threadpool size FastAPI's anyio uses (default >= 40), but the
        # SECOND call should not be blocked behind the FIRST one's
        # 0.3s sleep — total time for each call should be ~0.3s, not
        # ~0.6s. Allow generous slack for CI noise.
        assert results[0][0] in (200, 202), results[0]
        assert results[1][0] in (200, 202), results[1]


class TestBigQueryRebuildOverlayAware:
    """Review BLOCKER 2: rebuild_from_registry must read the BQ project via
    the overlay-aware `app.instance_config.get_value`, NOT the static-only
    `config.loader.load_instance_config`. Validation already does the
    former, so without this fix validation passes and the rebuild silently
    fails — the row is in the registry but the master view is never built.
    """

    def test_overlay_only_project_resolves(self, e2e_env, monkeypatch):
        """When the project is set ONLY in the overlay (admin UI write),
        rebuild must still resolve it."""
        from app.instance_config import reset_cache
        from connectors.bigquery import extractor as bq
        from src.db import get_system_db
        from src.repositories.table_registry import TableRegistryRepository

        # Static instance.yaml has no BQ block — only the overlay does.
        # We simulate the merged result the way `app.instance_config.load_
        # instance_config` would expose it: deep-merged dict from
        # static + overlay. Patching `app.instance_config.load_instance_
        # config` matches the read path in the new helper.
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "overlay-project"},
                }
            },
            raising=False,
        )
        # And the static loader has nothing — proves we don't fall back.
        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {},
            raising=False,
        )
        reset_cache()

        # Seed a BQ row so init_extract is triggered.
        conn = get_system_db()
        try:
            TableRegistryRepository(conn).register(
                id="ovr",
                name="ovr",
                source_type="bigquery",
                bucket="analytics",
                source_table="ovr",
                query_mode="remote",
                profile_after_sync=False,
            )
        finally:
            conn.close()
        fake_init = MagicMock(return_value={"tables_registered": 1, "errors": []})
        monkeypatch.setattr(bq, "init_extract", fake_init)

        try:
            result = bq.rebuild_from_registry()
        finally:
            reset_cache()

        # Project resolved from the overlay, not the (empty) static file.
        assert result["project_id"] == "overlay-project"
        assert result["skipped"] is False
        fake_init.assert_called_once()
        # init_extract(output_dir, project_id, table_configs)
        assert fake_init.call_args.args[1] == "overlay-project"

    def test_static_only_project_still_resolves(self, e2e_env, monkeypatch):
        """Regression: when there's NO overlay, the static config still wins
        (so existing deployments that wrote instance.yaml by hand keep
        working)."""
        from app.instance_config import reset_cache
        from connectors.bigquery import extractor as bq
        from src.db import get_system_db
        from src.repositories.table_registry import TableRegistryRepository

        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "static-project"},
                }
            },
            raising=False,
        )
        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "static-project"},
                }
            },
            raising=False,
        )
        reset_cache()

        conn = get_system_db()
        try:
            TableRegistryRepository(conn).register(
                id="stat",
                name="stat",
                source_type="bigquery",
                bucket="analytics",
                source_table="stat",
                query_mode="remote",
                profile_after_sync=False,
            )
        finally:
            conn.close()
        fake_init = MagicMock(return_value={"tables_registered": 1, "errors": []})
        monkeypatch.setattr(bq, "init_extract", fake_init)

        try:
            result = bq.rebuild_from_registry()
        finally:
            reset_cache()

        assert result["project_id"] == "static-project"
        fake_init.assert_called_once()


class TestBigQueryRebuildErrorPropagation:
    """Review IMPORTANT 3: errors from rebuild_from_registry must surface
    as 500 in the synchronous register path (not be silently logged), and
    in the BackgroundTask path must be logged at ERROR level (not warn)."""

    def test_synchronous_path_returns_500_on_rebuild_errors(
        self, seeded_app, bq_instance, monkeypatch,
    ):
        # Stub rebuild_from_registry to report errors but not raise.
        rebuild_mock = MagicMock(return_value={
            "project_id": "my-test-project",
            "tables_registered": 0,
            "errors": [{"table": "orders", "error": "auth failed"}],
            "skipped": False,
        })
        monkeypatch.setattr(
            "connectors.bigquery.extractor.rebuild_from_registry",
            rebuild_mock,
        )
        orch_mock = MagicMock()
        monkeypatch.setattr(
            "src.orchestrator.SyncOrchestrator",
            lambda *a, **kw: orch_mock,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="errprop"),
            headers=_auth(token),
        )
        # Synchronous rebuild ran (within budget) but reported errors —
        # the API must propagate that as 500 with the error list, not
        # claim success.
        assert resp.status_code == 500, resp.text
        body = resp.json()
        assert body["status"] == "rebuild_failed"
        assert body["errors"]
        assert body["errors"][0]["error"] == "auth failed"
        # The row is in the registry though — the rebuild can be retried.
        list_resp = c.get("/api/admin/registry", headers=_auth(token))
        names = [t["name"] for t in list_resp.json()["tables"]]
        assert "errprop" in names

    def test_background_path_logs_at_error_level(
        self, seeded_app, bq_instance, monkeypatch, caplog,
    ):
        """Force timeout so the BackgroundTask wrapper runs, then assert
        the wrapper logs the rebuild errors at ERROR level."""
        import logging
        import time

        # rebuild slow enough to time out the synchronous path.
        def slow_with_errors(conn=None, output_dir=None):
            time.sleep(0.15)
            return {
                "project_id": "my-test-project",
                "tables_registered": 0,
                "errors": [{"table": "x", "error": "bg-rebuild failure"}],
                "skipped": False,
            }

        monkeypatch.setattr(
            "connectors.bigquery.extractor.rebuild_from_registry",
            slow_with_errors,
        )
        orch_mock = MagicMock()
        monkeypatch.setattr(
            "src.orchestrator.SyncOrchestrator",
            lambda *a, **kw: orch_mock,
        )
        # Tighten the budget so timeout kicks in fast.
        monkeypatch.setattr(
            "app.api.admin._BQ_SYNC_REGISTER_TIMEOUT_S", 0.05, raising=False,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]

        with caplog.at_level(logging.ERROR, logger="app.api.admin"):
            resp = c.post(
                "/api/admin/register-table",
                json=_bq_payload(name="bg_err"),
                headers=_auth(token),
            )
            # 202 (timeout) — BackgroundTask runs after the response.
            assert resp.status_code == 202, resp.text
            # Drain BackgroundTasks. TestClient runs them synchronously
            # after the response, so the log should already be present.

        msgs = [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR]
        # At least one ERROR-level entry must mention "bg-rebuild failure"
        # — so the operator's logs surface the failure even though the
        # 202 response can't carry the detail.
        assert any("bg-rebuild failure" in m for m in msgs), (
            f"expected ERROR-level rebuild-failure log; got: {msgs}"
        )


class TestKeboolaModalUsesDiscoveredTableId:
    """Review IMPORTANT 5: the JS that builds the Keboola register payload
    must derive `source_table` from the discovered table's storage ID
    (`t.id` minus the bucket prefix), NOT the human-friendly display name
    (`t.name`). We verify by static template inspection — this is enough
    to catch a regression that drops the hidden field or reverts the JS
    to reading `regTableName`."""

    def test_template_has_hidden_source_table_field(self, seeded_app, monkeypatch):
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {"data_source": {"type": "keboola"}},
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            c.cookies.set("access_token", seeded_app["admin_token"])
            try:
                resp = c.get("/admin/tables", headers={"Accept": "text/html"})
            finally:
                c.cookies.clear()
            assert resp.status_code == 200, resp.text
            body = resp.text
            # Hidden field must exist so the JS can stash the bare
            # storage identifier separately from the display name.
            assert 'id="regSourceTable"' in body
            # And the build function must read from that hidden field
            # (NOT from regTableName, which is the display name).
            assert "getElementById('regSourceTable').value" in body
        finally:
            reset_cache()

    def test_template_does_not_send_display_name_as_source_table(
        self, seeded_app, monkeypatch,
    ):
        """Regression check: pre-fix the payload had
        `source_table: document.getElementById('regTableName').value`.
        After the fix, that exact line must be gone (the build function
        reads from the hidden `regSourceTable` first)."""
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
            lambda: {"data_source": {"type": "keboola"}},
            raising=False,
        )
        reset_cache()
        try:
            c = seeded_app["client"]
            c.cookies.set("access_token", seeded_app["admin_token"])
            try:
                resp = c.get("/admin/tables", headers={"Accept": "text/html"})
            finally:
                c.cookies.clear()
            body = resp.text
            # No occurrence of the buggy direct assignment.
            assert (
                "source_table: document.getElementById('regTableName').value"
                not in body
            )
        finally:
            reset_cache()


class TestBigQueryUITwoStepFlow:
    """Review IMPORTANT 4: the BQ register flow in the modal must split
    precheck and register into two operator-driven clicks. We verify the
    JS function structure via template inspection (no JS test runner in
    this codebase)."""

    def test_template_has_separate_confirm_function(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        c.cookies.set("access_token", seeded_app["admin_token"])
        try:
            resp = c.get("/admin/tables", headers={"Accept": "text/html"})
        finally:
            c.cookies.clear()
        assert resp.status_code == 200, resp.text
        body = resp.text
        # Two-step: precheck function + separate confirm function.
        assert "_registerBigQueryTable" in body
        assert "_confirmRegisterBigQueryTable" in body
        # Pre-fix, the precheck callback chained directly into a
        # `fetch('/api/admin/register-table'...)` inside the same `.then`.
        # After the fix, the precheck handler must NOT contain the
        # second fetch URL. Verify the precheck function body explicitly
        # swaps the button to "Register" and assigns onclick to the
        # confirm function.
        assert "btn.onclick = function() { _confirmRegisterBigQueryTable" in body
        # And the actual register POST is inside _confirmRegisterBigQueryTable.
        # Locate the function body and assert it has the register URL.
        idx = body.find("function _confirmRegisterBigQueryTable")
        assert idx >= 0
        # Take the next ~2000 chars as the function body — generous
        # enough for the small handler.
        confirm_body = body[idx:idx + 3000]
        assert "/api/admin/register-table'" in confirm_body
        assert "method: 'POST'" in confirm_body


class TestCliDiscoverAndRegisterAcceptsAllSuccessCodes:
    """Review NIT 6: `da admin discover-and-register` must accept 200
    (BQ sync OK) and 202 (BQ background) as success, not just 201.
    Pre-fix every successful BQ row counted as an error."""

    def _resp(self, status_code=200, json_data=None, text=""):
        r = MagicMock()
        r.status_code = status_code
        r.json.return_value = json_data if json_data is not None else {}
        r.text = text
        return r

    def _run(self, monkeypatch, status_code, body=None, source_type="bigquery"):
        from typer.testing import CliRunner
        from cli.main import app
        runner = CliRunner()

        # Need both KEBOOLA_* env vars for the gate; we mock httpx.get
        # so the actual values don't matter.
        monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-kbc-token")
        monkeypatch.setenv("KEBOOLA_STACK_URL", "https://connection.example.com")

        fake_tables = [
            {
                "id": "in.c-x.orders",
                "name": "orders",
                "bucket": {"id": "in.c-x"},
                "rowsCount": 100,
            }
        ]
        fake_get = MagicMock()
        fake_get.return_value = self._resp(200, fake_tables)
        fake_get.return_value.raise_for_status = lambda: None
        # `httpx` is imported locally inside discover_and_register, so we
        # patch the module-level attribute the function will resolve.
        import httpx as _httpx
        monkeypatch.setattr(_httpx, "get", fake_get)

        register_resp = self._resp(status_code, body or {"id": "orders", "name": "orders"})
        with patch("cli.commands.admin.api_post", return_value=register_resp):
            result = runner.invoke(app, [
                "admin", "discover-and-register",
                "--source-type", source_type,
            ])
        return result

    def test_accepts_200_as_success(self, monkeypatch):
        result = self._run(monkeypatch, 200, {
            "id": "orders", "name": "orders", "status": "ok", "view_name": "orders",
        })
        assert result.exit_code == 0, result.output
        assert "1 registered" in result.output
        assert "0 errors" in result.output

    def test_accepts_202_as_success(self, monkeypatch):
        result = self._run(monkeypatch, 202, {
            "id": "orders", "name": "orders", "status": "accepted", "view_name": "orders",
        })
        assert result.exit_code == 0, result.output
        assert "1 registered" in result.output
        assert "0 errors" in result.output
        # Operator gets a hint that the row is materializing in BG.
        assert "background" in result.output.lower()

    def test_accepts_201_as_success(self, monkeypatch):
        # Regression: legacy non-BQ insert path still works.
        result = self._run(
            monkeypatch, 201,
            {"id": "orders", "name": "orders", "status": "registered"},
            source_type="keboola",
        )
        assert result.exit_code == 0, result.output
        assert "1 registered" in result.output


class TestBigQueryRegisterRawNameValidation:
    """Round-3 review BLOCKER 1: ``_validate_bigquery_register_payload`` must
    validate the RAW name (the value persisted to ``table_registry.name``
    and used by the BQ extractor as the DuckDB view name), NOT a normalized
    form. Pre-fix a name like ``"my table"`` would pass validation
    (normalized ``"my_table"`` is safe), get stored verbatim, then 500 at
    the post-insert rebuild — defeating fast-fail-at-register."""

    def test_register_rejects_name_with_space(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="my table"),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text
        body = resp.json()
        # Operator-friendly: surface the offending raw value verbatim.
        assert "my table" in body["detail"]
        assert "view name" in body["detail"].lower()

    def test_register_rejects_name_with_leading_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="  orders"),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text

    def test_register_rejects_name_with_trailing_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="orders  "),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text

    def test_register_accepts_safe_name(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        """Sanity check: the strict check still admits well-formed names."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="my_table"),
            headers=_auth(token),
        )
        assert resp.status_code in (200, 202), resp.text

    def test_precheck_rejects_name_with_space(self, seeded_app, bq_instance):
        """Validation runs identically in /precheck — and it does so BEFORE
        the BQ round-trip, so a bad raw name short-circuits without touching
        the network."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(name="my table"),
                headers=_auth(token),
            )
        assert resp.status_code == 400, resp.text
        assert "my table" in resp.json()["detail"]
        cls.assert_not_called()

    def test_precheck_accepts_safe_name(self, seeded_app, bq_instance):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        fake_client = MagicMock()
        fake_client.get_table.return_value = _FakeBQTable()
        with patch("google.cloud.bigquery.Client", return_value=fake_client):
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(name="my_table"),
                headers=_auth(token),
            )
        assert resp.status_code == 200, resp.text


class TestBigQueryRegisterRawBucketSourceTableValidation:
    """Round-4 review: ``_validate_bigquery_register_payload`` must apply the
    same RAW-value rule to ``bucket`` and ``source_table`` as it does to
    ``name``. Pre-fix the helper validated ``bucket.strip()`` /
    ``source_table.strip()`` but ``register_table`` persists the un-stripped
    value, so ``"my_dataset "`` slipped through and 500'd downstream at
    view-create time. Parity with the ``name`` fix from round 3."""

    def test_register_rejects_bucket_with_leading_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(bucket=" my_dataset"),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text
        body = resp.json()
        # Operator-friendly: surface the offending raw value verbatim.
        assert " my_dataset" in body["detail"]
        assert "dataset" in body["detail"].lower()

    def test_register_rejects_bucket_with_trailing_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(bucket="my_dataset "),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text
        body = resp.json()
        assert "my_dataset " in body["detail"]
        assert "dataset" in body["detail"].lower()

    def test_register_rejects_source_table_with_leading_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(source_table=" my_table"),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text
        body = resp.json()
        assert " my_table" in body["detail"]
        assert "source_table" in body["detail"].lower()

    def test_register_rejects_source_table_with_trailing_whitespace(
        self, seeded_app, bq_instance, stub_bq_extractor,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(source_table="my_table "),
            headers=_auth(token),
        )
        assert resp.status_code == 400, resp.text
        body = resp.json()
        assert "my_table " in body["detail"]
        assert "source_table" in body["detail"].lower()

    def test_precheck_rejects_bucket_with_leading_whitespace(
        self, seeded_app, bq_instance,
    ):
        """Validation runs identically in /precheck and short-circuits before
        the BQ round-trip — the helper is shared, so this is the same code
        path covered above, but we assert the BQ Client is never constructed."""
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(bucket=" my_dataset"),
                headers=_auth(token),
            )
        assert resp.status_code == 400, resp.text
        assert " my_dataset" in resp.json()["detail"]
        cls.assert_not_called()

    def test_precheck_rejects_bucket_with_trailing_whitespace(
        self, seeded_app, bq_instance,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(bucket="my_dataset "),
                headers=_auth(token),
            )
        assert resp.status_code == 400, resp.text
        cls.assert_not_called()

    def test_precheck_rejects_source_table_with_leading_whitespace(
        self, seeded_app, bq_instance,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(source_table=" my_table"),
                headers=_auth(token),
            )
        assert resp.status_code == 400, resp.text
        cls.assert_not_called()

    def test_precheck_rejects_source_table_with_trailing_whitespace(
        self, seeded_app, bq_instance,
    ):
        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        with patch("google.cloud.bigquery.Client") as cls:
            resp = c.post(
                "/api/admin/register-table/precheck",
                json=_bq_payload(source_table="my_table "),
                headers=_auth(token),
            )
        assert resp.status_code == 400, resp.text
        cls.assert_not_called()


class TestBigQueryWorkerExceptionVsTimeout:
    """Round-3 review IMPORTANT 2: when the synchronous worker raises
    *within* the wall-clock budget, the API must surface that as a 500
    (hard error) — NOT 202 (timeout/retry). Earlier revisions mapped both
    outcomes to "timeout", which hid real failures behind a misleading
    "still working in the background" response for a budget-window worth
    of seconds, then the BG retry surfaced the same exception in the logs."""

    def test_worker_raises_within_budget_returns_500(
        self, seeded_app, bq_instance, monkeypatch,
    ):
        # Stub rebuild_from_registry to RAISE (not return errors). Worker
        # finishes within budget but the exception lands in err_holder.
        def boom(conn=None, output_dir=None):
            raise RuntimeError("simulated GCE auth failure")
        monkeypatch.setattr(
            "connectors.bigquery.extractor.rebuild_from_registry",
            boom,
        )
        orch_mock = MagicMock()
        monkeypatch.setattr(
            "src.orchestrator.SyncOrchestrator",
            lambda *a, **kw: orch_mock,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="boomtable"),
            headers=_auth(token),
        )
        assert resp.status_code == 500, resp.text
        body = resp.json()
        assert body["status"] == "rebuild_failed"
        # The exception message must show up in the body so the operator
        # gets the actual root cause, not a "timeout" red herring.
        assert body["errors"], body
        assert any(
            "simulated GCE auth failure" in (e.get("error") or "")
            for e in body["errors"]
        ), body["errors"]
        # The row was still inserted before the rebuild ran — re-running
        # after fixing the underlying issue picks it up.
        list_resp = c.get("/api/admin/registry", headers=_auth(token))
        assert "boomtable" in [t["name"] for t in list_resp.json()["tables"]]

    def test_worker_still_running_at_timeout_returns_202(
        self, seeded_app, bq_instance, monkeypatch,
    ):
        """Counterpart: if the worker is genuinely still running when the
        budget expires, 202 + BackgroundTask is correct."""
        import time

        def slow_ok(conn=None, output_dir=None):
            time.sleep(0.15)
            return {
                "project_id": "my-test-project",
                "tables_registered": 1,
                "errors": [],
                "skipped": False,
            }
        monkeypatch.setattr(
            "connectors.bigquery.extractor.rebuild_from_registry",
            slow_ok,
        )
        orch_mock = MagicMock()
        monkeypatch.setattr(
            "src.orchestrator.SyncOrchestrator",
            lambda *a, **kw: orch_mock,
        )
        # Force a short budget so the worker is still running when wait()
        # returns False.
        monkeypatch.setattr(
            "app.api.admin._BQ_SYNC_REGISTER_TIMEOUT_S", 0.05, raising=False,
        )

        c = seeded_app["client"]
        token = seeded_app["admin_token"]
        resp = c.post(
            "/api/admin/register-table",
            json=_bq_payload(name="slowtable"),
            headers=_auth(token),
        )
        assert resp.status_code == 202, resp.text
        body = resp.json()
        assert body["status"] == "accepted"


class TestRegisterTablePrecheckHandlerIsSync:
    """Round-3 review NIT 3: ``register_table_precheck`` must be a plain
    ``def`` (not ``async def``) — the BQ branch makes synchronous
    ``bigquery.Client(...)`` / ``client.get_table(...)`` calls that would
    otherwise block the asyncio event loop. Mirrors the same conversion
    already done for ``register_table``."""

    def test_precheck_handler_is_sync(self):
        import inspect
        from app.api import admin as admin_mod
        assert not inspect.iscoroutinefunction(
            admin_mod.register_table_precheck
        ), (
            "register_table_precheck must be a plain `def` so FastAPI runs "
            "it in a threadpool; otherwise the synchronous bigquery.Client "
            "calls block the asyncio event loop."
        )


# --- sync_schedule format validation (#79) ----------------------------------


@pytest.mark.parametrize("schedule", [
    "every 15m",
    "every 1h",
    "daily 05:00",
    "daily 07:00,13:00,18:00",
    None,  # explicit None is allowed (no schedule = always sync)
])
def test_register_request_accepts_valid_sync_schedule(schedule):
    req = RegisterTableRequest(name="orders", sync_schedule=schedule)
    assert req.sync_schedule == schedule


@pytest.mark.parametrize("schedule", [
    "hourly",
    "every 0m",
    "daily 25:00",
    "every 5x",
    "  ",
])
def test_register_request_rejects_malformed_sync_schedule(schedule):
    with pytest.raises(ValidationError) as exc_info:
        RegisterTableRequest(name="orders", sync_schedule=schedule)
    assert "sync_schedule" in str(exc_info.value)


@pytest.mark.parametrize("schedule", [
    "every 30m",
    "daily 08:00",
    None,
])
def test_update_request_accepts_valid_sync_schedule(schedule):
    req = UpdateTableRequest(sync_schedule=schedule)
    assert req.sync_schedule == schedule


def test_update_request_rejects_malformed_sync_schedule():
    with pytest.raises(ValidationError):
        UpdateTableRequest(sync_schedule="weekly")