feat(materialized): query_mode='materialized' for BigQuery + Keboola — admin SELECT → parquet → analyst

Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors. Backend (BigQuery + Keboola, schema v20): - schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19) - connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state - connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc - connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (Keboola Storage API has different cost model); legacy bucket-download path skips query_mode='materialized' rows - app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query - app/api/admin.py: RegisterTableRequest accepts source_query; model_validator coheres mode↔source_query↔bucket; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into GET /server-config payload Operator + CLI surface: - da admin register-table --query / --query-mode materialized - scripts/smoke-test-materialized-bq.sh — end-to-end smoke for operators Tests (incl. spike + integration + regression): - test_db_migration_v20, test_table_registry_source_query - test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips - test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds) - test_sync_trigger_materialized, test_sync_trigger_keboola_materialized - test_api_admin_materialized, test_cli_admin_materialized - test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e Cost: BQ uses bigquery_query() (jobs API, view-aware) — works on tables, views, materialized views uniformly. Keboola uses ATTACH+COPY parquet through the DuckDB extension.
2026-05-01 20:25:56 +02:00 · 2026-05-01 20:25:56 +02:00 · 85d3810535
commit 85d3810535
parent d0b7e122d6
29 changed files with 4599 additions and 122 deletions
--- a/app/api/admin.py
+++ b/app/api/admin.py
@ -12,7 +12,7 @@ import uuid
 from pathlib import Path

 from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import Optional, List, Dict, Any
 import duckdb

@ -172,6 +172,9 @@ _EDITABLE_SECTIONS: tuple[str, ...] = (
    "server",
    "auth",
    "ai",
+    "openmetadata",
+    "desktop",
+    "corporate_memory",
 )

 # "Danger-zone" sections — flipping these can lock operators out (auth.*) or
@ -181,6 +184,404 @@ _EDITABLE_SECTIONS: tuple[str, ...] = (
 # the right warning copy.
 _DANGER_SECTIONS: tuple[str, ...] = ("auth", "server")

+# Known-but-optional config fields per section. The /admin/server-config UI
+# uses this registry alongside the YAML payload to render fields the operator
+# might want to set even though they're not currently in instance.yaml.
+#
+# Schema per field:
+#   {
+#     "kind": "string" | "secret" | "bool" | "int" | "select" | "object" | "array",
+#     "default": <type-appropriate default>  (optional)
+#     "hint": "<one-line operator-facing help>"
+#     "options": [...]              (only for kind="select")
+#     "fields": {<name>: <fieldspec>}  (only for kind="object", nested fields)
+#     "item_kind": "string" | ...   (only for kind="array", element type)
+#     "required": bool             (defaults False; UI marks the label)
+#   }
+#
+# Subagents 2-4 will populate the bodies. The registry enables the UI to
+# render missing-but-known fields with placeholders + hints rather than
+# forcing the operator to discover them via the JSON-patch textarea or
+# hitting a runtime error first. The smoke fixture below
+# (data_source.bigquery.billing_project) proves the renderer wiring works
+# end-to-end so subagents 2-4 only have to add registry entries — they
+# don't need to touch admin_server_config.html.
+_KNOWN_FIELDS: dict[str, dict[str, dict]] = {
+    "instance": {
+        # No commonly-missing instance-level fields. The example YAML's
+        # `name`/`subtitle` are always populated by `da setup` so they
+        # render via the populated path; nothing to surface here.
+    },
+    "data_source": {
+        "bigquery": {
+            "kind": "object",
+            "hint": "BigQuery connection knobs (read more in docs/DEPLOYMENT.md)",
+            "fields": {
+                "billing_project": {
+                    "kind": "string",
+                    "hint": (
+                        "GCP project to bill BQ jobs against. Set when SA can read "
+                        "the data project but cannot bill there (e.g. shared read-only "
+                        "data project). Defaults to data_source.bigquery.project. "
+                        "Mismatch → 403 USER_PROJECT_DENIED on every BQ call."
+                    ),
+                },
+                "legacy_wrap_views": {
+                    "kind": "bool",
+                    "default": False,
+                    "hint": (
+                        "When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB "
+                        "master view via bigquery_query() (jobs API) so analysts can "
+                        "SELECT * FROM viewname directly. When false (default), views are "
+                        "catalog-only — analysts use `da fetch` or `da query --remote`. "
+                        "ON for view-heavy deployments where most views are small enough "
+                        "to materialize without 'Response too large' (issue #101)."
+                    ),
+                },
+                "max_bytes_per_materialize": {
+                    "kind": "int",
+                    "default": 10737418240,
+                    "hint": (
+                        "Cost guardrail for query_mode='materialized' BQ scans (dry-run "
+                        "check before running). Bytes processed; exceeds → registration "
+                        "or sync rejected. 0 disables the gate. Default 10737418240 = 10 GiB."
+                    ),
+                },
+            },
+        },
+        "keboola": {
+            "kind": "object",
+            "hint": "Keboola Storage API connection",
+            "fields": {
+                "stack_url": {
+                    "kind": "string",
+                    "hint": (
+                        "e.g. https://connection.keboola.com (instance-specific stack URL). "
+                        "Validated against private-IP allowlist on save (SSRF guard)."
+                    ),
+                },
+                "project_id": {
+                    "kind": "string",
+                    "hint": "Keboola project ID (numeric, but kept as string in YAML).",
+                },
+            },
+        },
+    },
+    "email": {
+        # SMTP fields render via the populated path (always set when email
+        # is enabled); no commonly-missing optional knobs at this layer.
+    },
+    "telegram": {
+        # Rarely missing; leave empty.
+    },
+    "jira": {
+        # Webhook + REST credentials always present when Jira is configured.
+    },
+    "theme": {
+        # Cosmetic only; rarely missing.
+    },
+    "server": {
+        # TLS / hostname knobs are mostly env-side; nothing to surface here.
+    },
+    "auth": {
+        "allowed_domain": {
+            "kind": "string",
+            "hint": (
+                "Comma-separated list of allowed sign-in email domains (e.g. "
+                "'acme.com,acme-internal.com'). Single domain works too. Empty → no "
+                "domain restriction (any verified Google identity can sign in)."
+            ),
+        },
+    },
+    "ai": {
+        "base_url": {
+            "kind": "string",
+            "hint": (
+                "Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM, etc.). "
+                "Ignored when provider='anthropic'. Examples: https://litellm.example.com, "
+                "https://openrouter.ai/api/v1."
+            ),
+        },
+        "structured_output": {
+            "kind": "select",
+            "options": ["strict", "json", "auto"],
+            "default": "auto",
+            "hint": (
+                "JSON-schema enforcement strategy. strict=Layer 1 only "
+                "(Anthropic/OpenAI native, fail otherwise). json=Layer 1 + Layer 2 "
+                "fallback. auto=all three layers including prompt-based JSON (most "
+                "compatible, least strict)."
+            ),
+        },
+    },
+    "openmetadata": {
+        "url": {
+            "kind": "string",
+            "hint": "Base URL of your OpenMetadata server (e.g. https://catalog.example.com).",
+        },
+        "token": {
+            "kind": "secret",
+            "hint": (
+                "JWT bearer token. Use ${OPENMETADATA_TOKEN} env-var reference "
+                "(don't paste secret directly)."
+            ),
+        },
+        "cache_ttl_seconds": {
+            "kind": "int",
+            "default": 3600,
+            "hint": "How long to cache catalog responses in-process. Default 3600s (1h).",
+        },
+        "verify_ssl": {
+            "kind": "bool",
+            "default": True,
+            "hint": (
+                "TLS verification. Default true. Set false ONLY for internal CAs / "
+                "self-signed certs — sends the JWT over an unverified channel."
+            ),
+        },
+    },
+    "desktop": {
+        "jwt_issuer": {
+            "kind": "string",
+            "default": "data-analyst",
+            "hint": "JWT iss claim. Match what the desktop app verifies.",
+        },
+        "jwt_secret": {
+            "kind": "secret",
+            "hint": "JWT signing secret. Use ${DESKTOP_JWT_SECRET} env-var reference.",
+        },
+        "url_scheme": {
+            "kind": "string",
+            "default": "data-analyst",
+            "hint": "Custom URL scheme registered by the desktop app (data-analyst://...).",
+        },
+    },
+    # corporate_memory governance — optional. When the section is missing
+    # from instance.yaml the system runs in legacy democratic-wiki mode
+    # (no admin review). Schema mirrors config/instance.yaml.example
+    # lines 224-317; renderer handles arbitrary depth + arrays + maps.
+    "corporate_memory": {
+        "distribution_mode": {
+            "kind": "select",
+            "options": ["mandatory_only", "admin_curated", "hybrid"],
+            "default": "hybrid",
+            "hint": (
+                "How knowledge reaches users. mandatory_only = admin-only; "
+                "admin_curated = admin + user voting as feedback; "
+                "hybrid = default (mandatory from admin + optional from user voting)."
+            ),
+        },
+        "approval_mode": {
+            "kind": "select",
+            "options": ["review_queue", "auto_publish", "threshold"],
+            "default": "review_queue",
+            "hint": (
+                "How AI-extracted items enter the system. review_queue = admin "
+                "approval required (default); auto_publish = live immediately; "
+                "threshold = high-confidence auto, low-confidence to queue."
+            ),
+        },
+        "review_period_months": {
+            "kind": "int",
+            "default": 6,
+            "hint": "How often approved/mandatory items are flagged for re-review (months).",
+        },
+        "notify_on_new_items": {
+            "kind": "bool",
+            "default": True,
+            "hint": "Notify km_admins when new pending items arrive.",
+        },
+        "sources": {
+            "kind": "object",
+            "hint": (
+                "Knowledge-source ingestion. Each source has its own enabled "
+                "flag + base confidence."
+            ),
+            "fields": {
+                "claude_local_md": {
+                    "kind": "object",
+                    "fields": {
+                        "enabled": {"kind": "bool", "default": True},
+                        "confidence_base": {
+                            "kind": "float",
+                            "default": 0.50,
+                            "hint": "Confidence assigned to extractions from CLAUDE.local.md (0-1).",
+                        },
+                    },
+                },
+                "session_transcripts": {
+                    "kind": "object",
+                    "fields": {
+                        "enabled": {"kind": "bool", "default": True},
+                        "confidence_base": {"kind": "float", "default": 0.60},
+                        "max_turns_per_session": {
+                            "kind": "int",
+                            "default": 100,
+                            "hint": "Truncate transcripts longer than this many turns.",
+                        },
+                        "detection_types": {
+                            "kind": "array",
+                            "item_kind": "string",
+                            "default": [
+                                "correction",
+                                "confirmation",
+                                "unprompted_definition",
+                            ],
+                            "hint": (
+                                "Which extraction patterns to detect. Each entry "
+                                "is a detection-type tag."
+                            ),
+                        },
+                    },
+                },
+            },
+        },
+        "extraction": {
+            "kind": "object",
+            "fields": {
+                "model": {
+                    "kind": "string",
+                    "default": "claude-haiku-4-5-20251001",
+                    "hint": "LLM used to extract knowledge. Override for cost or quality.",
+                },
+                "sensitivity_check": {"kind": "bool", "default": True},
+                "contradiction_check": {"kind": "bool", "default": True},
+            },
+        },
+        "confidence": {
+            "kind": "object",
+            "hint": "Confidence scoring + decay rules.",
+            "fields": {
+                "base": {
+                    "kind": "map",
+                    "key_kind": "string",
+                    "value_kind": "float",
+                    "default": {
+                        "user_verification.correction": 0.90,
+                        "user_verification.unprompted_definition": 0.90,
+                        "user_verification.confirmation": 0.60,
+                        "admin_mandate": 1.00,
+                        "claude_local_md": 0.50,
+                        "session_transcript": 0.50,
+                    },
+                    "hint": (
+                        "Base score per source/detection. Keys are 'source_type' "
+                        "or 'source_type.detection_type' (the dot is data, not "
+                        "nesting)."
+                    ),
+                },
+                "modifiers": {
+                    # map<string, map<string, float>>. The renderer's structured
+                    # editor for "map of objects with declared subfields" is a
+                    # TODO (see admin_server_config.html); for now this falls
+                    # back to a JSON textarea — admins editing it see the
+                    # schema doc inline via the hint.
+                    "kind": "map",
+                    "key_kind": "string",
+                    "value_kind": "object",
+                    "value_fields": {},  # signals the JSON-textarea fallback
+                    "hint": (
+                        "Per-key modifier step sizes applied to base when "
+                        "optional signals are present (3-level dotted paths). "
+                        "Edit as a JSON object — outer keys mirror confidence.base "
+                        "keys; inner objects map signal name to bonus float."
+                    ),
+                },
+                "decay": {
+                    "kind": "object",
+                    "fields": {
+                        "mode": {
+                            "kind": "select",
+                            "options": ["linear", "exponential"],
+                            "default": "exponential",
+                        },
+                        "half_life_months": {
+                            "kind": "int",
+                            "default": 12,
+                            "hint": "Used when mode=exponential.",
+                        },
+                        "decay_rate_monthly": {
+                            "kind": "float",
+                            "default": 0.02,
+                            "hint": "Used when mode=linear.",
+                        },
+                        "floor": {
+                            "kind": "map",
+                            "key_kind": "string",
+                            "value_kind": "float",
+                            "default": {
+                                "admin_mandate": 0.50,
+                                "user_verification": 0.40,
+                                "default": 0.0,
+                            },
+                            "hint": (
+                                "Per-source minimum confidence — items never decay "
+                                "below this floor."
+                            ),
+                        },
+                    },
+                },
+            },
+        },
+        "contradiction_detection": {
+            "kind": "object",
+            "fields": {
+                "enabled": {"kind": "bool", "default": True},
+                "max_candidates": {
+                    "kind": "int",
+                    "default": 10,
+                    "hint": "Max contradiction candidates to evaluate per new item.",
+                },
+            },
+        },
+        "entity_resolution": {
+            "kind": "object",
+            "fields": {
+                "enabled": {"kind": "bool", "default": True},
+                "entities": {
+                    "kind": "map",
+                    "key_kind": "string",
+                    "value_kind": "array",
+                    "value_item_kind": "string",
+                    "default": {
+                        "metrics": ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"],
+                        "products": ["Platform", "API", "Dashboard"],
+                    },
+                    "hint": (
+                        "Domain-entity vocabulary. Key = domain category; value = "
+                        "canonical names list."
+                    ),
+                },
+            },
+        },
+        "domain_owners": {
+            "kind": "map",
+            "key_kind": "string",
+            "value_kind": "array",
+            "value_item_kind": "string",
+            "hint": (
+                "Per-domain admin emails. Key = domain name; value = email list."
+            ),
+        },
+        "domains": {
+            "kind": "array",
+            "item_kind": "string",
+            "default": [
+                "finance",
+                "engineering",
+                "product",
+                "data",
+                "operations",
+                "infrastructure",
+            ],
+            "hint": (
+                "Knowledge domains analysts can target. Each must match a key "
+                "in domain_owners."
+            ),
+        },
+    },
+}
+
 # Keys whose values must be redacted from the audit diff. We match
 # substring (case-insensitive) so `client_secret`, `api_token`,
 # `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all
@ -385,6 +786,43 @@ class ServerConfigUpdateRequest(BaseModel):
    )


+# Optional BQ fields whose runtime defaults are documented but which used to
+# be invisible in the editor when YAML omitted them. The data_source.bigquery
+# subtree renders as a JSON textarea; a key that's absent from the GET
+# payload literally cannot appear in the form for the operator to edit. We
+# surface them with their documented defaults so the UI always shows them as
+# editable knobs — see Phase J of the admin-tables-cleanup work.
+#
+#   - billing_project: defaults to data project; explicit value needed when
+#     the SA can read the data project but not bill against it.
+#   - legacy_wrap_views: default False; toggling ON wraps BQ views via
+#     `bigquery_query()` so analysts can `SELECT *` directly.
+#   - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'`
+#     (default 10 GiB; 0 disables; null falls through to the default).
+_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = {
+    "billing_project": "",
+    "legacy_wrap_views": False,
+    "max_bytes_per_materialize": 10737418240,
+}
+
+
+def _ensure_bq_optional_fields(sections: Dict[str, Any]) -> None:
+    """In-place: add missing BQ optional fields to data_source.bigquery so the
+    UI's JSON-textarea renders them as editable keys. Existing values are
+    preserved — only absent keys are populated with their documented default.
+    """
+    ds = sections.get("data_source")
+    if not isinstance(ds, dict):
+        return
+    bq = ds.get("bigquery")
+    if not isinstance(bq, dict):
+        # No BQ subsection — leave alone. Non-BQ instances don't need these
+        # knobs, and creating an empty bigquery dict would be misleading.
+        return
+    for key, default in _BQ_OPTIONAL_FIELD_DEFAULTS.items():
+        bq.setdefault(key, default)
+
+
@router.get("/server-config")
 async def get_server_config(
    user: dict = Depends(require_admin),
@ -403,11 +841,20 @@ async def get_server_config(
    # file omits them — operator can populate from scratch without manual
    # JSON edits.
    sections = {section: redacted.get(section, {}) for section in _EDITABLE_SECTIONS}
+    # Always surface the optional BQ knobs so the operator sees them in the
+    # UI's JSON editor instead of having to know they exist (Phase J).
+    _ensure_bq_optional_fields(sections)
    return {
        "sections": sections,
        "editable_sections": list(_EDITABLE_SECTIONS),
        "danger_sections": list(_DANGER_SECTIONS),
        "secret_key_patterns": list(_SECRET_KEY_PATTERNS),
+        # Known-but-optional fields per section so the UI can render
+        # placeholders for fields the operator hasn't set yet (Phase J).
+        # Subagents 2-4 populate the bodies; the renderer ships now so the
+        # mechanism is wired end-to-end and adding entries is purely a
+        # data-edit in `_KNOWN_FIELDS` above.
+        "known_fields": _KNOWN_FIELDS,
    }


@ -654,7 +1101,17 @@ def _sanitize_for_audit(payload: Dict[str, Any]) -> Dict[str, Any]:
 class RegisterTableRequest(BaseModel):
    name: str
    folder: Optional[str] = None
-    sync_strategy: str = "full_refresh"
+    sync_strategy: str = Field(
+        default="full_refresh",
+        deprecated=True,
+        description=(
+            "DEPRECATED: catalog/profiler metadata only. No extractor reads "
+            "this field; every sync is a full overwrite regardless of value. "
+            "profiler.is_partitioned() consumes it for parquet-layout "
+            "detection. Field stays for back-compat; will be removed in a "
+            "future major release."
+        ),
+    )
    # Composite primary keys are real (session-grain MSA tables key on
    # `(session_id, event_date)`, browse rows on more). The frontend sends +
    # reads this as a list; backend stores it JSON-serialized in VARCHAR.
@ -664,9 +1121,41 @@ class RegisterTableRequest(BaseModel):
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
+    # Backs query_mode='materialized'. Stored verbatim in
+    # table_registry.source_query (schema v20); the trigger pass runs it
+    # through the DuckDB BQ extension via BqAccess and writes the result
+    # to /data/extracts/bigquery/data/<id>.parquet.
+    source_query: Optional[str] = None
    query_mode: str = "local"
    sync_schedule: Optional[str] = None
-    profile_after_sync: bool = True
+    profile_after_sync: bool = Field(
+        default=True,
+        deprecated=True,
+        description=(
+            "DEPRECATED: not consumed by the runtime (Agent 1 finding "
+            "2026-05-01). Profiler runs unconditionally on every synced "
+            "table; this flag has no effect. Field stays for back-compat."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _check_mode_query_coherence(self):
+        """Enforce query_mode ↔ source_query invariants up front so an admin
+        can't persist a remote/local row carrying an orphan source_query, and
+        materialized rows can't be registered without a SQL body."""
+        sq = (self.source_query or "").strip() or None
+        if self.query_mode == "materialized" and not sq:
+            raise ValueError(
+                "query_mode='materialized' requires a non-empty source_query"
+            )
+        if self.query_mode != "materialized" and sq:
+            raise ValueError(
+                "source_query is only valid when query_mode='materialized'"
+            )
+        # Normalise: stash the trimmed-or-None form so the persisted column
+        # never carries surrounding whitespace or empty-string sentinels.
+        self.source_query = sq
+        return self

    @field_validator("primary_key", mode="before")
    @classmethod
@ -707,12 +1196,67 @@ class RegisterTableRequest(BaseModel):
 def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
    """Enforce BQ-specific shape on a register/precheck request.

-    Mutates the model: forces ``query_mode='remote'`` and
-    ``profile_after_sync=False`` (per Decision 7 in #108) so a caller can't
-    accidentally enqueue a parquet profiling pass for a remote view that
-    has no local file. Raises HTTPException(422) for missing required
-    fields and HTTPException(400) for unsafe identifiers / bogus project_id.
+    Two BQ paths:
+
+    - ``query_mode='materialized'`` — admin-registered SQL writes a parquet on
+      schedule. Requires ``source_query``; ``bucket`` / ``source_table`` are
+      not used (the SQL inlines the references). Doesn't force any field; the
+      Pydantic ``model_validator`` already gated the query/mode coherence.
+
+    - ``query_mode='remote'`` (or default) — remote view over a single BQ
+      table. Requires ``bucket`` (BQ dataset) + ``source_table``. Mutates
+      the model: forces ``query_mode='remote'`` and ``profile_after_sync=False``
+      (per Decision 7 in #108) so a caller can't accidentally enqueue a
+      parquet profiling pass for a remote view that has no local file.
+
+    Raises HTTPException(422) for missing required fields and
+    HTTPException(400) for unsafe identifiers / bogus project_id.
    """
+    if req.query_mode == "materialized":
+        # Materialized BQ rows: the SQL body replaces dataset+table refs.
+        # Pydantic model_validator already verified source_query is non-empty;
+        # all we still need is a valid project_id and a safe view name.
+        if not req.source_query or not req.source_query.strip():
+            raise HTTPException(
+                status_code=422,
+                detail="bigquery materialized: 'source_query' is required",
+            )
+        raw_name = req.name or ""
+        if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    f"bigquery: view name {raw_name!r} is unsafe — must match "
+                    f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) "
+                    "with no leading/trailing whitespace"
+                ),
+            )
+        from app.instance_config import get_value
+        project_id = get_value("data_source", "bigquery", "project", default="")
+        if not project_id:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "bigquery: data_source.bigquery.project is not set in "
+                    "instance.yaml; configure it via /admin/server-config or "
+                    "/api/admin/configure first"
+                ),
+            )
+        if not _is_safe_project_id(project_id):
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    f"bigquery: data_source.bigquery.project {project_id!r} "
+                    "is malformed — must match GCP project_id grammar "
+                    "^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
+                ),
+            )
+        # Phase C: profile_after_sync is now inert (Pydantic field marked
+        # deprecated; not read by app/api/sync.py:410-438). The runtime
+        # profiles every synced table unconditionally, so we no longer
+        # force-set this here as a "signal."
+        return
+
    if not req.bucket or not req.bucket.strip():
        raise HTTPException(
            status_code=422,
@ -796,24 +1340,106 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
                "must match GCP project_id grammar ^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
            ),
        )
-    # Force the BQ-required mode + flag (Decision 7). The orchestrator and
+    # Force the BQ-required mode (Decision 7). The orchestrator and
    # extractor both assume remote; persisting `local` here would later create
    # a profiling job against a non-existent parquet file.
+    # Phase C: profile_after_sync is now inert (deprecated, not read by the
+    # runtime); no longer force-set here.
    req.query_mode = "remote"
-    req.profile_after_sync = False


 class UpdateTableRequest(BaseModel):
    name: Optional[str] = None
-    sync_strategy: Optional[str] = None
+    sync_strategy: Optional[str] = Field(
+        default=None,
+        deprecated=True,
+        description=(
+            "DEPRECATED: catalog/profiler metadata only. See "
+            "RegisterTableRequest.sync_strategy."
+        ),
+    )
    primary_key: Optional[List[str]] = None
    description: Optional[str] = None
    source_type: Optional[str] = None
    bucket: Optional[str] = None
    source_table: Optional[str] = None
+    source_query: Optional[str] = None
    query_mode: Optional[str] = None
    sync_schedule: Optional[str] = None
-    profile_after_sync: Optional[bool] = None
+    profile_after_sync: Optional[bool] = Field(
+        default=None,
+        deprecated=True,
+        description=(
+            "DEPRECATED: not consumed by the runtime. See "
+            "RegisterTableRequest.profile_after_sync."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _check_mode_query_coherence(self):
+        """PUT semantics — only the fields explicitly in the body are
+        validated. The body is overlaid on the existing row at the handler
+        level (see ``update_table``), so omitted fields keep their stored
+        values and the synthetic ``RegisterTableRequest`` constructed against
+        the merged record runs the strict cross-field check before persist.
+
+        The only invariants enforceable from the PUT body alone:
+
+        - explicit ``source_query='SELECT ...'`` paired with ``query_mode``
+          that isn't materialized → coherent reject (the SQL would be dead);
+        - explicit ``source_query='SELECT ...'`` without any ``query_mode``
+          in the body → reject; the operator must commit to materialized;
+        - explicit empty/whitespace ``source_query=''`` paired with
+          ``query_mode='materialized'`` → reject (operator clearly
+          mistyped — they sent the field).
+
+        Pre-fix this validator also rejected ``{"query_mode": "materialized",
+        "sync_schedule": "every 12h"}`` because ``source_query`` was None
+        — but that's the canonical "edit the schedule on a materialized
+        row" use-case from the Edit modal, which always sends
+        ``query_mode`` to indicate intent. Devin BUG_0002 on PR #148
+        commit 2219255.
+        """
+        if self.query_mode is None and self.source_query is None:
+            return self
+
+        sq_raw = self.source_query
+        sq = (sq_raw or "").strip() or None
+
+        # Operator explicitly sent source_query as empty/whitespace while
+        # claiming materialized — typo / bad form data, reject.
+        if (
+            self.query_mode == "materialized"
+            and sq_raw is not None
+            and not sq
+        ):
+            raise ValueError(
+                "query_mode='materialized' requires a non-empty source_query"
+            )
+
+        # source_query only makes sense with materialized mode. Allow None
+        # (omitted) to flow through; only reject when explicitly set with
+        # the wrong mode.
+        if (
+            self.query_mode is not None
+            and self.query_mode != "materialized"
+            and sq
+        ):
+            raise ValueError(
+                "source_query is only valid when query_mode='materialized'"
+            )
+        if self.query_mode is None and sq:
+            raise ValueError(
+                "source_query requires query_mode='materialized' to be set "
+                "in the same request"
+            )
+
+        # Normalise: drop whitespace-only strings to None so the persisted
+        # column is clean. Don't touch when source_query was None to begin
+        # with — that signals "PUT didn't touch this field, keep existing".
+        if sq_raw is not None:
+            self.source_query = sq
+        return self

    @field_validator("primary_key", mode="before")
    @classmethod
@ -853,8 +1479,20 @@ class ConfigureRequest(BaseModel):
@router.get("/discover-tables")
 async def discover_tables(
    user: dict = Depends(require_admin),
+    dataset: Optional[str] = None,
 ):
-    """Discover all available tables from the configured data source."""
+    """Discover available tables from the configured data source.
+
+    For ``data_source.type='keboola'`` returns the full Storage API table
+    list (single round-trip). For ``data_source.type='bigquery'``:
+
+    - Without ``dataset``: list datasets in the configured project.
+    - With ``dataset=name``: list tables (BASE TABLE + VIEW) in that dataset.
+
+    Two-step shape avoids paying the per-dataset list_tables cost up-front
+    on projects with hundreds of datasets — the UI populates the dataset
+    dropdown first, then fetches tables only for the selected dataset.
+    """
    try:
        from app.instance_config import get_data_source_type
        source_type = get_data_source_type()
@ -870,12 +1508,89 @@ async def discover_tables(
            client = KeboolaClient(token=token, url=url)
            tables = client.discover_all_tables()
            return {"tables": tables, "count": len(tables), "source": "keboola"}
-        else:
-            return {"tables": [], "count": 0, "source": source_type, "error": "Discovery not implemented for this source"}
+
+        if source_type == "bigquery":
+            return _discover_bigquery(dataset=dataset)
+
+        return {
+            "tables": [],
+            "count": 0,
+            "source": source_type,
+            "error": f"Discovery not implemented for source_type={source_type!r}",
+        }
+    except HTTPException:
+        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Discovery failed: {e}")


+def _discover_bigquery(dataset: Optional[str]) -> Dict[str, Any]:
+    """List BQ datasets (when ``dataset`` is None) or tables-in-dataset.
+
+    Routes through ``BqAccess.client()`` so config / auth / error
+    translation matches the rest of the BQ surface (#138 facade). Returns
+    the same shape as the Keboola path so the UI doesn't have to branch.
+    """
+    from connectors.bigquery.access import (
+        get_bq_access,
+        BqAccessError,
+        translate_bq_error,
+    )
+
+    try:
+        bq = get_bq_access()
+        client = bq.client()
+    except BqAccessError as e:
+        raise HTTPException(
+            status_code=BqAccessError.HTTP_STATUS.get(e.kind, 500),
+            detail={"error": e.message, "kind": e.kind, "details": e.details},
+        )
+
+    try:
+        if dataset is None:
+            datasets = []
+            for ds in client.list_datasets():
+                datasets.append({
+                    "dataset_id": ds.dataset_id,
+                    "full_id": f"{ds.project}.{ds.dataset_id}",
+                })
+            return {
+                "datasets": sorted(datasets, key=lambda d: d["dataset_id"]),
+                "count": len(datasets),
+                "source": "bigquery",
+            }
+
+        # List tables in the named dataset. `list_tables` returns
+        # `TableListItem` with `table_id` + `table_type` ('TABLE', 'VIEW',
+        # 'MATERIALIZED_VIEW', 'EXTERNAL', 'SNAPSHOT'). UI maps TABLE → Type
+        # selector "table" and VIEW/MATERIALIZED_VIEW → "view"; the rest are
+        # passed through with their raw type so the operator can decide.
+        tables = []
+        for t in client.list_tables(dataset):
+            tables.append({
+                "table_id": t.table_id,
+                "table_type": t.table_type,
+                "full_id": f"{t.project}.{t.dataset_id}.{t.table_id}",
+            })
+        return {
+            "tables": sorted(tables, key=lambda t: t["table_id"]),
+            "count": len(tables),
+            "source": "bigquery",
+            "dataset": dataset,
+        }
+    except Exception as e:
+        # `translate_bq_error` re-raises non-Google exceptions unchanged,
+        # so wrap in HTTPException to keep the JSON-shape contract.
+        try:
+            err = translate_bq_error(e, bq.projects, bad_request_status="upstream_error")
+        except Exception:
+            raise HTTPException(status_code=502, detail=f"BQ discovery failed: {e}")
+        raise HTTPException(
+            status_code=BqAccessError.HTTP_STATUS.get(err.kind, 502),
+            detail={"error": err.message, "kind": err.kind, "details": err.details},
+        )
+
+
@router.get("/registry")
 async def list_registry(
    user: dict = Depends(require_admin),
@ -1121,6 +1836,10 @@ def register_table(
    if is_bigquery:
        _validate_bigquery_register_payload(request)

+    # Phase C: profile_after_sync is no longer passed — the field is
+    # deprecated and inert at the runtime layer. The DB column keeps its
+    # schema default; the registry response no longer reflects request
+    # values for this flag.
    repo.register(
        id=table_id,
        name=request.name,
@ -1132,9 +1851,9 @@ def register_table(
        source_type=request.source_type,
        bucket=request.bucket,
        source_table=request.source_table,
+        source_query=request.source_query,
        query_mode=request.query_mode,
        sync_schedule=request.sync_schedule,
-        profile_after_sync=request.profile_after_sync,
    )

    # Audit entry — masked params; description kept raw (it's documentation).
@ -1154,6 +1873,24 @@ def register_table(
            content={"id": table_id, "name": request.name, "status": "registered"},
        )

+    if request.query_mode == "materialized":
+        # Materialized BQ rows are picked up by the trigger pass on the next
+        # scheduled tick (or via POST /api/sync/trigger). No synchronous
+        # rebuild — the COPY can scan multi-GB and would block the request.
+        return JSONResponse(
+            status_code=201,
+            content={
+                "id": table_id,
+                "name": request.name,
+                "status": "registered",
+                "view_name": table_id,
+                "message": (
+                    "Materialized — parquet will be written on the next sync "
+                    "tick. Trigger now via POST /api/sync/trigger."
+                ),
+            },
+        )
+
    # BQ post-register: rebuild extract + master views, with timeout fallback.
    # Decision 1: 200 on synchronous success, 202 on timeout, 500 if the
    # synchronous rebuild surfaced errors. Distinct from the 201 Keboola
@ -1266,6 +2003,28 @@ def register_table_precheck(
    # checks identifier safety, validates project_id from instance.yaml).
    _validate_bigquery_register_payload(request)

+    # Materialized BQ rows have no `dataset.source_table` to round-trip —
+    # the SQL body is the contract. Skip the BQ-jobs-API call and return a
+    # validation-only precheck so the CLI's `--dry-run --query-mode
+    # materialized` path doesn't crash on an empty fully-qualified name.
+    if request.query_mode == "materialized":
+        return {
+            "ok": True,
+            "table": {
+                "name": request.name,
+                "source_type": "bigquery",
+                "query_mode": "materialized",
+                "source_query": request.source_query,
+                "rows": None,
+                "size_bytes": None,
+                "columns": [],
+                "note": (
+                    "materialized precheck is validation-only — the SQL is "
+                    "evaluated for cost on each scheduled materialize tick"
+                ),
+            },
+        }
+
    # Round-trip the BQ jobs API to confirm the table exists and the SA can
    # see it. Imports kept local to avoid pulling google-cloud-bigquery into
    # the import chain on non-BQ instances.
@ -1360,14 +2119,23 @@ async def update_table(
        merged.update(updates)
        merged.pop("id", None)  # avoid duplicate id kwarg

+        # When switching the merged record away from materialized mode, drop
+        # the stale source_query — the request validator can't clear it via
+        # the `if v is not None` filter above. Without this, a remote/local
+        # row would carry an orphan source_query in the registry.
+        if merged.get("query_mode") != "materialized":
+            merged["source_query"] = None
+
        if merged.get("source_type") == "bigquery":
            # Reuse the register-time validator. It mutates the request to
-            # force query_mode='remote' / profile_after_sync=False — apply
-            # the same coercion to `merged` so the persisted row matches.
+            # force query_mode='remote' / profile_after_sync=False (or to
+            # leave a materialized row alone) — apply the same coercion to
+            # `merged` so the persisted row matches.
            synthetic = RegisterTableRequest(
                name=merged.get("name") or table_id,
                bucket=merged.get("bucket"),
                source_table=merged.get("source_table"),
+                source_query=merged.get("source_query"),
                source_type="bigquery",
                query_mode=merged.get("query_mode") or "remote",
                profile_after_sync=bool(merged.get("profile_after_sync") or False),
@ -1380,6 +2148,7 @@ async def update_table(
            _validate_bigquery_register_payload(synthetic)
            merged["query_mode"] = synthetic.query_mode
            merged["profile_after_sync"] = synthetic.profile_after_sync
+            merged["source_query"] = synthetic.source_query

        repo.register(id=table_id, **merged)

--- a/app/api/sync.py
+++ b/app/api/sync.py
@ -36,6 +36,204 @@ def _file_hash(path: Path) -> str:
    return h.hexdigest()


+def _materialize_table(
+    *,
+    table_id: str,
+    sql: str,
+    bq,
+    output_dir: str,
+    max_bytes: Optional[int],
+) -> dict:
+    """Thin wrapper around `connectors.bigquery.extractor.materialize_query`
+    so the trigger pass can be unit-tested by patching this seam without
+    touching the real BqAccess factory or the duckdb import."""
+    from connectors.bigquery.extractor import materialize_query
+    return materialize_query(
+        table_id=table_id, sql=sql, bq=bq,
+        output_dir=output_dir, max_bytes=max_bytes,
+    )
+
+
+def _run_materialized_pass(conn: duckdb.DuckDBPyConnection, bq) -> dict:
+    """Walk `table_registry` for `query_mode='materialized'` rows and run any
+    that are due, dispatching by ``source_type`` to the correct connector's
+    materialize_query. Honors per-table `sync_schedule` via `is_table_due()`,
+    computes the file hash inline, and updates `sync_state` so the manifest
+    can serve the row to `da sync` without re-hashing on every request.
+
+    BigQuery rows go through BqAccess + bigquery_query() (jobs API),
+    optionally cost-guarded by ``max_bytes_per_materialize``.
+    Keboola rows go through KeboolaAccess + ATTACH-and-COPY, no
+    guardrail (extension has no dry-run primitive).
+
+    Returns:
+        ``{"materialized": [ids], "skipped": [ids], "errors": [{table, error}]}``
+
+    Errors are aggregated per row — one budget-blown table doesn't stop a
+    healthy sibling. ``MaterializeBudgetError`` is caught and rendered with
+    its structured fields so operator alerting can pick out the cap-vs-actual
+    bytes from the log line.
+    """
+    from src.scheduler import is_table_due
+    from app.instance_config import get_value
+    from connectors.bigquery.extractor import MaterializeBudgetError
+
+    bq_output_dir = str(Path(_get_data_dir()) / "extracts" / "bigquery")
+    kb_output_dir = Path(_get_data_dir()) / "extracts" / "keboola" / "data"
+
+    # Sentinel: max_bytes <= 0 (or None) disables the guardrail. `get_value()`
+    # treats YAML `null` as "missing" → returns the default; operators must use
+    # the explicit `0` sentinel to disable. See config/instance.yaml.example.
+    # YAML accepts floats too (e.g. `10737418240.0`), and operators may
+    # write `1e10` for readability; coerce to int and tolerate non-numeric
+    # entries by falling through to the disable path with a warning.
+    raw_max = get_value(
+        "data_source", "bigquery", "max_bytes_per_materialize",
+        default=10 * 2**30,
+    )
+    try:
+        n = int(raw_max) if raw_max is not None else 0
+    except (TypeError, ValueError):
+        logger.warning(
+            "data_source.bigquery.max_bytes_per_materialize is not numeric "
+            "(%r); cost guardrail disabled. Set an integer or 0 to disable.",
+            raw_max,
+        )
+        n = 0
+    bq_max_bytes = n if n > 0 else None
+
+    registry = TableRegistryRepository(conn)
+    state = SyncStateRepository(conn)
+
+    summary = {"materialized": [], "skipped": [], "errors": []}
+    keboola_access = None  # lazy-init on first Keboola row
+
+    for row in registry.list_all():
+        if row.get("query_mode") != "materialized":
+            continue
+
+        # Convention across connectors: sync_state.table_id and the parquet
+        # filename are keyed by `table_registry.name` (matches Keboola's
+        # `_meta.table_name`) so the manifest's `registry_by_name` lookup
+        # at `_build_manifest_for_user` resolves cleanly. Without this,
+        # admins who register `name="Orders_90d"` (id slugified to
+        # `orders_90d`) would see `query_mode` default to `"local"` in the
+        # manifest because the lookup misses on `id`.
+        ref_name = row["name"]
+
+        last = state.get_last_sync(ref_name)
+        last_iso = last.isoformat() if last else None
+        schedule = row.get("sync_schedule") or "every 1h"
+        if not is_table_due(schedule, last_iso):
+            summary["skipped"].append(ref_name)
+            continue
+
+        source_type = row.get("source_type") or "bigquery"  # legacy default
+
+        # Dispatch by source_type. BQ rows keep using `_materialize_table`
+        # (the existing test seam); Keboola rows use the new Keboola
+        # materialize_query via a lazily-initialized KeboolaAccess.
+        try:
+            if source_type == "bigquery":
+                stats = _materialize_table(
+                    table_id=ref_name,
+                    sql=row["source_query"],
+                    bq=bq,
+                    output_dir=bq_output_dir,
+                    max_bytes=bq_max_bytes,
+                )
+            elif source_type == "keboola":
+                if keboola_access is None:
+                    from connectors.keboola.access import KeboolaAccess
+                    keboola_url = get_value(
+                        "data_source", "keboola", "url", default=""
+                    ) or ""
+                    token_env = get_value(
+                        "data_source", "keboola", "token_env",
+                        default="KEBOOLA_STORAGE_TOKEN",
+                    ) or "KEBOOLA_STORAGE_TOKEN"
+                    keboola_token = os.environ.get(token_env, "")
+                    if not (keboola_url and keboola_token):
+                        summary["errors"].append({
+                            "table": ref_name,
+                            "error": (
+                                "Keboola URL/token not configured for "
+                                "materialized path (data_source.keboola.url "
+                                f"+ env {token_env})"
+                            ),
+                        })
+                        continue
+                    keboola_access = KeboolaAccess(
+                        url=keboola_url, token=keboola_token,
+                    )
+                kb_output_dir.mkdir(parents=True, exist_ok=True)
+                from connectors.keboola.extractor import (
+                    materialize_query as kb_materialize_query,
+                )
+                kb_stats = kb_materialize_query(
+                    table_id=ref_name,
+                    sql=row["source_query"],
+                    keboola_access=keboola_access,
+                    output_dir=kb_output_dir,
+                )
+                # Normalize Keboola materialize_query output to the shape the
+                # BQ branch uses for downstream sync_state updates. KB returns
+                # {table_id, path, rows, bytes, md5}; map to
+                # {rows, size_bytes, hash}.
+                stats = {
+                    "rows": kb_stats["rows"],
+                    "size_bytes": kb_stats["bytes"],
+                    "hash": kb_stats["md5"],
+                    "query_mode": "materialized",
+                }
+            else:
+                summary["errors"].append({
+                    "table": ref_name,
+                    "error": (
+                        f"materialized path not supported for "
+                        f"source_type={source_type!r}"
+                    ),
+                })
+                continue
+        except MaterializeBudgetError as e:
+            logger.warning(
+                "Materialize cap exceeded for %s: %s bytes > %s bytes",
+                e.table_id, f"{e.current:,}", f"{e.limit:,}",
+            )
+            summary["errors"].append({
+                "table": ref_name,
+                "error": str(e),
+                "current": e.current,
+                "limit": e.limit,
+            })
+            continue
+        except Exception as e:
+            logger.exception("Materialize failed for %s", ref_name)
+            summary["errors"].append({"table": ref_name, "error": str(e)})
+            continue
+
+        # `materialize_query` returns the parquet's MD5 inline — hashing
+        # there means we don't re-read a multi-GB file on the request
+        # thread. Fallback to `_file_hash(parquet_path)` if for some
+        # reason the stats dict didn't carry it (defensive).
+        parquet_hash = stats.get("hash")
+        if not parquet_hash:
+            output_dir_for_hash = (
+                bq_output_dir if source_type == "bigquery" else str(kb_output_dir.parent)
+            )
+            parquet_path = Path(output_dir_for_hash) / "data" / f"{ref_name}.parquet"
+            parquet_hash = _file_hash(parquet_path)
+        state.update_sync(
+            table_id=ref_name,
+            rows=stats["rows"],
+            file_size_bytes=stats["size_bytes"],
+            hash=parquet_hash,
+        )
+        summary["materialized"].append(ref_name)
+
+    return summary
+
+
 def _run_sync(tables: Optional[List[str]] = None):
    """Run extractor as subprocess + orchestrator rebuild.

@ -106,20 +304,38 @@ def _run_sync(tables: Optional[List[str]] = None):
                except Exception as e:
                    logger.warning("Auto-discovery failed: %s", e)

-            if not table_configs:
-                logger.warning("No tables to sync for source_type=%s", source_type)
-                return
+        # CRITICAL: don't early-return when local-mode tables are empty.
+        # `list_local("bigquery")` is always empty on BQ-only deployments
+        # (BQ rows are always remote or materialized, never local), so an
+        # early return would prevent the materialized pass AND the
+        # orchestrator rebuild from ever firing on a BQ-only instance.
+        # Devin BUG_0002 on PR #148 commit 2fa44f2. Just flag whether the
+        # Keboola subprocess + custom-connectors should run; everything
+        # below (materialized pass, orchestrator rebuild, profiler) runs
+        # unconditionally so a registry with materialized rows but no
+        # local rows still publishes them.
+        run_extractor_subprocess = bool(table_configs)
+        if not run_extractor_subprocess:
+            logger.info(
+                "No local-mode tables to sync for source_type=%s — "
+                "skipping extractor subprocess; materialized pass + "
+                "orchestrator rebuild still run.",
+                source_type,
+            )

-        # Serialize configs — strip non-serializable fields
-        serializable = []
-        for tc in table_configs:
-            serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v)
-                                 for k, v in tc.items() if v is not None})
-
-        # Run extractor subprocess with table configs via stdin
-        # Subprocess does NOT open system.duckdb — no lock conflict
        env = {**os.environ}
-        cmd = [sys.executable, "-c", """
+        import sys as _sys
+
+        if run_extractor_subprocess:
+            # Serialize configs — strip non-serializable fields
+            serializable = []
+            for tc in table_configs:
+                serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v)
+                                     for k, v in tc.items() if v is not None})
+
+            # Run extractor subprocess with table configs via stdin
+            # Subprocess does NOT open system.duckdb — no lock conflict
+            cmd = [sys.executable, "-c", """
 import json, sys, os, logging
 from pathlib import Path

@ -147,58 +363,113 @@ print(json.dumps(result))
 sys.exit(compute_exit_code(result, len(configs)))
 """]

-        import sys as _sys
-        print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True)
+            print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True)

-        result = subprocess.run(
-            cmd, input=_json.dumps(serializable), capture_output=True, text=True,
-            timeout=1800, env=env,
-            cwd=str(Path(__file__).parent.parent.parent),
-        )
+            try:
+                result = subprocess.run(
+                    cmd, input=_json.dumps(serializable), capture_output=True, text=True,
+                    timeout=1800, env=env,
+                    cwd=str(Path(__file__).parent.parent.parent),
+                )
+            except subprocess.TimeoutExpired:
+                # Catch the timeout LOCALLY so the materialized BQ pass and
+                # orchestrator rebuild below still fire. Pre-fix the timeout
+                # propagated to the outer except handler and skipped the rest
+                # of `_run_sync` — on a dual-source deployment a slow Keboola
+                # extractor would silently block all materialized parquets +
+                # master-view rebuild until the next trigger. Devin BUG_0001
+                # on PR #148 commit 2219255. Mirrors the per-custom-connector
+                # timeout pattern below (line ~347).
+                print(
+                    "[SYNC] Extractor timed out after 1800s — continuing to "
+                    "materialized pass + orchestrator rebuild",
+                    file=_sys.stderr, flush=True,
+                )
+                result = None

-        if result.stdout:
-            print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True)
-        if result.stderr:
-            print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True)
-        # Issue #81 Group B: three exit codes. 0 = full success,
-        # 1 = full failure, 2 = partial. Partial is a data-quality
-        # alert, not a crash — the orchestrator's per-table _meta
-        # machinery already captured which tables succeeded; we just
-        # need to log loudly so operator alerting can pick it up.
-        if result.returncode == 0:
-            print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)
-        elif result.returncode == 2:
+            if result is not None:
+                if result.stdout:
+                    print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True)
+                if result.stderr:
+                    print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True)
+                # Issue #81 Group B: three exit codes. 0 = full success,
+                # 1 = full failure, 2 = partial. Partial is a data-quality
+                # alert, not a crash — the orchestrator's per-table _meta
+                # machinery already captured which tables succeeded; we just
+                # need to log loudly so operator alerting can pick it up.
+                if result.returncode == 0:
+                    print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)
+                elif result.returncode == 2:
+                    print(
+                        f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables "
+                        f"succeeded, some failed; see stderr for per-table errors. "
+                        f"Successful tables will still be published by the orchestrator.",
+                        file=_sys.stderr, flush=True,
+                    )
+                else:
+                    print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True)
+
+            # Run custom connectors (Tier A: local mount) — only when there
+            # were local-mode tables to drive the extractor. Custom connectors
+            # currently piggyback on the same env as the Keboola extractor.
+            connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
+            if connectors_dir.exists():
+                for connector_dir in sorted(connectors_dir.iterdir()):
+                    if not connector_dir.is_dir():
+                        continue
+                    extractor = connector_dir / "extractor.py"
+                    if not extractor.exists():
+                        continue
+                    logger.info("Running custom connector: %s", connector_dir.name)
+                    try:
+                        custom_result = subprocess.run(
+                            [sys.executable, str(extractor)],
+                            env=env, capture_output=True, text=True, timeout=600,
+                            cwd=str(Path(__file__).parent.parent.parent),
+                        )
+                        if custom_result.returncode != 0:
+                            logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
+                        else:
+                            logger.info("Custom connector %s completed", connector_dir.name)
+                    except subprocess.TimeoutExpired:
+                        logger.error("Custom connector %s timed out", connector_dir.name)
+
+        # Materialized BigQuery pass — runs admin-registered SQL through the
+        # DuckDB BQ extension (via BqAccess) and writes parquet for due rows.
+        # The orchestrator rebuild below picks the parquets up via the
+        # standard local-parquet discovery. Wrapped so a misconfigured BQ
+        # facade doesn't kill the Keboola path.
+        try:
+            from app.instance_config import get_value as _get_value
+            bq_project = _get_value(
+                "data_source", "bigquery", "project", default=""
+            ) or ""
+            if bq_project:
+                from connectors.bigquery.access import get_bq_access
+                from src.db import get_system_db as _get_system_db
+                bq_access = get_bq_access()
+                mat_conn = _get_system_db()
+                try:
+                    mat_summary = _run_materialized_pass(mat_conn, bq_access)
+                finally:
+                    mat_conn.close()
+                print(
+                    f"[SYNC] Materialized BQ: {len(mat_summary['materialized'])} ok, "
+                    f"{len(mat_summary['skipped'])} skipped, "
+                    f"{len(mat_summary['errors'])} errors",
+                    file=_sys.stderr, flush=True,
+                )
+                for err in mat_summary["errors"]:
+                    print(
+                        f"[SYNC]   {err['table']}: {err['error']}",
+                        file=_sys.stderr, flush=True,
+                    )
+        except Exception as e:
            print(
-                f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables "
-                f"succeeded, some failed; see stderr for per-table errors. "
-                f"Successful tables will still be published by the orchestrator.",
+                f"[SYNC] Materialized BQ pass FAILED: {e}",
                file=_sys.stderr, flush=True,
            )
-        else:
-            print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True)
-
-        # Run custom connectors (Tier A: local mount)
-        connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
-        if connectors_dir.exists():
-            for connector_dir in sorted(connectors_dir.iterdir()):
-                if not connector_dir.is_dir():
-                    continue
-                extractor = connector_dir / "extractor.py"
-                if not extractor.exists():
-                    continue
-                logger.info("Running custom connector: %s", connector_dir.name)
-                try:
-                    custom_result = subprocess.run(
-                        [sys.executable, str(extractor)],
-                        env=env, capture_output=True, text=True, timeout=600,
-                        cwd=str(Path(__file__).parent.parent.parent),
-                    )
-                    if custom_result.returncode != 0:
-                        logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
-                    else:
-                        logger.info("Custom connector %s completed", connector_dir.name)
-                except subprocess.TimeoutExpired:
-                    logger.error("Custom connector %s timed out", connector_dir.name)
+            traceback.print_exc()

        # Rebuild master views (reads extract.duckdb files, no write conflict)
        from src.orchestrator import SyncOrchestrator
--- a/cli/commands/admin.py
+++ b/cli/commands/admin.py
@ -67,11 +67,19 @@ def register_table(
    source_type: str = typer.Option("keboola", help="Source type: keboola | bigquery | jira | local"),
    bucket: str = typer.Option("", help="Source bucket (Keboola) or dataset (BigQuery)"),
    source_table: str = typer.Option("", help="Source table name in the bucket/dataset"),
-    query_mode: str = typer.Option("local", help="Query mode: local or remote (forced to 'remote' for bigquery)"),
+    query_mode: str = typer.Option("local", help="Query mode: local | remote | materialized"),
+    query: str = typer.Option(
+        "",
+        "--query",
+        help=(
+            "SQL body for query_mode='materialized' (BigQuery only). "
+            "Inline SQL or `@path/to.sql` to read from disk."
+        ),
+    ),
    description: str = typer.Option("", help="Table description"),
    sync_schedule: str = typer.Option(
        "",
-        help="Cron schedule (BigQuery only — note: scheduler not yet wired, see #79 / M3 of #108)",
+        help="Cron schedule (e.g. 'every 6h' / 'daily 03:00'); honored by materialized BQ rows",
    ),
    dry_run: bool = typer.Option(
        False,
@ -81,11 +89,38 @@ def register_table(
 ):
    """Register a single table.

-    For BigQuery: dataset goes in --bucket, the BQ table/view name in
-    --source-table, the DuckDB view name in NAME. The server forces
-    query_mode=remote and profile_after_sync=False; --dry-run goes
-    through /precheck and prints rows + size + columns without writing.
+    Modes:
+    - **local** (Keboola): batch pull, parquet on disk.
+    - **remote** (BigQuery): view only, queries go to BQ. Requires
+      `--bucket` + `--source-table`.
+    - **materialized** (BigQuery): server-side scheduled SQL → parquet.
+      Requires `--query` (inline or `@file.sql`); `--bucket` /
+      `--source-table` ignored.
+
+    `--dry-run` goes through /precheck (BQ remote only — for materialized
+    rows, dry-run is a no-op since the SQL itself is the contract).
    """
+    from pathlib import Path
+
+    # Resolve --query @file.sql shorthand.
+    source_query = ""
+    if query:
+        if query.startswith("@"):
+            sql_path = Path(query[1:])
+            if not sql_path.exists():
+                typer.echo(f"Error: SQL file not found: {sql_path}", err=True)
+                raise typer.Exit(2)
+            source_query = sql_path.read_text(encoding="utf-8").strip()
+        else:
+            source_query = query.strip()
+
+    if query_mode == "materialized" and not source_query:
+        typer.echo(
+            "Error: --query-mode materialized requires --query (literal SQL or @path.sql)",
+            err=True,
+        )
+        raise typer.Exit(2)
+
    payload = {
        "name": name,
        "source_type": source_type,
@ -94,6 +129,11 @@ def register_table(
        "query_mode": query_mode,
        "description": description,
    }
+    # Omit empty optional fields so the server-side validator doesn't see
+    # `source_query=""` on a remote/local row (which would trigger the
+    # "source_query forbidden" branch).
+    if source_query:
+        payload["source_query"] = source_query
    if sync_schedule:
        payload["sync_schedule"] = sync_schedule

--- a/connectors/bigquery/extractor.py
+++ b/connectors/bigquery/extractor.py
@ -9,7 +9,7 @@ import shutil
 import threading
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional

 import duckdb

@ -182,6 +182,14 @@ def _init_extract_locked(
        _create_remote_attach_table(conn, project_id)

        for tc in table_configs:
+            # Materialized rows are written by the sync trigger pass via
+            # `materialize_query()` — they live as parquets in
+            # /data/extracts/bigquery/data/, picked up by the orchestrator's
+            # standard local-parquet discovery. Don't create a remote view
+            # here (would shadow the parquet via name collision).
+            if tc.get("query_mode") == "materialized":
+                continue
+
            table_name = tc["name"]
            dataset = tc.get("bucket", "")
            source_table = tc.get("source_table", table_name)
@ -279,6 +287,183 @@ def _init_extract_locked(
    return stats


+class MaterializeBudgetError(RuntimeError):
+    """Raised when a `materialize_query` BQ dry-run estimate exceeds the
+    configured `data_source.bigquery.max_bytes_per_materialize` cap.
+
+    The materialize trigger pass logs this and skips the row; the next
+    scheduled tick re-tries (in case the underlying table size dropped
+    or the operator raised the cap). Shape mirrors `BqAccessError` —
+    `current` and `limit` for operator triage.
+    """
+
+    def __init__(self, message: str, *, table_id: str, current: int, limit: int):
+        self.table_id = table_id
+        self.current = current
+        self.limit = limit
+        super().__init__(message)
+
+
+def materialize_query(
+    table_id: str,
+    sql: str,
+    *,
+    bq,  # connectors.bigquery.access.BqAccess (untyped here to avoid circular import at type-check)
+    output_dir: str,
+    max_bytes: Optional[int] = None,
+) -> Dict[str, Any]:
+    """Run `sql` through the DuckDB BigQuery extension and write the result
+    to `<output_dir>/data/<table_id>.parquet` atomically.
+
+    Designed for `query_mode='materialized'` table_registry rows. The SQL
+    is admin-registered (validated upstream) and may reference DuckDB
+    three-part identifiers (`bq."dataset"."table"`) resolved by the
+    in-session ATTACH, OR native BQ identifiers via the `bigquery_query()`
+    table function — both work because the session has the bigquery
+    extension loaded with a SECRET token.
+
+    Cost guardrail: when `max_bytes` is a positive int, run a BQ dry-run
+    via `bq.client()` first; raise `MaterializeBudgetError` if the
+    estimate exceeds the cap. `max_bytes=None` or `max_bytes <= 0`
+    disables the guardrail (config sentinel, see
+    `data_source.bigquery.max_bytes_per_materialize`).
+
+    Dry-run is best-effort and fail-open: if the SQL uses DuckDB syntax
+    that the native BQ client can't parse (e.g. `bq."ds"."t"`), the
+    dry-run raises and we log a warning; the COPY still runs. This
+    matches the BqAccess facade's "client is for native BQ SQL only"
+    contract — operators who need the cap to engage write the registered
+    SQL using native BQ identifiers (`\\`project.ds.t\\``).
+
+    Atomic write: result lands in `<id>.parquet.tmp` first, then
+    `os.replace` swaps it in. A failed COPY leaves no partial file behind.
+
+    Args:
+        table_id: Logical id from table_registry; becomes the parquet
+            filename. Must pass `validate_identifier()` so it can't
+            inject path traversal.
+        sql: SELECT statement, no trailing semicolon.
+        bq: A `BqAccess` instance — provides `duckdb_session()` for the
+            COPY and `client()` for the dry-run.
+        output_dir: Connector root, e.g. `/data/extracts/bigquery`.
+            Parquet lands in `<output_dir>/data/<table_id>.parquet`.
+        max_bytes: Optional cap on BQ bytes scanned. None or <= 0 disables.
+
+    Returns:
+        {"rows": int, "size_bytes": int, "query_mode": "materialized"}
+
+    Raises:
+        ValueError: if `table_id` is unsafe.
+        MaterializeBudgetError: if `max_bytes > 0` and dry-run estimate exceeds it.
+        BqAccessError: from `bq.duckdb_session()` (auth_failed / bq_lib_missing /
+            not_configured) — caller catches and aggregates into the trigger
+            pass summary.
+        duckdb.Error: if the COPY itself fails (e.g. bad SQL, missing table).
+    """
+    if not validate_identifier(table_id, "materialize table_id"):
+        raise ValueError(f"unsafe table_id: {table_id!r}")
+
+    out_path = Path(output_dir)
+    data_dir = out_path / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    parquet_path = data_dir / f"{table_id}.parquet"
+    tmp_path = data_dir / f"{table_id}.parquet.tmp"
+    if tmp_path.exists():
+        tmp_path.unlink()
+
+    # Cost guardrail (best-effort — fail-open if dry-run can't parse the SQL).
+    if max_bytes is not None and max_bytes > 0:
+        try:
+            from app.api.v2_scan import _bq_dry_run_bytes  # reuse main's impl
+            estimated = _bq_dry_run_bytes(bq, sql)
+        except Exception as e:
+            logger.warning(
+                "BQ dry-run failed for materialize cost guardrail (fail-open): %s. "
+                "If the SQL uses DuckDB three-part names like bq.\"ds\".\"t\", "
+                "rewrite to native BQ identifiers (`project.ds.t`) for the "
+                "guardrail to engage. Proceeding with COPY.",
+                e,
+            )
+            estimated = 0
+        if estimated > max_bytes:
+            raise MaterializeBudgetError(
+                f"dry-run estimate {estimated:,} bytes exceeds cap "
+                f"{max_bytes:,} for table {table_id!r}",
+                table_id=table_id,
+                current=estimated,
+                limit=max_bytes,
+            )
+
+    # COPY through a BqAccess-managed session.
+    with bq.duckdb_session() as conn:
+        # ATTACH the data project — but only when no `bq` catalog is
+        # already attached. Production sessions (real BqAccess) come with
+        # only `:memory:` and need the ATTACH; test sessions pre-populate
+        # `bq` as a fixture catalog and would error on a redundant ATTACH
+        # (alias already in use) AND on the bigquery extension load when
+        # the test runner has no cached extension. Detecting via
+        # `duckdb_databases()` keeps the ATTACH path idempotent without
+        # swallowing real errors (auth, cross-project permission,
+        # malformed project_id) — those still propagate from the actual
+        # ATTACH call.
+        attached = {
+            r[0] for r in conn.execute(
+                "SELECT database_name FROM duckdb_databases()"
+            ).fetchall()
+        }
+        if "bq" not in attached:
+            conn.execute(
+                f"ATTACH 'project={bq.projects.data}' AS bq (TYPE bigquery, READ_ONLY)"
+            )
+
+        try:
+            safe_path = str(tmp_path).replace("'", "''")
+            conn.execute(f"COPY ({sql}) TO '{safe_path}' (FORMAT PARQUET)")
+            rows = conn.execute(
+                f"SELECT count(*) FROM read_parquet('{safe_path}')"
+            ).fetchone()[0]
+        except Exception:
+            if tmp_path.exists():
+                tmp_path.unlink()
+            raise
+
+    # Compute the parquet hash inline before the atomic swap. The caller used
+    # to re-read the file in `_run_materialized_pass` to hash it via
+    # `_file_hash`, but that's a synchronous full-read on the FastAPI worker
+    # thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
+    # requests. Hashing here keeps the open-file handle hot from the COPY
+    # round and removes the second read. Devil's-advocate review item.
+    import hashlib
+    h = hashlib.md5()
+    with open(tmp_path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            h.update(chunk)
+    parquet_hash = h.hexdigest()
+
+    size_bytes = tmp_path.stat().st_size
+    os.replace(tmp_path, parquet_path)
+
+    rows = int(rows)
+    if rows == 0:
+        # 0 rows is indistinguishable from "the SQL is wrong and nobody
+        # noticed" — surface it loudly so operators see it in the scheduler
+        # log line and the per-row error aggregation. Caller decides whether
+        # to alert.
+        logger.warning(
+            "Materialized %s produced 0 rows — verify the SQL filter is "
+            "intentional. Parquet written: %s",
+            table_id, parquet_path,
+        )
+
+    return {
+        "rows": rows,
+        "size_bytes": size_bytes,
+        "query_mode": "materialized",
+        "hash": parquet_hash,
+    }
+
+
 def _resolve_bq_project_id() -> str:
    """Resolve ``data_source.bigquery.project`` honoring the overlay.

--- a/connectors/keboola/access.py
+++ b/connectors/keboola/access.py
@ -0,0 +1,43 @@
+"""DuckDB session facade for the Keboola Storage API extension.
+
+Parallel of `connectors/bigquery/access.py:BqAccess`. The materialized
+Keboola SQL path needs a one-shot DuckDB connection with the Keboola
+extension installed, loaded, and ATTACHed; this facade encapsulates
+that wiring so `_run_materialized_pass` doesn't need to know the
+extension name, the ATTACH alias, or how the token gets quoted into
+the URL literal.
+"""
+from __future__ import annotations
+from contextlib import contextmanager
+from typing import Iterator
+
+import duckdb
+
+
+class KeboolaAccess:
+    """Lazy DuckDB session manager for the Keboola Storage API extension.
+
+    Single-use — call `.duckdb_session()` as a context manager once per
+    materialized job.
+    """
+
+    def __init__(self, *, url: str, token: str) -> None:
+        if not url or not token:
+            raise ValueError("KeboolaAccess requires url and token")
+        self._url = url
+        self._token = token
+
+    @contextmanager
+    def duckdb_session(self) -> Iterator[duckdb.DuckDBPyConnection]:
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute("INSTALL keboola FROM community")
+            conn.execute("LOAD keboola")
+            escaped_token = self._token.replace("'", "''")
+            conn.execute(
+                f"ATTACH '{self._url}' AS kbc "
+                f"(TYPE keboola, TOKEN '{escaped_token}')"
+            )
+            yield conn
+        finally:
+            conn.close()
--- a/connectors/keboola/extractor.py
+++ b/connectors/keboola/extractor.py
@ -17,6 +17,66 @@ from src.identifier_validation import (
 logger = logging.getLogger(__name__)


+def materialize_query(
+    table_id: str,
+    sql: str,
+    *,
+    keboola_access,  # KeboolaAccess (avoid circular import)
+    output_dir: Path,
+) -> dict:
+    """Materialize an admin-registered SELECT against the Keboola Storage
+    API extension into a parquet file.
+
+    Parallel of `connectors/bigquery/extractor.py:materialize_query`.
+    Cost guardrail: the Keboola extension has no analog of BQ dry-run;
+    Storage API cost is download-shaped (per-byte egress + Storage API
+    job). Phase B ships without a guardrail and logs the byte count;
+    a future PR can add a configurable `max_bytes_per_keboola_materialize`
+    gate similar to BQ's `max_bytes_per_materialize`.
+    """
+    import re
+    import hashlib
+
+    # Defense: table_id is interpolated into the parquet filename.
+    # Reject anything that's not a safe identifier.
+    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table_id):
+        raise ValueError(f"unsafe table_id for materialize: {table_id!r}")
+
+    parquet_path = Path(output_dir) / f"{table_id}.parquet"
+    safe_pq_lit = str(parquet_path).replace("'", "''")
+
+    with keboola_access.duckdb_session() as conn:
+        # Run the admin SELECT and copy the result to parquet.
+        # The COPY wrapper is identical to the existing legacy extract
+        # path at extractor.py:209; the only difference is the SELECT is
+        # admin-supplied rather than `SELECT * FROM kbc.bucket.table`.
+        conn.execute(f"COPY ({sql}) TO '{safe_pq_lit}' (FORMAT PARQUET)")
+
+        # Read back row count.
+        row_count = conn.execute(
+            f"SELECT COUNT(*) FROM read_parquet('{safe_pq_lit}')"
+        ).fetchone()[0]
+
+    file_bytes = parquet_path.read_bytes()
+    md5 = hashlib.md5(file_bytes).hexdigest()
+    size = len(file_bytes)
+
+    if row_count == 0:
+        logger.warning(
+            "Materialized Keboola query for %s wrote 0 rows — verify the "
+            "SQL filters and that the source bucket has data.",
+            table_id,
+        )
+
+    return {
+        "table_id": table_id,
+        "path": str(parquet_path),
+        "rows": row_count,
+        "bytes": size,
+        "md5": md5,
+    }
+
+
 def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None:
    """Create the _meta table required by the extract.duckdb contract."""
    conn.execute("DROP TABLE IF EXISTS _meta")
@ -99,6 +159,21 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
            table_name = tc["name"]
            query_mode = tc.get("query_mode", "local")

+            # Materialized rows are written by the sync trigger pass via
+            # `materialize_query()` — they live as parquets in
+            # /data/extracts/keboola/data/, picked up by the orchestrator's
+            # standard local-parquet discovery. Don't extract here (would
+            # double-write data via the source bucket reference and confuse
+            # sync_state bookkeeping). Mirror of the BQ extractor's skip at
+            # connectors/bigquery/extractor.py:190.
+            if query_mode == "materialized":
+                logger.info(
+                    "Skipping legacy extract for %s — query_mode='materialized', "
+                    "handled by _run_materialized_pass instead",
+                    tc.get("id") or tc.get("name"),
+                )
+                continue
+
            # #81 Group D — refuse rows whose identifiers don't pass the
            # whitelist. The registry is admin-controlled but anyone with
            # write access can otherwise inject SQL via the CREATE VIEW /
--- a/scripts/smoke-test-materialized-bq.sh
+++ b/scripts/smoke-test-materialized-bq.sh
@ -0,0 +1,227 @@
+#!/usr/bin/env bash
+# Smoke test — query_mode='materialized' for BigQuery.
+#
+# Runs the full happy-path + 3 adversarial scenarios against a live Agnes
+# instance that has BigQuery configured. Cheap (uses bigquery-public-data
+# samples; ~36 rows, < 1 KB scan) — safe to run against staging.
+#
+# Usage:
+#   ./scripts/smoke-test-materialized-bq.sh [host:port]
+#
+# Required environment:
+#   AGNES_PAT       — admin PAT for the target instance
+#   BQ_TEST_BIG     — (optional) name of a BQ table > 10 GiB to test the
+#                     cost guardrail. Defaults to a public dataset that
+#                     scans ~50 GB on full SELECT.
+#
+# Defaults: AGNES_HOST=http://localhost:8000.
+#
+# Cleans up the test rows on exit (trap), even on SIGINT.
+
+set -euo pipefail
+
+HOST="${1:-${AGNES_HOST:-http://localhost:8000}}"
+PAT="${AGNES_PAT:?AGNES_PAT must be set (admin token)}"
+BIG_TABLE="${BQ_TEST_BIG:-bigquery-public-data.github_repos.commits}"
+
+PASS=0
+FAIL=0
+
+# Test rows we'll create — captured for cleanup.
+CREATED_IDS=()
+
+cleanup() {
+    echo
+    echo "--- Cleanup ---"
+    for tid in "${CREATED_IDS[@]}"; do
+        curl -sS -X DELETE "$HOST/api/admin/registry/$tid" \
+            -H "Authorization: Bearer $PAT" -o /dev/null -w "  DELETE %{http_code} $tid\n" || true
+    done
+}
+trap cleanup EXIT INT TERM
+
+check() {
+    local name="$1" ok="$2"
+    if [ "$ok" = "true" ]; then
+        echo "  PASS $name"
+        PASS=$((PASS + 1))
+    else
+        echo "  FAIL $name"
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+http() {
+    # POST/PUT/DELETE helper that returns the HTTP status + body separately.
+    local method="$1" path="$2" body="${3:-}"
+    if [ -n "$body" ]; then
+        curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \
+            -X "$method" "$HOST$path" \
+            -H "Authorization: Bearer $PAT" \
+            -H "Content-Type: application/json" \
+            -d "$body"
+    else
+        curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \
+            -X "$method" "$HOST$path" \
+            -H "Authorization: Bearer $PAT"
+    fi
+}
+
+echo "Materialized BQ smoke: $HOST"
+echo "Big table for cost-guardrail test: $BIG_TABLE"
+echo "---"
+
+# ---------------------------------------------------------------------------
+# Scenario A — Happy path: register tiny materialized table, trigger,
+#              verify parquet on disk + manifest carries hash.
+# ---------------------------------------------------------------------------
+echo
+echo "[A] Happy path (Shakespeare sample, ~36 rows)"
+SQL_A='SELECT corpus, COUNT(*) AS c FROM `bigquery-public-data.samples.shakespeare` GROUP BY 1 ORDER BY 1'
+TID_A="smoke_mat_shakespeare_$(date +%s)"
+CREATED_IDS+=("$TID_A")
+
+STATUS=$(http POST /api/admin/register-table "{
+    \"name\": \"$TID_A\",
+    \"source_type\": \"bigquery\",
+    \"query_mode\": \"materialized\",
+    \"source_query\": $(printf '%s' "$SQL_A" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
+    \"sync_schedule\": \"every 1m\"
+}")
+[ "$STATUS" = "201" ] && check "register 201" true || { check "register 201 (got $STATUS)" false; cat /tmp/smoke-mat-body; }
+
+echo "    triggering sync..."
+http POST /api/sync/trigger '{}' >/dev/null
+sleep 5  # background task
+
+# Manifest must list the row with query_mode + non-empty hash.
+http GET /api/sync/manifest >/dev/null
+HASH=$(python3 -c "
+import json
+m = json.load(open('/tmp/smoke-mat-body'))
+t = m.get('tables', {}).get('$TID_A')
+print(t.get('hash', '') if t else '')
+")
+[ -n "$HASH" ] && [ "$HASH" != "null" ] && check "manifest hash present" true || check "manifest hash present (got '$HASH')" false
+
+# Parquet on disk (assumes co-located filesystem, e.g. local docker compose).
+PARQUET="${DATA_DIR:-./data}/extracts/bigquery/data/$TID_A.parquet"
+if [ -f "$PARQUET" ]; then
+    ROWS=$(python3 -c "import duckdb; print(duckdb.connect().execute(\"SELECT count(*) FROM read_parquet('$PARQUET')\").fetchone()[0])" 2>/dev/null || echo "0")
+    [ "$ROWS" -gt 0 ] && check "parquet has $ROWS rows" true || check "parquet rows ($ROWS)" false
+else
+    echo "    note: parquet at $PARQUET not visible from this host (skip if Agnes is remote)"
+fi
+
+# ---------------------------------------------------------------------------
+# Scenario B — Cost guardrail: register a large-scan materialized SQL,
+#              trigger, expect MaterializeBudgetError logged + row skipped.
+# ---------------------------------------------------------------------------
+echo
+echo "[B] Cost guardrail (\`$BIG_TABLE\` full SELECT)"
+TID_B="smoke_mat_huge_$(date +%s)"
+CREATED_IDS+=("$TID_B")
+SQL_B="SELECT * FROM \`$BIG_TABLE\`"
+
+STATUS=$(http POST /api/admin/register-table "{
+    \"name\": \"$TID_B\",
+    \"source_type\": \"bigquery\",
+    \"query_mode\": \"materialized\",
+    \"source_query\": $(printf '%s' "$SQL_B" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
+    \"sync_schedule\": \"every 1m\"
+}")
+[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false
+
+echo "    triggering sync (expect cap to fire)..."
+http POST /api/sync/trigger '{}' >/dev/null
+sleep 5
+
+# Manifest should NOT have a hash for the huge row (materialize was skipped).
+http GET /api/sync/manifest >/dev/null
+HUGE_HASH=$(python3 -c "
+import json
+m = json.load(open('/tmp/smoke-mat-body'))
+t = m.get('tables', {}).get('$TID_B')
+print(t.get('hash', '') if t else 'absent')
+")
+if [ "$HUGE_HASH" = "absent" ] || [ -z "$HUGE_HASH" ] || [ "$HUGE_HASH" = "null" ]; then
+    check "huge row skipped (no hash in manifest)" true
+else
+    check "huge row skipped (got hash '$HUGE_HASH' — guardrail did not fire)" false
+fi
+echo "    grep server logs for: 'MaterializeBudgetError' or 'Materialize cap exceeded'"
+
+# ---------------------------------------------------------------------------
+# Scenario C — 0-row warning: SQL with always-false WHERE.
+# ---------------------------------------------------------------------------
+echo
+echo "[C] 0-row WARNING (filter to empty result)"
+TID_C="smoke_mat_empty_$(date +%s)"
+CREATED_IDS+=("$TID_C")
+SQL_C='SELECT corpus FROM `bigquery-public-data.samples.shakespeare` WHERE 1=0'
+
+STATUS=$(http POST /api/admin/register-table "{
+    \"name\": \"$TID_C\",
+    \"source_type\": \"bigquery\",
+    \"query_mode\": \"materialized\",
+    \"source_query\": $(printf '%s' "$SQL_C" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
+    \"sync_schedule\": \"every 1m\"
+}")
+[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false
+
+http POST /api/sync/trigger '{}' >/dev/null
+sleep 5
+
+http GET /api/sync/manifest >/dev/null
+EMPTY_ROWS=$(python3 -c "
+import json
+m = json.load(open('/tmp/smoke-mat-body'))
+t = m.get('tables', {}).get('$TID_C')
+print(t.get('rows', 'absent') if t else 'absent')
+")
+[ "$EMPTY_ROWS" = "0" ] && check "empty-result rows=0 in manifest" true || check "empty-result rows ($EMPTY_ROWS)" false
+echo "    grep server logs for: 'produced 0 rows'"
+
+# ---------------------------------------------------------------------------
+# Scenario D — Mode-switch transition clears stale source_query.
+# ---------------------------------------------------------------------------
+echo
+echo "[D] Mode-switch materialized → remote clears source_query"
+TID_D="smoke_mat_switch_$(date +%s)"
+CREATED_IDS+=("$TID_D")
+
+STATUS=$(http POST /api/admin/register-table "{
+    \"name\": \"$TID_D\",
+    \"source_type\": \"bigquery\",
+    \"query_mode\": \"materialized\",
+    \"source_query\": \"SELECT 1\"
+}")
+[ "$STATUS" = "201" ] && check "register materialized" true || check "register materialized (got $STATUS)" false
+
+# Switch to remote, providing required bucket+source_table.
+STATUS=$(http PUT "/api/admin/registry/$TID_D" "{
+    \"query_mode\": \"remote\",
+    \"bucket\": \"samples\",
+    \"source_table\": \"shakespeare\"
+}")
+[ "$STATUS" = "200" ] && check "switch to remote 200" true || check "switch to remote (got $STATUS)" false
+
+http GET /api/admin/registry >/dev/null
+SWITCHED_SQ=$(python3 -c "
+import json
+r = json.load(open('/tmp/smoke-mat-body'))
+row = next((t for t in r.get('tables', []) if t['id'] == '$TID_D'), None)
+print(row.get('source_query') if row else 'NOT_FOUND')
+")
+[ "$SWITCHED_SQ" = "None" ] || [ -z "$SWITCHED_SQ" ] || [ "$SWITCHED_SQ" = "null" ] \
+    && check "source_query cleared on switch" true \
+    || check "source_query cleared (got '$SWITCHED_SQ')" false
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo
+echo "---"
+echo "Passed: $PASS"
+echo "Failed: $FAIL"
+[ "$FAIL" -eq 0 ] || exit 1
--- a/src/db.py
+++ b/src/db.py
@ -39,7 +39,7 @@ def _maybe_instrument(con, db_tag: str):

 _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")

-SCHEMA_VERSION = 19
+SCHEMA_VERSION = 20

 _SYSTEM_SCHEMA = """
 CREATE TABLE IF NOT EXISTS schema_version (
@ -259,6 +259,7 @@ CREATE TABLE IF NOT EXISTS table_registry (
    source_type VARCHAR,
    bucket VARCHAR,
    source_table VARCHAR,
+    source_query TEXT,
    sync_strategy VARCHAR DEFAULT 'full_refresh',
    query_mode VARCHAR DEFAULT 'local',
    sync_schedule VARCHAR,
@ -918,6 +919,16 @@ _V16_TO_V17_MIGRATIONS = [
 # helper rather than a flat SQL list (the migrate-ladder calls it directly).


+# v19 -> v20: source_query column backs query_mode='materialized' for BigQuery.
+# Admin-registered SQL stored verbatim; scheduler runs it through the DuckDB BQ
+# extension (via BqAccess) and writes the result to
+# /data/extracts/bigquery/data/<id>.parquet so the existing manifest + da sync
+# flow distributes it to analysts. NULL on existing rows.
+_V19_TO_V20_MIGRATIONS = [
+    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_query TEXT",
+]
+
+
 # Core role seed data — single source of truth. Used by both _seed_core_roles
 # (idempotent insert) and the v8→v9 backfill. Order matters: lowest privilege
 # first so implies references resolve cleanly when expand_implies does BFS.
@ -1735,6 +1746,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
                _v17_to_v18_finalize(conn)
            if current < 19:
                _v18_to_v19_finalize(conn)
+            if current < 20:
+                for sql in _V19_TO_V20_MIGRATIONS:
+                    conn.execute(sql)
            conn.execute(
                "UPDATE schema_version SET version = ?, applied_at = current_timestamp",
                [SCHEMA_VERSION],
--- a/src/repositories/table_registry.py
+++ b/src/repositories/table_registry.py
@ -72,7 +72,9 @@ class TableRegistryRepository:
        primary_key: Union[None, str, List[str]] = None,
        description: Optional[str] = None, registered_by: Optional[str] = None,
        source_type: Optional[str] = None, bucket: Optional[str] = None,
-        source_table: Optional[str] = None, query_mode: str = "local",
+        source_table: Optional[str] = None,
+        source_query: Optional[str] = None,
+        query_mode: str = "local",
        sync_schedule: Optional[str] = None, profile_after_sync: bool = True,
        registered_at: Optional[datetime] = None,
    ) -> None:
@ -85,18 +87,21 @@ class TableRegistryRepository:
        self.conn.execute(
            """INSERT INTO table_registry (id, name, folder, sync_strategy,
                primary_key, description, registered_by, registered_at,
-                source_type, bucket, source_table, query_mode,
+                source_type, bucket, source_table, source_query, query_mode,
                sync_schedule, profile_after_sync)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT (id) DO UPDATE SET
                name = excluded.name, folder = excluded.folder,
                sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key,
                description = excluded.description, registered_at = excluded.registered_at,
                source_type = excluded.source_type, bucket = excluded.bucket,
-                source_table = excluded.source_table, query_mode = excluded.query_mode,
-                sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync""",
+                source_table = excluded.source_table, source_query = excluded.source_query,
+                query_mode = excluded.query_mode,
+                sync_schedule = excluded.sync_schedule,
+                profile_after_sync = excluded.profile_after_sync""",
            [id, name, folder, sync_strategy, encoded_pk, description, registered_by, ts,
-             source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync],
+             source_type, bucket, source_table, source_query, query_mode,
+             sync_schedule, profile_after_sync],
        )

    @staticmethod
--- a/tests/test_admin_bq_register.py
+++ b/tests/test_admin_bq_register.py
@ -244,7 +244,12 @@ class TestBigQueryRegisterCoercion:
        resp = c.get("/api/admin/registry", headers=_auth(token))
        row = next(t for t in resp.json()["tables"] if t["name"] == "orders")
        assert row["query_mode"] == "remote"
-        assert row["profile_after_sync"] is False
+        # Phase C: profile_after_sync is now inert. The field is accepted in
+        # the request for back-compat but no longer overrides the DB default.
+        # Was: assert row["profile_after_sync"] is False  (when BQ register
+        # forced it to False as a "signal"). Now the row carries the schema
+        # default (True). Profiler runs unconditionally regardless.
+        assert row.get("profile_after_sync") in (True, None)


 class TestBigQueryRegisterCollision:
@ -698,20 +703,25 @@ class TestAdminTablesUI:
            c.cookies.clear()
        assert resp.status_code == 200, resp.text
        body = resp.text
-        # Modal carries the source type so the JS can branch.
+        # Modal carries the source type so legacy openRegisterModal({}) still
+        # routes through the JS dispatcher.
        assert 'data-source-type="bigquery"' in body
        # BQ-only inputs.
        assert 'id="bqDataset"' in body
        assert 'id="bqSourceTable"' in body
        assert 'id="bqViewName"' in body
        assert 'id="bqSyncSchedule"' in body
-        # Inline hint about scheduler-not-yet-wired (Decision 3).
-        assert "scheduler" in body.lower()
-        # BQ-specific panel (no discovery for BQ in M1).
-        assert 'data-test="bq-register-panel"' in body
-        # Keboola-only inputs must NOT be present.
-        assert 'id="regTableId"' not in body
-        assert 'id="regBucket"' not in body
+        # Cron-style schedule examples are surfaced near the field
+        # (operator-facing copy explains the syntax).
+        assert "every 6h" in body or "daily 03:00" in body
+        # BQ tab content section (legacy out-of-tab BQ register panel was
+        # removed when the user-visible cleanup landed — each tab now owns
+        # its own header + Register button).
+        assert 'id="tab-content-bigquery"' in body
+        assert 'id="bqRegisterBtn"' in body
+        # Phase E: BQ + Keboola modals are now both always rendered (each
+        # inside its own tab). On a BQ instance the BQ tab is the visible
+        # one; the Keboola modal is just hidden in a non-active tab.

    def test_renders_keboola_fields_when_data_source_keboola(self, seeded_app, monkeypatch):
        from app.instance_config import reset_cache
@ -731,12 +741,22 @@ class TestAdminTablesUI:
            assert resp.status_code == 200
            body = resp.text
            assert 'data-source-type="keboola"' in body
-            # Keboola path — discovery panel + Keboola inputs.
-            assert 'id="discoveryResults"' in body
-            assert 'id="regBucket"' in body
-            assert 'id="regTableName"' in body
-            # BQ-only inputs MUST NOT be present.
-            assert 'id="bqDataset"' not in body
+            # Keboola tab content section + Register-Keboola button (legacy
+            # global Discovery panel was removed when the user-visible
+            # cleanup landed — Keboola discovery is per-modal now).
+            assert 'id="tab-content-keboola"' in body
+            assert 'id="kbRegisterBtn"' in body
+            # C3: legacy #registerModal is gone; the Phase F Keboola modal
+            # at #registerKeboolaModal owns the Keboola register flow now.
+            assert 'id="registerModal"' not in body
+            assert 'id="regBucket"' not in body
+            assert 'id="regTableName"' not in body
+            # The Phase F Keboola modal's inputs are present.
+            assert 'id="kbBucket"' in body
+            assert 'id="kbViewName"' in body
+            # Phase E: BQ form now always rendered (inside #tab-content-bigquery)
+            # — operator can switch tabs to register a BQ table on a Keboola
+            # instance. Tab is hidden by default but the form is in the DOM.
        finally:
            reset_cache()

@ -1546,11 +1566,17 @@ class TestKeboolaModalUsesDiscoveredTableId:
    """Review IMPORTANT 5: the JS that builds the Keboola register payload
    must derive `source_table` from the discovered table's storage ID
    (`t.id` minus the bucket prefix), NOT the human-friendly display name
-    (`t.name`). We verify by static template inspection — this is enough
-    to catch a regression that drops the hidden field or reverts the JS
-    to reading `regTableName`."""
+    (`t.name`). We verify by static template inspection.

-    def test_template_has_hidden_source_table_field(self, seeded_app, monkeypatch):
+    C3: the legacy #registerModal that owned regTableName / regSourceTable
+    was removed. The Phase F #registerKeboolaModal uses kbBucket /
+    kbSourceTable and a different payload builder (`_buildKeboolaPayload`)
+    that already keeps storage identifier separate from display name. The
+    tests below were rewritten to gate the Phase F flow."""
+
+    def test_phase_f_modal_separates_storage_id_from_display_name(
+        self, seeded_app, monkeypatch,
+    ):
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
@ -1567,22 +1593,25 @@ class TestKeboolaModalUsesDiscoveredTableId:
                c.cookies.clear()
            assert resp.status_code == 200, resp.text
            body = resp.text
-            # Hidden field must exist so the JS can stash the bare
-            # storage identifier separately from the display name.
-            assert 'id="regSourceTable"' in body
-            # And the build function must read from that hidden field
-            # (NOT from regTableName, which is the display name).
-            assert "getElementById('regSourceTable').value" in body
+            # Phase F modal owns the Keboola Register flow now.
+            assert 'id="registerKeboolaModal"' in body
+            # The source-table input is NOT the same field as the
+            # human-friendly view name input.
+            assert 'id="kbSourceTable"' in body
+            assert 'id="kbViewName"' in body
+            # The payload builder reads kbSourceTable for the storage
+            # identifier (used in SELECT * FROM kbc."b"."t").
+            assert "getElementById('kbSourceTable').value" in body
        finally:
            reset_cache()

-    def test_template_does_not_send_display_name_as_source_table(
+    def test_legacy_regtablename_payload_path_is_gone(
        self, seeded_app, monkeypatch,
    ):
-        """Regression check: pre-fix the payload had
-        `source_table: document.getElementById('regTableName').value`.
-        After the fix, that exact line must be gone (the build function
-        reads from the hidden `regSourceTable` first)."""
+        """Regression check: the pre-fix payload line
+        `source_table: document.getElementById('regTableName').value`
+        must remain absent. C3 removed the entire legacy modal so no
+        regTableName-based payload can be reintroduced."""
        from app.instance_config import reset_cache
        monkeypatch.setattr(
            "app.instance_config.load_instance_config",
@ -1598,11 +1627,14 @@ class TestKeboolaModalUsesDiscoveredTableId:
            finally:
                c.cookies.clear()
            body = resp.text
-            # No occurrence of the buggy direct assignment.
            assert (
                "source_table: document.getElementById('regTableName').value"
                not in body
            )
+            # C3: the regTableName / regSourceTable inputs themselves are
+            # gone with the legacy modal.
+            assert 'id="regTableName"' not in body
+            assert 'id="regSourceTable"' not in body
        finally:
            reset_cache()

--- a/tests/test_admin_discover_bigquery.py
+++ b/tests/test_admin_discover_bigquery.py
@ -0,0 +1,205 @@
+"""GET /api/admin/discover-tables — BigQuery branch.
+
+Two-step shape: dataset list (no `dataset` query param) → table list (with
+`dataset=name`). The UI populates the dataset autocomplete first, then
+fetches tables only after the operator picks a dataset, avoiding the
+per-dataset `list_tables()` cost on projects with hundreds of datasets.
+"""
+import pytest
+from unittest.mock import MagicMock, patch
+
+from connectors.bigquery.access import BqAccess, BqProjects
+
+
+def _auth(token):
+    return {"Authorization": f"Bearer {token}"}
+
+
+@pytest.fixture
+def bq_instance(monkeypatch):
+    """Force `data_source.type='bigquery'` so the endpoint routes to the
+    BQ branch."""
+    fake_cfg = {
+        "data_source": {
+            "type": "bigquery",
+            "bigquery": {"project": "my-test-project", "location": "us"},
+        },
+    }
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: fake_cfg,
+        raising=False,
+    )
+    from app.instance_config import reset_cache
+    reset_cache()
+    yield fake_cfg
+    reset_cache()
+
+
+def _stub_bq_with_client(client_mock):
+    """Build a BqAccess wired to return `client_mock` from .client(). The
+    duckdb_session_factory is unused by the discover endpoint — supply a
+    no-op."""
+    from contextlib import contextmanager
+    @contextmanager
+    def _noop(_p):
+        yield None
+    return BqAccess(
+        BqProjects(billing="my-test-project", data="my-test-project"),
+        client_factory=lambda _p: client_mock,
+        duckdb_session_factory=_noop,
+    )
+
+
+def test_discover_returns_dataset_list(seeded_app, bq_instance, monkeypatch):
+    """Without `dataset` param: list datasets in the configured project."""
+    client = MagicMock()
+    ds_a = MagicMock()
+    ds_a.dataset_id = "analytics"
+    ds_a.project = "my-test-project"
+    ds_b = MagicMock()
+    ds_b.dataset_id = "raw"
+    ds_b.project = "my-test-project"
+    client.list_datasets.return_value = [ds_a, ds_b]
+
+    monkeypatch.setattr(
+        "connectors.bigquery.access.get_bq_access",
+        lambda: _stub_bq_with_client(client),
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.get("/api/admin/discover-tables", headers=_auth(token))
+    assert r.status_code == 200, r.json()
+    body = r.json()
+    assert body["source"] == "bigquery"
+    assert body["count"] == 2
+    # Sorted alphabetically by dataset_id.
+    assert [d["dataset_id"] for d in body["datasets"]] == ["analytics", "raw"]
+    assert body["datasets"][0]["full_id"] == "my-test-project.analytics"
+
+
+def test_discover_returns_table_list_for_dataset(seeded_app, bq_instance, monkeypatch):
+    """With `?dataset=analytics`: list tables + views in that dataset."""
+    client = MagicMock()
+    t_orders = MagicMock()
+    t_orders.table_id = "orders"
+    t_orders.table_type = "TABLE"
+    t_orders.project = "my-test-project"
+    t_orders.dataset_id = "analytics"
+    t_view = MagicMock()
+    t_view.table_id = "orders_active"
+    t_view.table_type = "VIEW"
+    t_view.project = "my-test-project"
+    t_view.dataset_id = "analytics"
+    client.list_tables.return_value = [t_view, t_orders]  # unsorted
+
+    monkeypatch.setattr(
+        "connectors.bigquery.access.get_bq_access",
+        lambda: _stub_bq_with_client(client),
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.get(
+        "/api/admin/discover-tables?dataset=analytics",
+        headers=_auth(token),
+    )
+    assert r.status_code == 200, r.json()
+    body = r.json()
+    assert body["source"] == "bigquery"
+    assert body["dataset"] == "analytics"
+    assert body["count"] == 2
+    # Sorted by table_id.
+    assert [t["table_id"] for t in body["tables"]] == ["orders", "orders_active"]
+    by_id = {t["table_id"]: t for t in body["tables"]}
+    assert by_id["orders"]["table_type"] == "TABLE"
+    assert by_id["orders_active"]["table_type"] == "VIEW"
+    # Verify dataset filter was passed through.
+    client.list_tables.assert_called_once_with("analytics")
+
+
+def test_discover_keboola_branch_unchanged(seeded_app, monkeypatch):
+    """Negative — when source_type is keboola, BQ logic isn't reached.
+
+    Skipped when the Keboola SDK (`kbcstorage`) is not installed: CI
+    runners don't ship it because the dev container only needs it for
+    instances that actually configure source_type=keboola, and the
+    route's lazy import would fail before the test stub gets a chance
+    to fire. The branch-unchanged contract is tested separately by the
+    Keboola integration suite when the package is present.
+    """
+    pytest.importorskip("kbcstorage")
+    fake_cfg = {"data_source": {"type": "keboola", "keboola": {}}}
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: fake_cfg,
+        raising=False,
+    )
+    from app.instance_config import reset_cache
+    reset_cache()
+
+    # Stub the Keboola client so the test doesn't reach the network.
+    fake_client = MagicMock()
+    fake_client.discover_all_tables.return_value = [{"id": "in.c-foo.bar"}]
+    monkeypatch.setattr(
+        "connectors.keboola.client.KeboolaClient",
+        lambda *a, **kw: fake_client,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    try:
+        r = c.get("/api/admin/discover-tables", headers=_auth(token))
+        assert r.status_code == 200, r.json()
+        body = r.json()
+        assert body["source"] == "keboola"
+        assert body["count"] == 1
+    finally:
+        reset_cache()
+
+
+def test_discover_bq_not_configured_returns_500(seeded_app, monkeypatch):
+    """When data_source.bigquery.project is missing, BqAccess returns its
+    not_configured sentinel — endpoint surfaces the structured error."""
+    fake_cfg = {
+        "data_source": {
+            "type": "bigquery",
+            "bigquery": {},  # no project
+        },
+    }
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: fake_cfg,
+        raising=False,
+    )
+    from app.instance_config import reset_cache
+    reset_cache()
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    try:
+        r = c.get("/api/admin/discover-tables", headers=_auth(token))
+        # not_configured is mapped to 500 in BqAccessError.HTTP_STATUS.
+        assert r.status_code == 500, r.json()
+        detail = r.json().get("detail", {})
+        assert detail.get("kind") == "not_configured"
+    finally:
+        reset_cache()
+
+
+def test_admin_tables_html_wires_discover_buttons(seeded_app, bq_instance):
+    """Structural — the BQ register modal in the rendered HTML now has the
+    Discover (datasets) and List tables buttons + datalists wired to the
+    endpoint."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.get("/admin/tables", headers=_auth(token))
+    assert r.status_code == 200, r.text
+    html = r.text
+    assert "discoverBqDatasets" in html
+    assert "discoverBqTables" in html
+    assert 'id="bqDatasetList"' in html
+    assert 'id="bqTableList"' in html
+    assert "list=\"bqDatasetList\"" in html
+    assert "list=\"bqTableList\"" in html
--- a/tests/test_admin_keboola_materialized.py
+++ b/tests/test_admin_keboola_materialized.py
@ -0,0 +1,95 @@
+"""Tests for Keboola materialized registration."""
+import pytest
+
+
+def test_register_keboola_materialized_accepts_source_query(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "orders_recent",
+            "source_type": "keboola",
+            "query_mode": "materialized",
+            "source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'",
+            "sync_schedule": "daily 03:00",
+        },
+    )
+    assert r.status_code == 201, r.text
+
+
+def test_register_keboola_materialized_rejects_missing_source_query(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "orders_recent",
+            "source_type": "keboola",
+            "query_mode": "materialized",
+            # source_query missing
+        },
+    )
+    assert r.status_code == 422
+    assert "source_query" in r.text
+
+
+def test_register_keboola_materialized_skips_bucket_check(seeded_app):
+    """Materialized rows don't need bucket/source_table — the SELECT inlines
+    the references. Mirror of BQ materialized validator behavior."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "x",
+            "source_type": "keboola",
+            "query_mode": "materialized",
+            "source_query": "SELECT 1",
+            # No bucket / source_table — must still succeed.
+        },
+    )
+    assert r.status_code == 201, r.text
+
+
+def test_update_keboola_materialized_clears_stale_source_query_on_mode_switch(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+
+    # Register materialized.
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "x",
+            "source_type": "keboola",
+            "query_mode": "materialized",
+            "source_query": "SELECT 1",
+        },
+    )
+    assert r.status_code == 201
+
+    # PUT to switch back to local — source_query must clear.
+    r = c.put(
+        "/api/admin/registry/x",
+        headers=auth,
+        json={
+            "source_type": "keboola",
+            "query_mode": "local",
+            "bucket": "in.c-foo",
+            "source_table": "y",
+        },
+    )
+    assert r.status_code == 200
+
+    r = c.get("/api/admin/registry", headers=auth)
+    rows = r.json()["tables"]
+    row = next(t for t in rows if t["id"] == "x")
+    assert row.get("source_query") in (None, "")
--- a/tests/test_admin_phase_c_deprecation.py
+++ b/tests/test_admin_phase_c_deprecation.py
@ -0,0 +1,70 @@
+"""Verify Phase C deprecation marks + profile_after_sync becomes inert."""
+import pytest
+from app.api.admin import RegisterTableRequest, UpdateTableRequest
+
+
+def test_register_request_marks_sync_strategy_deprecated():
+    schema = RegisterTableRequest.model_json_schema()
+    field = schema["properties"]["sync_strategy"]
+    assert field.get("deprecated") is True
+
+
+def test_register_request_marks_profile_after_sync_deprecated():
+    schema = RegisterTableRequest.model_json_schema()
+    field = schema["properties"]["profile_after_sync"]
+    assert field.get("deprecated") is True
+
+
+def test_register_endpoint_accepts_profile_after_sync_for_backcompat(seeded_app):
+    """External clients sending profile_after_sync get no error — the
+    field is silently ignored."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "x",
+            "source_type": "keboola",
+            "bucket": "in.c-foo",
+            "source_table": "y",
+            "query_mode": "local",
+            "profile_after_sync": True,  # legacy client may send this
+        },
+    )
+    assert r.status_code == 201
+
+
+def test_register_endpoint_does_not_persist_profile_after_sync(seeded_app):
+    """The persisted row no longer carries the old profile_after_sync
+    value (column may still exist in DB for back-compat, but admin path
+    never writes a non-default value)."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+    r = c.post(
+        "/api/admin/register-table",
+        headers=auth,
+        json={
+            "name": "y",
+            "source_type": "keboola",
+            "bucket": "in.c-foo",
+            "source_table": "y",
+            "query_mode": "local",
+            "profile_after_sync": True,
+        },
+    )
+    assert r.status_code == 201
+    r = c.get("/api/admin/registry", headers=auth)
+    rows = r.json()["tables"]
+    row = next(t for t in rows if t["id"] == "y")
+    # The field's value in the registry response is now whatever the DB
+    # default is (True per current schema). Critical: the request value
+    # is NOT echoed back.
+    # If the value is in the response at all (legacy back-compat in the
+    # GET serializer), it's the schema default, not the request value.
+    # If the value is absent (deprecated and stripped), that's also fine.
+    if "profile_after_sync" in row:
+        # Whatever this is, it's the schema default, not request-driven.
+        assert row["profile_after_sync"] is True or row["profile_after_sync"] is None
--- a/tests/test_admin_put_preservation.py
+++ b/tests/test_admin_put_preservation.py
@ -0,0 +1,67 @@
+"""Regression guard for PUT field preservation.
+
+Locks the Pydantic semantics that the Phase F form-cleanup relies on:
+when the Edit modal omits a field from its payload, the existing value
+must survive. If a future maintainer flips ``model_dump()`` to
+``exclude_unset=True`` or otherwise changes the partial-update semantics,
+these tests fire before partitioned rows or primary keys silently
+regress.
+"""
+import pytest
+
+
+def _auth(token):
+    return {"Authorization": f"Bearer {token}"}
+
+
+def test_put_preserves_omitted_sync_strategy(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = _auth(token)
+
+    r = c.post("/api/admin/register-table", headers=auth, json={
+        "name": "events_partitioned",
+        "source_type": "keboola",
+        "bucket": "in.c-events",
+        "source_table": "events",
+        "query_mode": "local",
+        "sync_strategy": "partitioned",
+    })
+    assert r.status_code == 201, r.text
+
+    r = c.put("/api/admin/registry/events_partitioned", headers=auth, json={
+        "sync_schedule": "daily 03:00",
+        "description": "now daily",
+    })
+    assert r.status_code == 200
+
+    r = c.get("/api/admin/registry", headers=auth)
+    rows = r.json()["tables"]
+    row = next(t for t in rows if t["id"] == "events_partitioned")
+    assert row["sync_strategy"] == "partitioned"
+
+
+def test_put_preserves_omitted_primary_key(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = _auth(token)
+
+    r = c.post("/api/admin/register-table", headers=auth, json={
+        "name": "orders_with_pk",
+        "source_type": "keboola",
+        "bucket": "in.c-shop",
+        "source_table": "orders",
+        "query_mode": "local",
+        "primary_key": ["order_id", "tenant_id"],
+    })
+    assert r.status_code == 201, r.text
+
+    r = c.put("/api/admin/registry/orders_with_pk", headers=auth, json={
+        "description": "shop orders",
+    })
+    assert r.status_code == 200
+
+    r = c.get("/api/admin/registry", headers=auth)
+    rows = r.json()["tables"]
+    row = next(t for t in rows if t["id"] == "orders_with_pk")
+    assert row["primary_key"] == ["order_id", "tenant_id"]
--- a/tests/test_api_admin_materialized.py
+++ b/tests/test_api_admin_materialized.py
@ -0,0 +1,322 @@
+"""Admin API accepts source_query when query_mode='materialized', rejects
+mismatches between mode and query field.
+
+Tests that hit the remote-mode register path require `stub_bq_extractor`
+to bypass the post-register rebuild's real-BQ traffic. Materialized-only
+tests skip the BG path (the 201 fast-path returns before any rebuild
+fires) so they don't need the stub.
+
+Covers PR #145 (re-implementation against 0.24.0 base):
+- RegisterTableRequest + UpdateTableRequest model_validators
+- _validate_bigquery_register_payload materialized branch (skips bucket/
+  source_table checks, requires source_query)
+- register_table 201 response for materialized BQ rows (no synchronous
+  materialize — cron tick or manual /api/sync/trigger picks them up)
+- update_table clears stale source_query when switching mode away from
+  materialized
+
+Shares the seeded_app + bq_instance fixtures from conftest /
+test_admin_bq_register.py for parity with the existing BQ test surface.
+"""
+from unittest.mock import MagicMock
+
+import pytest
+
+
+def _auth(token):
+    return {"Authorization": f"Bearer {token}"}
+
+
+@pytest.fixture
+def stub_bq_extractor(monkeypatch):
+    """Mirror tests/test_admin_bq_register.py — bypasses real-BQ traffic
+    in the post-register rebuild path so the test stays offline. Required
+    whenever the test seeds a remote-mode BQ row via the HTTP API."""
+    rebuild_mock = MagicMock(return_value={
+        "project_id": "my-test-project",
+        "tables_registered": 1, "errors": [], "skipped": False,
+    })
+    monkeypatch.setattr(
+        "connectors.bigquery.extractor.rebuild_from_registry",
+        rebuild_mock,
+    )
+    monkeypatch.setattr(
+        "src.orchestrator.SyncOrchestrator",
+        lambda *a, **kw: MagicMock(),
+    )
+    return rebuild_mock
+
+
+@pytest.fixture
+def bq_instance(monkeypatch):
+    """Force instance.yaml to look like a BigQuery deployment.
+
+    Mirrors tests/test_admin_bq_register.py::bq_instance so the
+    project_id read inside _validate_bigquery_register_payload succeeds.
+    """
+    fake_cfg = {
+        "data_source": {
+            "type": "bigquery",
+            "bigquery": {"project": "my-test-project", "location": "us"},
+        },
+    }
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: fake_cfg,
+        raising=False,
+    )
+    from app.instance_config import reset_cache
+    reset_cache()
+    yield fake_cfg
+    reset_cache()
+
+
+def _materialized_payload(**overrides):
+    p = {
+        "name": "orders_90d",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "source_query": "SELECT date FROM `prj.ds.orders`",
+        "sync_schedule": "every 6h",
+    }
+    p.update(overrides)
+    return p
+
+
+def test_register_materialized_requires_source_query(seeded_app, bq_instance):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json={
+            "name": "missing_query",
+            "source_type": "bigquery",
+            "query_mode": "materialized",
+            # source_query missing
+        },
+        headers=_auth(token),
+    )
+    assert 400 <= r.status_code < 500, r.json()
+    detail = str(r.json().get("detail", "")).lower()
+    assert "source_query" in detail or "materialized" in detail
+
+
+def test_register_materialized_accepts_source_query(seeded_app, bq_instance):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json=_materialized_payload(name="orders_90d_a"),
+        headers=_auth(token),
+    )
+    assert r.status_code == 201, r.json()
+    body = r.json()
+    assert body["status"] == "registered"
+    assert "Materialized" in body.get("message", "")
+
+
+def test_register_remote_rejects_source_query(seeded_app, bq_instance):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json={
+            "name": "live_orders",
+            "source_type": "bigquery",
+            "bucket": "analytics",
+            "source_table": "orders",
+            "query_mode": "remote",
+            "source_query": "SELECT 1",
+        },
+        headers=_auth(token),
+    )
+    assert 400 <= r.status_code < 500, r.json()
+
+
+def test_register_local_rejects_source_query(seeded_app):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json={
+            "name": "kbc_orders",
+            "source_type": "keboola",
+            "query_mode": "local",
+            "source_query": "SELECT 1",
+        },
+        headers=_auth(token),
+    )
+    assert 400 <= r.status_code < 500, r.json()
+
+
+def test_register_materialized_with_empty_source_query_rejected(seeded_app, bq_instance):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json=_materialized_payload(name="empty_q", source_query=""),
+        headers=_auth(token),
+    )
+    assert 400 <= r.status_code < 500, r.json()
+
+
+def test_update_source_query_alone_requires_query_mode(seeded_app, bq_instance, stub_bq_extractor):
+    """PUT body with source_query but no query_mode is incoherent — reject
+    so non-materialized rows can't carry an orphan source_query."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    # Seed a remote-mode row.
+    r = c.post(
+        "/api/admin/register-table",
+        json={
+            "name": "live_orphan",
+            "source_type": "bigquery",
+            "bucket": "analytics",
+            "source_table": "orders",
+            "query_mode": "remote",
+        },
+        headers=_auth(token),
+    )
+    assert r.status_code in (200, 202), r.json()  # synchronous or async
+    table_id = r.json()["id"]
+
+    r2 = c.put(
+        f"/api/admin/registry/{table_id}",
+        json={"source_query": "SELECT 1"},
+        headers=_auth(token),
+    )
+    assert 400 <= r2.status_code < 500, r2.json()
+
+
+def test_update_schedule_only_on_materialized_row_succeeds(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    """REGRESSION (Devin BUG_0002 on 2219255): an admin editing only the
+    sync_schedule of a materialized row sends `{query_mode: 'materialized',
+    sync_schedule: '...'}` (the Edit modal always sends query_mode for BQ
+    rows). Pre-fix the UpdateTableRequest validator rejected this with 422
+    because source_query wasn't in the body — even though the existing row
+    already had one.
+
+    The PUT semantics overlay the body on the existing row, so omitted
+    source_query keeps the stored value. The synthetic RegisterTableRequest
+    constructed against the merged record at the handler still runs the
+    strict cross-field check, so the truly-broken case (materialized
+    without ANY source_query, even on existing) is still caught."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    # Seed a materialized row with a real source_query.
+    r = c.post("/api/admin/register-table", json={
+        "name": "schedule_edit_target",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "source_query": "SELECT 1",
+        "sync_schedule": "every 1h",
+    }, headers=_auth(token))
+    assert r.status_code == 201, r.json()
+    table_id = r.json()["id"]
+
+    # Edit ONLY the schedule. UI's saveTableEdit sends query_mode for BQ
+    # rows even when the operator didn't change it.
+    r2 = c.put(f"/api/admin/registry/{table_id}", json={
+        "query_mode": "materialized",
+        "sync_schedule": "every 12h",
+    }, headers=_auth(token))
+    assert r2.status_code == 200, r2.json()
+
+    # Verify the schedule changed and source_query survived.
+    r3 = c.get("/api/admin/registry", headers=_auth(token))
+    row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
+    assert row is not None
+    assert row["sync_schedule"] == "every 12h"
+    assert row["source_query"] == "SELECT 1"  # preserved across edit
+    assert row["query_mode"] == "materialized"
+
+
+def test_update_materialized_with_explicit_empty_source_query_rejected(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    """The fix above relaxes the validator for OMITTED source_query, but
+    explicitly setting it to an empty / whitespace string while claiming
+    materialized is still a typo and must be rejected (not silently
+    persisted as NULL)."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    r = c.post("/api/admin/register-table", json={
+        "name": "explicit_empty",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "source_query": "SELECT 1",
+    }, headers=_auth(token))
+    assert r.status_code == 201, r.json()
+    table_id = r.json()["id"]
+
+    r2 = c.put(f"/api/admin/registry/{table_id}", json={
+        "query_mode": "materialized",
+        "source_query": "",  # explicitly empty
+    }, headers=_auth(token))
+    assert 400 <= r2.status_code < 500, r2.json()
+
+
+def test_update_materialized_to_remote_clears_source_query(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    """When admin switches a materialized table to remote/local, the stale
+    source_query must be cleared in the DB — otherwise the registry shows
+    a non-materialized row carrying an orphan SQL body."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    # Seed a materialized table with a source_query.
+    r = c.post(
+        "/api/admin/register-table",
+        json=_materialized_payload(name="switcher"),
+        headers=_auth(token),
+    )
+    assert r.status_code == 201, r.json()
+    table_id = r.json()["id"]
+
+    # Switch to remote — must include bucket+source_table for the new mode
+    # (the merged validator runs the BQ payload check on the merged record).
+    r2 = c.put(
+        f"/api/admin/registry/{table_id}",
+        json={
+            "query_mode": "remote",
+            "bucket": "analytics",
+            "source_table": "orders_90d",
+        },
+        headers=_auth(token),
+    )
+    assert r2.status_code == 200, r2.json()
+
+    # Verify in the registry: query_mode flipped, source_query cleared.
+    r3 = c.get("/api/admin/registry", headers=_auth(token))
+    assert r3.status_code == 200, r3.json()
+    row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
+    assert row is not None, f"Table {table_id} not found in registry"
+    assert row["query_mode"] == "remote"
+    assert row["source_query"] in (None, "")
+
+
+def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_instance):
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    r = c.post(
+        "/api/admin/register-table",
+        json=_materialized_payload(
+            name="persist_q",
+            source_query="SELECT col FROM `prj.ds.t` WHERE x = 1",
+        ),
+        headers=_auth(token),
+    )
+    assert r.status_code == 201, r.json()
+    table_id = r.json()["id"]
+
+    r2 = c.get("/api/admin/registry", headers=_auth(token))
+    row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None)
+    assert row is not None
+    assert row["query_mode"] == "materialized"
+    assert "WHERE x = 1" in row["source_query"]
--- a/tests/test_bq_cost_guardrail.py
+++ b/tests/test_bq_cost_guardrail.py
@ -0,0 +1,137 @@
+"""materialize_query refuses to run when dry-run estimate exceeds the cap.
+
+The cap is wired through `data_source.bigquery.max_bytes_per_materialize`
+(read by the trigger pass; default 10 GiB; set 0 to disable). The dry-run
+itself reuses `app.api.v2_scan._bq_dry_run_bytes` so cost-estimate logic
+lives in exactly one place. Fail-open behaviour (DuckDB-syntax SQL the
+native BQ client can't parse → estimate=0 → COPY proceeds with a warning)
+is documented and exercised here too.
+"""
+import duckdb
+import pytest
+from contextlib import contextmanager
+from unittest.mock import MagicMock, patch
+
+from connectors.bigquery.access import BqAccess, BqProjects
+from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
+
+
+def _bq_with_seed(tables: dict[str, str] | None = None) -> BqAccess:
+    """Stub BqAccess seeded with in-memory tables (same recipe as
+    test_bq_materialize)."""
+    tables = tables or {}
+
+    @contextmanager
+    def _session(_projects):
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute("ATTACH ':memory:' AS bq")
+            for s in {ref.rsplit(".", 1)[0] for ref in tables}:
+                conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}")
+            for ref, body in tables.items():
+                conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}")
+            yield conn
+        finally:
+            conn.close()
+
+    return BqAccess(
+        BqProjects(billing="test-billing", data="test-data"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session,
+    )
+
+
+def test_refuses_when_estimate_exceeds_cap(tmp_path):
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+
+    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
+
+    with patch(
+        "app.api.v2_scan._bq_dry_run_bytes", return_value=100 * 2**30
+    ):
+        with pytest.raises(MaterializeBudgetError) as exc:
+            materialize_query(
+                table_id="huge",
+                sql="SELECT * FROM bq.test.tiny",
+                bq=bq,
+                output_dir=str(out),
+                max_bytes=10 * 2**30,
+            )
+    err = exc.value
+    assert err.table_id == "huge"
+    assert err.current == 100 * 2**30
+    assert err.limit == 10 * 2**30
+
+
+def test_proceeds_when_estimate_under_cap(tmp_path):
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+
+    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
+
+    with patch("app.api.v2_scan._bq_dry_run_bytes", return_value=1024):
+        stats = materialize_query(
+            table_id="tiny",
+            sql="SELECT * FROM bq.test.tiny",
+            bq=bq,
+            output_dir=str(out),
+            max_bytes=10 * 2**30,
+        )
+    assert stats["rows"] == 1
+
+
+def test_no_cap_skips_dry_run(tmp_path):
+    """When max_bytes=None (default), no dry-run is performed."""
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
+
+    with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry:
+        stats = materialize_query(
+            table_id="t1",
+            sql="SELECT * FROM bq.test.tiny",
+            bq=bq,
+            output_dir=str(out),
+        )
+    mock_dry.assert_not_called()
+    assert stats["rows"] == 1
+
+
+def test_zero_max_bytes_skips_dry_run(tmp_path):
+    """Sentinel: max_bytes=0 disables the guardrail (config docs)."""
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
+
+    with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry:
+        stats = materialize_query(
+            table_id="t1",
+            sql="SELECT * FROM bq.test.tiny",
+            bq=bq,
+            output_dir=str(out),
+            max_bytes=0,
+        )
+    mock_dry.assert_not_called()
+    assert stats["rows"] == 1
+
+
+def test_dry_run_failure_is_fail_open(tmp_path):
+    """If the dry-run errors (DuckDB syntax, missing google lib, transient
+    upstream failure) we don't block — log + proceed with COPY. Operators
+    who need hard-fail watch logs for the warning."""
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
+
+    with patch(
+        "app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
+    ):
+        stats = materialize_query(
+            table_id="t1",
+            sql="SELECT * FROM bq.test.tiny",
+            bq=bq,
+            output_dir=str(out),
+            max_bytes=10 * 2**30,
+        )
+    assert stats["rows"] == 1
--- a/tests/test_bq_init_extract_skips.py
+++ b/tests/test_bq_init_extract_skips.py
@ -0,0 +1,98 @@
+"""init_extract skips rows with query_mode='materialized'.
+
+Materialized rows are written by the sync trigger pass via
+`materialize_query()`; they live as parquets in /data/extracts/bigquery/data/
+and surface via the orchestrator's standard local-parquet discovery.
+Creating a remote view in extract.duckdb for the same name would shadow
+the parquet via cross-source name collision.
+
+Pattern matches `tests/test_bigquery_extractor.py::TestViewVsTableTemplates`
+(uses `_CapturingProxy` to wrap a real DuckDB conn and stub BQ-specific calls).
+"""
+import duckdb
+from unittest.mock import MagicMock
+
+
+class _CapturingProxy:
+    """Wraps a real DuckDB connection, captures SQL, stubs BQ-specific calls.
+
+    DuckDBPyConnection.execute is C-level read-only, so we wrap rather than
+    monkey-patch. Shape lifted directly from tests/test_bigquery_extractor.py
+    to keep stub behavior consistent across the BQ test suite.
+    """
+
+    def __init__(self, real_conn, captured: list):
+        self._real = real_conn
+        self._captured = captured
+
+    def execute(self, sql, *args, **kwargs):
+        self._captured.append(sql)
+        stripped_u = sql.strip().upper()
+        if stripped_u.startswith(("INSTALL ", "LOAD ", "CREATE SECRET")):
+            return MagicMock()
+        if stripped_u.startswith("ATTACH ") and "BIGQUERY" in stripped_u:
+            return MagicMock()
+        if stripped_u.startswith("DETACH "):
+            return MagicMock()
+        if 'FROM bq.' in sql or 'FROM bigquery_query' in sql:
+            return MagicMock()
+        return self._real.execute(sql, *args, **kwargs)
+
+    def close(self):
+        return self._real.close()
+
+    def __getattr__(self, name):
+        return getattr(self._real, name)
+
+
+def test_init_extract_skips_materialized_rows(tmp_path, monkeypatch):
+    """A registry mix of remote + materialized rows: only the remote row
+    gets a `_meta` entry; the materialized row is silently skipped."""
+    from connectors.bigquery.extractor import init_extract
+
+    monkeypatch.setattr(
+        "connectors.bigquery.extractor.get_metadata_token",
+        lambda: "test-token",
+    )
+    monkeypatch.setattr(
+        "connectors.bigquery.extractor._detect_table_type",
+        lambda *a, **kw: "BASE TABLE",
+    )
+
+    captured: list[str] = []
+    real_connect = duckdb.connect
+
+    def spy_connect(*a, **kw):
+        return _CapturingProxy(real_connect(*a, **kw), captured)
+
+    monkeypatch.setattr(
+        "connectors.bigquery.extractor.duckdb.connect", spy_connect
+    )
+
+    configs = [
+        {
+            "name": "live_orders", "bucket": "dset", "source_table": "live",
+            "query_mode": "remote", "description": "",
+        },
+        {
+            "name": "agg_90d", "bucket": "dset", "source_table": "live",
+            "query_mode": "materialized",
+            "source_query": "SELECT 1",
+            "description": "",
+        },
+    ]
+    stats = init_extract(str(tmp_path), "test-project", configs)
+
+    db_path = tmp_path / "extract.duckdb"
+    assert db_path.exists(), "extract.duckdb should be written"
+
+    db = duckdb.connect(str(db_path))
+    meta = db.execute(
+        "SELECT table_name, query_mode FROM _meta ORDER BY table_name"
+    ).fetchall()
+    db.close()
+
+    assert meta == [("live_orders", "remote")]
+    assert stats["tables_registered"] == 1
+    # No CREATE VIEW for the materialized row
+    assert not any("agg_90d" in s for s in captured if "CREATE" in s.upper())
--- a/tests/test_bq_materialize.py
+++ b/tests/test_bq_materialize.py
@ -0,0 +1,139 @@
+"""BigQuery `materialize_query` writes parquet via BqAccess + DuckDB COPY.
+
+The function takes a `BqAccess` instance so the BQ extension session and
+SECRET token live in one place across the codebase (cf. `v2_scan` / `v2_sample`
+/ `v2_schema`). Tests inject a stub BqAccess whose `duckdb_session()` yields
+an in-memory connection with a pre-attached `bq` catalog containing fixture
+tables, exercising the COPY path end-to-end without any GCP traffic.
+"""
+import duckdb
+import pytest
+from contextlib import contextmanager
+from pathlib import Path
+from unittest.mock import MagicMock
+
+from connectors.bigquery.access import BqAccess, BqProjects
+from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
+
+
+def _make_stub_bq(tables: dict[str, str] | None = None) -> BqAccess:
+    """Return a BqAccess wired to factories that yield an in-memory DuckDB
+    with a pretend `bq` catalog containing test tables. `tables` maps
+    DuckDB-three-part references like `'bq.test.orders'` to a SELECT
+    expression to seed them with.
+    """
+    tables = tables or {}
+
+    @contextmanager
+    def _session(_projects):
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute("ATTACH ':memory:' AS bq")
+            schemas = {ref.rsplit(".", 1)[0] for ref in tables}
+            for s in schemas:
+                conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}")
+            for ref, body in tables.items():
+                conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}")
+            yield conn
+        finally:
+            conn.close()
+
+    # client_factory returns a stub whose .query(sql, job_config=...) yields
+    # a job whose .total_bytes_processed defaults to 0 (fail-open).
+    def _client(_projects):
+        client = MagicMock()
+        job = MagicMock()
+        job.total_bytes_processed = 0
+        client.query.return_value = job
+        return client
+
+    return BqAccess(
+        BqProjects(billing="test-billing", data="test-data"),
+        client_factory=_client,
+        duckdb_session_factory=_session,
+    )
+
+
+def test_materialize_writes_parquet_and_returns_stats(tmp_path):
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+
+    bq = _make_stub_bq({
+        "bq.test.orders": (
+            "SELECT 'EU' AS region, 100 AS revenue UNION ALL "
+            "SELECT 'US' AS region, 250 AS revenue"
+        )
+    })
+
+    stats = materialize_query(
+        table_id="orders_summary",
+        sql="SELECT region, SUM(revenue) AS revenue FROM bq.test.orders GROUP BY 1",
+        bq=bq,
+        output_dir=str(out),
+    )
+
+    parquet_path = out / "data" / "orders_summary.parquet"
+    assert parquet_path.exists()
+    assert stats["rows"] == 2
+    assert stats["size_bytes"] > 0
+    assert stats["query_mode"] == "materialized"
+
+    # Parquet readable end-to-end
+    rows = duckdb.connect().execute(
+        f"SELECT region, revenue FROM read_parquet('{parquet_path}') ORDER BY region"
+    ).fetchall()
+    assert rows == [("EU", 100), ("US", 250)]
+
+
+def test_materialize_atomic_on_failure(tmp_path):
+    """Bad SQL must not leave a half-written parquet behind."""
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    parquet_path = out / "data" / "broken.parquet"
+
+    bq = _make_stub_bq({"bq.test.orders": "SELECT 1 AS n"})
+
+    with pytest.raises(Exception):
+        materialize_query(
+            table_id="broken",
+            sql="SELECT * FROM bq.test.does_not_exist",
+            bq=bq,
+            output_dir=str(out),
+        )
+    assert not parquet_path.exists()
+    # Tmp also cleaned
+    assert not (out / "data" / "broken.parquet.tmp").exists()
+
+
+def test_materialize_rejects_unsafe_table_id(tmp_path):
+    """table_id becomes the parquet filename — block path traversal up front."""
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    bq = _make_stub_bq()
+
+    with pytest.raises(ValueError, match="unsafe"):
+        materialize_query(
+            table_id="../etc/passwd",
+            sql="SELECT 1",
+            bq=bq,
+            output_dir=str(out),
+        )
+
+
+def test_materialize_overwrites_existing_parquet(tmp_path):
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+    bq = _make_stub_bq({"bq.test.tiny": "SELECT 1 AS n"})
+
+    materialize_query(
+        table_id="t1", sql="SELECT 1 AS n",
+        bq=bq, output_dir=str(out),
+    )
+    materialize_query(
+        table_id="t1", sql="SELECT 2 AS n",
+        bq=bq, output_dir=str(out),
+    )
+    rows = duckdb.connect().execute(
+        f"SELECT n FROM read_parquet('{out}/data/t1.parquet')"
+    ).fetchall()
+    assert rows == [(2,)]
--- a/tests/test_cli_admin_materialized.py
+++ b/tests/test_cli_admin_materialized.py
@ -0,0 +1,157 @@
+"""`da admin register-table --query-mode materialized --query @file.sql`
+sends source_query in the payload; existing local/remote paths still work
+unchanged."""
+from typer.testing import CliRunner
+from unittest.mock import MagicMock
+
+from cli.main import app
+
+
+def _fake_resp(status_code, body=None):
+    resp = MagicMock()
+    resp.status_code = status_code
+    resp.json = lambda: body or {"id": "x", "name": "x", "status": "registered"}
+    return resp
+
+
+def test_register_materialized_with_inline_query(monkeypatch):
+    captured = {}
+
+    def fake_post(path, json):
+        captured["path"] = path
+        captured["json"] = json
+        return _fake_resp(201)
+
+    monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
+
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "orders_90d",
+        "--source-type", "bigquery",
+        "--query-mode", "materialized",
+        "--query", "SELECT date FROM `prj.ds.orders`",
+        "--sync-schedule", "every 6h",
+    ])
+
+    assert result.exit_code == 0, result.stdout
+    assert captured["path"] == "/api/admin/register-table"
+    assert captured["json"]["query_mode"] == "materialized"
+    assert captured["json"]["source_query"] == "SELECT date FROM `prj.ds.orders`"
+    assert captured["json"]["sync_schedule"] == "every 6h"
+
+
+def test_register_materialized_reads_query_from_file(tmp_path, monkeypatch):
+    sql_file = tmp_path / "orders.sql"
+    sql_file.write_text(
+        "SELECT date, SUM(revenue) FROM `prj.ds.orders` GROUP BY 1\n"
+    )
+
+    captured = {}
+
+    def fake_post(path, json):
+        captured["json"] = json
+        return _fake_resp(201)
+
+    monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
+
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "orders_90d",
+        "--source-type", "bigquery",
+        "--query-mode", "materialized",
+        "--query", f"@{sql_file}",
+        "--sync-schedule", "daily 03:00",
+    ])
+
+    assert result.exit_code == 0, result.stdout
+    assert "SELECT date, SUM(revenue)" in captured["json"]["source_query"]
+    assert not captured["json"]["source_query"].endswith("\n")
+
+
+def test_register_materialized_without_query_fails(monkeypatch):
+    """--query-mode materialized without --query is a client-side error,
+    no API call made."""
+    called = {"count": 0}
+
+    def fake_post(*args, **kwargs):
+        called["count"] += 1
+        return _fake_resp(201)
+
+    monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
+
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "orders_90d",
+        "--source-type", "bigquery",
+        "--query-mode", "materialized",
+    ])
+
+    assert result.exit_code != 0
+    assert called["count"] == 0
+    combined = result.stdout + (result.stderr or "")
+    assert "--query" in combined
+
+
+def test_register_local_mode_does_not_send_source_query(monkeypatch):
+    """Default local mode shouldn't send source_query — server-side
+    validator forbids it on local."""
+    captured = {}
+
+    def fake_post(path, json):
+        captured["json"] = json
+        return _fake_resp(201)
+
+    monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
+
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "kbc_orders",
+        "--source-type", "keboola",
+        "--bucket", "in.c-crm",
+    ])
+
+    assert result.exit_code == 0
+    assert "source_query" not in captured["json"]
+    assert "sync_schedule" not in captured["json"]
+
+
+def test_register_query_at_path_missing_file_fails(monkeypatch):
+    """@file.sql where the file doesn't exist surfaces a clear error."""
+    monkeypatch.setattr(
+        "cli.commands.admin.api_post", lambda *a, **kw: _fake_resp(201),
+    )
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "x",
+        "--source-type", "bigquery",
+        "--query-mode", "materialized",
+        "--query", "@/tmp/definitely-does-not-exist-9b4f7e2c.sql",
+    ])
+    assert result.exit_code != 0
+
+
+def test_register_remote_path_unchanged(monkeypatch):
+    """The pre-existing --bucket / --source-table / --query-mode remote
+    flow still works without --query."""
+    captured = {}
+
+    def fake_post(path, json):
+        captured["json"] = json
+        return _fake_resp(200)
+
+    monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
+
+    runner = CliRunner()
+    result = runner.invoke(app, [
+        "admin", "register-table", "live_orders",
+        "--source-type", "bigquery",
+        "--bucket", "analytics",
+        "--source-table", "orders",
+        "--query-mode", "remote",
+    ])
+
+    assert result.exit_code == 0
+    assert captured["json"]["query_mode"] == "remote"
+    assert "source_query" not in captured["json"]
+    assert captured["json"]["bucket"] == "analytics"
+    assert captured["json"]["source_table"] == "orders"
--- a/tests/test_db_migration_v20.py
+++ b/tests/test_db_migration_v20.py
@ -0,0 +1,77 @@
+"""v20 adds source_query column to table_registry.
+
+Backs query_mode='materialized' for BigQuery: admin registers a SQL body
+that the scheduler runs through the DuckDB BQ extension and writes as a
+parquet to /data/extracts/bigquery/data/<id>.parquet.
+
+The v19 step (#150) drops dataset_permissions, access_requests tables and
+users.role, table_registry.is_public columns; v20 then ALTERs the post-v19
+table_registry to add the source_query column.
+"""
+import duckdb
+
+from src.db import SCHEMA_VERSION, _ensure_schema, get_schema_version
+
+
+def test_schema_version_is_20():
+    assert SCHEMA_VERSION == 20
+
+
+def test_v20_adds_source_query(tmp_path):
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    _ensure_schema(conn)
+
+    cols = {
+        r[0] for r in conn.execute(
+            "SELECT column_name FROM information_schema.columns "
+            "WHERE table_name = 'table_registry'"
+        ).fetchall()
+    }
+    assert "source_query" in cols, f"source_query missing from {cols}"
+    assert get_schema_version(conn) == 20
+    conn.close()
+
+
+def test_v19_db_migrates_to_v20(tmp_path):
+    """Pre-existing v19 DB (post-RBAC-drop) without source_query upgrades
+    cleanly without losing data."""
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+
+    # Simulate a v19 DB at minimal but realistic shape: schema_version row +
+    # a table_registry row in the post-v19 column shape (no is_public column,
+    # since v19 finalize dropped it via the table-rebuild idiom).
+    conn.execute(
+        "CREATE TABLE schema_version (version INTEGER, "
+        "applied_at TIMESTAMP DEFAULT current_timestamp)"
+    )
+    conn.execute("INSERT INTO schema_version (version) VALUES (19)")
+    conn.execute("""CREATE TABLE table_registry (
+        id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL,
+        source_type VARCHAR, bucket VARCHAR, source_table VARCHAR,
+        sync_strategy VARCHAR DEFAULT 'full_refresh',
+        query_mode VARCHAR DEFAULT 'local',
+        sync_schedule VARCHAR, profile_after_sync BOOLEAN DEFAULT true,
+        primary_key VARCHAR, folder VARCHAR, description TEXT,
+        registered_by VARCHAR,
+        registered_at TIMESTAMP DEFAULT current_timestamp
+    )""")
+    conn.execute("INSERT INTO table_registry (id, name) VALUES ('foo', 'foo')")
+
+    _ensure_schema(conn)
+
+    assert get_schema_version(conn) == 20
+    cols = {
+        r[0] for r in conn.execute(
+            "SELECT column_name FROM information_schema.columns "
+            "WHERE table_name = 'table_registry'"
+        ).fetchall()
+    }
+    assert "source_query" in cols
+    # Existing row preserved, new column NULL
+    row = conn.execute(
+        "SELECT id, source_query FROM table_registry WHERE id='foo'"
+    ).fetchone()
+    assert row == ("foo", None)
+    conn.close()
--- a/tests/test_keboola_access.py
+++ b/tests/test_keboola_access.py
@ -0,0 +1,75 @@
+"""Tests for KeboolaAccess facade."""
+import os
+import pytest
+from connectors.keboola.access import KeboolaAccess
+
+
+def test_access_session_yields_attached_duckdb(tmp_path, monkeypatch):
+    """Mock-mode test: the facade should accept a token, install+load
+    the Keboola extension, and ATTACH it as 'kbc'. We verify the SQL
+    issued by intercepting the duckdb.connect call.
+    """
+    issued = []
+    class FakeConn:
+        def execute(self, sql, *args, **kwargs):
+            issued.append(sql)
+            class R:
+                def fetchall(s): return []
+                def fetchone(s): return (0,)
+            return R()
+        def close(self): pass
+
+    import duckdb
+    monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn())
+
+    acc = KeboolaAccess(
+        url="https://connection.keboola.com/",
+        token="fake-token-xyz",
+    )
+    with acc.duckdb_session() as conn:
+        assert conn is not None
+    # Verify the install + load + attach sequence happened.
+    joined = "\n".join(issued)
+    assert "INSTALL keboola" in joined
+    assert "LOAD keboola" in joined
+    assert "ATTACH" in joined and "TYPE keboola" in joined
+    # Token must be escaped for embedding in the ATTACH literal.
+    assert "fake-token-xyz" in joined
+
+
+def test_access_escapes_single_quote_in_token(monkeypatch):
+    """Defense against a token containing a single quote breaking the
+    ATTACH literal. SQL injection here is non-trivial because the token
+    is admin-supplied at instance config time, but escape it anyway."""
+    issued = []
+    class FakeConn:
+        def execute(self, sql, *args, **kwargs):
+            issued.append(sql)
+            class R:
+                def fetchall(s): return []
+                def fetchone(s): return (0,)
+            return R()
+        def close(self): pass
+    import duckdb
+    monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn())
+
+    acc = KeboolaAccess(url="x", token="bad'token")
+    with acc.duckdb_session() as conn:
+        pass
+    attach_sql = next(s for s in issued if "ATTACH" in s)
+    # Doubled single-quote per SQL string-literal escaping.
+    assert "bad''token" in attach_sql
+
+
+def test_access_real_attach_when_creds_present(tmp_path):
+    """Smoke when KBC_TEST_URL + KBC_TEST_TOKEN are present."""
+    url = os.environ.get("KBC_TEST_URL")
+    token = os.environ.get("KBC_TEST_TOKEN")
+    if not (url and token):
+        pytest.skip("Keboola creds not provided")
+    acc = KeboolaAccess(url=url, token=token)
+    with acc.duckdb_session() as conn:
+        # ATTACH must have succeeded — querying duckdb_databases() should
+        # show the 'kbc' alias.
+        rows = [r[0] for r in conn.execute("SELECT name FROM duckdb_databases()").fetchall()]
+        assert "kbc" in rows
--- a/tests/test_keboola_extension_query_passthrough.py
+++ b/tests/test_keboola_extension_query_passthrough.py
@ -0,0 +1,75 @@
+"""Lock-in test for the DuckDB Keboola extension's query-passthrough
+capability that the Keboola materialized path depends on.
+
+Run only when KBC_TEST_URL + KBC_TEST_TOKEN env vars are set (CI without
+real Keboola credentials skips). Local dev with a real Storage API
+token exercises the path.
+"""
+import os
+import pytest
+import duckdb
+
+
+KBC_URL = os.environ.get("KBC_TEST_URL")
+KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN")
+KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET")
+KBC_TABLE = os.environ.get("KBC_TEST_TABLE")
+
+pytestmark = pytest.mark.skipif(
+    not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]),
+    reason="Keboola integration creds not provided",
+)
+
+
+def test_extension_supports_attach_and_select(tmp_path):
+    """Keboola extension must support: ATTACH 'keboola://...' AS kbc, then
+    SELECT * FROM kbc.bucket.table. The Keboola materialized path uses this
+    primitive at runtime (just like connectors/keboola/extractor.py:133)."""
+    conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
+    conn.execute("INSTALL keboola FROM community")
+    conn.execute("LOAD keboola")
+    escaped_token = KBC_TOKEN.replace("'", "''")
+    conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
+    rows = conn.execute(
+        f'SELECT COUNT(*) FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}"'
+    ).fetchone()
+    assert rows[0] >= 0  # any non-negative count is fine; we're testing the path works
+
+
+def test_extension_supports_copy_to_parquet(tmp_path):
+    """Keboola materialized writes the SELECT result via
+    `COPY (...) TO '...' (FORMAT PARQUET)`. Lock that primitive."""
+    conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
+    conn.execute("INSTALL keboola FROM community")
+    conn.execute("LOAD keboola")
+    escaped_token = KBC_TOKEN.replace("'", "''")
+    conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
+
+    parquet_path = tmp_path / "out.parquet"
+    safe_lit = str(parquet_path).replace("'", "''")
+    conn.execute(
+        f'COPY (SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5) '
+        f"TO '{safe_lit}' (FORMAT PARQUET)"
+    )
+    assert parquet_path.exists() and parquet_path.stat().st_size > 0
+
+
+def test_extension_supports_filtered_query(tmp_path):
+    """Most important capability: a non-trivial WHERE/projection survives.
+    This is what 'Custom SQL' mode actually relies on."""
+    conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
+    conn.execute("INSTALL keboola FROM community")
+    conn.execute("LOAD keboola")
+    escaped_token = KBC_TOKEN.replace("'", "''")
+    conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
+
+    parquet_path = tmp_path / "filtered.parquet"
+    safe_lit = str(parquet_path).replace("'", "''")
+    # Trivially filterable SELECT — extension must push the WHERE down or
+    # at minimum execute it client-side. Either is acceptable for our
+    # materialized path.
+    conn.execute(
+        f'COPY (SELECT 1 AS marker FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 3) '
+        f"TO '{safe_lit}' (FORMAT PARQUET)"
+    )
+    assert parquet_path.exists()
--- a/tests/test_keboola_init_extract_skips.py
+++ b/tests/test_keboola_init_extract_skips.py
@ -0,0 +1,89 @@
+"""Verify the legacy Keboola download path skips materialized rows.
+
+Materialized rows are handled by `_run_materialized_pass` in
+`app/api/sync.py`, not by the legacy extractor. Mirror of the BQ
+extractor's existing skip behavior at line 188.
+
+The Keboola extractor entrypoint is `run()` (not `init_extract` like
+the BQ extractor). Tests below observe the skip via caplog and the
+stats dict (no parquet written, table not in tables_extracted/failed).
+"""
+from connectors.keboola import extractor as kbe
+
+
+def test_run_skips_materialized_rows(tmp_path, caplog):
+    """Given a registry with one materialized row, run() must NOT write a
+    parquet for it and must NOT count it in tables_extracted/failed."""
+    output_dir = tmp_path / "extracts" / "keboola"
+    output_dir.mkdir(parents=True)
+
+    table_configs = [
+        {
+            "id": "orders_recent",
+            "name": "orders_recent",
+            "source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'",
+            "query_mode": "materialized",
+        },
+    ]
+
+    with caplog.at_level("INFO"):
+        # Use bogus URL/token — the extractor will fail to ATTACH the
+        # extension (legacy client fallback also unavailable for the
+        # materialized row, but materialized rows must be skipped before
+        # any of that code runs).
+        stats = kbe.run(
+            str(output_dir), table_configs,
+            keboola_url="https://invalid.example/",
+            keboola_token="bogus",
+        )
+
+    # No parquet files for the materialized row.
+    parquet = output_dir / "data" / "orders_recent.parquet"
+    assert not parquet.exists(), "materialized row was written by legacy extractor"
+
+    # The materialized row must NOT count as extracted or failed.
+    assert stats["tables_extracted"] == 0
+    assert stats["tables_failed"] == 0
+
+    # Skip log line for ops visibility.
+    assert "Skipping" in caplog.text and "materialized" in caplog.text
+
+
+def test_run_processes_local_alongside_skipped_materialized(tmp_path, caplog):
+    """Mixed registry: one local + one materialized. Materialized is
+    skipped, local row attempts extraction (and fails because there's
+    no real Keboola in this test, but that's a separate failure path —
+    the materialized row must not appear in tables_failed)."""
+    output_dir = tmp_path / "extracts" / "keboola"
+    output_dir.mkdir(parents=True)
+
+    table_configs = [
+        {
+            "id": "orders",
+            "name": "orders",
+            "bucket": "in.c-sales",
+            "source_table": "orders",
+            "query_mode": "local",
+        },
+        {
+            "id": "orders_recent",
+            "name": "orders_recent",
+            "source_query": "SELECT 1",
+            "query_mode": "materialized",
+        },
+    ]
+
+    with caplog.at_level("INFO"):
+        stats = kbe.run(
+            str(output_dir), table_configs,
+            keboola_url="https://invalid.example/",
+            keboola_token="bogus",
+        )
+
+    # The materialized row must not be in failures (it was skipped, not failed).
+    failed_names = {e["table"] for e in stats.get("errors", [])}
+    assert "orders_recent" not in failed_names, (
+        "materialized row appeared in failures — should have been skipped"
+    )
+    # Skip log line for ops visibility.
+    assert "Skipping" in caplog.text and "materialized" in caplog.text
--- a/tests/test_keboola_materialize.py
+++ b/tests/test_keboola_materialize.py
@ -0,0 +1,95 @@
+"""Tests for the Keboola materialize_query path."""
+import hashlib
+import pytest
+from pathlib import Path
+from unittest.mock import MagicMock
+
+from connectors.keboola import extractor as kbe
+
+
+def test_materialize_query_writes_parquet_and_returns_metadata(tmp_path, monkeypatch):
+    """Mock-mode: feed in a fake KeboolaAccess that yields a fake DuckDB
+    connection accepting `COPY ... TO '...' (FORMAT PARQUET)` and just
+    writes a small parquet via duckdb's own primitive on a tmp DB.
+    """
+    import duckdb
+    real_conn = duckdb.connect(":memory:")
+    # Pre-create a small relation the fake materialize "copies".
+    real_conn.execute("CREATE TABLE t AS SELECT 1 AS x, 'hello' AS y UNION ALL SELECT 2, 'world'")
+
+    class FakeAccess:
+        def duckdb_session(self):
+            from contextlib import contextmanager
+            @contextmanager
+            def _cm():
+                yield real_conn
+            return _cm()
+    fake_access = FakeAccess()
+
+    output_dir = tmp_path / "out"
+    output_dir.mkdir()
+
+    # Submit a query that selects from the in-memory table (not a real
+    # Keboola bucket — the test verifies the COPY/parquet/hash path,
+    # not the extension behavior).
+    result = kbe.materialize_query(
+        table_id="example_subset",
+        sql="SELECT * FROM t",
+        keboola_access=fake_access,
+        output_dir=output_dir,
+    )
+
+    parquet_path = output_dir / "example_subset.parquet"
+    assert parquet_path.exists()
+    assert result["table_id"] == "example_subset"
+    assert result["path"] == str(parquet_path)
+    assert result["rows"] == 2
+    assert result["bytes"] > 0
+    # MD5 of the bytes should match what we recompute.
+    expected_md5 = hashlib.md5(parquet_path.read_bytes()).hexdigest()
+    assert result["md5"] == expected_md5
+
+
+def test_materialize_query_zero_rows_logs_warning(tmp_path, caplog):
+    import duckdb
+    real_conn = duckdb.connect(":memory:")
+    real_conn.execute("CREATE TABLE t AS SELECT 1 AS x WHERE FALSE")
+
+    class FakeAccess:
+        def duckdb_session(self):
+            from contextlib import contextmanager
+            @contextmanager
+            def _cm():
+                yield real_conn
+            return _cm()
+
+    output_dir = tmp_path / "out"
+    output_dir.mkdir()
+
+    with caplog.at_level("WARNING"):
+        result = kbe.materialize_query(
+            table_id="empty_subset",
+            sql="SELECT * FROM t",
+            keboola_access=FakeAccess(),
+            output_dir=output_dir,
+        )
+    assert result["rows"] == 0
+    assert "0 rows" in caplog.text or "empty" in caplog.text.lower()
+
+
+def test_materialize_query_rejects_unsafe_table_id(tmp_path):
+    """Defense: table_id is interpolated into the parquet filename. SQL/
+    path-traversal-unsafe values must be rejected up-front (mirror of BQ
+    materialize_query's validation)."""
+    class FakeAccess:
+        def duckdb_session(self):
+            raise AssertionError("should not be called")
+    output_dir = tmp_path / "out"
+    output_dir.mkdir()
+    with pytest.raises(ValueError, match="table_id"):
+        kbe.materialize_query(
+            table_id="../../etc/passwd",
+            sql="SELECT 1",
+            keboola_access=FakeAccess(),
+            output_dir=output_dir,
+        )
--- a/tests/test_keboola_materialized_e2e.py
+++ b/tests/test_keboola_materialized_e2e.py
@ -0,0 +1,71 @@
+"""End-to-end: register a Keboola materialized row -> trigger sync ->
+parquet appears -> manifest serves it -> CLI da sync would download it.
+
+Skipped unless KBC_TEST_URL + KBC_TEST_TOKEN + KBC_TEST_BUCKET +
+KBC_TEST_TABLE are present.
+"""
+import os
+from pathlib import Path
+
+import pytest
+
+
+KBC_URL = os.environ.get("KBC_TEST_URL")
+KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN")
+KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET")
+KBC_TABLE = os.environ.get("KBC_TEST_TABLE")
+
+pytestmark = pytest.mark.skipif(
+    not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]),
+    reason="Keboola creds not provided",
+)
+
+
+def test_register_trigger_manifest_path(seeded_app, monkeypatch, tmp_path):
+    monkeypatch.setenv("DATA_DIR", str(tmp_path))
+    monkeypatch.setenv("KEBOOLA_TOKEN", KBC_TOKEN)
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: {
+            "data_source": {
+                "type": "keboola",
+                "keboola": {
+                    "url": KBC_URL,
+                    "token_env": "KEBOOLA_TOKEN",
+                },
+            },
+        },
+        raising=False,
+    )
+
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+    auth = {"Authorization": f"Bearer {token}"}
+
+    # Register.
+    r = c.post("/api/admin/register-table", headers=auth, json={
+        "name": "smoke_subset",
+        "source_type": "keboola",
+        "query_mode": "materialized",
+        "source_query": (
+            f'SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5'
+        ),
+    })
+    assert r.status_code == 201
+
+    # Trigger sync.
+    r = c.post("/api/sync/trigger", headers=auth)
+    assert r.status_code in (200, 202)
+
+    # Parquet must exist.
+    parquet = Path(tmp_path) / "extracts" / "keboola" / "data" / "smoke_subset.parquet"
+    assert parquet.exists() and parquet.stat().st_size > 0
+
+    # Manifest serves it.
+    r = c.get("/api/sync/manifest", headers=auth)
+    rows = r.json()["tables"]
+    smoke = next((t for t in rows if t["id"] == "smoke_subset"), None)
+    assert smoke is not None
+    assert smoke["source_type"] == "keboola"
+    assert smoke["query_mode"] == "local"  # materialized parquets surface as local
+    assert smoke["md5"]  # has a hash for da sync delta detection
--- a/tests/test_materialized_e2e.py
+++ b/tests/test_materialized_e2e.py
@ -0,0 +1,335 @@
+"""End-to-end integration coverage for query_mode='materialized'.
+
+Unit tests verify each piece in isolation; this file glues them together:
+
+1. Admin POST /api/admin/register-table (materialized) → registry row written
+2. _run_materialized_pass writes parquet + sync_state with correct hash
+3. GET /api/sync/manifest (per-user) returns the row with query_mode +
+   the parquet hash, filtered by RBAC
+4. Mode-switch transitions (remote → materialized, materialized → SQL edit
+   preserves registered_at) maintain registry invariants.
+
+Devil's-advocate review found these were the gaps the unit tests left
+open. Each piece passes in isolation; this file proves they compose.
+"""
+import duckdb
+import hashlib
+import pytest
+from contextlib import contextmanager
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+from connectors.bigquery.access import BqAccess, BqProjects
+from src.repositories.table_registry import TableRegistryRepository
+from src.repositories.sync_state import SyncStateRepository
+
+
+def _auth(token):
+    return {"Authorization": f"Bearer {token}"}
+
+
+@pytest.fixture
+def bq_instance(monkeypatch):
+    """Force instance.yaml to look like a BigQuery deployment so the BQ
+    register validator's project_id check passes."""
+    fake_cfg = {
+        "data_source": {
+            "type": "bigquery",
+            "bigquery": {"project": "my-test-project", "location": "us"},
+        },
+    }
+    monkeypatch.setattr(
+        "app.instance_config.load_instance_config",
+        lambda: fake_cfg,
+        raising=False,
+    )
+    from app.instance_config import reset_cache
+    reset_cache()
+    yield fake_cfg
+    reset_cache()
+
+
+@pytest.fixture
+def stub_bq_extractor(monkeypatch):
+    """Mirror tests/test_admin_bq_register.py::stub_bq_extractor — replaces
+    rebuild_from_registry + SyncOrchestrator so the API's post-register
+    materialize doesn't hit real BQ during HTTP-driven tests."""
+    rebuild_mock = MagicMock(return_value={
+        "project_id": "my-test-project",
+        "tables_registered": 1,
+        "errors": [],
+        "skipped": False,
+    })
+    monkeypatch.setattr(
+        "connectors.bigquery.extractor.rebuild_from_registry",
+        rebuild_mock,
+    )
+    orch_mock = MagicMock()
+    monkeypatch.setattr(
+        "src.orchestrator.SyncOrchestrator",
+        lambda *a, **kw: orch_mock,
+    )
+    return {"rebuild": rebuild_mock, "orchestrator": orch_mock}
+
+
+@pytest.fixture
+def stub_bq():
+    """Real-shape BqAccess wired to in-memory DuckDB factories so the
+    materialize_query path can run end-to-end without GCP."""
+    @contextmanager
+    def _session(_p):
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute("ATTACH ':memory:' AS bq")
+            conn.execute("CREATE SCHEMA bq.test")
+            conn.execute(
+                "CREATE OR REPLACE TABLE bq.test.orders AS "
+                "SELECT 'EU' AS region, 100 AS revenue UNION ALL "
+                "SELECT 'US' AS region, 250 AS revenue"
+            )
+            yield conn
+        finally:
+            conn.close()
+    return BqAccess(
+        BqProjects(billing="my-test-project", data="my-test-project"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session,
+    )
+
+
+def test_e2e_register_then_materialize_then_manifest_via_repo(
+    bq_instance, stub_bq, tmp_path, monkeypatch,
+):
+    """Glue test: register row at the repository layer (skips HTTP/auth),
+    run the materialized pass, verify sync_state, then exercise the
+    `_build_manifest_for_user` admin path. Catches integration breakage
+    that unit tests miss because each only sees one layer."""
+    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    from src.db import _ensure_schema
+    _ensure_schema(conn)
+
+    table_id = "orders_summary_e2e"
+    repo = TableRegistryRepository(conn)
+    repo.register(
+        id=table_id, name=table_id, source_type="bigquery",
+        query_mode="materialized",
+        source_query="SELECT region, SUM(revenue) AS revenue "
+                     "FROM bq.test.orders GROUP BY 1",
+        sync_schedule="every 1m",
+    )
+
+    # Run the materialized pass.
+    from app.api import sync as sync_mod
+    summary = sync_mod._run_materialized_pass(conn, stub_bq)
+    assert table_id in summary["materialized"], summary
+    assert not summary["errors"]
+
+    # Parquet on disk.
+    parquet_path = (
+        tmp_path / "data" / "extracts" / "bigquery" / "data"
+        / f"{table_id}.parquet"
+    )
+    assert parquet_path.exists(), f"Expected {parquet_path} to exist"
+
+    # sync_state hash matches the file's MD5.
+    expected_hash = hashlib.md5(parquet_path.read_bytes()).hexdigest()
+    state = SyncStateRepository(conn)
+    row = state.get_table_state(table_id)
+    assert row is not None
+    assert row["hash"] == expected_hash
+    assert row["rows"] == 2
+
+    # Manifest builder exposes query_mode + hash to admin (no RBAC filter).
+    # Post-#150 RBAC: admin shortcut keys on Admin user_group membership,
+    # not the legacy `users.role` column. Seed the user + Admin membership
+    # so `can_access_table` short-circuits to True.
+    conn.execute(
+        "INSERT OR IGNORE INTO users (id, email) VALUES ('u-admin', 'admin@test')"
+    )
+    admin_group_id = conn.execute(
+        "SELECT id FROM user_groups WHERE name = 'Admin'"
+    ).fetchone()[0]
+    conn.execute(
+        "INSERT OR IGNORE INTO user_group_members (user_id, group_id, source) "
+        "VALUES ('u-admin', ?, 'admin')",
+        [admin_group_id],
+    )
+    admin_user = {"id": "u-admin", "email": "admin@test"}
+    manifest = sync_mod._build_manifest_for_user(conn, admin_user)
+    assert table_id in manifest["tables"]
+    entry = manifest["tables"][table_id]
+    assert entry["query_mode"] == "materialized"
+    assert entry["hash"] == expected_hash
+    assert entry["rows"] == 2
+
+    conn.close()
+
+
+def test_remote_to_materialized_transition_clears_bucket_table(
+    seeded_app, bq_instance, stub_bq_extractor,
+):
+    """Switching a remote BQ row to materialized must accept source_query
+    and the merged validator must not trip on the now-irrelevant
+    bucket/source_table fields."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    # Seed a remote row.
+    r = c.post("/api/admin/register-table", json={
+        "name": "live_to_mat",
+        "source_type": "bigquery",
+        "bucket": "analytics",
+        "source_table": "orders",
+        "query_mode": "remote",
+    }, headers=_auth(token))
+    assert r.status_code in (200, 202), r.json()
+    table_id = r.json()["id"]
+
+    # Switch to materialized — must include source_query for the validator.
+    r2 = c.put(f"/api/admin/registry/{table_id}", json={
+        "query_mode": "materialized",
+        "source_query": "SELECT 1 AS n",
+    }, headers=_auth(token))
+    assert r2.status_code == 200, r2.json()
+
+    # Verify the merged record reflects the switch.
+    r3 = c.get("/api/admin/registry", headers=_auth(token))
+    row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
+    assert row is not None
+    assert row["query_mode"] == "materialized"
+    assert row["source_query"] == "SELECT 1 AS n"
+
+
+def test_materialized_sql_edit_preserves_registered_at(
+    seeded_app, bq_instance, stub_bq_extractor, monkeypatch,
+):
+    """Editing source_query on an existing materialized row must not
+    reset registered_at — the row's registration history is preserved
+    across SQL edits (issue #130 invariant)."""
+    c = seeded_app["client"]
+    token = seeded_app["admin_token"]
+
+    # Seed a materialized row.
+    r = c.post("/api/admin/register-table", json={
+        "name": "sql_edit_target",
+        "source_type": "bigquery",
+        "query_mode": "materialized",
+        "source_query": "SELECT 1 AS n",
+    }, headers=_auth(token))
+    assert r.status_code == 201, r.json()
+    table_id = r.json()["id"]
+
+    # Capture the original registered_at.
+    r2 = c.get("/api/admin/registry", headers=_auth(token))
+    row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None)
+    original_ts = row["registered_at"]
+    assert original_ts is not None
+
+    # Edit the SQL.
+    import time
+    time.sleep(0.01)  # ensure a clock tick elapses so a fresh stamp would differ
+    r3 = c.put(f"/api/admin/registry/{table_id}", json={
+        "query_mode": "materialized",
+        "source_query": "SELECT 2 AS n",
+    }, headers=_auth(token))
+    assert r3.status_code == 200, r3.json()
+
+    r4 = c.get("/api/admin/registry", headers=_auth(token))
+    row = next((t for t in r4.json()["tables"] if t["id"] == table_id), None)
+    assert row["source_query"] == "SELECT 2 AS n"
+    # registered_at preserved across edit
+    assert row["registered_at"] == original_ts, (
+        f"Expected registered_at preserved (issue #130 contract). "
+        f"Original: {original_ts}, after edit: {row['registered_at']}"
+    )
+
+
+def test_materialized_zero_rows_logs_warning(stub_bq, tmp_path, caplog):
+    """Devil's-advocate item: an SQL filter that returns 0 rows is
+    indistinguishable from 'SQL is wrong'. Confirm we log a WARNING so
+    operators can grep on it."""
+    import logging
+    from connectors.bigquery.extractor import materialize_query
+
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+
+    # Add an empty BQ table to the stub for this test.
+    @contextmanager
+    def _session_empty(_p):
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute("ATTACH ':memory:' AS bq")
+            conn.execute("CREATE SCHEMA bq.test")
+            conn.execute("CREATE OR REPLACE TABLE bq.test.empty AS "
+                         "SELECT 1 AS n WHERE FALSE")
+            yield conn
+        finally:
+            conn.close()
+
+    bq_empty = BqAccess(
+        BqProjects(billing="t", data="t"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session_empty,
+    )
+
+    with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"):
+        stats = materialize_query(
+            table_id="empty_t",
+            sql="SELECT * FROM bq.test.empty",
+            bq=bq_empty,
+            output_dir=str(out),
+        )
+
+    assert stats["rows"] == 0
+    assert any("0 rows" in rec.message for rec in caplog.records), (
+        f"Expected '0 rows' WARNING; got: {[r.message for r in caplog.records]}"
+    )
+
+
+def test_attach_real_error_propagates(stub_bq, tmp_path):
+    """ATTACH 'project=...' that fails for a real reason (not the
+    'already attached' tolerated case) must propagate so callers see
+    the actual error instead of a confusing downstream 'bq is not
+    attached' message."""
+    from connectors.bigquery.extractor import materialize_query
+
+    out = tmp_path / "extracts" / "bigquery"
+    out.mkdir(parents=True)
+
+    @contextmanager
+    def _session_attach_fails(_p):
+        conn = duckdb.connect(":memory:")
+        try:
+            # Force ATTACH 'project=...' to raise something other than
+            # "already attached" by intercepting via execute wrapper —
+            # since DuckDB's real connection doesn't accept attribute
+            # patches, we use a thin proxy for this test.
+            class _Proxy:
+                def __init__(self, real):
+                    self._real = real
+                def execute(self, sql, *a, **kw):
+                    if sql.startswith("ATTACH 'project="):
+                        raise duckdb.Error("fake permission denied: missing serviceusage.services.use")
+                    return self._real.execute(sql, *a, **kw)
+                def __getattr__(self, name):
+                    return getattr(self._real, name)
+                def close(self):
+                    return self._real.close()
+            yield _Proxy(conn)
+        finally:
+            conn.close()
+
+    bq_bad = BqAccess(
+        BqProjects(billing="t", data="t"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session_attach_fails,
+    )
+
+    with pytest.raises(duckdb.Error, match="permission denied"):
+        materialize_query(
+            table_id="x", sql="SELECT 1",
+            bq=bq_bad, output_dir=str(out),
+        )
--- a/tests/test_sync_trigger_keboola_materialized.py
+++ b/tests/test_sync_trigger_keboola_materialized.py
@ -0,0 +1,132 @@
+"""Scheduler-level test: when a Keboola row has query_mode='materialized',
+_run_materialized_pass dispatches to connectors.keboola.extractor.materialize_query
+(not BQ's). Existing BQ-materialized rows continue using BqAccess.
+
+Mirrors the unit-style of tests/test_sync_trigger_materialized.py — patches
+the inner extractor entry points instead of going through the API layer.
+"""
+import duckdb
+import pytest
+from contextlib import contextmanager
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+from src.db import _ensure_schema
+from src.repositories.table_registry import TableRegistryRepository
+from connectors.bigquery.access import BqAccess, BqProjects
+
+
+@pytest.fixture
+def system_db(tmp_path, monkeypatch):
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    _ensure_schema(conn)
+    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
+    yield conn
+    conn.close()
+
+
+@pytest.fixture
+def stub_bq():
+    @contextmanager
+    def _session(_p):
+        conn = duckdb.connect(":memory:")
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    return BqAccess(
+        BqProjects(billing="t", data="t"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session,
+    )
+
+
+def test_run_materialized_pass_dispatches_keboola_to_keboola_extractor(
+    system_db, stub_bq, tmp_path, monkeypatch
+):
+    """Keboola row with query_mode='materialized' must invoke the Keboola
+    materialize_query, not the BQ one."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="orders_recent", name="orders_recent",
+        source_type="keboola", query_mode="materialized",
+        source_query='SELECT * FROM kbc."in.c-sales"."orders" WHERE 1=1',
+        sync_schedule="every 1m",  # always due
+    )
+
+    # Provide instance.yaml-shape config + env so the Keboola lazy-init succeeds.
+    monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token")
+    from app.api import sync as sync_mod
+
+    # Patch get_value to return the keboola URL/token_env.
+    def _fake_get_value(*keys, default=None):
+        path = keys
+        if path == ("data_source", "keboola", "url"):
+            return "https://connection.keboola.com/"
+        if path == ("data_source", "keboola", "token_env"):
+            return "KEBOOLA_STORAGE_TOKEN"
+        if path == ("data_source", "bigquery", "max_bytes_per_materialize"):
+            return default if default is not None else 0
+        return default
+
+    # Pre-create the parquet for hash bookkeeping (kb materialize is patched
+    # so it won't write a real one).
+    parquet_dir = tmp_path / "data" / "extracts" / "keboola" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "orders_recent.parquet").write_bytes(
+        b"PAR1" + b"\x00" * 16 + b"PAR1"
+    )
+
+    bq_called = MagicMock()
+    kb_called = MagicMock(return_value={
+        "table_id": "orders_recent", "rows": 1, "bytes": 100,
+        "md5": "abc123", "path": str(parquet_dir / "orders_recent.parquet"),
+    })
+
+    with patch("app.instance_config.get_value", _fake_get_value), \
+         patch("connectors.bigquery.extractor.materialize_query", bq_called), \
+         patch("connectors.keboola.extractor.materialize_query", kb_called):
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    assert kb_called.called, "Keboola materialize_query was not invoked"
+    assert not bq_called.called, (
+        "BQ materialize_query was wrongly invoked for a Keboola row"
+    )
+    assert "orders_recent" in summary["materialized"]
+
+
+def test_run_materialized_pass_dispatches_bigquery_to_bq_extractor(
+    system_db, stub_bq, tmp_path
+):
+    """Regression: BQ-materialized path keeps working unchanged."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="events_summary", name="events_summary",
+        source_type="bigquery", query_mode="materialized",
+        source_query="SELECT date, COUNT(*) FROM `proj.dataset.events` GROUP BY 1",
+        sync_schedule="every 1m",
+    )
+
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "events_summary.parquet").write_bytes(
+        b"PAR1" + b"\x00" * 16 + b"PAR1"
+    )
+
+    bq_called = MagicMock(return_value={
+        "rows": 1, "size_bytes": 100, "query_mode": "materialized",
+        "hash": "abc123",
+    })
+    kb_called = MagicMock()
+
+    from app.api import sync as sync_mod
+
+    with patch("app.api.sync._materialize_table", bq_called), \
+         patch("connectors.keboola.extractor.materialize_query", kb_called):
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    assert bq_called.called
+    assert not kb_called.called
+    assert "events_summary" in summary["materialized"]
--- a/tests/test_sync_trigger_materialized.py
+++ b/tests/test_sync_trigger_materialized.py
@ -0,0 +1,484 @@
+"""_run_materialized_pass walks table_registry for materialized BQ rows
+and runs each that is due via _materialize_table.
+
+Tests inject a stub BqAccess (factories never called by these tests since
+_materialize_table is patched) and assert that scheduling, error
+aggregation, sync_state hash, and the disable-sentinel all behave
+correctly.
+"""
+import duckdb
+import pytest
+from contextlib import contextmanager
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+from src.db import _ensure_schema
+from src.repositories.table_registry import TableRegistryRepository
+from src.repositories.sync_state import SyncStateRepository
+from connectors.bigquery.access import BqAccess, BqProjects
+
+
+@pytest.fixture
+def system_db(tmp_path, monkeypatch):
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    _ensure_schema(conn)
+    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
+    yield conn
+    conn.close()
+
+
+@pytest.fixture
+def stub_bq():
+    """A BqAccess instance that the tests don't actually exercise (the test
+    patches `_materialize_table`); just needs to be a valid BqAccess so the
+    type contract doesn't break."""
+    @contextmanager
+    def _session(_p):
+        conn = duckdb.connect(":memory:")
+        try:
+            yield conn
+        finally:
+            conn.close()
+
+    return BqAccess(
+        BqProjects(billing="t", data="t"),
+        client_factory=lambda _p: MagicMock(),
+        duckdb_session_factory=_session,
+    )
+
+
+def test_materialized_pass_calls_materialize_for_due_rows(system_db, stub_bq, tmp_path):
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="orders_90d", name="orders_90d",
+        source_type="bigquery", query_mode="materialized",
+        source_query="SELECT 1 AS n",
+        sync_schedule="every 1m",  # always due in tests (no prior sync)
+    )
+
+    # Pre-create the parquet so _file_hash returns non-empty
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "orders_90d.parquet").write_bytes(
+        b"PAR1" + b"\x00" * 16 + b"PAR1"
+    )
+
+    from app.api import sync as sync_mod
+
+    with patch("app.api.sync._materialize_table") as mock_mat:
+        mock_mat.return_value = {
+            "rows": 1, "size_bytes": 100, "query_mode": "materialized",
+        }
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    mock_mat.assert_called_once()
+    call_kwargs = mock_mat.call_args.kwargs
+    assert call_kwargs["table_id"] == "orders_90d"
+    assert "SELECT 1 AS n" in call_kwargs["sql"]
+    assert call_kwargs["bq"] is stub_bq
+    # Default cap (10 GiB) flows through when no instance.yaml override
+    assert call_kwargs["max_bytes"] == 10 * 2**30
+    assert "orders_90d" in summary["materialized"]
+    assert not summary["errors"]
+
+
+def test_materialized_pass_skips_undue_rows(system_db, stub_bq):
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="orders_daily", name="orders_daily",
+        source_type="bigquery", query_mode="materialized",
+        source_query="SELECT 1",
+        sync_schedule="daily 03:00",
+    )
+    state = SyncStateRepository(system_db)
+    state.update_sync(
+        table_id="orders_daily", rows=1, file_size_bytes=10, hash="x",
+    )
+
+    from app.api import sync as sync_mod
+
+    with patch("app.api.sync._materialize_table") as mock_mat:
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    mock_mat.assert_not_called()
+    assert "orders_daily" in summary["skipped"]
+
+
+def test_materialized_pass_skips_non_materialized_rows(system_db, stub_bq):
+    repo = TableRegistryRepository(system_db)
+    repo.register(id="t1", name="t1", source_type="keboola", query_mode="local")
+    repo.register(id="t2", name="t2", source_type="bigquery", query_mode="remote")
+
+    from app.api import sync as sync_mod
+
+    with patch("app.api.sync._materialize_table") as mock_mat:
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    mock_mat.assert_not_called()
+    assert summary == {"materialized": [], "skipped": [], "errors": []}
+
+
+def test_materialized_pass_collects_errors_per_row(system_db, stub_bq, tmp_path):
+    """One row failing must not stop a healthy sibling."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="ok", name="ok", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+    repo.register(
+        id="bad", name="bad", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT broken",
+        sync_schedule="every 1m",
+    )
+
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "ok.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
+
+    from app.api import sync as sync_mod
+
+    def _fake(table_id, sql, bq, output_dir, max_bytes):
+        if table_id == "bad":
+            raise RuntimeError("simulated COPY failure")
+        return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
+
+    with patch("app.api.sync._materialize_table", side_effect=_fake):
+        summary = sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    assert summary["materialized"] == ["ok"]
+    assert len(summary["errors"]) == 1
+    assert summary["errors"][0]["table"] == "bad"
+    assert "simulated" in summary["errors"][0]["error"]
+
+
+def test_materialized_pass_records_parquet_hash(system_db, stub_bq, tmp_path):
+    """sync_state.hash must be the MD5 of the parquet file — otherwise the
+    manifest reports an empty hash and every da sync re-downloads."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="hashed", name="hashed",
+        source_type="bigquery", query_mode="materialized",
+        source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    parquet_path = parquet_dir / "hashed.parquet"
+
+    def _fake(**kwargs):
+        parquet_path.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
+        return {"rows": 1, "size_bytes": 24, "query_mode": "materialized"}
+
+    from app.api import sync as sync_mod
+
+    with patch("app.api.sync._materialize_table", side_effect=_fake):
+        sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    state = SyncStateRepository(system_db)
+    row = state.get_table_state("hashed")
+    assert row is not None
+    import hashlib
+    expected = hashlib.md5(b"PAR1" + b"\x00" * 16 + b"PAR1").hexdigest()
+    assert row["hash"] == expected
+
+
+def test_run_sync_keboola_timeout_does_not_skip_materialized(tmp_path, monkeypatch):
+    """REGRESSION (Devin BUG_0001 on 2219255): when the Keboola extractor
+    subprocess raises TimeoutExpired, the materialized BQ pass and the
+    orchestrator rebuild must still fire. Pre-fix the timeout propagated
+    to the outer except handler and skipped the rest of _run_sync — on
+    a dual-source deployment a slow Keboola extractor would silently
+    block all materialized parquets and the master-view rebuild until
+    the next trigger."""
+    import duckdb
+    import subprocess as _sp
+    from src.db import _ensure_schema
+
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    _ensure_schema(conn)
+
+    repo = TableRegistryRepository(conn)
+    # One Keboola row (drives the extractor subprocess) + one materialized
+    # BQ row (must still run after the Keboola timeout).
+    repo.register(
+        id="kbc_a", name="kbc_a", source_type="keboola",
+        query_mode="local", bucket="in.c-foo", source_table="a",
+    )
+    repo.register(
+        id="m1", name="m1", source_type="bigquery",
+        query_mode="materialized",
+        source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+    conn.close()
+
+    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
+    monkeypatch.setenv("KEBOOLA_STACK_URL", "https://example.invalid")
+    monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake")
+
+    from app.api import sync as sync_mod
+
+    # Subprocess raises TimeoutExpired the moment _run_sync calls it.
+    def _timeout(*a, **kw):
+        raise _sp.TimeoutExpired(cmd=["fake"], timeout=1)
+
+    monkeypatch.setattr(sync_mod.subprocess, "run", _timeout)
+
+    materialized_called = {"count": 0}
+    orchestrator_called = {"count": 0}
+
+    def _spy_materialized(_conn, _bq):
+        materialized_called["count"] += 1
+        return {"materialized": ["m1"], "skipped": [], "errors": []}
+
+    class _OrchStub:
+        def rebuild(self):
+            orchestrator_called["count"] += 1
+            return {}
+
+    monkeypatch.setattr("app.api.sync._run_materialized_pass", _spy_materialized)
+    monkeypatch.setattr(
+        "src.orchestrator.SyncOrchestrator", lambda *a, **kw: _OrchStub(),
+    )
+    monkeypatch.setattr(
+        "app.instance_config.get_data_source_type", lambda: "keboola",
+    )
+    monkeypatch.setattr(
+        "app.instance_config.get_value",
+        lambda *args, **kw: (
+            "my-bq-proj" if (args and args[-1] == "project")
+            else kw.get("default", "")
+        ),
+    )
+
+    sync_mod._run_sync()
+
+    assert materialized_called["count"] == 1, (
+        "materialized pass must run even when Keboola subprocess timed out"
+    )
+    assert orchestrator_called["count"] == 1, (
+        "orchestrator rebuild must run after Keboola timeout to publish "
+        "any partial / materialized parquets that did land"
+    )
+
+
+def test_run_sync_runs_materialized_pass_on_bq_only_deployment(
+    tmp_path, monkeypatch,
+):
+    """REGRESSION (Devin BUG_0002 on 2fa44f2): on BigQuery-only deployments
+    `list_local('bigquery')` is always empty (BQ rows are remote or
+    materialized, never local). The pre-fix _run_sync early-returned in
+    that case → materialized pass + orchestrator rebuild were dead code.
+    Post-fix: run_extractor_subprocess flag skips just the Keboola
+    subprocess, and the materialized pass still fires."""
+    import duckdb
+    from src.db import _ensure_schema
+
+    db_path = tmp_path / "system.duckdb"
+    conn = duckdb.connect(str(db_path))
+    _ensure_schema(conn)
+
+    repo = TableRegistryRepository(conn)
+    # Materialized BQ row — would be invisible to list_local('bigquery').
+    repo.register(
+        id="m1", name="m1", source_type="bigquery",
+        query_mode="materialized",
+        source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+    conn.close()
+
+    monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
+
+    # Patch the heavy collaborators so we observe what _run_sync invoked
+    # without actually running BQ / orchestrator.
+    from app.api import sync as sync_mod
+
+    materialized_called = {"count": 0}
+    orchestrator_called = {"count": 0}
+
+    def _spy_materialized_pass(_conn, _bq):
+        materialized_called["count"] += 1
+        return {"materialized": ["m1"], "skipped": [], "errors": []}
+
+    class _OrchStub:
+        def rebuild(self):
+            orchestrator_called["count"] += 1
+            return {}
+
+    monkeypatch.setattr(
+        "app.api.sync._run_materialized_pass",
+        _spy_materialized_pass,
+    )
+    monkeypatch.setattr(
+        "src.orchestrator.SyncOrchestrator",
+        lambda *a, **kw: _OrchStub(),
+    )
+    # Pretend instance.yaml says data_source.type=bigquery
+    monkeypatch.setattr(
+        "app.instance_config.get_data_source_type",
+        lambda: "bigquery",
+    )
+    # bq_project must be truthy so the materialized pass branch fires.
+    real_get_value = sync_mod.__dict__.get("get_value")
+    monkeypatch.setattr(
+        "app.instance_config.get_value",
+        lambda *args, **kw: (
+            "my-bq-proj" if (args and args[-1] == "project")
+            else kw.get("default", "")
+        ),
+    )
+
+    sync_mod._run_sync()
+
+    assert materialized_called["count"] == 1, (
+        "materialized pass must run on BQ-only deployment (no local rows)"
+    )
+    assert orchestrator_called["count"] == 1, (
+        "orchestrator rebuild must run so materialized parquets are picked up"
+    )
+
+
+@pytest.mark.parametrize("yaml_value, expected_max", [
+    (10737418240, 10737418240),       # int — canonical
+    (10737418240.0, 10737418240),     # float — YAML often parses as float
+    (1e10, 10000000000),              # scientific notation
+    ("10737418240", 10737418240),     # string — coerced
+    (0, None),                        # explicit disable sentinel
+    (None, None),                     # missing key
+    ("not-a-number", None),           # malformed → fail-open + warn
+])
+def test_materialized_pass_max_bytes_yaml_coercion(
+    system_db, stub_bq, tmp_path, monkeypatch, yaml_value, expected_max,
+):
+    """`max_bytes_per_materialize` YAML value is coerced to int regardless of
+    the YAML scalar type (int / float / scientific / string). Devin found
+    that an `isinstance(raw, int)` guard silently disabled the guardrail
+    on float values."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="t", name="t", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "t.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
+
+    captured = {}
+
+    def _spy(table_id, sql, bq, output_dir, max_bytes):
+        captured["max_bytes"] = max_bytes
+        return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
+
+    from app.api import sync as sync_mod
+
+    with patch(
+        "app.instance_config.get_value",
+        side_effect=lambda *a, **kw: (
+            yaml_value if a[-1] == "max_bytes_per_materialize"
+            else kw.get("default", "")
+        ),
+    ), patch("app.api.sync._materialize_table", side_effect=_spy):
+        sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    assert captured["max_bytes"] == expected_max
+
+
+def test_materialized_pass_keys_sync_state_by_name_not_id(
+    system_db, stub_bq, tmp_path,
+):
+    """Devin review: when admin registers a name with mixed case (e.g.
+    "Orders_90d") the slug-derived id ("orders_90d") differs from name.
+    sync_state must be keyed by `name` so the manifest's `registry_by_name`
+    lookup resolves and `query_mode='materialized'` flows through to the
+    client. Otherwise CLI sees `query_mode='local'` and downloads the
+    wrong file or skips the row."""
+    repo = TableRegistryRepository(system_db)
+    # Mixed-case name — id will be slugified to lowercase by the API path,
+    # but at the repo level we control both directly.
+    repo.register(
+        id="orders_90d", name="Orders_90d",
+        source_type="bigquery", query_mode="materialized",
+        source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+
+    # Pre-create the parquet at the NAME-keyed path.
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "Orders_90d.parquet").write_bytes(
+        b"PAR1" + b"\x00" * 16 + b"PAR1"
+    )
+
+    from app.api import sync as sync_mod
+
+    captured = {}
+
+    def _spy(table_id, sql, bq, output_dir, max_bytes):
+        captured["table_id"] = table_id
+        return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
+
+    with patch("app.api.sync._materialize_table", side_effect=_spy):
+        sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    # materialize_query was called with the NAME, not the id.
+    assert captured["table_id"] == "Orders_90d"
+
+    # sync_state row keyed by name.
+    state = SyncStateRepository(system_db)
+    name_row = state.get_table_state("Orders_90d")
+    id_row = state.get_table_state("orders_90d")
+    assert name_row is not None, "sync_state should be keyed by name"
+    assert id_row is None, "sync_state should NOT be keyed by id"
+
+
+def test_materialized_pass_zero_max_bytes_disables_guardrail(
+    system_db, stub_bq, tmp_path, monkeypatch
+):
+    """`max_bytes_per_materialize: 0` in instance.yaml → None passed downstream
+    so materialize_query skips the dry-run entirely."""
+    repo = TableRegistryRepository(system_db)
+    repo.register(
+        id="big", name="big", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+        sync_schedule="every 1m",
+    )
+
+    parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    (parquet_dir / "big.parquet").write_bytes(
+        b"PAR1" + b"\x00" * 16 + b"PAR1"
+    )
+
+    monkeypatch.setattr(
+        "app.api.sync.get_value",
+        lambda *args, **kwargs: 0 if args[-1] == "max_bytes_per_materialize" else "",
+        raising=False,
+    )
+
+    from app.api import sync as sync_mod
+
+    captured = {}
+
+    def _spy(**kwargs):
+        captured.update(kwargs)
+        return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
+
+    # The function reads `get_value` via a local import in the body — patch
+    # the import target instead.
+    with patch(
+        "app.instance_config.get_value",
+        side_effect=lambda *args, **kw: (
+            0 if args[-1] == "max_bytes_per_materialize"
+            else kw.get("default", "")
+        ),
+    ), patch("app.api.sync._materialize_table", side_effect=_spy):
+        sync_mod._run_materialized_pass(system_db, stub_bq)
+
+    assert captured["max_bytes"] is None
--- a/tests/test_table_registry_source_query.py
+++ b/tests/test_table_registry_source_query.py
@ -0,0 +1,93 @@
+"""Repository round-trips source_query column for query_mode='materialized'.
+
+Lives alongside the schema-v20 migration: register() now accepts source_query
+as an Optional[str] kwarg, and the column flows through SELECT * via list/get.
+"""
+import duckdb
+import pytest
+
+from src.db import _ensure_schema
+from src.repositories.table_registry import TableRegistryRepository
+
+
+@pytest.fixture
+def repo(tmp_path):
+    conn = duckdb.connect(str(tmp_path / "system.duckdb"))
+    _ensure_schema(conn)
+    return TableRegistryRepository(conn)
+
+
+def test_register_persists_source_query(repo):
+    repo.register(
+        id="orders_90d",
+        name="orders_90d",
+        source_type="bigquery",
+        query_mode="materialized",
+        source_query=(
+            "SELECT date, SUM(revenue) FROM `prj.ds.orders` "
+            "WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) GROUP BY 1"
+        ),
+        sync_schedule="every 6h",
+    )
+    row = repo.get("orders_90d")
+    assert row is not None
+    assert row["query_mode"] == "materialized"
+    assert "INTERVAL 90 DAY" in row["source_query"]
+    assert row["sync_schedule"] == "every 6h"
+
+
+def test_register_omitted_source_query_stays_null(repo):
+    """Default registrations (Keboola local) must not stamp an empty string."""
+    repo.register(id="t1", name="t1", source_type="keboola", query_mode="local")
+    row = repo.get("t1")
+    assert row is not None
+    assert row["source_query"] is None
+
+
+def test_list_all_includes_source_query(repo):
+    repo.register(
+        id="m1", name="m1", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+    )
+    rows = repo.list_all()
+    assert len(rows) == 1
+    assert rows[0]["source_query"] == "SELECT 1"
+
+
+def test_register_updates_source_query_on_conflict(repo):
+    """Re-registering the same id must overwrite source_query (admin edit)."""
+    repo.register(
+        id="m1", name="m1", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+    )
+    repo.register(
+        id="m1", name="m1", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 2",
+    )
+    row = repo.get("m1")
+    assert row["source_query"] == "SELECT 2"
+
+
+def test_register_preserves_registered_at_when_supplied(repo):
+    """source_query addition must not break the existing registered_at
+    preservation contract (admin edits keep the original timestamp)."""
+    from datetime import datetime, timezone
+
+    original = datetime(2026, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
+    repo.register(
+        id="t", name="t", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 1",
+        registered_at=original,
+    )
+    repo.register(
+        id="t", name="t", source_type="bigquery",
+        query_mode="materialized", source_query="SELECT 2",
+        registered_at=original,
+    )
+    row = repo.get("t")
+    assert row["source_query"] == "SELECT 2"
+    # Don't assert exact equality on naive vs aware (DuckDB strips tz);
+    # just confirm the year+month+day didn't slide forward to 'now'.
+    assert row["registered_at"].year == 2026
+    assert row["registered_at"].month == 1
+    assert row["registered_at"].day == 1