diff --git a/app/api/admin.py b/app/api/admin.py index ae2728a..813120c 100644 --- a/app/api/admin.py +++ b/app/api/admin.py @@ -12,7 +12,7 @@ import uuid from pathlib import Path from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator from typing import Optional, List, Dict, Any import duckdb @@ -172,6 +172,9 @@ _EDITABLE_SECTIONS: tuple[str, ...] = ( "server", "auth", "ai", + "openmetadata", + "desktop", + "corporate_memory", ) # "Danger-zone" sections — flipping these can lock operators out (auth.*) or @@ -181,6 +184,404 @@ _EDITABLE_SECTIONS: tuple[str, ...] = ( # the right warning copy. _DANGER_SECTIONS: tuple[str, ...] = ("auth", "server") +# Known-but-optional config fields per section. The /admin/server-config UI +# uses this registry alongside the YAML payload to render fields the operator +# might want to set even though they're not currently in instance.yaml. +# +# Schema per field: +# { +# "kind": "string" | "secret" | "bool" | "int" | "select" | "object" | "array", +# "default": (optional) +# "hint": "" +# "options": [...] (only for kind="select") +# "fields": {: } (only for kind="object", nested fields) +# "item_kind": "string" | ... (only for kind="array", element type) +# "required": bool (defaults False; UI marks the label) +# } +# +# Subagents 2-4 will populate the bodies. The registry enables the UI to +# render missing-but-known fields with placeholders + hints rather than +# forcing the operator to discover them via the JSON-patch textarea or +# hitting a runtime error first. The smoke fixture below +# (data_source.bigquery.billing_project) proves the renderer wiring works +# end-to-end so subagents 2-4 only have to add registry entries — they +# don't need to touch admin_server_config.html. +_KNOWN_FIELDS: dict[str, dict[str, dict]] = { + "instance": { + # No commonly-missing instance-level fields. The example YAML's + # `name`/`subtitle` are always populated by `da setup` so they + # render via the populated path; nothing to surface here. + }, + "data_source": { + "bigquery": { + "kind": "object", + "hint": "BigQuery connection knobs (read more in docs/DEPLOYMENT.md)", + "fields": { + "billing_project": { + "kind": "string", + "hint": ( + "GCP project to bill BQ jobs against. Set when SA can read " + "the data project but cannot bill there (e.g. shared read-only " + "data project). Defaults to data_source.bigquery.project. " + "Mismatch → 403 USER_PROJECT_DENIED on every BQ call." + ), + }, + "legacy_wrap_views": { + "kind": "bool", + "default": False, + "hint": ( + "When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB " + "master view via bigquery_query() (jobs API) so analysts can " + "SELECT * FROM viewname directly. When false (default), views are " + "catalog-only — analysts use `da fetch` or `da query --remote`. " + "ON for view-heavy deployments where most views are small enough " + "to materialize without 'Response too large' (issue #101)." + ), + }, + "max_bytes_per_materialize": { + "kind": "int", + "default": 10737418240, + "hint": ( + "Cost guardrail for query_mode='materialized' BQ scans (dry-run " + "check before running). Bytes processed; exceeds → registration " + "or sync rejected. 0 disables the gate. Default 10737418240 = 10 GiB." + ), + }, + }, + }, + "keboola": { + "kind": "object", + "hint": "Keboola Storage API connection", + "fields": { + "stack_url": { + "kind": "string", + "hint": ( + "e.g. https://connection.keboola.com (instance-specific stack URL). " + "Validated against private-IP allowlist on save (SSRF guard)." + ), + }, + "project_id": { + "kind": "string", + "hint": "Keboola project ID (numeric, but kept as string in YAML).", + }, + }, + }, + }, + "email": { + # SMTP fields render via the populated path (always set when email + # is enabled); no commonly-missing optional knobs at this layer. + }, + "telegram": { + # Rarely missing; leave empty. + }, + "jira": { + # Webhook + REST credentials always present when Jira is configured. + }, + "theme": { + # Cosmetic only; rarely missing. + }, + "server": { + # TLS / hostname knobs are mostly env-side; nothing to surface here. + }, + "auth": { + "allowed_domain": { + "kind": "string", + "hint": ( + "Comma-separated list of allowed sign-in email domains (e.g. " + "'acme.com,acme-internal.com'). Single domain works too. Empty → no " + "domain restriction (any verified Google identity can sign in)." + ), + }, + }, + "ai": { + "base_url": { + "kind": "string", + "hint": ( + "Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM, etc.). " + "Ignored when provider='anthropic'. Examples: https://litellm.example.com, " + "https://openrouter.ai/api/v1." + ), + }, + "structured_output": { + "kind": "select", + "options": ["strict", "json", "auto"], + "default": "auto", + "hint": ( + "JSON-schema enforcement strategy. strict=Layer 1 only " + "(Anthropic/OpenAI native, fail otherwise). json=Layer 1 + Layer 2 " + "fallback. auto=all three layers including prompt-based JSON (most " + "compatible, least strict)." + ), + }, + }, + "openmetadata": { + "url": { + "kind": "string", + "hint": "Base URL of your OpenMetadata server (e.g. https://catalog.example.com).", + }, + "token": { + "kind": "secret", + "hint": ( + "JWT bearer token. Use ${OPENMETADATA_TOKEN} env-var reference " + "(don't paste secret directly)." + ), + }, + "cache_ttl_seconds": { + "kind": "int", + "default": 3600, + "hint": "How long to cache catalog responses in-process. Default 3600s (1h).", + }, + "verify_ssl": { + "kind": "bool", + "default": True, + "hint": ( + "TLS verification. Default true. Set false ONLY for internal CAs / " + "self-signed certs — sends the JWT over an unverified channel." + ), + }, + }, + "desktop": { + "jwt_issuer": { + "kind": "string", + "default": "data-analyst", + "hint": "JWT iss claim. Match what the desktop app verifies.", + }, + "jwt_secret": { + "kind": "secret", + "hint": "JWT signing secret. Use ${DESKTOP_JWT_SECRET} env-var reference.", + }, + "url_scheme": { + "kind": "string", + "default": "data-analyst", + "hint": "Custom URL scheme registered by the desktop app (data-analyst://...).", + }, + }, + # corporate_memory governance — optional. When the section is missing + # from instance.yaml the system runs in legacy democratic-wiki mode + # (no admin review). Schema mirrors config/instance.yaml.example + # lines 224-317; renderer handles arbitrary depth + arrays + maps. + "corporate_memory": { + "distribution_mode": { + "kind": "select", + "options": ["mandatory_only", "admin_curated", "hybrid"], + "default": "hybrid", + "hint": ( + "How knowledge reaches users. mandatory_only = admin-only; " + "admin_curated = admin + user voting as feedback; " + "hybrid = default (mandatory from admin + optional from user voting)." + ), + }, + "approval_mode": { + "kind": "select", + "options": ["review_queue", "auto_publish", "threshold"], + "default": "review_queue", + "hint": ( + "How AI-extracted items enter the system. review_queue = admin " + "approval required (default); auto_publish = live immediately; " + "threshold = high-confidence auto, low-confidence to queue." + ), + }, + "review_period_months": { + "kind": "int", + "default": 6, + "hint": "How often approved/mandatory items are flagged for re-review (months).", + }, + "notify_on_new_items": { + "kind": "bool", + "default": True, + "hint": "Notify km_admins when new pending items arrive.", + }, + "sources": { + "kind": "object", + "hint": ( + "Knowledge-source ingestion. Each source has its own enabled " + "flag + base confidence." + ), + "fields": { + "claude_local_md": { + "kind": "object", + "fields": { + "enabled": {"kind": "bool", "default": True}, + "confidence_base": { + "kind": "float", + "default": 0.50, + "hint": "Confidence assigned to extractions from CLAUDE.local.md (0-1).", + }, + }, + }, + "session_transcripts": { + "kind": "object", + "fields": { + "enabled": {"kind": "bool", "default": True}, + "confidence_base": {"kind": "float", "default": 0.60}, + "max_turns_per_session": { + "kind": "int", + "default": 100, + "hint": "Truncate transcripts longer than this many turns.", + }, + "detection_types": { + "kind": "array", + "item_kind": "string", + "default": [ + "correction", + "confirmation", + "unprompted_definition", + ], + "hint": ( + "Which extraction patterns to detect. Each entry " + "is a detection-type tag." + ), + }, + }, + }, + }, + }, + "extraction": { + "kind": "object", + "fields": { + "model": { + "kind": "string", + "default": "claude-haiku-4-5-20251001", + "hint": "LLM used to extract knowledge. Override for cost or quality.", + }, + "sensitivity_check": {"kind": "bool", "default": True}, + "contradiction_check": {"kind": "bool", "default": True}, + }, + }, + "confidence": { + "kind": "object", + "hint": "Confidence scoring + decay rules.", + "fields": { + "base": { + "kind": "map", + "key_kind": "string", + "value_kind": "float", + "default": { + "user_verification.correction": 0.90, + "user_verification.unprompted_definition": 0.90, + "user_verification.confirmation": 0.60, + "admin_mandate": 1.00, + "claude_local_md": 0.50, + "session_transcript": 0.50, + }, + "hint": ( + "Base score per source/detection. Keys are 'source_type' " + "or 'source_type.detection_type' (the dot is data, not " + "nesting)." + ), + }, + "modifiers": { + # map>. The renderer's structured + # editor for "map of objects with declared subfields" is a + # TODO (see admin_server_config.html); for now this falls + # back to a JSON textarea — admins editing it see the + # schema doc inline via the hint. + "kind": "map", + "key_kind": "string", + "value_kind": "object", + "value_fields": {}, # signals the JSON-textarea fallback + "hint": ( + "Per-key modifier step sizes applied to base when " + "optional signals are present (3-level dotted paths). " + "Edit as a JSON object — outer keys mirror confidence.base " + "keys; inner objects map signal name to bonus float." + ), + }, + "decay": { + "kind": "object", + "fields": { + "mode": { + "kind": "select", + "options": ["linear", "exponential"], + "default": "exponential", + }, + "half_life_months": { + "kind": "int", + "default": 12, + "hint": "Used when mode=exponential.", + }, + "decay_rate_monthly": { + "kind": "float", + "default": 0.02, + "hint": "Used when mode=linear.", + }, + "floor": { + "kind": "map", + "key_kind": "string", + "value_kind": "float", + "default": { + "admin_mandate": 0.50, + "user_verification": 0.40, + "default": 0.0, + }, + "hint": ( + "Per-source minimum confidence — items never decay " + "below this floor." + ), + }, + }, + }, + }, + }, + "contradiction_detection": { + "kind": "object", + "fields": { + "enabled": {"kind": "bool", "default": True}, + "max_candidates": { + "kind": "int", + "default": 10, + "hint": "Max contradiction candidates to evaluate per new item.", + }, + }, + }, + "entity_resolution": { + "kind": "object", + "fields": { + "enabled": {"kind": "bool", "default": True}, + "entities": { + "kind": "map", + "key_kind": "string", + "value_kind": "array", + "value_item_kind": "string", + "default": { + "metrics": ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"], + "products": ["Platform", "API", "Dashboard"], + }, + "hint": ( + "Domain-entity vocabulary. Key = domain category; value = " + "canonical names list." + ), + }, + }, + }, + "domain_owners": { + "kind": "map", + "key_kind": "string", + "value_kind": "array", + "value_item_kind": "string", + "hint": ( + "Per-domain admin emails. Key = domain name; value = email list." + ), + }, + "domains": { + "kind": "array", + "item_kind": "string", + "default": [ + "finance", + "engineering", + "product", + "data", + "operations", + "infrastructure", + ], + "hint": ( + "Knowledge domains analysts can target. Each must match a key " + "in domain_owners." + ), + }, + }, +} + # Keys whose values must be redacted from the audit diff. We match # substring (case-insensitive) so `client_secret`, `api_token`, # `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all @@ -385,6 +786,43 @@ class ServerConfigUpdateRequest(BaseModel): ) +# Optional BQ fields whose runtime defaults are documented but which used to +# be invisible in the editor when YAML omitted them. The data_source.bigquery +# subtree renders as a JSON textarea; a key that's absent from the GET +# payload literally cannot appear in the form for the operator to edit. We +# surface them with their documented defaults so the UI always shows them as +# editable knobs — see Phase J of the admin-tables-cleanup work. +# +# - billing_project: defaults to data project; explicit value needed when +# the SA can read the data project but not bill against it. +# - legacy_wrap_views: default False; toggling ON wraps BQ views via +# `bigquery_query()` so analysts can `SELECT *` directly. +# - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'` +# (default 10 GiB; 0 disables; null falls through to the default). +_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = { + "billing_project": "", + "legacy_wrap_views": False, + "max_bytes_per_materialize": 10737418240, +} + + +def _ensure_bq_optional_fields(sections: Dict[str, Any]) -> None: + """In-place: add missing BQ optional fields to data_source.bigquery so the + UI's JSON-textarea renders them as editable keys. Existing values are + preserved — only absent keys are populated with their documented default. + """ + ds = sections.get("data_source") + if not isinstance(ds, dict): + return + bq = ds.get("bigquery") + if not isinstance(bq, dict): + # No BQ subsection — leave alone. Non-BQ instances don't need these + # knobs, and creating an empty bigquery dict would be misleading. + return + for key, default in _BQ_OPTIONAL_FIELD_DEFAULTS.items(): + bq.setdefault(key, default) + + @router.get("/server-config") async def get_server_config( user: dict = Depends(require_admin), @@ -403,11 +841,20 @@ async def get_server_config( # file omits them — operator can populate from scratch without manual # JSON edits. sections = {section: redacted.get(section, {}) for section in _EDITABLE_SECTIONS} + # Always surface the optional BQ knobs so the operator sees them in the + # UI's JSON editor instead of having to know they exist (Phase J). + _ensure_bq_optional_fields(sections) return { "sections": sections, "editable_sections": list(_EDITABLE_SECTIONS), "danger_sections": list(_DANGER_SECTIONS), "secret_key_patterns": list(_SECRET_KEY_PATTERNS), + # Known-but-optional fields per section so the UI can render + # placeholders for fields the operator hasn't set yet (Phase J). + # Subagents 2-4 populate the bodies; the renderer ships now so the + # mechanism is wired end-to-end and adding entries is purely a + # data-edit in `_KNOWN_FIELDS` above. + "known_fields": _KNOWN_FIELDS, } @@ -654,7 +1101,17 @@ def _sanitize_for_audit(payload: Dict[str, Any]) -> Dict[str, Any]: class RegisterTableRequest(BaseModel): name: str folder: Optional[str] = None - sync_strategy: str = "full_refresh" + sync_strategy: str = Field( + default="full_refresh", + deprecated=True, + description=( + "DEPRECATED: catalog/profiler metadata only. No extractor reads " + "this field; every sync is a full overwrite regardless of value. " + "profiler.is_partitioned() consumes it for parquet-layout " + "detection. Field stays for back-compat; will be removed in a " + "future major release." + ), + ) # Composite primary keys are real (session-grain MSA tables key on # `(session_id, event_date)`, browse rows on more). The frontend sends + # reads this as a list; backend stores it JSON-serialized in VARCHAR. @@ -664,9 +1121,41 @@ class RegisterTableRequest(BaseModel): source_type: Optional[str] = None bucket: Optional[str] = None source_table: Optional[str] = None + # Backs query_mode='materialized'. Stored verbatim in + # table_registry.source_query (schema v20); the trigger pass runs it + # through the DuckDB BQ extension via BqAccess and writes the result + # to /data/extracts/bigquery/data/.parquet. + source_query: Optional[str] = None query_mode: str = "local" sync_schedule: Optional[str] = None - profile_after_sync: bool = True + profile_after_sync: bool = Field( + default=True, + deprecated=True, + description=( + "DEPRECATED: not consumed by the runtime (Agent 1 finding " + "2026-05-01). Profiler runs unconditionally on every synced " + "table; this flag has no effect. Field stays for back-compat." + ), + ) + + @model_validator(mode="after") + def _check_mode_query_coherence(self): + """Enforce query_mode ↔ source_query invariants up front so an admin + can't persist a remote/local row carrying an orphan source_query, and + materialized rows can't be registered without a SQL body.""" + sq = (self.source_query or "").strip() or None + if self.query_mode == "materialized" and not sq: + raise ValueError( + "query_mode='materialized' requires a non-empty source_query" + ) + if self.query_mode != "materialized" and sq: + raise ValueError( + "source_query is only valid when query_mode='materialized'" + ) + # Normalise: stash the trimmed-or-None form so the persisted column + # never carries surrounding whitespace or empty-string sentinels. + self.source_query = sq + return self @field_validator("primary_key", mode="before") @classmethod @@ -707,12 +1196,67 @@ class RegisterTableRequest(BaseModel): def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: """Enforce BQ-specific shape on a register/precheck request. - Mutates the model: forces ``query_mode='remote'`` and - ``profile_after_sync=False`` (per Decision 7 in #108) so a caller can't - accidentally enqueue a parquet profiling pass for a remote view that - has no local file. Raises HTTPException(422) for missing required - fields and HTTPException(400) for unsafe identifiers / bogus project_id. + Two BQ paths: + + - ``query_mode='materialized'`` — admin-registered SQL writes a parquet on + schedule. Requires ``source_query``; ``bucket`` / ``source_table`` are + not used (the SQL inlines the references). Doesn't force any field; the + Pydantic ``model_validator`` already gated the query/mode coherence. + + - ``query_mode='remote'`` (or default) — remote view over a single BQ + table. Requires ``bucket`` (BQ dataset) + ``source_table``. Mutates + the model: forces ``query_mode='remote'`` and ``profile_after_sync=False`` + (per Decision 7 in #108) so a caller can't accidentally enqueue a + parquet profiling pass for a remote view that has no local file. + + Raises HTTPException(422) for missing required fields and + HTTPException(400) for unsafe identifiers / bogus project_id. """ + if req.query_mode == "materialized": + # Materialized BQ rows: the SQL body replaces dataset+table refs. + # Pydantic model_validator already verified source_query is non-empty; + # all we still need is a valid project_id and a safe view name. + if not req.source_query or not req.source_query.strip(): + raise HTTPException( + status_code=422, + detail="bigquery materialized: 'source_query' is required", + ) + raw_name = req.name or "" + if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name): + raise HTTPException( + status_code=400, + detail=( + f"bigquery: view name {raw_name!r} is unsafe — must match " + f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) " + "with no leading/trailing whitespace" + ), + ) + from app.instance_config import get_value + project_id = get_value("data_source", "bigquery", "project", default="") + if not project_id: + raise HTTPException( + status_code=400, + detail=( + "bigquery: data_source.bigquery.project is not set in " + "instance.yaml; configure it via /admin/server-config or " + "/api/admin/configure first" + ), + ) + if not _is_safe_project_id(project_id): + raise HTTPException( + status_code=400, + detail=( + f"bigquery: data_source.bigquery.project {project_id!r} " + "is malformed — must match GCP project_id grammar " + "^[a-z][a-z0-9-]{4,28}[a-z0-9]$" + ), + ) + # Phase C: profile_after_sync is now inert (Pydantic field marked + # deprecated; not read by app/api/sync.py:410-438). The runtime + # profiles every synced table unconditionally, so we no longer + # force-set this here as a "signal." + return + if not req.bucket or not req.bucket.strip(): raise HTTPException( status_code=422, @@ -796,24 +1340,106 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None: "must match GCP project_id grammar ^[a-z][a-z0-9-]{4,28}[a-z0-9]$" ), ) - # Force the BQ-required mode + flag (Decision 7). The orchestrator and + # Force the BQ-required mode (Decision 7). The orchestrator and # extractor both assume remote; persisting `local` here would later create # a profiling job against a non-existent parquet file. + # Phase C: profile_after_sync is now inert (deprecated, not read by the + # runtime); no longer force-set here. req.query_mode = "remote" - req.profile_after_sync = False class UpdateTableRequest(BaseModel): name: Optional[str] = None - sync_strategy: Optional[str] = None + sync_strategy: Optional[str] = Field( + default=None, + deprecated=True, + description=( + "DEPRECATED: catalog/profiler metadata only. See " + "RegisterTableRequest.sync_strategy." + ), + ) primary_key: Optional[List[str]] = None description: Optional[str] = None source_type: Optional[str] = None bucket: Optional[str] = None source_table: Optional[str] = None + source_query: Optional[str] = None query_mode: Optional[str] = None sync_schedule: Optional[str] = None - profile_after_sync: Optional[bool] = None + profile_after_sync: Optional[bool] = Field( + default=None, + deprecated=True, + description=( + "DEPRECATED: not consumed by the runtime. See " + "RegisterTableRequest.profile_after_sync." + ), + ) + + @model_validator(mode="after") + def _check_mode_query_coherence(self): + """PUT semantics — only the fields explicitly in the body are + validated. The body is overlaid on the existing row at the handler + level (see ``update_table``), so omitted fields keep their stored + values and the synthetic ``RegisterTableRequest`` constructed against + the merged record runs the strict cross-field check before persist. + + The only invariants enforceable from the PUT body alone: + + - explicit ``source_query='SELECT ...'`` paired with ``query_mode`` + that isn't materialized → coherent reject (the SQL would be dead); + - explicit ``source_query='SELECT ...'`` without any ``query_mode`` + in the body → reject; the operator must commit to materialized; + - explicit empty/whitespace ``source_query=''`` paired with + ``query_mode='materialized'`` → reject (operator clearly + mistyped — they sent the field). + + Pre-fix this validator also rejected ``{"query_mode": "materialized", + "sync_schedule": "every 12h"}`` because ``source_query`` was None + — but that's the canonical "edit the schedule on a materialized + row" use-case from the Edit modal, which always sends + ``query_mode`` to indicate intent. Devin BUG_0002 on PR #148 + commit 2219255. + """ + if self.query_mode is None and self.source_query is None: + return self + + sq_raw = self.source_query + sq = (sq_raw or "").strip() or None + + # Operator explicitly sent source_query as empty/whitespace while + # claiming materialized — typo / bad form data, reject. + if ( + self.query_mode == "materialized" + and sq_raw is not None + and not sq + ): + raise ValueError( + "query_mode='materialized' requires a non-empty source_query" + ) + + # source_query only makes sense with materialized mode. Allow None + # (omitted) to flow through; only reject when explicitly set with + # the wrong mode. + if ( + self.query_mode is not None + and self.query_mode != "materialized" + and sq + ): + raise ValueError( + "source_query is only valid when query_mode='materialized'" + ) + if self.query_mode is None and sq: + raise ValueError( + "source_query requires query_mode='materialized' to be set " + "in the same request" + ) + + # Normalise: drop whitespace-only strings to None so the persisted + # column is clean. Don't touch when source_query was None to begin + # with — that signals "PUT didn't touch this field, keep existing". + if sq_raw is not None: + self.source_query = sq + return self @field_validator("primary_key", mode="before") @classmethod @@ -853,8 +1479,20 @@ class ConfigureRequest(BaseModel): @router.get("/discover-tables") async def discover_tables( user: dict = Depends(require_admin), + dataset: Optional[str] = None, ): - """Discover all available tables from the configured data source.""" + """Discover available tables from the configured data source. + + For ``data_source.type='keboola'`` returns the full Storage API table + list (single round-trip). For ``data_source.type='bigquery'``: + + - Without ``dataset``: list datasets in the configured project. + - With ``dataset=name``: list tables (BASE TABLE + VIEW) in that dataset. + + Two-step shape avoids paying the per-dataset list_tables cost up-front + on projects with hundreds of datasets — the UI populates the dataset + dropdown first, then fetches tables only for the selected dataset. + """ try: from app.instance_config import get_data_source_type source_type = get_data_source_type() @@ -870,12 +1508,89 @@ async def discover_tables( client = KeboolaClient(token=token, url=url) tables = client.discover_all_tables() return {"tables": tables, "count": len(tables), "source": "keboola"} - else: - return {"tables": [], "count": 0, "source": source_type, "error": "Discovery not implemented for this source"} + + if source_type == "bigquery": + return _discover_bigquery(dataset=dataset) + + return { + "tables": [], + "count": 0, + "source": source_type, + "error": f"Discovery not implemented for source_type={source_type!r}", + } + except HTTPException: + raise except Exception as e: raise HTTPException(status_code=500, detail=f"Discovery failed: {e}") +def _discover_bigquery(dataset: Optional[str]) -> Dict[str, Any]: + """List BQ datasets (when ``dataset`` is None) or tables-in-dataset. + + Routes through ``BqAccess.client()`` so config / auth / error + translation matches the rest of the BQ surface (#138 facade). Returns + the same shape as the Keboola path so the UI doesn't have to branch. + """ + from connectors.bigquery.access import ( + get_bq_access, + BqAccessError, + translate_bq_error, + ) + + try: + bq = get_bq_access() + client = bq.client() + except BqAccessError as e: + raise HTTPException( + status_code=BqAccessError.HTTP_STATUS.get(e.kind, 500), + detail={"error": e.message, "kind": e.kind, "details": e.details}, + ) + + try: + if dataset is None: + datasets = [] + for ds in client.list_datasets(): + datasets.append({ + "dataset_id": ds.dataset_id, + "full_id": f"{ds.project}.{ds.dataset_id}", + }) + return { + "datasets": sorted(datasets, key=lambda d: d["dataset_id"]), + "count": len(datasets), + "source": "bigquery", + } + + # List tables in the named dataset. `list_tables` returns + # `TableListItem` with `table_id` + `table_type` ('TABLE', 'VIEW', + # 'MATERIALIZED_VIEW', 'EXTERNAL', 'SNAPSHOT'). UI maps TABLE → Type + # selector "table" and VIEW/MATERIALIZED_VIEW → "view"; the rest are + # passed through with their raw type so the operator can decide. + tables = [] + for t in client.list_tables(dataset): + tables.append({ + "table_id": t.table_id, + "table_type": t.table_type, + "full_id": f"{t.project}.{t.dataset_id}.{t.table_id}", + }) + return { + "tables": sorted(tables, key=lambda t: t["table_id"]), + "count": len(tables), + "source": "bigquery", + "dataset": dataset, + } + except Exception as e: + # `translate_bq_error` re-raises non-Google exceptions unchanged, + # so wrap in HTTPException to keep the JSON-shape contract. + try: + err = translate_bq_error(e, bq.projects, bad_request_status="upstream_error") + except Exception: + raise HTTPException(status_code=502, detail=f"BQ discovery failed: {e}") + raise HTTPException( + status_code=BqAccessError.HTTP_STATUS.get(err.kind, 502), + detail={"error": err.message, "kind": err.kind, "details": err.details}, + ) + + @router.get("/registry") async def list_registry( user: dict = Depends(require_admin), @@ -1121,6 +1836,10 @@ def register_table( if is_bigquery: _validate_bigquery_register_payload(request) + # Phase C: profile_after_sync is no longer passed — the field is + # deprecated and inert at the runtime layer. The DB column keeps its + # schema default; the registry response no longer reflects request + # values for this flag. repo.register( id=table_id, name=request.name, @@ -1132,9 +1851,9 @@ def register_table( source_type=request.source_type, bucket=request.bucket, source_table=request.source_table, + source_query=request.source_query, query_mode=request.query_mode, sync_schedule=request.sync_schedule, - profile_after_sync=request.profile_after_sync, ) # Audit entry — masked params; description kept raw (it's documentation). @@ -1154,6 +1873,24 @@ def register_table( content={"id": table_id, "name": request.name, "status": "registered"}, ) + if request.query_mode == "materialized": + # Materialized BQ rows are picked up by the trigger pass on the next + # scheduled tick (or via POST /api/sync/trigger). No synchronous + # rebuild — the COPY can scan multi-GB and would block the request. + return JSONResponse( + status_code=201, + content={ + "id": table_id, + "name": request.name, + "status": "registered", + "view_name": table_id, + "message": ( + "Materialized — parquet will be written on the next sync " + "tick. Trigger now via POST /api/sync/trigger." + ), + }, + ) + # BQ post-register: rebuild extract + master views, with timeout fallback. # Decision 1: 200 on synchronous success, 202 on timeout, 500 if the # synchronous rebuild surfaced errors. Distinct from the 201 Keboola @@ -1266,6 +2003,28 @@ def register_table_precheck( # checks identifier safety, validates project_id from instance.yaml). _validate_bigquery_register_payload(request) + # Materialized BQ rows have no `dataset.source_table` to round-trip — + # the SQL body is the contract. Skip the BQ-jobs-API call and return a + # validation-only precheck so the CLI's `--dry-run --query-mode + # materialized` path doesn't crash on an empty fully-qualified name. + if request.query_mode == "materialized": + return { + "ok": True, + "table": { + "name": request.name, + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": request.source_query, + "rows": None, + "size_bytes": None, + "columns": [], + "note": ( + "materialized precheck is validation-only — the SQL is " + "evaluated for cost on each scheduled materialize tick" + ), + }, + } + # Round-trip the BQ jobs API to confirm the table exists and the SA can # see it. Imports kept local to avoid pulling google-cloud-bigquery into # the import chain on non-BQ instances. @@ -1360,14 +2119,23 @@ async def update_table( merged.update(updates) merged.pop("id", None) # avoid duplicate id kwarg + # When switching the merged record away from materialized mode, drop + # the stale source_query — the request validator can't clear it via + # the `if v is not None` filter above. Without this, a remote/local + # row would carry an orphan source_query in the registry. + if merged.get("query_mode") != "materialized": + merged["source_query"] = None + if merged.get("source_type") == "bigquery": # Reuse the register-time validator. It mutates the request to - # force query_mode='remote' / profile_after_sync=False — apply - # the same coercion to `merged` so the persisted row matches. + # force query_mode='remote' / profile_after_sync=False (or to + # leave a materialized row alone) — apply the same coercion to + # `merged` so the persisted row matches. synthetic = RegisterTableRequest( name=merged.get("name") or table_id, bucket=merged.get("bucket"), source_table=merged.get("source_table"), + source_query=merged.get("source_query"), source_type="bigquery", query_mode=merged.get("query_mode") or "remote", profile_after_sync=bool(merged.get("profile_after_sync") or False), @@ -1380,6 +2148,7 @@ async def update_table( _validate_bigquery_register_payload(synthetic) merged["query_mode"] = synthetic.query_mode merged["profile_after_sync"] = synthetic.profile_after_sync + merged["source_query"] = synthetic.source_query repo.register(id=table_id, **merged) diff --git a/app/api/sync.py b/app/api/sync.py index 4b32316..d6bcd92 100644 --- a/app/api/sync.py +++ b/app/api/sync.py @@ -36,6 +36,204 @@ def _file_hash(path: Path) -> str: return h.hexdigest() +def _materialize_table( + *, + table_id: str, + sql: str, + bq, + output_dir: str, + max_bytes: Optional[int], +) -> dict: + """Thin wrapper around `connectors.bigquery.extractor.materialize_query` + so the trigger pass can be unit-tested by patching this seam without + touching the real BqAccess factory or the duckdb import.""" + from connectors.bigquery.extractor import materialize_query + return materialize_query( + table_id=table_id, sql=sql, bq=bq, + output_dir=output_dir, max_bytes=max_bytes, + ) + + +def _run_materialized_pass(conn: duckdb.DuckDBPyConnection, bq) -> dict: + """Walk `table_registry` for `query_mode='materialized'` rows and run any + that are due, dispatching by ``source_type`` to the correct connector's + materialize_query. Honors per-table `sync_schedule` via `is_table_due()`, + computes the file hash inline, and updates `sync_state` so the manifest + can serve the row to `da sync` without re-hashing on every request. + + BigQuery rows go through BqAccess + bigquery_query() (jobs API), + optionally cost-guarded by ``max_bytes_per_materialize``. + Keboola rows go through KeboolaAccess + ATTACH-and-COPY, no + guardrail (extension has no dry-run primitive). + + Returns: + ``{"materialized": [ids], "skipped": [ids], "errors": [{table, error}]}`` + + Errors are aggregated per row — one budget-blown table doesn't stop a + healthy sibling. ``MaterializeBudgetError`` is caught and rendered with + its structured fields so operator alerting can pick out the cap-vs-actual + bytes from the log line. + """ + from src.scheduler import is_table_due + from app.instance_config import get_value + from connectors.bigquery.extractor import MaterializeBudgetError + + bq_output_dir = str(Path(_get_data_dir()) / "extracts" / "bigquery") + kb_output_dir = Path(_get_data_dir()) / "extracts" / "keboola" / "data" + + # Sentinel: max_bytes <= 0 (or None) disables the guardrail. `get_value()` + # treats YAML `null` as "missing" → returns the default; operators must use + # the explicit `0` sentinel to disable. See config/instance.yaml.example. + # YAML accepts floats too (e.g. `10737418240.0`), and operators may + # write `1e10` for readability; coerce to int and tolerate non-numeric + # entries by falling through to the disable path with a warning. + raw_max = get_value( + "data_source", "bigquery", "max_bytes_per_materialize", + default=10 * 2**30, + ) + try: + n = int(raw_max) if raw_max is not None else 0 + except (TypeError, ValueError): + logger.warning( + "data_source.bigquery.max_bytes_per_materialize is not numeric " + "(%r); cost guardrail disabled. Set an integer or 0 to disable.", + raw_max, + ) + n = 0 + bq_max_bytes = n if n > 0 else None + + registry = TableRegistryRepository(conn) + state = SyncStateRepository(conn) + + summary = {"materialized": [], "skipped": [], "errors": []} + keboola_access = None # lazy-init on first Keboola row + + for row in registry.list_all(): + if row.get("query_mode") != "materialized": + continue + + # Convention across connectors: sync_state.table_id and the parquet + # filename are keyed by `table_registry.name` (matches Keboola's + # `_meta.table_name`) so the manifest's `registry_by_name` lookup + # at `_build_manifest_for_user` resolves cleanly. Without this, + # admins who register `name="Orders_90d"` (id slugified to + # `orders_90d`) would see `query_mode` default to `"local"` in the + # manifest because the lookup misses on `id`. + ref_name = row["name"] + + last = state.get_last_sync(ref_name) + last_iso = last.isoformat() if last else None + schedule = row.get("sync_schedule") or "every 1h" + if not is_table_due(schedule, last_iso): + summary["skipped"].append(ref_name) + continue + + source_type = row.get("source_type") or "bigquery" # legacy default + + # Dispatch by source_type. BQ rows keep using `_materialize_table` + # (the existing test seam); Keboola rows use the new Keboola + # materialize_query via a lazily-initialized KeboolaAccess. + try: + if source_type == "bigquery": + stats = _materialize_table( + table_id=ref_name, + sql=row["source_query"], + bq=bq, + output_dir=bq_output_dir, + max_bytes=bq_max_bytes, + ) + elif source_type == "keboola": + if keboola_access is None: + from connectors.keboola.access import KeboolaAccess + keboola_url = get_value( + "data_source", "keboola", "url", default="" + ) or "" + token_env = get_value( + "data_source", "keboola", "token_env", + default="KEBOOLA_STORAGE_TOKEN", + ) or "KEBOOLA_STORAGE_TOKEN" + keboola_token = os.environ.get(token_env, "") + if not (keboola_url and keboola_token): + summary["errors"].append({ + "table": ref_name, + "error": ( + "Keboola URL/token not configured for " + "materialized path (data_source.keboola.url " + f"+ env {token_env})" + ), + }) + continue + keboola_access = KeboolaAccess( + url=keboola_url, token=keboola_token, + ) + kb_output_dir.mkdir(parents=True, exist_ok=True) + from connectors.keboola.extractor import ( + materialize_query as kb_materialize_query, + ) + kb_stats = kb_materialize_query( + table_id=ref_name, + sql=row["source_query"], + keboola_access=keboola_access, + output_dir=kb_output_dir, + ) + # Normalize Keboola materialize_query output to the shape the + # BQ branch uses for downstream sync_state updates. KB returns + # {table_id, path, rows, bytes, md5}; map to + # {rows, size_bytes, hash}. + stats = { + "rows": kb_stats["rows"], + "size_bytes": kb_stats["bytes"], + "hash": kb_stats["md5"], + "query_mode": "materialized", + } + else: + summary["errors"].append({ + "table": ref_name, + "error": ( + f"materialized path not supported for " + f"source_type={source_type!r}" + ), + }) + continue + except MaterializeBudgetError as e: + logger.warning( + "Materialize cap exceeded for %s: %s bytes > %s bytes", + e.table_id, f"{e.current:,}", f"{e.limit:,}", + ) + summary["errors"].append({ + "table": ref_name, + "error": str(e), + "current": e.current, + "limit": e.limit, + }) + continue + except Exception as e: + logger.exception("Materialize failed for %s", ref_name) + summary["errors"].append({"table": ref_name, "error": str(e)}) + continue + + # `materialize_query` returns the parquet's MD5 inline — hashing + # there means we don't re-read a multi-GB file on the request + # thread. Fallback to `_file_hash(parquet_path)` if for some + # reason the stats dict didn't carry it (defensive). + parquet_hash = stats.get("hash") + if not parquet_hash: + output_dir_for_hash = ( + bq_output_dir if source_type == "bigquery" else str(kb_output_dir.parent) + ) + parquet_path = Path(output_dir_for_hash) / "data" / f"{ref_name}.parquet" + parquet_hash = _file_hash(parquet_path) + state.update_sync( + table_id=ref_name, + rows=stats["rows"], + file_size_bytes=stats["size_bytes"], + hash=parquet_hash, + ) + summary["materialized"].append(ref_name) + + return summary + + def _run_sync(tables: Optional[List[str]] = None): """Run extractor as subprocess + orchestrator rebuild. @@ -106,20 +304,38 @@ def _run_sync(tables: Optional[List[str]] = None): except Exception as e: logger.warning("Auto-discovery failed: %s", e) - if not table_configs: - logger.warning("No tables to sync for source_type=%s", source_type) - return + # CRITICAL: don't early-return when local-mode tables are empty. + # `list_local("bigquery")` is always empty on BQ-only deployments + # (BQ rows are always remote or materialized, never local), so an + # early return would prevent the materialized pass AND the + # orchestrator rebuild from ever firing on a BQ-only instance. + # Devin BUG_0002 on PR #148 commit 2fa44f2. Just flag whether the + # Keboola subprocess + custom-connectors should run; everything + # below (materialized pass, orchestrator rebuild, profiler) runs + # unconditionally so a registry with materialized rows but no + # local rows still publishes them. + run_extractor_subprocess = bool(table_configs) + if not run_extractor_subprocess: + logger.info( + "No local-mode tables to sync for source_type=%s — " + "skipping extractor subprocess; materialized pass + " + "orchestrator rebuild still run.", + source_type, + ) - # Serialize configs — strip non-serializable fields - serializable = [] - for tc in table_configs: - serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v) - for k, v in tc.items() if v is not None}) - - # Run extractor subprocess with table configs via stdin - # Subprocess does NOT open system.duckdb — no lock conflict env = {**os.environ} - cmd = [sys.executable, "-c", """ + import sys as _sys + + if run_extractor_subprocess: + # Serialize configs — strip non-serializable fields + serializable = [] + for tc in table_configs: + serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v) + for k, v in tc.items() if v is not None}) + + # Run extractor subprocess with table configs via stdin + # Subprocess does NOT open system.duckdb — no lock conflict + cmd = [sys.executable, "-c", """ import json, sys, os, logging from pathlib import Path @@ -147,58 +363,113 @@ print(json.dumps(result)) sys.exit(compute_exit_code(result, len(configs))) """] - import sys as _sys - print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True) + print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True) - result = subprocess.run( - cmd, input=_json.dumps(serializable), capture_output=True, text=True, - timeout=1800, env=env, - cwd=str(Path(__file__).parent.parent.parent), - ) + try: + result = subprocess.run( + cmd, input=_json.dumps(serializable), capture_output=True, text=True, + timeout=1800, env=env, + cwd=str(Path(__file__).parent.parent.parent), + ) + except subprocess.TimeoutExpired: + # Catch the timeout LOCALLY so the materialized BQ pass and + # orchestrator rebuild below still fire. Pre-fix the timeout + # propagated to the outer except handler and skipped the rest + # of `_run_sync` — on a dual-source deployment a slow Keboola + # extractor would silently block all materialized parquets + + # master-view rebuild until the next trigger. Devin BUG_0001 + # on PR #148 commit 2219255. Mirrors the per-custom-connector + # timeout pattern below (line ~347). + print( + "[SYNC] Extractor timed out after 1800s — continuing to " + "materialized pass + orchestrator rebuild", + file=_sys.stderr, flush=True, + ) + result = None - if result.stdout: - print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True) - if result.stderr: - print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True) - # Issue #81 Group B: three exit codes. 0 = full success, - # 1 = full failure, 2 = partial. Partial is a data-quality - # alert, not a crash — the orchestrator's per-table _meta - # machinery already captured which tables succeeded; we just - # need to log loudly so operator alerting can pick it up. - if result.returncode == 0: - print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True) - elif result.returncode == 2: + if result is not None: + if result.stdout: + print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True) + if result.stderr: + print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True) + # Issue #81 Group B: three exit codes. 0 = full success, + # 1 = full failure, 2 = partial. Partial is a data-quality + # alert, not a crash — the orchestrator's per-table _meta + # machinery already captured which tables succeeded; we just + # need to log loudly so operator alerting can pick it up. + if result.returncode == 0: + print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True) + elif result.returncode == 2: + print( + f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables " + f"succeeded, some failed; see stderr for per-table errors. " + f"Successful tables will still be published by the orchestrator.", + file=_sys.stderr, flush=True, + ) + else: + print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True) + + # Run custom connectors (Tier A: local mount) — only when there + # were local-mode tables to drive the extractor. Custom connectors + # currently piggyback on the same env as the Keboola extractor. + connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom"))) + if connectors_dir.exists(): + for connector_dir in sorted(connectors_dir.iterdir()): + if not connector_dir.is_dir(): + continue + extractor = connector_dir / "extractor.py" + if not extractor.exists(): + continue + logger.info("Running custom connector: %s", connector_dir.name) + try: + custom_result = subprocess.run( + [sys.executable, str(extractor)], + env=env, capture_output=True, text=True, timeout=600, + cwd=str(Path(__file__).parent.parent.parent), + ) + if custom_result.returncode != 0: + logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:]) + else: + logger.info("Custom connector %s completed", connector_dir.name) + except subprocess.TimeoutExpired: + logger.error("Custom connector %s timed out", connector_dir.name) + + # Materialized BigQuery pass — runs admin-registered SQL through the + # DuckDB BQ extension (via BqAccess) and writes parquet for due rows. + # The orchestrator rebuild below picks the parquets up via the + # standard local-parquet discovery. Wrapped so a misconfigured BQ + # facade doesn't kill the Keboola path. + try: + from app.instance_config import get_value as _get_value + bq_project = _get_value( + "data_source", "bigquery", "project", default="" + ) or "" + if bq_project: + from connectors.bigquery.access import get_bq_access + from src.db import get_system_db as _get_system_db + bq_access = get_bq_access() + mat_conn = _get_system_db() + try: + mat_summary = _run_materialized_pass(mat_conn, bq_access) + finally: + mat_conn.close() + print( + f"[SYNC] Materialized BQ: {len(mat_summary['materialized'])} ok, " + f"{len(mat_summary['skipped'])} skipped, " + f"{len(mat_summary['errors'])} errors", + file=_sys.stderr, flush=True, + ) + for err in mat_summary["errors"]: + print( + f"[SYNC] {err['table']}: {err['error']}", + file=_sys.stderr, flush=True, + ) + except Exception as e: print( - f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables " - f"succeeded, some failed; see stderr for per-table errors. " - f"Successful tables will still be published by the orchestrator.", + f"[SYNC] Materialized BQ pass FAILED: {e}", file=_sys.stderr, flush=True, ) - else: - print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True) - - # Run custom connectors (Tier A: local mount) - connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom"))) - if connectors_dir.exists(): - for connector_dir in sorted(connectors_dir.iterdir()): - if not connector_dir.is_dir(): - continue - extractor = connector_dir / "extractor.py" - if not extractor.exists(): - continue - logger.info("Running custom connector: %s", connector_dir.name) - try: - custom_result = subprocess.run( - [sys.executable, str(extractor)], - env=env, capture_output=True, text=True, timeout=600, - cwd=str(Path(__file__).parent.parent.parent), - ) - if custom_result.returncode != 0: - logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:]) - else: - logger.info("Custom connector %s completed", connector_dir.name) - except subprocess.TimeoutExpired: - logger.error("Custom connector %s timed out", connector_dir.name) + traceback.print_exc() # Rebuild master views (reads extract.duckdb files, no write conflict) from src.orchestrator import SyncOrchestrator diff --git a/cli/commands/admin.py b/cli/commands/admin.py index 9517b02..77a48f6 100644 --- a/cli/commands/admin.py +++ b/cli/commands/admin.py @@ -67,11 +67,19 @@ def register_table( source_type: str = typer.Option("keboola", help="Source type: keboola | bigquery | jira | local"), bucket: str = typer.Option("", help="Source bucket (Keboola) or dataset (BigQuery)"), source_table: str = typer.Option("", help="Source table name in the bucket/dataset"), - query_mode: str = typer.Option("local", help="Query mode: local or remote (forced to 'remote' for bigquery)"), + query_mode: str = typer.Option("local", help="Query mode: local | remote | materialized"), + query: str = typer.Option( + "", + "--query", + help=( + "SQL body for query_mode='materialized' (BigQuery only). " + "Inline SQL or `@path/to.sql` to read from disk." + ), + ), description: str = typer.Option("", help="Table description"), sync_schedule: str = typer.Option( "", - help="Cron schedule (BigQuery only — note: scheduler not yet wired, see #79 / M3 of #108)", + help="Cron schedule (e.g. 'every 6h' / 'daily 03:00'); honored by materialized BQ rows", ), dry_run: bool = typer.Option( False, @@ -81,11 +89,38 @@ def register_table( ): """Register a single table. - For BigQuery: dataset goes in --bucket, the BQ table/view name in - --source-table, the DuckDB view name in NAME. The server forces - query_mode=remote and profile_after_sync=False; --dry-run goes - through /precheck and prints rows + size + columns without writing. + Modes: + - **local** (Keboola): batch pull, parquet on disk. + - **remote** (BigQuery): view only, queries go to BQ. Requires + `--bucket` + `--source-table`. + - **materialized** (BigQuery): server-side scheduled SQL → parquet. + Requires `--query` (inline or `@file.sql`); `--bucket` / + `--source-table` ignored. + + `--dry-run` goes through /precheck (BQ remote only — for materialized + rows, dry-run is a no-op since the SQL itself is the contract). """ + from pathlib import Path + + # Resolve --query @file.sql shorthand. + source_query = "" + if query: + if query.startswith("@"): + sql_path = Path(query[1:]) + if not sql_path.exists(): + typer.echo(f"Error: SQL file not found: {sql_path}", err=True) + raise typer.Exit(2) + source_query = sql_path.read_text(encoding="utf-8").strip() + else: + source_query = query.strip() + + if query_mode == "materialized" and not source_query: + typer.echo( + "Error: --query-mode materialized requires --query (literal SQL or @path.sql)", + err=True, + ) + raise typer.Exit(2) + payload = { "name": name, "source_type": source_type, @@ -94,6 +129,11 @@ def register_table( "query_mode": query_mode, "description": description, } + # Omit empty optional fields so the server-side validator doesn't see + # `source_query=""` on a remote/local row (which would trigger the + # "source_query forbidden" branch). + if source_query: + payload["source_query"] = source_query if sync_schedule: payload["sync_schedule"] = sync_schedule diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py index 99faa5a..e3e7838 100644 --- a/connectors/bigquery/extractor.py +++ b/connectors/bigquery/extractor.py @@ -9,7 +9,7 @@ import shutil import threading from datetime import datetime, timezone from pathlib import Path -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import duckdb @@ -182,6 +182,14 @@ def _init_extract_locked( _create_remote_attach_table(conn, project_id) for tc in table_configs: + # Materialized rows are written by the sync trigger pass via + # `materialize_query()` — they live as parquets in + # /data/extracts/bigquery/data/, picked up by the orchestrator's + # standard local-parquet discovery. Don't create a remote view + # here (would shadow the parquet via name collision). + if tc.get("query_mode") == "materialized": + continue + table_name = tc["name"] dataset = tc.get("bucket", "") source_table = tc.get("source_table", table_name) @@ -279,6 +287,183 @@ def _init_extract_locked( return stats +class MaterializeBudgetError(RuntimeError): + """Raised when a `materialize_query` BQ dry-run estimate exceeds the + configured `data_source.bigquery.max_bytes_per_materialize` cap. + + The materialize trigger pass logs this and skips the row; the next + scheduled tick re-tries (in case the underlying table size dropped + or the operator raised the cap). Shape mirrors `BqAccessError` — + `current` and `limit` for operator triage. + """ + + def __init__(self, message: str, *, table_id: str, current: int, limit: int): + self.table_id = table_id + self.current = current + self.limit = limit + super().__init__(message) + + +def materialize_query( + table_id: str, + sql: str, + *, + bq, # connectors.bigquery.access.BqAccess (untyped here to avoid circular import at type-check) + output_dir: str, + max_bytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run `sql` through the DuckDB BigQuery extension and write the result + to `/data/.parquet` atomically. + + Designed for `query_mode='materialized'` table_registry rows. The SQL + is admin-registered (validated upstream) and may reference DuckDB + three-part identifiers (`bq."dataset"."table"`) resolved by the + in-session ATTACH, OR native BQ identifiers via the `bigquery_query()` + table function — both work because the session has the bigquery + extension loaded with a SECRET token. + + Cost guardrail: when `max_bytes` is a positive int, run a BQ dry-run + via `bq.client()` first; raise `MaterializeBudgetError` if the + estimate exceeds the cap. `max_bytes=None` or `max_bytes <= 0` + disables the guardrail (config sentinel, see + `data_source.bigquery.max_bytes_per_materialize`). + + Dry-run is best-effort and fail-open: if the SQL uses DuckDB syntax + that the native BQ client can't parse (e.g. `bq."ds"."t"`), the + dry-run raises and we log a warning; the COPY still runs. This + matches the BqAccess facade's "client is for native BQ SQL only" + contract — operators who need the cap to engage write the registered + SQL using native BQ identifiers (`\\`project.ds.t\\``). + + Atomic write: result lands in `.parquet.tmp` first, then + `os.replace` swaps it in. A failed COPY leaves no partial file behind. + + Args: + table_id: Logical id from table_registry; becomes the parquet + filename. Must pass `validate_identifier()` so it can't + inject path traversal. + sql: SELECT statement, no trailing semicolon. + bq: A `BqAccess` instance — provides `duckdb_session()` for the + COPY and `client()` for the dry-run. + output_dir: Connector root, e.g. `/data/extracts/bigquery`. + Parquet lands in `/data/.parquet`. + max_bytes: Optional cap on BQ bytes scanned. None or <= 0 disables. + + Returns: + {"rows": int, "size_bytes": int, "query_mode": "materialized"} + + Raises: + ValueError: if `table_id` is unsafe. + MaterializeBudgetError: if `max_bytes > 0` and dry-run estimate exceeds it. + BqAccessError: from `bq.duckdb_session()` (auth_failed / bq_lib_missing / + not_configured) — caller catches and aggregates into the trigger + pass summary. + duckdb.Error: if the COPY itself fails (e.g. bad SQL, missing table). + """ + if not validate_identifier(table_id, "materialize table_id"): + raise ValueError(f"unsafe table_id: {table_id!r}") + + out_path = Path(output_dir) + data_dir = out_path / "data" + data_dir.mkdir(parents=True, exist_ok=True) + + parquet_path = data_dir / f"{table_id}.parquet" + tmp_path = data_dir / f"{table_id}.parquet.tmp" + if tmp_path.exists(): + tmp_path.unlink() + + # Cost guardrail (best-effort — fail-open if dry-run can't parse the SQL). + if max_bytes is not None and max_bytes > 0: + try: + from app.api.v2_scan import _bq_dry_run_bytes # reuse main's impl + estimated = _bq_dry_run_bytes(bq, sql) + except Exception as e: + logger.warning( + "BQ dry-run failed for materialize cost guardrail (fail-open): %s. " + "If the SQL uses DuckDB three-part names like bq.\"ds\".\"t\", " + "rewrite to native BQ identifiers (`project.ds.t`) for the " + "guardrail to engage. Proceeding with COPY.", + e, + ) + estimated = 0 + if estimated > max_bytes: + raise MaterializeBudgetError( + f"dry-run estimate {estimated:,} bytes exceeds cap " + f"{max_bytes:,} for table {table_id!r}", + table_id=table_id, + current=estimated, + limit=max_bytes, + ) + + # COPY through a BqAccess-managed session. + with bq.duckdb_session() as conn: + # ATTACH the data project — but only when no `bq` catalog is + # already attached. Production sessions (real BqAccess) come with + # only `:memory:` and need the ATTACH; test sessions pre-populate + # `bq` as a fixture catalog and would error on a redundant ATTACH + # (alias already in use) AND on the bigquery extension load when + # the test runner has no cached extension. Detecting via + # `duckdb_databases()` keeps the ATTACH path idempotent without + # swallowing real errors (auth, cross-project permission, + # malformed project_id) — those still propagate from the actual + # ATTACH call. + attached = { + r[0] for r in conn.execute( + "SELECT database_name FROM duckdb_databases()" + ).fetchall() + } + if "bq" not in attached: + conn.execute( + f"ATTACH 'project={bq.projects.data}' AS bq (TYPE bigquery, READ_ONLY)" + ) + + try: + safe_path = str(tmp_path).replace("'", "''") + conn.execute(f"COPY ({sql}) TO '{safe_path}' (FORMAT PARQUET)") + rows = conn.execute( + f"SELECT count(*) FROM read_parquet('{safe_path}')" + ).fetchone()[0] + except Exception: + if tmp_path.exists(): + tmp_path.unlink() + raise + + # Compute the parquet hash inline before the atomic swap. The caller used + # to re-read the file in `_run_materialized_pass` to hash it via + # `_file_hash`, but that's a synchronous full-read on the FastAPI worker + # thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other + # requests. Hashing here keeps the open-file handle hot from the COPY + # round and removes the second read. Devil's-advocate review item. + import hashlib + h = hashlib.md5() + with open(tmp_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + h.update(chunk) + parquet_hash = h.hexdigest() + + size_bytes = tmp_path.stat().st_size + os.replace(tmp_path, parquet_path) + + rows = int(rows) + if rows == 0: + # 0 rows is indistinguishable from "the SQL is wrong and nobody + # noticed" — surface it loudly so operators see it in the scheduler + # log line and the per-row error aggregation. Caller decides whether + # to alert. + logger.warning( + "Materialized %s produced 0 rows — verify the SQL filter is " + "intentional. Parquet written: %s", + table_id, parquet_path, + ) + + return { + "rows": rows, + "size_bytes": size_bytes, + "query_mode": "materialized", + "hash": parquet_hash, + } + + def _resolve_bq_project_id() -> str: """Resolve ``data_source.bigquery.project`` honoring the overlay. diff --git a/connectors/keboola/access.py b/connectors/keboola/access.py new file mode 100644 index 0000000..5950077 --- /dev/null +++ b/connectors/keboola/access.py @@ -0,0 +1,43 @@ +"""DuckDB session facade for the Keboola Storage API extension. + +Parallel of `connectors/bigquery/access.py:BqAccess`. The materialized +Keboola SQL path needs a one-shot DuckDB connection with the Keboola +extension installed, loaded, and ATTACHed; this facade encapsulates +that wiring so `_run_materialized_pass` doesn't need to know the +extension name, the ATTACH alias, or how the token gets quoted into +the URL literal. +""" +from __future__ import annotations +from contextlib import contextmanager +from typing import Iterator + +import duckdb + + +class KeboolaAccess: + """Lazy DuckDB session manager for the Keboola Storage API extension. + + Single-use — call `.duckdb_session()` as a context manager once per + materialized job. + """ + + def __init__(self, *, url: str, token: str) -> None: + if not url or not token: + raise ValueError("KeboolaAccess requires url and token") + self._url = url + self._token = token + + @contextmanager + def duckdb_session(self) -> Iterator[duckdb.DuckDBPyConnection]: + conn = duckdb.connect(":memory:") + try: + conn.execute("INSTALL keboola FROM community") + conn.execute("LOAD keboola") + escaped_token = self._token.replace("'", "''") + conn.execute( + f"ATTACH '{self._url}' AS kbc " + f"(TYPE keboola, TOKEN '{escaped_token}')" + ) + yield conn + finally: + conn.close() diff --git a/connectors/keboola/extractor.py b/connectors/keboola/extractor.py index bb4ebe9..828542b 100644 --- a/connectors/keboola/extractor.py +++ b/connectors/keboola/extractor.py @@ -17,6 +17,66 @@ from src.identifier_validation import ( logger = logging.getLogger(__name__) +def materialize_query( + table_id: str, + sql: str, + *, + keboola_access, # KeboolaAccess (avoid circular import) + output_dir: Path, +) -> dict: + """Materialize an admin-registered SELECT against the Keboola Storage + API extension into a parquet file. + + Parallel of `connectors/bigquery/extractor.py:materialize_query`. + Cost guardrail: the Keboola extension has no analog of BQ dry-run; + Storage API cost is download-shaped (per-byte egress + Storage API + job). Phase B ships without a guardrail and logs the byte count; + a future PR can add a configurable `max_bytes_per_keboola_materialize` + gate similar to BQ's `max_bytes_per_materialize`. + """ + import re + import hashlib + + # Defense: table_id is interpolated into the parquet filename. + # Reject anything that's not a safe identifier. + if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table_id): + raise ValueError(f"unsafe table_id for materialize: {table_id!r}") + + parquet_path = Path(output_dir) / f"{table_id}.parquet" + safe_pq_lit = str(parquet_path).replace("'", "''") + + with keboola_access.duckdb_session() as conn: + # Run the admin SELECT and copy the result to parquet. + # The COPY wrapper is identical to the existing legacy extract + # path at extractor.py:209; the only difference is the SELECT is + # admin-supplied rather than `SELECT * FROM kbc.bucket.table`. + conn.execute(f"COPY ({sql}) TO '{safe_pq_lit}' (FORMAT PARQUET)") + + # Read back row count. + row_count = conn.execute( + f"SELECT COUNT(*) FROM read_parquet('{safe_pq_lit}')" + ).fetchone()[0] + + file_bytes = parquet_path.read_bytes() + md5 = hashlib.md5(file_bytes).hexdigest() + size = len(file_bytes) + + if row_count == 0: + logger.warning( + "Materialized Keboola query for %s wrote 0 rows — verify the " + "SQL filters and that the source bucket has data.", + table_id, + ) + + return { + "table_id": table_id, + "path": str(parquet_path), + "rows": row_count, + "bytes": size, + "md5": md5, + } + + def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None: """Create the _meta table required by the extract.duckdb contract.""" conn.execute("DROP TABLE IF EXISTS _meta") @@ -99,6 +159,21 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str, table_name = tc["name"] query_mode = tc.get("query_mode", "local") + # Materialized rows are written by the sync trigger pass via + # `materialize_query()` — they live as parquets in + # /data/extracts/keboola/data/, picked up by the orchestrator's + # standard local-parquet discovery. Don't extract here (would + # double-write data via the source bucket reference and confuse + # sync_state bookkeeping). Mirror of the BQ extractor's skip at + # connectors/bigquery/extractor.py:190. + if query_mode == "materialized": + logger.info( + "Skipping legacy extract for %s — query_mode='materialized', " + "handled by _run_materialized_pass instead", + tc.get("id") or tc.get("name"), + ) + continue + # #81 Group D — refuse rows whose identifiers don't pass the # whitelist. The registry is admin-controlled but anyone with # write access can otherwise inject SQL via the CREATE VIEW / diff --git a/scripts/smoke-test-materialized-bq.sh b/scripts/smoke-test-materialized-bq.sh new file mode 100755 index 0000000..7bcbca5 --- /dev/null +++ b/scripts/smoke-test-materialized-bq.sh @@ -0,0 +1,227 @@ +#!/usr/bin/env bash +# Smoke test — query_mode='materialized' for BigQuery. +# +# Runs the full happy-path + 3 adversarial scenarios against a live Agnes +# instance that has BigQuery configured. Cheap (uses bigquery-public-data +# samples; ~36 rows, < 1 KB scan) — safe to run against staging. +# +# Usage: +# ./scripts/smoke-test-materialized-bq.sh [host:port] +# +# Required environment: +# AGNES_PAT — admin PAT for the target instance +# BQ_TEST_BIG — (optional) name of a BQ table > 10 GiB to test the +# cost guardrail. Defaults to a public dataset that +# scans ~50 GB on full SELECT. +# +# Defaults: AGNES_HOST=http://localhost:8000. +# +# Cleans up the test rows on exit (trap), even on SIGINT. + +set -euo pipefail + +HOST="${1:-${AGNES_HOST:-http://localhost:8000}}" +PAT="${AGNES_PAT:?AGNES_PAT must be set (admin token)}" +BIG_TABLE="${BQ_TEST_BIG:-bigquery-public-data.github_repos.commits}" + +PASS=0 +FAIL=0 + +# Test rows we'll create — captured for cleanup. +CREATED_IDS=() + +cleanup() { + echo + echo "--- Cleanup ---" + for tid in "${CREATED_IDS[@]}"; do + curl -sS -X DELETE "$HOST/api/admin/registry/$tid" \ + -H "Authorization: Bearer $PAT" -o /dev/null -w " DELETE %{http_code} $tid\n" || true + done +} +trap cleanup EXIT INT TERM + +check() { + local name="$1" ok="$2" + if [ "$ok" = "true" ]; then + echo " PASS $name" + PASS=$((PASS + 1)) + else + echo " FAIL $name" + FAIL=$((FAIL + 1)) + fi +} + +http() { + # POST/PUT/DELETE helper that returns the HTTP status + body separately. + local method="$1" path="$2" body="${3:-}" + if [ -n "$body" ]; then + curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \ + -X "$method" "$HOST$path" \ + -H "Authorization: Bearer $PAT" \ + -H "Content-Type: application/json" \ + -d "$body" + else + curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \ + -X "$method" "$HOST$path" \ + -H "Authorization: Bearer $PAT" + fi +} + +echo "Materialized BQ smoke: $HOST" +echo "Big table for cost-guardrail test: $BIG_TABLE" +echo "---" + +# --------------------------------------------------------------------------- +# Scenario A — Happy path: register tiny materialized table, trigger, +# verify parquet on disk + manifest carries hash. +# --------------------------------------------------------------------------- +echo +echo "[A] Happy path (Shakespeare sample, ~36 rows)" +SQL_A='SELECT corpus, COUNT(*) AS c FROM `bigquery-public-data.samples.shakespeare` GROUP BY 1 ORDER BY 1' +TID_A="smoke_mat_shakespeare_$(date +%s)" +CREATED_IDS+=("$TID_A") + +STATUS=$(http POST /api/admin/register-table "{ + \"name\": \"$TID_A\", + \"source_type\": \"bigquery\", + \"query_mode\": \"materialized\", + \"source_query\": $(printf '%s' "$SQL_A" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"), + \"sync_schedule\": \"every 1m\" +}") +[ "$STATUS" = "201" ] && check "register 201" true || { check "register 201 (got $STATUS)" false; cat /tmp/smoke-mat-body; } + +echo " triggering sync..." +http POST /api/sync/trigger '{}' >/dev/null +sleep 5 # background task + +# Manifest must list the row with query_mode + non-empty hash. +http GET /api/sync/manifest >/dev/null +HASH=$(python3 -c " +import json +m = json.load(open('/tmp/smoke-mat-body')) +t = m.get('tables', {}).get('$TID_A') +print(t.get('hash', '') if t else '') +") +[ -n "$HASH" ] && [ "$HASH" != "null" ] && check "manifest hash present" true || check "manifest hash present (got '$HASH')" false + +# Parquet on disk (assumes co-located filesystem, e.g. local docker compose). +PARQUET="${DATA_DIR:-./data}/extracts/bigquery/data/$TID_A.parquet" +if [ -f "$PARQUET" ]; then + ROWS=$(python3 -c "import duckdb; print(duckdb.connect().execute(\"SELECT count(*) FROM read_parquet('$PARQUET')\").fetchone()[0])" 2>/dev/null || echo "0") + [ "$ROWS" -gt 0 ] && check "parquet has $ROWS rows" true || check "parquet rows ($ROWS)" false +else + echo " note: parquet at $PARQUET not visible from this host (skip if Agnes is remote)" +fi + +# --------------------------------------------------------------------------- +# Scenario B — Cost guardrail: register a large-scan materialized SQL, +# trigger, expect MaterializeBudgetError logged + row skipped. +# --------------------------------------------------------------------------- +echo +echo "[B] Cost guardrail (\`$BIG_TABLE\` full SELECT)" +TID_B="smoke_mat_huge_$(date +%s)" +CREATED_IDS+=("$TID_B") +SQL_B="SELECT * FROM \`$BIG_TABLE\`" + +STATUS=$(http POST /api/admin/register-table "{ + \"name\": \"$TID_B\", + \"source_type\": \"bigquery\", + \"query_mode\": \"materialized\", + \"source_query\": $(printf '%s' "$SQL_B" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"), + \"sync_schedule\": \"every 1m\" +}") +[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false + +echo " triggering sync (expect cap to fire)..." +http POST /api/sync/trigger '{}' >/dev/null +sleep 5 + +# Manifest should NOT have a hash for the huge row (materialize was skipped). +http GET /api/sync/manifest >/dev/null +HUGE_HASH=$(python3 -c " +import json +m = json.load(open('/tmp/smoke-mat-body')) +t = m.get('tables', {}).get('$TID_B') +print(t.get('hash', '') if t else 'absent') +") +if [ "$HUGE_HASH" = "absent" ] || [ -z "$HUGE_HASH" ] || [ "$HUGE_HASH" = "null" ]; then + check "huge row skipped (no hash in manifest)" true +else + check "huge row skipped (got hash '$HUGE_HASH' — guardrail did not fire)" false +fi +echo " grep server logs for: 'MaterializeBudgetError' or 'Materialize cap exceeded'" + +# --------------------------------------------------------------------------- +# Scenario C — 0-row warning: SQL with always-false WHERE. +# --------------------------------------------------------------------------- +echo +echo "[C] 0-row WARNING (filter to empty result)" +TID_C="smoke_mat_empty_$(date +%s)" +CREATED_IDS+=("$TID_C") +SQL_C='SELECT corpus FROM `bigquery-public-data.samples.shakespeare` WHERE 1=0' + +STATUS=$(http POST /api/admin/register-table "{ + \"name\": \"$TID_C\", + \"source_type\": \"bigquery\", + \"query_mode\": \"materialized\", + \"source_query\": $(printf '%s' "$SQL_C" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"), + \"sync_schedule\": \"every 1m\" +}") +[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false + +http POST /api/sync/trigger '{}' >/dev/null +sleep 5 + +http GET /api/sync/manifest >/dev/null +EMPTY_ROWS=$(python3 -c " +import json +m = json.load(open('/tmp/smoke-mat-body')) +t = m.get('tables', {}).get('$TID_C') +print(t.get('rows', 'absent') if t else 'absent') +") +[ "$EMPTY_ROWS" = "0" ] && check "empty-result rows=0 in manifest" true || check "empty-result rows ($EMPTY_ROWS)" false +echo " grep server logs for: 'produced 0 rows'" + +# --------------------------------------------------------------------------- +# Scenario D — Mode-switch transition clears stale source_query. +# --------------------------------------------------------------------------- +echo +echo "[D] Mode-switch materialized → remote clears source_query" +TID_D="smoke_mat_switch_$(date +%s)" +CREATED_IDS+=("$TID_D") + +STATUS=$(http POST /api/admin/register-table "{ + \"name\": \"$TID_D\", + \"source_type\": \"bigquery\", + \"query_mode\": \"materialized\", + \"source_query\": \"SELECT 1\" +}") +[ "$STATUS" = "201" ] && check "register materialized" true || check "register materialized (got $STATUS)" false + +# Switch to remote, providing required bucket+source_table. +STATUS=$(http PUT "/api/admin/registry/$TID_D" "{ + \"query_mode\": \"remote\", + \"bucket\": \"samples\", + \"source_table\": \"shakespeare\" +}") +[ "$STATUS" = "200" ] && check "switch to remote 200" true || check "switch to remote (got $STATUS)" false + +http GET /api/admin/registry >/dev/null +SWITCHED_SQ=$(python3 -c " +import json +r = json.load(open('/tmp/smoke-mat-body')) +row = next((t for t in r.get('tables', []) if t['id'] == '$TID_D'), None) +print(row.get('source_query') if row else 'NOT_FOUND') +") +[ "$SWITCHED_SQ" = "None" ] || [ -z "$SWITCHED_SQ" ] || [ "$SWITCHED_SQ" = "null" ] \ + && check "source_query cleared on switch" true \ + || check "source_query cleared (got '$SWITCHED_SQ')" false + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +echo +echo "---" +echo "Passed: $PASS" +echo "Failed: $FAIL" +[ "$FAIL" -eq 0 ] || exit 1 diff --git a/src/db.py b/src/db.py index 728e0f1..854703c 100644 --- a/src/db.py +++ b/src/db.py @@ -39,7 +39,7 @@ def _maybe_instrument(con, db_tag: str): _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") -SCHEMA_VERSION = 19 +SCHEMA_VERSION = 20 _SYSTEM_SCHEMA = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -259,6 +259,7 @@ CREATE TABLE IF NOT EXISTS table_registry ( source_type VARCHAR, bucket VARCHAR, source_table VARCHAR, + source_query TEXT, sync_strategy VARCHAR DEFAULT 'full_refresh', query_mode VARCHAR DEFAULT 'local', sync_schedule VARCHAR, @@ -918,6 +919,16 @@ _V16_TO_V17_MIGRATIONS = [ # helper rather than a flat SQL list (the migrate-ladder calls it directly). +# v19 -> v20: source_query column backs query_mode='materialized' for BigQuery. +# Admin-registered SQL stored verbatim; scheduler runs it through the DuckDB BQ +# extension (via BqAccess) and writes the result to +# /data/extracts/bigquery/data/.parquet so the existing manifest + da sync +# flow distributes it to analysts. NULL on existing rows. +_V19_TO_V20_MIGRATIONS = [ + "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_query TEXT", +] + + # Core role seed data — single source of truth. Used by both _seed_core_roles # (idempotent insert) and the v8→v9 backfill. Order matters: lowest privilege # first so implies references resolve cleanly when expand_implies does BFS. @@ -1735,6 +1746,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: _v17_to_v18_finalize(conn) if current < 19: _v18_to_v19_finalize(conn) + if current < 20: + for sql in _V19_TO_V20_MIGRATIONS: + conn.execute(sql) conn.execute( "UPDATE schema_version SET version = ?, applied_at = current_timestamp", [SCHEMA_VERSION], diff --git a/src/repositories/table_registry.py b/src/repositories/table_registry.py index baf7e1c..ef2134c 100644 --- a/src/repositories/table_registry.py +++ b/src/repositories/table_registry.py @@ -72,7 +72,9 @@ class TableRegistryRepository: primary_key: Union[None, str, List[str]] = None, description: Optional[str] = None, registered_by: Optional[str] = None, source_type: Optional[str] = None, bucket: Optional[str] = None, - source_table: Optional[str] = None, query_mode: str = "local", + source_table: Optional[str] = None, + source_query: Optional[str] = None, + query_mode: str = "local", sync_schedule: Optional[str] = None, profile_after_sync: bool = True, registered_at: Optional[datetime] = None, ) -> None: @@ -85,18 +87,21 @@ class TableRegistryRepository: self.conn.execute( """INSERT INTO table_registry (id, name, folder, sync_strategy, primary_key, description, registered_by, registered_at, - source_type, bucket, source_table, query_mode, + source_type, bucket, source_table, source_query, query_mode, sync_schedule, profile_after_sync) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT (id) DO UPDATE SET name = excluded.name, folder = excluded.folder, sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key, description = excluded.description, registered_at = excluded.registered_at, source_type = excluded.source_type, bucket = excluded.bucket, - source_table = excluded.source_table, query_mode = excluded.query_mode, - sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync""", + source_table = excluded.source_table, source_query = excluded.source_query, + query_mode = excluded.query_mode, + sync_schedule = excluded.sync_schedule, + profile_after_sync = excluded.profile_after_sync""", [id, name, folder, sync_strategy, encoded_pk, description, registered_by, ts, - source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync], + source_type, bucket, source_table, source_query, query_mode, + sync_schedule, profile_after_sync], ) @staticmethod diff --git a/tests/test_admin_bq_register.py b/tests/test_admin_bq_register.py index 267e920..87528c7 100644 --- a/tests/test_admin_bq_register.py +++ b/tests/test_admin_bq_register.py @@ -244,7 +244,12 @@ class TestBigQueryRegisterCoercion: resp = c.get("/api/admin/registry", headers=_auth(token)) row = next(t for t in resp.json()["tables"] if t["name"] == "orders") assert row["query_mode"] == "remote" - assert row["profile_after_sync"] is False + # Phase C: profile_after_sync is now inert. The field is accepted in + # the request for back-compat but no longer overrides the DB default. + # Was: assert row["profile_after_sync"] is False (when BQ register + # forced it to False as a "signal"). Now the row carries the schema + # default (True). Profiler runs unconditionally regardless. + assert row.get("profile_after_sync") in (True, None) class TestBigQueryRegisterCollision: @@ -698,20 +703,25 @@ class TestAdminTablesUI: c.cookies.clear() assert resp.status_code == 200, resp.text body = resp.text - # Modal carries the source type so the JS can branch. + # Modal carries the source type so legacy openRegisterModal({}) still + # routes through the JS dispatcher. assert 'data-source-type="bigquery"' in body # BQ-only inputs. assert 'id="bqDataset"' in body assert 'id="bqSourceTable"' in body assert 'id="bqViewName"' in body assert 'id="bqSyncSchedule"' in body - # Inline hint about scheduler-not-yet-wired (Decision 3). - assert "scheduler" in body.lower() - # BQ-specific panel (no discovery for BQ in M1). - assert 'data-test="bq-register-panel"' in body - # Keboola-only inputs must NOT be present. - assert 'id="regTableId"' not in body - assert 'id="regBucket"' not in body + # Cron-style schedule examples are surfaced near the field + # (operator-facing copy explains the syntax). + assert "every 6h" in body or "daily 03:00" in body + # BQ tab content section (legacy out-of-tab BQ register panel was + # removed when the user-visible cleanup landed — each tab now owns + # its own header + Register button). + assert 'id="tab-content-bigquery"' in body + assert 'id="bqRegisterBtn"' in body + # Phase E: BQ + Keboola modals are now both always rendered (each + # inside its own tab). On a BQ instance the BQ tab is the visible + # one; the Keboola modal is just hidden in a non-active tab. def test_renders_keboola_fields_when_data_source_keboola(self, seeded_app, monkeypatch): from app.instance_config import reset_cache @@ -731,12 +741,22 @@ class TestAdminTablesUI: assert resp.status_code == 200 body = resp.text assert 'data-source-type="keboola"' in body - # Keboola path — discovery panel + Keboola inputs. - assert 'id="discoveryResults"' in body - assert 'id="regBucket"' in body - assert 'id="regTableName"' in body - # BQ-only inputs MUST NOT be present. - assert 'id="bqDataset"' not in body + # Keboola tab content section + Register-Keboola button (legacy + # global Discovery panel was removed when the user-visible + # cleanup landed — Keboola discovery is per-modal now). + assert 'id="tab-content-keboola"' in body + assert 'id="kbRegisterBtn"' in body + # C3: legacy #registerModal is gone; the Phase F Keboola modal + # at #registerKeboolaModal owns the Keboola register flow now. + assert 'id="registerModal"' not in body + assert 'id="regBucket"' not in body + assert 'id="regTableName"' not in body + # The Phase F Keboola modal's inputs are present. + assert 'id="kbBucket"' in body + assert 'id="kbViewName"' in body + # Phase E: BQ form now always rendered (inside #tab-content-bigquery) + # — operator can switch tabs to register a BQ table on a Keboola + # instance. Tab is hidden by default but the form is in the DOM. finally: reset_cache() @@ -1546,11 +1566,17 @@ class TestKeboolaModalUsesDiscoveredTableId: """Review IMPORTANT 5: the JS that builds the Keboola register payload must derive `source_table` from the discovered table's storage ID (`t.id` minus the bucket prefix), NOT the human-friendly display name - (`t.name`). We verify by static template inspection — this is enough - to catch a regression that drops the hidden field or reverts the JS - to reading `regTableName`.""" + (`t.name`). We verify by static template inspection. - def test_template_has_hidden_source_table_field(self, seeded_app, monkeypatch): + C3: the legacy #registerModal that owned regTableName / regSourceTable + was removed. The Phase F #registerKeboolaModal uses kbBucket / + kbSourceTable and a different payload builder (`_buildKeboolaPayload`) + that already keeps storage identifier separate from display name. The + tests below were rewritten to gate the Phase F flow.""" + + def test_phase_f_modal_separates_storage_id_from_display_name( + self, seeded_app, monkeypatch, + ): from app.instance_config import reset_cache monkeypatch.setattr( "app.instance_config.load_instance_config", @@ -1567,22 +1593,25 @@ class TestKeboolaModalUsesDiscoveredTableId: c.cookies.clear() assert resp.status_code == 200, resp.text body = resp.text - # Hidden field must exist so the JS can stash the bare - # storage identifier separately from the display name. - assert 'id="regSourceTable"' in body - # And the build function must read from that hidden field - # (NOT from regTableName, which is the display name). - assert "getElementById('regSourceTable').value" in body + # Phase F modal owns the Keboola Register flow now. + assert 'id="registerKeboolaModal"' in body + # The source-table input is NOT the same field as the + # human-friendly view name input. + assert 'id="kbSourceTable"' in body + assert 'id="kbViewName"' in body + # The payload builder reads kbSourceTable for the storage + # identifier (used in SELECT * FROM kbc."b"."t"). + assert "getElementById('kbSourceTable').value" in body finally: reset_cache() - def test_template_does_not_send_display_name_as_source_table( + def test_legacy_regtablename_payload_path_is_gone( self, seeded_app, monkeypatch, ): - """Regression check: pre-fix the payload had - `source_table: document.getElementById('regTableName').value`. - After the fix, that exact line must be gone (the build function - reads from the hidden `regSourceTable` first).""" + """Regression check: the pre-fix payload line + `source_table: document.getElementById('regTableName').value` + must remain absent. C3 removed the entire legacy modal so no + regTableName-based payload can be reintroduced.""" from app.instance_config import reset_cache monkeypatch.setattr( "app.instance_config.load_instance_config", @@ -1598,11 +1627,14 @@ class TestKeboolaModalUsesDiscoveredTableId: finally: c.cookies.clear() body = resp.text - # No occurrence of the buggy direct assignment. assert ( "source_table: document.getElementById('regTableName').value" not in body ) + # C3: the regTableName / regSourceTable inputs themselves are + # gone with the legacy modal. + assert 'id="regTableName"' not in body + assert 'id="regSourceTable"' not in body finally: reset_cache() diff --git a/tests/test_admin_discover_bigquery.py b/tests/test_admin_discover_bigquery.py new file mode 100644 index 0000000..1bc82cc --- /dev/null +++ b/tests/test_admin_discover_bigquery.py @@ -0,0 +1,205 @@ +"""GET /api/admin/discover-tables — BigQuery branch. + +Two-step shape: dataset list (no `dataset` query param) → table list (with +`dataset=name`). The UI populates the dataset autocomplete first, then +fetches tables only after the operator picks a dataset, avoiding the +per-dataset `list_tables()` cost on projects with hundreds of datasets. +""" +import pytest +from unittest.mock import MagicMock, patch + +from connectors.bigquery.access import BqAccess, BqProjects + + +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + +@pytest.fixture +def bq_instance(monkeypatch): + """Force `data_source.type='bigquery'` so the endpoint routes to the + BQ branch.""" + fake_cfg = { + "data_source": { + "type": "bigquery", + "bigquery": {"project": "my-test-project", "location": "us"}, + }, + } + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: fake_cfg, + raising=False, + ) + from app.instance_config import reset_cache + reset_cache() + yield fake_cfg + reset_cache() + + +def _stub_bq_with_client(client_mock): + """Build a BqAccess wired to return `client_mock` from .client(). The + duckdb_session_factory is unused by the discover endpoint — supply a + no-op.""" + from contextlib import contextmanager + @contextmanager + def _noop(_p): + yield None + return BqAccess( + BqProjects(billing="my-test-project", data="my-test-project"), + client_factory=lambda _p: client_mock, + duckdb_session_factory=_noop, + ) + + +def test_discover_returns_dataset_list(seeded_app, bq_instance, monkeypatch): + """Without `dataset` param: list datasets in the configured project.""" + client = MagicMock() + ds_a = MagicMock() + ds_a.dataset_id = "analytics" + ds_a.project = "my-test-project" + ds_b = MagicMock() + ds_b.dataset_id = "raw" + ds_b.project = "my-test-project" + client.list_datasets.return_value = [ds_a, ds_b] + + monkeypatch.setattr( + "connectors.bigquery.access.get_bq_access", + lambda: _stub_bq_with_client(client), + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.get("/api/admin/discover-tables", headers=_auth(token)) + assert r.status_code == 200, r.json() + body = r.json() + assert body["source"] == "bigquery" + assert body["count"] == 2 + # Sorted alphabetically by dataset_id. + assert [d["dataset_id"] for d in body["datasets"]] == ["analytics", "raw"] + assert body["datasets"][0]["full_id"] == "my-test-project.analytics" + + +def test_discover_returns_table_list_for_dataset(seeded_app, bq_instance, monkeypatch): + """With `?dataset=analytics`: list tables + views in that dataset.""" + client = MagicMock() + t_orders = MagicMock() + t_orders.table_id = "orders" + t_orders.table_type = "TABLE" + t_orders.project = "my-test-project" + t_orders.dataset_id = "analytics" + t_view = MagicMock() + t_view.table_id = "orders_active" + t_view.table_type = "VIEW" + t_view.project = "my-test-project" + t_view.dataset_id = "analytics" + client.list_tables.return_value = [t_view, t_orders] # unsorted + + monkeypatch.setattr( + "connectors.bigquery.access.get_bq_access", + lambda: _stub_bq_with_client(client), + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.get( + "/api/admin/discover-tables?dataset=analytics", + headers=_auth(token), + ) + assert r.status_code == 200, r.json() + body = r.json() + assert body["source"] == "bigquery" + assert body["dataset"] == "analytics" + assert body["count"] == 2 + # Sorted by table_id. + assert [t["table_id"] for t in body["tables"]] == ["orders", "orders_active"] + by_id = {t["table_id"]: t for t in body["tables"]} + assert by_id["orders"]["table_type"] == "TABLE" + assert by_id["orders_active"]["table_type"] == "VIEW" + # Verify dataset filter was passed through. + client.list_tables.assert_called_once_with("analytics") + + +def test_discover_keboola_branch_unchanged(seeded_app, monkeypatch): + """Negative — when source_type is keboola, BQ logic isn't reached. + + Skipped when the Keboola SDK (`kbcstorage`) is not installed: CI + runners don't ship it because the dev container only needs it for + instances that actually configure source_type=keboola, and the + route's lazy import would fail before the test stub gets a chance + to fire. The branch-unchanged contract is tested separately by the + Keboola integration suite when the package is present. + """ + pytest.importorskip("kbcstorage") + fake_cfg = {"data_source": {"type": "keboola", "keboola": {}}} + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: fake_cfg, + raising=False, + ) + from app.instance_config import reset_cache + reset_cache() + + # Stub the Keboola client so the test doesn't reach the network. + fake_client = MagicMock() + fake_client.discover_all_tables.return_value = [{"id": "in.c-foo.bar"}] + monkeypatch.setattr( + "connectors.keboola.client.KeboolaClient", + lambda *a, **kw: fake_client, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + try: + r = c.get("/api/admin/discover-tables", headers=_auth(token)) + assert r.status_code == 200, r.json() + body = r.json() + assert body["source"] == "keboola" + assert body["count"] == 1 + finally: + reset_cache() + + +def test_discover_bq_not_configured_returns_500(seeded_app, monkeypatch): + """When data_source.bigquery.project is missing, BqAccess returns its + not_configured sentinel — endpoint surfaces the structured error.""" + fake_cfg = { + "data_source": { + "type": "bigquery", + "bigquery": {}, # no project + }, + } + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: fake_cfg, + raising=False, + ) + from app.instance_config import reset_cache + reset_cache() + + c = seeded_app["client"] + token = seeded_app["admin_token"] + try: + r = c.get("/api/admin/discover-tables", headers=_auth(token)) + # not_configured is mapped to 500 in BqAccessError.HTTP_STATUS. + assert r.status_code == 500, r.json() + detail = r.json().get("detail", {}) + assert detail.get("kind") == "not_configured" + finally: + reset_cache() + + +def test_admin_tables_html_wires_discover_buttons(seeded_app, bq_instance): + """Structural — the BQ register modal in the rendered HTML now has the + Discover (datasets) and List tables buttons + datalists wired to the + endpoint.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.get("/admin/tables", headers=_auth(token)) + assert r.status_code == 200, r.text + html = r.text + assert "discoverBqDatasets" in html + assert "discoverBqTables" in html + assert 'id="bqDatasetList"' in html + assert 'id="bqTableList"' in html + assert "list=\"bqDatasetList\"" in html + assert "list=\"bqTableList\"" in html diff --git a/tests/test_admin_keboola_materialized.py b/tests/test_admin_keboola_materialized.py new file mode 100644 index 0000000..1a08776 --- /dev/null +++ b/tests/test_admin_keboola_materialized.py @@ -0,0 +1,95 @@ +"""Tests for Keboola materialized registration.""" +import pytest + + +def test_register_keboola_materialized_accepts_source_query(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "orders_recent", + "source_type": "keboola", + "query_mode": "materialized", + "source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'", + "sync_schedule": "daily 03:00", + }, + ) + assert r.status_code == 201, r.text + + +def test_register_keboola_materialized_rejects_missing_source_query(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "orders_recent", + "source_type": "keboola", + "query_mode": "materialized", + # source_query missing + }, + ) + assert r.status_code == 422 + assert "source_query" in r.text + + +def test_register_keboola_materialized_skips_bucket_check(seeded_app): + """Materialized rows don't need bucket/source_table — the SELECT inlines + the references. Mirror of BQ materialized validator behavior.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "x", + "source_type": "keboola", + "query_mode": "materialized", + "source_query": "SELECT 1", + # No bucket / source_table — must still succeed. + }, + ) + assert r.status_code == 201, r.text + + +def test_update_keboola_materialized_clears_stale_source_query_on_mode_switch(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + + # Register materialized. + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "x", + "source_type": "keboola", + "query_mode": "materialized", + "source_query": "SELECT 1", + }, + ) + assert r.status_code == 201 + + # PUT to switch back to local — source_query must clear. + r = c.put( + "/api/admin/registry/x", + headers=auth, + json={ + "source_type": "keboola", + "query_mode": "local", + "bucket": "in.c-foo", + "source_table": "y", + }, + ) + assert r.status_code == 200 + + r = c.get("/api/admin/registry", headers=auth) + rows = r.json()["tables"] + row = next(t for t in rows if t["id"] == "x") + assert row.get("source_query") in (None, "") diff --git a/tests/test_admin_phase_c_deprecation.py b/tests/test_admin_phase_c_deprecation.py new file mode 100644 index 0000000..d494b43 --- /dev/null +++ b/tests/test_admin_phase_c_deprecation.py @@ -0,0 +1,70 @@ +"""Verify Phase C deprecation marks + profile_after_sync becomes inert.""" +import pytest +from app.api.admin import RegisterTableRequest, UpdateTableRequest + + +def test_register_request_marks_sync_strategy_deprecated(): + schema = RegisterTableRequest.model_json_schema() + field = schema["properties"]["sync_strategy"] + assert field.get("deprecated") is True + + +def test_register_request_marks_profile_after_sync_deprecated(): + schema = RegisterTableRequest.model_json_schema() + field = schema["properties"]["profile_after_sync"] + assert field.get("deprecated") is True + + +def test_register_endpoint_accepts_profile_after_sync_for_backcompat(seeded_app): + """External clients sending profile_after_sync get no error — the + field is silently ignored.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "x", + "source_type": "keboola", + "bucket": "in.c-foo", + "source_table": "y", + "query_mode": "local", + "profile_after_sync": True, # legacy client may send this + }, + ) + assert r.status_code == 201 + + +def test_register_endpoint_does_not_persist_profile_after_sync(seeded_app): + """The persisted row no longer carries the old profile_after_sync + value (column may still exist in DB for back-compat, but admin path + never writes a non-default value).""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + r = c.post( + "/api/admin/register-table", + headers=auth, + json={ + "name": "y", + "source_type": "keboola", + "bucket": "in.c-foo", + "source_table": "y", + "query_mode": "local", + "profile_after_sync": True, + }, + ) + assert r.status_code == 201 + r = c.get("/api/admin/registry", headers=auth) + rows = r.json()["tables"] + row = next(t for t in rows if t["id"] == "y") + # The field's value in the registry response is now whatever the DB + # default is (True per current schema). Critical: the request value + # is NOT echoed back. + # If the value is in the response at all (legacy back-compat in the + # GET serializer), it's the schema default, not the request value. + # If the value is absent (deprecated and stripped), that's also fine. + if "profile_after_sync" in row: + # Whatever this is, it's the schema default, not request-driven. + assert row["profile_after_sync"] is True or row["profile_after_sync"] is None diff --git a/tests/test_admin_put_preservation.py b/tests/test_admin_put_preservation.py new file mode 100644 index 0000000..95490d9 --- /dev/null +++ b/tests/test_admin_put_preservation.py @@ -0,0 +1,67 @@ +"""Regression guard for PUT field preservation. + +Locks the Pydantic semantics that the Phase F form-cleanup relies on: +when the Edit modal omits a field from its payload, the existing value +must survive. If a future maintainer flips ``model_dump()`` to +``exclude_unset=True`` or otherwise changes the partial-update semantics, +these tests fire before partitioned rows or primary keys silently +regress. +""" +import pytest + + +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + +def test_put_preserves_omitted_sync_strategy(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = _auth(token) + + r = c.post("/api/admin/register-table", headers=auth, json={ + "name": "events_partitioned", + "source_type": "keboola", + "bucket": "in.c-events", + "source_table": "events", + "query_mode": "local", + "sync_strategy": "partitioned", + }) + assert r.status_code == 201, r.text + + r = c.put("/api/admin/registry/events_partitioned", headers=auth, json={ + "sync_schedule": "daily 03:00", + "description": "now daily", + }) + assert r.status_code == 200 + + r = c.get("/api/admin/registry", headers=auth) + rows = r.json()["tables"] + row = next(t for t in rows if t["id"] == "events_partitioned") + assert row["sync_strategy"] == "partitioned" + + +def test_put_preserves_omitted_primary_key(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = _auth(token) + + r = c.post("/api/admin/register-table", headers=auth, json={ + "name": "orders_with_pk", + "source_type": "keboola", + "bucket": "in.c-shop", + "source_table": "orders", + "query_mode": "local", + "primary_key": ["order_id", "tenant_id"], + }) + assert r.status_code == 201, r.text + + r = c.put("/api/admin/registry/orders_with_pk", headers=auth, json={ + "description": "shop orders", + }) + assert r.status_code == 200 + + r = c.get("/api/admin/registry", headers=auth) + rows = r.json()["tables"] + row = next(t for t in rows if t["id"] == "orders_with_pk") + assert row["primary_key"] == ["order_id", "tenant_id"] diff --git a/tests/test_api_admin_materialized.py b/tests/test_api_admin_materialized.py new file mode 100644 index 0000000..3783118 --- /dev/null +++ b/tests/test_api_admin_materialized.py @@ -0,0 +1,322 @@ +"""Admin API accepts source_query when query_mode='materialized', rejects +mismatches between mode and query field. + +Tests that hit the remote-mode register path require `stub_bq_extractor` +to bypass the post-register rebuild's real-BQ traffic. Materialized-only +tests skip the BG path (the 201 fast-path returns before any rebuild +fires) so they don't need the stub. + +Covers PR #145 (re-implementation against 0.24.0 base): +- RegisterTableRequest + UpdateTableRequest model_validators +- _validate_bigquery_register_payload materialized branch (skips bucket/ + source_table checks, requires source_query) +- register_table 201 response for materialized BQ rows (no synchronous + materialize — cron tick or manual /api/sync/trigger picks them up) +- update_table clears stale source_query when switching mode away from + materialized + +Shares the seeded_app + bq_instance fixtures from conftest / +test_admin_bq_register.py for parity with the existing BQ test surface. +""" +from unittest.mock import MagicMock + +import pytest + + +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + +@pytest.fixture +def stub_bq_extractor(monkeypatch): + """Mirror tests/test_admin_bq_register.py — bypasses real-BQ traffic + in the post-register rebuild path so the test stays offline. Required + whenever the test seeds a remote-mode BQ row via the HTTP API.""" + rebuild_mock = MagicMock(return_value={ + "project_id": "my-test-project", + "tables_registered": 1, "errors": [], "skipped": False, + }) + monkeypatch.setattr( + "connectors.bigquery.extractor.rebuild_from_registry", + rebuild_mock, + ) + monkeypatch.setattr( + "src.orchestrator.SyncOrchestrator", + lambda *a, **kw: MagicMock(), + ) + return rebuild_mock + + +@pytest.fixture +def bq_instance(monkeypatch): + """Force instance.yaml to look like a BigQuery deployment. + + Mirrors tests/test_admin_bq_register.py::bq_instance so the + project_id read inside _validate_bigquery_register_payload succeeds. + """ + fake_cfg = { + "data_source": { + "type": "bigquery", + "bigquery": {"project": "my-test-project", "location": "us"}, + }, + } + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: fake_cfg, + raising=False, + ) + from app.instance_config import reset_cache + reset_cache() + yield fake_cfg + reset_cache() + + +def _materialized_payload(**overrides): + p = { + "name": "orders_90d", + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": "SELECT date FROM `prj.ds.orders`", + "sync_schedule": "every 6h", + } + p.update(overrides) + return p + + +def test_register_materialized_requires_source_query(seeded_app, bq_instance): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json={ + "name": "missing_query", + "source_type": "bigquery", + "query_mode": "materialized", + # source_query missing + }, + headers=_auth(token), + ) + assert 400 <= r.status_code < 500, r.json() + detail = str(r.json().get("detail", "")).lower() + assert "source_query" in detail or "materialized" in detail + + +def test_register_materialized_accepts_source_query(seeded_app, bq_instance): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json=_materialized_payload(name="orders_90d_a"), + headers=_auth(token), + ) + assert r.status_code == 201, r.json() + body = r.json() + assert body["status"] == "registered" + assert "Materialized" in body.get("message", "") + + +def test_register_remote_rejects_source_query(seeded_app, bq_instance): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json={ + "name": "live_orders", + "source_type": "bigquery", + "bucket": "analytics", + "source_table": "orders", + "query_mode": "remote", + "source_query": "SELECT 1", + }, + headers=_auth(token), + ) + assert 400 <= r.status_code < 500, r.json() + + +def test_register_local_rejects_source_query(seeded_app): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json={ + "name": "kbc_orders", + "source_type": "keboola", + "query_mode": "local", + "source_query": "SELECT 1", + }, + headers=_auth(token), + ) + assert 400 <= r.status_code < 500, r.json() + + +def test_register_materialized_with_empty_source_query_rejected(seeded_app, bq_instance): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json=_materialized_payload(name="empty_q", source_query=""), + headers=_auth(token), + ) + assert 400 <= r.status_code < 500, r.json() + + +def test_update_source_query_alone_requires_query_mode(seeded_app, bq_instance, stub_bq_extractor): + """PUT body with source_query but no query_mode is incoherent — reject + so non-materialized rows can't carry an orphan source_query.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + # Seed a remote-mode row. + r = c.post( + "/api/admin/register-table", + json={ + "name": "live_orphan", + "source_type": "bigquery", + "bucket": "analytics", + "source_table": "orders", + "query_mode": "remote", + }, + headers=_auth(token), + ) + assert r.status_code in (200, 202), r.json() # synchronous or async + table_id = r.json()["id"] + + r2 = c.put( + f"/api/admin/registry/{table_id}", + json={"source_query": "SELECT 1"}, + headers=_auth(token), + ) + assert 400 <= r2.status_code < 500, r2.json() + + +def test_update_schedule_only_on_materialized_row_succeeds( + seeded_app, bq_instance, stub_bq_extractor, +): + """REGRESSION (Devin BUG_0002 on 2219255): an admin editing only the + sync_schedule of a materialized row sends `{query_mode: 'materialized', + sync_schedule: '...'}` (the Edit modal always sends query_mode for BQ + rows). Pre-fix the UpdateTableRequest validator rejected this with 422 + because source_query wasn't in the body — even though the existing row + already had one. + + The PUT semantics overlay the body on the existing row, so omitted + source_query keeps the stored value. The synthetic RegisterTableRequest + constructed against the merged record at the handler still runs the + strict cross-field check, so the truly-broken case (materialized + without ANY source_query, even on existing) is still caught.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + # Seed a materialized row with a real source_query. + r = c.post("/api/admin/register-table", json={ + "name": "schedule_edit_target", + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": "SELECT 1", + "sync_schedule": "every 1h", + }, headers=_auth(token)) + assert r.status_code == 201, r.json() + table_id = r.json()["id"] + + # Edit ONLY the schedule. UI's saveTableEdit sends query_mode for BQ + # rows even when the operator didn't change it. + r2 = c.put(f"/api/admin/registry/{table_id}", json={ + "query_mode": "materialized", + "sync_schedule": "every 12h", + }, headers=_auth(token)) + assert r2.status_code == 200, r2.json() + + # Verify the schedule changed and source_query survived. + r3 = c.get("/api/admin/registry", headers=_auth(token)) + row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None) + assert row is not None + assert row["sync_schedule"] == "every 12h" + assert row["source_query"] == "SELECT 1" # preserved across edit + assert row["query_mode"] == "materialized" + + +def test_update_materialized_with_explicit_empty_source_query_rejected( + seeded_app, bq_instance, stub_bq_extractor, +): + """The fix above relaxes the validator for OMITTED source_query, but + explicitly setting it to an empty / whitespace string while claiming + materialized is still a typo and must be rejected (not silently + persisted as NULL).""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + r = c.post("/api/admin/register-table", json={ + "name": "explicit_empty", + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": "SELECT 1", + }, headers=_auth(token)) + assert r.status_code == 201, r.json() + table_id = r.json()["id"] + + r2 = c.put(f"/api/admin/registry/{table_id}", json={ + "query_mode": "materialized", + "source_query": "", # explicitly empty + }, headers=_auth(token)) + assert 400 <= r2.status_code < 500, r2.json() + + +def test_update_materialized_to_remote_clears_source_query( + seeded_app, bq_instance, stub_bq_extractor, +): + """When admin switches a materialized table to remote/local, the stale + source_query must be cleared in the DB — otherwise the registry shows + a non-materialized row carrying an orphan SQL body.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + # Seed a materialized table with a source_query. + r = c.post( + "/api/admin/register-table", + json=_materialized_payload(name="switcher"), + headers=_auth(token), + ) + assert r.status_code == 201, r.json() + table_id = r.json()["id"] + + # Switch to remote — must include bucket+source_table for the new mode + # (the merged validator runs the BQ payload check on the merged record). + r2 = c.put( + f"/api/admin/registry/{table_id}", + json={ + "query_mode": "remote", + "bucket": "analytics", + "source_table": "orders_90d", + }, + headers=_auth(token), + ) + assert r2.status_code == 200, r2.json() + + # Verify in the registry: query_mode flipped, source_query cleared. + r3 = c.get("/api/admin/registry", headers=_auth(token)) + assert r3.status_code == 200, r3.json() + row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None) + assert row is not None, f"Table {table_id} not found in registry" + assert row["query_mode"] == "remote" + assert row["source_query"] in (None, "") + + +def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_instance): + c = seeded_app["client"] + token = seeded_app["admin_token"] + r = c.post( + "/api/admin/register-table", + json=_materialized_payload( + name="persist_q", + source_query="SELECT col FROM `prj.ds.t` WHERE x = 1", + ), + headers=_auth(token), + ) + assert r.status_code == 201, r.json() + table_id = r.json()["id"] + + r2 = c.get("/api/admin/registry", headers=_auth(token)) + row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None) + assert row is not None + assert row["query_mode"] == "materialized" + assert "WHERE x = 1" in row["source_query"] diff --git a/tests/test_bq_cost_guardrail.py b/tests/test_bq_cost_guardrail.py new file mode 100644 index 0000000..d51702e --- /dev/null +++ b/tests/test_bq_cost_guardrail.py @@ -0,0 +1,137 @@ +"""materialize_query refuses to run when dry-run estimate exceeds the cap. + +The cap is wired through `data_source.bigquery.max_bytes_per_materialize` +(read by the trigger pass; default 10 GiB; set 0 to disable). The dry-run +itself reuses `app.api.v2_scan._bq_dry_run_bytes` so cost-estimate logic +lives in exactly one place. Fail-open behaviour (DuckDB-syntax SQL the +native BQ client can't parse → estimate=0 → COPY proceeds with a warning) +is documented and exercised here too. +""" +import duckdb +import pytest +from contextlib import contextmanager +from unittest.mock import MagicMock, patch + +from connectors.bigquery.access import BqAccess, BqProjects +from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError + + +def _bq_with_seed(tables: dict[str, str] | None = None) -> BqAccess: + """Stub BqAccess seeded with in-memory tables (same recipe as + test_bq_materialize).""" + tables = tables or {} + + @contextmanager + def _session(_projects): + conn = duckdb.connect(":memory:") + try: + conn.execute("ATTACH ':memory:' AS bq") + for s in {ref.rsplit(".", 1)[0] for ref in tables}: + conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}") + for ref, body in tables.items(): + conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}") + yield conn + finally: + conn.close() + + return BqAccess( + BqProjects(billing="test-billing", data="test-data"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session, + ) + + +def test_refuses_when_estimate_exceeds_cap(tmp_path): + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + + bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) + + with patch( + "app.api.v2_scan._bq_dry_run_bytes", return_value=100 * 2**30 + ): + with pytest.raises(MaterializeBudgetError) as exc: + materialize_query( + table_id="huge", + sql="SELECT * FROM bq.test.tiny", + bq=bq, + output_dir=str(out), + max_bytes=10 * 2**30, + ) + err = exc.value + assert err.table_id == "huge" + assert err.current == 100 * 2**30 + assert err.limit == 10 * 2**30 + + +def test_proceeds_when_estimate_under_cap(tmp_path): + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + + bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) + + with patch("app.api.v2_scan._bq_dry_run_bytes", return_value=1024): + stats = materialize_query( + table_id="tiny", + sql="SELECT * FROM bq.test.tiny", + bq=bq, + output_dir=str(out), + max_bytes=10 * 2**30, + ) + assert stats["rows"] == 1 + + +def test_no_cap_skips_dry_run(tmp_path): + """When max_bytes=None (default), no dry-run is performed.""" + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) + + with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry: + stats = materialize_query( + table_id="t1", + sql="SELECT * FROM bq.test.tiny", + bq=bq, + output_dir=str(out), + ) + mock_dry.assert_not_called() + assert stats["rows"] == 1 + + +def test_zero_max_bytes_skips_dry_run(tmp_path): + """Sentinel: max_bytes=0 disables the guardrail (config docs).""" + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) + + with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry: + stats = materialize_query( + table_id="t1", + sql="SELECT * FROM bq.test.tiny", + bq=bq, + output_dir=str(out), + max_bytes=0, + ) + mock_dry.assert_not_called() + assert stats["rows"] == 1 + + +def test_dry_run_failure_is_fail_open(tmp_path): + """If the dry-run errors (DuckDB syntax, missing google lib, transient + upstream failure) we don't block — log + proceed with COPY. Operators + who need hard-fail watch logs for the warning.""" + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) + + with patch( + "app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom") + ): + stats = materialize_query( + table_id="t1", + sql="SELECT * FROM bq.test.tiny", + bq=bq, + output_dir=str(out), + max_bytes=10 * 2**30, + ) + assert stats["rows"] == 1 diff --git a/tests/test_bq_init_extract_skips.py b/tests/test_bq_init_extract_skips.py new file mode 100644 index 0000000..aaa295b --- /dev/null +++ b/tests/test_bq_init_extract_skips.py @@ -0,0 +1,98 @@ +"""init_extract skips rows with query_mode='materialized'. + +Materialized rows are written by the sync trigger pass via +`materialize_query()`; they live as parquets in /data/extracts/bigquery/data/ +and surface via the orchestrator's standard local-parquet discovery. +Creating a remote view in extract.duckdb for the same name would shadow +the parquet via cross-source name collision. + +Pattern matches `tests/test_bigquery_extractor.py::TestViewVsTableTemplates` +(uses `_CapturingProxy` to wrap a real DuckDB conn and stub BQ-specific calls). +""" +import duckdb +from unittest.mock import MagicMock + + +class _CapturingProxy: + """Wraps a real DuckDB connection, captures SQL, stubs BQ-specific calls. + + DuckDBPyConnection.execute is C-level read-only, so we wrap rather than + monkey-patch. Shape lifted directly from tests/test_bigquery_extractor.py + to keep stub behavior consistent across the BQ test suite. + """ + + def __init__(self, real_conn, captured: list): + self._real = real_conn + self._captured = captured + + def execute(self, sql, *args, **kwargs): + self._captured.append(sql) + stripped_u = sql.strip().upper() + if stripped_u.startswith(("INSTALL ", "LOAD ", "CREATE SECRET")): + return MagicMock() + if stripped_u.startswith("ATTACH ") and "BIGQUERY" in stripped_u: + return MagicMock() + if stripped_u.startswith("DETACH "): + return MagicMock() + if 'FROM bq.' in sql or 'FROM bigquery_query' in sql: + return MagicMock() + return self._real.execute(sql, *args, **kwargs) + + def close(self): + return self._real.close() + + def __getattr__(self, name): + return getattr(self._real, name) + + +def test_init_extract_skips_materialized_rows(tmp_path, monkeypatch): + """A registry mix of remote + materialized rows: only the remote row + gets a `_meta` entry; the materialized row is silently skipped.""" + from connectors.bigquery.extractor import init_extract + + monkeypatch.setattr( + "connectors.bigquery.extractor.get_metadata_token", + lambda: "test-token", + ) + monkeypatch.setattr( + "connectors.bigquery.extractor._detect_table_type", + lambda *a, **kw: "BASE TABLE", + ) + + captured: list[str] = [] + real_connect = duckdb.connect + + def spy_connect(*a, **kw): + return _CapturingProxy(real_connect(*a, **kw), captured) + + monkeypatch.setattr( + "connectors.bigquery.extractor.duckdb.connect", spy_connect + ) + + configs = [ + { + "name": "live_orders", "bucket": "dset", "source_table": "live", + "query_mode": "remote", "description": "", + }, + { + "name": "agg_90d", "bucket": "dset", "source_table": "live", + "query_mode": "materialized", + "source_query": "SELECT 1", + "description": "", + }, + ] + stats = init_extract(str(tmp_path), "test-project", configs) + + db_path = tmp_path / "extract.duckdb" + assert db_path.exists(), "extract.duckdb should be written" + + db = duckdb.connect(str(db_path)) + meta = db.execute( + "SELECT table_name, query_mode FROM _meta ORDER BY table_name" + ).fetchall() + db.close() + + assert meta == [("live_orders", "remote")] + assert stats["tables_registered"] == 1 + # No CREATE VIEW for the materialized row + assert not any("agg_90d" in s for s in captured if "CREATE" in s.upper()) diff --git a/tests/test_bq_materialize.py b/tests/test_bq_materialize.py new file mode 100644 index 0000000..ae23680 --- /dev/null +++ b/tests/test_bq_materialize.py @@ -0,0 +1,139 @@ +"""BigQuery `materialize_query` writes parquet via BqAccess + DuckDB COPY. + +The function takes a `BqAccess` instance so the BQ extension session and +SECRET token live in one place across the codebase (cf. `v2_scan` / `v2_sample` +/ `v2_schema`). Tests inject a stub BqAccess whose `duckdb_session()` yields +an in-memory connection with a pre-attached `bq` catalog containing fixture +tables, exercising the COPY path end-to-end without any GCP traffic. +""" +import duckdb +import pytest +from contextlib import contextmanager +from pathlib import Path +from unittest.mock import MagicMock + +from connectors.bigquery.access import BqAccess, BqProjects +from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError + + +def _make_stub_bq(tables: dict[str, str] | None = None) -> BqAccess: + """Return a BqAccess wired to factories that yield an in-memory DuckDB + with a pretend `bq` catalog containing test tables. `tables` maps + DuckDB-three-part references like `'bq.test.orders'` to a SELECT + expression to seed them with. + """ + tables = tables or {} + + @contextmanager + def _session(_projects): + conn = duckdb.connect(":memory:") + try: + conn.execute("ATTACH ':memory:' AS bq") + schemas = {ref.rsplit(".", 1)[0] for ref in tables} + for s in schemas: + conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}") + for ref, body in tables.items(): + conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}") + yield conn + finally: + conn.close() + + # client_factory returns a stub whose .query(sql, job_config=...) yields + # a job whose .total_bytes_processed defaults to 0 (fail-open). + def _client(_projects): + client = MagicMock() + job = MagicMock() + job.total_bytes_processed = 0 + client.query.return_value = job + return client + + return BqAccess( + BqProjects(billing="test-billing", data="test-data"), + client_factory=_client, + duckdb_session_factory=_session, + ) + + +def test_materialize_writes_parquet_and_returns_stats(tmp_path): + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + + bq = _make_stub_bq({ + "bq.test.orders": ( + "SELECT 'EU' AS region, 100 AS revenue UNION ALL " + "SELECT 'US' AS region, 250 AS revenue" + ) + }) + + stats = materialize_query( + table_id="orders_summary", + sql="SELECT region, SUM(revenue) AS revenue FROM bq.test.orders GROUP BY 1", + bq=bq, + output_dir=str(out), + ) + + parquet_path = out / "data" / "orders_summary.parquet" + assert parquet_path.exists() + assert stats["rows"] == 2 + assert stats["size_bytes"] > 0 + assert stats["query_mode"] == "materialized" + + # Parquet readable end-to-end + rows = duckdb.connect().execute( + f"SELECT region, revenue FROM read_parquet('{parquet_path}') ORDER BY region" + ).fetchall() + assert rows == [("EU", 100), ("US", 250)] + + +def test_materialize_atomic_on_failure(tmp_path): + """Bad SQL must not leave a half-written parquet behind.""" + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + parquet_path = out / "data" / "broken.parquet" + + bq = _make_stub_bq({"bq.test.orders": "SELECT 1 AS n"}) + + with pytest.raises(Exception): + materialize_query( + table_id="broken", + sql="SELECT * FROM bq.test.does_not_exist", + bq=bq, + output_dir=str(out), + ) + assert not parquet_path.exists() + # Tmp also cleaned + assert not (out / "data" / "broken.parquet.tmp").exists() + + +def test_materialize_rejects_unsafe_table_id(tmp_path): + """table_id becomes the parquet filename — block path traversal up front.""" + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + bq = _make_stub_bq() + + with pytest.raises(ValueError, match="unsafe"): + materialize_query( + table_id="../etc/passwd", + sql="SELECT 1", + bq=bq, + output_dir=str(out), + ) + + +def test_materialize_overwrites_existing_parquet(tmp_path): + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + bq = _make_stub_bq({"bq.test.tiny": "SELECT 1 AS n"}) + + materialize_query( + table_id="t1", sql="SELECT 1 AS n", + bq=bq, output_dir=str(out), + ) + materialize_query( + table_id="t1", sql="SELECT 2 AS n", + bq=bq, output_dir=str(out), + ) + rows = duckdb.connect().execute( + f"SELECT n FROM read_parquet('{out}/data/t1.parquet')" + ).fetchall() + assert rows == [(2,)] diff --git a/tests/test_cli_admin_materialized.py b/tests/test_cli_admin_materialized.py new file mode 100644 index 0000000..c522b3d --- /dev/null +++ b/tests/test_cli_admin_materialized.py @@ -0,0 +1,157 @@ +"""`da admin register-table --query-mode materialized --query @file.sql` +sends source_query in the payload; existing local/remote paths still work +unchanged.""" +from typer.testing import CliRunner +from unittest.mock import MagicMock + +from cli.main import app + + +def _fake_resp(status_code, body=None): + resp = MagicMock() + resp.status_code = status_code + resp.json = lambda: body or {"id": "x", "name": "x", "status": "registered"} + return resp + + +def test_register_materialized_with_inline_query(monkeypatch): + captured = {} + + def fake_post(path, json): + captured["path"] = path + captured["json"] = json + return _fake_resp(201) + + monkeypatch.setattr("cli.commands.admin.api_post", fake_post) + + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "orders_90d", + "--source-type", "bigquery", + "--query-mode", "materialized", + "--query", "SELECT date FROM `prj.ds.orders`", + "--sync-schedule", "every 6h", + ]) + + assert result.exit_code == 0, result.stdout + assert captured["path"] == "/api/admin/register-table" + assert captured["json"]["query_mode"] == "materialized" + assert captured["json"]["source_query"] == "SELECT date FROM `prj.ds.orders`" + assert captured["json"]["sync_schedule"] == "every 6h" + + +def test_register_materialized_reads_query_from_file(tmp_path, monkeypatch): + sql_file = tmp_path / "orders.sql" + sql_file.write_text( + "SELECT date, SUM(revenue) FROM `prj.ds.orders` GROUP BY 1\n" + ) + + captured = {} + + def fake_post(path, json): + captured["json"] = json + return _fake_resp(201) + + monkeypatch.setattr("cli.commands.admin.api_post", fake_post) + + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "orders_90d", + "--source-type", "bigquery", + "--query-mode", "materialized", + "--query", f"@{sql_file}", + "--sync-schedule", "daily 03:00", + ]) + + assert result.exit_code == 0, result.stdout + assert "SELECT date, SUM(revenue)" in captured["json"]["source_query"] + assert not captured["json"]["source_query"].endswith("\n") + + +def test_register_materialized_without_query_fails(monkeypatch): + """--query-mode materialized without --query is a client-side error, + no API call made.""" + called = {"count": 0} + + def fake_post(*args, **kwargs): + called["count"] += 1 + return _fake_resp(201) + + monkeypatch.setattr("cli.commands.admin.api_post", fake_post) + + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "orders_90d", + "--source-type", "bigquery", + "--query-mode", "materialized", + ]) + + assert result.exit_code != 0 + assert called["count"] == 0 + combined = result.stdout + (result.stderr or "") + assert "--query" in combined + + +def test_register_local_mode_does_not_send_source_query(monkeypatch): + """Default local mode shouldn't send source_query — server-side + validator forbids it on local.""" + captured = {} + + def fake_post(path, json): + captured["json"] = json + return _fake_resp(201) + + monkeypatch.setattr("cli.commands.admin.api_post", fake_post) + + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "kbc_orders", + "--source-type", "keboola", + "--bucket", "in.c-crm", + ]) + + assert result.exit_code == 0 + assert "source_query" not in captured["json"] + assert "sync_schedule" not in captured["json"] + + +def test_register_query_at_path_missing_file_fails(monkeypatch): + """@file.sql where the file doesn't exist surfaces a clear error.""" + monkeypatch.setattr( + "cli.commands.admin.api_post", lambda *a, **kw: _fake_resp(201), + ) + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "x", + "--source-type", "bigquery", + "--query-mode", "materialized", + "--query", "@/tmp/definitely-does-not-exist-9b4f7e2c.sql", + ]) + assert result.exit_code != 0 + + +def test_register_remote_path_unchanged(monkeypatch): + """The pre-existing --bucket / --source-table / --query-mode remote + flow still works without --query.""" + captured = {} + + def fake_post(path, json): + captured["json"] = json + return _fake_resp(200) + + monkeypatch.setattr("cli.commands.admin.api_post", fake_post) + + runner = CliRunner() + result = runner.invoke(app, [ + "admin", "register-table", "live_orders", + "--source-type", "bigquery", + "--bucket", "analytics", + "--source-table", "orders", + "--query-mode", "remote", + ]) + + assert result.exit_code == 0 + assert captured["json"]["query_mode"] == "remote" + assert "source_query" not in captured["json"] + assert captured["json"]["bucket"] == "analytics" + assert captured["json"]["source_table"] == "orders" diff --git a/tests/test_db_migration_v20.py b/tests/test_db_migration_v20.py new file mode 100644 index 0000000..7840b81 --- /dev/null +++ b/tests/test_db_migration_v20.py @@ -0,0 +1,77 @@ +"""v20 adds source_query column to table_registry. + +Backs query_mode='materialized' for BigQuery: admin registers a SQL body +that the scheduler runs through the DuckDB BQ extension and writes as a +parquet to /data/extracts/bigquery/data/.parquet. + +The v19 step (#150) drops dataset_permissions, access_requests tables and +users.role, table_registry.is_public columns; v20 then ALTERs the post-v19 +table_registry to add the source_query column. +""" +import duckdb + +from src.db import SCHEMA_VERSION, _ensure_schema, get_schema_version + + +def test_schema_version_is_20(): + assert SCHEMA_VERSION == 20 + + +def test_v20_adds_source_query(tmp_path): + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _ensure_schema(conn) + + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'table_registry'" + ).fetchall() + } + assert "source_query" in cols, f"source_query missing from {cols}" + assert get_schema_version(conn) == 20 + conn.close() + + +def test_v19_db_migrates_to_v20(tmp_path): + """Pre-existing v19 DB (post-RBAC-drop) without source_query upgrades + cleanly without losing data.""" + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + + # Simulate a v19 DB at minimal but realistic shape: schema_version row + + # a table_registry row in the post-v19 column shape (no is_public column, + # since v19 finalize dropped it via the table-rebuild idiom). + conn.execute( + "CREATE TABLE schema_version (version INTEGER, " + "applied_at TIMESTAMP DEFAULT current_timestamp)" + ) + conn.execute("INSERT INTO schema_version (version) VALUES (19)") + conn.execute("""CREATE TABLE table_registry ( + id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL, + source_type VARCHAR, bucket VARCHAR, source_table VARCHAR, + sync_strategy VARCHAR DEFAULT 'full_refresh', + query_mode VARCHAR DEFAULT 'local', + sync_schedule VARCHAR, profile_after_sync BOOLEAN DEFAULT true, + primary_key VARCHAR, folder VARCHAR, description TEXT, + registered_by VARCHAR, + registered_at TIMESTAMP DEFAULT current_timestamp + )""") + conn.execute("INSERT INTO table_registry (id, name) VALUES ('foo', 'foo')") + + _ensure_schema(conn) + + assert get_schema_version(conn) == 20 + cols = { + r[0] for r in conn.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name = 'table_registry'" + ).fetchall() + } + assert "source_query" in cols + # Existing row preserved, new column NULL + row = conn.execute( + "SELECT id, source_query FROM table_registry WHERE id='foo'" + ).fetchone() + assert row == ("foo", None) + conn.close() diff --git a/tests/test_keboola_access.py b/tests/test_keboola_access.py new file mode 100644 index 0000000..781e650 --- /dev/null +++ b/tests/test_keboola_access.py @@ -0,0 +1,75 @@ +"""Tests for KeboolaAccess facade.""" +import os +import pytest +from connectors.keboola.access import KeboolaAccess + + +def test_access_session_yields_attached_duckdb(tmp_path, monkeypatch): + """Mock-mode test: the facade should accept a token, install+load + the Keboola extension, and ATTACH it as 'kbc'. We verify the SQL + issued by intercepting the duckdb.connect call. + """ + issued = [] + class FakeConn: + def execute(self, sql, *args, **kwargs): + issued.append(sql) + class R: + def fetchall(s): return [] + def fetchone(s): return (0,) + return R() + def close(self): pass + + import duckdb + monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn()) + + acc = KeboolaAccess( + url="https://connection.keboola.com/", + token="fake-token-xyz", + ) + with acc.duckdb_session() as conn: + assert conn is not None + # Verify the install + load + attach sequence happened. + joined = "\n".join(issued) + assert "INSTALL keboola" in joined + assert "LOAD keboola" in joined + assert "ATTACH" in joined and "TYPE keboola" in joined + # Token must be escaped for embedding in the ATTACH literal. + assert "fake-token-xyz" in joined + + +def test_access_escapes_single_quote_in_token(monkeypatch): + """Defense against a token containing a single quote breaking the + ATTACH literal. SQL injection here is non-trivial because the token + is admin-supplied at instance config time, but escape it anyway.""" + issued = [] + class FakeConn: + def execute(self, sql, *args, **kwargs): + issued.append(sql) + class R: + def fetchall(s): return [] + def fetchone(s): return (0,) + return R() + def close(self): pass + import duckdb + monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn()) + + acc = KeboolaAccess(url="x", token="bad'token") + with acc.duckdb_session() as conn: + pass + attach_sql = next(s for s in issued if "ATTACH" in s) + # Doubled single-quote per SQL string-literal escaping. + assert "bad''token" in attach_sql + + +def test_access_real_attach_when_creds_present(tmp_path): + """Smoke when KBC_TEST_URL + KBC_TEST_TOKEN are present.""" + url = os.environ.get("KBC_TEST_URL") + token = os.environ.get("KBC_TEST_TOKEN") + if not (url and token): + pytest.skip("Keboola creds not provided") + acc = KeboolaAccess(url=url, token=token) + with acc.duckdb_session() as conn: + # ATTACH must have succeeded — querying duckdb_databases() should + # show the 'kbc' alias. + rows = [r[0] for r in conn.execute("SELECT name FROM duckdb_databases()").fetchall()] + assert "kbc" in rows diff --git a/tests/test_keboola_extension_query_passthrough.py b/tests/test_keboola_extension_query_passthrough.py new file mode 100644 index 0000000..af5c24f --- /dev/null +++ b/tests/test_keboola_extension_query_passthrough.py @@ -0,0 +1,75 @@ +"""Lock-in test for the DuckDB Keboola extension's query-passthrough +capability that the Keboola materialized path depends on. + +Run only when KBC_TEST_URL + KBC_TEST_TOKEN env vars are set (CI without +real Keboola credentials skips). Local dev with a real Storage API +token exercises the path. +""" +import os +import pytest +import duckdb + + +KBC_URL = os.environ.get("KBC_TEST_URL") +KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN") +KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET") +KBC_TABLE = os.environ.get("KBC_TEST_TABLE") + +pytestmark = pytest.mark.skipif( + not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]), + reason="Keboola integration creds not provided", +) + + +def test_extension_supports_attach_and_select(tmp_path): + """Keboola extension must support: ATTACH 'keboola://...' AS kbc, then + SELECT * FROM kbc.bucket.table. The Keboola materialized path uses this + primitive at runtime (just like connectors/keboola/extractor.py:133).""" + conn = duckdb.connect(str(tmp_path / "spike.duckdb")) + conn.execute("INSTALL keboola FROM community") + conn.execute("LOAD keboola") + escaped_token = KBC_TOKEN.replace("'", "''") + conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')") + rows = conn.execute( + f'SELECT COUNT(*) FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}"' + ).fetchone() + assert rows[0] >= 0 # any non-negative count is fine; we're testing the path works + + +def test_extension_supports_copy_to_parquet(tmp_path): + """Keboola materialized writes the SELECT result via + `COPY (...) TO '...' (FORMAT PARQUET)`. Lock that primitive.""" + conn = duckdb.connect(str(tmp_path / "spike.duckdb")) + conn.execute("INSTALL keboola FROM community") + conn.execute("LOAD keboola") + escaped_token = KBC_TOKEN.replace("'", "''") + conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')") + + parquet_path = tmp_path / "out.parquet" + safe_lit = str(parquet_path).replace("'", "''") + conn.execute( + f'COPY (SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5) ' + f"TO '{safe_lit}' (FORMAT PARQUET)" + ) + assert parquet_path.exists() and parquet_path.stat().st_size > 0 + + +def test_extension_supports_filtered_query(tmp_path): + """Most important capability: a non-trivial WHERE/projection survives. + This is what 'Custom SQL' mode actually relies on.""" + conn = duckdb.connect(str(tmp_path / "spike.duckdb")) + conn.execute("INSTALL keboola FROM community") + conn.execute("LOAD keboola") + escaped_token = KBC_TOKEN.replace("'", "''") + conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')") + + parquet_path = tmp_path / "filtered.parquet" + safe_lit = str(parquet_path).replace("'", "''") + # Trivially filterable SELECT — extension must push the WHERE down or + # at minimum execute it client-side. Either is acceptable for our + # materialized path. + conn.execute( + f'COPY (SELECT 1 AS marker FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 3) ' + f"TO '{safe_lit}' (FORMAT PARQUET)" + ) + assert parquet_path.exists() diff --git a/tests/test_keboola_init_extract_skips.py b/tests/test_keboola_init_extract_skips.py new file mode 100644 index 0000000..9a57c52 --- /dev/null +++ b/tests/test_keboola_init_extract_skips.py @@ -0,0 +1,89 @@ +"""Verify the legacy Keboola download path skips materialized rows. + +Materialized rows are handled by `_run_materialized_pass` in +`app/api/sync.py`, not by the legacy extractor. Mirror of the BQ +extractor's existing skip behavior at line 188. + +The Keboola extractor entrypoint is `run()` (not `init_extract` like +the BQ extractor). Tests below observe the skip via caplog and the +stats dict (no parquet written, table not in tables_extracted/failed). +""" +from connectors.keboola import extractor as kbe + + +def test_run_skips_materialized_rows(tmp_path, caplog): + """Given a registry with one materialized row, run() must NOT write a + parquet for it and must NOT count it in tables_extracted/failed.""" + output_dir = tmp_path / "extracts" / "keboola" + output_dir.mkdir(parents=True) + + table_configs = [ + { + "id": "orders_recent", + "name": "orders_recent", + "source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'", + "query_mode": "materialized", + }, + ] + + with caplog.at_level("INFO"): + # Use bogus URL/token — the extractor will fail to ATTACH the + # extension (legacy client fallback also unavailable for the + # materialized row, but materialized rows must be skipped before + # any of that code runs). + stats = kbe.run( + str(output_dir), table_configs, + keboola_url="https://invalid.example/", + keboola_token="bogus", + ) + + # No parquet files for the materialized row. + parquet = output_dir / "data" / "orders_recent.parquet" + assert not parquet.exists(), "materialized row was written by legacy extractor" + + # The materialized row must NOT count as extracted or failed. + assert stats["tables_extracted"] == 0 + assert stats["tables_failed"] == 0 + + # Skip log line for ops visibility. + assert "Skipping" in caplog.text and "materialized" in caplog.text + + +def test_run_processes_local_alongside_skipped_materialized(tmp_path, caplog): + """Mixed registry: one local + one materialized. Materialized is + skipped, local row attempts extraction (and fails because there's + no real Keboola in this test, but that's a separate failure path — + the materialized row must not appear in tables_failed).""" + output_dir = tmp_path / "extracts" / "keboola" + output_dir.mkdir(parents=True) + + table_configs = [ + { + "id": "orders", + "name": "orders", + "bucket": "in.c-sales", + "source_table": "orders", + "query_mode": "local", + }, + { + "id": "orders_recent", + "name": "orders_recent", + "source_query": "SELECT 1", + "query_mode": "materialized", + }, + ] + + with caplog.at_level("INFO"): + stats = kbe.run( + str(output_dir), table_configs, + keboola_url="https://invalid.example/", + keboola_token="bogus", + ) + + # The materialized row must not be in failures (it was skipped, not failed). + failed_names = {e["table"] for e in stats.get("errors", [])} + assert "orders_recent" not in failed_names, ( + "materialized row appeared in failures — should have been skipped" + ) + # Skip log line for ops visibility. + assert "Skipping" in caplog.text and "materialized" in caplog.text diff --git a/tests/test_keboola_materialize.py b/tests/test_keboola_materialize.py new file mode 100644 index 0000000..216c638 --- /dev/null +++ b/tests/test_keboola_materialize.py @@ -0,0 +1,95 @@ +"""Tests for the Keboola materialize_query path.""" +import hashlib +import pytest +from pathlib import Path +from unittest.mock import MagicMock + +from connectors.keboola import extractor as kbe + + +def test_materialize_query_writes_parquet_and_returns_metadata(tmp_path, monkeypatch): + """Mock-mode: feed in a fake KeboolaAccess that yields a fake DuckDB + connection accepting `COPY ... TO '...' (FORMAT PARQUET)` and just + writes a small parquet via duckdb's own primitive on a tmp DB. + """ + import duckdb + real_conn = duckdb.connect(":memory:") + # Pre-create a small relation the fake materialize "copies". + real_conn.execute("CREATE TABLE t AS SELECT 1 AS x, 'hello' AS y UNION ALL SELECT 2, 'world'") + + class FakeAccess: + def duckdb_session(self): + from contextlib import contextmanager + @contextmanager + def _cm(): + yield real_conn + return _cm() + fake_access = FakeAccess() + + output_dir = tmp_path / "out" + output_dir.mkdir() + + # Submit a query that selects from the in-memory table (not a real + # Keboola bucket — the test verifies the COPY/parquet/hash path, + # not the extension behavior). + result = kbe.materialize_query( + table_id="example_subset", + sql="SELECT * FROM t", + keboola_access=fake_access, + output_dir=output_dir, + ) + + parquet_path = output_dir / "example_subset.parquet" + assert parquet_path.exists() + assert result["table_id"] == "example_subset" + assert result["path"] == str(parquet_path) + assert result["rows"] == 2 + assert result["bytes"] > 0 + # MD5 of the bytes should match what we recompute. + expected_md5 = hashlib.md5(parquet_path.read_bytes()).hexdigest() + assert result["md5"] == expected_md5 + + +def test_materialize_query_zero_rows_logs_warning(tmp_path, caplog): + import duckdb + real_conn = duckdb.connect(":memory:") + real_conn.execute("CREATE TABLE t AS SELECT 1 AS x WHERE FALSE") + + class FakeAccess: + def duckdb_session(self): + from contextlib import contextmanager + @contextmanager + def _cm(): + yield real_conn + return _cm() + + output_dir = tmp_path / "out" + output_dir.mkdir() + + with caplog.at_level("WARNING"): + result = kbe.materialize_query( + table_id="empty_subset", + sql="SELECT * FROM t", + keboola_access=FakeAccess(), + output_dir=output_dir, + ) + assert result["rows"] == 0 + assert "0 rows" in caplog.text or "empty" in caplog.text.lower() + + +def test_materialize_query_rejects_unsafe_table_id(tmp_path): + """Defense: table_id is interpolated into the parquet filename. SQL/ + path-traversal-unsafe values must be rejected up-front (mirror of BQ + materialize_query's validation).""" + class FakeAccess: + def duckdb_session(self): + raise AssertionError("should not be called") + output_dir = tmp_path / "out" + output_dir.mkdir() + with pytest.raises(ValueError, match="table_id"): + kbe.materialize_query( + table_id="../../etc/passwd", + sql="SELECT 1", + keboola_access=FakeAccess(), + output_dir=output_dir, + ) diff --git a/tests/test_keboola_materialized_e2e.py b/tests/test_keboola_materialized_e2e.py new file mode 100644 index 0000000..90982b6 --- /dev/null +++ b/tests/test_keboola_materialized_e2e.py @@ -0,0 +1,71 @@ +"""End-to-end: register a Keboola materialized row -> trigger sync -> +parquet appears -> manifest serves it -> CLI da sync would download it. + +Skipped unless KBC_TEST_URL + KBC_TEST_TOKEN + KBC_TEST_BUCKET + +KBC_TEST_TABLE are present. +""" +import os +from pathlib import Path + +import pytest + + +KBC_URL = os.environ.get("KBC_TEST_URL") +KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN") +KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET") +KBC_TABLE = os.environ.get("KBC_TEST_TABLE") + +pytestmark = pytest.mark.skipif( + not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]), + reason="Keboola creds not provided", +) + + +def test_register_trigger_manifest_path(seeded_app, monkeypatch, tmp_path): + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + monkeypatch.setenv("KEBOOLA_TOKEN", KBC_TOKEN) + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: { + "data_source": { + "type": "keboola", + "keboola": { + "url": KBC_URL, + "token_env": "KEBOOLA_TOKEN", + }, + }, + }, + raising=False, + ) + + c = seeded_app["client"] + token = seeded_app["admin_token"] + auth = {"Authorization": f"Bearer {token}"} + + # Register. + r = c.post("/api/admin/register-table", headers=auth, json={ + "name": "smoke_subset", + "source_type": "keboola", + "query_mode": "materialized", + "source_query": ( + f'SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5' + ), + }) + assert r.status_code == 201 + + # Trigger sync. + r = c.post("/api/sync/trigger", headers=auth) + assert r.status_code in (200, 202) + + # Parquet must exist. + parquet = Path(tmp_path) / "extracts" / "keboola" / "data" / "smoke_subset.parquet" + assert parquet.exists() and parquet.stat().st_size > 0 + + # Manifest serves it. + r = c.get("/api/sync/manifest", headers=auth) + rows = r.json()["tables"] + smoke = next((t for t in rows if t["id"] == "smoke_subset"), None) + assert smoke is not None + assert smoke["source_type"] == "keboola" + assert smoke["query_mode"] == "local" # materialized parquets surface as local + assert smoke["md5"] # has a hash for da sync delta detection diff --git a/tests/test_materialized_e2e.py b/tests/test_materialized_e2e.py new file mode 100644 index 0000000..f0e07d1 --- /dev/null +++ b/tests/test_materialized_e2e.py @@ -0,0 +1,335 @@ +"""End-to-end integration coverage for query_mode='materialized'. + +Unit tests verify each piece in isolation; this file glues them together: + +1. Admin POST /api/admin/register-table (materialized) → registry row written +2. _run_materialized_pass writes parquet + sync_state with correct hash +3. GET /api/sync/manifest (per-user) returns the row with query_mode + + the parquet hash, filtered by RBAC +4. Mode-switch transitions (remote → materialized, materialized → SQL edit + preserves registered_at) maintain registry invariants. + +Devil's-advocate review found these were the gaps the unit tests left +open. Each piece passes in isolation; this file proves they compose. +""" +import duckdb +import hashlib +import pytest +from contextlib import contextmanager +from pathlib import Path +from unittest.mock import patch, MagicMock + +from connectors.bigquery.access import BqAccess, BqProjects +from src.repositories.table_registry import TableRegistryRepository +from src.repositories.sync_state import SyncStateRepository + + +def _auth(token): + return {"Authorization": f"Bearer {token}"} + + +@pytest.fixture +def bq_instance(monkeypatch): + """Force instance.yaml to look like a BigQuery deployment so the BQ + register validator's project_id check passes.""" + fake_cfg = { + "data_source": { + "type": "bigquery", + "bigquery": {"project": "my-test-project", "location": "us"}, + }, + } + monkeypatch.setattr( + "app.instance_config.load_instance_config", + lambda: fake_cfg, + raising=False, + ) + from app.instance_config import reset_cache + reset_cache() + yield fake_cfg + reset_cache() + + +@pytest.fixture +def stub_bq_extractor(monkeypatch): + """Mirror tests/test_admin_bq_register.py::stub_bq_extractor — replaces + rebuild_from_registry + SyncOrchestrator so the API's post-register + materialize doesn't hit real BQ during HTTP-driven tests.""" + rebuild_mock = MagicMock(return_value={ + "project_id": "my-test-project", + "tables_registered": 1, + "errors": [], + "skipped": False, + }) + monkeypatch.setattr( + "connectors.bigquery.extractor.rebuild_from_registry", + rebuild_mock, + ) + orch_mock = MagicMock() + monkeypatch.setattr( + "src.orchestrator.SyncOrchestrator", + lambda *a, **kw: orch_mock, + ) + return {"rebuild": rebuild_mock, "orchestrator": orch_mock} + + +@pytest.fixture +def stub_bq(): + """Real-shape BqAccess wired to in-memory DuckDB factories so the + materialize_query path can run end-to-end without GCP.""" + @contextmanager + def _session(_p): + conn = duckdb.connect(":memory:") + try: + conn.execute("ATTACH ':memory:' AS bq") + conn.execute("CREATE SCHEMA bq.test") + conn.execute( + "CREATE OR REPLACE TABLE bq.test.orders AS " + "SELECT 'EU' AS region, 100 AS revenue UNION ALL " + "SELECT 'US' AS region, 250 AS revenue" + ) + yield conn + finally: + conn.close() + return BqAccess( + BqProjects(billing="my-test-project", data="my-test-project"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session, + ) + + +def test_e2e_register_then_materialize_then_manifest_via_repo( + bq_instance, stub_bq, tmp_path, monkeypatch, +): + """Glue test: register row at the repository layer (skips HTTP/auth), + run the materialized pass, verify sync_state, then exercise the + `_build_manifest_for_user` admin path. Catches integration breakage + that unit tests miss because each only sees one layer.""" + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + from src.db import _ensure_schema + _ensure_schema(conn) + + table_id = "orders_summary_e2e" + repo = TableRegistryRepository(conn) + repo.register( + id=table_id, name=table_id, source_type="bigquery", + query_mode="materialized", + source_query="SELECT region, SUM(revenue) AS revenue " + "FROM bq.test.orders GROUP BY 1", + sync_schedule="every 1m", + ) + + # Run the materialized pass. + from app.api import sync as sync_mod + summary = sync_mod._run_materialized_pass(conn, stub_bq) + assert table_id in summary["materialized"], summary + assert not summary["errors"] + + # Parquet on disk. + parquet_path = ( + tmp_path / "data" / "extracts" / "bigquery" / "data" + / f"{table_id}.parquet" + ) + assert parquet_path.exists(), f"Expected {parquet_path} to exist" + + # sync_state hash matches the file's MD5. + expected_hash = hashlib.md5(parquet_path.read_bytes()).hexdigest() + state = SyncStateRepository(conn) + row = state.get_table_state(table_id) + assert row is not None + assert row["hash"] == expected_hash + assert row["rows"] == 2 + + # Manifest builder exposes query_mode + hash to admin (no RBAC filter). + # Post-#150 RBAC: admin shortcut keys on Admin user_group membership, + # not the legacy `users.role` column. Seed the user + Admin membership + # so `can_access_table` short-circuits to True. + conn.execute( + "INSERT OR IGNORE INTO users (id, email) VALUES ('u-admin', 'admin@test')" + ) + admin_group_id = conn.execute( + "SELECT id FROM user_groups WHERE name = 'Admin'" + ).fetchone()[0] + conn.execute( + "INSERT OR IGNORE INTO user_group_members (user_id, group_id, source) " + "VALUES ('u-admin', ?, 'admin')", + [admin_group_id], + ) + admin_user = {"id": "u-admin", "email": "admin@test"} + manifest = sync_mod._build_manifest_for_user(conn, admin_user) + assert table_id in manifest["tables"] + entry = manifest["tables"][table_id] + assert entry["query_mode"] == "materialized" + assert entry["hash"] == expected_hash + assert entry["rows"] == 2 + + conn.close() + + +def test_remote_to_materialized_transition_clears_bucket_table( + seeded_app, bq_instance, stub_bq_extractor, +): + """Switching a remote BQ row to materialized must accept source_query + and the merged validator must not trip on the now-irrelevant + bucket/source_table fields.""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + # Seed a remote row. + r = c.post("/api/admin/register-table", json={ + "name": "live_to_mat", + "source_type": "bigquery", + "bucket": "analytics", + "source_table": "orders", + "query_mode": "remote", + }, headers=_auth(token)) + assert r.status_code in (200, 202), r.json() + table_id = r.json()["id"] + + # Switch to materialized — must include source_query for the validator. + r2 = c.put(f"/api/admin/registry/{table_id}", json={ + "query_mode": "materialized", + "source_query": "SELECT 1 AS n", + }, headers=_auth(token)) + assert r2.status_code == 200, r2.json() + + # Verify the merged record reflects the switch. + r3 = c.get("/api/admin/registry", headers=_auth(token)) + row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None) + assert row is not None + assert row["query_mode"] == "materialized" + assert row["source_query"] == "SELECT 1 AS n" + + +def test_materialized_sql_edit_preserves_registered_at( + seeded_app, bq_instance, stub_bq_extractor, monkeypatch, +): + """Editing source_query on an existing materialized row must not + reset registered_at — the row's registration history is preserved + across SQL edits (issue #130 invariant).""" + c = seeded_app["client"] + token = seeded_app["admin_token"] + + # Seed a materialized row. + r = c.post("/api/admin/register-table", json={ + "name": "sql_edit_target", + "source_type": "bigquery", + "query_mode": "materialized", + "source_query": "SELECT 1 AS n", + }, headers=_auth(token)) + assert r.status_code == 201, r.json() + table_id = r.json()["id"] + + # Capture the original registered_at. + r2 = c.get("/api/admin/registry", headers=_auth(token)) + row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None) + original_ts = row["registered_at"] + assert original_ts is not None + + # Edit the SQL. + import time + time.sleep(0.01) # ensure a clock tick elapses so a fresh stamp would differ + r3 = c.put(f"/api/admin/registry/{table_id}", json={ + "query_mode": "materialized", + "source_query": "SELECT 2 AS n", + }, headers=_auth(token)) + assert r3.status_code == 200, r3.json() + + r4 = c.get("/api/admin/registry", headers=_auth(token)) + row = next((t for t in r4.json()["tables"] if t["id"] == table_id), None) + assert row["source_query"] == "SELECT 2 AS n" + # registered_at preserved across edit + assert row["registered_at"] == original_ts, ( + f"Expected registered_at preserved (issue #130 contract). " + f"Original: {original_ts}, after edit: {row['registered_at']}" + ) + + +def test_materialized_zero_rows_logs_warning(stub_bq, tmp_path, caplog): + """Devil's-advocate item: an SQL filter that returns 0 rows is + indistinguishable from 'SQL is wrong'. Confirm we log a WARNING so + operators can grep on it.""" + import logging + from connectors.bigquery.extractor import materialize_query + + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + + # Add an empty BQ table to the stub for this test. + @contextmanager + def _session_empty(_p): + conn = duckdb.connect(":memory:") + try: + conn.execute("ATTACH ':memory:' AS bq") + conn.execute("CREATE SCHEMA bq.test") + conn.execute("CREATE OR REPLACE TABLE bq.test.empty AS " + "SELECT 1 AS n WHERE FALSE") + yield conn + finally: + conn.close() + + bq_empty = BqAccess( + BqProjects(billing="t", data="t"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session_empty, + ) + + with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"): + stats = materialize_query( + table_id="empty_t", + sql="SELECT * FROM bq.test.empty", + bq=bq_empty, + output_dir=str(out), + ) + + assert stats["rows"] == 0 + assert any("0 rows" in rec.message for rec in caplog.records), ( + f"Expected '0 rows' WARNING; got: {[r.message for r in caplog.records]}" + ) + + +def test_attach_real_error_propagates(stub_bq, tmp_path): + """ATTACH 'project=...' that fails for a real reason (not the + 'already attached' tolerated case) must propagate so callers see + the actual error instead of a confusing downstream 'bq is not + attached' message.""" + from connectors.bigquery.extractor import materialize_query + + out = tmp_path / "extracts" / "bigquery" + out.mkdir(parents=True) + + @contextmanager + def _session_attach_fails(_p): + conn = duckdb.connect(":memory:") + try: + # Force ATTACH 'project=...' to raise something other than + # "already attached" by intercepting via execute wrapper — + # since DuckDB's real connection doesn't accept attribute + # patches, we use a thin proxy for this test. + class _Proxy: + def __init__(self, real): + self._real = real + def execute(self, sql, *a, **kw): + if sql.startswith("ATTACH 'project="): + raise duckdb.Error("fake permission denied: missing serviceusage.services.use") + return self._real.execute(sql, *a, **kw) + def __getattr__(self, name): + return getattr(self._real, name) + def close(self): + return self._real.close() + yield _Proxy(conn) + finally: + conn.close() + + bq_bad = BqAccess( + BqProjects(billing="t", data="t"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session_attach_fails, + ) + + with pytest.raises(duckdb.Error, match="permission denied"): + materialize_query( + table_id="x", sql="SELECT 1", + bq=bq_bad, output_dir=str(out), + ) diff --git a/tests/test_sync_trigger_keboola_materialized.py b/tests/test_sync_trigger_keboola_materialized.py new file mode 100644 index 0000000..83a0b5f --- /dev/null +++ b/tests/test_sync_trigger_keboola_materialized.py @@ -0,0 +1,132 @@ +"""Scheduler-level test: when a Keboola row has query_mode='materialized', +_run_materialized_pass dispatches to connectors.keboola.extractor.materialize_query +(not BQ's). Existing BQ-materialized rows continue using BqAccess. + +Mirrors the unit-style of tests/test_sync_trigger_materialized.py — patches +the inner extractor entry points instead of going through the API layer. +""" +import duckdb +import pytest +from contextlib import contextmanager +from pathlib import Path +from unittest.mock import patch, MagicMock + +from src.db import _ensure_schema +from src.repositories.table_registry import TableRegistryRepository +from connectors.bigquery.access import BqAccess, BqProjects + + +@pytest.fixture +def system_db(tmp_path, monkeypatch): + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _ensure_schema(conn) + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + yield conn + conn.close() + + +@pytest.fixture +def stub_bq(): + @contextmanager + def _session(_p): + conn = duckdb.connect(":memory:") + try: + yield conn + finally: + conn.close() + + return BqAccess( + BqProjects(billing="t", data="t"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session, + ) + + +def test_run_materialized_pass_dispatches_keboola_to_keboola_extractor( + system_db, stub_bq, tmp_path, monkeypatch +): + """Keboola row with query_mode='materialized' must invoke the Keboola + materialize_query, not the BQ one.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="orders_recent", name="orders_recent", + source_type="keboola", query_mode="materialized", + source_query='SELECT * FROM kbc."in.c-sales"."orders" WHERE 1=1', + sync_schedule="every 1m", # always due + ) + + # Provide instance.yaml-shape config + env so the Keboola lazy-init succeeds. + monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token") + from app.api import sync as sync_mod + + # Patch get_value to return the keboola URL/token_env. + def _fake_get_value(*keys, default=None): + path = keys + if path == ("data_source", "keboola", "url"): + return "https://connection.keboola.com/" + if path == ("data_source", "keboola", "token_env"): + return "KEBOOLA_STORAGE_TOKEN" + if path == ("data_source", "bigquery", "max_bytes_per_materialize"): + return default if default is not None else 0 + return default + + # Pre-create the parquet for hash bookkeeping (kb materialize is patched + # so it won't write a real one). + parquet_dir = tmp_path / "data" / "extracts" / "keboola" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "orders_recent.parquet").write_bytes( + b"PAR1" + b"\x00" * 16 + b"PAR1" + ) + + bq_called = MagicMock() + kb_called = MagicMock(return_value={ + "table_id": "orders_recent", "rows": 1, "bytes": 100, + "md5": "abc123", "path": str(parquet_dir / "orders_recent.parquet"), + }) + + with patch("app.instance_config.get_value", _fake_get_value), \ + patch("connectors.bigquery.extractor.materialize_query", bq_called), \ + patch("connectors.keboola.extractor.materialize_query", kb_called): + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + assert kb_called.called, "Keboola materialize_query was not invoked" + assert not bq_called.called, ( + "BQ materialize_query was wrongly invoked for a Keboola row" + ) + assert "orders_recent" in summary["materialized"] + + +def test_run_materialized_pass_dispatches_bigquery_to_bq_extractor( + system_db, stub_bq, tmp_path +): + """Regression: BQ-materialized path keeps working unchanged.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="events_summary", name="events_summary", + source_type="bigquery", query_mode="materialized", + source_query="SELECT date, COUNT(*) FROM `proj.dataset.events` GROUP BY 1", + sync_schedule="every 1m", + ) + + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "events_summary.parquet").write_bytes( + b"PAR1" + b"\x00" * 16 + b"PAR1" + ) + + bq_called = MagicMock(return_value={ + "rows": 1, "size_bytes": 100, "query_mode": "materialized", + "hash": "abc123", + }) + kb_called = MagicMock() + + from app.api import sync as sync_mod + + with patch("app.api.sync._materialize_table", bq_called), \ + patch("connectors.keboola.extractor.materialize_query", kb_called): + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + assert bq_called.called + assert not kb_called.called + assert "events_summary" in summary["materialized"] diff --git a/tests/test_sync_trigger_materialized.py b/tests/test_sync_trigger_materialized.py new file mode 100644 index 0000000..6f94e91 --- /dev/null +++ b/tests/test_sync_trigger_materialized.py @@ -0,0 +1,484 @@ +"""_run_materialized_pass walks table_registry for materialized BQ rows +and runs each that is due via _materialize_table. + +Tests inject a stub BqAccess (factories never called by these tests since +_materialize_table is patched) and assert that scheduling, error +aggregation, sync_state hash, and the disable-sentinel all behave +correctly. +""" +import duckdb +import pytest +from contextlib import contextmanager +from pathlib import Path +from unittest.mock import patch, MagicMock + +from src.db import _ensure_schema +from src.repositories.table_registry import TableRegistryRepository +from src.repositories.sync_state import SyncStateRepository +from connectors.bigquery.access import BqAccess, BqProjects + + +@pytest.fixture +def system_db(tmp_path, monkeypatch): + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _ensure_schema(conn) + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + yield conn + conn.close() + + +@pytest.fixture +def stub_bq(): + """A BqAccess instance that the tests don't actually exercise (the test + patches `_materialize_table`); just needs to be a valid BqAccess so the + type contract doesn't break.""" + @contextmanager + def _session(_p): + conn = duckdb.connect(":memory:") + try: + yield conn + finally: + conn.close() + + return BqAccess( + BqProjects(billing="t", data="t"), + client_factory=lambda _p: MagicMock(), + duckdb_session_factory=_session, + ) + + +def test_materialized_pass_calls_materialize_for_due_rows(system_db, stub_bq, tmp_path): + repo = TableRegistryRepository(system_db) + repo.register( + id="orders_90d", name="orders_90d", + source_type="bigquery", query_mode="materialized", + source_query="SELECT 1 AS n", + sync_schedule="every 1m", # always due in tests (no prior sync) + ) + + # Pre-create the parquet so _file_hash returns non-empty + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "orders_90d.parquet").write_bytes( + b"PAR1" + b"\x00" * 16 + b"PAR1" + ) + + from app.api import sync as sync_mod + + with patch("app.api.sync._materialize_table") as mock_mat: + mock_mat.return_value = { + "rows": 1, "size_bytes": 100, "query_mode": "materialized", + } + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + mock_mat.assert_called_once() + call_kwargs = mock_mat.call_args.kwargs + assert call_kwargs["table_id"] == "orders_90d" + assert "SELECT 1 AS n" in call_kwargs["sql"] + assert call_kwargs["bq"] is stub_bq + # Default cap (10 GiB) flows through when no instance.yaml override + assert call_kwargs["max_bytes"] == 10 * 2**30 + assert "orders_90d" in summary["materialized"] + assert not summary["errors"] + + +def test_materialized_pass_skips_undue_rows(system_db, stub_bq): + repo = TableRegistryRepository(system_db) + repo.register( + id="orders_daily", name="orders_daily", + source_type="bigquery", query_mode="materialized", + source_query="SELECT 1", + sync_schedule="daily 03:00", + ) + state = SyncStateRepository(system_db) + state.update_sync( + table_id="orders_daily", rows=1, file_size_bytes=10, hash="x", + ) + + from app.api import sync as sync_mod + + with patch("app.api.sync._materialize_table") as mock_mat: + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + mock_mat.assert_not_called() + assert "orders_daily" in summary["skipped"] + + +def test_materialized_pass_skips_non_materialized_rows(system_db, stub_bq): + repo = TableRegistryRepository(system_db) + repo.register(id="t1", name="t1", source_type="keboola", query_mode="local") + repo.register(id="t2", name="t2", source_type="bigquery", query_mode="remote") + + from app.api import sync as sync_mod + + with patch("app.api.sync._materialize_table") as mock_mat: + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + mock_mat.assert_not_called() + assert summary == {"materialized": [], "skipped": [], "errors": []} + + +def test_materialized_pass_collects_errors_per_row(system_db, stub_bq, tmp_path): + """One row failing must not stop a healthy sibling.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="ok", name="ok", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + sync_schedule="every 1m", + ) + repo.register( + id="bad", name="bad", source_type="bigquery", + query_mode="materialized", source_query="SELECT broken", + sync_schedule="every 1m", + ) + + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "ok.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1") + + from app.api import sync as sync_mod + + def _fake(table_id, sql, bq, output_dir, max_bytes): + if table_id == "bad": + raise RuntimeError("simulated COPY failure") + return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"} + + with patch("app.api.sync._materialize_table", side_effect=_fake): + summary = sync_mod._run_materialized_pass(system_db, stub_bq) + + assert summary["materialized"] == ["ok"] + assert len(summary["errors"]) == 1 + assert summary["errors"][0]["table"] == "bad" + assert "simulated" in summary["errors"][0]["error"] + + +def test_materialized_pass_records_parquet_hash(system_db, stub_bq, tmp_path): + """sync_state.hash must be the MD5 of the parquet file — otherwise the + manifest reports an empty hash and every da sync re-downloads.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="hashed", name="hashed", + source_type="bigquery", query_mode="materialized", + source_query="SELECT 1", + sync_schedule="every 1m", + ) + + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + parquet_path = parquet_dir / "hashed.parquet" + + def _fake(**kwargs): + parquet_path.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1") + return {"rows": 1, "size_bytes": 24, "query_mode": "materialized"} + + from app.api import sync as sync_mod + + with patch("app.api.sync._materialize_table", side_effect=_fake): + sync_mod._run_materialized_pass(system_db, stub_bq) + + state = SyncStateRepository(system_db) + row = state.get_table_state("hashed") + assert row is not None + import hashlib + expected = hashlib.md5(b"PAR1" + b"\x00" * 16 + b"PAR1").hexdigest() + assert row["hash"] == expected + + +def test_run_sync_keboola_timeout_does_not_skip_materialized(tmp_path, monkeypatch): + """REGRESSION (Devin BUG_0001 on 2219255): when the Keboola extractor + subprocess raises TimeoutExpired, the materialized BQ pass and the + orchestrator rebuild must still fire. Pre-fix the timeout propagated + to the outer except handler and skipped the rest of _run_sync — on + a dual-source deployment a slow Keboola extractor would silently + block all materialized parquets and the master-view rebuild until + the next trigger.""" + import duckdb + import subprocess as _sp + from src.db import _ensure_schema + + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _ensure_schema(conn) + + repo = TableRegistryRepository(conn) + # One Keboola row (drives the extractor subprocess) + one materialized + # BQ row (must still run after the Keboola timeout). + repo.register( + id="kbc_a", name="kbc_a", source_type="keboola", + query_mode="local", bucket="in.c-foo", source_table="a", + ) + repo.register( + id="m1", name="m1", source_type="bigquery", + query_mode="materialized", + source_query="SELECT 1", + sync_schedule="every 1m", + ) + conn.close() + + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("KEBOOLA_STACK_URL", "https://example.invalid") + monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake") + + from app.api import sync as sync_mod + + # Subprocess raises TimeoutExpired the moment _run_sync calls it. + def _timeout(*a, **kw): + raise _sp.TimeoutExpired(cmd=["fake"], timeout=1) + + monkeypatch.setattr(sync_mod.subprocess, "run", _timeout) + + materialized_called = {"count": 0} + orchestrator_called = {"count": 0} + + def _spy_materialized(_conn, _bq): + materialized_called["count"] += 1 + return {"materialized": ["m1"], "skipped": [], "errors": []} + + class _OrchStub: + def rebuild(self): + orchestrator_called["count"] += 1 + return {} + + monkeypatch.setattr("app.api.sync._run_materialized_pass", _spy_materialized) + monkeypatch.setattr( + "src.orchestrator.SyncOrchestrator", lambda *a, **kw: _OrchStub(), + ) + monkeypatch.setattr( + "app.instance_config.get_data_source_type", lambda: "keboola", + ) + monkeypatch.setattr( + "app.instance_config.get_value", + lambda *args, **kw: ( + "my-bq-proj" if (args and args[-1] == "project") + else kw.get("default", "") + ), + ) + + sync_mod._run_sync() + + assert materialized_called["count"] == 1, ( + "materialized pass must run even when Keboola subprocess timed out" + ) + assert orchestrator_called["count"] == 1, ( + "orchestrator rebuild must run after Keboola timeout to publish " + "any partial / materialized parquets that did land" + ) + + +def test_run_sync_runs_materialized_pass_on_bq_only_deployment( + tmp_path, monkeypatch, +): + """REGRESSION (Devin BUG_0002 on 2fa44f2): on BigQuery-only deployments + `list_local('bigquery')` is always empty (BQ rows are remote or + materialized, never local). The pre-fix _run_sync early-returned in + that case → materialized pass + orchestrator rebuild were dead code. + Post-fix: run_extractor_subprocess flag skips just the Keboola + subprocess, and the materialized pass still fires.""" + import duckdb + from src.db import _ensure_schema + + db_path = tmp_path / "system.duckdb" + conn = duckdb.connect(str(db_path)) + _ensure_schema(conn) + + repo = TableRegistryRepository(conn) + # Materialized BQ row — would be invisible to list_local('bigquery'). + repo.register( + id="m1", name="m1", source_type="bigquery", + query_mode="materialized", + source_query="SELECT 1", + sync_schedule="every 1m", + ) + conn.close() + + monkeypatch.setenv("DATA_DIR", str(tmp_path / "data")) + + # Patch the heavy collaborators so we observe what _run_sync invoked + # without actually running BQ / orchestrator. + from app.api import sync as sync_mod + + materialized_called = {"count": 0} + orchestrator_called = {"count": 0} + + def _spy_materialized_pass(_conn, _bq): + materialized_called["count"] += 1 + return {"materialized": ["m1"], "skipped": [], "errors": []} + + class _OrchStub: + def rebuild(self): + orchestrator_called["count"] += 1 + return {} + + monkeypatch.setattr( + "app.api.sync._run_materialized_pass", + _spy_materialized_pass, + ) + monkeypatch.setattr( + "src.orchestrator.SyncOrchestrator", + lambda *a, **kw: _OrchStub(), + ) + # Pretend instance.yaml says data_source.type=bigquery + monkeypatch.setattr( + "app.instance_config.get_data_source_type", + lambda: "bigquery", + ) + # bq_project must be truthy so the materialized pass branch fires. + real_get_value = sync_mod.__dict__.get("get_value") + monkeypatch.setattr( + "app.instance_config.get_value", + lambda *args, **kw: ( + "my-bq-proj" if (args and args[-1] == "project") + else kw.get("default", "") + ), + ) + + sync_mod._run_sync() + + assert materialized_called["count"] == 1, ( + "materialized pass must run on BQ-only deployment (no local rows)" + ) + assert orchestrator_called["count"] == 1, ( + "orchestrator rebuild must run so materialized parquets are picked up" + ) + + +@pytest.mark.parametrize("yaml_value, expected_max", [ + (10737418240, 10737418240), # int — canonical + (10737418240.0, 10737418240), # float — YAML often parses as float + (1e10, 10000000000), # scientific notation + ("10737418240", 10737418240), # string — coerced + (0, None), # explicit disable sentinel + (None, None), # missing key + ("not-a-number", None), # malformed → fail-open + warn +]) +def test_materialized_pass_max_bytes_yaml_coercion( + system_db, stub_bq, tmp_path, monkeypatch, yaml_value, expected_max, +): + """`max_bytes_per_materialize` YAML value is coerced to int regardless of + the YAML scalar type (int / float / scientific / string). Devin found + that an `isinstance(raw, int)` guard silently disabled the guardrail + on float values.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="t", name="t", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + sync_schedule="every 1m", + ) + + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "t.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1") + + captured = {} + + def _spy(table_id, sql, bq, output_dir, max_bytes): + captured["max_bytes"] = max_bytes + return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"} + + from app.api import sync as sync_mod + + with patch( + "app.instance_config.get_value", + side_effect=lambda *a, **kw: ( + yaml_value if a[-1] == "max_bytes_per_materialize" + else kw.get("default", "") + ), + ), patch("app.api.sync._materialize_table", side_effect=_spy): + sync_mod._run_materialized_pass(system_db, stub_bq) + + assert captured["max_bytes"] == expected_max + + +def test_materialized_pass_keys_sync_state_by_name_not_id( + system_db, stub_bq, tmp_path, +): + """Devin review: when admin registers a name with mixed case (e.g. + "Orders_90d") the slug-derived id ("orders_90d") differs from name. + sync_state must be keyed by `name` so the manifest's `registry_by_name` + lookup resolves and `query_mode='materialized'` flows through to the + client. Otherwise CLI sees `query_mode='local'` and downloads the + wrong file or skips the row.""" + repo = TableRegistryRepository(system_db) + # Mixed-case name — id will be slugified to lowercase by the API path, + # but at the repo level we control both directly. + repo.register( + id="orders_90d", name="Orders_90d", + source_type="bigquery", query_mode="materialized", + source_query="SELECT 1", + sync_schedule="every 1m", + ) + + # Pre-create the parquet at the NAME-keyed path. + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "Orders_90d.parquet").write_bytes( + b"PAR1" + b"\x00" * 16 + b"PAR1" + ) + + from app.api import sync as sync_mod + + captured = {} + + def _spy(table_id, sql, bq, output_dir, max_bytes): + captured["table_id"] = table_id + return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"} + + with patch("app.api.sync._materialize_table", side_effect=_spy): + sync_mod._run_materialized_pass(system_db, stub_bq) + + # materialize_query was called with the NAME, not the id. + assert captured["table_id"] == "Orders_90d" + + # sync_state row keyed by name. + state = SyncStateRepository(system_db) + name_row = state.get_table_state("Orders_90d") + id_row = state.get_table_state("orders_90d") + assert name_row is not None, "sync_state should be keyed by name" + assert id_row is None, "sync_state should NOT be keyed by id" + + +def test_materialized_pass_zero_max_bytes_disables_guardrail( + system_db, stub_bq, tmp_path, monkeypatch +): + """`max_bytes_per_materialize: 0` in instance.yaml → None passed downstream + so materialize_query skips the dry-run entirely.""" + repo = TableRegistryRepository(system_db) + repo.register( + id="big", name="big", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + sync_schedule="every 1m", + ) + + parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data" + parquet_dir.mkdir(parents=True, exist_ok=True) + (parquet_dir / "big.parquet").write_bytes( + b"PAR1" + b"\x00" * 16 + b"PAR1" + ) + + monkeypatch.setattr( + "app.api.sync.get_value", + lambda *args, **kwargs: 0 if args[-1] == "max_bytes_per_materialize" else "", + raising=False, + ) + + from app.api import sync as sync_mod + + captured = {} + + def _spy(**kwargs): + captured.update(kwargs) + return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"} + + # The function reads `get_value` via a local import in the body — patch + # the import target instead. + with patch( + "app.instance_config.get_value", + side_effect=lambda *args, **kw: ( + 0 if args[-1] == "max_bytes_per_materialize" + else kw.get("default", "") + ), + ), patch("app.api.sync._materialize_table", side_effect=_spy): + sync_mod._run_materialized_pass(system_db, stub_bq) + + assert captured["max_bytes"] is None diff --git a/tests/test_table_registry_source_query.py b/tests/test_table_registry_source_query.py new file mode 100644 index 0000000..2578be5 --- /dev/null +++ b/tests/test_table_registry_source_query.py @@ -0,0 +1,93 @@ +"""Repository round-trips source_query column for query_mode='materialized'. + +Lives alongside the schema-v20 migration: register() now accepts source_query +as an Optional[str] kwarg, and the column flows through SELECT * via list/get. +""" +import duckdb +import pytest + +from src.db import _ensure_schema +from src.repositories.table_registry import TableRegistryRepository + + +@pytest.fixture +def repo(tmp_path): + conn = duckdb.connect(str(tmp_path / "system.duckdb")) + _ensure_schema(conn) + return TableRegistryRepository(conn) + + +def test_register_persists_source_query(repo): + repo.register( + id="orders_90d", + name="orders_90d", + source_type="bigquery", + query_mode="materialized", + source_query=( + "SELECT date, SUM(revenue) FROM `prj.ds.orders` " + "WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) GROUP BY 1" + ), + sync_schedule="every 6h", + ) + row = repo.get("orders_90d") + assert row is not None + assert row["query_mode"] == "materialized" + assert "INTERVAL 90 DAY" in row["source_query"] + assert row["sync_schedule"] == "every 6h" + + +def test_register_omitted_source_query_stays_null(repo): + """Default registrations (Keboola local) must not stamp an empty string.""" + repo.register(id="t1", name="t1", source_type="keboola", query_mode="local") + row = repo.get("t1") + assert row is not None + assert row["source_query"] is None + + +def test_list_all_includes_source_query(repo): + repo.register( + id="m1", name="m1", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + ) + rows = repo.list_all() + assert len(rows) == 1 + assert rows[0]["source_query"] == "SELECT 1" + + +def test_register_updates_source_query_on_conflict(repo): + """Re-registering the same id must overwrite source_query (admin edit).""" + repo.register( + id="m1", name="m1", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + ) + repo.register( + id="m1", name="m1", source_type="bigquery", + query_mode="materialized", source_query="SELECT 2", + ) + row = repo.get("m1") + assert row["source_query"] == "SELECT 2" + + +def test_register_preserves_registered_at_when_supplied(repo): + """source_query addition must not break the existing registered_at + preservation contract (admin edits keep the original timestamp).""" + from datetime import datetime, timezone + + original = datetime(2026, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + repo.register( + id="t", name="t", source_type="bigquery", + query_mode="materialized", source_query="SELECT 1", + registered_at=original, + ) + repo.register( + id="t", name="t", source_type="bigquery", + query_mode="materialized", source_query="SELECT 2", + registered_at=original, + ) + row = repo.get("t") + assert row["source_query"] == "SELECT 2" + # Don't assert exact equality on naive vs aware (DuckDB strips tz); + # just confirm the year+month+day didn't slide forward to 'now'. + assert row["registered_at"].year == 2026 + assert row["registered_at"].month == 1 + assert row["registered_at"].day == 1