feat(materialized): query_mode='materialized' for BigQuery + Keboola — admin SELECT → parquet → analyst
Closes the 'admin pre-stages a curated table/view for analysts' use case end-to-end across both supported source connectors. Backend (BigQuery + Keboola, schema v20): - schema v20 adds source_query TEXT to table_registry (renumbered from v19 after main's #150 RBAC migration also bumped to v19) - connectors/bigquery/extractor.py adds materialize_query(table_id, sql, *, bq, output_dir, max_bytes=...) — BqAccess session, dry-run cost guardrail (default 10 GiB, configurable via data_source.bigquery.max_bytes_per_materialize), idempotent ATTACH, rows/bytes/md5 metadata for sync_state - connectors/keboola/access.py — new KeboolaAccess facade (parallel of BqAccess) wrapping ATTACH 'keboola://...' AS kbc - connectors/keboola/extractor.py adds materialize_query — same shape, no dry-run analog (Keboola Storage API has different cost model); legacy bucket-download path skips query_mode='materialized' rows - app/api/sync.py:_run_materialized_pass dispatches by source_type to the right materialize_query - app/api/admin.py: RegisterTableRequest accepts source_query; model_validator coheres mode↔source_query↔bucket; PUT preserves omitted fields; deprecation marks (Field(deprecated=True)) on sync_strategy + profile_after_sync (no extractor reads them; profile_after_sync becomes inert — bug from earlier work where /api/sync/trigger never honored the flag); _BQ_OPTIONAL_FIELD_DEFAULTS injects defaults into GET /server-config payload Operator + CLI surface: - da admin register-table --query / --query-mode materialized - scripts/smoke-test-materialized-bq.sh — end-to-end smoke for operators Tests (incl. spike + integration + regression): - test_db_migration_v20, test_table_registry_source_query - test_bq_materialize, test_bq_cost_guardrail, test_bq_init_extract_skips - test_keboola_access, test_keboola_extension_query_passthrough (lock-in for the DuckDB extension capability), test_keboola_materialize, test_keboola_init_extract_skips, test_keboola_materialized_e2e (skipped without KBC_TEST_* creds) - test_sync_trigger_materialized, test_sync_trigger_keboola_materialized - test_api_admin_materialized, test_cli_admin_materialized - test_admin_bq_register, test_admin_discover_bigquery, test_admin_keboola_materialized, test_admin_phase_c_deprecation, test_admin_put_preservation, test_materialized_e2e Cost: BQ uses bigquery_query() (jobs API, view-aware) — works on tables, views, materialized views uniformly. Keboola uses ATTACH+COPY parquet through the DuckDB extension.
This commit is contained in:
parent
d0b7e122d6
commit
85d3810535
29 changed files with 4599 additions and 122 deletions
805
app/api/admin.py
805
app/api/admin.py
|
|
@ -12,7 +12,7 @@ import uuid
|
|||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
from pydantic import BaseModel, Field, field_validator, model_validator
|
||||
from typing import Optional, List, Dict, Any
|
||||
import duckdb
|
||||
|
||||
|
|
@ -172,6 +172,9 @@ _EDITABLE_SECTIONS: tuple[str, ...] = (
|
|||
"server",
|
||||
"auth",
|
||||
"ai",
|
||||
"openmetadata",
|
||||
"desktop",
|
||||
"corporate_memory",
|
||||
)
|
||||
|
||||
# "Danger-zone" sections — flipping these can lock operators out (auth.*) or
|
||||
|
|
@ -181,6 +184,404 @@ _EDITABLE_SECTIONS: tuple[str, ...] = (
|
|||
# the right warning copy.
|
||||
_DANGER_SECTIONS: tuple[str, ...] = ("auth", "server")
|
||||
|
||||
# Known-but-optional config fields per section. The /admin/server-config UI
|
||||
# uses this registry alongside the YAML payload to render fields the operator
|
||||
# might want to set even though they're not currently in instance.yaml.
|
||||
#
|
||||
# Schema per field:
|
||||
# {
|
||||
# "kind": "string" | "secret" | "bool" | "int" | "select" | "object" | "array",
|
||||
# "default": <type-appropriate default> (optional)
|
||||
# "hint": "<one-line operator-facing help>"
|
||||
# "options": [...] (only for kind="select")
|
||||
# "fields": {<name>: <fieldspec>} (only for kind="object", nested fields)
|
||||
# "item_kind": "string" | ... (only for kind="array", element type)
|
||||
# "required": bool (defaults False; UI marks the label)
|
||||
# }
|
||||
#
|
||||
# Subagents 2-4 will populate the bodies. The registry enables the UI to
|
||||
# render missing-but-known fields with placeholders + hints rather than
|
||||
# forcing the operator to discover them via the JSON-patch textarea or
|
||||
# hitting a runtime error first. The smoke fixture below
|
||||
# (data_source.bigquery.billing_project) proves the renderer wiring works
|
||||
# end-to-end so subagents 2-4 only have to add registry entries — they
|
||||
# don't need to touch admin_server_config.html.
|
||||
_KNOWN_FIELDS: dict[str, dict[str, dict]] = {
|
||||
"instance": {
|
||||
# No commonly-missing instance-level fields. The example YAML's
|
||||
# `name`/`subtitle` are always populated by `da setup` so they
|
||||
# render via the populated path; nothing to surface here.
|
||||
},
|
||||
"data_source": {
|
||||
"bigquery": {
|
||||
"kind": "object",
|
||||
"hint": "BigQuery connection knobs (read more in docs/DEPLOYMENT.md)",
|
||||
"fields": {
|
||||
"billing_project": {
|
||||
"kind": "string",
|
||||
"hint": (
|
||||
"GCP project to bill BQ jobs against. Set when SA can read "
|
||||
"the data project but cannot bill there (e.g. shared read-only "
|
||||
"data project). Defaults to data_source.bigquery.project. "
|
||||
"Mismatch → 403 USER_PROJECT_DENIED on every BQ call."
|
||||
),
|
||||
},
|
||||
"legacy_wrap_views": {
|
||||
"kind": "bool",
|
||||
"default": False,
|
||||
"hint": (
|
||||
"When true, registered VIEWs and MATERIALIZED_VIEWs get a DuckDB "
|
||||
"master view via bigquery_query() (jobs API) so analysts can "
|
||||
"SELECT * FROM viewname directly. When false (default), views are "
|
||||
"catalog-only — analysts use `da fetch` or `da query --remote`. "
|
||||
"ON for view-heavy deployments where most views are small enough "
|
||||
"to materialize without 'Response too large' (issue #101)."
|
||||
),
|
||||
},
|
||||
"max_bytes_per_materialize": {
|
||||
"kind": "int",
|
||||
"default": 10737418240,
|
||||
"hint": (
|
||||
"Cost guardrail for query_mode='materialized' BQ scans (dry-run "
|
||||
"check before running). Bytes processed; exceeds → registration "
|
||||
"or sync rejected. 0 disables the gate. Default 10737418240 = 10 GiB."
|
||||
),
|
||||
},
|
||||
},
|
||||
},
|
||||
"keboola": {
|
||||
"kind": "object",
|
||||
"hint": "Keboola Storage API connection",
|
||||
"fields": {
|
||||
"stack_url": {
|
||||
"kind": "string",
|
||||
"hint": (
|
||||
"e.g. https://connection.keboola.com (instance-specific stack URL). "
|
||||
"Validated against private-IP allowlist on save (SSRF guard)."
|
||||
),
|
||||
},
|
||||
"project_id": {
|
||||
"kind": "string",
|
||||
"hint": "Keboola project ID (numeric, but kept as string in YAML).",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"email": {
|
||||
# SMTP fields render via the populated path (always set when email
|
||||
# is enabled); no commonly-missing optional knobs at this layer.
|
||||
},
|
||||
"telegram": {
|
||||
# Rarely missing; leave empty.
|
||||
},
|
||||
"jira": {
|
||||
# Webhook + REST credentials always present when Jira is configured.
|
||||
},
|
||||
"theme": {
|
||||
# Cosmetic only; rarely missing.
|
||||
},
|
||||
"server": {
|
||||
# TLS / hostname knobs are mostly env-side; nothing to surface here.
|
||||
},
|
||||
"auth": {
|
||||
"allowed_domain": {
|
||||
"kind": "string",
|
||||
"hint": (
|
||||
"Comma-separated list of allowed sign-in email domains (e.g. "
|
||||
"'acme.com,acme-internal.com'). Single domain works too. Empty → no "
|
||||
"domain restriction (any verified Google identity can sign in)."
|
||||
),
|
||||
},
|
||||
},
|
||||
"ai": {
|
||||
"base_url": {
|
||||
"kind": "string",
|
||||
"hint": (
|
||||
"Required for provider='openai_compat' (LiteLLM, OpenRouter, vLLM, etc.). "
|
||||
"Ignored when provider='anthropic'. Examples: https://litellm.example.com, "
|
||||
"https://openrouter.ai/api/v1."
|
||||
),
|
||||
},
|
||||
"structured_output": {
|
||||
"kind": "select",
|
||||
"options": ["strict", "json", "auto"],
|
||||
"default": "auto",
|
||||
"hint": (
|
||||
"JSON-schema enforcement strategy. strict=Layer 1 only "
|
||||
"(Anthropic/OpenAI native, fail otherwise). json=Layer 1 + Layer 2 "
|
||||
"fallback. auto=all three layers including prompt-based JSON (most "
|
||||
"compatible, least strict)."
|
||||
),
|
||||
},
|
||||
},
|
||||
"openmetadata": {
|
||||
"url": {
|
||||
"kind": "string",
|
||||
"hint": "Base URL of your OpenMetadata server (e.g. https://catalog.example.com).",
|
||||
},
|
||||
"token": {
|
||||
"kind": "secret",
|
||||
"hint": (
|
||||
"JWT bearer token. Use ${OPENMETADATA_TOKEN} env-var reference "
|
||||
"(don't paste secret directly)."
|
||||
),
|
||||
},
|
||||
"cache_ttl_seconds": {
|
||||
"kind": "int",
|
||||
"default": 3600,
|
||||
"hint": "How long to cache catalog responses in-process. Default 3600s (1h).",
|
||||
},
|
||||
"verify_ssl": {
|
||||
"kind": "bool",
|
||||
"default": True,
|
||||
"hint": (
|
||||
"TLS verification. Default true. Set false ONLY for internal CAs / "
|
||||
"self-signed certs — sends the JWT over an unverified channel."
|
||||
),
|
||||
},
|
||||
},
|
||||
"desktop": {
|
||||
"jwt_issuer": {
|
||||
"kind": "string",
|
||||
"default": "data-analyst",
|
||||
"hint": "JWT iss claim. Match what the desktop app verifies.",
|
||||
},
|
||||
"jwt_secret": {
|
||||
"kind": "secret",
|
||||
"hint": "JWT signing secret. Use ${DESKTOP_JWT_SECRET} env-var reference.",
|
||||
},
|
||||
"url_scheme": {
|
||||
"kind": "string",
|
||||
"default": "data-analyst",
|
||||
"hint": "Custom URL scheme registered by the desktop app (data-analyst://...).",
|
||||
},
|
||||
},
|
||||
# corporate_memory governance — optional. When the section is missing
|
||||
# from instance.yaml the system runs in legacy democratic-wiki mode
|
||||
# (no admin review). Schema mirrors config/instance.yaml.example
|
||||
# lines 224-317; renderer handles arbitrary depth + arrays + maps.
|
||||
"corporate_memory": {
|
||||
"distribution_mode": {
|
||||
"kind": "select",
|
||||
"options": ["mandatory_only", "admin_curated", "hybrid"],
|
||||
"default": "hybrid",
|
||||
"hint": (
|
||||
"How knowledge reaches users. mandatory_only = admin-only; "
|
||||
"admin_curated = admin + user voting as feedback; "
|
||||
"hybrid = default (mandatory from admin + optional from user voting)."
|
||||
),
|
||||
},
|
||||
"approval_mode": {
|
||||
"kind": "select",
|
||||
"options": ["review_queue", "auto_publish", "threshold"],
|
||||
"default": "review_queue",
|
||||
"hint": (
|
||||
"How AI-extracted items enter the system. review_queue = admin "
|
||||
"approval required (default); auto_publish = live immediately; "
|
||||
"threshold = high-confidence auto, low-confidence to queue."
|
||||
),
|
||||
},
|
||||
"review_period_months": {
|
||||
"kind": "int",
|
||||
"default": 6,
|
||||
"hint": "How often approved/mandatory items are flagged for re-review (months).",
|
||||
},
|
||||
"notify_on_new_items": {
|
||||
"kind": "bool",
|
||||
"default": True,
|
||||
"hint": "Notify km_admins when new pending items arrive.",
|
||||
},
|
||||
"sources": {
|
||||
"kind": "object",
|
||||
"hint": (
|
||||
"Knowledge-source ingestion. Each source has its own enabled "
|
||||
"flag + base confidence."
|
||||
),
|
||||
"fields": {
|
||||
"claude_local_md": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"enabled": {"kind": "bool", "default": True},
|
||||
"confidence_base": {
|
||||
"kind": "float",
|
||||
"default": 0.50,
|
||||
"hint": "Confidence assigned to extractions from CLAUDE.local.md (0-1).",
|
||||
},
|
||||
},
|
||||
},
|
||||
"session_transcripts": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"enabled": {"kind": "bool", "default": True},
|
||||
"confidence_base": {"kind": "float", "default": 0.60},
|
||||
"max_turns_per_session": {
|
||||
"kind": "int",
|
||||
"default": 100,
|
||||
"hint": "Truncate transcripts longer than this many turns.",
|
||||
},
|
||||
"detection_types": {
|
||||
"kind": "array",
|
||||
"item_kind": "string",
|
||||
"default": [
|
||||
"correction",
|
||||
"confirmation",
|
||||
"unprompted_definition",
|
||||
],
|
||||
"hint": (
|
||||
"Which extraction patterns to detect. Each entry "
|
||||
"is a detection-type tag."
|
||||
),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"extraction": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"model": {
|
||||
"kind": "string",
|
||||
"default": "claude-haiku-4-5-20251001",
|
||||
"hint": "LLM used to extract knowledge. Override for cost or quality.",
|
||||
},
|
||||
"sensitivity_check": {"kind": "bool", "default": True},
|
||||
"contradiction_check": {"kind": "bool", "default": True},
|
||||
},
|
||||
},
|
||||
"confidence": {
|
||||
"kind": "object",
|
||||
"hint": "Confidence scoring + decay rules.",
|
||||
"fields": {
|
||||
"base": {
|
||||
"kind": "map",
|
||||
"key_kind": "string",
|
||||
"value_kind": "float",
|
||||
"default": {
|
||||
"user_verification.correction": 0.90,
|
||||
"user_verification.unprompted_definition": 0.90,
|
||||
"user_verification.confirmation": 0.60,
|
||||
"admin_mandate": 1.00,
|
||||
"claude_local_md": 0.50,
|
||||
"session_transcript": 0.50,
|
||||
},
|
||||
"hint": (
|
||||
"Base score per source/detection. Keys are 'source_type' "
|
||||
"or 'source_type.detection_type' (the dot is data, not "
|
||||
"nesting)."
|
||||
),
|
||||
},
|
||||
"modifiers": {
|
||||
# map<string, map<string, float>>. The renderer's structured
|
||||
# editor for "map of objects with declared subfields" is a
|
||||
# TODO (see admin_server_config.html); for now this falls
|
||||
# back to a JSON textarea — admins editing it see the
|
||||
# schema doc inline via the hint.
|
||||
"kind": "map",
|
||||
"key_kind": "string",
|
||||
"value_kind": "object",
|
||||
"value_fields": {}, # signals the JSON-textarea fallback
|
||||
"hint": (
|
||||
"Per-key modifier step sizes applied to base when "
|
||||
"optional signals are present (3-level dotted paths). "
|
||||
"Edit as a JSON object — outer keys mirror confidence.base "
|
||||
"keys; inner objects map signal name to bonus float."
|
||||
),
|
||||
},
|
||||
"decay": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"mode": {
|
||||
"kind": "select",
|
||||
"options": ["linear", "exponential"],
|
||||
"default": "exponential",
|
||||
},
|
||||
"half_life_months": {
|
||||
"kind": "int",
|
||||
"default": 12,
|
||||
"hint": "Used when mode=exponential.",
|
||||
},
|
||||
"decay_rate_monthly": {
|
||||
"kind": "float",
|
||||
"default": 0.02,
|
||||
"hint": "Used when mode=linear.",
|
||||
},
|
||||
"floor": {
|
||||
"kind": "map",
|
||||
"key_kind": "string",
|
||||
"value_kind": "float",
|
||||
"default": {
|
||||
"admin_mandate": 0.50,
|
||||
"user_verification": 0.40,
|
||||
"default": 0.0,
|
||||
},
|
||||
"hint": (
|
||||
"Per-source minimum confidence — items never decay "
|
||||
"below this floor."
|
||||
),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"contradiction_detection": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"enabled": {"kind": "bool", "default": True},
|
||||
"max_candidates": {
|
||||
"kind": "int",
|
||||
"default": 10,
|
||||
"hint": "Max contradiction candidates to evaluate per new item.",
|
||||
},
|
||||
},
|
||||
},
|
||||
"entity_resolution": {
|
||||
"kind": "object",
|
||||
"fields": {
|
||||
"enabled": {"kind": "bool", "default": True},
|
||||
"entities": {
|
||||
"kind": "map",
|
||||
"key_kind": "string",
|
||||
"value_kind": "array",
|
||||
"value_item_kind": "string",
|
||||
"default": {
|
||||
"metrics": ["churn", "MRR", "ARR", "NPS", "CAC", "LTV"],
|
||||
"products": ["Platform", "API", "Dashboard"],
|
||||
},
|
||||
"hint": (
|
||||
"Domain-entity vocabulary. Key = domain category; value = "
|
||||
"canonical names list."
|
||||
),
|
||||
},
|
||||
},
|
||||
},
|
||||
"domain_owners": {
|
||||
"kind": "map",
|
||||
"key_kind": "string",
|
||||
"value_kind": "array",
|
||||
"value_item_kind": "string",
|
||||
"hint": (
|
||||
"Per-domain admin emails. Key = domain name; value = email list."
|
||||
),
|
||||
},
|
||||
"domains": {
|
||||
"kind": "array",
|
||||
"item_kind": "string",
|
||||
"default": [
|
||||
"finance",
|
||||
"engineering",
|
||||
"product",
|
||||
"data",
|
||||
"operations",
|
||||
"infrastructure",
|
||||
],
|
||||
"hint": (
|
||||
"Knowledge domains analysts can target. Each must match a key "
|
||||
"in domain_owners."
|
||||
),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Keys whose values must be redacted from the audit diff. We match
|
||||
# substring (case-insensitive) so `client_secret`, `api_token`,
|
||||
# `webapp_secret_key`, `bot_token`, `password`, `smtp_password`, etc. all
|
||||
|
|
@ -385,6 +786,43 @@ class ServerConfigUpdateRequest(BaseModel):
|
|||
)
|
||||
|
||||
|
||||
# Optional BQ fields whose runtime defaults are documented but which used to
|
||||
# be invisible in the editor when YAML omitted them. The data_source.bigquery
|
||||
# subtree renders as a JSON textarea; a key that's absent from the GET
|
||||
# payload literally cannot appear in the form for the operator to edit. We
|
||||
# surface them with their documented defaults so the UI always shows them as
|
||||
# editable knobs — see Phase J of the admin-tables-cleanup work.
|
||||
#
|
||||
# - billing_project: defaults to data project; explicit value needed when
|
||||
# the SA can read the data project but not bill against it.
|
||||
# - legacy_wrap_views: default False; toggling ON wraps BQ views via
|
||||
# `bigquery_query()` so analysts can `SELECT *` directly.
|
||||
# - max_bytes_per_materialize: cost guardrail for `query_mode='materialized'`
|
||||
# (default 10 GiB; 0 disables; null falls through to the default).
|
||||
_BQ_OPTIONAL_FIELD_DEFAULTS: Dict[str, Any] = {
|
||||
"billing_project": "",
|
||||
"legacy_wrap_views": False,
|
||||
"max_bytes_per_materialize": 10737418240,
|
||||
}
|
||||
|
||||
|
||||
def _ensure_bq_optional_fields(sections: Dict[str, Any]) -> None:
|
||||
"""In-place: add missing BQ optional fields to data_source.bigquery so the
|
||||
UI's JSON-textarea renders them as editable keys. Existing values are
|
||||
preserved — only absent keys are populated with their documented default.
|
||||
"""
|
||||
ds = sections.get("data_source")
|
||||
if not isinstance(ds, dict):
|
||||
return
|
||||
bq = ds.get("bigquery")
|
||||
if not isinstance(bq, dict):
|
||||
# No BQ subsection — leave alone. Non-BQ instances don't need these
|
||||
# knobs, and creating an empty bigquery dict would be misleading.
|
||||
return
|
||||
for key, default in _BQ_OPTIONAL_FIELD_DEFAULTS.items():
|
||||
bq.setdefault(key, default)
|
||||
|
||||
|
||||
@router.get("/server-config")
|
||||
async def get_server_config(
|
||||
user: dict = Depends(require_admin),
|
||||
|
|
@ -403,11 +841,20 @@ async def get_server_config(
|
|||
# file omits them — operator can populate from scratch without manual
|
||||
# JSON edits.
|
||||
sections = {section: redacted.get(section, {}) for section in _EDITABLE_SECTIONS}
|
||||
# Always surface the optional BQ knobs so the operator sees them in the
|
||||
# UI's JSON editor instead of having to know they exist (Phase J).
|
||||
_ensure_bq_optional_fields(sections)
|
||||
return {
|
||||
"sections": sections,
|
||||
"editable_sections": list(_EDITABLE_SECTIONS),
|
||||
"danger_sections": list(_DANGER_SECTIONS),
|
||||
"secret_key_patterns": list(_SECRET_KEY_PATTERNS),
|
||||
# Known-but-optional fields per section so the UI can render
|
||||
# placeholders for fields the operator hasn't set yet (Phase J).
|
||||
# Subagents 2-4 populate the bodies; the renderer ships now so the
|
||||
# mechanism is wired end-to-end and adding entries is purely a
|
||||
# data-edit in `_KNOWN_FIELDS` above.
|
||||
"known_fields": _KNOWN_FIELDS,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -654,7 +1101,17 @@ def _sanitize_for_audit(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|||
class RegisterTableRequest(BaseModel):
|
||||
name: str
|
||||
folder: Optional[str] = None
|
||||
sync_strategy: str = "full_refresh"
|
||||
sync_strategy: str = Field(
|
||||
default="full_refresh",
|
||||
deprecated=True,
|
||||
description=(
|
||||
"DEPRECATED: catalog/profiler metadata only. No extractor reads "
|
||||
"this field; every sync is a full overwrite regardless of value. "
|
||||
"profiler.is_partitioned() consumes it for parquet-layout "
|
||||
"detection. Field stays for back-compat; will be removed in a "
|
||||
"future major release."
|
||||
),
|
||||
)
|
||||
# Composite primary keys are real (session-grain MSA tables key on
|
||||
# `(session_id, event_date)`, browse rows on more). The frontend sends +
|
||||
# reads this as a list; backend stores it JSON-serialized in VARCHAR.
|
||||
|
|
@ -664,9 +1121,41 @@ class RegisterTableRequest(BaseModel):
|
|||
source_type: Optional[str] = None
|
||||
bucket: Optional[str] = None
|
||||
source_table: Optional[str] = None
|
||||
# Backs query_mode='materialized'. Stored verbatim in
|
||||
# table_registry.source_query (schema v20); the trigger pass runs it
|
||||
# through the DuckDB BQ extension via BqAccess and writes the result
|
||||
# to /data/extracts/bigquery/data/<id>.parquet.
|
||||
source_query: Optional[str] = None
|
||||
query_mode: str = "local"
|
||||
sync_schedule: Optional[str] = None
|
||||
profile_after_sync: bool = True
|
||||
profile_after_sync: bool = Field(
|
||||
default=True,
|
||||
deprecated=True,
|
||||
description=(
|
||||
"DEPRECATED: not consumed by the runtime (Agent 1 finding "
|
||||
"2026-05-01). Profiler runs unconditionally on every synced "
|
||||
"table; this flag has no effect. Field stays for back-compat."
|
||||
),
|
||||
)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _check_mode_query_coherence(self):
|
||||
"""Enforce query_mode ↔ source_query invariants up front so an admin
|
||||
can't persist a remote/local row carrying an orphan source_query, and
|
||||
materialized rows can't be registered without a SQL body."""
|
||||
sq = (self.source_query or "").strip() or None
|
||||
if self.query_mode == "materialized" and not sq:
|
||||
raise ValueError(
|
||||
"query_mode='materialized' requires a non-empty source_query"
|
||||
)
|
||||
if self.query_mode != "materialized" and sq:
|
||||
raise ValueError(
|
||||
"source_query is only valid when query_mode='materialized'"
|
||||
)
|
||||
# Normalise: stash the trimmed-or-None form so the persisted column
|
||||
# never carries surrounding whitespace or empty-string sentinels.
|
||||
self.source_query = sq
|
||||
return self
|
||||
|
||||
@field_validator("primary_key", mode="before")
|
||||
@classmethod
|
||||
|
|
@ -707,12 +1196,67 @@ class RegisterTableRequest(BaseModel):
|
|||
def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
||||
"""Enforce BQ-specific shape on a register/precheck request.
|
||||
|
||||
Mutates the model: forces ``query_mode='remote'`` and
|
||||
``profile_after_sync=False`` (per Decision 7 in #108) so a caller can't
|
||||
accidentally enqueue a parquet profiling pass for a remote view that
|
||||
has no local file. Raises HTTPException(422) for missing required
|
||||
fields and HTTPException(400) for unsafe identifiers / bogus project_id.
|
||||
Two BQ paths:
|
||||
|
||||
- ``query_mode='materialized'`` — admin-registered SQL writes a parquet on
|
||||
schedule. Requires ``source_query``; ``bucket`` / ``source_table`` are
|
||||
not used (the SQL inlines the references). Doesn't force any field; the
|
||||
Pydantic ``model_validator`` already gated the query/mode coherence.
|
||||
|
||||
- ``query_mode='remote'`` (or default) — remote view over a single BQ
|
||||
table. Requires ``bucket`` (BQ dataset) + ``source_table``. Mutates
|
||||
the model: forces ``query_mode='remote'`` and ``profile_after_sync=False``
|
||||
(per Decision 7 in #108) so a caller can't accidentally enqueue a
|
||||
parquet profiling pass for a remote view that has no local file.
|
||||
|
||||
Raises HTTPException(422) for missing required fields and
|
||||
HTTPException(400) for unsafe identifiers / bogus project_id.
|
||||
"""
|
||||
if req.query_mode == "materialized":
|
||||
# Materialized BQ rows: the SQL body replaces dataset+table refs.
|
||||
# Pydantic model_validator already verified source_query is non-empty;
|
||||
# all we still need is a valid project_id and a safe view name.
|
||||
if not req.source_query or not req.source_query.strip():
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail="bigquery materialized: 'source_query' is required",
|
||||
)
|
||||
raw_name = req.name or ""
|
||||
if raw_name.strip() != raw_name or not _is_safe_identifier(raw_name):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=(
|
||||
f"bigquery: view name {raw_name!r} is unsafe — must match "
|
||||
f"^[a-zA-Z_][a-zA-Z0-9_]{{0,63}}$ (DuckDB identifier rules) "
|
||||
"with no leading/trailing whitespace"
|
||||
),
|
||||
)
|
||||
from app.instance_config import get_value
|
||||
project_id = get_value("data_source", "bigquery", "project", default="")
|
||||
if not project_id:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=(
|
||||
"bigquery: data_source.bigquery.project is not set in "
|
||||
"instance.yaml; configure it via /admin/server-config or "
|
||||
"/api/admin/configure first"
|
||||
),
|
||||
)
|
||||
if not _is_safe_project_id(project_id):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=(
|
||||
f"bigquery: data_source.bigquery.project {project_id!r} "
|
||||
"is malformed — must match GCP project_id grammar "
|
||||
"^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
|
||||
),
|
||||
)
|
||||
# Phase C: profile_after_sync is now inert (Pydantic field marked
|
||||
# deprecated; not read by app/api/sync.py:410-438). The runtime
|
||||
# profiles every synced table unconditionally, so we no longer
|
||||
# force-set this here as a "signal."
|
||||
return
|
||||
|
||||
if not req.bucket or not req.bucket.strip():
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
|
|
@ -796,24 +1340,106 @@ def _validate_bigquery_register_payload(req: "RegisterTableRequest") -> None:
|
|||
"must match GCP project_id grammar ^[a-z][a-z0-9-]{4,28}[a-z0-9]$"
|
||||
),
|
||||
)
|
||||
# Force the BQ-required mode + flag (Decision 7). The orchestrator and
|
||||
# Force the BQ-required mode (Decision 7). The orchestrator and
|
||||
# extractor both assume remote; persisting `local` here would later create
|
||||
# a profiling job against a non-existent parquet file.
|
||||
# Phase C: profile_after_sync is now inert (deprecated, not read by the
|
||||
# runtime); no longer force-set here.
|
||||
req.query_mode = "remote"
|
||||
req.profile_after_sync = False
|
||||
|
||||
|
||||
class UpdateTableRequest(BaseModel):
|
||||
name: Optional[str] = None
|
||||
sync_strategy: Optional[str] = None
|
||||
sync_strategy: Optional[str] = Field(
|
||||
default=None,
|
||||
deprecated=True,
|
||||
description=(
|
||||
"DEPRECATED: catalog/profiler metadata only. See "
|
||||
"RegisterTableRequest.sync_strategy."
|
||||
),
|
||||
)
|
||||
primary_key: Optional[List[str]] = None
|
||||
description: Optional[str] = None
|
||||
source_type: Optional[str] = None
|
||||
bucket: Optional[str] = None
|
||||
source_table: Optional[str] = None
|
||||
source_query: Optional[str] = None
|
||||
query_mode: Optional[str] = None
|
||||
sync_schedule: Optional[str] = None
|
||||
profile_after_sync: Optional[bool] = None
|
||||
profile_after_sync: Optional[bool] = Field(
|
||||
default=None,
|
||||
deprecated=True,
|
||||
description=(
|
||||
"DEPRECATED: not consumed by the runtime. See "
|
||||
"RegisterTableRequest.profile_after_sync."
|
||||
),
|
||||
)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _check_mode_query_coherence(self):
|
||||
"""PUT semantics — only the fields explicitly in the body are
|
||||
validated. The body is overlaid on the existing row at the handler
|
||||
level (see ``update_table``), so omitted fields keep their stored
|
||||
values and the synthetic ``RegisterTableRequest`` constructed against
|
||||
the merged record runs the strict cross-field check before persist.
|
||||
|
||||
The only invariants enforceable from the PUT body alone:
|
||||
|
||||
- explicit ``source_query='SELECT ...'`` paired with ``query_mode``
|
||||
that isn't materialized → coherent reject (the SQL would be dead);
|
||||
- explicit ``source_query='SELECT ...'`` without any ``query_mode``
|
||||
in the body → reject; the operator must commit to materialized;
|
||||
- explicit empty/whitespace ``source_query=''`` paired with
|
||||
``query_mode='materialized'`` → reject (operator clearly
|
||||
mistyped — they sent the field).
|
||||
|
||||
Pre-fix this validator also rejected ``{"query_mode": "materialized",
|
||||
"sync_schedule": "every 12h"}`` because ``source_query`` was None
|
||||
— but that's the canonical "edit the schedule on a materialized
|
||||
row" use-case from the Edit modal, which always sends
|
||||
``query_mode`` to indicate intent. Devin BUG_0002 on PR #148
|
||||
commit 2219255.
|
||||
"""
|
||||
if self.query_mode is None and self.source_query is None:
|
||||
return self
|
||||
|
||||
sq_raw = self.source_query
|
||||
sq = (sq_raw or "").strip() or None
|
||||
|
||||
# Operator explicitly sent source_query as empty/whitespace while
|
||||
# claiming materialized — typo / bad form data, reject.
|
||||
if (
|
||||
self.query_mode == "materialized"
|
||||
and sq_raw is not None
|
||||
and not sq
|
||||
):
|
||||
raise ValueError(
|
||||
"query_mode='materialized' requires a non-empty source_query"
|
||||
)
|
||||
|
||||
# source_query only makes sense with materialized mode. Allow None
|
||||
# (omitted) to flow through; only reject when explicitly set with
|
||||
# the wrong mode.
|
||||
if (
|
||||
self.query_mode is not None
|
||||
and self.query_mode != "materialized"
|
||||
and sq
|
||||
):
|
||||
raise ValueError(
|
||||
"source_query is only valid when query_mode='materialized'"
|
||||
)
|
||||
if self.query_mode is None and sq:
|
||||
raise ValueError(
|
||||
"source_query requires query_mode='materialized' to be set "
|
||||
"in the same request"
|
||||
)
|
||||
|
||||
# Normalise: drop whitespace-only strings to None so the persisted
|
||||
# column is clean. Don't touch when source_query was None to begin
|
||||
# with — that signals "PUT didn't touch this field, keep existing".
|
||||
if sq_raw is not None:
|
||||
self.source_query = sq
|
||||
return self
|
||||
|
||||
@field_validator("primary_key", mode="before")
|
||||
@classmethod
|
||||
|
|
@ -853,8 +1479,20 @@ class ConfigureRequest(BaseModel):
|
|||
@router.get("/discover-tables")
|
||||
async def discover_tables(
|
||||
user: dict = Depends(require_admin),
|
||||
dataset: Optional[str] = None,
|
||||
):
|
||||
"""Discover all available tables from the configured data source."""
|
||||
"""Discover available tables from the configured data source.
|
||||
|
||||
For ``data_source.type='keboola'`` returns the full Storage API table
|
||||
list (single round-trip). For ``data_source.type='bigquery'``:
|
||||
|
||||
- Without ``dataset``: list datasets in the configured project.
|
||||
- With ``dataset=name``: list tables (BASE TABLE + VIEW) in that dataset.
|
||||
|
||||
Two-step shape avoids paying the per-dataset list_tables cost up-front
|
||||
on projects with hundreds of datasets — the UI populates the dataset
|
||||
dropdown first, then fetches tables only for the selected dataset.
|
||||
"""
|
||||
try:
|
||||
from app.instance_config import get_data_source_type
|
||||
source_type = get_data_source_type()
|
||||
|
|
@ -870,12 +1508,89 @@ async def discover_tables(
|
|||
client = KeboolaClient(token=token, url=url)
|
||||
tables = client.discover_all_tables()
|
||||
return {"tables": tables, "count": len(tables), "source": "keboola"}
|
||||
else:
|
||||
return {"tables": [], "count": 0, "source": source_type, "error": "Discovery not implemented for this source"}
|
||||
|
||||
if source_type == "bigquery":
|
||||
return _discover_bigquery(dataset=dataset)
|
||||
|
||||
return {
|
||||
"tables": [],
|
||||
"count": 0,
|
||||
"source": source_type,
|
||||
"error": f"Discovery not implemented for source_type={source_type!r}",
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Discovery failed: {e}")
|
||||
|
||||
|
||||
def _discover_bigquery(dataset: Optional[str]) -> Dict[str, Any]:
|
||||
"""List BQ datasets (when ``dataset`` is None) or tables-in-dataset.
|
||||
|
||||
Routes through ``BqAccess.client()`` so config / auth / error
|
||||
translation matches the rest of the BQ surface (#138 facade). Returns
|
||||
the same shape as the Keboola path so the UI doesn't have to branch.
|
||||
"""
|
||||
from connectors.bigquery.access import (
|
||||
get_bq_access,
|
||||
BqAccessError,
|
||||
translate_bq_error,
|
||||
)
|
||||
|
||||
try:
|
||||
bq = get_bq_access()
|
||||
client = bq.client()
|
||||
except BqAccessError as e:
|
||||
raise HTTPException(
|
||||
status_code=BqAccessError.HTTP_STATUS.get(e.kind, 500),
|
||||
detail={"error": e.message, "kind": e.kind, "details": e.details},
|
||||
)
|
||||
|
||||
try:
|
||||
if dataset is None:
|
||||
datasets = []
|
||||
for ds in client.list_datasets():
|
||||
datasets.append({
|
||||
"dataset_id": ds.dataset_id,
|
||||
"full_id": f"{ds.project}.{ds.dataset_id}",
|
||||
})
|
||||
return {
|
||||
"datasets": sorted(datasets, key=lambda d: d["dataset_id"]),
|
||||
"count": len(datasets),
|
||||
"source": "bigquery",
|
||||
}
|
||||
|
||||
# List tables in the named dataset. `list_tables` returns
|
||||
# `TableListItem` with `table_id` + `table_type` ('TABLE', 'VIEW',
|
||||
# 'MATERIALIZED_VIEW', 'EXTERNAL', 'SNAPSHOT'). UI maps TABLE → Type
|
||||
# selector "table" and VIEW/MATERIALIZED_VIEW → "view"; the rest are
|
||||
# passed through with their raw type so the operator can decide.
|
||||
tables = []
|
||||
for t in client.list_tables(dataset):
|
||||
tables.append({
|
||||
"table_id": t.table_id,
|
||||
"table_type": t.table_type,
|
||||
"full_id": f"{t.project}.{t.dataset_id}.{t.table_id}",
|
||||
})
|
||||
return {
|
||||
"tables": sorted(tables, key=lambda t: t["table_id"]),
|
||||
"count": len(tables),
|
||||
"source": "bigquery",
|
||||
"dataset": dataset,
|
||||
}
|
||||
except Exception as e:
|
||||
# `translate_bq_error` re-raises non-Google exceptions unchanged,
|
||||
# so wrap in HTTPException to keep the JSON-shape contract.
|
||||
try:
|
||||
err = translate_bq_error(e, bq.projects, bad_request_status="upstream_error")
|
||||
except Exception:
|
||||
raise HTTPException(status_code=502, detail=f"BQ discovery failed: {e}")
|
||||
raise HTTPException(
|
||||
status_code=BqAccessError.HTTP_STATUS.get(err.kind, 502),
|
||||
detail={"error": err.message, "kind": err.kind, "details": err.details},
|
||||
)
|
||||
|
||||
|
||||
@router.get("/registry")
|
||||
async def list_registry(
|
||||
user: dict = Depends(require_admin),
|
||||
|
|
@ -1121,6 +1836,10 @@ def register_table(
|
|||
if is_bigquery:
|
||||
_validate_bigquery_register_payload(request)
|
||||
|
||||
# Phase C: profile_after_sync is no longer passed — the field is
|
||||
# deprecated and inert at the runtime layer. The DB column keeps its
|
||||
# schema default; the registry response no longer reflects request
|
||||
# values for this flag.
|
||||
repo.register(
|
||||
id=table_id,
|
||||
name=request.name,
|
||||
|
|
@ -1132,9 +1851,9 @@ def register_table(
|
|||
source_type=request.source_type,
|
||||
bucket=request.bucket,
|
||||
source_table=request.source_table,
|
||||
source_query=request.source_query,
|
||||
query_mode=request.query_mode,
|
||||
sync_schedule=request.sync_schedule,
|
||||
profile_after_sync=request.profile_after_sync,
|
||||
)
|
||||
|
||||
# Audit entry — masked params; description kept raw (it's documentation).
|
||||
|
|
@ -1154,6 +1873,24 @@ def register_table(
|
|||
content={"id": table_id, "name": request.name, "status": "registered"},
|
||||
)
|
||||
|
||||
if request.query_mode == "materialized":
|
||||
# Materialized BQ rows are picked up by the trigger pass on the next
|
||||
# scheduled tick (or via POST /api/sync/trigger). No synchronous
|
||||
# rebuild — the COPY can scan multi-GB and would block the request.
|
||||
return JSONResponse(
|
||||
status_code=201,
|
||||
content={
|
||||
"id": table_id,
|
||||
"name": request.name,
|
||||
"status": "registered",
|
||||
"view_name": table_id,
|
||||
"message": (
|
||||
"Materialized — parquet will be written on the next sync "
|
||||
"tick. Trigger now via POST /api/sync/trigger."
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
# BQ post-register: rebuild extract + master views, with timeout fallback.
|
||||
# Decision 1: 200 on synchronous success, 202 on timeout, 500 if the
|
||||
# synchronous rebuild surfaced errors. Distinct from the 201 Keboola
|
||||
|
|
@ -1266,6 +2003,28 @@ def register_table_precheck(
|
|||
# checks identifier safety, validates project_id from instance.yaml).
|
||||
_validate_bigquery_register_payload(request)
|
||||
|
||||
# Materialized BQ rows have no `dataset.source_table` to round-trip —
|
||||
# the SQL body is the contract. Skip the BQ-jobs-API call and return a
|
||||
# validation-only precheck so the CLI's `--dry-run --query-mode
|
||||
# materialized` path doesn't crash on an empty fully-qualified name.
|
||||
if request.query_mode == "materialized":
|
||||
return {
|
||||
"ok": True,
|
||||
"table": {
|
||||
"name": request.name,
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": request.source_query,
|
||||
"rows": None,
|
||||
"size_bytes": None,
|
||||
"columns": [],
|
||||
"note": (
|
||||
"materialized precheck is validation-only — the SQL is "
|
||||
"evaluated for cost on each scheduled materialize tick"
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
# Round-trip the BQ jobs API to confirm the table exists and the SA can
|
||||
# see it. Imports kept local to avoid pulling google-cloud-bigquery into
|
||||
# the import chain on non-BQ instances.
|
||||
|
|
@ -1360,14 +2119,23 @@ async def update_table(
|
|||
merged.update(updates)
|
||||
merged.pop("id", None) # avoid duplicate id kwarg
|
||||
|
||||
# When switching the merged record away from materialized mode, drop
|
||||
# the stale source_query — the request validator can't clear it via
|
||||
# the `if v is not None` filter above. Without this, a remote/local
|
||||
# row would carry an orphan source_query in the registry.
|
||||
if merged.get("query_mode") != "materialized":
|
||||
merged["source_query"] = None
|
||||
|
||||
if merged.get("source_type") == "bigquery":
|
||||
# Reuse the register-time validator. It mutates the request to
|
||||
# force query_mode='remote' / profile_after_sync=False — apply
|
||||
# the same coercion to `merged` so the persisted row matches.
|
||||
# force query_mode='remote' / profile_after_sync=False (or to
|
||||
# leave a materialized row alone) — apply the same coercion to
|
||||
# `merged` so the persisted row matches.
|
||||
synthetic = RegisterTableRequest(
|
||||
name=merged.get("name") or table_id,
|
||||
bucket=merged.get("bucket"),
|
||||
source_table=merged.get("source_table"),
|
||||
source_query=merged.get("source_query"),
|
||||
source_type="bigquery",
|
||||
query_mode=merged.get("query_mode") or "remote",
|
||||
profile_after_sync=bool(merged.get("profile_after_sync") or False),
|
||||
|
|
@ -1380,6 +2148,7 @@ async def update_table(
|
|||
_validate_bigquery_register_payload(synthetic)
|
||||
merged["query_mode"] = synthetic.query_mode
|
||||
merged["profile_after_sync"] = synthetic.profile_after_sync
|
||||
merged["source_query"] = synthetic.source_query
|
||||
|
||||
repo.register(id=table_id, **merged)
|
||||
|
||||
|
|
|
|||
389
app/api/sync.py
389
app/api/sync.py
|
|
@ -36,6 +36,204 @@ def _file_hash(path: Path) -> str:
|
|||
return h.hexdigest()
|
||||
|
||||
|
||||
def _materialize_table(
|
||||
*,
|
||||
table_id: str,
|
||||
sql: str,
|
||||
bq,
|
||||
output_dir: str,
|
||||
max_bytes: Optional[int],
|
||||
) -> dict:
|
||||
"""Thin wrapper around `connectors.bigquery.extractor.materialize_query`
|
||||
so the trigger pass can be unit-tested by patching this seam without
|
||||
touching the real BqAccess factory or the duckdb import."""
|
||||
from connectors.bigquery.extractor import materialize_query
|
||||
return materialize_query(
|
||||
table_id=table_id, sql=sql, bq=bq,
|
||||
output_dir=output_dir, max_bytes=max_bytes,
|
||||
)
|
||||
|
||||
|
||||
def _run_materialized_pass(conn: duckdb.DuckDBPyConnection, bq) -> dict:
|
||||
"""Walk `table_registry` for `query_mode='materialized'` rows and run any
|
||||
that are due, dispatching by ``source_type`` to the correct connector's
|
||||
materialize_query. Honors per-table `sync_schedule` via `is_table_due()`,
|
||||
computes the file hash inline, and updates `sync_state` so the manifest
|
||||
can serve the row to `da sync` without re-hashing on every request.
|
||||
|
||||
BigQuery rows go through BqAccess + bigquery_query() (jobs API),
|
||||
optionally cost-guarded by ``max_bytes_per_materialize``.
|
||||
Keboola rows go through KeboolaAccess + ATTACH-and-COPY, no
|
||||
guardrail (extension has no dry-run primitive).
|
||||
|
||||
Returns:
|
||||
``{"materialized": [ids], "skipped": [ids], "errors": [{table, error}]}``
|
||||
|
||||
Errors are aggregated per row — one budget-blown table doesn't stop a
|
||||
healthy sibling. ``MaterializeBudgetError`` is caught and rendered with
|
||||
its structured fields so operator alerting can pick out the cap-vs-actual
|
||||
bytes from the log line.
|
||||
"""
|
||||
from src.scheduler import is_table_due
|
||||
from app.instance_config import get_value
|
||||
from connectors.bigquery.extractor import MaterializeBudgetError
|
||||
|
||||
bq_output_dir = str(Path(_get_data_dir()) / "extracts" / "bigquery")
|
||||
kb_output_dir = Path(_get_data_dir()) / "extracts" / "keboola" / "data"
|
||||
|
||||
# Sentinel: max_bytes <= 0 (or None) disables the guardrail. `get_value()`
|
||||
# treats YAML `null` as "missing" → returns the default; operators must use
|
||||
# the explicit `0` sentinel to disable. See config/instance.yaml.example.
|
||||
# YAML accepts floats too (e.g. `10737418240.0`), and operators may
|
||||
# write `1e10` for readability; coerce to int and tolerate non-numeric
|
||||
# entries by falling through to the disable path with a warning.
|
||||
raw_max = get_value(
|
||||
"data_source", "bigquery", "max_bytes_per_materialize",
|
||||
default=10 * 2**30,
|
||||
)
|
||||
try:
|
||||
n = int(raw_max) if raw_max is not None else 0
|
||||
except (TypeError, ValueError):
|
||||
logger.warning(
|
||||
"data_source.bigquery.max_bytes_per_materialize is not numeric "
|
||||
"(%r); cost guardrail disabled. Set an integer or 0 to disable.",
|
||||
raw_max,
|
||||
)
|
||||
n = 0
|
||||
bq_max_bytes = n if n > 0 else None
|
||||
|
||||
registry = TableRegistryRepository(conn)
|
||||
state = SyncStateRepository(conn)
|
||||
|
||||
summary = {"materialized": [], "skipped": [], "errors": []}
|
||||
keboola_access = None # lazy-init on first Keboola row
|
||||
|
||||
for row in registry.list_all():
|
||||
if row.get("query_mode") != "materialized":
|
||||
continue
|
||||
|
||||
# Convention across connectors: sync_state.table_id and the parquet
|
||||
# filename are keyed by `table_registry.name` (matches Keboola's
|
||||
# `_meta.table_name`) so the manifest's `registry_by_name` lookup
|
||||
# at `_build_manifest_for_user` resolves cleanly. Without this,
|
||||
# admins who register `name="Orders_90d"` (id slugified to
|
||||
# `orders_90d`) would see `query_mode` default to `"local"` in the
|
||||
# manifest because the lookup misses on `id`.
|
||||
ref_name = row["name"]
|
||||
|
||||
last = state.get_last_sync(ref_name)
|
||||
last_iso = last.isoformat() if last else None
|
||||
schedule = row.get("sync_schedule") or "every 1h"
|
||||
if not is_table_due(schedule, last_iso):
|
||||
summary["skipped"].append(ref_name)
|
||||
continue
|
||||
|
||||
source_type = row.get("source_type") or "bigquery" # legacy default
|
||||
|
||||
# Dispatch by source_type. BQ rows keep using `_materialize_table`
|
||||
# (the existing test seam); Keboola rows use the new Keboola
|
||||
# materialize_query via a lazily-initialized KeboolaAccess.
|
||||
try:
|
||||
if source_type == "bigquery":
|
||||
stats = _materialize_table(
|
||||
table_id=ref_name,
|
||||
sql=row["source_query"],
|
||||
bq=bq,
|
||||
output_dir=bq_output_dir,
|
||||
max_bytes=bq_max_bytes,
|
||||
)
|
||||
elif source_type == "keboola":
|
||||
if keboola_access is None:
|
||||
from connectors.keboola.access import KeboolaAccess
|
||||
keboola_url = get_value(
|
||||
"data_source", "keboola", "url", default=""
|
||||
) or ""
|
||||
token_env = get_value(
|
||||
"data_source", "keboola", "token_env",
|
||||
default="KEBOOLA_STORAGE_TOKEN",
|
||||
) or "KEBOOLA_STORAGE_TOKEN"
|
||||
keboola_token = os.environ.get(token_env, "")
|
||||
if not (keboola_url and keboola_token):
|
||||
summary["errors"].append({
|
||||
"table": ref_name,
|
||||
"error": (
|
||||
"Keboola URL/token not configured for "
|
||||
"materialized path (data_source.keboola.url "
|
||||
f"+ env {token_env})"
|
||||
),
|
||||
})
|
||||
continue
|
||||
keboola_access = KeboolaAccess(
|
||||
url=keboola_url, token=keboola_token,
|
||||
)
|
||||
kb_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
from connectors.keboola.extractor import (
|
||||
materialize_query as kb_materialize_query,
|
||||
)
|
||||
kb_stats = kb_materialize_query(
|
||||
table_id=ref_name,
|
||||
sql=row["source_query"],
|
||||
keboola_access=keboola_access,
|
||||
output_dir=kb_output_dir,
|
||||
)
|
||||
# Normalize Keboola materialize_query output to the shape the
|
||||
# BQ branch uses for downstream sync_state updates. KB returns
|
||||
# {table_id, path, rows, bytes, md5}; map to
|
||||
# {rows, size_bytes, hash}.
|
||||
stats = {
|
||||
"rows": kb_stats["rows"],
|
||||
"size_bytes": kb_stats["bytes"],
|
||||
"hash": kb_stats["md5"],
|
||||
"query_mode": "materialized",
|
||||
}
|
||||
else:
|
||||
summary["errors"].append({
|
||||
"table": ref_name,
|
||||
"error": (
|
||||
f"materialized path not supported for "
|
||||
f"source_type={source_type!r}"
|
||||
),
|
||||
})
|
||||
continue
|
||||
except MaterializeBudgetError as e:
|
||||
logger.warning(
|
||||
"Materialize cap exceeded for %s: %s bytes > %s bytes",
|
||||
e.table_id, f"{e.current:,}", f"{e.limit:,}",
|
||||
)
|
||||
summary["errors"].append({
|
||||
"table": ref_name,
|
||||
"error": str(e),
|
||||
"current": e.current,
|
||||
"limit": e.limit,
|
||||
})
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.exception("Materialize failed for %s", ref_name)
|
||||
summary["errors"].append({"table": ref_name, "error": str(e)})
|
||||
continue
|
||||
|
||||
# `materialize_query` returns the parquet's MD5 inline — hashing
|
||||
# there means we don't re-read a multi-GB file on the request
|
||||
# thread. Fallback to `_file_hash(parquet_path)` if for some
|
||||
# reason the stats dict didn't carry it (defensive).
|
||||
parquet_hash = stats.get("hash")
|
||||
if not parquet_hash:
|
||||
output_dir_for_hash = (
|
||||
bq_output_dir if source_type == "bigquery" else str(kb_output_dir.parent)
|
||||
)
|
||||
parquet_path = Path(output_dir_for_hash) / "data" / f"{ref_name}.parquet"
|
||||
parquet_hash = _file_hash(parquet_path)
|
||||
state.update_sync(
|
||||
table_id=ref_name,
|
||||
rows=stats["rows"],
|
||||
file_size_bytes=stats["size_bytes"],
|
||||
hash=parquet_hash,
|
||||
)
|
||||
summary["materialized"].append(ref_name)
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def _run_sync(tables: Optional[List[str]] = None):
|
||||
"""Run extractor as subprocess + orchestrator rebuild.
|
||||
|
||||
|
|
@ -106,20 +304,38 @@ def _run_sync(tables: Optional[List[str]] = None):
|
|||
except Exception as e:
|
||||
logger.warning("Auto-discovery failed: %s", e)
|
||||
|
||||
if not table_configs:
|
||||
logger.warning("No tables to sync for source_type=%s", source_type)
|
||||
return
|
||||
# CRITICAL: don't early-return when local-mode tables are empty.
|
||||
# `list_local("bigquery")` is always empty on BQ-only deployments
|
||||
# (BQ rows are always remote or materialized, never local), so an
|
||||
# early return would prevent the materialized pass AND the
|
||||
# orchestrator rebuild from ever firing on a BQ-only instance.
|
||||
# Devin BUG_0002 on PR #148 commit 2fa44f2. Just flag whether the
|
||||
# Keboola subprocess + custom-connectors should run; everything
|
||||
# below (materialized pass, orchestrator rebuild, profiler) runs
|
||||
# unconditionally so a registry with materialized rows but no
|
||||
# local rows still publishes them.
|
||||
run_extractor_subprocess = bool(table_configs)
|
||||
if not run_extractor_subprocess:
|
||||
logger.info(
|
||||
"No local-mode tables to sync for source_type=%s — "
|
||||
"skipping extractor subprocess; materialized pass + "
|
||||
"orchestrator rebuild still run.",
|
||||
source_type,
|
||||
)
|
||||
|
||||
# Serialize configs — strip non-serializable fields
|
||||
serializable = []
|
||||
for tc in table_configs:
|
||||
serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v)
|
||||
for k, v in tc.items() if v is not None})
|
||||
|
||||
# Run extractor subprocess with table configs via stdin
|
||||
# Subprocess does NOT open system.duckdb — no lock conflict
|
||||
env = {**os.environ}
|
||||
cmd = [sys.executable, "-c", """
|
||||
import sys as _sys
|
||||
|
||||
if run_extractor_subprocess:
|
||||
# Serialize configs — strip non-serializable fields
|
||||
serializable = []
|
||||
for tc in table_configs:
|
||||
serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v)
|
||||
for k, v in tc.items() if v is not None})
|
||||
|
||||
# Run extractor subprocess with table configs via stdin
|
||||
# Subprocess does NOT open system.duckdb — no lock conflict
|
||||
cmd = [sys.executable, "-c", """
|
||||
import json, sys, os, logging
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -147,58 +363,113 @@ print(json.dumps(result))
|
|||
sys.exit(compute_exit_code(result, len(configs)))
|
||||
"""]
|
||||
|
||||
import sys as _sys
|
||||
print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True)
|
||||
print(f"[SYNC] Starting extractor subprocess for {len(table_configs)} tables", file=_sys.stderr, flush=True)
|
||||
|
||||
result = subprocess.run(
|
||||
cmd, input=_json.dumps(serializable), capture_output=True, text=True,
|
||||
timeout=1800, env=env,
|
||||
cwd=str(Path(__file__).parent.parent.parent),
|
||||
)
|
||||
try:
|
||||
result = subprocess.run(
|
||||
cmd, input=_json.dumps(serializable), capture_output=True, text=True,
|
||||
timeout=1800, env=env,
|
||||
cwd=str(Path(__file__).parent.parent.parent),
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
# Catch the timeout LOCALLY so the materialized BQ pass and
|
||||
# orchestrator rebuild below still fire. Pre-fix the timeout
|
||||
# propagated to the outer except handler and skipped the rest
|
||||
# of `_run_sync` — on a dual-source deployment a slow Keboola
|
||||
# extractor would silently block all materialized parquets +
|
||||
# master-view rebuild until the next trigger. Devin BUG_0001
|
||||
# on PR #148 commit 2219255. Mirrors the per-custom-connector
|
||||
# timeout pattern below (line ~347).
|
||||
print(
|
||||
"[SYNC] Extractor timed out after 1800s — continuing to "
|
||||
"materialized pass + orchestrator rebuild",
|
||||
file=_sys.stderr, flush=True,
|
||||
)
|
||||
result = None
|
||||
|
||||
if result.stdout:
|
||||
print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True)
|
||||
if result.stderr:
|
||||
print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True)
|
||||
# Issue #81 Group B: three exit codes. 0 = full success,
|
||||
# 1 = full failure, 2 = partial. Partial is a data-quality
|
||||
# alert, not a crash — the orchestrator's per-table _meta
|
||||
# machinery already captured which tables succeeded; we just
|
||||
# need to log loudly so operator alerting can pick it up.
|
||||
if result.returncode == 0:
|
||||
print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)
|
||||
elif result.returncode == 2:
|
||||
if result is not None:
|
||||
if result.stdout:
|
||||
print(f"[SYNC] Extractor stdout: {result.stdout.strip()[-500:]}", file=_sys.stderr, flush=True)
|
||||
if result.stderr:
|
||||
print(f"[SYNC] Extractor stderr: {result.stderr[-500:]}", file=_sys.stderr, flush=True)
|
||||
# Issue #81 Group B: three exit codes. 0 = full success,
|
||||
# 1 = full failure, 2 = partial. Partial is a data-quality
|
||||
# alert, not a crash — the orchestrator's per-table _meta
|
||||
# machinery already captured which tables succeeded; we just
|
||||
# need to log loudly so operator alerting can pick it up.
|
||||
if result.returncode == 0:
|
||||
print(f"[SYNC] Extractor OK", file=_sys.stderr, flush=True)
|
||||
elif result.returncode == 2:
|
||||
print(
|
||||
f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables "
|
||||
f"succeeded, some failed; see stderr for per-table errors. "
|
||||
f"Successful tables will still be published by the orchestrator.",
|
||||
file=_sys.stderr, flush=True,
|
||||
)
|
||||
else:
|
||||
print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True)
|
||||
|
||||
# Run custom connectors (Tier A: local mount) — only when there
|
||||
# were local-mode tables to drive the extractor. Custom connectors
|
||||
# currently piggyback on the same env as the Keboola extractor.
|
||||
connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
|
||||
if connectors_dir.exists():
|
||||
for connector_dir in sorted(connectors_dir.iterdir()):
|
||||
if not connector_dir.is_dir():
|
||||
continue
|
||||
extractor = connector_dir / "extractor.py"
|
||||
if not extractor.exists():
|
||||
continue
|
||||
logger.info("Running custom connector: %s", connector_dir.name)
|
||||
try:
|
||||
custom_result = subprocess.run(
|
||||
[sys.executable, str(extractor)],
|
||||
env=env, capture_output=True, text=True, timeout=600,
|
||||
cwd=str(Path(__file__).parent.parent.parent),
|
||||
)
|
||||
if custom_result.returncode != 0:
|
||||
logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
|
||||
else:
|
||||
logger.info("Custom connector %s completed", connector_dir.name)
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error("Custom connector %s timed out", connector_dir.name)
|
||||
|
||||
# Materialized BigQuery pass — runs admin-registered SQL through the
|
||||
# DuckDB BQ extension (via BqAccess) and writes parquet for due rows.
|
||||
# The orchestrator rebuild below picks the parquets up via the
|
||||
# standard local-parquet discovery. Wrapped so a misconfigured BQ
|
||||
# facade doesn't kill the Keboola path.
|
||||
try:
|
||||
from app.instance_config import get_value as _get_value
|
||||
bq_project = _get_value(
|
||||
"data_source", "bigquery", "project", default=""
|
||||
) or ""
|
||||
if bq_project:
|
||||
from connectors.bigquery.access import get_bq_access
|
||||
from src.db import get_system_db as _get_system_db
|
||||
bq_access = get_bq_access()
|
||||
mat_conn = _get_system_db()
|
||||
try:
|
||||
mat_summary = _run_materialized_pass(mat_conn, bq_access)
|
||||
finally:
|
||||
mat_conn.close()
|
||||
print(
|
||||
f"[SYNC] Materialized BQ: {len(mat_summary['materialized'])} ok, "
|
||||
f"{len(mat_summary['skipped'])} skipped, "
|
||||
f"{len(mat_summary['errors'])} errors",
|
||||
file=_sys.stderr, flush=True,
|
||||
)
|
||||
for err in mat_summary["errors"]:
|
||||
print(
|
||||
f"[SYNC] {err['table']}: {err['error']}",
|
||||
file=_sys.stderr, flush=True,
|
||||
)
|
||||
except Exception as e:
|
||||
print(
|
||||
f"[SYNC] Extractor PARTIAL FAILURE (exit 2) — some tables "
|
||||
f"succeeded, some failed; see stderr for per-table errors. "
|
||||
f"Successful tables will still be published by the orchestrator.",
|
||||
f"[SYNC] Materialized BQ pass FAILED: {e}",
|
||||
file=_sys.stderr, flush=True,
|
||||
)
|
||||
else:
|
||||
print(f"[SYNC] Extractor FAILED (exit {result.returncode})", file=_sys.stderr, flush=True)
|
||||
|
||||
# Run custom connectors (Tier A: local mount)
|
||||
connectors_dir = Path(os.environ.get("CONNECTORS_DIR", str(Path(__file__).parent.parent.parent / "connectors" / "custom")))
|
||||
if connectors_dir.exists():
|
||||
for connector_dir in sorted(connectors_dir.iterdir()):
|
||||
if not connector_dir.is_dir():
|
||||
continue
|
||||
extractor = connector_dir / "extractor.py"
|
||||
if not extractor.exists():
|
||||
continue
|
||||
logger.info("Running custom connector: %s", connector_dir.name)
|
||||
try:
|
||||
custom_result = subprocess.run(
|
||||
[sys.executable, str(extractor)],
|
||||
env=env, capture_output=True, text=True, timeout=600,
|
||||
cwd=str(Path(__file__).parent.parent.parent),
|
||||
)
|
||||
if custom_result.returncode != 0:
|
||||
logger.error("Custom connector %s failed: %s", connector_dir.name, custom_result.stderr[-500:])
|
||||
else:
|
||||
logger.info("Custom connector %s completed", connector_dir.name)
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error("Custom connector %s timed out", connector_dir.name)
|
||||
traceback.print_exc()
|
||||
|
||||
# Rebuild master views (reads extract.duckdb files, no write conflict)
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
|
|
|||
|
|
@ -67,11 +67,19 @@ def register_table(
|
|||
source_type: str = typer.Option("keboola", help="Source type: keboola | bigquery | jira | local"),
|
||||
bucket: str = typer.Option("", help="Source bucket (Keboola) or dataset (BigQuery)"),
|
||||
source_table: str = typer.Option("", help="Source table name in the bucket/dataset"),
|
||||
query_mode: str = typer.Option("local", help="Query mode: local or remote (forced to 'remote' for bigquery)"),
|
||||
query_mode: str = typer.Option("local", help="Query mode: local | remote | materialized"),
|
||||
query: str = typer.Option(
|
||||
"",
|
||||
"--query",
|
||||
help=(
|
||||
"SQL body for query_mode='materialized' (BigQuery only). "
|
||||
"Inline SQL or `@path/to.sql` to read from disk."
|
||||
),
|
||||
),
|
||||
description: str = typer.Option("", help="Table description"),
|
||||
sync_schedule: str = typer.Option(
|
||||
"",
|
||||
help="Cron schedule (BigQuery only — note: scheduler not yet wired, see #79 / M3 of #108)",
|
||||
help="Cron schedule (e.g. 'every 6h' / 'daily 03:00'); honored by materialized BQ rows",
|
||||
),
|
||||
dry_run: bool = typer.Option(
|
||||
False,
|
||||
|
|
@ -81,11 +89,38 @@ def register_table(
|
|||
):
|
||||
"""Register a single table.
|
||||
|
||||
For BigQuery: dataset goes in --bucket, the BQ table/view name in
|
||||
--source-table, the DuckDB view name in NAME. The server forces
|
||||
query_mode=remote and profile_after_sync=False; --dry-run goes
|
||||
through /precheck and prints rows + size + columns without writing.
|
||||
Modes:
|
||||
- **local** (Keboola): batch pull, parquet on disk.
|
||||
- **remote** (BigQuery): view only, queries go to BQ. Requires
|
||||
`--bucket` + `--source-table`.
|
||||
- **materialized** (BigQuery): server-side scheduled SQL → parquet.
|
||||
Requires `--query` (inline or `@file.sql`); `--bucket` /
|
||||
`--source-table` ignored.
|
||||
|
||||
`--dry-run` goes through /precheck (BQ remote only — for materialized
|
||||
rows, dry-run is a no-op since the SQL itself is the contract).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve --query @file.sql shorthand.
|
||||
source_query = ""
|
||||
if query:
|
||||
if query.startswith("@"):
|
||||
sql_path = Path(query[1:])
|
||||
if not sql_path.exists():
|
||||
typer.echo(f"Error: SQL file not found: {sql_path}", err=True)
|
||||
raise typer.Exit(2)
|
||||
source_query = sql_path.read_text(encoding="utf-8").strip()
|
||||
else:
|
||||
source_query = query.strip()
|
||||
|
||||
if query_mode == "materialized" and not source_query:
|
||||
typer.echo(
|
||||
"Error: --query-mode materialized requires --query (literal SQL or @path.sql)",
|
||||
err=True,
|
||||
)
|
||||
raise typer.Exit(2)
|
||||
|
||||
payload = {
|
||||
"name": name,
|
||||
"source_type": source_type,
|
||||
|
|
@ -94,6 +129,11 @@ def register_table(
|
|||
"query_mode": query_mode,
|
||||
"description": description,
|
||||
}
|
||||
# Omit empty optional fields so the server-side validator doesn't see
|
||||
# `source_query=""` on a remote/local row (which would trigger the
|
||||
# "source_query forbidden" branch).
|
||||
if source_query:
|
||||
payload["source_query"] = source_query
|
||||
if sync_schedule:
|
||||
payload["sync_schedule"] = sync_schedule
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import shutil
|
|||
import threading
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import duckdb
|
||||
|
||||
|
|
@ -182,6 +182,14 @@ def _init_extract_locked(
|
|||
_create_remote_attach_table(conn, project_id)
|
||||
|
||||
for tc in table_configs:
|
||||
# Materialized rows are written by the sync trigger pass via
|
||||
# `materialize_query()` — they live as parquets in
|
||||
# /data/extracts/bigquery/data/, picked up by the orchestrator's
|
||||
# standard local-parquet discovery. Don't create a remote view
|
||||
# here (would shadow the parquet via name collision).
|
||||
if tc.get("query_mode") == "materialized":
|
||||
continue
|
||||
|
||||
table_name = tc["name"]
|
||||
dataset = tc.get("bucket", "")
|
||||
source_table = tc.get("source_table", table_name)
|
||||
|
|
@ -279,6 +287,183 @@ def _init_extract_locked(
|
|||
return stats
|
||||
|
||||
|
||||
class MaterializeBudgetError(RuntimeError):
|
||||
"""Raised when a `materialize_query` BQ dry-run estimate exceeds the
|
||||
configured `data_source.bigquery.max_bytes_per_materialize` cap.
|
||||
|
||||
The materialize trigger pass logs this and skips the row; the next
|
||||
scheduled tick re-tries (in case the underlying table size dropped
|
||||
or the operator raised the cap). Shape mirrors `BqAccessError` —
|
||||
`current` and `limit` for operator triage.
|
||||
"""
|
||||
|
||||
def __init__(self, message: str, *, table_id: str, current: int, limit: int):
|
||||
self.table_id = table_id
|
||||
self.current = current
|
||||
self.limit = limit
|
||||
super().__init__(message)
|
||||
|
||||
|
||||
def materialize_query(
|
||||
table_id: str,
|
||||
sql: str,
|
||||
*,
|
||||
bq, # connectors.bigquery.access.BqAccess (untyped here to avoid circular import at type-check)
|
||||
output_dir: str,
|
||||
max_bytes: Optional[int] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Run `sql` through the DuckDB BigQuery extension and write the result
|
||||
to `<output_dir>/data/<table_id>.parquet` atomically.
|
||||
|
||||
Designed for `query_mode='materialized'` table_registry rows. The SQL
|
||||
is admin-registered (validated upstream) and may reference DuckDB
|
||||
three-part identifiers (`bq."dataset"."table"`) resolved by the
|
||||
in-session ATTACH, OR native BQ identifiers via the `bigquery_query()`
|
||||
table function — both work because the session has the bigquery
|
||||
extension loaded with a SECRET token.
|
||||
|
||||
Cost guardrail: when `max_bytes` is a positive int, run a BQ dry-run
|
||||
via `bq.client()` first; raise `MaterializeBudgetError` if the
|
||||
estimate exceeds the cap. `max_bytes=None` or `max_bytes <= 0`
|
||||
disables the guardrail (config sentinel, see
|
||||
`data_source.bigquery.max_bytes_per_materialize`).
|
||||
|
||||
Dry-run is best-effort and fail-open: if the SQL uses DuckDB syntax
|
||||
that the native BQ client can't parse (e.g. `bq."ds"."t"`), the
|
||||
dry-run raises and we log a warning; the COPY still runs. This
|
||||
matches the BqAccess facade's "client is for native BQ SQL only"
|
||||
contract — operators who need the cap to engage write the registered
|
||||
SQL using native BQ identifiers (`\\`project.ds.t\\``).
|
||||
|
||||
Atomic write: result lands in `<id>.parquet.tmp` first, then
|
||||
`os.replace` swaps it in. A failed COPY leaves no partial file behind.
|
||||
|
||||
Args:
|
||||
table_id: Logical id from table_registry; becomes the parquet
|
||||
filename. Must pass `validate_identifier()` so it can't
|
||||
inject path traversal.
|
||||
sql: SELECT statement, no trailing semicolon.
|
||||
bq: A `BqAccess` instance — provides `duckdb_session()` for the
|
||||
COPY and `client()` for the dry-run.
|
||||
output_dir: Connector root, e.g. `/data/extracts/bigquery`.
|
||||
Parquet lands in `<output_dir>/data/<table_id>.parquet`.
|
||||
max_bytes: Optional cap on BQ bytes scanned. None or <= 0 disables.
|
||||
|
||||
Returns:
|
||||
{"rows": int, "size_bytes": int, "query_mode": "materialized"}
|
||||
|
||||
Raises:
|
||||
ValueError: if `table_id` is unsafe.
|
||||
MaterializeBudgetError: if `max_bytes > 0` and dry-run estimate exceeds it.
|
||||
BqAccessError: from `bq.duckdb_session()` (auth_failed / bq_lib_missing /
|
||||
not_configured) — caller catches and aggregates into the trigger
|
||||
pass summary.
|
||||
duckdb.Error: if the COPY itself fails (e.g. bad SQL, missing table).
|
||||
"""
|
||||
if not validate_identifier(table_id, "materialize table_id"):
|
||||
raise ValueError(f"unsafe table_id: {table_id!r}")
|
||||
|
||||
out_path = Path(output_dir)
|
||||
data_dir = out_path / "data"
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
parquet_path = data_dir / f"{table_id}.parquet"
|
||||
tmp_path = data_dir / f"{table_id}.parquet.tmp"
|
||||
if tmp_path.exists():
|
||||
tmp_path.unlink()
|
||||
|
||||
# Cost guardrail (best-effort — fail-open if dry-run can't parse the SQL).
|
||||
if max_bytes is not None and max_bytes > 0:
|
||||
try:
|
||||
from app.api.v2_scan import _bq_dry_run_bytes # reuse main's impl
|
||||
estimated = _bq_dry_run_bytes(bq, sql)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"BQ dry-run failed for materialize cost guardrail (fail-open): %s. "
|
||||
"If the SQL uses DuckDB three-part names like bq.\"ds\".\"t\", "
|
||||
"rewrite to native BQ identifiers (`project.ds.t`) for the "
|
||||
"guardrail to engage. Proceeding with COPY.",
|
||||
e,
|
||||
)
|
||||
estimated = 0
|
||||
if estimated > max_bytes:
|
||||
raise MaterializeBudgetError(
|
||||
f"dry-run estimate {estimated:,} bytes exceeds cap "
|
||||
f"{max_bytes:,} for table {table_id!r}",
|
||||
table_id=table_id,
|
||||
current=estimated,
|
||||
limit=max_bytes,
|
||||
)
|
||||
|
||||
# COPY through a BqAccess-managed session.
|
||||
with bq.duckdb_session() as conn:
|
||||
# ATTACH the data project — but only when no `bq` catalog is
|
||||
# already attached. Production sessions (real BqAccess) come with
|
||||
# only `:memory:` and need the ATTACH; test sessions pre-populate
|
||||
# `bq` as a fixture catalog and would error on a redundant ATTACH
|
||||
# (alias already in use) AND on the bigquery extension load when
|
||||
# the test runner has no cached extension. Detecting via
|
||||
# `duckdb_databases()` keeps the ATTACH path idempotent without
|
||||
# swallowing real errors (auth, cross-project permission,
|
||||
# malformed project_id) — those still propagate from the actual
|
||||
# ATTACH call.
|
||||
attached = {
|
||||
r[0] for r in conn.execute(
|
||||
"SELECT database_name FROM duckdb_databases()"
|
||||
).fetchall()
|
||||
}
|
||||
if "bq" not in attached:
|
||||
conn.execute(
|
||||
f"ATTACH 'project={bq.projects.data}' AS bq (TYPE bigquery, READ_ONLY)"
|
||||
)
|
||||
|
||||
try:
|
||||
safe_path = str(tmp_path).replace("'", "''")
|
||||
conn.execute(f"COPY ({sql}) TO '{safe_path}' (FORMAT PARQUET)")
|
||||
rows = conn.execute(
|
||||
f"SELECT count(*) FROM read_parquet('{safe_path}')"
|
||||
).fetchone()[0]
|
||||
except Exception:
|
||||
if tmp_path.exists():
|
||||
tmp_path.unlink()
|
||||
raise
|
||||
|
||||
# Compute the parquet hash inline before the atomic swap. The caller used
|
||||
# to re-read the file in `_run_materialized_pass` to hash it via
|
||||
# `_file_hash`, but that's a synchronous full-read on the FastAPI worker
|
||||
# thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
|
||||
# requests. Hashing here keeps the open-file handle hot from the COPY
|
||||
# round and removes the second read. Devil's-advocate review item.
|
||||
import hashlib
|
||||
h = hashlib.md5()
|
||||
with open(tmp_path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(8192), b""):
|
||||
h.update(chunk)
|
||||
parquet_hash = h.hexdigest()
|
||||
|
||||
size_bytes = tmp_path.stat().st_size
|
||||
os.replace(tmp_path, parquet_path)
|
||||
|
||||
rows = int(rows)
|
||||
if rows == 0:
|
||||
# 0 rows is indistinguishable from "the SQL is wrong and nobody
|
||||
# noticed" — surface it loudly so operators see it in the scheduler
|
||||
# log line and the per-row error aggregation. Caller decides whether
|
||||
# to alert.
|
||||
logger.warning(
|
||||
"Materialized %s produced 0 rows — verify the SQL filter is "
|
||||
"intentional. Parquet written: %s",
|
||||
table_id, parquet_path,
|
||||
)
|
||||
|
||||
return {
|
||||
"rows": rows,
|
||||
"size_bytes": size_bytes,
|
||||
"query_mode": "materialized",
|
||||
"hash": parquet_hash,
|
||||
}
|
||||
|
||||
|
||||
def _resolve_bq_project_id() -> str:
|
||||
"""Resolve ``data_source.bigquery.project`` honoring the overlay.
|
||||
|
||||
|
|
|
|||
43
connectors/keboola/access.py
Normal file
43
connectors/keboola/access.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
"""DuckDB session facade for the Keboola Storage API extension.
|
||||
|
||||
Parallel of `connectors/bigquery/access.py:BqAccess`. The materialized
|
||||
Keboola SQL path needs a one-shot DuckDB connection with the Keboola
|
||||
extension installed, loaded, and ATTACHed; this facade encapsulates
|
||||
that wiring so `_run_materialized_pass` doesn't need to know the
|
||||
extension name, the ATTACH alias, or how the token gets quoted into
|
||||
the URL literal.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from contextlib import contextmanager
|
||||
from typing import Iterator
|
||||
|
||||
import duckdb
|
||||
|
||||
|
||||
class KeboolaAccess:
|
||||
"""Lazy DuckDB session manager for the Keboola Storage API extension.
|
||||
|
||||
Single-use — call `.duckdb_session()` as a context manager once per
|
||||
materialized job.
|
||||
"""
|
||||
|
||||
def __init__(self, *, url: str, token: str) -> None:
|
||||
if not url or not token:
|
||||
raise ValueError("KeboolaAccess requires url and token")
|
||||
self._url = url
|
||||
self._token = token
|
||||
|
||||
@contextmanager
|
||||
def duckdb_session(self) -> Iterator[duckdb.DuckDBPyConnection]:
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute("INSTALL keboola FROM community")
|
||||
conn.execute("LOAD keboola")
|
||||
escaped_token = self._token.replace("'", "''")
|
||||
conn.execute(
|
||||
f"ATTACH '{self._url}' AS kbc "
|
||||
f"(TYPE keboola, TOKEN '{escaped_token}')"
|
||||
)
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
|
@ -17,6 +17,66 @@ from src.identifier_validation import (
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def materialize_query(
|
||||
table_id: str,
|
||||
sql: str,
|
||||
*,
|
||||
keboola_access, # KeboolaAccess (avoid circular import)
|
||||
output_dir: Path,
|
||||
) -> dict:
|
||||
"""Materialize an admin-registered SELECT against the Keboola Storage
|
||||
API extension into a parquet file.
|
||||
|
||||
Parallel of `connectors/bigquery/extractor.py:materialize_query`.
|
||||
Cost guardrail: the Keboola extension has no analog of BQ dry-run;
|
||||
Storage API cost is download-shaped (per-byte egress + Storage API
|
||||
job). Phase B ships without a guardrail and logs the byte count;
|
||||
a future PR can add a configurable `max_bytes_per_keboola_materialize`
|
||||
gate similar to BQ's `max_bytes_per_materialize`.
|
||||
"""
|
||||
import re
|
||||
import hashlib
|
||||
|
||||
# Defense: table_id is interpolated into the parquet filename.
|
||||
# Reject anything that's not a safe identifier.
|
||||
if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table_id):
|
||||
raise ValueError(f"unsafe table_id for materialize: {table_id!r}")
|
||||
|
||||
parquet_path = Path(output_dir) / f"{table_id}.parquet"
|
||||
safe_pq_lit = str(parquet_path).replace("'", "''")
|
||||
|
||||
with keboola_access.duckdb_session() as conn:
|
||||
# Run the admin SELECT and copy the result to parquet.
|
||||
# The COPY wrapper is identical to the existing legacy extract
|
||||
# path at extractor.py:209; the only difference is the SELECT is
|
||||
# admin-supplied rather than `SELECT * FROM kbc.bucket.table`.
|
||||
conn.execute(f"COPY ({sql}) TO '{safe_pq_lit}' (FORMAT PARQUET)")
|
||||
|
||||
# Read back row count.
|
||||
row_count = conn.execute(
|
||||
f"SELECT COUNT(*) FROM read_parquet('{safe_pq_lit}')"
|
||||
).fetchone()[0]
|
||||
|
||||
file_bytes = parquet_path.read_bytes()
|
||||
md5 = hashlib.md5(file_bytes).hexdigest()
|
||||
size = len(file_bytes)
|
||||
|
||||
if row_count == 0:
|
||||
logger.warning(
|
||||
"Materialized Keboola query for %s wrote 0 rows — verify the "
|
||||
"SQL filters and that the source bucket has data.",
|
||||
table_id,
|
||||
)
|
||||
|
||||
return {
|
||||
"table_id": table_id,
|
||||
"path": str(parquet_path),
|
||||
"rows": row_count,
|
||||
"bytes": size,
|
||||
"md5": md5,
|
||||
}
|
||||
|
||||
|
||||
def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None:
|
||||
"""Create the _meta table required by the extract.duckdb contract."""
|
||||
conn.execute("DROP TABLE IF EXISTS _meta")
|
||||
|
|
@ -99,6 +159,21 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
|
|||
table_name = tc["name"]
|
||||
query_mode = tc.get("query_mode", "local")
|
||||
|
||||
# Materialized rows are written by the sync trigger pass via
|
||||
# `materialize_query()` — they live as parquets in
|
||||
# /data/extracts/keboola/data/, picked up by the orchestrator's
|
||||
# standard local-parquet discovery. Don't extract here (would
|
||||
# double-write data via the source bucket reference and confuse
|
||||
# sync_state bookkeeping). Mirror of the BQ extractor's skip at
|
||||
# connectors/bigquery/extractor.py:190.
|
||||
if query_mode == "materialized":
|
||||
logger.info(
|
||||
"Skipping legacy extract for %s — query_mode='materialized', "
|
||||
"handled by _run_materialized_pass instead",
|
||||
tc.get("id") or tc.get("name"),
|
||||
)
|
||||
continue
|
||||
|
||||
# #81 Group D — refuse rows whose identifiers don't pass the
|
||||
# whitelist. The registry is admin-controlled but anyone with
|
||||
# write access can otherwise inject SQL via the CREATE VIEW /
|
||||
|
|
|
|||
227
scripts/smoke-test-materialized-bq.sh
Executable file
227
scripts/smoke-test-materialized-bq.sh
Executable file
|
|
@ -0,0 +1,227 @@
|
|||
#!/usr/bin/env bash
|
||||
# Smoke test — query_mode='materialized' for BigQuery.
|
||||
#
|
||||
# Runs the full happy-path + 3 adversarial scenarios against a live Agnes
|
||||
# instance that has BigQuery configured. Cheap (uses bigquery-public-data
|
||||
# samples; ~36 rows, < 1 KB scan) — safe to run against staging.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/smoke-test-materialized-bq.sh [host:port]
|
||||
#
|
||||
# Required environment:
|
||||
# AGNES_PAT — admin PAT for the target instance
|
||||
# BQ_TEST_BIG — (optional) name of a BQ table > 10 GiB to test the
|
||||
# cost guardrail. Defaults to a public dataset that
|
||||
# scans ~50 GB on full SELECT.
|
||||
#
|
||||
# Defaults: AGNES_HOST=http://localhost:8000.
|
||||
#
|
||||
# Cleans up the test rows on exit (trap), even on SIGINT.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
HOST="${1:-${AGNES_HOST:-http://localhost:8000}}"
|
||||
PAT="${AGNES_PAT:?AGNES_PAT must be set (admin token)}"
|
||||
BIG_TABLE="${BQ_TEST_BIG:-bigquery-public-data.github_repos.commits}"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# Test rows we'll create — captured for cleanup.
|
||||
CREATED_IDS=()
|
||||
|
||||
cleanup() {
|
||||
echo
|
||||
echo "--- Cleanup ---"
|
||||
for tid in "${CREATED_IDS[@]}"; do
|
||||
curl -sS -X DELETE "$HOST/api/admin/registry/$tid" \
|
||||
-H "Authorization: Bearer $PAT" -o /dev/null -w " DELETE %{http_code} $tid\n" || true
|
||||
done
|
||||
}
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
check() {
|
||||
local name="$1" ok="$2"
|
||||
if [ "$ok" = "true" ]; then
|
||||
echo " PASS $name"
|
||||
PASS=$((PASS + 1))
|
||||
else
|
||||
echo " FAIL $name"
|
||||
FAIL=$((FAIL + 1))
|
||||
fi
|
||||
}
|
||||
|
||||
http() {
|
||||
# POST/PUT/DELETE helper that returns the HTTP status + body separately.
|
||||
local method="$1" path="$2" body="${3:-}"
|
||||
if [ -n "$body" ]; then
|
||||
curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \
|
||||
-X "$method" "$HOST$path" \
|
||||
-H "Authorization: Bearer $PAT" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$body"
|
||||
else
|
||||
curl -sS -o /tmp/smoke-mat-body -w "%{http_code}" \
|
||||
-X "$method" "$HOST$path" \
|
||||
-H "Authorization: Bearer $PAT"
|
||||
fi
|
||||
}
|
||||
|
||||
echo "Materialized BQ smoke: $HOST"
|
||||
echo "Big table for cost-guardrail test: $BIG_TABLE"
|
||||
echo "---"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario A — Happy path: register tiny materialized table, trigger,
|
||||
# verify parquet on disk + manifest carries hash.
|
||||
# ---------------------------------------------------------------------------
|
||||
echo
|
||||
echo "[A] Happy path (Shakespeare sample, ~36 rows)"
|
||||
SQL_A='SELECT corpus, COUNT(*) AS c FROM `bigquery-public-data.samples.shakespeare` GROUP BY 1 ORDER BY 1'
|
||||
TID_A="smoke_mat_shakespeare_$(date +%s)"
|
||||
CREATED_IDS+=("$TID_A")
|
||||
|
||||
STATUS=$(http POST /api/admin/register-table "{
|
||||
\"name\": \"$TID_A\",
|
||||
\"source_type\": \"bigquery\",
|
||||
\"query_mode\": \"materialized\",
|
||||
\"source_query\": $(printf '%s' "$SQL_A" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
|
||||
\"sync_schedule\": \"every 1m\"
|
||||
}")
|
||||
[ "$STATUS" = "201" ] && check "register 201" true || { check "register 201 (got $STATUS)" false; cat /tmp/smoke-mat-body; }
|
||||
|
||||
echo " triggering sync..."
|
||||
http POST /api/sync/trigger '{}' >/dev/null
|
||||
sleep 5 # background task
|
||||
|
||||
# Manifest must list the row with query_mode + non-empty hash.
|
||||
http GET /api/sync/manifest >/dev/null
|
||||
HASH=$(python3 -c "
|
||||
import json
|
||||
m = json.load(open('/tmp/smoke-mat-body'))
|
||||
t = m.get('tables', {}).get('$TID_A')
|
||||
print(t.get('hash', '') if t else '')
|
||||
")
|
||||
[ -n "$HASH" ] && [ "$HASH" != "null" ] && check "manifest hash present" true || check "manifest hash present (got '$HASH')" false
|
||||
|
||||
# Parquet on disk (assumes co-located filesystem, e.g. local docker compose).
|
||||
PARQUET="${DATA_DIR:-./data}/extracts/bigquery/data/$TID_A.parquet"
|
||||
if [ -f "$PARQUET" ]; then
|
||||
ROWS=$(python3 -c "import duckdb; print(duckdb.connect().execute(\"SELECT count(*) FROM read_parquet('$PARQUET')\").fetchone()[0])" 2>/dev/null || echo "0")
|
||||
[ "$ROWS" -gt 0 ] && check "parquet has $ROWS rows" true || check "parquet rows ($ROWS)" false
|
||||
else
|
||||
echo " note: parquet at $PARQUET not visible from this host (skip if Agnes is remote)"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario B — Cost guardrail: register a large-scan materialized SQL,
|
||||
# trigger, expect MaterializeBudgetError logged + row skipped.
|
||||
# ---------------------------------------------------------------------------
|
||||
echo
|
||||
echo "[B] Cost guardrail (\`$BIG_TABLE\` full SELECT)"
|
||||
TID_B="smoke_mat_huge_$(date +%s)"
|
||||
CREATED_IDS+=("$TID_B")
|
||||
SQL_B="SELECT * FROM \`$BIG_TABLE\`"
|
||||
|
||||
STATUS=$(http POST /api/admin/register-table "{
|
||||
\"name\": \"$TID_B\",
|
||||
\"source_type\": \"bigquery\",
|
||||
\"query_mode\": \"materialized\",
|
||||
\"source_query\": $(printf '%s' "$SQL_B" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
|
||||
\"sync_schedule\": \"every 1m\"
|
||||
}")
|
||||
[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false
|
||||
|
||||
echo " triggering sync (expect cap to fire)..."
|
||||
http POST /api/sync/trigger '{}' >/dev/null
|
||||
sleep 5
|
||||
|
||||
# Manifest should NOT have a hash for the huge row (materialize was skipped).
|
||||
http GET /api/sync/manifest >/dev/null
|
||||
HUGE_HASH=$(python3 -c "
|
||||
import json
|
||||
m = json.load(open('/tmp/smoke-mat-body'))
|
||||
t = m.get('tables', {}).get('$TID_B')
|
||||
print(t.get('hash', '') if t else 'absent')
|
||||
")
|
||||
if [ "$HUGE_HASH" = "absent" ] || [ -z "$HUGE_HASH" ] || [ "$HUGE_HASH" = "null" ]; then
|
||||
check "huge row skipped (no hash in manifest)" true
|
||||
else
|
||||
check "huge row skipped (got hash '$HUGE_HASH' — guardrail did not fire)" false
|
||||
fi
|
||||
echo " grep server logs for: 'MaterializeBudgetError' or 'Materialize cap exceeded'"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario C — 0-row warning: SQL with always-false WHERE.
|
||||
# ---------------------------------------------------------------------------
|
||||
echo
|
||||
echo "[C] 0-row WARNING (filter to empty result)"
|
||||
TID_C="smoke_mat_empty_$(date +%s)"
|
||||
CREATED_IDS+=("$TID_C")
|
||||
SQL_C='SELECT corpus FROM `bigquery-public-data.samples.shakespeare` WHERE 1=0'
|
||||
|
||||
STATUS=$(http POST /api/admin/register-table "{
|
||||
\"name\": \"$TID_C\",
|
||||
\"source_type\": \"bigquery\",
|
||||
\"query_mode\": \"materialized\",
|
||||
\"source_query\": $(printf '%s' "$SQL_C" | python3 -c "import json,sys; print(json.dumps(sys.stdin.read()))"),
|
||||
\"sync_schedule\": \"every 1m\"
|
||||
}")
|
||||
[ "$STATUS" = "201" ] && check "register 201" true || check "register 201 (got $STATUS)" false
|
||||
|
||||
http POST /api/sync/trigger '{}' >/dev/null
|
||||
sleep 5
|
||||
|
||||
http GET /api/sync/manifest >/dev/null
|
||||
EMPTY_ROWS=$(python3 -c "
|
||||
import json
|
||||
m = json.load(open('/tmp/smoke-mat-body'))
|
||||
t = m.get('tables', {}).get('$TID_C')
|
||||
print(t.get('rows', 'absent') if t else 'absent')
|
||||
")
|
||||
[ "$EMPTY_ROWS" = "0" ] && check "empty-result rows=0 in manifest" true || check "empty-result rows ($EMPTY_ROWS)" false
|
||||
echo " grep server logs for: 'produced 0 rows'"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario D — Mode-switch transition clears stale source_query.
|
||||
# ---------------------------------------------------------------------------
|
||||
echo
|
||||
echo "[D] Mode-switch materialized → remote clears source_query"
|
||||
TID_D="smoke_mat_switch_$(date +%s)"
|
||||
CREATED_IDS+=("$TID_D")
|
||||
|
||||
STATUS=$(http POST /api/admin/register-table "{
|
||||
\"name\": \"$TID_D\",
|
||||
\"source_type\": \"bigquery\",
|
||||
\"query_mode\": \"materialized\",
|
||||
\"source_query\": \"SELECT 1\"
|
||||
}")
|
||||
[ "$STATUS" = "201" ] && check "register materialized" true || check "register materialized (got $STATUS)" false
|
||||
|
||||
# Switch to remote, providing required bucket+source_table.
|
||||
STATUS=$(http PUT "/api/admin/registry/$TID_D" "{
|
||||
\"query_mode\": \"remote\",
|
||||
\"bucket\": \"samples\",
|
||||
\"source_table\": \"shakespeare\"
|
||||
}")
|
||||
[ "$STATUS" = "200" ] && check "switch to remote 200" true || check "switch to remote (got $STATUS)" false
|
||||
|
||||
http GET /api/admin/registry >/dev/null
|
||||
SWITCHED_SQ=$(python3 -c "
|
||||
import json
|
||||
r = json.load(open('/tmp/smoke-mat-body'))
|
||||
row = next((t for t in r.get('tables', []) if t['id'] == '$TID_D'), None)
|
||||
print(row.get('source_query') if row else 'NOT_FOUND')
|
||||
")
|
||||
[ "$SWITCHED_SQ" = "None" ] || [ -z "$SWITCHED_SQ" ] || [ "$SWITCHED_SQ" = "null" ] \
|
||||
&& check "source_query cleared on switch" true \
|
||||
|| check "source_query cleared (got '$SWITCHED_SQ')" false
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summary
|
||||
# ---------------------------------------------------------------------------
|
||||
echo
|
||||
echo "---"
|
||||
echo "Passed: $PASS"
|
||||
echo "Failed: $FAIL"
|
||||
[ "$FAIL" -eq 0 ] || exit 1
|
||||
16
src/db.py
16
src/db.py
|
|
@ -39,7 +39,7 @@ def _maybe_instrument(con, db_tag: str):
|
|||
|
||||
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
|
||||
|
||||
SCHEMA_VERSION = 19
|
||||
SCHEMA_VERSION = 20
|
||||
|
||||
_SYSTEM_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS schema_version (
|
||||
|
|
@ -259,6 +259,7 @@ CREATE TABLE IF NOT EXISTS table_registry (
|
|||
source_type VARCHAR,
|
||||
bucket VARCHAR,
|
||||
source_table VARCHAR,
|
||||
source_query TEXT,
|
||||
sync_strategy VARCHAR DEFAULT 'full_refresh',
|
||||
query_mode VARCHAR DEFAULT 'local',
|
||||
sync_schedule VARCHAR,
|
||||
|
|
@ -918,6 +919,16 @@ _V16_TO_V17_MIGRATIONS = [
|
|||
# helper rather than a flat SQL list (the migrate-ladder calls it directly).
|
||||
|
||||
|
||||
# v19 -> v20: source_query column backs query_mode='materialized' for BigQuery.
|
||||
# Admin-registered SQL stored verbatim; scheduler runs it through the DuckDB BQ
|
||||
# extension (via BqAccess) and writes the result to
|
||||
# /data/extracts/bigquery/data/<id>.parquet so the existing manifest + da sync
|
||||
# flow distributes it to analysts. NULL on existing rows.
|
||||
_V19_TO_V20_MIGRATIONS = [
|
||||
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_query TEXT",
|
||||
]
|
||||
|
||||
|
||||
# Core role seed data — single source of truth. Used by both _seed_core_roles
|
||||
# (idempotent insert) and the v8→v9 backfill. Order matters: lowest privilege
|
||||
# first so implies references resolve cleanly when expand_implies does BFS.
|
||||
|
|
@ -1735,6 +1746,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
|
|||
_v17_to_v18_finalize(conn)
|
||||
if current < 19:
|
||||
_v18_to_v19_finalize(conn)
|
||||
if current < 20:
|
||||
for sql in _V19_TO_V20_MIGRATIONS:
|
||||
conn.execute(sql)
|
||||
conn.execute(
|
||||
"UPDATE schema_version SET version = ?, applied_at = current_timestamp",
|
||||
[SCHEMA_VERSION],
|
||||
|
|
|
|||
|
|
@ -72,7 +72,9 @@ class TableRegistryRepository:
|
|||
primary_key: Union[None, str, List[str]] = None,
|
||||
description: Optional[str] = None, registered_by: Optional[str] = None,
|
||||
source_type: Optional[str] = None, bucket: Optional[str] = None,
|
||||
source_table: Optional[str] = None, query_mode: str = "local",
|
||||
source_table: Optional[str] = None,
|
||||
source_query: Optional[str] = None,
|
||||
query_mode: str = "local",
|
||||
sync_schedule: Optional[str] = None, profile_after_sync: bool = True,
|
||||
registered_at: Optional[datetime] = None,
|
||||
) -> None:
|
||||
|
|
@ -85,18 +87,21 @@ class TableRegistryRepository:
|
|||
self.conn.execute(
|
||||
"""INSERT INTO table_registry (id, name, folder, sync_strategy,
|
||||
primary_key, description, registered_by, registered_at,
|
||||
source_type, bucket, source_table, query_mode,
|
||||
source_type, bucket, source_table, source_query, query_mode,
|
||||
sync_schedule, profile_after_sync)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
name = excluded.name, folder = excluded.folder,
|
||||
sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key,
|
||||
description = excluded.description, registered_at = excluded.registered_at,
|
||||
source_type = excluded.source_type, bucket = excluded.bucket,
|
||||
source_table = excluded.source_table, query_mode = excluded.query_mode,
|
||||
sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync""",
|
||||
source_table = excluded.source_table, source_query = excluded.source_query,
|
||||
query_mode = excluded.query_mode,
|
||||
sync_schedule = excluded.sync_schedule,
|
||||
profile_after_sync = excluded.profile_after_sync""",
|
||||
[id, name, folder, sync_strategy, encoded_pk, description, registered_by, ts,
|
||||
source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync],
|
||||
source_type, bucket, source_table, source_query, query_mode,
|
||||
sync_schedule, profile_after_sync],
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
|
|
@ -244,7 +244,12 @@ class TestBigQueryRegisterCoercion:
|
|||
resp = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next(t for t in resp.json()["tables"] if t["name"] == "orders")
|
||||
assert row["query_mode"] == "remote"
|
||||
assert row["profile_after_sync"] is False
|
||||
# Phase C: profile_after_sync is now inert. The field is accepted in
|
||||
# the request for back-compat but no longer overrides the DB default.
|
||||
# Was: assert row["profile_after_sync"] is False (when BQ register
|
||||
# forced it to False as a "signal"). Now the row carries the schema
|
||||
# default (True). Profiler runs unconditionally regardless.
|
||||
assert row.get("profile_after_sync") in (True, None)
|
||||
|
||||
|
||||
class TestBigQueryRegisterCollision:
|
||||
|
|
@ -698,20 +703,25 @@ class TestAdminTablesUI:
|
|||
c.cookies.clear()
|
||||
assert resp.status_code == 200, resp.text
|
||||
body = resp.text
|
||||
# Modal carries the source type so the JS can branch.
|
||||
# Modal carries the source type so legacy openRegisterModal({}) still
|
||||
# routes through the JS dispatcher.
|
||||
assert 'data-source-type="bigquery"' in body
|
||||
# BQ-only inputs.
|
||||
assert 'id="bqDataset"' in body
|
||||
assert 'id="bqSourceTable"' in body
|
||||
assert 'id="bqViewName"' in body
|
||||
assert 'id="bqSyncSchedule"' in body
|
||||
# Inline hint about scheduler-not-yet-wired (Decision 3).
|
||||
assert "scheduler" in body.lower()
|
||||
# BQ-specific panel (no discovery for BQ in M1).
|
||||
assert 'data-test="bq-register-panel"' in body
|
||||
# Keboola-only inputs must NOT be present.
|
||||
assert 'id="regTableId"' not in body
|
||||
assert 'id="regBucket"' not in body
|
||||
# Cron-style schedule examples are surfaced near the field
|
||||
# (operator-facing copy explains the syntax).
|
||||
assert "every 6h" in body or "daily 03:00" in body
|
||||
# BQ tab content section (legacy out-of-tab BQ register panel was
|
||||
# removed when the user-visible cleanup landed — each tab now owns
|
||||
# its own header + Register button).
|
||||
assert 'id="tab-content-bigquery"' in body
|
||||
assert 'id="bqRegisterBtn"' in body
|
||||
# Phase E: BQ + Keboola modals are now both always rendered (each
|
||||
# inside its own tab). On a BQ instance the BQ tab is the visible
|
||||
# one; the Keboola modal is just hidden in a non-active tab.
|
||||
|
||||
def test_renders_keboola_fields_when_data_source_keboola(self, seeded_app, monkeypatch):
|
||||
from app.instance_config import reset_cache
|
||||
|
|
@ -731,12 +741,22 @@ class TestAdminTablesUI:
|
|||
assert resp.status_code == 200
|
||||
body = resp.text
|
||||
assert 'data-source-type="keboola"' in body
|
||||
# Keboola path — discovery panel + Keboola inputs.
|
||||
assert 'id="discoveryResults"' in body
|
||||
assert 'id="regBucket"' in body
|
||||
assert 'id="regTableName"' in body
|
||||
# BQ-only inputs MUST NOT be present.
|
||||
assert 'id="bqDataset"' not in body
|
||||
# Keboola tab content section + Register-Keboola button (legacy
|
||||
# global Discovery panel was removed when the user-visible
|
||||
# cleanup landed — Keboola discovery is per-modal now).
|
||||
assert 'id="tab-content-keboola"' in body
|
||||
assert 'id="kbRegisterBtn"' in body
|
||||
# C3: legacy #registerModal is gone; the Phase F Keboola modal
|
||||
# at #registerKeboolaModal owns the Keboola register flow now.
|
||||
assert 'id="registerModal"' not in body
|
||||
assert 'id="regBucket"' not in body
|
||||
assert 'id="regTableName"' not in body
|
||||
# The Phase F Keboola modal's inputs are present.
|
||||
assert 'id="kbBucket"' in body
|
||||
assert 'id="kbViewName"' in body
|
||||
# Phase E: BQ form now always rendered (inside #tab-content-bigquery)
|
||||
# — operator can switch tabs to register a BQ table on a Keboola
|
||||
# instance. Tab is hidden by default but the form is in the DOM.
|
||||
finally:
|
||||
reset_cache()
|
||||
|
||||
|
|
@ -1546,11 +1566,17 @@ class TestKeboolaModalUsesDiscoveredTableId:
|
|||
"""Review IMPORTANT 5: the JS that builds the Keboola register payload
|
||||
must derive `source_table` from the discovered table's storage ID
|
||||
(`t.id` minus the bucket prefix), NOT the human-friendly display name
|
||||
(`t.name`). We verify by static template inspection — this is enough
|
||||
to catch a regression that drops the hidden field or reverts the JS
|
||||
to reading `regTableName`."""
|
||||
(`t.name`). We verify by static template inspection.
|
||||
|
||||
def test_template_has_hidden_source_table_field(self, seeded_app, monkeypatch):
|
||||
C3: the legacy #registerModal that owned regTableName / regSourceTable
|
||||
was removed. The Phase F #registerKeboolaModal uses kbBucket /
|
||||
kbSourceTable and a different payload builder (`_buildKeboolaPayload`)
|
||||
that already keeps storage identifier separate from display name. The
|
||||
tests below were rewritten to gate the Phase F flow."""
|
||||
|
||||
def test_phase_f_modal_separates_storage_id_from_display_name(
|
||||
self, seeded_app, monkeypatch,
|
||||
):
|
||||
from app.instance_config import reset_cache
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
|
|
@ -1567,22 +1593,25 @@ class TestKeboolaModalUsesDiscoveredTableId:
|
|||
c.cookies.clear()
|
||||
assert resp.status_code == 200, resp.text
|
||||
body = resp.text
|
||||
# Hidden field must exist so the JS can stash the bare
|
||||
# storage identifier separately from the display name.
|
||||
assert 'id="regSourceTable"' in body
|
||||
# And the build function must read from that hidden field
|
||||
# (NOT from regTableName, which is the display name).
|
||||
assert "getElementById('regSourceTable').value" in body
|
||||
# Phase F modal owns the Keboola Register flow now.
|
||||
assert 'id="registerKeboolaModal"' in body
|
||||
# The source-table input is NOT the same field as the
|
||||
# human-friendly view name input.
|
||||
assert 'id="kbSourceTable"' in body
|
||||
assert 'id="kbViewName"' in body
|
||||
# The payload builder reads kbSourceTable for the storage
|
||||
# identifier (used in SELECT * FROM kbc."b"."t").
|
||||
assert "getElementById('kbSourceTable').value" in body
|
||||
finally:
|
||||
reset_cache()
|
||||
|
||||
def test_template_does_not_send_display_name_as_source_table(
|
||||
def test_legacy_regtablename_payload_path_is_gone(
|
||||
self, seeded_app, monkeypatch,
|
||||
):
|
||||
"""Regression check: pre-fix the payload had
|
||||
`source_table: document.getElementById('regTableName').value`.
|
||||
After the fix, that exact line must be gone (the build function
|
||||
reads from the hidden `regSourceTable` first)."""
|
||||
"""Regression check: the pre-fix payload line
|
||||
`source_table: document.getElementById('regTableName').value`
|
||||
must remain absent. C3 removed the entire legacy modal so no
|
||||
regTableName-based payload can be reintroduced."""
|
||||
from app.instance_config import reset_cache
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
|
|
@ -1598,11 +1627,14 @@ class TestKeboolaModalUsesDiscoveredTableId:
|
|||
finally:
|
||||
c.cookies.clear()
|
||||
body = resp.text
|
||||
# No occurrence of the buggy direct assignment.
|
||||
assert (
|
||||
"source_table: document.getElementById('regTableName').value"
|
||||
not in body
|
||||
)
|
||||
# C3: the regTableName / regSourceTable inputs themselves are
|
||||
# gone with the legacy modal.
|
||||
assert 'id="regTableName"' not in body
|
||||
assert 'id="regSourceTable"' not in body
|
||||
finally:
|
||||
reset_cache()
|
||||
|
||||
|
|
|
|||
205
tests/test_admin_discover_bigquery.py
Normal file
205
tests/test_admin_discover_bigquery.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
"""GET /api/admin/discover-tables — BigQuery branch.
|
||||
|
||||
Two-step shape: dataset list (no `dataset` query param) → table list (with
|
||||
`dataset=name`). The UI populates the dataset autocomplete first, then
|
||||
fetches tables only after the operator picks a dataset, avoiding the
|
||||
per-dataset `list_tables()` cost on projects with hundreds of datasets.
|
||||
"""
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
|
||||
|
||||
def _auth(token):
|
||||
return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bq_instance(monkeypatch):
|
||||
"""Force `data_source.type='bigquery'` so the endpoint routes to the
|
||||
BQ branch."""
|
||||
fake_cfg = {
|
||||
"data_source": {
|
||||
"type": "bigquery",
|
||||
"bigquery": {"project": "my-test-project", "location": "us"},
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: fake_cfg,
|
||||
raising=False,
|
||||
)
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
yield fake_cfg
|
||||
reset_cache()
|
||||
|
||||
|
||||
def _stub_bq_with_client(client_mock):
|
||||
"""Build a BqAccess wired to return `client_mock` from .client(). The
|
||||
duckdb_session_factory is unused by the discover endpoint — supply a
|
||||
no-op."""
|
||||
from contextlib import contextmanager
|
||||
@contextmanager
|
||||
def _noop(_p):
|
||||
yield None
|
||||
return BqAccess(
|
||||
BqProjects(billing="my-test-project", data="my-test-project"),
|
||||
client_factory=lambda _p: client_mock,
|
||||
duckdb_session_factory=_noop,
|
||||
)
|
||||
|
||||
|
||||
def test_discover_returns_dataset_list(seeded_app, bq_instance, monkeypatch):
|
||||
"""Without `dataset` param: list datasets in the configured project."""
|
||||
client = MagicMock()
|
||||
ds_a = MagicMock()
|
||||
ds_a.dataset_id = "analytics"
|
||||
ds_a.project = "my-test-project"
|
||||
ds_b = MagicMock()
|
||||
ds_b.dataset_id = "raw"
|
||||
ds_b.project = "my-test-project"
|
||||
client.list_datasets.return_value = [ds_a, ds_b]
|
||||
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.access.get_bq_access",
|
||||
lambda: _stub_bq_with_client(client),
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.get("/api/admin/discover-tables", headers=_auth(token))
|
||||
assert r.status_code == 200, r.json()
|
||||
body = r.json()
|
||||
assert body["source"] == "bigquery"
|
||||
assert body["count"] == 2
|
||||
# Sorted alphabetically by dataset_id.
|
||||
assert [d["dataset_id"] for d in body["datasets"]] == ["analytics", "raw"]
|
||||
assert body["datasets"][0]["full_id"] == "my-test-project.analytics"
|
||||
|
||||
|
||||
def test_discover_returns_table_list_for_dataset(seeded_app, bq_instance, monkeypatch):
|
||||
"""With `?dataset=analytics`: list tables + views in that dataset."""
|
||||
client = MagicMock()
|
||||
t_orders = MagicMock()
|
||||
t_orders.table_id = "orders"
|
||||
t_orders.table_type = "TABLE"
|
||||
t_orders.project = "my-test-project"
|
||||
t_orders.dataset_id = "analytics"
|
||||
t_view = MagicMock()
|
||||
t_view.table_id = "orders_active"
|
||||
t_view.table_type = "VIEW"
|
||||
t_view.project = "my-test-project"
|
||||
t_view.dataset_id = "analytics"
|
||||
client.list_tables.return_value = [t_view, t_orders] # unsorted
|
||||
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.access.get_bq_access",
|
||||
lambda: _stub_bq_with_client(client),
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.get(
|
||||
"/api/admin/discover-tables?dataset=analytics",
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 200, r.json()
|
||||
body = r.json()
|
||||
assert body["source"] == "bigquery"
|
||||
assert body["dataset"] == "analytics"
|
||||
assert body["count"] == 2
|
||||
# Sorted by table_id.
|
||||
assert [t["table_id"] for t in body["tables"]] == ["orders", "orders_active"]
|
||||
by_id = {t["table_id"]: t for t in body["tables"]}
|
||||
assert by_id["orders"]["table_type"] == "TABLE"
|
||||
assert by_id["orders_active"]["table_type"] == "VIEW"
|
||||
# Verify dataset filter was passed through.
|
||||
client.list_tables.assert_called_once_with("analytics")
|
||||
|
||||
|
||||
def test_discover_keboola_branch_unchanged(seeded_app, monkeypatch):
|
||||
"""Negative — when source_type is keboola, BQ logic isn't reached.
|
||||
|
||||
Skipped when the Keboola SDK (`kbcstorage`) is not installed: CI
|
||||
runners don't ship it because the dev container only needs it for
|
||||
instances that actually configure source_type=keboola, and the
|
||||
route's lazy import would fail before the test stub gets a chance
|
||||
to fire. The branch-unchanged contract is tested separately by the
|
||||
Keboola integration suite when the package is present.
|
||||
"""
|
||||
pytest.importorskip("kbcstorage")
|
||||
fake_cfg = {"data_source": {"type": "keboola", "keboola": {}}}
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: fake_cfg,
|
||||
raising=False,
|
||||
)
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
|
||||
# Stub the Keboola client so the test doesn't reach the network.
|
||||
fake_client = MagicMock()
|
||||
fake_client.discover_all_tables.return_value = [{"id": "in.c-foo.bar"}]
|
||||
monkeypatch.setattr(
|
||||
"connectors.keboola.client.KeboolaClient",
|
||||
lambda *a, **kw: fake_client,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
try:
|
||||
r = c.get("/api/admin/discover-tables", headers=_auth(token))
|
||||
assert r.status_code == 200, r.json()
|
||||
body = r.json()
|
||||
assert body["source"] == "keboola"
|
||||
assert body["count"] == 1
|
||||
finally:
|
||||
reset_cache()
|
||||
|
||||
|
||||
def test_discover_bq_not_configured_returns_500(seeded_app, monkeypatch):
|
||||
"""When data_source.bigquery.project is missing, BqAccess returns its
|
||||
not_configured sentinel — endpoint surfaces the structured error."""
|
||||
fake_cfg = {
|
||||
"data_source": {
|
||||
"type": "bigquery",
|
||||
"bigquery": {}, # no project
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: fake_cfg,
|
||||
raising=False,
|
||||
)
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
try:
|
||||
r = c.get("/api/admin/discover-tables", headers=_auth(token))
|
||||
# not_configured is mapped to 500 in BqAccessError.HTTP_STATUS.
|
||||
assert r.status_code == 500, r.json()
|
||||
detail = r.json().get("detail", {})
|
||||
assert detail.get("kind") == "not_configured"
|
||||
finally:
|
||||
reset_cache()
|
||||
|
||||
|
||||
def test_admin_tables_html_wires_discover_buttons(seeded_app, bq_instance):
|
||||
"""Structural — the BQ register modal in the rendered HTML now has the
|
||||
Discover (datasets) and List tables buttons + datalists wired to the
|
||||
endpoint."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.get("/admin/tables", headers=_auth(token))
|
||||
assert r.status_code == 200, r.text
|
||||
html = r.text
|
||||
assert "discoverBqDatasets" in html
|
||||
assert "discoverBqTables" in html
|
||||
assert 'id="bqDatasetList"' in html
|
||||
assert 'id="bqTableList"' in html
|
||||
assert "list=\"bqDatasetList\"" in html
|
||||
assert "list=\"bqTableList\"" in html
|
||||
95
tests/test_admin_keboola_materialized.py
Normal file
95
tests/test_admin_keboola_materialized.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""Tests for Keboola materialized registration."""
|
||||
import pytest
|
||||
|
||||
|
||||
def test_register_keboola_materialized_accepts_source_query(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "orders_recent",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'",
|
||||
"sync_schedule": "daily 03:00",
|
||||
},
|
||||
)
|
||||
assert r.status_code == 201, r.text
|
||||
|
||||
|
||||
def test_register_keboola_materialized_rejects_missing_source_query(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "orders_recent",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "materialized",
|
||||
# source_query missing
|
||||
},
|
||||
)
|
||||
assert r.status_code == 422
|
||||
assert "source_query" in r.text
|
||||
|
||||
|
||||
def test_register_keboola_materialized_skips_bucket_check(seeded_app):
|
||||
"""Materialized rows don't need bucket/source_table — the SELECT inlines
|
||||
the references. Mirror of BQ materialized validator behavior."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "x",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1",
|
||||
# No bucket / source_table — must still succeed.
|
||||
},
|
||||
)
|
||||
assert r.status_code == 201, r.text
|
||||
|
||||
|
||||
def test_update_keboola_materialized_clears_stale_source_query_on_mode_switch(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
# Register materialized.
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "x",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1",
|
||||
},
|
||||
)
|
||||
assert r.status_code == 201
|
||||
|
||||
# PUT to switch back to local — source_query must clear.
|
||||
r = c.put(
|
||||
"/api/admin/registry/x",
|
||||
headers=auth,
|
||||
json={
|
||||
"source_type": "keboola",
|
||||
"query_mode": "local",
|
||||
"bucket": "in.c-foo",
|
||||
"source_table": "y",
|
||||
},
|
||||
)
|
||||
assert r.status_code == 200
|
||||
|
||||
r = c.get("/api/admin/registry", headers=auth)
|
||||
rows = r.json()["tables"]
|
||||
row = next(t for t in rows if t["id"] == "x")
|
||||
assert row.get("source_query") in (None, "")
|
||||
70
tests/test_admin_phase_c_deprecation.py
Normal file
70
tests/test_admin_phase_c_deprecation.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"""Verify Phase C deprecation marks + profile_after_sync becomes inert."""
|
||||
import pytest
|
||||
from app.api.admin import RegisterTableRequest, UpdateTableRequest
|
||||
|
||||
|
||||
def test_register_request_marks_sync_strategy_deprecated():
|
||||
schema = RegisterTableRequest.model_json_schema()
|
||||
field = schema["properties"]["sync_strategy"]
|
||||
assert field.get("deprecated") is True
|
||||
|
||||
|
||||
def test_register_request_marks_profile_after_sync_deprecated():
|
||||
schema = RegisterTableRequest.model_json_schema()
|
||||
field = schema["properties"]["profile_after_sync"]
|
||||
assert field.get("deprecated") is True
|
||||
|
||||
|
||||
def test_register_endpoint_accepts_profile_after_sync_for_backcompat(seeded_app):
|
||||
"""External clients sending profile_after_sync get no error — the
|
||||
field is silently ignored."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "x",
|
||||
"source_type": "keboola",
|
||||
"bucket": "in.c-foo",
|
||||
"source_table": "y",
|
||||
"query_mode": "local",
|
||||
"profile_after_sync": True, # legacy client may send this
|
||||
},
|
||||
)
|
||||
assert r.status_code == 201
|
||||
|
||||
|
||||
def test_register_endpoint_does_not_persist_profile_after_sync(seeded_app):
|
||||
"""The persisted row no longer carries the old profile_after_sync
|
||||
value (column may still exist in DB for back-compat, but admin path
|
||||
never writes a non-default value)."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
headers=auth,
|
||||
json={
|
||||
"name": "y",
|
||||
"source_type": "keboola",
|
||||
"bucket": "in.c-foo",
|
||||
"source_table": "y",
|
||||
"query_mode": "local",
|
||||
"profile_after_sync": True,
|
||||
},
|
||||
)
|
||||
assert r.status_code == 201
|
||||
r = c.get("/api/admin/registry", headers=auth)
|
||||
rows = r.json()["tables"]
|
||||
row = next(t for t in rows if t["id"] == "y")
|
||||
# The field's value in the registry response is now whatever the DB
|
||||
# default is (True per current schema). Critical: the request value
|
||||
# is NOT echoed back.
|
||||
# If the value is in the response at all (legacy back-compat in the
|
||||
# GET serializer), it's the schema default, not the request value.
|
||||
# If the value is absent (deprecated and stripped), that's also fine.
|
||||
if "profile_after_sync" in row:
|
||||
# Whatever this is, it's the schema default, not request-driven.
|
||||
assert row["profile_after_sync"] is True or row["profile_after_sync"] is None
|
||||
67
tests/test_admin_put_preservation.py
Normal file
67
tests/test_admin_put_preservation.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
"""Regression guard for PUT field preservation.
|
||||
|
||||
Locks the Pydantic semantics that the Phase F form-cleanup relies on:
|
||||
when the Edit modal omits a field from its payload, the existing value
|
||||
must survive. If a future maintainer flips ``model_dump()`` to
|
||||
``exclude_unset=True`` or otherwise changes the partial-update semantics,
|
||||
these tests fire before partitioned rows or primary keys silently
|
||||
regress.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
|
||||
def _auth(token):
|
||||
return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
def test_put_preserves_omitted_sync_strategy(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = _auth(token)
|
||||
|
||||
r = c.post("/api/admin/register-table", headers=auth, json={
|
||||
"name": "events_partitioned",
|
||||
"source_type": "keboola",
|
||||
"bucket": "in.c-events",
|
||||
"source_table": "events",
|
||||
"query_mode": "local",
|
||||
"sync_strategy": "partitioned",
|
||||
})
|
||||
assert r.status_code == 201, r.text
|
||||
|
||||
r = c.put("/api/admin/registry/events_partitioned", headers=auth, json={
|
||||
"sync_schedule": "daily 03:00",
|
||||
"description": "now daily",
|
||||
})
|
||||
assert r.status_code == 200
|
||||
|
||||
r = c.get("/api/admin/registry", headers=auth)
|
||||
rows = r.json()["tables"]
|
||||
row = next(t for t in rows if t["id"] == "events_partitioned")
|
||||
assert row["sync_strategy"] == "partitioned"
|
||||
|
||||
|
||||
def test_put_preserves_omitted_primary_key(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = _auth(token)
|
||||
|
||||
r = c.post("/api/admin/register-table", headers=auth, json={
|
||||
"name": "orders_with_pk",
|
||||
"source_type": "keboola",
|
||||
"bucket": "in.c-shop",
|
||||
"source_table": "orders",
|
||||
"query_mode": "local",
|
||||
"primary_key": ["order_id", "tenant_id"],
|
||||
})
|
||||
assert r.status_code == 201, r.text
|
||||
|
||||
r = c.put("/api/admin/registry/orders_with_pk", headers=auth, json={
|
||||
"description": "shop orders",
|
||||
})
|
||||
assert r.status_code == 200
|
||||
|
||||
r = c.get("/api/admin/registry", headers=auth)
|
||||
rows = r.json()["tables"]
|
||||
row = next(t for t in rows if t["id"] == "orders_with_pk")
|
||||
assert row["primary_key"] == ["order_id", "tenant_id"]
|
||||
322
tests/test_api_admin_materialized.py
Normal file
322
tests/test_api_admin_materialized.py
Normal file
|
|
@ -0,0 +1,322 @@
|
|||
"""Admin API accepts source_query when query_mode='materialized', rejects
|
||||
mismatches between mode and query field.
|
||||
|
||||
Tests that hit the remote-mode register path require `stub_bq_extractor`
|
||||
to bypass the post-register rebuild's real-BQ traffic. Materialized-only
|
||||
tests skip the BG path (the 201 fast-path returns before any rebuild
|
||||
fires) so they don't need the stub.
|
||||
|
||||
Covers PR #145 (re-implementation against 0.24.0 base):
|
||||
- RegisterTableRequest + UpdateTableRequest model_validators
|
||||
- _validate_bigquery_register_payload materialized branch (skips bucket/
|
||||
source_table checks, requires source_query)
|
||||
- register_table 201 response for materialized BQ rows (no synchronous
|
||||
materialize — cron tick or manual /api/sync/trigger picks them up)
|
||||
- update_table clears stale source_query when switching mode away from
|
||||
materialized
|
||||
|
||||
Shares the seeded_app + bq_instance fixtures from conftest /
|
||||
test_admin_bq_register.py for parity with the existing BQ test surface.
|
||||
"""
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _auth(token):
|
||||
return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stub_bq_extractor(monkeypatch):
|
||||
"""Mirror tests/test_admin_bq_register.py — bypasses real-BQ traffic
|
||||
in the post-register rebuild path so the test stays offline. Required
|
||||
whenever the test seeds a remote-mode BQ row via the HTTP API."""
|
||||
rebuild_mock = MagicMock(return_value={
|
||||
"project_id": "my-test-project",
|
||||
"tables_registered": 1, "errors": [], "skipped": False,
|
||||
})
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor.rebuild_from_registry",
|
||||
rebuild_mock,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.orchestrator.SyncOrchestrator",
|
||||
lambda *a, **kw: MagicMock(),
|
||||
)
|
||||
return rebuild_mock
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bq_instance(monkeypatch):
|
||||
"""Force instance.yaml to look like a BigQuery deployment.
|
||||
|
||||
Mirrors tests/test_admin_bq_register.py::bq_instance so the
|
||||
project_id read inside _validate_bigquery_register_payload succeeds.
|
||||
"""
|
||||
fake_cfg = {
|
||||
"data_source": {
|
||||
"type": "bigquery",
|
||||
"bigquery": {"project": "my-test-project", "location": "us"},
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: fake_cfg,
|
||||
raising=False,
|
||||
)
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
yield fake_cfg
|
||||
reset_cache()
|
||||
|
||||
|
||||
def _materialized_payload(**overrides):
|
||||
p = {
|
||||
"name": "orders_90d",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT date FROM `prj.ds.orders`",
|
||||
"sync_schedule": "every 6h",
|
||||
}
|
||||
p.update(overrides)
|
||||
return p
|
||||
|
||||
|
||||
def test_register_materialized_requires_source_query(seeded_app, bq_instance):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json={
|
||||
"name": "missing_query",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
# source_query missing
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert 400 <= r.status_code < 500, r.json()
|
||||
detail = str(r.json().get("detail", "")).lower()
|
||||
assert "source_query" in detail or "materialized" in detail
|
||||
|
||||
|
||||
def test_register_materialized_accepts_source_query(seeded_app, bq_instance):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json=_materialized_payload(name="orders_90d_a"),
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 201, r.json()
|
||||
body = r.json()
|
||||
assert body["status"] == "registered"
|
||||
assert "Materialized" in body.get("message", "")
|
||||
|
||||
|
||||
def test_register_remote_rejects_source_query(seeded_app, bq_instance):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json={
|
||||
"name": "live_orders",
|
||||
"source_type": "bigquery",
|
||||
"bucket": "analytics",
|
||||
"source_table": "orders",
|
||||
"query_mode": "remote",
|
||||
"source_query": "SELECT 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert 400 <= r.status_code < 500, r.json()
|
||||
|
||||
|
||||
def test_register_local_rejects_source_query(seeded_app):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json={
|
||||
"name": "kbc_orders",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "local",
|
||||
"source_query": "SELECT 1",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert 400 <= r.status_code < 500, r.json()
|
||||
|
||||
|
||||
def test_register_materialized_with_empty_source_query_rejected(seeded_app, bq_instance):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json=_materialized_payload(name="empty_q", source_query=""),
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert 400 <= r.status_code < 500, r.json()
|
||||
|
||||
|
||||
def test_update_source_query_alone_requires_query_mode(seeded_app, bq_instance, stub_bq_extractor):
|
||||
"""PUT body with source_query but no query_mode is incoherent — reject
|
||||
so non-materialized rows can't carry an orphan source_query."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
# Seed a remote-mode row.
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json={
|
||||
"name": "live_orphan",
|
||||
"source_type": "bigquery",
|
||||
"bucket": "analytics",
|
||||
"source_table": "orders",
|
||||
"query_mode": "remote",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code in (200, 202), r.json() # synchronous or async
|
||||
table_id = r.json()["id"]
|
||||
|
||||
r2 = c.put(
|
||||
f"/api/admin/registry/{table_id}",
|
||||
json={"source_query": "SELECT 1"},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert 400 <= r2.status_code < 500, r2.json()
|
||||
|
||||
|
||||
def test_update_schedule_only_on_materialized_row_succeeds(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
"""REGRESSION (Devin BUG_0002 on 2219255): an admin editing only the
|
||||
sync_schedule of a materialized row sends `{query_mode: 'materialized',
|
||||
sync_schedule: '...'}` (the Edit modal always sends query_mode for BQ
|
||||
rows). Pre-fix the UpdateTableRequest validator rejected this with 422
|
||||
because source_query wasn't in the body — even though the existing row
|
||||
already had one.
|
||||
|
||||
The PUT semantics overlay the body on the existing row, so omitted
|
||||
source_query keeps the stored value. The synthetic RegisterTableRequest
|
||||
constructed against the merged record at the handler still runs the
|
||||
strict cross-field check, so the truly-broken case (materialized
|
||||
without ANY source_query, even on existing) is still caught."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
# Seed a materialized row with a real source_query.
|
||||
r = c.post("/api/admin/register-table", json={
|
||||
"name": "schedule_edit_target",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1",
|
||||
"sync_schedule": "every 1h",
|
||||
}, headers=_auth(token))
|
||||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
# Edit ONLY the schedule. UI's saveTableEdit sends query_mode for BQ
|
||||
# rows even when the operator didn't change it.
|
||||
r2 = c.put(f"/api/admin/registry/{table_id}", json={
|
||||
"query_mode": "materialized",
|
||||
"sync_schedule": "every 12h",
|
||||
}, headers=_auth(token))
|
||||
assert r2.status_code == 200, r2.json()
|
||||
|
||||
# Verify the schedule changed and source_query survived.
|
||||
r3 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
|
||||
assert row is not None
|
||||
assert row["sync_schedule"] == "every 12h"
|
||||
assert row["source_query"] == "SELECT 1" # preserved across edit
|
||||
assert row["query_mode"] == "materialized"
|
||||
|
||||
|
||||
def test_update_materialized_with_explicit_empty_source_query_rejected(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
"""The fix above relaxes the validator for OMITTED source_query, but
|
||||
explicitly setting it to an empty / whitespace string while claiming
|
||||
materialized is still a typo and must be rejected (not silently
|
||||
persisted as NULL)."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
r = c.post("/api/admin/register-table", json={
|
||||
"name": "explicit_empty",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1",
|
||||
}, headers=_auth(token))
|
||||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
r2 = c.put(f"/api/admin/registry/{table_id}", json={
|
||||
"query_mode": "materialized",
|
||||
"source_query": "", # explicitly empty
|
||||
}, headers=_auth(token))
|
||||
assert 400 <= r2.status_code < 500, r2.json()
|
||||
|
||||
|
||||
def test_update_materialized_to_remote_clears_source_query(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
"""When admin switches a materialized table to remote/local, the stale
|
||||
source_query must be cleared in the DB — otherwise the registry shows
|
||||
a non-materialized row carrying an orphan SQL body."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
# Seed a materialized table with a source_query.
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json=_materialized_payload(name="switcher"),
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
# Switch to remote — must include bucket+source_table for the new mode
|
||||
# (the merged validator runs the BQ payload check on the merged record).
|
||||
r2 = c.put(
|
||||
f"/api/admin/registry/{table_id}",
|
||||
json={
|
||||
"query_mode": "remote",
|
||||
"bucket": "analytics",
|
||||
"source_table": "orders_90d",
|
||||
},
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r2.status_code == 200, r2.json()
|
||||
|
||||
# Verify in the registry: query_mode flipped, source_query cleared.
|
||||
r3 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
assert r3.status_code == 200, r3.json()
|
||||
row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
|
||||
assert row is not None, f"Table {table_id} not found in registry"
|
||||
assert row["query_mode"] == "remote"
|
||||
assert row["source_query"] in (None, "")
|
||||
|
||||
|
||||
def test_register_materialized_persists_source_query_in_registry(seeded_app, bq_instance):
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
r = c.post(
|
||||
"/api/admin/register-table",
|
||||
json=_materialized_payload(
|
||||
name="persist_q",
|
||||
source_query="SELECT col FROM `prj.ds.t` WHERE x = 1",
|
||||
),
|
||||
headers=_auth(token),
|
||||
)
|
||||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
r2 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None)
|
||||
assert row is not None
|
||||
assert row["query_mode"] == "materialized"
|
||||
assert "WHERE x = 1" in row["source_query"]
|
||||
137
tests/test_bq_cost_guardrail.py
Normal file
137
tests/test_bq_cost_guardrail.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
"""materialize_query refuses to run when dry-run estimate exceeds the cap.
|
||||
|
||||
The cap is wired through `data_source.bigquery.max_bytes_per_materialize`
|
||||
(read by the trigger pass; default 10 GiB; set 0 to disable). The dry-run
|
||||
itself reuses `app.api.v2_scan._bq_dry_run_bytes` so cost-estimate logic
|
||||
lives in exactly one place. Fail-open behaviour (DuckDB-syntax SQL the
|
||||
native BQ client can't parse → estimate=0 → COPY proceeds with a warning)
|
||||
is documented and exercised here too.
|
||||
"""
|
||||
import duckdb
|
||||
import pytest
|
||||
from contextlib import contextmanager
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
|
||||
|
||||
|
||||
def _bq_with_seed(tables: dict[str, str] | None = None) -> BqAccess:
|
||||
"""Stub BqAccess seeded with in-memory tables (same recipe as
|
||||
test_bq_materialize)."""
|
||||
tables = tables or {}
|
||||
|
||||
@contextmanager
|
||||
def _session(_projects):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute("ATTACH ':memory:' AS bq")
|
||||
for s in {ref.rsplit(".", 1)[0] for ref in tables}:
|
||||
conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}")
|
||||
for ref, body in tables.items():
|
||||
conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}")
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
return BqAccess(
|
||||
BqProjects(billing="test-billing", data="test-data"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session,
|
||||
)
|
||||
|
||||
|
||||
def test_refuses_when_estimate_exceeds_cap(tmp_path):
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
|
||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
with patch(
|
||||
"app.api.v2_scan._bq_dry_run_bytes", return_value=100 * 2**30
|
||||
):
|
||||
with pytest.raises(MaterializeBudgetError) as exc:
|
||||
materialize_query(
|
||||
table_id="huge",
|
||||
sql="SELECT * FROM bq.test.tiny",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
max_bytes=10 * 2**30,
|
||||
)
|
||||
err = exc.value
|
||||
assert err.table_id == "huge"
|
||||
assert err.current == 100 * 2**30
|
||||
assert err.limit == 10 * 2**30
|
||||
|
||||
|
||||
def test_proceeds_when_estimate_under_cap(tmp_path):
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
|
||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
with patch("app.api.v2_scan._bq_dry_run_bytes", return_value=1024):
|
||||
stats = materialize_query(
|
||||
table_id="tiny",
|
||||
sql="SELECT * FROM bq.test.tiny",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
max_bytes=10 * 2**30,
|
||||
)
|
||||
assert stats["rows"] == 1
|
||||
|
||||
|
||||
def test_no_cap_skips_dry_run(tmp_path):
|
||||
"""When max_bytes=None (default), no dry-run is performed."""
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry:
|
||||
stats = materialize_query(
|
||||
table_id="t1",
|
||||
sql="SELECT * FROM bq.test.tiny",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
)
|
||||
mock_dry.assert_not_called()
|
||||
assert stats["rows"] == 1
|
||||
|
||||
|
||||
def test_zero_max_bytes_skips_dry_run(tmp_path):
|
||||
"""Sentinel: max_bytes=0 disables the guardrail (config docs)."""
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
with patch("app.api.v2_scan._bq_dry_run_bytes") as mock_dry:
|
||||
stats = materialize_query(
|
||||
table_id="t1",
|
||||
sql="SELECT * FROM bq.test.tiny",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
max_bytes=0,
|
||||
)
|
||||
mock_dry.assert_not_called()
|
||||
assert stats["rows"] == 1
|
||||
|
||||
|
||||
def test_dry_run_failure_is_fail_open(tmp_path):
|
||||
"""If the dry-run errors (DuckDB syntax, missing google lib, transient
|
||||
upstream failure) we don't block — log + proceed with COPY. Operators
|
||||
who need hard-fail watch logs for the warning."""
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
with patch(
|
||||
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
|
||||
):
|
||||
stats = materialize_query(
|
||||
table_id="t1",
|
||||
sql="SELECT * FROM bq.test.tiny",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
max_bytes=10 * 2**30,
|
||||
)
|
||||
assert stats["rows"] == 1
|
||||
98
tests/test_bq_init_extract_skips.py
Normal file
98
tests/test_bq_init_extract_skips.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
"""init_extract skips rows with query_mode='materialized'.
|
||||
|
||||
Materialized rows are written by the sync trigger pass via
|
||||
`materialize_query()`; they live as parquets in /data/extracts/bigquery/data/
|
||||
and surface via the orchestrator's standard local-parquet discovery.
|
||||
Creating a remote view in extract.duckdb for the same name would shadow
|
||||
the parquet via cross-source name collision.
|
||||
|
||||
Pattern matches `tests/test_bigquery_extractor.py::TestViewVsTableTemplates`
|
||||
(uses `_CapturingProxy` to wrap a real DuckDB conn and stub BQ-specific calls).
|
||||
"""
|
||||
import duckdb
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
class _CapturingProxy:
|
||||
"""Wraps a real DuckDB connection, captures SQL, stubs BQ-specific calls.
|
||||
|
||||
DuckDBPyConnection.execute is C-level read-only, so we wrap rather than
|
||||
monkey-patch. Shape lifted directly from tests/test_bigquery_extractor.py
|
||||
to keep stub behavior consistent across the BQ test suite.
|
||||
"""
|
||||
|
||||
def __init__(self, real_conn, captured: list):
|
||||
self._real = real_conn
|
||||
self._captured = captured
|
||||
|
||||
def execute(self, sql, *args, **kwargs):
|
||||
self._captured.append(sql)
|
||||
stripped_u = sql.strip().upper()
|
||||
if stripped_u.startswith(("INSTALL ", "LOAD ", "CREATE SECRET")):
|
||||
return MagicMock()
|
||||
if stripped_u.startswith("ATTACH ") and "BIGQUERY" in stripped_u:
|
||||
return MagicMock()
|
||||
if stripped_u.startswith("DETACH "):
|
||||
return MagicMock()
|
||||
if 'FROM bq.' in sql or 'FROM bigquery_query' in sql:
|
||||
return MagicMock()
|
||||
return self._real.execute(sql, *args, **kwargs)
|
||||
|
||||
def close(self):
|
||||
return self._real.close()
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._real, name)
|
||||
|
||||
|
||||
def test_init_extract_skips_materialized_rows(tmp_path, monkeypatch):
|
||||
"""A registry mix of remote + materialized rows: only the remote row
|
||||
gets a `_meta` entry; the materialized row is silently skipped."""
|
||||
from connectors.bigquery.extractor import init_extract
|
||||
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor.get_metadata_token",
|
||||
lambda: "test-token",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor._detect_table_type",
|
||||
lambda *a, **kw: "BASE TABLE",
|
||||
)
|
||||
|
||||
captured: list[str] = []
|
||||
real_connect = duckdb.connect
|
||||
|
||||
def spy_connect(*a, **kw):
|
||||
return _CapturingProxy(real_connect(*a, **kw), captured)
|
||||
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor.duckdb.connect", spy_connect
|
||||
)
|
||||
|
||||
configs = [
|
||||
{
|
||||
"name": "live_orders", "bucket": "dset", "source_table": "live",
|
||||
"query_mode": "remote", "description": "",
|
||||
},
|
||||
{
|
||||
"name": "agg_90d", "bucket": "dset", "source_table": "live",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1",
|
||||
"description": "",
|
||||
},
|
||||
]
|
||||
stats = init_extract(str(tmp_path), "test-project", configs)
|
||||
|
||||
db_path = tmp_path / "extract.duckdb"
|
||||
assert db_path.exists(), "extract.duckdb should be written"
|
||||
|
||||
db = duckdb.connect(str(db_path))
|
||||
meta = db.execute(
|
||||
"SELECT table_name, query_mode FROM _meta ORDER BY table_name"
|
||||
).fetchall()
|
||||
db.close()
|
||||
|
||||
assert meta == [("live_orders", "remote")]
|
||||
assert stats["tables_registered"] == 1
|
||||
# No CREATE VIEW for the materialized row
|
||||
assert not any("agg_90d" in s for s in captured if "CREATE" in s.upper())
|
||||
139
tests/test_bq_materialize.py
Normal file
139
tests/test_bq_materialize.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
"""BigQuery `materialize_query` writes parquet via BqAccess + DuckDB COPY.
|
||||
|
||||
The function takes a `BqAccess` instance so the BQ extension session and
|
||||
SECRET token live in one place across the codebase (cf. `v2_scan` / `v2_sample`
|
||||
/ `v2_schema`). Tests inject a stub BqAccess whose `duckdb_session()` yields
|
||||
an in-memory connection with a pre-attached `bq` catalog containing fixture
|
||||
tables, exercising the COPY path end-to-end without any GCP traffic.
|
||||
"""
|
||||
import duckdb
|
||||
import pytest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
from connectors.bigquery.extractor import materialize_query, MaterializeBudgetError
|
||||
|
||||
|
||||
def _make_stub_bq(tables: dict[str, str] | None = None) -> BqAccess:
|
||||
"""Return a BqAccess wired to factories that yield an in-memory DuckDB
|
||||
with a pretend `bq` catalog containing test tables. `tables` maps
|
||||
DuckDB-three-part references like `'bq.test.orders'` to a SELECT
|
||||
expression to seed them with.
|
||||
"""
|
||||
tables = tables or {}
|
||||
|
||||
@contextmanager
|
||||
def _session(_projects):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute("ATTACH ':memory:' AS bq")
|
||||
schemas = {ref.rsplit(".", 1)[0] for ref in tables}
|
||||
for s in schemas:
|
||||
conn.execute(f"CREATE SCHEMA IF NOT EXISTS {s}")
|
||||
for ref, body in tables.items():
|
||||
conn.execute(f"CREATE OR REPLACE TABLE {ref} AS {body}")
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
# client_factory returns a stub whose .query(sql, job_config=...) yields
|
||||
# a job whose .total_bytes_processed defaults to 0 (fail-open).
|
||||
def _client(_projects):
|
||||
client = MagicMock()
|
||||
job = MagicMock()
|
||||
job.total_bytes_processed = 0
|
||||
client.query.return_value = job
|
||||
return client
|
||||
|
||||
return BqAccess(
|
||||
BqProjects(billing="test-billing", data="test-data"),
|
||||
client_factory=_client,
|
||||
duckdb_session_factory=_session,
|
||||
)
|
||||
|
||||
|
||||
def test_materialize_writes_parquet_and_returns_stats(tmp_path):
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
|
||||
bq = _make_stub_bq({
|
||||
"bq.test.orders": (
|
||||
"SELECT 'EU' AS region, 100 AS revenue UNION ALL "
|
||||
"SELECT 'US' AS region, 250 AS revenue"
|
||||
)
|
||||
})
|
||||
|
||||
stats = materialize_query(
|
||||
table_id="orders_summary",
|
||||
sql="SELECT region, SUM(revenue) AS revenue FROM bq.test.orders GROUP BY 1",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
)
|
||||
|
||||
parquet_path = out / "data" / "orders_summary.parquet"
|
||||
assert parquet_path.exists()
|
||||
assert stats["rows"] == 2
|
||||
assert stats["size_bytes"] > 0
|
||||
assert stats["query_mode"] == "materialized"
|
||||
|
||||
# Parquet readable end-to-end
|
||||
rows = duckdb.connect().execute(
|
||||
f"SELECT region, revenue FROM read_parquet('{parquet_path}') ORDER BY region"
|
||||
).fetchall()
|
||||
assert rows == [("EU", 100), ("US", 250)]
|
||||
|
||||
|
||||
def test_materialize_atomic_on_failure(tmp_path):
|
||||
"""Bad SQL must not leave a half-written parquet behind."""
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
parquet_path = out / "data" / "broken.parquet"
|
||||
|
||||
bq = _make_stub_bq({"bq.test.orders": "SELECT 1 AS n"})
|
||||
|
||||
with pytest.raises(Exception):
|
||||
materialize_query(
|
||||
table_id="broken",
|
||||
sql="SELECT * FROM bq.test.does_not_exist",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
)
|
||||
assert not parquet_path.exists()
|
||||
# Tmp also cleaned
|
||||
assert not (out / "data" / "broken.parquet.tmp").exists()
|
||||
|
||||
|
||||
def test_materialize_rejects_unsafe_table_id(tmp_path):
|
||||
"""table_id becomes the parquet filename — block path traversal up front."""
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
bq = _make_stub_bq()
|
||||
|
||||
with pytest.raises(ValueError, match="unsafe"):
|
||||
materialize_query(
|
||||
table_id="../etc/passwd",
|
||||
sql="SELECT 1",
|
||||
bq=bq,
|
||||
output_dir=str(out),
|
||||
)
|
||||
|
||||
|
||||
def test_materialize_overwrites_existing_parquet(tmp_path):
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
bq = _make_stub_bq({"bq.test.tiny": "SELECT 1 AS n"})
|
||||
|
||||
materialize_query(
|
||||
table_id="t1", sql="SELECT 1 AS n",
|
||||
bq=bq, output_dir=str(out),
|
||||
)
|
||||
materialize_query(
|
||||
table_id="t1", sql="SELECT 2 AS n",
|
||||
bq=bq, output_dir=str(out),
|
||||
)
|
||||
rows = duckdb.connect().execute(
|
||||
f"SELECT n FROM read_parquet('{out}/data/t1.parquet')"
|
||||
).fetchall()
|
||||
assert rows == [(2,)]
|
||||
157
tests/test_cli_admin_materialized.py
Normal file
157
tests/test_cli_admin_materialized.py
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
"""`da admin register-table --query-mode materialized --query @file.sql`
|
||||
sends source_query in the payload; existing local/remote paths still work
|
||||
unchanged."""
|
||||
from typer.testing import CliRunner
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from cli.main import app
|
||||
|
||||
|
||||
def _fake_resp(status_code, body=None):
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json = lambda: body or {"id": "x", "name": "x", "status": "registered"}
|
||||
return resp
|
||||
|
||||
|
||||
def test_register_materialized_with_inline_query(monkeypatch):
|
||||
captured = {}
|
||||
|
||||
def fake_post(path, json):
|
||||
captured["path"] = path
|
||||
captured["json"] = json
|
||||
return _fake_resp(201)
|
||||
|
||||
monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "orders_90d",
|
||||
"--source-type", "bigquery",
|
||||
"--query-mode", "materialized",
|
||||
"--query", "SELECT date FROM `prj.ds.orders`",
|
||||
"--sync-schedule", "every 6h",
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, result.stdout
|
||||
assert captured["path"] == "/api/admin/register-table"
|
||||
assert captured["json"]["query_mode"] == "materialized"
|
||||
assert captured["json"]["source_query"] == "SELECT date FROM `prj.ds.orders`"
|
||||
assert captured["json"]["sync_schedule"] == "every 6h"
|
||||
|
||||
|
||||
def test_register_materialized_reads_query_from_file(tmp_path, monkeypatch):
|
||||
sql_file = tmp_path / "orders.sql"
|
||||
sql_file.write_text(
|
||||
"SELECT date, SUM(revenue) FROM `prj.ds.orders` GROUP BY 1\n"
|
||||
)
|
||||
|
||||
captured = {}
|
||||
|
||||
def fake_post(path, json):
|
||||
captured["json"] = json
|
||||
return _fake_resp(201)
|
||||
|
||||
monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "orders_90d",
|
||||
"--source-type", "bigquery",
|
||||
"--query-mode", "materialized",
|
||||
"--query", f"@{sql_file}",
|
||||
"--sync-schedule", "daily 03:00",
|
||||
])
|
||||
|
||||
assert result.exit_code == 0, result.stdout
|
||||
assert "SELECT date, SUM(revenue)" in captured["json"]["source_query"]
|
||||
assert not captured["json"]["source_query"].endswith("\n")
|
||||
|
||||
|
||||
def test_register_materialized_without_query_fails(monkeypatch):
|
||||
"""--query-mode materialized without --query is a client-side error,
|
||||
no API call made."""
|
||||
called = {"count": 0}
|
||||
|
||||
def fake_post(*args, **kwargs):
|
||||
called["count"] += 1
|
||||
return _fake_resp(201)
|
||||
|
||||
monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "orders_90d",
|
||||
"--source-type", "bigquery",
|
||||
"--query-mode", "materialized",
|
||||
])
|
||||
|
||||
assert result.exit_code != 0
|
||||
assert called["count"] == 0
|
||||
combined = result.stdout + (result.stderr or "")
|
||||
assert "--query" in combined
|
||||
|
||||
|
||||
def test_register_local_mode_does_not_send_source_query(monkeypatch):
|
||||
"""Default local mode shouldn't send source_query — server-side
|
||||
validator forbids it on local."""
|
||||
captured = {}
|
||||
|
||||
def fake_post(path, json):
|
||||
captured["json"] = json
|
||||
return _fake_resp(201)
|
||||
|
||||
monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "kbc_orders",
|
||||
"--source-type", "keboola",
|
||||
"--bucket", "in.c-crm",
|
||||
])
|
||||
|
||||
assert result.exit_code == 0
|
||||
assert "source_query" not in captured["json"]
|
||||
assert "sync_schedule" not in captured["json"]
|
||||
|
||||
|
||||
def test_register_query_at_path_missing_file_fails(monkeypatch):
|
||||
"""@file.sql where the file doesn't exist surfaces a clear error."""
|
||||
monkeypatch.setattr(
|
||||
"cli.commands.admin.api_post", lambda *a, **kw: _fake_resp(201),
|
||||
)
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "x",
|
||||
"--source-type", "bigquery",
|
||||
"--query-mode", "materialized",
|
||||
"--query", "@/tmp/definitely-does-not-exist-9b4f7e2c.sql",
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
|
||||
def test_register_remote_path_unchanged(monkeypatch):
|
||||
"""The pre-existing --bucket / --source-table / --query-mode remote
|
||||
flow still works without --query."""
|
||||
captured = {}
|
||||
|
||||
def fake_post(path, json):
|
||||
captured["json"] = json
|
||||
return _fake_resp(200)
|
||||
|
||||
monkeypatch.setattr("cli.commands.admin.api_post", fake_post)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(app, [
|
||||
"admin", "register-table", "live_orders",
|
||||
"--source-type", "bigquery",
|
||||
"--bucket", "analytics",
|
||||
"--source-table", "orders",
|
||||
"--query-mode", "remote",
|
||||
])
|
||||
|
||||
assert result.exit_code == 0
|
||||
assert captured["json"]["query_mode"] == "remote"
|
||||
assert "source_query" not in captured["json"]
|
||||
assert captured["json"]["bucket"] == "analytics"
|
||||
assert captured["json"]["source_table"] == "orders"
|
||||
77
tests/test_db_migration_v20.py
Normal file
77
tests/test_db_migration_v20.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
"""v20 adds source_query column to table_registry.
|
||||
|
||||
Backs query_mode='materialized' for BigQuery: admin registers a SQL body
|
||||
that the scheduler runs through the DuckDB BQ extension and writes as a
|
||||
parquet to /data/extracts/bigquery/data/<id>.parquet.
|
||||
|
||||
The v19 step (#150) drops dataset_permissions, access_requests tables and
|
||||
users.role, table_registry.is_public columns; v20 then ALTERs the post-v19
|
||||
table_registry to add the source_query column.
|
||||
"""
|
||||
import duckdb
|
||||
|
||||
from src.db import SCHEMA_VERSION, _ensure_schema, get_schema_version
|
||||
|
||||
|
||||
def test_schema_version_is_20():
|
||||
assert SCHEMA_VERSION == 20
|
||||
|
||||
|
||||
def test_v20_adds_source_query(tmp_path):
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
_ensure_schema(conn)
|
||||
|
||||
cols = {
|
||||
r[0] for r in conn.execute(
|
||||
"SELECT column_name FROM information_schema.columns "
|
||||
"WHERE table_name = 'table_registry'"
|
||||
).fetchall()
|
||||
}
|
||||
assert "source_query" in cols, f"source_query missing from {cols}"
|
||||
assert get_schema_version(conn) == 20
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_v19_db_migrates_to_v20(tmp_path):
|
||||
"""Pre-existing v19 DB (post-RBAC-drop) without source_query upgrades
|
||||
cleanly without losing data."""
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
|
||||
# Simulate a v19 DB at minimal but realistic shape: schema_version row +
|
||||
# a table_registry row in the post-v19 column shape (no is_public column,
|
||||
# since v19 finalize dropped it via the table-rebuild idiom).
|
||||
conn.execute(
|
||||
"CREATE TABLE schema_version (version INTEGER, "
|
||||
"applied_at TIMESTAMP DEFAULT current_timestamp)"
|
||||
)
|
||||
conn.execute("INSERT INTO schema_version (version) VALUES (19)")
|
||||
conn.execute("""CREATE TABLE table_registry (
|
||||
id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL,
|
||||
source_type VARCHAR, bucket VARCHAR, source_table VARCHAR,
|
||||
sync_strategy VARCHAR DEFAULT 'full_refresh',
|
||||
query_mode VARCHAR DEFAULT 'local',
|
||||
sync_schedule VARCHAR, profile_after_sync BOOLEAN DEFAULT true,
|
||||
primary_key VARCHAR, folder VARCHAR, description TEXT,
|
||||
registered_by VARCHAR,
|
||||
registered_at TIMESTAMP DEFAULT current_timestamp
|
||||
)""")
|
||||
conn.execute("INSERT INTO table_registry (id, name) VALUES ('foo', 'foo')")
|
||||
|
||||
_ensure_schema(conn)
|
||||
|
||||
assert get_schema_version(conn) == 20
|
||||
cols = {
|
||||
r[0] for r in conn.execute(
|
||||
"SELECT column_name FROM information_schema.columns "
|
||||
"WHERE table_name = 'table_registry'"
|
||||
).fetchall()
|
||||
}
|
||||
assert "source_query" in cols
|
||||
# Existing row preserved, new column NULL
|
||||
row = conn.execute(
|
||||
"SELECT id, source_query FROM table_registry WHERE id='foo'"
|
||||
).fetchone()
|
||||
assert row == ("foo", None)
|
||||
conn.close()
|
||||
75
tests/test_keboola_access.py
Normal file
75
tests/test_keboola_access.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
"""Tests for KeboolaAccess facade."""
|
||||
import os
|
||||
import pytest
|
||||
from connectors.keboola.access import KeboolaAccess
|
||||
|
||||
|
||||
def test_access_session_yields_attached_duckdb(tmp_path, monkeypatch):
|
||||
"""Mock-mode test: the facade should accept a token, install+load
|
||||
the Keboola extension, and ATTACH it as 'kbc'. We verify the SQL
|
||||
issued by intercepting the duckdb.connect call.
|
||||
"""
|
||||
issued = []
|
||||
class FakeConn:
|
||||
def execute(self, sql, *args, **kwargs):
|
||||
issued.append(sql)
|
||||
class R:
|
||||
def fetchall(s): return []
|
||||
def fetchone(s): return (0,)
|
||||
return R()
|
||||
def close(self): pass
|
||||
|
||||
import duckdb
|
||||
monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn())
|
||||
|
||||
acc = KeboolaAccess(
|
||||
url="https://connection.keboola.com/",
|
||||
token="fake-token-xyz",
|
||||
)
|
||||
with acc.duckdb_session() as conn:
|
||||
assert conn is not None
|
||||
# Verify the install + load + attach sequence happened.
|
||||
joined = "\n".join(issued)
|
||||
assert "INSTALL keboola" in joined
|
||||
assert "LOAD keboola" in joined
|
||||
assert "ATTACH" in joined and "TYPE keboola" in joined
|
||||
# Token must be escaped for embedding in the ATTACH literal.
|
||||
assert "fake-token-xyz" in joined
|
||||
|
||||
|
||||
def test_access_escapes_single_quote_in_token(monkeypatch):
|
||||
"""Defense against a token containing a single quote breaking the
|
||||
ATTACH literal. SQL injection here is non-trivial because the token
|
||||
is admin-supplied at instance config time, but escape it anyway."""
|
||||
issued = []
|
||||
class FakeConn:
|
||||
def execute(self, sql, *args, **kwargs):
|
||||
issued.append(sql)
|
||||
class R:
|
||||
def fetchall(s): return []
|
||||
def fetchone(s): return (0,)
|
||||
return R()
|
||||
def close(self): pass
|
||||
import duckdb
|
||||
monkeypatch.setattr(duckdb, "connect", lambda *a, **kw: FakeConn())
|
||||
|
||||
acc = KeboolaAccess(url="x", token="bad'token")
|
||||
with acc.duckdb_session() as conn:
|
||||
pass
|
||||
attach_sql = next(s for s in issued if "ATTACH" in s)
|
||||
# Doubled single-quote per SQL string-literal escaping.
|
||||
assert "bad''token" in attach_sql
|
||||
|
||||
|
||||
def test_access_real_attach_when_creds_present(tmp_path):
|
||||
"""Smoke when KBC_TEST_URL + KBC_TEST_TOKEN are present."""
|
||||
url = os.environ.get("KBC_TEST_URL")
|
||||
token = os.environ.get("KBC_TEST_TOKEN")
|
||||
if not (url and token):
|
||||
pytest.skip("Keboola creds not provided")
|
||||
acc = KeboolaAccess(url=url, token=token)
|
||||
with acc.duckdb_session() as conn:
|
||||
# ATTACH must have succeeded — querying duckdb_databases() should
|
||||
# show the 'kbc' alias.
|
||||
rows = [r[0] for r in conn.execute("SELECT name FROM duckdb_databases()").fetchall()]
|
||||
assert "kbc" in rows
|
||||
75
tests/test_keboola_extension_query_passthrough.py
Normal file
75
tests/test_keboola_extension_query_passthrough.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
"""Lock-in test for the DuckDB Keboola extension's query-passthrough
|
||||
capability that the Keboola materialized path depends on.
|
||||
|
||||
Run only when KBC_TEST_URL + KBC_TEST_TOKEN env vars are set (CI without
|
||||
real Keboola credentials skips). Local dev with a real Storage API
|
||||
token exercises the path.
|
||||
"""
|
||||
import os
|
||||
import pytest
|
||||
import duckdb
|
||||
|
||||
|
||||
KBC_URL = os.environ.get("KBC_TEST_URL")
|
||||
KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN")
|
||||
KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET")
|
||||
KBC_TABLE = os.environ.get("KBC_TEST_TABLE")
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]),
|
||||
reason="Keboola integration creds not provided",
|
||||
)
|
||||
|
||||
|
||||
def test_extension_supports_attach_and_select(tmp_path):
|
||||
"""Keboola extension must support: ATTACH 'keboola://...' AS kbc, then
|
||||
SELECT * FROM kbc.bucket.table. The Keboola materialized path uses this
|
||||
primitive at runtime (just like connectors/keboola/extractor.py:133)."""
|
||||
conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
|
||||
conn.execute("INSTALL keboola FROM community")
|
||||
conn.execute("LOAD keboola")
|
||||
escaped_token = KBC_TOKEN.replace("'", "''")
|
||||
conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
|
||||
rows = conn.execute(
|
||||
f'SELECT COUNT(*) FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}"'
|
||||
).fetchone()
|
||||
assert rows[0] >= 0 # any non-negative count is fine; we're testing the path works
|
||||
|
||||
|
||||
def test_extension_supports_copy_to_parquet(tmp_path):
|
||||
"""Keboola materialized writes the SELECT result via
|
||||
`COPY (...) TO '...' (FORMAT PARQUET)`. Lock that primitive."""
|
||||
conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
|
||||
conn.execute("INSTALL keboola FROM community")
|
||||
conn.execute("LOAD keboola")
|
||||
escaped_token = KBC_TOKEN.replace("'", "''")
|
||||
conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
|
||||
|
||||
parquet_path = tmp_path / "out.parquet"
|
||||
safe_lit = str(parquet_path).replace("'", "''")
|
||||
conn.execute(
|
||||
f'COPY (SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5) '
|
||||
f"TO '{safe_lit}' (FORMAT PARQUET)"
|
||||
)
|
||||
assert parquet_path.exists() and parquet_path.stat().st_size > 0
|
||||
|
||||
|
||||
def test_extension_supports_filtered_query(tmp_path):
|
||||
"""Most important capability: a non-trivial WHERE/projection survives.
|
||||
This is what 'Custom SQL' mode actually relies on."""
|
||||
conn = duckdb.connect(str(tmp_path / "spike.duckdb"))
|
||||
conn.execute("INSTALL keboola FROM community")
|
||||
conn.execute("LOAD keboola")
|
||||
escaped_token = KBC_TOKEN.replace("'", "''")
|
||||
conn.execute(f"ATTACH '{KBC_URL}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')")
|
||||
|
||||
parquet_path = tmp_path / "filtered.parquet"
|
||||
safe_lit = str(parquet_path).replace("'", "''")
|
||||
# Trivially filterable SELECT — extension must push the WHERE down or
|
||||
# at minimum execute it client-side. Either is acceptable for our
|
||||
# materialized path.
|
||||
conn.execute(
|
||||
f'COPY (SELECT 1 AS marker FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 3) '
|
||||
f"TO '{safe_lit}' (FORMAT PARQUET)"
|
||||
)
|
||||
assert parquet_path.exists()
|
||||
89
tests/test_keboola_init_extract_skips.py
Normal file
89
tests/test_keboola_init_extract_skips.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
"""Verify the legacy Keboola download path skips materialized rows.
|
||||
|
||||
Materialized rows are handled by `_run_materialized_pass` in
|
||||
`app/api/sync.py`, not by the legacy extractor. Mirror of the BQ
|
||||
extractor's existing skip behavior at line 188.
|
||||
|
||||
The Keboola extractor entrypoint is `run()` (not `init_extract` like
|
||||
the BQ extractor). Tests below observe the skip via caplog and the
|
||||
stats dict (no parquet written, table not in tables_extracted/failed).
|
||||
"""
|
||||
from connectors.keboola import extractor as kbe
|
||||
|
||||
|
||||
def test_run_skips_materialized_rows(tmp_path, caplog):
|
||||
"""Given a registry with one materialized row, run() must NOT write a
|
||||
parquet for it and must NOT count it in tables_extracted/failed."""
|
||||
output_dir = tmp_path / "extracts" / "keboola"
|
||||
output_dir.mkdir(parents=True)
|
||||
|
||||
table_configs = [
|
||||
{
|
||||
"id": "orders_recent",
|
||||
"name": "orders_recent",
|
||||
"source_query": "SELECT * FROM kbc.\"in.c-sales\".\"orders\" WHERE date > '2026-01-01'",
|
||||
"query_mode": "materialized",
|
||||
},
|
||||
]
|
||||
|
||||
with caplog.at_level("INFO"):
|
||||
# Use bogus URL/token — the extractor will fail to ATTACH the
|
||||
# extension (legacy client fallback also unavailable for the
|
||||
# materialized row, but materialized rows must be skipped before
|
||||
# any of that code runs).
|
||||
stats = kbe.run(
|
||||
str(output_dir), table_configs,
|
||||
keboola_url="https://invalid.example/",
|
||||
keboola_token="bogus",
|
||||
)
|
||||
|
||||
# No parquet files for the materialized row.
|
||||
parquet = output_dir / "data" / "orders_recent.parquet"
|
||||
assert not parquet.exists(), "materialized row was written by legacy extractor"
|
||||
|
||||
# The materialized row must NOT count as extracted or failed.
|
||||
assert stats["tables_extracted"] == 0
|
||||
assert stats["tables_failed"] == 0
|
||||
|
||||
# Skip log line for ops visibility.
|
||||
assert "Skipping" in caplog.text and "materialized" in caplog.text
|
||||
|
||||
|
||||
def test_run_processes_local_alongside_skipped_materialized(tmp_path, caplog):
|
||||
"""Mixed registry: one local + one materialized. Materialized is
|
||||
skipped, local row attempts extraction (and fails because there's
|
||||
no real Keboola in this test, but that's a separate failure path —
|
||||
the materialized row must not appear in tables_failed)."""
|
||||
output_dir = tmp_path / "extracts" / "keboola"
|
||||
output_dir.mkdir(parents=True)
|
||||
|
||||
table_configs = [
|
||||
{
|
||||
"id": "orders",
|
||||
"name": "orders",
|
||||
"bucket": "in.c-sales",
|
||||
"source_table": "orders",
|
||||
"query_mode": "local",
|
||||
},
|
||||
{
|
||||
"id": "orders_recent",
|
||||
"name": "orders_recent",
|
||||
"source_query": "SELECT 1",
|
||||
"query_mode": "materialized",
|
||||
},
|
||||
]
|
||||
|
||||
with caplog.at_level("INFO"):
|
||||
stats = kbe.run(
|
||||
str(output_dir), table_configs,
|
||||
keboola_url="https://invalid.example/",
|
||||
keboola_token="bogus",
|
||||
)
|
||||
|
||||
# The materialized row must not be in failures (it was skipped, not failed).
|
||||
failed_names = {e["table"] for e in stats.get("errors", [])}
|
||||
assert "orders_recent" not in failed_names, (
|
||||
"materialized row appeared in failures — should have been skipped"
|
||||
)
|
||||
# Skip log line for ops visibility.
|
||||
assert "Skipping" in caplog.text and "materialized" in caplog.text
|
||||
95
tests/test_keboola_materialize.py
Normal file
95
tests/test_keboola_materialize.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""Tests for the Keboola materialize_query path."""
|
||||
import hashlib
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from connectors.keboola import extractor as kbe
|
||||
|
||||
|
||||
def test_materialize_query_writes_parquet_and_returns_metadata(tmp_path, monkeypatch):
|
||||
"""Mock-mode: feed in a fake KeboolaAccess that yields a fake DuckDB
|
||||
connection accepting `COPY ... TO '...' (FORMAT PARQUET)` and just
|
||||
writes a small parquet via duckdb's own primitive on a tmp DB.
|
||||
"""
|
||||
import duckdb
|
||||
real_conn = duckdb.connect(":memory:")
|
||||
# Pre-create a small relation the fake materialize "copies".
|
||||
real_conn.execute("CREATE TABLE t AS SELECT 1 AS x, 'hello' AS y UNION ALL SELECT 2, 'world'")
|
||||
|
||||
class FakeAccess:
|
||||
def duckdb_session(self):
|
||||
from contextlib import contextmanager
|
||||
@contextmanager
|
||||
def _cm():
|
||||
yield real_conn
|
||||
return _cm()
|
||||
fake_access = FakeAccess()
|
||||
|
||||
output_dir = tmp_path / "out"
|
||||
output_dir.mkdir()
|
||||
|
||||
# Submit a query that selects from the in-memory table (not a real
|
||||
# Keboola bucket — the test verifies the COPY/parquet/hash path,
|
||||
# not the extension behavior).
|
||||
result = kbe.materialize_query(
|
||||
table_id="example_subset",
|
||||
sql="SELECT * FROM t",
|
||||
keboola_access=fake_access,
|
||||
output_dir=output_dir,
|
||||
)
|
||||
|
||||
parquet_path = output_dir / "example_subset.parquet"
|
||||
assert parquet_path.exists()
|
||||
assert result["table_id"] == "example_subset"
|
||||
assert result["path"] == str(parquet_path)
|
||||
assert result["rows"] == 2
|
||||
assert result["bytes"] > 0
|
||||
# MD5 of the bytes should match what we recompute.
|
||||
expected_md5 = hashlib.md5(parquet_path.read_bytes()).hexdigest()
|
||||
assert result["md5"] == expected_md5
|
||||
|
||||
|
||||
def test_materialize_query_zero_rows_logs_warning(tmp_path, caplog):
|
||||
import duckdb
|
||||
real_conn = duckdb.connect(":memory:")
|
||||
real_conn.execute("CREATE TABLE t AS SELECT 1 AS x WHERE FALSE")
|
||||
|
||||
class FakeAccess:
|
||||
def duckdb_session(self):
|
||||
from contextlib import contextmanager
|
||||
@contextmanager
|
||||
def _cm():
|
||||
yield real_conn
|
||||
return _cm()
|
||||
|
||||
output_dir = tmp_path / "out"
|
||||
output_dir.mkdir()
|
||||
|
||||
with caplog.at_level("WARNING"):
|
||||
result = kbe.materialize_query(
|
||||
table_id="empty_subset",
|
||||
sql="SELECT * FROM t",
|
||||
keboola_access=FakeAccess(),
|
||||
output_dir=output_dir,
|
||||
)
|
||||
assert result["rows"] == 0
|
||||
assert "0 rows" in caplog.text or "empty" in caplog.text.lower()
|
||||
|
||||
|
||||
def test_materialize_query_rejects_unsafe_table_id(tmp_path):
|
||||
"""Defense: table_id is interpolated into the parquet filename. SQL/
|
||||
path-traversal-unsafe values must be rejected up-front (mirror of BQ
|
||||
materialize_query's validation)."""
|
||||
class FakeAccess:
|
||||
def duckdb_session(self):
|
||||
raise AssertionError("should not be called")
|
||||
output_dir = tmp_path / "out"
|
||||
output_dir.mkdir()
|
||||
with pytest.raises(ValueError, match="table_id"):
|
||||
kbe.materialize_query(
|
||||
table_id="../../etc/passwd",
|
||||
sql="SELECT 1",
|
||||
keboola_access=FakeAccess(),
|
||||
output_dir=output_dir,
|
||||
)
|
||||
71
tests/test_keboola_materialized_e2e.py
Normal file
71
tests/test_keboola_materialized_e2e.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""End-to-end: register a Keboola materialized row -> trigger sync ->
|
||||
parquet appears -> manifest serves it -> CLI da sync would download it.
|
||||
|
||||
Skipped unless KBC_TEST_URL + KBC_TEST_TOKEN + KBC_TEST_BUCKET +
|
||||
KBC_TEST_TABLE are present.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
KBC_URL = os.environ.get("KBC_TEST_URL")
|
||||
KBC_TOKEN = os.environ.get("KBC_TEST_TOKEN")
|
||||
KBC_BUCKET = os.environ.get("KBC_TEST_BUCKET")
|
||||
KBC_TABLE = os.environ.get("KBC_TEST_TABLE")
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not all([KBC_URL, KBC_TOKEN, KBC_BUCKET, KBC_TABLE]),
|
||||
reason="Keboola creds not provided",
|
||||
)
|
||||
|
||||
|
||||
def test_register_trigger_manifest_path(seeded_app, monkeypatch, tmp_path):
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path))
|
||||
monkeypatch.setenv("KEBOOLA_TOKEN", KBC_TOKEN)
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: {
|
||||
"data_source": {
|
||||
"type": "keboola",
|
||||
"keboola": {
|
||||
"url": KBC_URL,
|
||||
"token_env": "KEBOOLA_TOKEN",
|
||||
},
|
||||
},
|
||||
},
|
||||
raising=False,
|
||||
)
|
||||
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
auth = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
# Register.
|
||||
r = c.post("/api/admin/register-table", headers=auth, json={
|
||||
"name": "smoke_subset",
|
||||
"source_type": "keboola",
|
||||
"query_mode": "materialized",
|
||||
"source_query": (
|
||||
f'SELECT * FROM kbc."{KBC_BUCKET}"."{KBC_TABLE}" LIMIT 5'
|
||||
),
|
||||
})
|
||||
assert r.status_code == 201
|
||||
|
||||
# Trigger sync.
|
||||
r = c.post("/api/sync/trigger", headers=auth)
|
||||
assert r.status_code in (200, 202)
|
||||
|
||||
# Parquet must exist.
|
||||
parquet = Path(tmp_path) / "extracts" / "keboola" / "data" / "smoke_subset.parquet"
|
||||
assert parquet.exists() and parquet.stat().st_size > 0
|
||||
|
||||
# Manifest serves it.
|
||||
r = c.get("/api/sync/manifest", headers=auth)
|
||||
rows = r.json()["tables"]
|
||||
smoke = next((t for t in rows if t["id"] == "smoke_subset"), None)
|
||||
assert smoke is not None
|
||||
assert smoke["source_type"] == "keboola"
|
||||
assert smoke["query_mode"] == "local" # materialized parquets surface as local
|
||||
assert smoke["md5"] # has a hash for da sync delta detection
|
||||
335
tests/test_materialized_e2e.py
Normal file
335
tests/test_materialized_e2e.py
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
"""End-to-end integration coverage for query_mode='materialized'.
|
||||
|
||||
Unit tests verify each piece in isolation; this file glues them together:
|
||||
|
||||
1. Admin POST /api/admin/register-table (materialized) → registry row written
|
||||
2. _run_materialized_pass writes parquet + sync_state with correct hash
|
||||
3. GET /api/sync/manifest (per-user) returns the row with query_mode +
|
||||
the parquet hash, filtered by RBAC
|
||||
4. Mode-switch transitions (remote → materialized, materialized → SQL edit
|
||||
preserves registered_at) maintain registry invariants.
|
||||
|
||||
Devil's-advocate review found these were the gaps the unit tests left
|
||||
open. Each piece passes in isolation; this file proves they compose.
|
||||
"""
|
||||
import duckdb
|
||||
import hashlib
|
||||
import pytest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
from src.repositories.table_registry import TableRegistryRepository
|
||||
from src.repositories.sync_state import SyncStateRepository
|
||||
|
||||
|
||||
def _auth(token):
|
||||
return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bq_instance(monkeypatch):
|
||||
"""Force instance.yaml to look like a BigQuery deployment so the BQ
|
||||
register validator's project_id check passes."""
|
||||
fake_cfg = {
|
||||
"data_source": {
|
||||
"type": "bigquery",
|
||||
"bigquery": {"project": "my-test-project", "location": "us"},
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.load_instance_config",
|
||||
lambda: fake_cfg,
|
||||
raising=False,
|
||||
)
|
||||
from app.instance_config import reset_cache
|
||||
reset_cache()
|
||||
yield fake_cfg
|
||||
reset_cache()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stub_bq_extractor(monkeypatch):
|
||||
"""Mirror tests/test_admin_bq_register.py::stub_bq_extractor — replaces
|
||||
rebuild_from_registry + SyncOrchestrator so the API's post-register
|
||||
materialize doesn't hit real BQ during HTTP-driven tests."""
|
||||
rebuild_mock = MagicMock(return_value={
|
||||
"project_id": "my-test-project",
|
||||
"tables_registered": 1,
|
||||
"errors": [],
|
||||
"skipped": False,
|
||||
})
|
||||
monkeypatch.setattr(
|
||||
"connectors.bigquery.extractor.rebuild_from_registry",
|
||||
rebuild_mock,
|
||||
)
|
||||
orch_mock = MagicMock()
|
||||
monkeypatch.setattr(
|
||||
"src.orchestrator.SyncOrchestrator",
|
||||
lambda *a, **kw: orch_mock,
|
||||
)
|
||||
return {"rebuild": rebuild_mock, "orchestrator": orch_mock}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stub_bq():
|
||||
"""Real-shape BqAccess wired to in-memory DuckDB factories so the
|
||||
materialize_query path can run end-to-end without GCP."""
|
||||
@contextmanager
|
||||
def _session(_p):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute("ATTACH ':memory:' AS bq")
|
||||
conn.execute("CREATE SCHEMA bq.test")
|
||||
conn.execute(
|
||||
"CREATE OR REPLACE TABLE bq.test.orders AS "
|
||||
"SELECT 'EU' AS region, 100 AS revenue UNION ALL "
|
||||
"SELECT 'US' AS region, 250 AS revenue"
|
||||
)
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
return BqAccess(
|
||||
BqProjects(billing="my-test-project", data="my-test-project"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session,
|
||||
)
|
||||
|
||||
|
||||
def test_e2e_register_then_materialize_then_manifest_via_repo(
|
||||
bq_instance, stub_bq, tmp_path, monkeypatch,
|
||||
):
|
||||
"""Glue test: register row at the repository layer (skips HTTP/auth),
|
||||
run the materialized pass, verify sync_state, then exercise the
|
||||
`_build_manifest_for_user` admin path. Catches integration breakage
|
||||
that unit tests miss because each only sees one layer."""
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
from src.db import _ensure_schema
|
||||
_ensure_schema(conn)
|
||||
|
||||
table_id = "orders_summary_e2e"
|
||||
repo = TableRegistryRepository(conn)
|
||||
repo.register(
|
||||
id=table_id, name=table_id, source_type="bigquery",
|
||||
query_mode="materialized",
|
||||
source_query="SELECT region, SUM(revenue) AS revenue "
|
||||
"FROM bq.test.orders GROUP BY 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
# Run the materialized pass.
|
||||
from app.api import sync as sync_mod
|
||||
summary = sync_mod._run_materialized_pass(conn, stub_bq)
|
||||
assert table_id in summary["materialized"], summary
|
||||
assert not summary["errors"]
|
||||
|
||||
# Parquet on disk.
|
||||
parquet_path = (
|
||||
tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
/ f"{table_id}.parquet"
|
||||
)
|
||||
assert parquet_path.exists(), f"Expected {parquet_path} to exist"
|
||||
|
||||
# sync_state hash matches the file's MD5.
|
||||
expected_hash = hashlib.md5(parquet_path.read_bytes()).hexdigest()
|
||||
state = SyncStateRepository(conn)
|
||||
row = state.get_table_state(table_id)
|
||||
assert row is not None
|
||||
assert row["hash"] == expected_hash
|
||||
assert row["rows"] == 2
|
||||
|
||||
# Manifest builder exposes query_mode + hash to admin (no RBAC filter).
|
||||
# Post-#150 RBAC: admin shortcut keys on Admin user_group membership,
|
||||
# not the legacy `users.role` column. Seed the user + Admin membership
|
||||
# so `can_access_table` short-circuits to True.
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO users (id, email) VALUES ('u-admin', 'admin@test')"
|
||||
)
|
||||
admin_group_id = conn.execute(
|
||||
"SELECT id FROM user_groups WHERE name = 'Admin'"
|
||||
).fetchone()[0]
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO user_group_members (user_id, group_id, source) "
|
||||
"VALUES ('u-admin', ?, 'admin')",
|
||||
[admin_group_id],
|
||||
)
|
||||
admin_user = {"id": "u-admin", "email": "admin@test"}
|
||||
manifest = sync_mod._build_manifest_for_user(conn, admin_user)
|
||||
assert table_id in manifest["tables"]
|
||||
entry = manifest["tables"][table_id]
|
||||
assert entry["query_mode"] == "materialized"
|
||||
assert entry["hash"] == expected_hash
|
||||
assert entry["rows"] == 2
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_remote_to_materialized_transition_clears_bucket_table(
|
||||
seeded_app, bq_instance, stub_bq_extractor,
|
||||
):
|
||||
"""Switching a remote BQ row to materialized must accept source_query
|
||||
and the merged validator must not trip on the now-irrelevant
|
||||
bucket/source_table fields."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
# Seed a remote row.
|
||||
r = c.post("/api/admin/register-table", json={
|
||||
"name": "live_to_mat",
|
||||
"source_type": "bigquery",
|
||||
"bucket": "analytics",
|
||||
"source_table": "orders",
|
||||
"query_mode": "remote",
|
||||
}, headers=_auth(token))
|
||||
assert r.status_code in (200, 202), r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
# Switch to materialized — must include source_query for the validator.
|
||||
r2 = c.put(f"/api/admin/registry/{table_id}", json={
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1 AS n",
|
||||
}, headers=_auth(token))
|
||||
assert r2.status_code == 200, r2.json()
|
||||
|
||||
# Verify the merged record reflects the switch.
|
||||
r3 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next((t for t in r3.json()["tables"] if t["id"] == table_id), None)
|
||||
assert row is not None
|
||||
assert row["query_mode"] == "materialized"
|
||||
assert row["source_query"] == "SELECT 1 AS n"
|
||||
|
||||
|
||||
def test_materialized_sql_edit_preserves_registered_at(
|
||||
seeded_app, bq_instance, stub_bq_extractor, monkeypatch,
|
||||
):
|
||||
"""Editing source_query on an existing materialized row must not
|
||||
reset registered_at — the row's registration history is preserved
|
||||
across SQL edits (issue #130 invariant)."""
|
||||
c = seeded_app["client"]
|
||||
token = seeded_app["admin_token"]
|
||||
|
||||
# Seed a materialized row.
|
||||
r = c.post("/api/admin/register-table", json={
|
||||
"name": "sql_edit_target",
|
||||
"source_type": "bigquery",
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 1 AS n",
|
||||
}, headers=_auth(token))
|
||||
assert r.status_code == 201, r.json()
|
||||
table_id = r.json()["id"]
|
||||
|
||||
# Capture the original registered_at.
|
||||
r2 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next((t for t in r2.json()["tables"] if t["id"] == table_id), None)
|
||||
original_ts = row["registered_at"]
|
||||
assert original_ts is not None
|
||||
|
||||
# Edit the SQL.
|
||||
import time
|
||||
time.sleep(0.01) # ensure a clock tick elapses so a fresh stamp would differ
|
||||
r3 = c.put(f"/api/admin/registry/{table_id}", json={
|
||||
"query_mode": "materialized",
|
||||
"source_query": "SELECT 2 AS n",
|
||||
}, headers=_auth(token))
|
||||
assert r3.status_code == 200, r3.json()
|
||||
|
||||
r4 = c.get("/api/admin/registry", headers=_auth(token))
|
||||
row = next((t for t in r4.json()["tables"] if t["id"] == table_id), None)
|
||||
assert row["source_query"] == "SELECT 2 AS n"
|
||||
# registered_at preserved across edit
|
||||
assert row["registered_at"] == original_ts, (
|
||||
f"Expected registered_at preserved (issue #130 contract). "
|
||||
f"Original: {original_ts}, after edit: {row['registered_at']}"
|
||||
)
|
||||
|
||||
|
||||
def test_materialized_zero_rows_logs_warning(stub_bq, tmp_path, caplog):
|
||||
"""Devil's-advocate item: an SQL filter that returns 0 rows is
|
||||
indistinguishable from 'SQL is wrong'. Confirm we log a WARNING so
|
||||
operators can grep on it."""
|
||||
import logging
|
||||
from connectors.bigquery.extractor import materialize_query
|
||||
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
|
||||
# Add an empty BQ table to the stub for this test.
|
||||
@contextmanager
|
||||
def _session_empty(_p):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute("ATTACH ':memory:' AS bq")
|
||||
conn.execute("CREATE SCHEMA bq.test")
|
||||
conn.execute("CREATE OR REPLACE TABLE bq.test.empty AS "
|
||||
"SELECT 1 AS n WHERE FALSE")
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
bq_empty = BqAccess(
|
||||
BqProjects(billing="t", data="t"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session_empty,
|
||||
)
|
||||
|
||||
with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"):
|
||||
stats = materialize_query(
|
||||
table_id="empty_t",
|
||||
sql="SELECT * FROM bq.test.empty",
|
||||
bq=bq_empty,
|
||||
output_dir=str(out),
|
||||
)
|
||||
|
||||
assert stats["rows"] == 0
|
||||
assert any("0 rows" in rec.message for rec in caplog.records), (
|
||||
f"Expected '0 rows' WARNING; got: {[r.message for r in caplog.records]}"
|
||||
)
|
||||
|
||||
|
||||
def test_attach_real_error_propagates(stub_bq, tmp_path):
|
||||
"""ATTACH 'project=...' that fails for a real reason (not the
|
||||
'already attached' tolerated case) must propagate so callers see
|
||||
the actual error instead of a confusing downstream 'bq is not
|
||||
attached' message."""
|
||||
from connectors.bigquery.extractor import materialize_query
|
||||
|
||||
out = tmp_path / "extracts" / "bigquery"
|
||||
out.mkdir(parents=True)
|
||||
|
||||
@contextmanager
|
||||
def _session_attach_fails(_p):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
# Force ATTACH 'project=...' to raise something other than
|
||||
# "already attached" by intercepting via execute wrapper —
|
||||
# since DuckDB's real connection doesn't accept attribute
|
||||
# patches, we use a thin proxy for this test.
|
||||
class _Proxy:
|
||||
def __init__(self, real):
|
||||
self._real = real
|
||||
def execute(self, sql, *a, **kw):
|
||||
if sql.startswith("ATTACH 'project="):
|
||||
raise duckdb.Error("fake permission denied: missing serviceusage.services.use")
|
||||
return self._real.execute(sql, *a, **kw)
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._real, name)
|
||||
def close(self):
|
||||
return self._real.close()
|
||||
yield _Proxy(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
bq_bad = BqAccess(
|
||||
BqProjects(billing="t", data="t"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session_attach_fails,
|
||||
)
|
||||
|
||||
with pytest.raises(duckdb.Error, match="permission denied"):
|
||||
materialize_query(
|
||||
table_id="x", sql="SELECT 1",
|
||||
bq=bq_bad, output_dir=str(out),
|
||||
)
|
||||
132
tests/test_sync_trigger_keboola_materialized.py
Normal file
132
tests/test_sync_trigger_keboola_materialized.py
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
"""Scheduler-level test: when a Keboola row has query_mode='materialized',
|
||||
_run_materialized_pass dispatches to connectors.keboola.extractor.materialize_query
|
||||
(not BQ's). Existing BQ-materialized rows continue using BqAccess.
|
||||
|
||||
Mirrors the unit-style of tests/test_sync_trigger_materialized.py — patches
|
||||
the inner extractor entry points instead of going through the API layer.
|
||||
"""
|
||||
import duckdb
|
||||
import pytest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from src.db import _ensure_schema
|
||||
from src.repositories.table_registry import TableRegistryRepository
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def system_db(tmp_path, monkeypatch):
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
_ensure_schema(conn)
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
|
||||
yield conn
|
||||
conn.close()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stub_bq():
|
||||
@contextmanager
|
||||
def _session(_p):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
return BqAccess(
|
||||
BqProjects(billing="t", data="t"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session,
|
||||
)
|
||||
|
||||
|
||||
def test_run_materialized_pass_dispatches_keboola_to_keboola_extractor(
|
||||
system_db, stub_bq, tmp_path, monkeypatch
|
||||
):
|
||||
"""Keboola row with query_mode='materialized' must invoke the Keboola
|
||||
materialize_query, not the BQ one."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="orders_recent", name="orders_recent",
|
||||
source_type="keboola", query_mode="materialized",
|
||||
source_query='SELECT * FROM kbc."in.c-sales"."orders" WHERE 1=1',
|
||||
sync_schedule="every 1m", # always due
|
||||
)
|
||||
|
||||
# Provide instance.yaml-shape config + env so the Keboola lazy-init succeeds.
|
||||
monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake-token")
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
# Patch get_value to return the keboola URL/token_env.
|
||||
def _fake_get_value(*keys, default=None):
|
||||
path = keys
|
||||
if path == ("data_source", "keboola", "url"):
|
||||
return "https://connection.keboola.com/"
|
||||
if path == ("data_source", "keboola", "token_env"):
|
||||
return "KEBOOLA_STORAGE_TOKEN"
|
||||
if path == ("data_source", "bigquery", "max_bytes_per_materialize"):
|
||||
return default if default is not None else 0
|
||||
return default
|
||||
|
||||
# Pre-create the parquet for hash bookkeeping (kb materialize is patched
|
||||
# so it won't write a real one).
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "keboola" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "orders_recent.parquet").write_bytes(
|
||||
b"PAR1" + b"\x00" * 16 + b"PAR1"
|
||||
)
|
||||
|
||||
bq_called = MagicMock()
|
||||
kb_called = MagicMock(return_value={
|
||||
"table_id": "orders_recent", "rows": 1, "bytes": 100,
|
||||
"md5": "abc123", "path": str(parquet_dir / "orders_recent.parquet"),
|
||||
})
|
||||
|
||||
with patch("app.instance_config.get_value", _fake_get_value), \
|
||||
patch("connectors.bigquery.extractor.materialize_query", bq_called), \
|
||||
patch("connectors.keboola.extractor.materialize_query", kb_called):
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
assert kb_called.called, "Keboola materialize_query was not invoked"
|
||||
assert not bq_called.called, (
|
||||
"BQ materialize_query was wrongly invoked for a Keboola row"
|
||||
)
|
||||
assert "orders_recent" in summary["materialized"]
|
||||
|
||||
|
||||
def test_run_materialized_pass_dispatches_bigquery_to_bq_extractor(
|
||||
system_db, stub_bq, tmp_path
|
||||
):
|
||||
"""Regression: BQ-materialized path keeps working unchanged."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="events_summary", name="events_summary",
|
||||
source_type="bigquery", query_mode="materialized",
|
||||
source_query="SELECT date, COUNT(*) FROM `proj.dataset.events` GROUP BY 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "events_summary.parquet").write_bytes(
|
||||
b"PAR1" + b"\x00" * 16 + b"PAR1"
|
||||
)
|
||||
|
||||
bq_called = MagicMock(return_value={
|
||||
"rows": 1, "size_bytes": 100, "query_mode": "materialized",
|
||||
"hash": "abc123",
|
||||
})
|
||||
kb_called = MagicMock()
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch("app.api.sync._materialize_table", bq_called), \
|
||||
patch("connectors.keboola.extractor.materialize_query", kb_called):
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
assert bq_called.called
|
||||
assert not kb_called.called
|
||||
assert "events_summary" in summary["materialized"]
|
||||
484
tests/test_sync_trigger_materialized.py
Normal file
484
tests/test_sync_trigger_materialized.py
Normal file
|
|
@ -0,0 +1,484 @@
|
|||
"""_run_materialized_pass walks table_registry for materialized BQ rows
|
||||
and runs each that is due via _materialize_table.
|
||||
|
||||
Tests inject a stub BqAccess (factories never called by these tests since
|
||||
_materialize_table is patched) and assert that scheduling, error
|
||||
aggregation, sync_state hash, and the disable-sentinel all behave
|
||||
correctly.
|
||||
"""
|
||||
import duckdb
|
||||
import pytest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from src.db import _ensure_schema
|
||||
from src.repositories.table_registry import TableRegistryRepository
|
||||
from src.repositories.sync_state import SyncStateRepository
|
||||
from connectors.bigquery.access import BqAccess, BqProjects
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def system_db(tmp_path, monkeypatch):
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
_ensure_schema(conn)
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
|
||||
yield conn
|
||||
conn.close()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stub_bq():
|
||||
"""A BqAccess instance that the tests don't actually exercise (the test
|
||||
patches `_materialize_table`); just needs to be a valid BqAccess so the
|
||||
type contract doesn't break."""
|
||||
@contextmanager
|
||||
def _session(_p):
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
yield conn
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
return BqAccess(
|
||||
BqProjects(billing="t", data="t"),
|
||||
client_factory=lambda _p: MagicMock(),
|
||||
duckdb_session_factory=_session,
|
||||
)
|
||||
|
||||
|
||||
def test_materialized_pass_calls_materialize_for_due_rows(system_db, stub_bq, tmp_path):
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="orders_90d", name="orders_90d",
|
||||
source_type="bigquery", query_mode="materialized",
|
||||
source_query="SELECT 1 AS n",
|
||||
sync_schedule="every 1m", # always due in tests (no prior sync)
|
||||
)
|
||||
|
||||
# Pre-create the parquet so _file_hash returns non-empty
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "orders_90d.parquet").write_bytes(
|
||||
b"PAR1" + b"\x00" * 16 + b"PAR1"
|
||||
)
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch("app.api.sync._materialize_table") as mock_mat:
|
||||
mock_mat.return_value = {
|
||||
"rows": 1, "size_bytes": 100, "query_mode": "materialized",
|
||||
}
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
mock_mat.assert_called_once()
|
||||
call_kwargs = mock_mat.call_args.kwargs
|
||||
assert call_kwargs["table_id"] == "orders_90d"
|
||||
assert "SELECT 1 AS n" in call_kwargs["sql"]
|
||||
assert call_kwargs["bq"] is stub_bq
|
||||
# Default cap (10 GiB) flows through when no instance.yaml override
|
||||
assert call_kwargs["max_bytes"] == 10 * 2**30
|
||||
assert "orders_90d" in summary["materialized"]
|
||||
assert not summary["errors"]
|
||||
|
||||
|
||||
def test_materialized_pass_skips_undue_rows(system_db, stub_bq):
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="orders_daily", name="orders_daily",
|
||||
source_type="bigquery", query_mode="materialized",
|
||||
source_query="SELECT 1",
|
||||
sync_schedule="daily 03:00",
|
||||
)
|
||||
state = SyncStateRepository(system_db)
|
||||
state.update_sync(
|
||||
table_id="orders_daily", rows=1, file_size_bytes=10, hash="x",
|
||||
)
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch("app.api.sync._materialize_table") as mock_mat:
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
mock_mat.assert_not_called()
|
||||
assert "orders_daily" in summary["skipped"]
|
||||
|
||||
|
||||
def test_materialized_pass_skips_non_materialized_rows(system_db, stub_bq):
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(id="t1", name="t1", source_type="keboola", query_mode="local")
|
||||
repo.register(id="t2", name="t2", source_type="bigquery", query_mode="remote")
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch("app.api.sync._materialize_table") as mock_mat:
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
mock_mat.assert_not_called()
|
||||
assert summary == {"materialized": [], "skipped": [], "errors": []}
|
||||
|
||||
|
||||
def test_materialized_pass_collects_errors_per_row(system_db, stub_bq, tmp_path):
|
||||
"""One row failing must not stop a healthy sibling."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="ok", name="ok", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
repo.register(
|
||||
id="bad", name="bad", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT broken",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "ok.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
def _fake(table_id, sql, bq, output_dir, max_bytes):
|
||||
if table_id == "bad":
|
||||
raise RuntimeError("simulated COPY failure")
|
||||
return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
|
||||
|
||||
with patch("app.api.sync._materialize_table", side_effect=_fake):
|
||||
summary = sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
assert summary["materialized"] == ["ok"]
|
||||
assert len(summary["errors"]) == 1
|
||||
assert summary["errors"][0]["table"] == "bad"
|
||||
assert "simulated" in summary["errors"][0]["error"]
|
||||
|
||||
|
||||
def test_materialized_pass_records_parquet_hash(system_db, stub_bq, tmp_path):
|
||||
"""sync_state.hash must be the MD5 of the parquet file — otherwise the
|
||||
manifest reports an empty hash and every da sync re-downloads."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="hashed", name="hashed",
|
||||
source_type="bigquery", query_mode="materialized",
|
||||
source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
parquet_path = parquet_dir / "hashed.parquet"
|
||||
|
||||
def _fake(**kwargs):
|
||||
parquet_path.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
|
||||
return {"rows": 1, "size_bytes": 24, "query_mode": "materialized"}
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch("app.api.sync._materialize_table", side_effect=_fake):
|
||||
sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
state = SyncStateRepository(system_db)
|
||||
row = state.get_table_state("hashed")
|
||||
assert row is not None
|
||||
import hashlib
|
||||
expected = hashlib.md5(b"PAR1" + b"\x00" * 16 + b"PAR1").hexdigest()
|
||||
assert row["hash"] == expected
|
||||
|
||||
|
||||
def test_run_sync_keboola_timeout_does_not_skip_materialized(tmp_path, monkeypatch):
|
||||
"""REGRESSION (Devin BUG_0001 on 2219255): when the Keboola extractor
|
||||
subprocess raises TimeoutExpired, the materialized BQ pass and the
|
||||
orchestrator rebuild must still fire. Pre-fix the timeout propagated
|
||||
to the outer except handler and skipped the rest of _run_sync — on
|
||||
a dual-source deployment a slow Keboola extractor would silently
|
||||
block all materialized parquets and the master-view rebuild until
|
||||
the next trigger."""
|
||||
import duckdb
|
||||
import subprocess as _sp
|
||||
from src.db import _ensure_schema
|
||||
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
_ensure_schema(conn)
|
||||
|
||||
repo = TableRegistryRepository(conn)
|
||||
# One Keboola row (drives the extractor subprocess) + one materialized
|
||||
# BQ row (must still run after the Keboola timeout).
|
||||
repo.register(
|
||||
id="kbc_a", name="kbc_a", source_type="keboola",
|
||||
query_mode="local", bucket="in.c-foo", source_table="a",
|
||||
)
|
||||
repo.register(
|
||||
id="m1", name="m1", source_type="bigquery",
|
||||
query_mode="materialized",
|
||||
source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
conn.close()
|
||||
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
|
||||
monkeypatch.setenv("KEBOOLA_STACK_URL", "https://example.invalid")
|
||||
monkeypatch.setenv("KEBOOLA_STORAGE_TOKEN", "fake")
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
# Subprocess raises TimeoutExpired the moment _run_sync calls it.
|
||||
def _timeout(*a, **kw):
|
||||
raise _sp.TimeoutExpired(cmd=["fake"], timeout=1)
|
||||
|
||||
monkeypatch.setattr(sync_mod.subprocess, "run", _timeout)
|
||||
|
||||
materialized_called = {"count": 0}
|
||||
orchestrator_called = {"count": 0}
|
||||
|
||||
def _spy_materialized(_conn, _bq):
|
||||
materialized_called["count"] += 1
|
||||
return {"materialized": ["m1"], "skipped": [], "errors": []}
|
||||
|
||||
class _OrchStub:
|
||||
def rebuild(self):
|
||||
orchestrator_called["count"] += 1
|
||||
return {}
|
||||
|
||||
monkeypatch.setattr("app.api.sync._run_materialized_pass", _spy_materialized)
|
||||
monkeypatch.setattr(
|
||||
"src.orchestrator.SyncOrchestrator", lambda *a, **kw: _OrchStub(),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.get_data_source_type", lambda: "keboola",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.get_value",
|
||||
lambda *args, **kw: (
|
||||
"my-bq-proj" if (args and args[-1] == "project")
|
||||
else kw.get("default", "")
|
||||
),
|
||||
)
|
||||
|
||||
sync_mod._run_sync()
|
||||
|
||||
assert materialized_called["count"] == 1, (
|
||||
"materialized pass must run even when Keboola subprocess timed out"
|
||||
)
|
||||
assert orchestrator_called["count"] == 1, (
|
||||
"orchestrator rebuild must run after Keboola timeout to publish "
|
||||
"any partial / materialized parquets that did land"
|
||||
)
|
||||
|
||||
|
||||
def test_run_sync_runs_materialized_pass_on_bq_only_deployment(
|
||||
tmp_path, monkeypatch,
|
||||
):
|
||||
"""REGRESSION (Devin BUG_0002 on 2fa44f2): on BigQuery-only deployments
|
||||
`list_local('bigquery')` is always empty (BQ rows are remote or
|
||||
materialized, never local). The pre-fix _run_sync early-returned in
|
||||
that case → materialized pass + orchestrator rebuild were dead code.
|
||||
Post-fix: run_extractor_subprocess flag skips just the Keboola
|
||||
subprocess, and the materialized pass still fires."""
|
||||
import duckdb
|
||||
from src.db import _ensure_schema
|
||||
|
||||
db_path = tmp_path / "system.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
_ensure_schema(conn)
|
||||
|
||||
repo = TableRegistryRepository(conn)
|
||||
# Materialized BQ row — would be invisible to list_local('bigquery').
|
||||
repo.register(
|
||||
id="m1", name="m1", source_type="bigquery",
|
||||
query_mode="materialized",
|
||||
source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
conn.close()
|
||||
|
||||
monkeypatch.setenv("DATA_DIR", str(tmp_path / "data"))
|
||||
|
||||
# Patch the heavy collaborators so we observe what _run_sync invoked
|
||||
# without actually running BQ / orchestrator.
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
materialized_called = {"count": 0}
|
||||
orchestrator_called = {"count": 0}
|
||||
|
||||
def _spy_materialized_pass(_conn, _bq):
|
||||
materialized_called["count"] += 1
|
||||
return {"materialized": ["m1"], "skipped": [], "errors": []}
|
||||
|
||||
class _OrchStub:
|
||||
def rebuild(self):
|
||||
orchestrator_called["count"] += 1
|
||||
return {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.sync._run_materialized_pass",
|
||||
_spy_materialized_pass,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.orchestrator.SyncOrchestrator",
|
||||
lambda *a, **kw: _OrchStub(),
|
||||
)
|
||||
# Pretend instance.yaml says data_source.type=bigquery
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.get_data_source_type",
|
||||
lambda: "bigquery",
|
||||
)
|
||||
# bq_project must be truthy so the materialized pass branch fires.
|
||||
real_get_value = sync_mod.__dict__.get("get_value")
|
||||
monkeypatch.setattr(
|
||||
"app.instance_config.get_value",
|
||||
lambda *args, **kw: (
|
||||
"my-bq-proj" if (args and args[-1] == "project")
|
||||
else kw.get("default", "")
|
||||
),
|
||||
)
|
||||
|
||||
sync_mod._run_sync()
|
||||
|
||||
assert materialized_called["count"] == 1, (
|
||||
"materialized pass must run on BQ-only deployment (no local rows)"
|
||||
)
|
||||
assert orchestrator_called["count"] == 1, (
|
||||
"orchestrator rebuild must run so materialized parquets are picked up"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("yaml_value, expected_max", [
|
||||
(10737418240, 10737418240), # int — canonical
|
||||
(10737418240.0, 10737418240), # float — YAML often parses as float
|
||||
(1e10, 10000000000), # scientific notation
|
||||
("10737418240", 10737418240), # string — coerced
|
||||
(0, None), # explicit disable sentinel
|
||||
(None, None), # missing key
|
||||
("not-a-number", None), # malformed → fail-open + warn
|
||||
])
|
||||
def test_materialized_pass_max_bytes_yaml_coercion(
|
||||
system_db, stub_bq, tmp_path, monkeypatch, yaml_value, expected_max,
|
||||
):
|
||||
"""`max_bytes_per_materialize` YAML value is coerced to int regardless of
|
||||
the YAML scalar type (int / float / scientific / string). Devin found
|
||||
that an `isinstance(raw, int)` guard silently disabled the guardrail
|
||||
on float values."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="t", name="t", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "t.parquet").write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")
|
||||
|
||||
captured = {}
|
||||
|
||||
def _spy(table_id, sql, bq, output_dir, max_bytes):
|
||||
captured["max_bytes"] = max_bytes
|
||||
return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
with patch(
|
||||
"app.instance_config.get_value",
|
||||
side_effect=lambda *a, **kw: (
|
||||
yaml_value if a[-1] == "max_bytes_per_materialize"
|
||||
else kw.get("default", "")
|
||||
),
|
||||
), patch("app.api.sync._materialize_table", side_effect=_spy):
|
||||
sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
assert captured["max_bytes"] == expected_max
|
||||
|
||||
|
||||
def test_materialized_pass_keys_sync_state_by_name_not_id(
|
||||
system_db, stub_bq, tmp_path,
|
||||
):
|
||||
"""Devin review: when admin registers a name with mixed case (e.g.
|
||||
"Orders_90d") the slug-derived id ("orders_90d") differs from name.
|
||||
sync_state must be keyed by `name` so the manifest's `registry_by_name`
|
||||
lookup resolves and `query_mode='materialized'` flows through to the
|
||||
client. Otherwise CLI sees `query_mode='local'` and downloads the
|
||||
wrong file or skips the row."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
# Mixed-case name — id will be slugified to lowercase by the API path,
|
||||
# but at the repo level we control both directly.
|
||||
repo.register(
|
||||
id="orders_90d", name="Orders_90d",
|
||||
source_type="bigquery", query_mode="materialized",
|
||||
source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
# Pre-create the parquet at the NAME-keyed path.
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "Orders_90d.parquet").write_bytes(
|
||||
b"PAR1" + b"\x00" * 16 + b"PAR1"
|
||||
)
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
captured = {}
|
||||
|
||||
def _spy(table_id, sql, bq, output_dir, max_bytes):
|
||||
captured["table_id"] = table_id
|
||||
return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
|
||||
|
||||
with patch("app.api.sync._materialize_table", side_effect=_spy):
|
||||
sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
# materialize_query was called with the NAME, not the id.
|
||||
assert captured["table_id"] == "Orders_90d"
|
||||
|
||||
# sync_state row keyed by name.
|
||||
state = SyncStateRepository(system_db)
|
||||
name_row = state.get_table_state("Orders_90d")
|
||||
id_row = state.get_table_state("orders_90d")
|
||||
assert name_row is not None, "sync_state should be keyed by name"
|
||||
assert id_row is None, "sync_state should NOT be keyed by id"
|
||||
|
||||
|
||||
def test_materialized_pass_zero_max_bytes_disables_guardrail(
|
||||
system_db, stub_bq, tmp_path, monkeypatch
|
||||
):
|
||||
"""`max_bytes_per_materialize: 0` in instance.yaml → None passed downstream
|
||||
so materialize_query skips the dry-run entirely."""
|
||||
repo = TableRegistryRepository(system_db)
|
||||
repo.register(
|
||||
id="big", name="big", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
sync_schedule="every 1m",
|
||||
)
|
||||
|
||||
parquet_dir = tmp_path / "data" / "extracts" / "bigquery" / "data"
|
||||
parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
(parquet_dir / "big.parquet").write_bytes(
|
||||
b"PAR1" + b"\x00" * 16 + b"PAR1"
|
||||
)
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.api.sync.get_value",
|
||||
lambda *args, **kwargs: 0 if args[-1] == "max_bytes_per_materialize" else "",
|
||||
raising=False,
|
||||
)
|
||||
|
||||
from app.api import sync as sync_mod
|
||||
|
||||
captured = {}
|
||||
|
||||
def _spy(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return {"rows": 1, "size_bytes": 100, "query_mode": "materialized"}
|
||||
|
||||
# The function reads `get_value` via a local import in the body — patch
|
||||
# the import target instead.
|
||||
with patch(
|
||||
"app.instance_config.get_value",
|
||||
side_effect=lambda *args, **kw: (
|
||||
0 if args[-1] == "max_bytes_per_materialize"
|
||||
else kw.get("default", "")
|
||||
),
|
||||
), patch("app.api.sync._materialize_table", side_effect=_spy):
|
||||
sync_mod._run_materialized_pass(system_db, stub_bq)
|
||||
|
||||
assert captured["max_bytes"] is None
|
||||
93
tests/test_table_registry_source_query.py
Normal file
93
tests/test_table_registry_source_query.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""Repository round-trips source_query column for query_mode='materialized'.
|
||||
|
||||
Lives alongside the schema-v20 migration: register() now accepts source_query
|
||||
as an Optional[str] kwarg, and the column flows through SELECT * via list/get.
|
||||
"""
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from src.db import _ensure_schema
|
||||
from src.repositories.table_registry import TableRegistryRepository
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def repo(tmp_path):
|
||||
conn = duckdb.connect(str(tmp_path / "system.duckdb"))
|
||||
_ensure_schema(conn)
|
||||
return TableRegistryRepository(conn)
|
||||
|
||||
|
||||
def test_register_persists_source_query(repo):
|
||||
repo.register(
|
||||
id="orders_90d",
|
||||
name="orders_90d",
|
||||
source_type="bigquery",
|
||||
query_mode="materialized",
|
||||
source_query=(
|
||||
"SELECT date, SUM(revenue) FROM `prj.ds.orders` "
|
||||
"WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) GROUP BY 1"
|
||||
),
|
||||
sync_schedule="every 6h",
|
||||
)
|
||||
row = repo.get("orders_90d")
|
||||
assert row is not None
|
||||
assert row["query_mode"] == "materialized"
|
||||
assert "INTERVAL 90 DAY" in row["source_query"]
|
||||
assert row["sync_schedule"] == "every 6h"
|
||||
|
||||
|
||||
def test_register_omitted_source_query_stays_null(repo):
|
||||
"""Default registrations (Keboola local) must not stamp an empty string."""
|
||||
repo.register(id="t1", name="t1", source_type="keboola", query_mode="local")
|
||||
row = repo.get("t1")
|
||||
assert row is not None
|
||||
assert row["source_query"] is None
|
||||
|
||||
|
||||
def test_list_all_includes_source_query(repo):
|
||||
repo.register(
|
||||
id="m1", name="m1", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
)
|
||||
rows = repo.list_all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["source_query"] == "SELECT 1"
|
||||
|
||||
|
||||
def test_register_updates_source_query_on_conflict(repo):
|
||||
"""Re-registering the same id must overwrite source_query (admin edit)."""
|
||||
repo.register(
|
||||
id="m1", name="m1", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
)
|
||||
repo.register(
|
||||
id="m1", name="m1", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 2",
|
||||
)
|
||||
row = repo.get("m1")
|
||||
assert row["source_query"] == "SELECT 2"
|
||||
|
||||
|
||||
def test_register_preserves_registered_at_when_supplied(repo):
|
||||
"""source_query addition must not break the existing registered_at
|
||||
preservation contract (admin edits keep the original timestamp)."""
|
||||
from datetime import datetime, timezone
|
||||
|
||||
original = datetime(2026, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
|
||||
repo.register(
|
||||
id="t", name="t", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 1",
|
||||
registered_at=original,
|
||||
)
|
||||
repo.register(
|
||||
id="t", name="t", source_type="bigquery",
|
||||
query_mode="materialized", source_query="SELECT 2",
|
||||
registered_at=original,
|
||||
)
|
||||
row = repo.get("t")
|
||||
assert row["source_query"] == "SELECT 2"
|
||||
# Don't assert exact equality on naive vs aware (DuckDB strips tz);
|
||||
# just confirm the year+month+day didn't slide forward to 'now'.
|
||||
assert row["registered_at"].year == 2026
|
||||
assert row["registered_at"].month == 1
|
||||
assert row["registered_at"].day == 1
|
||||
Loading…
Reference in a new issue