Merge pull request #3 from keboola/feature/v2-fastapi-duckdb-docker-cli
feat: remote query — extension re-attach + two-phase BQ+DuckDB engine
This commit is contained in:
commit
dab5c84860
12 changed files with 2266 additions and 5 deletions
16
CLAUDE.md
16
CLAUDE.md
|
|
@ -154,6 +154,22 @@ Before computing any business metric, look up the canonical definition:
|
|||
|
||||
Never invent metric calculations — always use the canonical definitions.
|
||||
|
||||
## Hybrid Queries (BigQuery + Local)
|
||||
|
||||
For tables too large to sync locally, use hybrid queries that JOIN local data with on-demand BigQuery results:
|
||||
|
||||
```bash
|
||||
da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
|
||||
--register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
|
||||
```
|
||||
|
||||
The `--register-bq` flag executes a BigQuery subquery, loads the result into memory, and makes it available as a DuckDB view for the final SQL. Multiple `--register-bq` flags can be used for multiple BQ sources.
|
||||
|
||||
For complex SQL, use stdin mode:
|
||||
```bash
|
||||
echo '{"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}' | da query --stdin
|
||||
```
|
||||
|
||||
## Extensibility
|
||||
|
||||
### Data Sources (extract.duckdb contract)
|
||||
|
|
|
|||
43
app/api/query_hybrid.py
Normal file
43
app/api/query_hybrid.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
"""Hybrid query endpoint — two-phase BQ registration + DuckDB execution."""
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.auth.dependencies import require_admin
|
||||
from src.db import get_analytics_db_readonly
|
||||
from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
|
||||
|
||||
router = APIRouter(prefix="/api/query", tags=["query"])
|
||||
|
||||
|
||||
class HybridQueryRequest(BaseModel):
    """Request body accepted by ``POST /api/query/hybrid``."""

    # Final SQL handed to ``RemoteQueryEngine.execute`` once all aliases exist.
    sql: str

    # Alias -> BigQuery SQL; each entry is passed to ``RemoteQueryEngine.register_bq``.
    register_bq: Dict[str, str] = {}
|
||||
|
||||
|
||||
def _run_hybrid_query(request: HybridQueryRequest) -> dict:
    """Run both phases of a hybrid query synchronously.

    Opens a read-only analytics connection, registers every entry of
    ``request.register_bq`` through ``RemoteQueryEngine.register_bq``, runs
    ``request.sql`` through ``RemoteQueryEngine.execute`` and always closes the
    connection afterwards.

    Raises:
        HTTPException: status 400 when the engine raises ``RemoteQueryError``
            during registration or execution.
    """
    config = load_config()
    analytics = get_analytics_db_readonly()
    try:
        engine = RemoteQueryEngine(
            analytics,
            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
            max_memory_mb=config.get("max_memory_mb", 2048),
            max_result_rows=config.get("max_result_rows", 100_000),
            timeout_seconds=config.get("timeout_seconds", 300),
        )
        for alias, bq_sql in request.register_bq.items():
            try:
                engine.register_bq(alias, bq_sql)
            except RemoteQueryError as e:
                raise HTTPException(
                    status_code=400, detail=f"BQ '{alias}': {e.error_type}: {e}"
                ) from e
        try:
            return engine.execute(request.sql)
        except RemoteQueryError as e:
            raise HTTPException(
                status_code=400, detail=f"Query: {e.error_type}: {e}"
            ) from e
    finally:
        analytics.close()


@router.post("/hybrid")
async def hybrid_query(request: HybridQueryRequest, user: dict = Depends(require_admin)):
    """Admin-only endpoint: register BigQuery subqueries, then run the final SQL.

    The BigQuery and DuckDB calls are blocking, so the whole workflow is pushed
    to a worker thread instead of running on the event loop. ``user`` is unused
    in the body; it exists so the ``require_admin`` dependency is enforced.
    """
    # Imported here so this block does not depend on a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    return await run_in_threadpool(_run_hybrid_query, request)
|
||||
|
|
@ -29,6 +29,7 @@ from app.api.access_requests import router as access_requests_router
|
|||
from app.api.jira_webhooks import router as jira_webhooks_router
|
||||
from app.api.metrics import router as metrics_router
|
||||
from app.api.metadata import router as metadata_router
|
||||
from app.api.query_hybrid import router as query_hybrid_router
|
||||
from app.web.router import router as web_router
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -137,6 +138,7 @@ def create_app() -> FastAPI:
|
|||
app.include_router(jira_webhooks_router)
|
||||
app.include_router(metrics_router)
|
||||
app.include_router(metadata_router)
|
||||
app.include_router(query_hybrid_router)
|
||||
|
||||
# Web UI router (must be last — has catch-all routes)
|
||||
app.include_router(web_router)
|
||||
|
|
|
|||
|
|
@ -2,21 +2,63 @@
|
|||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import typer
|
||||
|
||||
|
||||
def query_command(
    sql: Optional[str] = typer.Argument(None, help="SQL query to execute (positional)"),
    sql_opt: Optional[str] = typer.Option(None, "--sql", help="SQL query to execute (named option)"),
    remote: bool = typer.Option(False, "--remote", help="Execute on server instead of locally"),
    fmt: str = typer.Option("table", "--format", "-f", help="Output format: table, json, csv"),
    limit: int = typer.Option(1000, "--limit", help="Max rows to return"),
    register_bq: Optional[List[str]] = typer.Option(
        None,
        "--register-bq",
        help="Register a BigQuery result as a DuckDB view. Format: alias=BQ_SQL. Can be repeated.",
    ),
    stdin: bool = typer.Option(False, "--stdin", help="Read SQL from stdin as JSON {\"sql\": \"...\"}"),
):
    """Execute SQL query against DuckDB.

    The SQL must come from exactly one of: the positional argument, ``--sql``,
    or ``--stdin`` (a JSON object with a ``"sql"`` string and an optional
    ``"register_bq"`` object). The query is then dispatched to the hybrid,
    remote or local runner, in that order of precedence.
    """
    # Resolve SQL from exactly one of: positional, --sql, or --stdin
    sources_provided = sum([
        sql is not None,
        sql_opt is not None,
        stdin,
    ])
    if sources_provided == 0:
        typer.echo("Error: provide SQL as a positional argument, --sql option, or --stdin flag.", err=True)
        raise typer.Exit(1)
    if sources_provided > 1:
        typer.echo("Error: only one of positional SQL, --sql, or --stdin may be used at a time.", err=True)
        raise typer.Exit(1)

    if stdin:
        try:
            payload = json.loads(sys.stdin.read())
        except json.JSONDecodeError as exc:
            typer.echo(f"Error: failed to parse stdin JSON: {exc}", err=True)
            raise typer.Exit(1) from exc
        # Valid JSON can still be a list/number/string, or carry a non-string
        # "sql"; reject those here instead of failing later with a traceback.
        if not isinstance(payload, dict) or not isinstance(payload.get("sql"), str):
            typer.echo(
                'Error: failed to parse stdin JSON: expected an object with a string "sql" field',
                err=True,
            )
            raise typer.Exit(1)
        resolved_sql = payload["sql"]
        # A non-empty register_bq object from stdin replaces any --register-bq flags.
        stdin_bq = payload.get("register_bq", {})
        if stdin_bq and isinstance(stdin_bq, dict):
            register_bq = [f"{k}={v}" for k, v in stdin_bq.items()]
    elif sql_opt is not None:
        resolved_sql = sql_opt
    else:
        resolved_sql = sql

    if register_bq:
        _query_hybrid(resolved_sql, fmt, limit, register_bq)
    elif remote:
        _query_remote(resolved_sql, fmt, limit)
    else:
        _query_local(resolved_sql, fmt, limit)
|
||||
|
||||
|
||||
def _query_local(sql: str, fmt: str, limit: int):
|
||||
|
|
@ -56,6 +98,61 @@ def _query_remote(sql: str, fmt: str, limit: int):
|
|||
typer.echo(f"(truncated at {limit} rows)", err=True)
|
||||
|
||||
|
||||
def _split_register_bq_spec(spec: str):
    """Split one ``alias=BQ_SQL`` spec into ``(alias, bq_sql)``.

    Prints an error and exits with code 1 when the ``=`` is missing or when
    either side is empty after stripping whitespace.
    """
    alias, separator, bq_sql = spec.partition("=")
    alias = alias.strip()
    bq_sql = bq_sql.strip()
    if not separator or not alias or not bq_sql:
        typer.echo(
            f"Error: --register-bq spec must be 'alias=BQ_SQL', got: {spec!r}",
            err=True,
        )
        raise typer.Exit(1)
    return alias, bq_sql


def _query_hybrid(sql: str, fmt: str, limit: int, register_bq_specs: List[str]):
    """Run a hybrid query: register BigQuery results as DuckDB views, then execute locally.

    Args:
        sql: Final SQL executed by the engine after all aliases are registered.
        fmt: Output format forwarded to ``_output``.
        limit: Row cap; overrides the configured ``max_result_rows``.
        register_bq_specs: ``alias=BQ_SQL`` strings, one per BigQuery source.

    Exits with code 1 when the local database is missing, a spec is malformed,
    or the engine raises ``RemoteQueryError``.
    """
    import duckdb
    from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config

    local_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
    db_path = local_dir / "user" / "duckdb" / "analytics.duckdb"
    if not db_path.exists():
        typer.echo("Local DuckDB not found. Run: da sync", err=True)
        raise typer.Exit(1)

    # Validate every spec before any BigQuery call, so a malformed later spec
    # cannot fail the command after earlier subqueries have already run.
    parsed_specs = [_split_register_bq_spec(spec) for spec in register_bq_specs]

    conn = duckdb.connect(str(db_path), read_only=True)
    try:
        config = load_config()
        engine_kwargs = {k: v for k, v in config.items() if k in (
            "max_bq_registration_rows", "max_memory_mb", "max_result_rows", "timeout_seconds"
        )}
        # CLI --limit flag overrides config max_result_rows
        engine_kwargs["max_result_rows"] = limit
        engine = RemoteQueryEngine(conn, **engine_kwargs)

        for alias, bq_sql in parsed_specs:
            try:
                info = engine.register_bq(alias, bq_sql)
            except RemoteQueryError as exc:
                typer.echo(f"BQ registration failed for '{alias}': {exc}", err=True)
                raise typer.Exit(1) from exc
            typer.echo(
                f"Registered BQ alias '{alias}': {info['rows']:,} rows, "
                f"{info['memory_mb']:.1f} MiB",
                err=True,
            )

        try:
            result = engine.execute(sql)
        except RemoteQueryError as exc:
            typer.echo(f"Query error: {exc}", err=True)
            raise typer.Exit(1) from exc

        _output(result["columns"], result["rows"], fmt)
        if result.get("truncated"):
            typer.echo(f"(truncated at {result['row_count']} rows)", err=True)
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _output(columns: list, rows: list, fmt: str):
|
||||
if fmt == "json":
|
||||
output = [dict(zip(columns, row)) for row in rows]
|
||||
|
|
|
|||
923
docs/superpowers/plans/2026-04-11-remote-query.md
Normal file
923
docs/superpowers/plans/2026-04-11-remote-query.md
Normal file
|
|
@ -0,0 +1,923 @@
|
|||
# Remote Query Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Fix BigQuery extension re-attach so remote views work, then add a two-phase query engine that JOINs local Parquet data with on-demand BigQuery subquery results.
|
||||
|
||||
**Architecture:** Part 1 patches `get_analytics_db_readonly()` to re-load extensions from `_remote_attach` tables. Part 2 adds `RemoteQueryEngine`, which wraps the BQ client with safety limits (a `COUNT(*)` pre-check before fetching, then a check of the actual Arrow table size after fetching), registers the Arrow results in DuckDB, and then executes the final SQL. Exposed via the `da query --register-bq` CLI and the `POST /api/query/hybrid` API.
|
||||
|
||||
**Tech Stack:** DuckDB, google-cloud-bigquery, PyArrow, FastAPI, Typer
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-04-11-remote-query-design.md`
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Fix Extension Re-attach in `get_analytics_db_readonly()`
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/db.py:253-282` (get_analytics_db_readonly)
|
||||
- Test: `tests/test_db.py`
|
||||
|
||||
- [ ] **Step 1: Write failing test**
|
||||
|
||||
Add to `tests/test_db.py`:
|
||||
|
||||
```python
|
||||
class TestExtensionReattach:
    """Resilience checks: extract databases must never break the read-only connection."""

    @staticmethod
    def _prepare_data_dir(tmp_path, monkeypatch, extract_name, extract_statements):
        """Point DATA_DIR at ``tmp_path`` and create the analytics DB plus one extract DB."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import duckdb

        # Create analytics DB
        analytics_dir = tmp_path / "analytics"
        analytics_dir.mkdir()
        duckdb.connect(str(analytics_dir / "server.duckdb")).close()

        # Create the extract.duckdb and run the caller-supplied setup SQL
        extract_dir = tmp_path / "extracts" / extract_name
        extract_dir.mkdir(parents=True)
        extract_db = duckdb.connect(str(extract_dir / "extract.duckdb"))
        for statement in extract_statements:
            extract_db.execute(statement)
        extract_db.close()

    @staticmethod
    def _assert_connection_usable():
        """Open the read-only analytics connection and run a trivial query on it."""
        from src.db import get_analytics_db_readonly

        analytics = get_analytics_db_readonly()
        try:
            row = analytics.execute("SELECT 1").fetchone()
            assert row[0] == 1
        finally:
            analytics.close()

    def test_reads_remote_attach_table(self, tmp_path, monkeypatch):
        """Verify get_analytics_db_readonly() attempts to load extensions from _remote_attach."""
        self._prepare_data_dir(
            tmp_path,
            monkeypatch,
            "testbq",
            [
                """
                CREATE TABLE _remote_attach (
                    alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR
                )
                """,
                "INSERT INTO _remote_attach VALUES ('bq', 'bigquery', 'project=test', '')",
            ],
        )
        # The bigquery extension is not expected to load in the test env;
        # the connection must still be usable afterwards.
        self._assert_connection_usable()

    def test_skips_missing_remote_attach(self, tmp_path, monkeypatch):
        """Extract without _remote_attach should not cause errors."""
        self._prepare_data_dir(
            tmp_path,
            monkeypatch,
            "plain",
            ["CREATE TABLE _meta (name VARCHAR)"],
        )
        self._assert_connection_usable()
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails (or passes — these are resilience tests)**
|
||||
|
||||
Run: `pytest tests/test_db.py::TestExtensionReattach -v`
|
||||
Expected: Both tests likely PASS already (graceful failures). That's fine — the real value is ensuring the re-attach code doesn't break anything.
|
||||
|
||||
- [ ] **Step 3: Implement extension re-attach**
|
||||
|
||||
In `src/db.py`, modify `get_analytics_db_readonly()`. After the existing ATTACH loop (line ~279), before the `return conn` (line ~282), add:
|
||||
|
||||
```python
|
||||
# Re-attach remote extensions (BigQuery, Keboola, etc.)
|
||||
if extracts_dir.exists():
|
||||
_reattach_remote_extensions(conn, extracts_dir)
|
||||
```
|
||||
|
||||
Add this helper function before `get_analytics_db_readonly()`:
|
||||
|
||||
```python
|
||||
def _reattach_remote_extensions(
    conn: duckdb.DuckDBPyConnection, extracts_dir: Path
) -> None:
    """Re-load extensions from _remote_attach tables in extract.duckdb files.

    For every safely-named sub-directory of ``extracts_dir`` that exposes a
    ``_remote_attach`` table on ``conn``, each row ``(alias, extension, url,
    token_env)`` is turned into ``LOAD <extension>`` plus ``ATTACH``. The
    function is best-effort: a failure is logged at debug level and never
    propagates, so the caller's connection stays usable.

    Args:
        conn: Connection on which the extract databases are already reachable
            under their directory names.
        extracts_dir: Directory containing one sub-directory per extract.
    """
    try:
        already_attached = {
            row[0]
            for row in conn.execute(
                "SELECT database_name FROM duckdb_databases()"
            ).fetchall()
        }
    except Exception as e:
        logger.debug("Could not list attached databases: %s", e)
        already_attached = set()

    for ext_dir in sorted(extracts_dir.iterdir()):
        if not ext_dir.is_dir() or not _SAFE_IDENTIFIER.match(ext_dir.name):
            continue
        source = ext_dir.name

        try:
            # An attached database is reported as a catalog, not a schema;
            # accept either so the lookup works however the extract is exposed.
            # NOTE(review): confirm against the ATTACH loop in the caller.
            has_table = conn.execute(
                "SELECT 1 FROM information_schema.tables "
                "WHERE (table_catalog = ? OR table_schema = ?) "
                "AND table_name = '_remote_attach' LIMIT 1",
                [source, source],
            ).fetchone()
            if not has_table:
                continue
            # ``source`` passed _SAFE_IDENTIFIER above, so interpolation is safe.
            rows = conn.execute(
                f"SELECT alias, extension, url, token_env FROM {source}._remote_attach"
            ).fetchall()
        except Exception as e:
            logger.debug("Could not read %s._remote_attach: %s", source, e)
            continue

        for alias, extension, url, token_env in rows:
            # NULL or non-string cells would make the regex match raise TypeError.
            if not isinstance(alias, str) or not isinstance(extension, str):
                continue
            if not isinstance(url, str) or not url:
                continue
            if alias in already_attached:
                continue
            if not _SAFE_IDENTIFIER.match(alias) or not _SAFE_IDENTIFIER.match(extension):
                continue

            token = os.environ.get(token_env, "") if token_env else ""
            # The URL comes from extract data; escape quotes so it cannot
            # terminate the string literal inside the ATTACH statement.
            escaped_url = url.replace("'", "''")

            try:
                conn.execute(f"LOAD {extension};")
                if token:
                    escaped_token = token.replace("'", "''")
                    conn.execute(
                        f"ATTACH '{escaped_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
                    )
                else:
                    conn.execute(
                        f"ATTACH '{escaped_url}' AS {alias} (TYPE {extension}, READ_ONLY)"
                    )
                already_attached.add(alias)
                logger.info("Re-attached remote source %s via %s", alias, extension)
            except Exception as e:
                logger.debug("Could not re-attach %s: %s", alias, e)
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run tests**
|
||||
|
||||
Run: `pytest tests/test_db.py -v`
|
||||
Expected: ALL PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add src/db.py tests/test_db.py
|
||||
git commit -m "fix: re-attach remote extensions in get_analytics_db_readonly()"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: RemoteQueryEngine Core
|
||||
|
||||
**Files:**
|
||||
- Create: `src/remote_query.py`
|
||||
- Test: `tests/test_remote_query.py`
|
||||
|
||||
- [ ] **Step 1: Write failing tests**
|
||||
|
||||
Create `tests/test_remote_query.py`:
|
||||
|
||||
```python
|
||||
"""Tests for RemoteQueryEngine."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def analytics_conn(tmp_path):
    """Yield an in-memory DuckDB connection holding a two-row ``orders`` table."""
    setup_statements = (
        "CREATE TABLE orders (id INT, date DATE, amount DECIMAL(10,2))",
        "INSERT INTO orders VALUES (1, '2026-01-01', 100.0), (2, '2026-01-15', 200.0)",
    )
    db = duckdb.connect()
    for statement in setup_statements:
        db.execute(statement)
    yield db
    db.close()
|
||||
|
||||
|
||||
def _mock_bq_arrow_table():
    """Build the two-row Arrow table the tests use as a stand-in BigQuery result."""
    import pyarrow as pa

    columns = {
        "date": ["2026-01-01", "2026-01-15"],
        "pageviews": [1000, 2000],
    }
    return pa.table(columns)
|
||||
|
||||
|
||||
class TestRemoteQueryEngineRegister:
    """Tests for ``RemoteQueryEngine.register_bq`` using a mocked BigQuery client."""

    @staticmethod
    def _count_job(row_count):
        """Return a mocked COUNT(*) job whose result reports ``row_count``."""
        count_result = MagicMock()
        count_result.fetchone.return_value = (row_count,)
        count_job = MagicMock()
        count_job.result.return_value = count_result
        return count_job

    def test_register_bq_success(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine

        data_job = MagicMock()
        data_job.to_arrow.return_value = _mock_bq_arrow_table()
        mock_client = MagicMock()
        # First query() call is the COUNT pre-check, second is the data query.
        mock_client.query.side_effect = [self._count_job(2), data_job]

        engine = RemoteQueryEngine(analytics_conn, _bq_client_factory=lambda: mock_client)
        stats = engine.register_bq("traffic", "SELECT date, pageviews FROM dataset.web")

        assert stats["alias"] == "traffic"
        assert stats["rows"] == 2
        # Verify the view is usable
        result = analytics_conn.execute("SELECT * FROM traffic").fetchall()
        assert len(result) == 2

    def test_register_bq_row_limit_exceeded(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine, RemoteQueryError

        mock_client = MagicMock()
        mock_client.query.return_value = self._count_job(999999)

        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda: mock_client,
            max_bq_registration_rows=1000,
        )
        # pytest's ``match=`` searches str(exc), which holds the message only;
        # the category lives in ``error_type``, so assert on that attribute.
        with pytest.raises(RemoteQueryError) as exc_info:
            engine.register_bq("big", "SELECT * FROM huge_table")
        assert exc_info.value.error_type == "row_limit"

    def test_register_bq_missing_package(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine, RemoteQueryError

        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=None,  # Will try real import
        )
        with patch.dict("sys.modules", {"google.cloud.bigquery": None}):
            with pytest.raises(RemoteQueryError) as exc_info:
                engine.register_bq("x", "SELECT 1")
        assert exc_info.value.error_type == "bq_error"
|
||||
|
||||
|
||||
class TestRemoteQueryEngineExecute:
    """Tests for ``RemoteQueryEngine.execute`` against the in-memory fixture."""

    def test_execute_local_only(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine

        engine = RemoteQueryEngine(analytics_conn)
        result = engine.execute("SELECT id, amount FROM orders ORDER BY id")
        assert result["columns"] == ["id", "amount"]
        assert len(result["rows"]) == 2
        assert result["row_count"] == 2
        assert result["truncated"] is False

    def test_execute_with_registered_bq(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine
        import pyarrow as pa

        # Manually register an Arrow table (simulating BQ result)
        traffic = pa.table({"date": ["2026-01-01", "2026-01-15"], "views": [100, 200]})
        analytics_conn.register("traffic", traffic)

        engine = RemoteQueryEngine(analytics_conn)
        result = engine.execute(
            "SELECT o.id, t.views FROM orders o JOIN traffic t ON CAST(o.date AS VARCHAR) = t.date ORDER BY o.id"
        )
        assert len(result["rows"]) == 2
        assert result["columns"] == ["id", "views"]

    def test_execute_respects_max_result_rows(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine

        engine = RemoteQueryEngine(analytics_conn, max_result_rows=1)
        result = engine.execute("SELECT * FROM orders")
        assert len(result["rows"]) == 1
        assert result["truncated"] is True

    def test_execute_invalid_sql(self, analytics_conn):
        from src.remote_query import RemoteQueryEngine, RemoteQueryError

        engine = RemoteQueryEngine(analytics_conn)
        # ``match=`` would search the message ("Blocked SQL keyword: drop"),
        # which never contains the error type; check the attribute instead.
        with pytest.raises(RemoteQueryError) as exc_info:
            engine.execute("DROP TABLE orders")
        assert exc_info.value.error_type == "query_error"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run tests to verify they fail**
|
||||
|
||||
Run: `pytest tests/test_remote_query.py -v`
|
||||
Expected: FAIL — `ModuleNotFoundError: No module named 'src.remote_query'`
|
||||
|
||||
- [ ] **Step 3: Implement RemoteQueryEngine**
|
||||
|
||||
Create `src/remote_query.py`:
|
||||
|
||||
```python
|
||||
"""Two-phase remote query engine.
|
||||
|
||||
Phase 1: Execute BigQuery subqueries, register results as in-memory Arrow tables.
|
||||
Phase 2: Execute DuckDB query joining local Parquet views with BQ Arrow tables.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import duckdb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# SQL blocklist — reused from app/api/query.py
|
||||
_BLOCKED_KEYWORDS = [
|
||||
"drop ", "delete ", "insert ", "update ", "alter ", "create ",
|
||||
"copy ", "attach ", "detach ", "load ", "install ",
|
||||
"export ", "import ", "pragma ", "call ",
|
||||
"read_csv", "read_json", "read_parquet", "read_text",
|
||||
"write_csv", "write_parquet", "read_blob", "read_ndjson",
|
||||
"parquet_scan", "parquet_metadata", "parquet_schema",
|
||||
"json_scan", "csv_scan",
|
||||
"query_table", "iceberg_scan", "delta_scan",
|
||||
"glob(", "list_files",
|
||||
"'/", '"/', 'http://', 'https://', 's3://', 'gcs://',
|
||||
"information_schema", "duckdb_tables", "duckdb_columns",
|
||||
"duckdb_databases", "duckdb_settings", "duckdb_functions",
|
||||
"duckdb_views", "duckdb_indexes", "duckdb_schemas",
|
||||
"pragma_table_info", "pragma_storage_info",
|
||||
"'../", '"../',
|
||||
";",
|
||||
]
|
||||
|
||||
|
||||
class RemoteQueryError(Exception):
    """Structured error for remote query failures.

    Attributes:
        error_type: Short machine-readable category supplied by the raiser.
        details: Extra context for the failure; always a dict, possibly empty.
    """

    def __init__(self, message: str, error_type: str, details: Optional[dict] = None):
        super().__init__(message)
        self.error_type = error_type
        # Copy so later mutation of the caller's dict cannot alter this error.
        self.details = dict(details) if details else {}
|
||||
|
||||
|
||||
class RemoteQueryEngine:
    """Two-phase query engine: BQ subqueries + DuckDB final query.

    Phase 1 (``register_bq``) runs a BigQuery query, enforces the row and
    memory limits and registers the Arrow result on the DuckDB connection.
    Phase 2 (``execute``) runs the final SQL on that connection.
    """

    def __init__(
        self,
        conn: duckdb.DuckDBPyConnection,
        *,
        _bq_client_factory: Optional[Callable] = None,
        max_bq_registration_rows: int = 500_000,
        max_memory_mb: float = 2048.0,
        max_result_rows: int = 100_000,
        timeout_seconds: int = 300,
    ):
        """Store the connection and the safety limits.

        Args:
            conn: DuckDB connection used for registration and the final query.
            _bq_client_factory: Zero-argument callable returning a BigQuery
                client; when omitted the project helper is used.
            max_bq_registration_rows: Maximum rows a BQ subquery may return.
            max_memory_mb: Maximum Arrow size (MiB) of one registered result.
            max_result_rows: Maximum rows returned by ``execute``.
            timeout_seconds: Wait limit passed to each BigQuery job's ``result()``.
        """
        self.conn = conn
        self._bq_client_factory = _bq_client_factory
        self.max_bq_registration_rows = max_bq_registration_rows
        self.max_memory_mb = max_memory_mb
        self.max_result_rows = max_result_rows
        self.timeout_seconds = timeout_seconds
        self._bq_stats: Dict[str, dict] = {}

    @staticmethod
    def _first_row(rows):
        """Return the first row of a BigQuery result.

        Accepts both a DB-API style object exposing ``fetchone()`` and a plain
        iterable of rows.
        NOTE(review): google-cloud-bigquery's RowIterator is documented as an
        iterator without ``fetchone`` — confirm which type the client returns.
        """
        if hasattr(rows, "fetchone"):
            return rows.fetchone()
        return next(iter(rows))

    def register_bq(self, alias: str, bq_sql: str) -> dict:
        """Execute BQ subquery, register result as in-memory DuckDB view.

        Returns dict with {alias, rows, columns, memory_mb}.
        Raises RemoteQueryError on failure: ``query_error`` for blocked SQL or
        a failed registration, ``bq_error`` for BigQuery failures,
        ``row_limit`` / ``memory_limit`` when a safety limit is exceeded.
        """
        _validate_sql(bq_sql)

        client = self._get_bq_client()

        # Phase 1a: COUNT(*) pre-check, so oversized results are never fetched.
        count_sql = f"SELECT COUNT(*) FROM ({bq_sql})"
        try:
            count_rows = client.query(count_sql).result(timeout=self.timeout_seconds)
            row_count = self._first_row(count_rows)[0]
        except Exception as e:
            raise RemoteQueryError(
                f"BQ COUNT pre-check failed for '{alias}': {e}",
                error_type="bq_error",
                details={"alias": alias},
            ) from e

        if row_count > self.max_bq_registration_rows:
            raise RemoteQueryError(
                f"BQ query '{alias}' returns {row_count:,} rows "
                f"(limit: {self.max_bq_registration_rows:,})",
                error_type="row_limit",
                details={"alias": alias, "rows": row_count, "limit": self.max_bq_registration_rows},
            )

        # Phase 1b: Execute and fetch as Arrow.
        try:
            job = client.query(bq_sql)
            # Bounded wait for completion; the rows are fetched by to_arrow().
            job.result(timeout=self.timeout_seconds)
            try:
                arrow_table = job.to_arrow()
            except Exception as first_error:
                # Retry without the BigQuery Storage client.
                logger.debug("to_arrow() failed for '%s', retrying: %s", alias, first_error)
                arrow_table = job.to_arrow(create_bqstorage_client=False)
        except Exception as e:
            raise RemoteQueryError(
                f"BQ query failed for '{alias}': {e}",
                error_type="bq_error",
                details={"alias": alias},
            ) from e

        # Memory check (actual, not estimated)
        memory_mb = arrow_table.nbytes / (1024 * 1024)
        if memory_mb > self.max_memory_mb:
            raise RemoteQueryError(
                f"BQ result '{alias}' uses {memory_mb:.1f} MB "
                f"(limit: {self.max_memory_mb:.0f} MB)",
                error_type="memory_limit",
                details={"alias": alias, "memory_mb": memory_mb, "limit": self.max_memory_mb},
            )

        try:
            self.conn.register(alias, arrow_table)
        except Exception as e:
            # Surface a failed registration as a structured error too.
            raise RemoteQueryError(
                f"Could not register BQ result as '{alias}': {e}",
                error_type="query_error",
                details={"alias": alias},
            ) from e

        stats = {
            "alias": alias,
            "rows": arrow_table.num_rows,
            "columns": arrow_table.num_columns,
            "memory_mb": round(memory_mb, 3),
        }
        self._bq_stats[alias] = stats
        logger.info("Registered BQ view '%s': %d rows, %.1f MB", alias, arrow_table.num_rows, memory_mb)
        return stats

    def execute(self, sql: str) -> dict:
        """Execute final DuckDB query. Returns {columns, rows, row_count, truncated, bq_stats}.

        At most ``max_result_rows`` rows are returned; ``truncated`` is True
        when more were available. Values other than None/int/float/bool/str are
        converted with ``str()``.
        Raises RemoteQueryError (``query_error``) for blocked or failing SQL.
        """
        _validate_sql(sql)

        try:
            # Fetch one row beyond the cap to detect truncation.
            result = self.conn.execute(sql).fetchmany(self.max_result_rows + 1)
            columns = [desc[0] for desc in self.conn.description] if self.conn.description else []
        except Exception as e:
            raise RemoteQueryError(
                f"Query execution failed: {e}",
                error_type="query_error",
            ) from e

        truncated = len(result) > self.max_result_rows
        rows = result[:self.max_result_rows]

        # Serialize non-standard types
        serializable_rows = [
            [
                str(v) if v is not None and not isinstance(v, (int, float, bool, str)) else v
                for v in row
            ]
            for row in rows
        ]

        return {
            "columns": columns,
            "rows": serializable_rows,
            "row_count": len(serializable_rows),
            "truncated": truncated,
            "bq_stats": dict(self._bq_stats),
        }

    def _get_bq_client(self):
        """Get BigQuery client, using factory or default.

        Raises RemoteQueryError (``bq_error``) when BIGQUERY_PROJECT is unset
        or when importing the client helper fails.
        """
        if self._bq_client_factory:
            return self._bq_client_factory()
        try:
            from scripts.duckdb_manager import _create_bq_client
            project = os.environ.get("BIGQUERY_PROJECT")
            if not project:
                raise RemoteQueryError(
                    "BIGQUERY_PROJECT env var not set",
                    error_type="bq_error",
                )
            return _create_bq_client(project)
        except ImportError as e:
            raise RemoteQueryError(
                "google-cloud-bigquery is not installed. "
                "Install with: pip install google-cloud-bigquery",
                error_type="bq_error",
            ) from e
|
||||
|
||||
|
||||
def _validate_sql(sql: str) -> None:
    """Validate SQL against blocklist. Raises RemoteQueryError.

    The query is lowercased and every whitespace run is collapsed to a single
    space before checking. This way ``SELECT\\n...`` is accepted like
    ``SELECT ...``, and blocklist entries ending in a space also match when the
    keyword is followed by a tab or newline.

    Raises:
        RemoteQueryError: ``query_error`` when a blocked substring is present
            or the query does not start with SELECT or WITH.
    """
    normalized = " ".join(sql.lower().split())
    for keyword in _BLOCKED_KEYWORDS:
        if keyword in normalized:
            raise RemoteQueryError(
                f"Blocked SQL keyword: {keyword.strip()}",
                error_type="query_error",
            )
    if not normalized.startswith(("select ", "with ")):
        raise RemoteQueryError(
            "Query must start with SELECT or WITH",
            error_type="query_error",
        )
|
||||
|
||||
|
||||
def load_config() -> dict:
    """Load remote_query config from instance.yaml.

    Best-effort: returns ``{}`` when the config module cannot be imported, when
    the lookup raises, or when the stored value is missing or not a mapping.
    Callers can therefore always use ``.get()`` / ``.items()`` on the result.
    """
    from collections.abc import Mapping

    try:
        from app.instance_config import get_value
        config = get_value("remote_query")
    except Exception:
        return {}
    # A scalar or list under "remote_query" would break callers; ignore it.
    return config if isinstance(config, Mapping) else {}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run tests**
|
||||
|
||||
Run: `pytest tests/test_remote_query.py -v`
|
||||
Expected: ALL PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add src/remote_query.py tests/test_remote_query.py
|
||||
git commit -m "feat: add RemoteQueryEngine with BQ registration and safety limits"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: CLI `da query --register-bq`
|
||||
|
||||
**Files:**
|
||||
- Modify: `cli/commands/query.py`
|
||||
- Test: `tests/test_cli.py`
|
||||
|
||||
- [ ] **Step 1: Write failing test**
|
||||
|
||||
Add to `tests/test_cli.py`:
|
||||
|
||||
```python
|
||||
class TestQueryHybrid:
    """CLI surface checks for the hybrid query flags."""

    def test_register_bq_flag_help(self):
        """``da query --help`` must succeed and mention the --register-bq option."""
        help_result = runner.invoke(app, ["query", "--help"])
        assert help_result.exit_code == 0
        assert "register-bq" in help_result.output
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `pytest tests/test_cli.py::TestQueryHybrid -v`
|
||||
Expected: FAIL — `register-bq` not in help output
|
||||
|
||||
- [ ] **Step 3: Implement CLI changes**
|
||||
|
||||
Replace `cli/commands/query.py` with:
|
||||
|
||||
```python
|
||||
"""Query commands — da query."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import typer
|
||||
|
||||
|
||||
def query_command(
    sql: Optional[str] = typer.Argument(None, help="SQL query to execute"),
    sql_opt: Optional[str] = typer.Option(None, "--sql", help="SQL query (alternative to positional)"),
    remote: bool = typer.Option(False, "--remote", help="Execute on server instead of locally"),
    register_bq: Optional[List[str]] = typer.Option(None, "--register-bq", help="Register BQ subquery: alias=SQL"),
    stdin: bool = typer.Option(False, "--stdin", help="Read query spec from stdin (JSON)"),
    fmt: str = typer.Option("table", "--format", "-f", help="Output format: table, json, csv"),
    limit: int = typer.Option(1000, "--limit", help="Max rows to return"),
):
    """Execute SQL query against DuckDB. Supports hybrid BQ+local queries.

    SQL comes from ``--stdin`` (a JSON object with ``sql`` and an optional
    ``register_bq`` object), the positional argument, or ``--sql``. The query
    is dispatched to the hybrid, remote or local runner, in that order.
    """
    # Resolve SQL from positional, --sql, or --stdin
    if stdin:
        try:
            spec = json.loads(sys.stdin.read())
        except json.JSONDecodeError as exc:
            typer.echo(f"Error: invalid JSON on stdin: {exc}", err=True)
            raise typer.Exit(1) from exc
        if not isinstance(spec, dict):
            typer.echo("Error: stdin JSON must be an object", err=True)
            raise typer.Exit(1)
        final_sql = spec.get("sql", "")
        stdin_bq = spec.get("register_bq") or {}
        if not isinstance(stdin_bq, dict):
            typer.echo("Error: \"register_bq\" in stdin JSON must be an object", err=True)
            raise typer.Exit(1)
        # Replace the CLI flags only when stdin actually supplies BQ sources.
        if stdin_bq:
            register_bq = [f"{k}={v}" for k, v in stdin_bq.items()]
    else:
        if sql and sql_opt:
            typer.echo("Error: provide SQL as argument or --sql, not both", err=True)
            raise typer.Exit(1)
        final_sql = sql or sql_opt

    if not final_sql:
        typer.echo("Error: provide SQL as argument, --sql, or --stdin", err=True)
        raise typer.Exit(1)

    if register_bq:
        _query_hybrid(final_sql, register_bq, fmt, limit)
    elif remote:
        _query_remote(final_sql, fmt, limit)
    else:
        _query_local(final_sql, fmt, limit)
|
||||
|
||||
|
||||
def _query_hybrid(sql: str, register_bq_specs: List[str], fmt: str, limit: int):
    """Run two-phase hybrid query: BQ subqueries + local DuckDB.

    Args:
        sql: Final SQL executed after all BQ aliases are registered.
        register_bq_specs: ``alias=SQL`` strings, one per BigQuery source.
        fmt: Output format forwarded to ``_output``.
        limit: Row cap passed to the engine as ``max_result_rows``.

    Exits with code 1 when the local database is missing, a spec is malformed,
    or the engine raises ``RemoteQueryError``.
    """
    import duckdb
    from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config

    local_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
    db_path = local_dir / "user" / "duckdb" / "analytics.duckdb"
    if not db_path.exists():
        typer.echo("Local DuckDB not found. Run: da sync", err=True)
        raise typer.Exit(1)

    # Parse every spec before any BigQuery call. A missing "=" or an empty
    # alias/SQL becomes a clean error instead of a ValueError traceback.
    parsed_specs = []
    for spec in register_bq_specs:
        alias, separator, bq_sql = spec.partition("=")
        alias, bq_sql = alias.strip(), bq_sql.strip()
        if not separator or not alias or not bq_sql:
            typer.echo(f"Error: --register-bq must be 'alias=SQL', got: {spec!r}", err=True)
            raise typer.Exit(1)
        parsed_specs.append((alias, bq_sql))

    config = load_config()
    conn = duckdb.connect(str(db_path), read_only=True)
    try:
        engine = RemoteQueryEngine(
            conn,
            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
            max_memory_mb=config.get("max_memory_mb", 2048),
            max_result_rows=limit,
            timeout_seconds=config.get("timeout_seconds", 300),
        )

        # Phase 1: Register BQ subqueries
        for alias, bq_sql in parsed_specs:
            try:
                stats = engine.register_bq(alias, bq_sql)
            except RemoteQueryError as e:
                typer.echo(f"Error registering '{alias}': {e}", err=True)
                raise typer.Exit(1) from e
            typer.echo(f"  BQ '{alias}': {stats['rows']} rows, {stats['memory_mb']} MB", err=True)

        # Phase 2: Execute final query
        try:
            result = engine.execute(sql)
        except RemoteQueryError as e:
            typer.echo(f"Query error: {e}", err=True)
            raise typer.Exit(1) from e

        _output(result["columns"], result["rows"], fmt)
        if result["truncated"]:
            typer.echo(f"(truncated at {limit} rows)", err=True)
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _query_local(sql: str, fmt: str, limit: int):
    """Execute *sql* on the local analytics DuckDB file and print the result.

    The database is opened read-only from
    ``$DA_LOCAL_DIR/user/duckdb/analytics.duckdb`` (``DA_LOCAL_DIR`` defaults
    to the current directory). At most *limit* rows are fetched. Any failure
    while querying or rendering is reported on stderr and exits with code 1.
    """
    import duckdb

    base_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
    duckdb_file = base_dir.joinpath("user", "duckdb", "analytics.duckdb")
    if not duckdb_file.exists():
        typer.echo("Local DuckDB not found. Run: da sync", err=True)
        raise typer.Exit(1)

    connection = duckdb.connect(str(duckdb_file), read_only=True)
    try:
        fetched = connection.execute(sql).fetchmany(limit)
        # description is empty/None for statements that produce no result set.
        header = []
        if connection.description:
            header = [column[0] for column in connection.description]
        _output(header, fetched, fmt)
    except Exception as exc:
        typer.echo(f"Query error: {exc}", err=True)
        raise typer.Exit(1)
    finally:
        connection.close()
|
||||
|
||||
|
||||
def _query_remote(sql: str, fmt: str, limit: int):
    """Run query against server DuckDB via API.

    Posts ``{"sql", "limit"}`` to ``/api/query`` and prints the returned
    ``columns``/``rows``. A non-200 response is reported on stderr and exits
    with code 1.
    """
    from cli.client import api_post

    resp = api_post("/api/query", json={"sql": sql, "limit": limit})
    if resp.status_code != 200:
        # Error bodies are not guaranteed to be JSON objects (e.g. a proxy's
        # HTML error page); fall back to the raw text instead of crashing.
        try:
            detail = resp.json().get("detail", resp.text)
        except (ValueError, AttributeError):
            detail = resp.text
        typer.echo(f"Query failed: {detail}", err=True)
        raise typer.Exit(1)

    data = resp.json()
    _output(data["columns"], data["rows"], fmt)
    if data.get("truncated"):
        typer.echo(f"(truncated at {limit} rows)", err=True)
|
||||
|
||||
|
||||
def _output(columns: list, rows: list, fmt: str):
    """Print a result set in the requested format.

    Args:
        columns: Column names, in result order.
        rows: Sequence of row sequences aligned with *columns*.
        fmt: ``"json"`` or ``"csv"``; any other value renders a rich table.
    """
    if fmt == "json":
        output = [dict(zip(columns, row)) for row in rows]
        typer.echo(json.dumps(output, indent=2, default=str))
    elif fmt == "csv":
        # Local imports keep this block self-contained; both are stdlib.
        import csv
        import io

        # Use the csv module so values containing commas, quotes or newlines
        # are quoted correctly instead of corrupting the column layout.
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(columns)
        writer.writerows(
            ["" if value is None else str(value) for value in row] for row in rows
        )
        # The buffer already ends each record with a newline.
        typer.echo(buffer.getvalue(), nl=False)
    else:
        from rich.console import Console
        from rich.table import Table

        console = Console()
        table = Table()
        for col in columns:
            table.add_column(col)
        for row in rows:
            table.add_row(*(str(v) if v is not None else "" for v in row))
        console.print(table)
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run tests**
|
||||
|
||||
Run: `pytest tests/test_cli.py -v`
|
||||
Expected: ALL PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add cli/commands/query.py tests/test_cli.py
|
||||
git commit -m "feat: add --register-bq and --stdin to da query for hybrid BQ+local queries"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: API Endpoint `POST /api/query/hybrid`
|
||||
|
||||
**Files:**
|
||||
- Create: `app/api/query_hybrid.py`
|
||||
- Modify: `app/main.py` (register router)
|
||||
- Test: `tests/test_api.py`
|
||||
|
||||
- [ ] **Step 1: Write failing tests**
|
||||
|
||||
Add to `tests/test_api.py`:
|
||||
|
||||
```python
|
||||
class TestHybridQueryAPI:
    """Auth and SQL-validation checks for POST /api/query/hybrid."""

    def _post(self, client, token, payload):
        """Send *payload* to the hybrid endpoint authenticated with *token*."""
        return client.post(
            "/api/query/hybrid",
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
        )

    def test_hybrid_query_requires_admin(self, seeded_client):
        client, _, analyst_token = seeded_client
        payload = {"sql": "SELECT 1", "register_bq": {}}
        response = self._post(client, analyst_token, payload)
        assert response.status_code == 403

    def test_hybrid_query_local_only(self, seeded_client):
        """Hybrid endpoint works without BQ registrations (just local query)."""
        client, admin_token, _ = seeded_client
        payload = {"sql": "SELECT 1 AS val", "register_bq": {}}
        response = self._post(client, admin_token, payload)
        assert response.status_code == 200
        body = response.json()
        assert body["columns"] == ["val"]
        assert body["rows"] == [[1]]

    def test_hybrid_query_blocked_sql(self, seeded_client):
        client, admin_token, _ = seeded_client
        payload = {"sql": "DROP TABLE users", "register_bq": {}}
        response = self._post(client, admin_token, payload)
        assert response.status_code == 400

    def test_hybrid_query_blocked_bq_sql(self, seeded_client):
        client, admin_token, _ = seeded_client
        payload = {
            "sql": "SELECT 1",
            "register_bq": {"x": "DROP TABLE something"},
        }
        response = self._post(client, admin_token, payload)
        assert response.status_code == 400
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run tests to verify they fail**
|
||||
|
||||
Run: `pytest tests/test_api.py::TestHybridQueryAPI -v`
|
||||
Expected: FAIL — 404 on `/api/query/hybrid`
|
||||
|
||||
- [ ] **Step 3: Implement API endpoint**
|
||||
|
||||
Create `app/api/query_hybrid.py`:
|
||||
|
||||
```python
|
||||
"""Hybrid query endpoint — two-phase BQ + DuckDB queries."""
|
||||
|
||||
from typing import Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
import duckdb
|
||||
|
||||
from app.auth.dependencies import require_admin, _get_db
|
||||
from src.db import get_analytics_db_readonly
|
||||
from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
|
||||
|
||||
router = APIRouter(prefix="/api/query", tags=["query"])
|
||||
|
||||
|
||||
class HybridQueryRequest(BaseModel):
    # Request body of the hybrid query endpoint.

    # Final SQL statement; the endpoint passes it to the engine's execute().
    sql: str
    # Maps each alias to the BigQuery SQL registered under that alias before
    # the final query runs. Defaults to no registrations.
    register_bq: Dict[str, str] = {}
    # Requested output format, "json" by default.
    # NOTE(review): the endpoint shown in this plan never reads this field — confirm intent.
    format: str = "json"
|
||||
|
||||
|
||||
@router.post("/hybrid")
async def hybrid_query(
    request: HybridQueryRequest,
    user: dict = Depends(require_admin),
):
    """Execute a two-phase hybrid query: BQ subqueries + DuckDB final query."""
    # Limits come from the remote_query config section, falling back to the
    # defaults given inline below.
    config = load_config()
    analytics = get_analytics_db_readonly()
    try:
        engine = RemoteQueryEngine(
            analytics,
            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
            max_memory_mb=config.get("max_memory_mb", 2048),
            max_result_rows=config.get("max_result_rows", 100_000),
            timeout_seconds=config.get("timeout_seconds", 300),
        )

        # Phase 1: Register BQ subqueries, keeping the per-alias stats so they
        # can be reported back to the caller.
        bq_stats = {}
        for alias, bq_sql in request.register_bq.items():
            try:
                bq_stats[alias] = engine.register_bq(alias, bq_sql)
            except RemoteQueryError as e:
                raise HTTPException(
                    status_code=400,
                    detail=f"BQ registration '{alias}' failed: {e.error_type}: {str(e)}",
                )

        # Phase 2: Execute final query
        try:
            result = engine.execute(request.sql)
        except RemoteQueryError as e:
            raise HTTPException(
                status_code=400,
                detail=f"Query failed: {e.error_type}: {str(e)}",
            )

        # Copy rather than mutate the engine's result; setdefault leaves any
        # bq_stats the engine itself may already provide untouched.
        response = dict(result)
        response.setdefault("bq_stats", bq_stats)
        return response
    finally:
        # The endpoint owns the connection and must always release it.
        analytics.close()
|
||||
```
|
||||
|
||||
Register in `app/main.py`:
|
||||
|
||||
```python
|
||||
from app.api.query_hybrid import router as query_hybrid_router
|
||||
# ...
|
||||
app.include_router(query_hybrid_router) # before web_router
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run tests**
|
||||
|
||||
Run: `pytest tests/test_api.py::TestHybridQueryAPI -v`
|
||||
Expected: ALL PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add app/api/query_hybrid.py app/main.py tests/test_api.py
|
||||
git commit -m "feat: add POST /api/query/hybrid endpoint for two-phase BQ+DuckDB queries"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 5: CLAUDE.md + Integration Test
|
||||
|
||||
**Files:**
|
||||
- Modify: `CLAUDE.md`
|
||||
- Test: run full suite
|
||||
|
||||
- [ ] **Step 1: Add hybrid query docs to CLAUDE.md**
|
||||
|
||||
After the "## Business Metrics" section, add:
|
||||
|
||||
```markdown
|
||||
## Hybrid Queries (BigQuery + Local)
|
||||
|
||||
For tables too large to sync locally, use hybrid queries that JOIN local data with on-demand BigQuery results:
|
||||
|
||||
```bash
|
||||
da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
|
||||
--register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
|
||||
```
|
||||
|
||||
The `--register-bq` flag executes a BigQuery subquery, loads the result into memory, and makes it available as a DuckDB view for the final SQL. Multiple `--register-bq` flags can be used for multiple BQ sources.
|
||||
|
||||
For complex SQL, use stdin mode:
|
||||
```bash
|
||||
echo '{"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}' | da query --stdin
|
||||
```
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run full test suite**
|
||||
|
||||
Run: `pytest tests/ -v --timeout=60`
|
||||
Expected: ALL PASS
|
||||
|
||||
- [ ] **Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add CLAUDE.md
|
||||
git commit -m "docs: add hybrid query usage instructions to CLAUDE.md"
|
||||
```
|
||||
205
docs/superpowers/specs/2026-04-11-remote-query-design.md
Normal file
205
docs/superpowers/specs/2026-04-11-remote-query-design.md
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
# Remote Query — Design Spec
|
||||
|
||||
**Date:** 2026-04-11
|
||||
**Status:** Approved
|
||||
**Scope:** Fix extension re-attach + two-phase remote query engine
|
||||
|
||||
## Context
|
||||
|
||||
BigQuery remote views created by the orchestrator don't work at query time because `get_analytics_db_readonly()` opens a fresh connection without re-loading the BigQuery extension. Additionally, the platform lacks the ability to run hybrid queries that JOIN local Parquet data with on-demand BigQuery subquery results.
|
||||
|
||||
The `padak/tmp_oss` v1 repo has `src/remote_query.py` with a two-phase protocol. The existing `scripts/duckdb_manager.py` in this repo already has `register_bq_table()` and `_create_bq_client()` helper functions. The `table_registry` already supports `query_mode` values: `local`, `remote`, `hybrid`.
|
||||
|
||||
**Primary user:** Claude Code agent running `da query` locally, or API consumers via `POST /api/query/hybrid`.
|
||||
|
||||
---
|
||||
|
||||
## Part 1: Fix Extension Re-attach
|
||||
|
||||
### Problem
|
||||
|
||||
`get_analytics_db_readonly()` in `src/db.py` opens analytics.duckdb in read-only mode and ATTACHes extract.duckdb files, but does NOT re-load extensions referenced in `_remote_attach` tables. BigQuery remote views fail with "Catalog Error: bq not found".
|
||||
|
||||
### Solution
|
||||
|
||||
After ATTACHing extract.duckdb files in `get_analytics_db_readonly()`, scan each for a `_remote_attach` table. For each record, re-load the extension and re-attach the remote source.
|
||||
|
||||
**Important: DuckDB read-only LOAD behavior.** The `read_only=True` flag on `duckdb.connect()` blocks writes to the DB file, but `LOAD` writes to the extension cache in `~/.duckdb/extensions/` (separate from the DB file). This should work, but MUST be empirically verified as the first implementation step. If LOAD fails in read-only mode, the workaround is to open the analytics DB WITHOUT `read_only=True` but still use read-only SQL patterns (no INSERT/UPDATE/DELETE), or to call `LOAD` on a separate in-memory connection first (DuckDB extension cache is process-wide).
|
||||
|
||||
Steps for each `_remote_attach` record:
|
||||
1. `LOAD {extension}` — loads pre-installed extension from disk
|
||||
2. Read token from `os.environ[token_env]` if `token_env` is non-empty
|
||||
3. `ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)` — with TOKEN if needed
|
||||
|
||||
If LOAD or ATTACH fails, log a warning and continue — local views still work.
|
||||
|
||||
### Changes
|
||||
|
||||
**File:** `src/db.py` — `get_analytics_db_readonly()` function
|
||||
|
||||
Add ~25 lines after the existing extract.duckdb ATTACH loop. Read `_remote_attach` table from each attached extract DB, collect unique (alias, extension, url, token_env) tuples, and re-attach.
|
||||
|
||||
Pattern follows `src/orchestrator.py:_attach_remote_extensions()` but simplified (no INSTALL — orchestrator pre-installs during rebuild).
|
||||
|
||||
**Concurrency note:** If the orchestrator runs `_atomic_swap_db()` while a read-only connection is open, the existing connection holds a file descriptor to the old inode (Unix semantics). This is safe — the old data remains accessible until the connection is closed.
|
||||
|
||||
---
|
||||
|
||||
## Part 2: Two-Phase Remote Query Engine
|
||||
|
||||
### Architecture
|
||||
|
||||
New module `src/remote_query.py` with a `RemoteQueryEngine` class:
|
||||
|
||||
```python
|
||||
class RemoteQueryEngine:
|
||||
def __init__(self, conn: duckdb.DuckDBPyConnection):
|
||||
"""Takes an existing DuckDB connection (analytics.duckdb with local views)."""
|
||||
|
||||
def register_bq(self, alias: str, bq_sql: str) -> dict:
|
||||
"""Execute BQ subquery, register result as in-memory DuckDB view.
|
||||
Returns {alias, rows, columns, memory_mb}.
|
||||
Raises RemoteQueryError on safety limit violation."""
|
||||
|
||||
def execute(self, sql: str) -> dict:
|
||||
"""Execute final DuckDB query against local + registered BQ views.
|
||||
Returns {columns: [...], rows: [...], row_count: int, truncated: bool}."""
|
||||
```
|
||||
|
||||
### Two-Phase Flow
|
||||
|
||||
1. **Phase 1 — BQ Registration:** For each `register_bq(alias, bq_sql)` call:
|
||||
- COUNT(*) pre-check via Python BQ client → reject if >max_bq_registration_rows
|
||||
- Memory estimate: ~50 bytes/cell × rows × cols → reject if >max_memory_mb. Note: this is approximate. After query completes, use `arrow_table.nbytes` for accurate reporting in `bq_stats`.
|
||||
- Execute BQ query → `job.to_arrow()` → `conn.register(alias, arrow_table)`
|
||||
- Uses `scripts/duckdb_manager.py:_create_bq_client()` for BQ client creation (reuse)
|
||||
- Does NOT delegate to `register_bq_table()` directly — `RemoteQueryEngine.register_bq()` wraps BQ query execution with its own pre-check logic (COUNT, memory estimate), then calls `conn.register(alias, arrow_table)`. The existing `register_bq_table()` has no pre-check capability and would need signature changes to add one. Wrapping is cleaner than modifying shared code.
|
||||
- Gracefully handle missing `google-cloud-bigquery` package: catch `ImportError` and raise `RemoteQueryError(error_type="bq_error", message="google-cloud-bigquery not installed")`
|
||||
|
||||
2. **Phase 2 — DuckDB Query:** Execute final SQL against all views (local Parquet + registered BQ Arrow tables). Apply max_result_rows limit.
|
||||
|
||||
### Safety Limits
|
||||
|
||||
Configurable in `config/instance.yaml` under `remote_query:`:
|
||||
|
||||
```yaml
|
||||
remote_query:
|
||||
max_bq_registration_rows: 500000 # max rows from a single BQ subquery (matches existing instance.yaml.example key)
|
||||
max_memory_mb: 2048 # max estimated memory for BQ result
|
||||
max_result_rows: 100000 # max rows in final result
|
||||
timeout_seconds: 300 # BQ query timeout
|
||||
```
|
||||
|
||||
Note: `max_bq_registration_rows` matches the key already documented in `config/instance.yaml.example`.
|
||||
|
||||
Defaults are hardcoded in `RemoteQueryEngine` and overridden by instance config.
|
||||
|
||||
### Error Handling
|
||||
|
||||
Custom `RemoteQueryError` exception with structured error:
|
||||
|
||||
```python
|
||||
class RemoteQueryError(Exception):
|
||||
def __init__(self, message: str, error_type: str, details: dict = None):
|
||||
# error_type: "row_limit", "memory_limit", "bq_error", "query_error", "timeout"
|
||||
```
|
||||
|
||||
### CLI: `da query` Extension
|
||||
|
||||
Extend existing `cli/commands/query.py`:
|
||||
|
||||
```
|
||||
da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
|
||||
--register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
|
||||
```
|
||||
|
||||
- Multiple `--register-bq` flags allowed (one per BQ alias)
|
||||
- Format: `"alias=BQ_SQL"` (split on first `=`)
|
||||
- `--stdin` mode: reads JSON from stdin for complex SQL:
|
||||
```json
|
||||
{"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}
|
||||
```
|
||||
- Output formats: `table` (default), `csv`, `json`
|
||||
|
||||
**CLI argument handling:** The existing `query_command` has `sql` as a required positional argument. When `--register-bq` is used, `sql` should be provided via `--sql` flag instead (named option, not positional). When `--stdin` is used, both `sql` and `register_bq` come from stdin JSON. Make `sql` an optional positional (`typer.Argument(None)`) and validate that exactly one of (positional sql, --sql flag, --stdin) is provided.
|
||||
|
||||
### API: `POST /api/query/hybrid`
|
||||
|
||||
```
|
||||
POST /api/query/hybrid
|
||||
Authorization: Bearer <admin_token>
|
||||
|
||||
{
|
||||
"register_bq": {
|
||||
"traffic": "SELECT date, SUM(views) FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
|
||||
},
|
||||
"sql": "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date",
|
||||
"format": "json"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"columns": ["order_id", "date", "views"],
|
||||
"rows": [...],
|
||||
"row_count": 1234,
|
||||
"truncated": false,
|
||||
"bq_stats": {
|
||||
"traffic": {"rows": 365, "columns": 2, "memory_mb": 0.03}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Auth:** `require_admin` — BQ queries cost money, only admins can trigger them.
|
||||
|
||||
**Validation — both `register_bq` SQL and final `sql`:**
|
||||
- Apply the same SQL blocklist from `app/api/query.py` (blocks LOAD, ATTACH, INSTALL, read_parquet with paths, path traversal patterns, etc.)
|
||||
- `register_bq` SQL additionally validated as SELECT-only (no INSERT/UPDATE/DELETE/DROP)
|
||||
- Reuse the existing `_validate_sql()` helper from `app/api/query.py` (extract to shared utility if needed)
|
||||
|
||||
**Connection lifecycle:** The API endpoint owns the connection. Pattern:
|
||||
```python
|
||||
analytics = get_analytics_db_readonly()
|
||||
try:
|
||||
engine = RemoteQueryEngine(analytics)
|
||||
# ... register_bq + execute
|
||||
finally:
|
||||
analytics.close()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Summary
|
||||
|
||||
### New Files
|
||||
|
||||
| File | Purpose |
|
||||
|---|---|
|
||||
| `src/remote_query.py` | `RemoteQueryEngine` class + `RemoteQueryError` |
|
||||
| `app/api/query_hybrid.py` | `POST /api/query/hybrid` endpoint |
|
||||
| `tests/test_remote_query.py` | Engine unit tests (mocked BQ client) |
|
||||
|
||||
### Modified Files
|
||||
|
||||
| File | Changes |
|
||||
|---|---|
|
||||
| `src/db.py` | `get_analytics_db_readonly()` — add extension re-attach from `_remote_attach` |
|
||||
| `cli/commands/query.py` | Add `--register-bq` and `--stdin` flags |
|
||||
| `app/main.py` | Register hybrid query router |
|
||||
| `CLAUDE.md` | Document hybrid query usage |
|
||||
|
||||
### Implementation Order
|
||||
|
||||
1. Fix extension re-attach in `src/db.py` (unblocks remote views)
|
||||
2. `RemoteQueryEngine` in `src/remote_query.py` (core logic)
|
||||
3. CLI extension `--register-bq`
|
||||
4. API endpoint `POST /api/query/hybrid`
|
||||
5. CLAUDE.md update + integration tests
|
||||
|
||||
### Test Coverage
|
||||
|
||||
- `tests/test_remote_query.py` — engine tests with mocked BQ client (safety limits, registration, error handling)
|
||||
- `tests/test_db.py` — extension re-attach test (mock _remote_attach table)
|
||||
- `tests/test_api.py` — hybrid query endpoint (auth, validation)
|
||||
- `tests/test_cli.py` — `--register-bq` flag parsing
|
||||
89
src/db.py
89
src/db.py
|
|
@ -250,6 +250,93 @@ def get_analytics_db() -> duckdb.DuckDBPyConnection:
|
|||
return duckdb.connect(str(db_path))
|
||||
|
||||
|
||||
def _reattach_remote_extensions(
    conn: duckdb.DuckDBPyConnection, extracts_dir: Path
) -> None:
    """Re-LOAD DuckDB extensions listed in _remote_attach tables of each extract.duckdb.

    Called from get_analytics_db_readonly() after ATTACHing extract.duckdb files so
    that remote views (e.g. BigQuery) resolve correctly. Uses LOAD only — no INSTALL —
    to avoid touching the network in read-only query paths.

    Best-effort: every failure is logged and skipped, so local views keep working
    even when a remote source cannot be re-attached.

    Args:
        conn: Connection that already has the extract databases attached.
        extracts_dir: Directory whose sub-directories each hold an ``extract.duckdb``.
    """
    if not extracts_dir.exists():
        return

    try:
        attached_dbs = {
            r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()
        }
    except Exception:
        return

    for ext_dir in sorted(extracts_dir.iterdir()):
        if not ext_dir.is_dir():
            continue
        # The directory name is interpolated into SQL below, so it must be a
        # plain identifier.
        if not _SAFE_IDENTIFIER.match(ext_dir.name):
            continue
        db_file = ext_dir / "extract.duckdb"
        if not db_file.exists():
            continue
        # Only process sources that were successfully attached
        if ext_dir.name not in attached_dbs:
            continue

        # Check whether this extract has a _remote_attach table
        try:
            has_table = conn.execute(
                "SELECT 1 FROM information_schema.tables "
                f"WHERE table_catalog='{ext_dir.name}' AND table_name='_remote_attach'"
            ).fetchone()
            if not has_table:
                continue
        except Exception:
            continue

        try:
            rows = conn.execute(
                f"SELECT alias, extension, url, token_env FROM {ext_dir.name}._remote_attach"
            ).fetchall()
        except Exception as e:
            logger.debug("Could not read _remote_attach from %s: %s", ext_dir.name, e)
            continue

        # Refresh attached list before processing each source's rows
        try:
            attached_dbs = {
                r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()
            }
        except Exception:
            pass

        for alias, extension, url, token_env in rows:
            if not _SAFE_IDENTIFIER.match(alias or ""):
                logger.debug("Skipping unsafe remote_attach alias: %r", alias)
                continue
            if not _SAFE_IDENTIFIER.match(extension or ""):
                logger.debug("Skipping unsafe remote_attach extension: %r", extension)
                continue
            if alias in attached_dbs:
                logger.debug("Remote source %s already attached, skipping", alias)
                continue
            if not url:
                # Reject a missing URL explicitly, before LOAD runs, instead of
                # relying on an AttributeError from url.replace() further down.
                logger.warning("Skipping remote source %s: no url in _remote_attach", alias)
                continue
            try:
                conn.execute(f"LOAD {extension};")
                token = os.environ.get(token_env, "") if token_env else ""
                safe_url = url.replace("'", "''")
                if token:
                    escaped_token = token.replace("'", "''")
                    # NOTE(review): this branch attaches without READ_ONLY, unlike
                    # the token-less branch — confirm this is intentional.
                    conn.execute(
                        f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
                    )
                else:
                    conn.execute(
                        f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)"
                    )
                attached_dbs.add(alias)
                logger.debug("Re-attached remote source %s via %s extension", alias, extension)
            except Exception as e:
                # Warn rather than debug: views backed by this source will fail
                # at query time, and the cause should be visible in default logs.
                logger.warning("Could not re-attach remote source %s: %s", alias, e)
|
||||
|
||||
|
||||
def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
|
||||
"""Read-only connection to analytics DB. Blocks writes and external access.
|
||||
|
||||
|
|
@ -277,6 +364,8 @@ def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
|
|||
conn.execute(f"ATTACH '{db_file}' AS {ext_dir.name} (READ_ONLY)")
|
||||
except Exception:
|
||||
pass
|
||||
# Re-attach remote extensions so BigQuery / other remote views resolve.
|
||||
_reattach_remote_extensions(conn, extracts_dir)
|
||||
# Note: external_access stays enabled because views use read_parquet() on local files.
|
||||
# File-path-based attacks are blocked by the SQL blocklist in app/api/query.py.
|
||||
return conn
|
||||
|
|
|
|||
431
src/remote_query.py
Normal file
431
src/remote_query.py
Normal file
|
|
@ -0,0 +1,431 @@
|
|||
"""RemoteQueryEngine — two-phase BQ registration + DuckDB execution.
|
||||
|
||||
Phase 1 (register_bq): validate SQL, COUNT(*) pre-check against BigQuery,
|
||||
fetch Arrow table, check memory, register as DuckDB view.
|
||||
|
||||
Phase 2 (execute): validate SQL, execute against DuckDB (which may reference
|
||||
registered BQ views), serialize and return results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import duckdb
|
||||
|
||||
# Accepts plain identifiers only: a letter or underscore followed by up to 63
# letters, digits or underscores (64 characters in total).
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")

# Aliases that register_bq() refuses (compared lower-cased), so a registered
# view cannot shadow these system object names.
_RESERVED_ALIASES = {
    "information_schema", "duckdb_tables", "duckdb_columns",
    "duckdb_databases", "duckdb_settings", "duckdb_functions",
    "duckdb_views", "duckdb_indexes", "duckdb_schemas",
    "main", "memory", "system", "temp",
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQL blocklist — based on app/api/query.py, extended with additional DuckDB metadata tables
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Each entry is matched by _validate_sql() as a plain substring of the
# lower-cased SQL text. Statement keywords carry a trailing space; function
# names and path/URL prefixes do not.
_BLOCKED_KEYWORDS: List[str] = [
    "drop ",
    "delete ",
    "insert ",
    "update ",
    "alter ",
    "create ",
    "copy ",
    "attach ",
    "detach ",
    "load ",
    "install ",
    "export ",
    "import ",
    "pragma ",
    "call ",
    # File access functions
    "read_csv",
    "read_json",
    "read_parquet",
    "read_text",
    "write_csv",
    "write_parquet",
    "read_blob",
    "read_ndjson",
    "parquet_scan",
    "parquet_metadata",
    "parquet_schema",
    "json_scan",
    "csv_scan",
    "query_table",
    "iceberg_scan",
    "delta_scan",
    "glob(",
    "list_files",
    # Quoted strings that begin with "/" (absolute paths) and remote URL schemes
    "'/",
    '\"/',
    "http://",
    "https://",
    "s3://",
    "gcs://",
    # DuckDB metadata (leaks schema info regardless of RBAC)
    "information_schema",
    "duckdb_tables",
    "duckdb_columns",
    "duckdb_databases",
    "duckdb_settings",
    "duckdb_functions",
    "duckdb_views",
    "duckdb_indexes",
    "duckdb_schemas",
    "pragma_table_info",
    "pragma_storage_info",
    # Relative path traversal
    "'../",
    '"../',
    # Multiple statements
    ";",
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exception
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class RemoteQueryError(Exception):
    """Controlled failure reported by RemoteQueryEngine.

    Attributes:
        error_type: Category of the failure — one of "row_limit",
            "memory_limit", "bq_error", "query_error", "timeout".
        details: Extra context for the failure; an empty dict when none
            was supplied.
    """

    def __init__(
        self,
        message: str,
        error_type: str,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        Exception.__init__(self, message)
        self.error_type = error_type
        # A missing or empty mapping is normalised to a fresh empty dict.
        self.details = details if details else {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _validate_sql(sql: str) -> None:
    """Raise RemoteQueryError if *sql* contains blocked patterns.

    The check is a case-insensitive substring match against
    ``_BLOCKED_KEYWORDS``, followed by a requirement that the statement
    begins with SELECT or WITH.

    Raises:
        RemoteQueryError: with error_type="query_error" if validation fails.
    """
    sql_lower = sql.strip().lower()

    for keyword in _BLOCKED_KEYWORDS:
        if keyword in sql_lower:
            raise RemoteQueryError(
                f"Blocked SQL pattern: {keyword!r}",
                error_type="query_error",
                details={"blocked_keyword": keyword},
            )

    # Accept any whitespace after the leading keyword, so multi-line SQL such
    # as "SELECT\n  col ..." is not rejected just because a newline (rather
    # than a space) follows SELECT/WITH.
    if not re.match(r"(?:select|with)\s", sql_lower):
        raise RemoteQueryError(
            "Query must start with SELECT or WITH",
            error_type="query_error",
        )
|
||||
|
||||
|
||||
# BQ SQL blocklist — only blocks write/mutation operations
|
||||
# Matched by _validate_bq_sql() as plain substrings of the lower-cased BQ SQL.
_BQ_BLOCKED_KEYWORDS = [
    "drop ",
    "delete ",
    "insert ",
    "update ",
    "alter ",
    "create ",
    "truncate ",
    "merge ",
    ";",  # prevent multi-statement
]


def _validate_bq_sql(sql: str) -> None:
    """Validate BQ SQL — narrower than DuckDB blocklist, only blocks writes.

    Raises:
        RemoteQueryError: with error_type="query_error" when *sql* contains a
            blocked keyword or does not begin with SELECT or WITH.
    """
    sql_lower = sql.strip().lower()
    for keyword in _BQ_BLOCKED_KEYWORDS:
        if keyword in sql_lower:
            raise RemoteQueryError(
                f"Blocked BQ SQL keyword: {keyword.strip()}",
                error_type="query_error",
            )
    # Accept any whitespace after the leading keyword, so multi-line SQL such
    # as "SELECT\n  col ..." is not rejected just because a newline (rather
    # than a space) follows SELECT/WITH.
    if not re.match(r"(?:select|with)\s", sql_lower):
        raise RemoteQueryError(
            "BQ query must start with SELECT or WITH",
            error_type="query_error",
        )
|
||||
|
||||
|
||||
def load_config() -> Dict[str, Any]:
    """Return the ``remote_query`` settings from the instance configuration.

    Falls back to an empty dict when the section is absent or empty, or when
    the configuration cannot be loaded at all (including a failed import).
    """
    try:
        from app.instance_config import get_value

        section = get_value("remote_query", default={})
        if not section:
            section = {}
    except Exception:
        section = {}
    return section
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Engine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class RemoteQueryEngine:
    """Two-phase query engine: BQ registration (Phase 1) + DuckDB execution (Phase 2).

    Phase 1 (:meth:`register_bq`) runs a validated query on BigQuery, pulls the
    result into memory as an Arrow table and registers it on the DuckDB
    connection under an alias.  Phase 2 (:meth:`execute`) runs the final SQL on
    that same connection, so registered aliases can be combined with whatever
    the connection already exposes.

    Args:
        conn: Open DuckDB connection used for both view registration and querying.
        _bq_client_factory: Optional callable ``(project: str) -> BQ client``.
            When omitted, a ``google.cloud.bigquery.Client`` is created for the
            project named by the ``BIGQUERY_PROJECT`` environment variable.
        max_bq_registration_rows: Maximum rows allowed in a single BQ registration.
        max_memory_mb: Maximum in-memory Arrow table size (MiB).
        max_result_rows: Maximum rows returned by ``execute()``.
        timeout_seconds: Query timeout (reserved for future use; not enforced yet).
    """

    def __init__(
        self,
        conn: duckdb.DuckDBPyConnection,
        *,
        _bq_client_factory=None,
        max_bq_registration_rows: int = 500_000,
        max_memory_mb: float = 2048.0,
        max_result_rows: int = 100_000,
        timeout_seconds: int = 300,
    ) -> None:
        self._conn = conn
        self._bq_client_factory = _bq_client_factory
        self.max_bq_registration_rows = max_bq_registration_rows
        self.max_memory_mb = max_memory_mb
        self.max_result_rows = max_result_rows
        self.timeout_seconds = timeout_seconds

        # Track which aliases have been registered in this session
        self._registered: Dict[str, Dict[str, Any]] = {}

    # ------------------------------------------------------------------
    # Phase 1
    # ------------------------------------------------------------------

    def register_bq(self, alias: str, bq_sql: str) -> Dict[str, Any]:
        """Register a BigQuery query result as a DuckDB view.

        Steps:
            1. Validate *alias* and validate *bq_sql* against the SQL blocklist.
            2. COUNT(*) pre-check via BQ client.
            3. Execute the actual BQ query and fetch as Arrow table.
            4. Check in-memory size against *max_memory_mb*.
            5. Register Arrow table in DuckDB under *alias*.

        Args:
            alias: DuckDB view name to register (e.g. ``"bq_orders"``).
            bq_sql: SQL query to execute on BigQuery.

        Returns:
            ``{alias, rows, columns, memory_mb}``

        Raises:
            RemoteQueryError: For an invalid or reserved alias, blocked SQL,
                row/memory limits, BQ errors, or when no BigQuery client can
                be created (package missing / ``BIGQUERY_PROJECT`` unset).
        """
        if not _SAFE_IDENTIFIER.match(alias or ""):
            raise RemoteQueryError(
                f"Invalid alias {alias!r}: must be a valid SQL identifier",
                error_type="query_error",
            )
        if alias.lower() in _RESERVED_ALIASES:
            raise RemoteQueryError(
                f"Reserved alias {alias!r}: cannot shadow system objects",
                error_type="query_error",
            )

        _validate_bq_sql(bq_sql)

        client = self._get_bq_client()

        # --- Phase 1a: COUNT(*) pre-check ---
        # The inner query sits on its own lines so that a trailing ``-- comment``
        # cannot swallow the closing parenthesis, and a trailing ``;`` is dropped
        # because it is not legal inside a subquery.
        inner_sql = bq_sql.strip().rstrip("; \t\r\n")
        count_sql = f"SELECT COUNT(*) FROM (\n{inner_sql}\n) AS _cnt"
        try:
            count_arrow = self._job_to_arrow(client.query(count_sql))
            count_value = int(count_arrow.column(0)[0].as_py())
        except RemoteQueryError:
            raise
        except Exception as exc:
            raise RemoteQueryError(
                f"BQ COUNT pre-check failed: {exc}",
                error_type="bq_error",
                details={"original_error": str(exc)},
            ) from exc

        if count_value > self.max_bq_registration_rows:
            raise RemoteQueryError(
                f"BQ result has {count_value:,} rows, exceeding the "
                f"limit of {self.max_bq_registration_rows:,}.",
                error_type="row_limit",
                details={
                    "count": count_value,
                    "max": self.max_bq_registration_rows,
                },
            )

        # --- Phase 1b: Fetch actual data ---
        try:
            arrow_table = self._job_to_arrow(client.query(bq_sql))
        except RemoteQueryError:
            raise
        except Exception as exc:
            raise RemoteQueryError(
                f"BQ query failed: {exc}",
                error_type="bq_error",
                details={"original_error": str(exc)},
            ) from exc

        # --- Phase 1c: Memory check (accurate, post-fetch) ---
        memory_mb = arrow_table.nbytes / (1024 * 1024)
        if memory_mb > self.max_memory_mb:
            raise RemoteQueryError(
                f"Arrow table uses {memory_mb:.1f} MiB, exceeding the "
                f"limit of {self.max_memory_mb:.1f} MiB.",
                error_type="memory_limit",
                details={"memory_mb": memory_mb, "max_memory_mb": self.max_memory_mb},
            )

        # --- Phase 1d: Register in DuckDB ---
        self._conn.register(alias, arrow_table)

        info: Dict[str, Any] = {
            "alias": alias,
            "rows": arrow_table.num_rows,
            "columns": arrow_table.schema.names,
            "memory_mb": memory_mb,
        }
        self._registered[alias] = info
        logger.info(
            "Registered BQ alias %r: %d rows, %.2f MiB",
            alias,
            arrow_table.num_rows,
            memory_mb,
        )
        return info

    # ------------------------------------------------------------------
    # Phase 2
    # ------------------------------------------------------------------

    def execute(self, sql: str) -> Dict[str, Any]:
        """Execute SQL against DuckDB (which may reference registered BQ views).

        At most ``max_result_rows`` rows are returned; one extra row is fetched
        only to detect whether the result was truncated.

        Args:
            sql: SQL query to execute. Must pass the SQL blocklist.

        Returns:
            ``{columns, rows, row_count, truncated, bq_stats}``

        Raises:
            RemoteQueryError: If SQL is blocked or a DuckDB error occurs.
        """
        _validate_sql(sql)

        try:
            result = self._conn.execute(sql).fetchmany(self.max_result_rows + 1)
            columns = (
                [desc[0] for desc in self._conn.description]
                if self._conn.description
                else []
            )
        except RemoteQueryError:
            raise
        except Exception as exc:
            raise RemoteQueryError(
                f"Query error: {exc}",
                error_type="query_error",
                details={"original_error": str(exc)},
            ) from exc

        truncated = len(result) > self.max_result_rows
        rows = result[: self.max_result_rows]

        # Serialize non-standard types (mirrors app/api/query.py): anything that
        # is not None/int/float/bool/str is rendered with str().
        serializable_rows = [
            [
                str(v) if v is not None and not isinstance(v, (int, float, bool, str)) else v
                for v in row
            ]
            for row in rows
        ]

        return {
            "columns": columns,
            "rows": serializable_rows,
            "row_count": len(serializable_rows),
            "truncated": truncated,
            "bq_stats": {
                "registered_aliases": list(self._registered.keys()),
                "alias_count": len(self._registered),
            },
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _job_to_arrow(job):
        """Materialise a BQ query job as an Arrow table.

        Tries the default ``to_arrow()`` first.  If that fails with an error
        mentioning ``readsessions`` or ``PERMISSION_DENIED`` the call is retried
        with ``create_bqstorage_client=False``; any other error propagates.
        """
        try:
            return job.to_arrow()
        except Exception as storage_exc:
            message = str(storage_exc)
            if "readsessions" in message or "PERMISSION_DENIED" in message:
                logger.warning("BQ Storage API unavailable, falling back to REST")
                return job.to_arrow(create_bqstorage_client=False)
            raise

    def _get_bq_client(self):
        """Return a BigQuery client from the injected factory or the default one.

        With an injected factory, the factory receives ``BIGQUERY_PROJECT`` from
        the environment (``"unknown"`` when unset).

        Raises:
            RemoteQueryError: If no factory was injected and either
                google-cloud-bigquery is not installed or the
                ``BIGQUERY_PROJECT`` env var is not set.
        """
        if self._bq_client_factory is not None:
            project = os.environ.get("BIGQUERY_PROJECT", "unknown")
            return self._bq_client_factory(project)

        # Lazy import so the module stays usable without BQ installed.
        try:
            import google.cloud.bigquery as _bq_module  # noqa: PLC0415, F401
        except ImportError as exc:
            raise RemoteQueryError(
                "google-cloud-bigquery is not installed. Install with: pip install google-cloud-bigquery",
                error_type="bq_error",
            ) from exc

        project = os.environ.get("BIGQUERY_PROJECT")
        if not project:
            raise RemoteQueryError(
                "BIGQUERY_PROJECT env var is not set.",
                error_type="bq_error",
            )
        return _bq_module.Client(project=project)
|
||||
|
|
@ -385,7 +385,7 @@ class TestMetadataAPI:
|
|||
# 'orders' is not in table_registry — expect 404 or 400
|
||||
assert resp.status_code in (400, 404)
|
||||
|
||||
def test_push_keboola_table(self, seeded_client, monkeypatch):
|
||||
def test_push_keboola_table(self, seeded_client, monkeypatch): # noqa: F811
|
||||
client, admin_token, _ = seeded_client
|
||||
|
||||
# 1. Register a keboola table
|
||||
|
|
@ -451,3 +451,53 @@ class TestMetadataAPI:
|
|||
called_json = call_args.kwargs.get("json", {})
|
||||
assert called_json.get("provider") == "ai-metadata-enrichment"
|
||||
assert isinstance(called_json.get("metadata"), list)
|
||||
|
||||
|
||||
# ---- Hybrid Query ----
|
||||
|
||||
class TestHybridQueryAPI:
    """Endpoint tests for ``POST /api/query/hybrid``."""

    @staticmethod
    def _post_hybrid(api, token, payload):
        """POST *payload* to the hybrid endpoint using bearer *token*."""
        return api.post(
            "/api/query/hybrid",
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
        )

    def test_hybrid_query_requires_admin(self, seeded_client):
        """The analyst token from the fixture must be rejected with 403."""
        api, _, analyst_token = seeded_client
        payload = {"sql": "SELECT 1 AS val", "register_bq": {}}

        response = self._post_hybrid(api, analyst_token, payload)

        assert response.status_code == 403

    def test_hybrid_query_local_only(self, seeded_client):
        """With no BQ registrations the endpoint returns the plain query result."""
        api, admin_token, _ = seeded_client
        payload = {"sql": "SELECT 1 AS val", "register_bq": {}}

        response = self._post_hybrid(api, admin_token, payload)

        assert response.status_code == 200
        body = response.json()
        assert "columns" in body
        assert "rows" in body
        assert body["columns"] == ["val"]
        assert body["rows"] == [[1]]

    def test_hybrid_query_blocked_sql(self, seeded_client):
        """A blocked final SQL statement yields 400 with a query_error detail."""
        api, admin_token, _ = seeded_client
        payload = {"sql": "DROP TABLE users", "register_bq": {}}

        response = self._post_hybrid(api, admin_token, payload)

        assert response.status_code == 400
        assert "query_error" in response.json()["detail"]

    def test_hybrid_query_blocked_bq_sql(self, seeded_client):
        """A blocked BQ registration statement yields 400 with a query_error detail."""
        api, admin_token, _ = seeded_client
        payload = {
            "sql": "SELECT 1",
            "register_bq": {"bad_alias": "DROP TABLE sensitive"},
        }

        response = self._post_hybrid(api, admin_token, payload)

        assert response.status_code == 400
        assert "query_error" in response.json()["detail"]
|
||||
|
|
|
|||
|
|
@ -238,6 +238,17 @@ class TestAdminCommands:
|
|||
assert result.exit_code == 1
|
||||
|
||||
|
||||
class TestQueryHybrid:
    """CLI help output for the hybrid-query options of ``query``."""

    def test_register_bq_flag_help(self):
        """``query --help`` succeeds and mentions the register/bq option."""
        outcome = runner.invoke(app, ["query", "--help"])
        assert outcome.exit_code == 0

        # Rich/Typer may insert ANSI escape codes within option names,
        # so the option name is checked piecewise rather than as one token.
        for fragment in ("register", "bq", "BigQuery"):
            assert fragment in outcome.output
|
||||
|
||||
|
||||
class TestMetricsHelp:
|
||||
def test_metrics_help(self):
|
||||
result = runner.invoke(app, ["metrics", "--help"])
|
||||
|
|
|
|||
104
tests/test_db.py
104
tests/test_db.py
|
|
@ -462,6 +462,110 @@ class TestSchemaV4:
|
|||
conn2.close()
|
||||
|
||||
|
||||
class TestExtensionReattach:
    """Resilience tests for _reattach_remote_extensions() called by get_analytics_db_readonly()."""

    @staticmethod
    def _fresh_db_module(tmp_path, monkeypatch):
        """Point DATA_DIR at *tmp_path* and return a freshly reloaded ``src.db``."""
        monkeypatch.setenv("DATA_DIR", str(tmp_path))
        import importlib
        import src.db as db_module

        importlib.reload(db_module)
        return db_module

    def _make_analytics_db(self, tmp_path):
        """Create an empty analytics server.duckdb so get_analytics_db_readonly() takes the read_only path."""
        import duckdb as _duckdb

        target_dir = tmp_path / "analytics"
        target_dir.mkdir(parents=True, exist_ok=True)
        # Opening and immediately closing is enough to materialise the file.
        _duckdb.connect(str(target_dir / "server.duckdb")).close()

    def _make_extract_db(self, tmp_path, source_name, with_remote_attach=True):
        """Create a minimal extract.duckdb, optionally with a _remote_attach table."""
        import duckdb as _duckdb

        source_dir = tmp_path / "extracts" / source_name
        source_dir.mkdir(parents=True, exist_ok=True)

        extract = _duckdb.connect(str(source_dir / "extract.duckdb"))
        try:
            extract.execute(
                "CREATE TABLE _meta (table_name VARCHAR, description VARCHAR, rows BIGINT, "
                "size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR)"
            )
            if not with_remote_attach:
                return
            extract.execute(
                "CREATE TABLE _remote_attach (alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
            )
            # Use 'bigquery' which won't be installed in CI — tests resilience
            extract.execute(
                "INSERT INTO _remote_attach VALUES ('bq', 'bigquery', 'project/dataset', '')"
            )
        finally:
            extract.close()

    def test_reads_remote_attach_table(self, tmp_path, monkeypatch):
        """get_analytics_db_readonly() doesn't crash even when LOAD fails for missing extension."""
        db_module = self._fresh_db_module(tmp_path, monkeypatch)

        self._make_analytics_db(tmp_path)
        self._make_extract_db(tmp_path, "mysource", with_remote_attach=True)

        # Should not raise even though 'bigquery' extension is not installed
        readonly = db_module.get_analytics_db_readonly()
        try:
            # Connection must still be usable for local queries
            (answer,) = readonly.execute("SELECT 42 AS n").fetchone()[:1]
            assert answer == 42
        finally:
            readonly.close()

    def test_reattach_attempts_load(self, tmp_path, monkeypatch):
        """The extract with a _remote_attach table is attached and its rows are readable."""
        db_module = self._fresh_db_module(tmp_path, monkeypatch)

        self._make_analytics_db(tmp_path)
        self._make_extract_db(tmp_path, "bqsource", with_remote_attach=True)

        readonly = db_module.get_analytics_db_readonly()
        try:
            # Verify the extract was attached
            attached = readonly.execute("SELECT database_name FROM duckdb_databases()").fetchall()
            dbs = {record[0] for record in attached}
            assert "bqsource" in dbs, f"bqsource should be attached, got: {dbs}"

            # Verify _remote_attach table is accessible via table_catalog
            catalog_hit = readonly.execute(
                "SELECT 1 FROM information_schema.tables "
                "WHERE table_catalog='bqsource' AND table_name='_remote_attach'"
            ).fetchone()
            assert catalog_hit is not None, "_remote_attach table should be visible via table_catalog"

            # Read the rows to verify they're correct
            attach_rows = readonly.execute(
                "SELECT alias, extension, url FROM bqsource._remote_attach"
            ).fetchall()
            assert len(attach_rows) == 1
            alias, extension = attach_rows[0][0], attach_rows[0][1]
            assert alias == "bq"
            assert extension == "bigquery"
        finally:
            readonly.close()

    def test_skips_missing_remote_attach(self, tmp_path, monkeypatch):
        """get_analytics_db_readonly() works fine when _remote_attach table is absent."""
        db_module = self._fresh_db_module(tmp_path, monkeypatch)

        self._make_analytics_db(tmp_path)
        self._make_extract_db(tmp_path, "localsource", with_remote_attach=False)

        readonly = db_module.get_analytics_db_readonly()
        try:
            status_row = readonly.execute("SELECT 'ok' AS status").fetchone()
            assert status_row[0] == "ok"
        finally:
            readonly.close()
|
||||
|
||||
|
||||
class TestGetAnalyticsDbReadonly:
|
||||
def test_analytics_readonly_rejects_malicious_dir_name(self, tmp_path, monkeypatch):
|
||||
"""Directories with SQL-injection chars in their name are skipped."""
|
||||
|
|
|
|||
290
tests/test_remote_query.py
Normal file
290
tests/test_remote_query.py
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
"""Tests for RemoteQueryEngine — two-phase BQ registration + DuckDB execution."""
|
||||
|
||||
import sys
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import duckdb
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
from src.remote_query import RemoteQueryEngine, RemoteQueryError, _validate_bq_sql, _validate_sql
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def analytics_conn():
    """Yield an in-memory DuckDB connection seeded with a two-row ``orders`` table.

    The connection is closed once the test using the fixture has finished.
    """
    db = duckdb.connect()
    seed_statements = (
        "CREATE TABLE orders (id INT, date DATE, amount DECIMAL(10,2))",
        "INSERT INTO orders VALUES (1, '2026-01-01', 100.0), (2, '2026-01-15', 200.0)",
    )
    for statement in seed_statements:
        db.execute(statement)
    yield db
    db.close()
|
||||
|
||||
|
||||
def _make_bq_mock(arrow_table, count_value=None):
    """Build a minimal BQ client mock.

    ``client.query()`` hands out two jobs in order: first one whose
    ``to_arrow()`` yields a single-cell count table, then one whose
    ``to_arrow()`` yields *arrow_table*.  When *count_value* is None the count
    is taken from ``arrow_table.num_rows``.
    """
    row_total = arrow_table.num_rows if count_value is None else count_value

    def _job_returning(table):
        # A job mock whose to_arrow() always yields the given table.
        job = MagicMock()
        job.to_arrow.return_value = table
        return job

    count_table = pa.table({"count": pa.array([row_total], type=pa.int64())})

    client = MagicMock()
    client.query.side_effect = [_job_returning(count_table), _job_returning(arrow_table)]
    return client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestRemoteQueryEngineRegister
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteQueryEngineRegister:
    """Phase 1 tests: ``RemoteQueryEngine.register_bq`` against a mocked BQ client."""

    def test_register_bq_success(self, analytics_conn):
        """Mock BQ client returning an Arrow table; verify view is queryable."""
        arrow_table = pa.table(
            {
                "order_id": pa.array([10, 20, 30], type=pa.int64()),
                "revenue": pa.array([1.0, 2.0, 3.0], type=pa.float64()),
            }
        )
        mock_client = _make_bq_mock(arrow_table)

        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda project: mock_client,
            max_bq_registration_rows=500_000,
        )

        result = engine.register_bq("bq_orders", "SELECT order_id, revenue FROM bq.orders")

        assert result["alias"] == "bq_orders"
        assert result["rows"] == 3
        assert result["columns"] == ["order_id", "revenue"]
        assert result["memory_mb"] > 0

        # The alias must be queryable from DuckDB
        rows = analytics_conn.execute("SELECT COUNT(*) FROM bq_orders").fetchone()
        assert rows[0] == 3

    def test_register_bq_row_limit_exceeded(self, analytics_conn):
        """COUNT pre-check returns a value exceeding the row limit → RemoteQueryError."""
        arrow_table = pa.table({"x": pa.array([1], type=pa.int64())})
        # count exceeds limit
        mock_client = _make_bq_mock(arrow_table, count_value=1_000_000)

        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda project: mock_client,
            max_bq_registration_rows=500_000,
        )

        with pytest.raises(RemoteQueryError) as exc_info:
            engine.register_bq("bq_big", "SELECT * FROM bq.huge_table")

        assert exc_info.value.error_type == "row_limit"
        assert exc_info.value.details["count"] == 1_000_000
        # Only the COUNT query may run; the data query must never be issued.
        assert mock_client.query.call_count == 1

    def test_register_bq_memory_limit_exceeded(self, analytics_conn):
        """A fetched table larger than max_memory_mb is rejected and never registered."""
        arrow_table = pa.table({"x": pa.array([1, 2, 3], type=pa.int64())})
        mock_client = _make_bq_mock(arrow_table)

        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda project: mock_client,
            max_memory_mb=0.0,  # any non-empty table exceeds this
        )

        with pytest.raises(RemoteQueryError) as exc_info:
            engine.register_bq("bq_heavy", "SELECT x FROM bq.heavy")

        assert exc_info.value.error_type == "memory_limit"
        assert exc_info.value.details["max_memory_mb"] == 0.0

        # The oversized table must not have been exposed to DuckDB
        with pytest.raises(duckdb.Error):
            analytics_conn.execute("SELECT * FROM bq_heavy")

    def test_register_bq_invalid_alias(self, analytics_conn):
        """Invalid and reserved aliases are rejected before BQ is contacted; a valid one passes."""
        arrow_table = pa.table({"x": pa.array([1], type=pa.int64())})
        mock_client = _make_bq_mock(arrow_table)
        # A mocked client keeps the "valid alias" case deterministic and offline:
        # without a factory the engine could build a real BigQuery client.
        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda project: mock_client,
        )

        # Space in alias — invalid identifier
        with pytest.raises(RemoteQueryError) as exc_info:
            engine.register_bq("bad alias", "SELECT 1")
        assert exc_info.value.error_type == "query_error"

        # Reserved alias — information_schema
        with pytest.raises(RemoteQueryError) as exc_info:
            engine.register_bq("information_schema", "SELECT 1")
        assert exc_info.value.error_type == "query_error"

        # Alias validation happens before any BQ traffic
        mock_client.query.assert_not_called()

        # Valid alias — registration goes through end to end
        result = engine.register_bq("valid_name", "SELECT 1")
        assert result["alias"] == "valid_name"

    def test_register_bq_missing_package(self, analytics_conn):
        """When google-cloud-bigquery is not installed, engine must raise RemoteQueryError."""
        engine = RemoteQueryEngine(
            analytics_conn,
            # No factory — will try to import google.cloud.bigquery
            _bq_client_factory=None,
            max_bq_registration_rows=500_000,
        )

        # A None entry in sys.modules makes the corresponding import fail.
        with patch.dict(sys.modules, {"google": None, "google.cloud": None, "google.cloud.bigquery": None}):
            with pytest.raises(RemoteQueryError, match="google-cloud-bigquery"):
                engine.register_bq("bq_alias", "SELECT 1")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestRemoteQueryEngineExecute
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteQueryEngineExecute:
    """Phase 2 tests: ``RemoteQueryEngine.execute`` on the seeded DuckDB connection."""

    def test_execute_local_only(self, analytics_conn):
        """Query local table; result dict has correct structure."""
        outcome = RemoteQueryEngine(analytics_conn).execute(
            "SELECT id, amount FROM orders ORDER BY id"
        )

        assert outcome["columns"] == ["id", "amount"]
        assert outcome["row_count"] == 2
        assert outcome["truncated"] is False
        assert len(outcome["rows"]) == 2

        # Non-standard types (Decimal) must be serialized to str
        json_friendly = (int, float, bool, str, type(None))
        assert all(
            isinstance(cell, json_friendly)
            for record in outcome["rows"]
            for cell in record
        )

    def test_execute_with_registered_bq(self, analytics_conn):
        """Register an Arrow table through a mocked BQ client, then JOIN it with local orders."""
        labels = pa.table(
            {
                "id": pa.array([1, 2], type=pa.int64()),
                "label": pa.array(["first", "second"], type=pa.utf8()),
            }
        )
        fake_client = _make_bq_mock(labels)
        engine = RemoteQueryEngine(
            analytics_conn,
            _bq_client_factory=lambda project: fake_client,
            max_bq_registration_rows=500_000,
        )
        engine.register_bq("bq_labels", "SELECT id, label FROM bq.labels")

        join_sql = (
            "SELECT o.id, o.amount, b.label "
            "FROM orders o JOIN bq_labels b ON o.id = b.id "
            "ORDER BY o.id"
        )
        outcome = engine.execute(join_sql)

        assert outcome["row_count"] == 2
        assert "label" in outcome["columns"]

    def test_execute_respects_max_result_rows(self, analytics_conn):
        """When max_result_rows=1, result is truncated after 1 row."""
        capped_engine = RemoteQueryEngine(analytics_conn, max_result_rows=1)

        outcome = capped_engine.execute("SELECT id FROM orders ORDER BY id")

        assert outcome["row_count"] == 1
        assert outcome["truncated"] is True

    def test_execute_invalid_sql(self, analytics_conn):
        """DROP TABLE must be rejected with RemoteQueryError(error_type='query_error')."""
        engine = RemoteQueryEngine(analytics_conn)

        with pytest.raises(RemoteQueryError) as raised:
            engine.execute("DROP TABLE orders")

        assert raised.value.error_type == "query_error"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _validate_sql unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestValidateSql:
    """Unit tests for the DuckDB-side SQL blocklist (``_validate_sql``)."""

    # Statements the validator must reject.
    _BLOCKED = [
        "DROP TABLE foo",
        "DELETE FROM foo",
        "INSERT INTO foo VALUES (1)",
        "UPDATE foo SET x=1",
        "ALTER TABLE foo ADD COLUMN y INT",
        "CREATE TABLE foo (x INT)",
        "COPY foo TO '/tmp/out.csv'",
        "ATTACH '/db.duckdb'",
        "DETACH db",
        "LOAD 'extension'",
        "INSTALL httpfs",
        "SELECT read_parquet('/data/file.parquet')",
        "SELECT * FROM '../secret/file'",
        "SELECT 1; DROP TABLE foo",
    ]

    # Statements the validator must accept.
    _ALLOWED = [
        "SELECT id FROM orders",
        "WITH cte AS (SELECT 1 AS x) SELECT x FROM cte",
        "select count(*) from orders",
        "with t as (select 1) select * from t",
    ]

    @pytest.mark.parametrize("sql", _BLOCKED)
    def test_blocked_sql(self, sql):
        """Each blocked statement raises RemoteQueryError with error_type 'query_error'."""
        with pytest.raises(RemoteQueryError) as raised:
            _validate_sql(sql)
        assert raised.value.error_type == "query_error"

    @pytest.mark.parametrize("sql", _ALLOWED)
    def test_allowed_sql(self, sql):
        """Each allowed statement passes validation without raising."""
        _validate_sql(sql)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _validate_bq_sql unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestValidateBqSql:
    """Unit tests for the BigQuery-side SQL blocklist (``_validate_bq_sql``)."""

    # Write/mutation statements the validator must reject.
    _BLOCKED = [
        "DROP TABLE x",
        "INSERT INTO x VALUES (1)",
        "DELETE FROM x",
        "UPDATE x SET y=1",
        "ALTER TABLE x ADD COLUMN z INT",
        "CREATE TABLE x (y INT)",
        "TRUNCATE TABLE x",
        "MERGE INTO x USING y ON x.id=y.id WHEN MATCHED THEN UPDATE SET x.a=y.a",
        "SELECT 1; DROP TABLE x",
    ]

    # Read-only statements the validator must accept.
    _ALLOWED = [
        "SELECT * FROM dataset.INFORMATION_SCHEMA.COLUMNS",
        "SELECT id FROM project.dataset.table",
        "WITH cte AS (SELECT 1 AS x) SELECT x FROM cte",
    ]

    def test_information_schema_is_allowed(self):
        """INFORMATION_SCHEMA queries must pass BQ SQL validation."""
        information_schema_sql = "SELECT * FROM dataset.INFORMATION_SCHEMA.COLUMNS"
        # Passing means simply returning without an exception.
        _validate_bq_sql(information_schema_sql)

    @pytest.mark.parametrize("sql", _BLOCKED)
    def test_blocked_bq_sql(self, sql):
        """Write/mutation operations must be rejected."""
        with pytest.raises(RemoteQueryError) as raised:
            _validate_bq_sql(sql)
        assert raised.value.error_type == "query_error"

    @pytest.mark.parametrize("sql", _ALLOWED)
    def test_allowed_bq_sql(self, sql):
        """Valid read-only BQ queries must pass."""
        _validate_bq_sql(sql)
|
||||
Loading…
Reference in a new issue