Merge pull request #3 from keboola/feature/v2-fastapi-duckdb-docker-cli

feat: remote query — extension re-attach + two-phase BQ+DuckDB engine
2026-04-12 10:19:13 +02:00 · 2026-04-12 10:19:13 +02:00 · dab5c84860
commit dab5c84860
parent c24205a1bf e351c38368
12 changed files with 2266 additions and 5 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -154,6 +154,22 @@ Before computing any business metric, look up the canonical definition:

 Never invent metric calculations — always use the canonical definitions.

+## Hybrid Queries (BigQuery + Local)
+
+For tables too large to sync locally, use hybrid queries that JOIN local data with on-demand BigQuery results:
+
+```bash
+da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
+         --register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
+```
+
+The `--register-bq` flag takes a spec of the form `alias=BQ_SQL`: it executes the BigQuery subquery, loads the result into memory, and registers it under `alias` as a DuckDB view that the final SQL can reference. Repeat `--register-bq` to register multiple BQ sources.
+
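+For complex SQL that is awkward to quote on the command line, pass the query and its BigQuery registrations as a single JSON object via `--stdin` (which cannot be combined with positional SQL or `--sql`):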
+```bash
+echo '{"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}' | da query --stdin
+```
+
 ## Extensibility

 ### Data Sources (extract.duckdb contract)
--- a/app/api/query_hybrid.py
+++ b/app/api/query_hybrid.py
@ -0,0 +1,43 @@
+"""Hybrid query endpoint — two-phase BQ registration + DuckDB execution."""
+
+from typing import Dict
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+
+from app.auth.dependencies import require_admin
+from src.db import get_analytics_db_readonly
+from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
+
+router = APIRouter(prefix="/api/query", tags=["query"])
+
+
+class HybridQueryRequest(BaseModel):
+    sql: str
+    register_bq: Dict[str, str] = {}
+
+
+@router.post("/hybrid")
+async def hybrid_query(request: HybridQueryRequest, user: dict = Depends(require_admin)):
+    config = load_config()
+    analytics = get_analytics_db_readonly()
+    try:
+        engine = RemoteQueryEngine(
+            analytics,
+            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
+            max_memory_mb=config.get("max_memory_mb", 2048),
+            max_result_rows=config.get("max_result_rows", 100_000),
+            timeout_seconds=config.get("timeout_seconds", 300),
+        )
+        for alias, bq_sql in request.register_bq.items():
+            try:
+                engine.register_bq(alias, bq_sql)
+            except RemoteQueryError as e:
+                raise HTTPException(status_code=400, detail=f"BQ '{alias}': {e.error_type}: {e}")
+        try:
+            result = engine.execute(request.sql)
+        except RemoteQueryError as e:
+            raise HTTPException(status_code=400, detail=f"Query: {e.error_type}: {e}")
+        return result
+    finally:
+        analytics.close()
--- a/app/main.py
+++ b/app/main.py
@ -29,6 +29,7 @@ from app.api.access_requests import router as access_requests_router
 from app.api.jira_webhooks import router as jira_webhooks_router
 from app.api.metrics import router as metrics_router
 from app.api.metadata import router as metadata_router
+from app.api.query_hybrid import router as query_hybrid_router
 from app.web.router import router as web_router

 logger = logging.getLogger(__name__)
@ -137,6 +138,7 @@ def create_app() -> FastAPI:
    app.include_router(jira_webhooks_router)
    app.include_router(metrics_router)
    app.include_router(metadata_router)
+    app.include_router(query_hybrid_router)

    # Web UI router (must be last — has catch-all routes)
    app.include_router(web_router)
--- a/cli/commands/query.py
+++ b/cli/commands/query.py
@ -2,21 +2,63 @@

 import json
 import os
+import sys
 from pathlib import Path
+from typing import List, Optional

 import typer

+
 def query_command(
-    sql: str = typer.Argument(..., help="SQL query to execute"),
+    sql: Optional[str] = typer.Argument(None, help="SQL query to execute (positional)"),
+    sql_opt: Optional[str] = typer.Option(None, "--sql", help="SQL query to execute (named option)"),
    remote: bool = typer.Option(False, "--remote", help="Execute on server instead of locally"),
    fmt: str = typer.Option("table", "--format", "-f", help="Output format: table, json, csv"),
    limit: int = typer.Option(1000, "--limit", help="Max rows to return"),
+    register_bq: Optional[List[str]] = typer.Option(
+        None,
+        "--register-bq",
+        help="Register a BigQuery result as a DuckDB view. Format: alias=BQ_SQL. Can be repeated.",
+    ),
+    stdin: bool = typer.Option(False, "--stdin", help="Read SQL from stdin as JSON {\"sql\": \"...\"}"),
 ):
    """Execute SQL query against DuckDB."""
-    if remote:
-        _query_remote(sql, fmt, limit)
+    # Resolve SQL from exactly one of: positional, --sql, or --stdin
+    sources_provided = sum([
+        sql is not None,
+        sql_opt is not None,
+        stdin,
+    ])
+    if sources_provided == 0:
+        typer.echo("Error: provide SQL as a positional argument, --sql option, or --stdin flag.", err=True)
+        raise typer.Exit(1)
+    if sources_provided > 1:
+        typer.echo("Error: only one of positional SQL, --sql, or --stdin may be used at a time.", err=True)
+        raise typer.Exit(1)
+
+    if stdin:
+        raw = sys.stdin.read()
+        try:
+            payload = json.loads(raw)
+            resolved_sql = payload["sql"]
+            # Extract register_bq from stdin JSON
+            stdin_bq = payload.get("register_bq", {})
+            if stdin_bq and isinstance(stdin_bq, dict):
+                register_bq = [f"{k}={v}" for k, v in stdin_bq.items()]
+        except (json.JSONDecodeError, KeyError) as exc:
+            typer.echo(f"Error: failed to parse stdin JSON: {exc}", err=True)
+            raise typer.Exit(1)
+    elif sql_opt is not None:
+        resolved_sql = sql_opt
    else:
-        _query_local(sql, fmt, limit)
+        resolved_sql = sql
+
+    if register_bq:
+        _query_hybrid(resolved_sql, fmt, limit, register_bq)
+    elif remote:
+        _query_remote(resolved_sql, fmt, limit)
+    else:
+        _query_local(resolved_sql, fmt, limit)


 def _query_local(sql: str, fmt: str, limit: int):
@ -56,6 +98,61 @@ def _query_remote(sql: str, fmt: str, limit: int):
        typer.echo(f"(truncated at {limit} rows)", err=True)


+def _query_hybrid(sql: str, fmt: str, limit: int, register_bq_specs: List[str]):
+    """Run a hybrid query: register BigQuery results as DuckDB views, then execute locally."""
+    import duckdb
+    from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
+
+    local_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
+    db_path = local_dir / "user" / "duckdb" / "analytics.duckdb"
+    if not db_path.exists():
+        typer.echo("Local DuckDB not found. Run: da sync", err=True)
+        raise typer.Exit(1)
+
+    conn = duckdb.connect(str(db_path), read_only=True)
+    try:
+        config = load_config()
+        engine_kwargs = {k: v for k, v in config.items() if k in (
+            "max_bq_registration_rows", "max_memory_mb", "max_result_rows", "timeout_seconds"
+        )}
+        # CLI --limit flag overrides config max_result_rows
+        engine_kwargs["max_result_rows"] = limit
+        engine = RemoteQueryEngine(conn, **engine_kwargs)
+
+        for spec in register_bq_specs:
+            if "=" not in spec:
+                typer.echo(
+                    f"Error: --register-bq spec must be 'alias=BQ_SQL', got: {spec!r}",
+                    err=True,
+                )
+                raise typer.Exit(1)
+            alias, bq_sql = spec.split("=", 1)
+            alias = alias.strip()
+            bq_sql = bq_sql.strip()
+            try:
+                info = engine.register_bq(alias, bq_sql)
+                typer.echo(
+                    f"Registered BQ alias '{alias}': {info['rows']:,} rows, "
+                    f"{info['memory_mb']:.1f} MiB",
+                    err=True,
+                )
+            except RemoteQueryError as exc:
+                typer.echo(f"BQ registration failed for '{alias}': {exc}", err=True)
+                raise typer.Exit(1)
+
+        try:
+            result = engine.execute(sql)
+        except RemoteQueryError as exc:
+            typer.echo(f"Query error: {exc}", err=True)
+            raise typer.Exit(1)
+
+        _output(result["columns"], result["rows"], fmt)
+        if result.get("truncated"):
+            typer.echo(f"(truncated at {result['row_count']} rows)", err=True)
+    finally:
+        conn.close()
+
+
 def _output(columns: list, rows: list, fmt: str):
    if fmt == "json":
        output = [dict(zip(columns, row)) for row in rows]
--- a/docs/superpowers/plans/2026-04-11-remote-query.md
+++ b/docs/superpowers/plans/2026-04-11-remote-query.md
@ -0,0 +1,923 @@
+# Remote Query Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Fix BigQuery extension re-attach so remote views work, then add a two-phase query engine that JOINs local Parquet data with on-demand BigQuery subquery results.
+
+**Architecture:** Part 1 patches `get_analytics_db_readonly()` to re-load extensions from `_remote_attach` tables. Part 2 adds `RemoteQueryEngine`, which wraps a BQ client with safety limits (a `COUNT(*)` pre-check before fetching, and a memory check on the actual size of the fetched Arrow result), registers the Arrow results in DuckDB, then executes the final SQL. The engine is exposed via the `da query --register-bq` CLI and the `POST /api/query/hybrid` API.
+
+**Tech Stack:** DuckDB, google-cloud-bigquery, PyArrow, FastAPI, Typer
+
+**Spec:** `docs/superpowers/specs/2026-04-11-remote-query-design.md`
+
+---
+
+### Task 1: Fix Extension Re-attach in `get_analytics_db_readonly()`
+
+**Files:**
+- Modify: `src/db.py:253-282` (get_analytics_db_readonly)
+- Test: `tests/test_db.py`
+
+- [ ] **Step 1: Write resilience tests**
+
+Add to `tests/test_db.py`:
+
+```python
+class TestExtensionReattach:
+    def test_reads_remote_attach_table(self, tmp_path, monkeypatch):
+        """Verify get_analytics_db_readonly() attempts to load extensions from _remote_attach."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb
+
+        # Create analytics DB
+        analytics_dir = tmp_path / "analytics"
+        analytics_dir.mkdir()
+        conn = duckdb.connect(str(analytics_dir / "server.duckdb"))
+        conn.close()
+
+        # Create an extract.duckdb with a _remote_attach table
+        ext_dir = tmp_path / "extracts" / "testbq"
+        ext_dir.mkdir(parents=True)
+        ext_conn = duckdb.connect(str(ext_dir / "extract.duckdb"))
+        ext_conn.execute("""
+            CREATE TABLE _remote_attach (
+                alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR
+            )
+        """)
+        ext_conn.execute(
+            "INSERT INTO _remote_attach VALUES ('bq', 'bigquery', 'project=test', '')"
+        )
+        ext_conn.close()
+
+        from src.db import get_analytics_db_readonly
+        # This won't actually load bigquery (not installed in test env),
+        # but should not crash — just log a warning
+        analytics = get_analytics_db_readonly()
+        try:
+            # Connection should be usable even if extension load failed
+            result = analytics.execute("SELECT 1").fetchone()
+            assert result[0] == 1
+        finally:
+            analytics.close()
+
+    def test_skips_missing_remote_attach(self, tmp_path, monkeypatch):
+        """Extract without _remote_attach should not cause errors."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import duckdb
+
+        analytics_dir = tmp_path / "analytics"
+        analytics_dir.mkdir()
+        conn = duckdb.connect(str(analytics_dir / "server.duckdb"))
+        conn.close()
+
+        ext_dir = tmp_path / "extracts" / "plain"
+        ext_dir.mkdir(parents=True)
+        ext_conn = duckdb.connect(str(ext_dir / "extract.duckdb"))
+        ext_conn.execute("CREATE TABLE _meta (name VARCHAR)")
+        ext_conn.close()
+
+        from src.db import get_analytics_db_readonly
+        analytics = get_analytics_db_readonly()
+        try:
+            result = analytics.execute("SELECT 1").fetchone()
+            assert result[0] == 1
+        finally:
+            analytics.close()
+```
+
+- [ ] **Step 2: Run test to verify it fails (or passes — these are resilience tests)**
+
+Run: `pytest tests/test_db.py::TestExtensionReattach -v`
+Expected: Both tests most likely PASS already, because a failed extension load is handled gracefully. That's fine — they act as regression guards, ensuring the re-attach code added in Step 3 doesn't break opening the read-only connection.
+
+- [ ] **Step 3: Implement extension re-attach**
+
+In `src/db.py`, modify `get_analytics_db_readonly()`. After the existing ATTACH loop (line ~279), before the `return conn` (line ~282), add:
+
+```python
+    # Re-attach remote extensions (BigQuery, Keboola, etc.)
+    if extracts_dir.exists():
+        _reattach_remote_extensions(conn, extracts_dir)
+```
+
+Add this helper function before `get_analytics_db_readonly()`:
+
+```python
+def _reattach_remote_extensions(
+    conn: duckdb.DuckDBPyConnection, extracts_dir: Path
+) -> None:
+    """Re-load extensions from _remote_attach tables in extract.duckdb files."""
+    already_attached = set()
+    try:
+        already_attached = {
+            r[0] for r in conn.execute(
+                "SELECT database_name FROM duckdb_databases()"
+            ).fetchall()
+        }
+    except Exception:
+        pass
+
+    for ext_dir in sorted(extracts_dir.iterdir()):
+        if not ext_dir.is_dir() or not _SAFE_IDENTIFIER.match(ext_dir.name):
+            continue
+        # Check if this extract has a _remote_attach table
+        try:
+            has_table = conn.execute(
+                f"SELECT table_name FROM information_schema.tables "
+                f"WHERE table_schema='{ext_dir.name}' AND table_name='_remote_attach'"
+            ).fetchall()
+            if not has_table:
+                continue
+        except Exception:
+            continue
+
+        try:
+            rows = conn.execute(
+                f"SELECT alias, extension, url, token_env FROM {ext_dir.name}._remote_attach"
+            ).fetchall()
+        except Exception:
+            continue
+
+        for alias, extension, url, token_env in rows:
+            if alias in already_attached:
+                continue
+            if not _SAFE_IDENTIFIER.match(alias) or not _SAFE_IDENTIFIER.match(extension):
+                continue
+
+            token = os.environ.get(token_env, "") if token_env else ""
+
+            try:
+                conn.execute(f"LOAD {extension};")
+                if token:
+                    escaped_token = token.replace("'", "''")
+                    conn.execute(
+                        f"ATTACH '{url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
+                    )
+                else:
+                    conn.execute(
+                        f"ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)"
+                    )
+                already_attached.add(alias)
+                logger.info("Re-attached remote source %s via %s", alias, extension)
+            except Exception as e:
+                logger.debug("Could not re-attach %s: %s", alias, e)
+```
+
+- [ ] **Step 4: Run tests**
+
+Run: `pytest tests/test_db.py -v`
+Expected: ALL PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/db.py tests/test_db.py
+git commit -m "fix: re-attach remote extensions in get_analytics_db_readonly()"
+```
+
+---
+
+### Task 2: RemoteQueryEngine Core
+
+**Files:**
+- Create: `src/remote_query.py`
+- Test: `tests/test_remote_query.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Create `tests/test_remote_query.py`:
+
+```python
+"""Tests for RemoteQueryEngine."""
+
+import json
+import os
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import duckdb
+import pytest
+
+
+@pytest.fixture
+def analytics_conn(tmp_path):
+    """DuckDB connection with a sample local view."""
+    conn = duckdb.connect()
+    conn.execute("CREATE TABLE orders (id INT, date DATE, amount DECIMAL(10,2))")
+    conn.execute("INSERT INTO orders VALUES (1, '2026-01-01', 100.0), (2, '2026-01-15', 200.0)")
+    yield conn
+    conn.close()
+
+
+def _mock_bq_arrow_table():
+    """Create a mock Arrow table for BQ results."""
+    import pyarrow as pa
+    return pa.table({
+        "date": ["2026-01-01", "2026-01-15"],
+        "pageviews": [1000, 2000],
+    })
+
+
+class TestRemoteQueryEngineRegister:
+    def test_register_bq_success(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine
+
+        mock_arrow = _mock_bq_arrow_table()
+        mock_job = MagicMock()
+        mock_job.to_arrow.return_value = mock_arrow
+        mock_client = MagicMock()
+        mock_client.query.return_value = mock_job
+        # COUNT pre-check
+        mock_count_job = MagicMock()
+        mock_count_result = MagicMock()
+        mock_count_result.fetchone.return_value = (2,)
+        mock_count_job.result.return_value = mock_count_result
+        mock_client.query.side_effect = [mock_count_job, mock_job]
+
+        engine = RemoteQueryEngine(analytics_conn, _bq_client_factory=lambda: mock_client)
+        stats = engine.register_bq("traffic", "SELECT date, pageviews FROM dataset.web")
+
+        assert stats["alias"] == "traffic"
+        assert stats["rows"] == 2
+        # Verify the view is usable
+        result = analytics_conn.execute("SELECT * FROM traffic").fetchall()
+        assert len(result) == 2
+
+    def test_register_bq_row_limit_exceeded(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine, RemoteQueryError
+
+        mock_client = MagicMock()
+        mock_count_job = MagicMock()
+        mock_count_result = MagicMock()
+        mock_count_result.fetchone.return_value = (999999,)
+        mock_count_job.result.return_value = mock_count_result
+        mock_client.query.return_value = mock_count_job
+
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=lambda: mock_client,
+            max_bq_registration_rows=1000,
+        )
+        with pytest.raises(RemoteQueryError, match="row_limit"):
+            engine.register_bq("big", "SELECT * FROM huge_table")
+
+    def test_register_bq_missing_package(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine, RemoteQueryError
+
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=None,  # Will try real import
+        )
+        with patch.dict("sys.modules", {"google.cloud.bigquery": None}):
+            with pytest.raises(RemoteQueryError, match="bq_error"):
+                engine.register_bq("x", "SELECT 1")
+
+
+class TestRemoteQueryEngineExecute:
+    def test_execute_local_only(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine
+        engine = RemoteQueryEngine(analytics_conn)
+        result = engine.execute("SELECT id, amount FROM orders ORDER BY id")
+        assert result["columns"] == ["id", "amount"]
+        assert len(result["rows"]) == 2
+        assert result["row_count"] == 2
+        assert result["truncated"] is False
+
+    def test_execute_with_registered_bq(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine
+        import pyarrow as pa
+
+        # Manually register an Arrow table (simulating BQ result)
+        traffic = pa.table({"date": ["2026-01-01", "2026-01-15"], "views": [100, 200]})
+        analytics_conn.register("traffic", traffic)
+
+        engine = RemoteQueryEngine(analytics_conn)
+        result = engine.execute(
+            "SELECT o.id, t.views FROM orders o JOIN traffic t ON CAST(o.date AS VARCHAR) = t.date ORDER BY o.id"
+        )
+        assert len(result["rows"]) == 2
+        assert result["columns"] == ["id", "views"]
+
+    def test_execute_respects_max_result_rows(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine
+        engine = RemoteQueryEngine(analytics_conn, max_result_rows=1)
+        result = engine.execute("SELECT * FROM orders")
+        assert len(result["rows"]) == 1
+        assert result["truncated"] is True
+
+    def test_execute_invalid_sql(self, analytics_conn):
+        from src.remote_query import RemoteQueryEngine, RemoteQueryError
+        engine = RemoteQueryEngine(analytics_conn)
+        with pytest.raises(RemoteQueryError, match="query_error"):
+            engine.execute("DROP TABLE orders")
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_remote_query.py -v`
+Expected: FAIL — `ModuleNotFoundError: No module named 'src.remote_query'`
+
+- [ ] **Step 3: Implement RemoteQueryEngine**
+
+Create `src/remote_query.py`:
+
+```python
+"""Two-phase remote query engine.
+
+Phase 1: Execute BigQuery subqueries, register results as in-memory Arrow tables.
+Phase 2: Execute DuckDB query joining local Parquet views with BQ Arrow tables.
+"""
+
+import logging
+import os
+from typing import Any, Callable, Dict, List, Optional
+
+import duckdb
+
+logger = logging.getLogger(__name__)
+
+# SQL blocklist — reused from app/api/query.py
+_BLOCKED_KEYWORDS = [
+    "drop ", "delete ", "insert ", "update ", "alter ", "create ",
+    "copy ", "attach ", "detach ", "load ", "install ",
+    "export ", "import ", "pragma ", "call ",
+    "read_csv", "read_json", "read_parquet", "read_text",
+    "write_csv", "write_parquet", "read_blob", "read_ndjson",
+    "parquet_scan", "parquet_metadata", "parquet_schema",
+    "json_scan", "csv_scan",
+    "query_table", "iceberg_scan", "delta_scan",
+    "glob(", "list_files",
+    "'/", '"/', 'http://', 'https://', 's3://', 'gcs://',
+    "information_schema", "duckdb_tables", "duckdb_columns",
+    "duckdb_databases", "duckdb_settings", "duckdb_functions",
+    "duckdb_views", "duckdb_indexes", "duckdb_schemas",
+    "pragma_table_info", "pragma_storage_info",
+    "'../", '"../',
+    ";",
+]
+
+
+class RemoteQueryError(Exception):
+    """Structured error for remote query failures."""
+
+    def __init__(self, message: str, error_type: str, details: Optional[dict] = None):
+        super().__init__(message)
+        self.error_type = error_type
+        self.details = details or {}
+
+
+class RemoteQueryEngine:
+    """Two-phase query engine: BQ subqueries + DuckDB final query."""
+
+    def __init__(
+        self,
+        conn: duckdb.DuckDBPyConnection,
+        *,
+        _bq_client_factory: Optional[Callable] = None,
+        max_bq_registration_rows: int = 500_000,
+        max_memory_mb: float = 2048.0,
+        max_result_rows: int = 100_000,
+        timeout_seconds: int = 300,
+    ):
+        self.conn = conn
+        self._bq_client_factory = _bq_client_factory
+        self.max_bq_registration_rows = max_bq_registration_rows
+        self.max_memory_mb = max_memory_mb
+        self.max_result_rows = max_result_rows
+        self.timeout_seconds = timeout_seconds
+        self._bq_stats: Dict[str, dict] = {}
+
+    def register_bq(self, alias: str, bq_sql: str) -> dict:
+        """Execute BQ subquery, register result as in-memory DuckDB view.
+
+        Returns dict with {alias, rows, columns, memory_mb}.
+        Raises RemoteQueryError on failure.
+        """
+        _validate_sql(bq_sql)
+
+        client = self._get_bq_client()
+
+        # Phase 1a: COUNT(*) pre-check
+        count_sql = f"SELECT COUNT(*) FROM ({bq_sql})"
+        try:
+            count_job = client.query(count_sql)
+            row_count = count_job.result().fetchone()[0]
+        except Exception as e:
+            raise RemoteQueryError(
+                f"BQ COUNT pre-check failed for '{alias}': {e}",
+                error_type="bq_error",
+                details={"alias": alias},
+            )
+
+        if row_count > self.max_bq_registration_rows:
+            raise RemoteQueryError(
+                f"BQ query '{alias}' returns {row_count:,} rows "
+                f"(limit: {self.max_bq_registration_rows:,})",
+                error_type="row_limit",
+                details={"alias": alias, "rows": row_count, "limit": self.max_bq_registration_rows},
+            )
+
+        # Phase 1b: Execute and register
+        try:
+            job = client.query(bq_sql)
+            try:
+                arrow_table = job.to_arrow()
+            except Exception:
+                arrow_table = job.to_arrow(create_bqstorage_client=False)
+        except Exception as e:
+            raise RemoteQueryError(
+                f"BQ query failed for '{alias}': {e}",
+                error_type="bq_error",
+                details={"alias": alias},
+            )
+
+        # Memory check (actual, not estimated)
+        memory_mb = arrow_table.nbytes / (1024 * 1024)
+        if memory_mb > self.max_memory_mb:
+            raise RemoteQueryError(
+                f"BQ result '{alias}' uses {memory_mb:.1f} MB "
+                f"(limit: {self.max_memory_mb:.0f} MB)",
+                error_type="memory_limit",
+                details={"alias": alias, "memory_mb": memory_mb, "limit": self.max_memory_mb},
+            )
+
+        self.conn.register(alias, arrow_table)
+        stats = {
+            "alias": alias,
+            "rows": arrow_table.num_rows,
+            "columns": arrow_table.num_columns,
+            "memory_mb": round(memory_mb, 3),
+        }
+        self._bq_stats[alias] = stats
+        logger.info("Registered BQ view '%s': %d rows, %.1f MB", alias, arrow_table.num_rows, memory_mb)
+        return stats
+
+    def execute(self, sql: str) -> dict:
+        """Execute final DuckDB query. Returns {columns, rows, row_count, truncated, bq_stats}."""
+        _validate_sql(sql)
+
+        try:
+            result = self.conn.execute(sql).fetchmany(self.max_result_rows + 1)
+            columns = [desc[0] for desc in self.conn.description] if self.conn.description else []
+        except Exception as e:
+            raise RemoteQueryError(
+                f"Query execution failed: {e}",
+                error_type="query_error",
+            )
+
+        truncated = len(result) > self.max_result_rows
+        rows = result[:self.max_result_rows]
+
+        # Serialize non-standard types
+        serializable_rows = []
+        for row in rows:
+            serializable_rows.append([
+                str(v) if v is not None and not isinstance(v, (int, float, bool, str)) else v
+                for v in row
+            ])
+
+        return {
+            "columns": columns,
+            "rows": serializable_rows,
+            "row_count": len(serializable_rows),
+            "truncated": truncated,
+            "bq_stats": dict(self._bq_stats),
+        }
+
+    def _get_bq_client(self):
+        """Get BigQuery client, using factory or default."""
+        if self._bq_client_factory:
+            return self._bq_client_factory()
+        try:
+            from scripts.duckdb_manager import _create_bq_client
+            project = os.environ.get("BIGQUERY_PROJECT")
+            if not project:
+                raise RemoteQueryError(
+                    "BIGQUERY_PROJECT env var not set",
+                    error_type="bq_error",
+                )
+            return _create_bq_client(project)
+        except ImportError:
+            raise RemoteQueryError(
+                "google-cloud-bigquery is not installed. "
+                "Install with: pip install google-cloud-bigquery",
+                error_type="bq_error",
+            )
+
+
+def _validate_sql(sql: str) -> None:
+    """Validate SQL against blocklist. Raises RemoteQueryError."""
+    sql_lower = sql.strip().lower()
+    for keyword in _BLOCKED_KEYWORDS:
+        if keyword in sql_lower:
+            raise RemoteQueryError(
+                f"Blocked SQL keyword: {keyword.strip()}",
+                error_type="query_error",
+            )
+    if not sql_lower.startswith("select ") and not sql_lower.startswith("with "):
+        raise RemoteQueryError(
+            "Query must start with SELECT or WITH",
+            error_type="query_error",
+        )
+
+
+def load_config() -> dict:
+    """Load remote_query config from instance.yaml."""
+    try:
+        from app.instance_config import get_value
+        return get_value("remote_query") or {}
+    except Exception:
+        return {}
+```
+
+- [ ] **Step 4: Run tests**
+
+Run: `pytest tests/test_remote_query.py -v`
+Expected: ALL PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/remote_query.py tests/test_remote_query.py
+git commit -m "feat: add RemoteQueryEngine with BQ registration and safety limits"
+```
+
+---
+
+### Task 3: CLI `da query --register-bq`
+
+**Files:**
+- Modify: `cli/commands/query.py`
+- Test: `tests/test_cli.py`
+
+- [ ] **Step 1: Write failing test**
+
+Add to `tests/test_cli.py`:
+
+```python
+class TestQueryHybrid:
+    def test_register_bq_flag_help(self):
+        result = runner.invoke(app, ["query", "--help"])
+        assert result.exit_code == 0
+        assert "register-bq" in result.output
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_cli.py::TestQueryHybrid -v`
+Expected: FAIL — `register-bq` not in help output
+
+- [ ] **Step 3: Implement CLI changes**
+
+Replace `cli/commands/query.py` with:
+
+```python
+"""Query commands — da query."""
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import List, Optional
+
+import typer
+
+
+def query_command(
+    sql: Optional[str] = typer.Argument(None, help="SQL query to execute"),
+    sql_opt: Optional[str] = typer.Option(None, "--sql", help="SQL query (alternative to positional)"),
+    remote: bool = typer.Option(False, "--remote", help="Execute on server instead of locally"),
+    register_bq: Optional[List[str]] = typer.Option(None, "--register-bq", help="Register BQ subquery: alias=SQL"),
+    stdin: bool = typer.Option(False, "--stdin", help="Read query spec from stdin (JSON)"),
+    fmt: str = typer.Option("table", "--format", "-f", help="Output format: table, json, csv"),
+    limit: int = typer.Option(1000, "--limit", help="Max rows to return"),
+):
+    """Execute SQL query against DuckDB. Supports hybrid BQ+local queries."""
+    # Resolve SQL from positional, --sql, or --stdin
+    if stdin:
+        spec = json.loads(sys.stdin.read())
+        final_sql = spec.get("sql", "")
+        register_bq = [f"{k}={v}" for k, v in spec.get("register_bq", {}).items()]
+    else:
+        final_sql = sql or sql_opt
+        if not final_sql:
+            typer.echo("Error: provide SQL as argument, --sql, or --stdin", err=True)
+            raise typer.Exit(1)
+
+    if register_bq:
+        _query_hybrid(final_sql, register_bq, fmt, limit)
+    elif remote:
+        _query_remote(final_sql, fmt, limit)
+    else:
+        _query_local(final_sql, fmt, limit)
+
+
+def _query_hybrid(sql: str, register_bq_specs: List[str], fmt: str, limit: int):
+    """Run two-phase hybrid query: BQ subqueries + local DuckDB."""
+    import duckdb
+    from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
+
+    local_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
+    db_path = local_dir / "user" / "duckdb" / "analytics.duckdb"
+    if not db_path.exists():
+        typer.echo("Local DuckDB not found. Run: da sync", err=True)
+        raise typer.Exit(1)
+
+    config = load_config()
+    conn = duckdb.connect(str(db_path), read_only=True)
+    try:
+        engine = RemoteQueryEngine(
+            conn,
+            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
+            max_memory_mb=config.get("max_memory_mb", 2048),
+            max_result_rows=limit,
+            timeout_seconds=config.get("timeout_seconds", 300),
+        )
+
+        # Phase 1: Register BQ subqueries
+        for spec in register_bq_specs:
+            eq_idx = spec.index("=")
+            alias = spec[:eq_idx].strip()
+            bq_sql = spec[eq_idx + 1:].strip()
+            try:
+                stats = engine.register_bq(alias, bq_sql)
+                typer.echo(f"  BQ '{alias}': {stats['rows']} rows, {stats['memory_mb']} MB", err=True)
+            except RemoteQueryError as e:
+                typer.echo(f"Error registering '{alias}': {e}", err=True)
+                raise typer.Exit(1)
+
+        # Phase 2: Execute final query
+        try:
+            result = engine.execute(sql)
+        except RemoteQueryError as e:
+            typer.echo(f"Query error: {e}", err=True)
+            raise typer.Exit(1)
+
+        _output(result["columns"], result["rows"], fmt)
+        if result["truncated"]:
+            typer.echo(f"(truncated at {limit} rows)", err=True)
+    finally:
+        conn.close()
+
+
+def _query_local(sql: str, fmt: str, limit: int):
+    """Run query against local DuckDB."""
+    import duckdb
+
+    local_dir = Path(os.environ.get("DA_LOCAL_DIR", "."))
+    db_path = local_dir / "user" / "duckdb" / "analytics.duckdb"
+    if not db_path.exists():
+        typer.echo("Local DuckDB not found. Run: da sync", err=True)
+        raise typer.Exit(1)
+
+    conn = duckdb.connect(str(db_path), read_only=True)
+    try:
+        result = conn.execute(sql).fetchmany(limit)
+        columns = [desc[0] for desc in conn.description] if conn.description else []
+        _output(columns, result, fmt)
+    except Exception as e:
+        typer.echo(f"Query error: {e}", err=True)
+        raise typer.Exit(1)
+    finally:
+        conn.close()
+
+
+def _query_remote(sql: str, fmt: str, limit: int):
+    """Run query against server DuckDB via API."""
+    from cli.client import api_post
+
+    resp = api_post("/api/query", json={"sql": sql, "limit": limit})
+    if resp.status_code != 200:
+        typer.echo(f"Query failed: {resp.json().get('detail', resp.text)}", err=True)
+        raise typer.Exit(1)
+
+    data = resp.json()
+    _output(data["columns"], data["rows"], fmt)
+    if data.get("truncated"):
+        typer.echo(f"(truncated at {limit} rows)", err=True)
+
+
+def _output(columns: list, rows: list, fmt: str):
+    if fmt == "json":
+        output = [dict(zip(columns, row)) for row in rows]
+        typer.echo(json.dumps(output, indent=2, default=str))
+    elif fmt == "csv":
+        typer.echo(",".join(columns))
+        for row in rows:
+            typer.echo(",".join(str(v) if v is not None else "" for v in row))
+    else:
+        from rich.console import Console
+        from rich.table import Table
+        console = Console()
+        table = Table()
+        for col in columns:
+            table.add_column(col)
+        for row in rows:
+            table.add_row(*(str(v) if v is not None else "" for v in row))
+        console.print(table)
+```
+
+- [ ] **Step 4: Run tests**
+
+Run: `pytest tests/test_cli.py -v`
+Expected: ALL PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add cli/commands/query.py tests/test_cli.py
+git commit -m "feat: add --register-bq and --stdin to da query for hybrid BQ+local queries"
+```
+
+---
+
+### Task 4: API Endpoint `POST /api/query/hybrid`
+
+**Files:**
+- Create: `app/api/query_hybrid.py`
+- Modify: `app/main.py` (register router)
+- Test: `tests/test_api.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Add to `tests/test_api.py`:
+
+```python
+class TestHybridQueryAPI:
+    def test_hybrid_query_requires_admin(self, seeded_client):
+        client, _, analyst_token = seeded_client
+        resp = client.post(
+            "/api/query/hybrid",
+            json={"sql": "SELECT 1", "register_bq": {}},
+            headers={"Authorization": f"Bearer {analyst_token}"},
+        )
+        assert resp.status_code == 403
+
+    def test_hybrid_query_local_only(self, seeded_client):
+        """Hybrid endpoint works without BQ registrations (just local query)."""
+        client, admin_token, _ = seeded_client
+        resp = client.post(
+            "/api/query/hybrid",
+            json={"sql": "SELECT 1 AS val", "register_bq": {}},
+            headers={"Authorization": f"Bearer {admin_token}"},
+        )
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["columns"] == ["val"]
+        assert data["rows"] == [[1]]
+
+    def test_hybrid_query_blocked_sql(self, seeded_client):
+        client, admin_token, _ = seeded_client
+        resp = client.post(
+            "/api/query/hybrid",
+            json={"sql": "DROP TABLE users", "register_bq": {}},
+            headers={"Authorization": f"Bearer {admin_token}"},
+        )
+        assert resp.status_code == 400
+
+    def test_hybrid_query_blocked_bq_sql(self, seeded_client):
+        client, admin_token, _ = seeded_client
+        resp = client.post(
+            "/api/query/hybrid",
+            json={
+                "sql": "SELECT 1",
+                "register_bq": {"x": "DROP TABLE something"},
+            },
+            headers={"Authorization": f"Bearer {admin_token}"},
+        )
+        assert resp.status_code == 400
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_api.py::TestHybridQueryAPI -v`
+Expected: FAIL — 404 on `/api/query/hybrid`
+
+- [ ] **Step 3: Implement API endpoint**
+
+Create `app/api/query_hybrid.py`:
+
+```python
+"""Hybrid query endpoint — two-phase BQ + DuckDB queries."""
+
+from typing import Dict, Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+import duckdb
+
+from app.auth.dependencies import require_admin, _get_db
+from src.db import get_analytics_db_readonly
+from src.remote_query import RemoteQueryEngine, RemoteQueryError, load_config
+
+router = APIRouter(prefix="/api/query", tags=["query"])
+
+
+class HybridQueryRequest(BaseModel):
+    """Request body for POST /api/query/hybrid."""
+
+    # Final SQL executed through the engine after all BigQuery registrations.
+    sql: str
+    # Maps view alias -> BigQuery SQL; each entry is passed to engine.register_bq().
+    register_bq: Dict[str, str] = {}
+    # Requested output format.
+    # NOTE(review): hybrid_query does not read this field — confirm it is still needed.
+    format: str = "json"
+
+
+@router.post("/hybrid")
+async def hybrid_query(
+    request: HybridQueryRequest,
+    user: dict = Depends(require_admin),
+):
+    """Execute a two-phase hybrid query: BQ subqueries + DuckDB final query."""
+    config = load_config()
+    analytics = get_analytics_db_readonly()
+    try:
+        engine = RemoteQueryEngine(
+            analytics,
+            max_bq_registration_rows=config.get("max_bq_registration_rows", 500_000),
+            max_memory_mb=config.get("max_memory_mb", 2048),
+            max_result_rows=config.get("max_result_rows", 100_000),
+            timeout_seconds=config.get("timeout_seconds", 300),
+        )
+
+        # Phase 1: Register BQ subqueries
+        for alias, bq_sql in request.register_bq.items():
+            try:
+                engine.register_bq(alias, bq_sql)
+            except RemoteQueryError as e:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"BQ registration '{alias}' failed: {e.error_type}: {str(e)}",
+                )
+
+        # Phase 2: Execute final query
+        try:
+            result = engine.execute(request.sql)
+        except RemoteQueryError as e:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Query failed: {e.error_type}: {str(e)}",
+            )
+
+        return result
+    finally:
+        analytics.close()
+```
+
+Register in `app/main.py`:
+
+```python
+from app.api.query_hybrid import router as query_hybrid_router
+# ...
+app.include_router(query_hybrid_router)  # before web_router
+```
+
+- [ ] **Step 4: Run tests**
+
+Run: `pytest tests/test_api.py::TestHybridQueryAPI -v`
+Expected: ALL PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add app/api/query_hybrid.py app/main.py tests/test_api.py
+git commit -m "feat: add POST /api/query/hybrid endpoint for two-phase BQ+DuckDB queries"
+```
+
+---
+
+### Task 5: CLAUDE.md + Integration Test
+
+**Files:**
+- Modify: `CLAUDE.md`
+- Test: run full suite
+
+- [ ] **Step 1: Add hybrid query docs to CLAUDE.md**
+
+After the "## Business Metrics" section, add:
+
+````markdown
+## Hybrid Queries (BigQuery + Local)
+
+For tables too large to sync locally, use hybrid queries that JOIN local data with on-demand BigQuery results:
+
+```bash
+da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
+         --register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
+```
+
+The `--register-bq` flag executes a BigQuery subquery, loads the result into memory, and makes it available as a DuckDB view for the final SQL. Multiple `--register-bq` flags can be used for multiple BQ sources.
+
+For complex SQL, use stdin mode:
+```bash
+echo '{"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}' | da query --stdin
+```
+````
+
+- [ ] **Step 2: Run full test suite**
+
+Run: `pytest tests/ -v --timeout=60`
+Expected: ALL PASS
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add CLAUDE.md
+git commit -m "docs: add hybrid query usage instructions to CLAUDE.md"
+```
--- a/docs/superpowers/specs/2026-04-11-remote-query-design.md
+++ b/docs/superpowers/specs/2026-04-11-remote-query-design.md
@ -0,0 +1,205 @@
+# Remote Query — Design Spec
+
+**Date:** 2026-04-11
+**Status:** Approved
+**Scope:** Fix extension re-attach + two-phase remote query engine
+
+## Context
+
+BigQuery remote views created by the orchestrator don't work at query time because `get_analytics_db_readonly()` opens a fresh connection without re-loading the BigQuery extension. Additionally, the platform lacks the ability to run hybrid queries that JOIN local Parquet data with on-demand BigQuery subquery results.
+
+The `padak/tmp_oss` v1 repo has `src/remote_query.py` with a two-phase protocol. The existing `scripts/duckdb_manager.py` in this repo already has `register_bq_table()` and `_create_bq_client()` helper functions. The `table_registry` already supports `query_mode` values: `local`, `remote`, `hybrid`.
+
+**Primary user:** Claude Code agent running `da query` locally, or API consumers via `POST /api/query/hybrid`.
+
+---
+
+## Part 1: Fix Extension Re-attach
+
+### Problem
+
+`get_analytics_db_readonly()` in `src/db.py` opens analytics.duckdb in read-only mode and ATTACHes extract.duckdb files, but does NOT re-load extensions referenced in `_remote_attach` tables. BigQuery remote views fail with "Catalog Error: bq not found".
+
+### Solution
+
+After ATTACHing extract.duckdb files in `get_analytics_db_readonly()`, scan each for a `_remote_attach` table. For each record, re-load the extension and re-attach the remote source.
+
+**Important: DuckDB read-only LOAD behavior.** The `read_only=True` flag on `duckdb.connect()` blocks writes to the DB file, whereas `LOAD` only reads the pre-installed extension binary from the extension directory `~/.duckdb/extensions/` (separate from the DB file; only `INSTALL` writes there). This should work, but MUST be empirically verified as the first implementation step. If LOAD fails in read-only mode, the workaround is to open the analytics DB WITHOUT `read_only=True` but still use read-only SQL patterns (no INSERT/UPDATE/DELETE), or to call `LOAD` on a separate in-memory connection first (DuckDB extension cache is process-wide).
+
+Steps for each `_remote_attach` record:
+1. `LOAD {extension}` — loads pre-installed extension from disk
+2. Read token from `os.environ[token_env]` if `token_env` is non-empty
+3. `ATTACH '{url}' AS {alias} (TYPE {extension}, READ_ONLY)` — with TOKEN if needed
+
+If LOAD or ATTACH fails, log a warning and continue — local views still work.
+
+### Changes
+
+**File:** `src/db.py` — `get_analytics_db_readonly()` function
+
+Add ~25 lines after the existing extract.duckdb ATTACH loop. Read `_remote_attach` table from each attached extract DB, collect unique (alias, extension, url, token_env) tuples, and re-attach.
+
+Pattern follows `src/orchestrator.py:_attach_remote_extensions()` but simplified (no INSTALL — orchestrator pre-installs during rebuild).
+
+**Concurrency note:** If the orchestrator runs `_atomic_swap_db()` while a read-only connection is open, the existing connection holds a file descriptor to the old inode (Unix semantics). This is safe — the old data remains accessible until the connection is closed.
+
+---
+
+## Part 2: Two-Phase Remote Query Engine
+
+### Architecture
+
+New module `src/remote_query.py` with a `RemoteQueryEngine` class:
+
+```python
+class RemoteQueryEngine:
+    """Two-phase engine: register BigQuery results (phase 1), then query DuckDB (phase 2)."""
+
+    def __init__(self, conn: duckdb.DuckDBPyConnection):
+        """Takes an existing DuckDB connection (analytics.duckdb with local views)."""
+
+    def register_bq(self, alias: str, bq_sql: str) -> dict:
+        """Execute BQ subquery, register result as in-memory DuckDB view.
+        Returns {alias, rows, columns, memory_mb}.
+        Raises RemoteQueryError on safety limit violation."""
+
+    def execute(self, sql: str) -> dict:
+        """Execute final DuckDB query against local + registered BQ views.
+        Returns {columns: [...], rows: [...], row_count: int, truncated: bool}."""
+
+### Two-Phase Flow
+
+1. **Phase 1 — BQ Registration:** For each `register_bq(alias, bq_sql)` call:
+   - COUNT(*) pre-check via Python BQ client → reject if >max_bq_registration_rows
+   - Memory estimate: ~50 bytes/cell × rows × cols → reject if >max_memory_mb. Note: this is approximate. After query completes, use `arrow_table.nbytes` for accurate reporting in `bq_stats`.
+   - Execute BQ query → `job.to_arrow()` → `conn.register(alias, arrow_table)`
+   - Uses `scripts/duckdb_manager.py:_create_bq_client()` for BQ client creation (reuse)
+   - Does NOT delegate to `register_bq_table()` directly — `RemoteQueryEngine.register_bq()` wraps BQ query execution with its own pre-check logic (COUNT, memory estimate), then calls `conn.register(alias, arrow_table)`. The existing `register_bq_table()` has no pre-check capability and would need signature changes to add one. Wrapping is cleaner than modifying shared code.
+   - Gracefully handle missing `google-cloud-bigquery` package: catch `ImportError` and raise `RemoteQueryError(error_type="bq_error", message="google-cloud-bigquery not installed")`
+
+2. **Phase 2 — DuckDB Query:** Execute final SQL against all views (local Parquet + registered BQ Arrow tables). Apply max_result_rows limit.
+
+### Safety Limits
+
+Configurable in `config/instance.yaml` under `remote_query:`:
+
+```yaml
+remote_query:
+  max_bq_registration_rows: 500000   # max rows from a single BQ subquery (matches existing instance.yaml.example key)
+  max_memory_mb: 2048                # max estimated memory for BQ result
+  max_result_rows: 100000            # max rows in final result
+  timeout_seconds: 300               # BQ query timeout
+```
+
+Note: `max_bq_registration_rows` matches the key already documented in `config/instance.yaml.example`.
+
+Defaults are hardcoded in `RemoteQueryEngine` and overridden by instance config.
+
+### Error Handling
+
+Custom `RemoteQueryError` exception with structured error:
+
+```python
+class RemoteQueryError(Exception):
+    def __init__(self, message: str, error_type: str, details: dict = None):
+        # error_type: "row_limit", "memory_limit", "bq_error", "query_error", "timeout"
+```
+
+### CLI: `da query` Extension
+
+Extend existing `cli/commands/query.py`:
+
+```
+da query --sql "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date" \
+         --register-bq "traffic=SELECT date, SUM(views) as views FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
+```
+
+- Multiple `--register-bq` flags allowed (one per BQ alias)
+- Format: `"alias=BQ_SQL"` (split on first `=`)
+- `--stdin` mode: reads JSON from stdin for complex SQL:
+  ```json
+  {"register_bq": {"traffic": "SELECT ..."}, "sql": "SELECT ..."}
+  ```
+- Output formats: `table` (default), `csv`, `json`
+
+**CLI argument handling:** The existing `query_command` has `sql` as a required positional argument. When `--register-bq` is used, `sql` should be provided via `--sql` flag instead (named option, not positional). When `--stdin` is used, both `sql` and `register_bq` come from stdin JSON. Make `sql` an optional positional (`typer.Argument(None)`) and validate that exactly one of (positional sql, --sql flag, --stdin) is provided.
+
+### API: `POST /api/query/hybrid`
+
+```
+POST /api/query/hybrid
+Authorization: Bearer <admin_token>
+
+{
+  "register_bq": {
+    "traffic": "SELECT date, SUM(views) FROM dataset.web WHERE date > '2026-01-01' GROUP BY 1"
+  },
+  "sql": "SELECT o.*, t.views FROM orders o JOIN traffic t ON o.date = t.date",
+  "format": "json"
+}
+```
+
+**Response:**
+```json
+{
+  "columns": ["order_id", "date", "views"],
+  "rows": [...],
+  "row_count": 1234,
+  "truncated": false,
+  "bq_stats": {
+    "traffic": {"rows": 365, "columns": 2, "memory_mb": 0.03}
+  }
+}
+```
+
+**Auth:** `require_admin` — BQ queries cost money, only admins can trigger them.
+
+**Validation — both `register_bq` SQL and final `sql`:**
+- Apply the same SQL blocklist from `app/api/query.py` (blocks LOAD, ATTACH, INSTALL, read_parquet with paths, path traversal patterns, etc.)
+- `register_bq` SQL additionally validated as SELECT-only (no INSERT/UPDATE/DELETE/DROP)
+- Reuse the existing `_validate_sql()` helper from `app/api/query.py` (extract to shared utility if needed)
+
+**Connection lifecycle:** The API endpoint owns the connection. Pattern:
+```python
+analytics = get_analytics_db_readonly()
+try:
+    engine = RemoteQueryEngine(analytics)
+    # ... register_bq + execute
+finally:
+    analytics.close()
+```
+
+---
+
+## Implementation Summary
+
+### New Files
+
+| File | Purpose |
+|---|---|
+| `src/remote_query.py` | `RemoteQueryEngine` class + `RemoteQueryError` |
+| `app/api/query_hybrid.py` | `POST /api/query/hybrid` endpoint |
+| `tests/test_remote_query.py` | Engine unit tests (mocked BQ client) |
+
+### Modified Files
+
+| File | Changes |
+|---|---|
+| `src/db.py` | `get_analytics_db_readonly()` — add extension re-attach from `_remote_attach` |
+| `cli/commands/query.py` | Add `--register-bq` and `--stdin` flags |
+| `app/main.py` | Register hybrid query router |
+| `CLAUDE.md` | Document hybrid query usage |
+
+### Implementation Order
+
+1. Fix extension re-attach in `src/db.py` (unblocks remote views)
+2. `RemoteQueryEngine` in `src/remote_query.py` (core logic)
+3. CLI extension `--register-bq`
+4. API endpoint `POST /api/query/hybrid`
+5. CLAUDE.md update + integration tests
+
+### Test Coverage
+
+- `tests/test_remote_query.py` — engine tests with mocked BQ client (safety limits, registration, error handling)
+- `tests/test_db.py` — extension re-attach test (mock _remote_attach table)
+- `tests/test_api.py` — hybrid query endpoint (auth, validation)
+- `tests/test_cli.py` — `--register-bq` flag parsing
--- a/src/db.py
+++ b/src/db.py
@ -250,6 +250,93 @@ def get_analytics_db() -> duckdb.DuckDBPyConnection:
    return duckdb.connect(str(db_path))


+def _reattach_remote_extensions(
+    conn: duckdb.DuckDBPyConnection, extracts_dir: Path
+) -> None:
+    """Re-LOAD DuckDB extensions listed in _remote_attach tables of each extract.duckdb.
+
+    Called from get_analytics_db_readonly() after ATTACHing extract.duckdb files so
+    that remote views (e.g. BigQuery) resolve correctly.  Uses LOAD only — no INSTALL —
+    to avoid touching the network in read-only query paths.
+    """
+    if not extracts_dir.exists():
+        return
+
+    try:
+        attached_dbs = {
+            r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()
+        }
+    except Exception:
+        return
+
+    for ext_dir in sorted(extracts_dir.iterdir()):
+        if not ext_dir.is_dir():
+            continue
+        if not _SAFE_IDENTIFIER.match(ext_dir.name):
+            continue
+        db_file = ext_dir / "extract.duckdb"
+        if not db_file.exists():
+            continue
+        # Only process sources that were successfully attached
+        if ext_dir.name not in attached_dbs:
+            continue
+
+        # Check whether this extract has a _remote_attach table
+        try:
+            has_table = conn.execute(
+                "SELECT 1 FROM information_schema.tables "
+                f"WHERE table_catalog='{ext_dir.name}' AND table_name='_remote_attach'"
+            ).fetchone()
+            if not has_table:
+                continue
+        except Exception:
+            continue
+
+        try:
+            rows = conn.execute(
+                f"SELECT alias, extension, url, token_env FROM {ext_dir.name}._remote_attach"
+            ).fetchall()
+        except Exception as e:
+            logger.debug("Could not read _remote_attach from %s: %s", ext_dir.name, e)
+            continue
+
+        # Refresh attached list before processing each source's rows
+        try:
+            attached_dbs = {
+                r[0] for r in conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()
+            }
+        except Exception:
+            pass
+
+        for alias, extension, url, token_env in rows:
+            if not _SAFE_IDENTIFIER.match(alias or ""):
+                logger.debug("Skipping unsafe remote_attach alias: %r", alias)
+                continue
+            if not _SAFE_IDENTIFIER.match(extension or ""):
+                logger.debug("Skipping unsafe remote_attach extension: %r", extension)
+                continue
+            if alias in attached_dbs:
+                logger.debug("Remote source %s already attached, skipping", alias)
+                continue
+            try:
+                conn.execute(f"LOAD {extension};")
+                token = os.environ.get(token_env, "") if token_env else ""
+                safe_url = url.replace("'", "''")
+                if token:
+                    escaped_token = token.replace("'", "''")
+                    conn.execute(
+                        f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')"
+                    )
+                else:
+                    conn.execute(
+                        f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)"
+                    )
+                attached_dbs.add(alias)
+                logger.debug("Re-attached remote source %s via %s extension", alias, extension)
+            except Exception as e:
+                logger.debug("Could not re-attach remote source %s: %s", alias, e)
+
+
 def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
    """Read-only connection to analytics DB. Blocks writes and external access.

@ -277,6 +364,8 @@ def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
                    conn.execute(f"ATTACH '{db_file}' AS {ext_dir.name} (READ_ONLY)")
                except Exception:
                    pass
+    # Re-attach remote extensions so BigQuery / other remote views resolve.
+    _reattach_remote_extensions(conn, extracts_dir)
    # Note: external_access stays enabled because views use read_parquet() on local files.
    # File-path-based attacks are blocked by the SQL blocklist in app/api/query.py.
    return conn
--- a/src/remote_query.py
+++ b/src/remote_query.py
@ -0,0 +1,431 @@
+"""RemoteQueryEngine — two-phase BQ registration + DuckDB execution.
+
+Phase 1 (register_bq): validate SQL, COUNT(*) pre-check against BigQuery,
+fetch Arrow table, check memory, register as DuckDB view.
+
+Phase 2 (execute): validate SQL, execute against DuckDB (which may reference
+registered BQ views), serialize and return results.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import duckdb
+
+# A letter or underscore followed by up to 63 letters, digits or underscores.
+_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
+
+# Names that register_bq() refuses as view aliases (compared lower-cased) so a
+# registration cannot shadow DuckDB system schemas or metadata functions.
+_RESERVED_ALIASES = {
+    "information_schema", "duckdb_tables", "duckdb_columns",
+    "duckdb_databases", "duckdb_settings", "duckdb_functions",
+    "duckdb_views", "duckdb_indexes", "duckdb_schemas",
+    "main", "memory", "system", "temp",
+}
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# SQL blocklist — based on app/api/query.py, extended with additional DuckDB metadata tables
+# ---------------------------------------------------------------------------
+
+# Patterns checked by _validate_sql() against the lower-cased query text.
+# Entries ending in a space are statement keywords; the remaining entries are
+# function names, path/URL prefixes, metadata objects and the ";" separator.
+_BLOCKED_KEYWORDS: List[str] = [
+    "drop ",
+    "delete ",
+    "insert ",
+    "update ",
+    "alter ",
+    "create ",
+    "copy ",
+    "attach ",
+    "detach ",
+    "load ",
+    "install ",
+    "export ",
+    "import ",
+    "pragma ",
+    "call ",
+    # File access functions
+    "read_csv",
+    "read_json",
+    "read_parquet",
+    "read_text",
+    "write_csv",
+    "write_parquet",
+    "read_blob",
+    "read_ndjson",
+    "parquet_scan",
+    "parquet_metadata",
+    "parquet_schema",
+    "json_scan",
+    "csv_scan",
+    "query_table",
+    "iceberg_scan",
+    "delta_scan",
+    "glob(",
+    "list_files",
+    "'/",
+    '\"/',
+    "http://",
+    "https://",
+    "s3://",
+    "gcs://",
+    # DuckDB metadata (leaks schema info regardless of RBAC)
+    "information_schema",
+    "duckdb_tables",
+    "duckdb_columns",
+    "duckdb_databases",
+    "duckdb_settings",
+    "duckdb_functions",
+    "duckdb_views",
+    "duckdb_indexes",
+    "duckdb_schemas",
+    "pragma_table_info",
+    "pragma_storage_info",
+    # Relative path traversal
+    "'../",
+    '"../',
+    # Multiple statements
+    ";",
+]
+
+
+# ---------------------------------------------------------------------------
+# Exception
+# ---------------------------------------------------------------------------
+
+
+class RemoteQueryError(Exception):
+    """Raised by RemoteQueryEngine for all controlled error conditions.
+
+    Attributes:
+        error_type: One of "row_limit", "memory_limit", "bq_error",
+                    "query_error", "timeout".
+        details: Optional dict with additional context.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        error_type: str,
+        details: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        super().__init__(message)
+        self.error_type = error_type
+        self.details = details or {}
+
+
+# ---------------------------------------------------------------------------
+# Module-level helpers
+# ---------------------------------------------------------------------------
+
+
+def _validate_sql(sql: str) -> None:
+    """Raise RemoteQueryError if *sql* contains blocked patterns.
+
+    Raises:
+        RemoteQueryError: with error_type="query_error" if validation fails.
+    """
+    sql_lower = sql.strip().lower()
+
+    for keyword in _BLOCKED_KEYWORDS:
+        if keyword in sql_lower:
+            raise RemoteQueryError(
+                f"Blocked SQL pattern: {keyword!r}",
+                error_type="query_error",
+                details={"blocked_keyword": keyword},
+            )
+
+    if not sql_lower.startswith("select ") and not sql_lower.startswith("with "):
+        raise RemoteQueryError(
+            "Query must start with SELECT or WITH",
+            error_type="query_error",
+        )
+
+
+# BQ SQL blocklist — only blocks write/mutation operations.
+# Checked by _validate_bq_sql() against the lower-cased BigQuery SQL; it is
+# deliberately narrower than _BLOCKED_KEYWORDS (no file/URL/metadata patterns).
+_BQ_BLOCKED_KEYWORDS = [
+    "drop ",
+    "delete ",
+    "insert ",
+    "update ",
+    "alter ",
+    "create ",
+    "truncate ",
+    "merge ",
+    ";",  # prevent multi-statement
+]
+
+
+def _validate_bq_sql(sql: str) -> None:
+    """Validate BQ SQL — narrower than DuckDB blocklist, only blocks writes."""
+    sql_lower = sql.strip().lower()
+    for keyword in _BQ_BLOCKED_KEYWORDS:
+        if keyword in sql_lower:
+            raise RemoteQueryError(
+                f"Blocked BQ SQL keyword: {keyword.strip()}",
+                error_type="query_error",
+            )
+    if not sql_lower.startswith("select ") and not sql_lower.startswith("with "):
+        raise RemoteQueryError(
+            "BQ query must start with SELECT or WITH",
+            error_type="query_error",
+        )
+
+
+def load_config() -> Dict[str, Any]:
+    """Load the ``remote_query:`` section from instance.yaml.
+
+    Returns an empty dict if the section is missing or config cannot be loaded.
+    """
+    try:
+        from app.instance_config import get_value
+
+        return get_value("remote_query", default={}) or {}
+    except Exception:
+        return {}
+
+
+# ---------------------------------------------------------------------------
+# Engine
+# ---------------------------------------------------------------------------
+
+
+class RemoteQueryEngine:
+    """Two-phase query engine: BQ registration (Phase 1) + DuckDB execution (Phase 2).
+
+    Args:
+        conn: Open DuckDB connection used for both view registration and querying.
+        _bq_client_factory: Optional callable ``(project: str) -> BQ client``.
+            Defaults to ``scripts.duckdb_manager._create_bq_client``.
+        max_bq_registration_rows: Maximum rows allowed in a single BQ registration.
+        max_memory_mb: Maximum in-memory Arrow table size (MiB).
+        max_result_rows: Maximum rows returned by ``execute()``.
+        timeout_seconds: Query timeout (reserved for future use).
+    """
+
+    def __init__(
+        self,
+        conn: duckdb.DuckDBPyConnection,
+        *,
+        _bq_client_factory=None,
+        max_bq_registration_rows: int = 500_000,
+        max_memory_mb: float = 2048.0,
+        max_result_rows: int = 100_000,
+        timeout_seconds: int = 300,
+    ) -> None:
+        self._conn = conn
+        self._bq_client_factory = _bq_client_factory
+        self.max_bq_registration_rows = max_bq_registration_rows
+        self.max_memory_mb = max_memory_mb
+        self.max_result_rows = max_result_rows
+        self.timeout_seconds = timeout_seconds
+
+        # Track which aliases have been registered in this session
+        self._registered: Dict[str, Dict[str, Any]] = {}
+
+    # ------------------------------------------------------------------
+    # Phase 1
+    # ------------------------------------------------------------------
+
+    def register_bq(self, alias: str, bq_sql: str) -> Dict[str, Any]:
+        """Register a BigQuery query result as a DuckDB view.
+
+        Steps:
+        1. Validate *alias* and check *bq_sql* with ``_validate_bq_sql``.
+        2. COUNT(*) pre-check via BQ client.
+        3. Execute the actual BQ query and fetch as Arrow table.
+        4. Re-check the fetched row count, then the in-memory size
+           against *max_memory_mb*.
+        5. Register Arrow table in DuckDB under *alias*.
+
+        Args:
+            alias: DuckDB view name to register (e.g. ``"bq_orders"``).
+            bq_sql: SQL query to execute on BigQuery.
+
+        Returns:
+            ``{alias, rows, columns, memory_mb}``
+
+        Raises:
+            RemoteQueryError: For an invalid or reserved alias, SQL rejected
+                by ``_validate_bq_sql``, row/memory limits, BQ errors, or an
+                unavailable BQ client (see ``_get_bq_client``).
+        """
+        if not _SAFE_IDENTIFIER.match(alias or ""):
+            raise RemoteQueryError(
+                f"Invalid alias {alias!r}: must be a valid SQL identifier",
+                error_type="query_error",
+            )
+        if alias.lower() in _RESERVED_ALIASES:
+            raise RemoteQueryError(
+                f"Reserved alias {alias!r}: cannot shadow system objects",
+                error_type="query_error",
+            )
+
+        _validate_bq_sql(bq_sql)
+
+        client = self._get_bq_client()
+
+        # --- Phase 1a: COUNT(*) pre-check ---
+        # Drop a trailing ';' and keep the closing parenthesis on its own line
+        # so a trailing "-- comment" in the user SQL cannot swallow it.
+        inner_sql = bq_sql.strip().rstrip(";").rstrip()
+        count_sql = f"SELECT COUNT(*) FROM (\n{inner_sql}\n) AS _cnt"
+        try:
+            count_job = client.query(count_sql)
+            # Same Storage-API fallback as the data fetch: without it a
+            # permission error here would abort before the fallback is tried.
+            count_arrow = self._to_arrow_with_rest_fallback(count_job)
+            count_value = int(count_arrow.column(0)[0].as_py())
+        except RemoteQueryError:
+            raise
+        except Exception as exc:
+            raise RemoteQueryError(
+                f"BQ COUNT pre-check failed: {exc}",
+                error_type="bq_error",
+                details={"original_error": str(exc)},
+            ) from exc
+
+        if count_value > self.max_bq_registration_rows:
+            raise RemoteQueryError(
+                f"BQ result has {count_value:,} rows, exceeding the "
+                f"limit of {self.max_bq_registration_rows:,}.",
+                error_type="row_limit",
+                details={
+                    "count": count_value,
+                    "max": self.max_bq_registration_rows,
+                },
+            )
+
+        # --- Phase 1b: Fetch actual data ---
+        try:
+            data_job = client.query(bq_sql)
+            arrow_table = self._to_arrow_with_rest_fallback(data_job)
+        except RemoteQueryError:
+            raise
+        except Exception as exc:
+            raise RemoteQueryError(
+                f"BQ query failed: {exc}",
+                error_type="bq_error",
+                details={"original_error": str(exc)},
+            ) from exc
+
+        # The COUNT ran as a separate query, so the fetched result may be
+        # larger (changing data, non-deterministic SQL). Enforce the limit on
+        # what was actually fetched.
+        if arrow_table.num_rows > self.max_bq_registration_rows:
+            raise RemoteQueryError(
+                f"BQ result has {arrow_table.num_rows:,} rows, exceeding the "
+                f"limit of {self.max_bq_registration_rows:,}.",
+                error_type="row_limit",
+                details={
+                    "count": arrow_table.num_rows,
+                    "max": self.max_bq_registration_rows,
+                },
+            )
+
+        # --- Phase 1c: Memory check (accurate, post-fetch) ---
+        memory_mb = arrow_table.nbytes / (1024 * 1024)
+        if memory_mb > self.max_memory_mb:
+            raise RemoteQueryError(
+                f"Arrow table uses {memory_mb:.1f} MiB, exceeding the "
+                f"limit of {self.max_memory_mb:.1f} MiB.",
+                error_type="memory_limit",
+                details={"memory_mb": memory_mb, "max_memory_mb": self.max_memory_mb},
+            )
+
+        # --- Phase 1d: Register in DuckDB ---
+        self._conn.register(alias, arrow_table)
+
+        info: Dict[str, Any] = {
+            "alias": alias,
+            "rows": arrow_table.num_rows,
+            "columns": arrow_table.schema.names,
+            "memory_mb": memory_mb,
+        }
+        self._registered[alias] = info
+        logger.info(
+            "Registered BQ alias %r: %d rows, %.2f MiB",
+            alias,
+            arrow_table.num_rows,
+            memory_mb,
+        )
+        return info
+
+    @staticmethod
+    def _to_arrow_with_rest_fallback(job):
+        """Fetch *job* results as Arrow, retrying over REST if Storage API access fails.
+
+        The retry is attempted only when the error text mentions
+        ``readsessions`` or ``PERMISSION_DENIED``; any other error propagates.
+        """
+        try:
+            return job.to_arrow()
+        except Exception as storage_exc:
+            message = str(storage_exc)
+            if "readsessions" in message or "PERMISSION_DENIED" in message:
+                logger.warning("BQ Storage API unavailable, falling back to REST")
+                return job.to_arrow(create_bqstorage_client=False)
+            raise
+
+    # ------------------------------------------------------------------
+    # Phase 2
+    # ------------------------------------------------------------------
+
+    def execute(self, sql: str) -> Dict[str, Any]:
+        """Run *sql* on the DuckDB connection and return a serializable result.
+
+        The query may reference any alias previously registered through
+        ``register_bq``. At most ``max_result_rows`` rows are returned.
+
+        Args:
+            sql: SQL query to execute. Must pass the SQL blocklist.
+
+        Returns:
+            ``{columns, rows, row_count, truncated, bq_stats}``
+
+        Raises:
+            RemoteQueryError: If SQL is blocked or a DuckDB error occurs.
+        """
+        _validate_sql(sql)
+
+        limit = self.max_result_rows
+        try:
+            # One extra row is requested so truncation can be detected.
+            fetched = self._conn.execute(sql).fetchmany(limit + 1)
+            description = self._conn.description
+            columns = [col[0] for col in description] if description else []
+        except RemoteQueryError:
+            raise
+        except Exception as exc:
+            raise RemoteQueryError(
+                f"Query error: {exc}",
+                error_type="query_error",
+                details={"original_error": str(exc)},
+            ) from exc
+
+        def _plain(value):
+            # None and int/float/bool/str pass through; anything else is
+            # stringified (mirrors app/api/query.py).
+            if value is None or isinstance(value, (int, float, bool, str)):
+                return value
+            return str(value)
+
+        serializable_rows = [[_plain(cell) for cell in record] for record in fetched[:limit]]
+        registered_aliases = list(self._registered)
+
+        return {
+            "columns": columns,
+            "rows": serializable_rows,
+            "row_count": len(serializable_rows),
+            "truncated": len(fetched) > limit,
+            "bq_stats": {
+                "registered_aliases": registered_aliases,
+                "alias_count": len(registered_aliases),
+            },
+        }
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _get_bq_client(self):
+        """Return a BigQuery client from the injected factory or the default one.
+
+        When a factory was injected it is called with the value of the
+        ``BIGQUERY_PROJECT`` env var, or the placeholder ``"unknown"`` when
+        the variable is not set. Otherwise ``google.cloud.bigquery.Client``
+        is created for ``BIGQUERY_PROJECT``.
+
+        Raises:
+            RemoteQueryError: With ``error_type="bq_error"`` if no factory was
+                injected and either google-cloud-bigquery is not installed or
+                ``BIGQUERY_PROJECT`` is unset/empty.
+        """
+        if self._bq_client_factory is not None:
+            project = os.environ.get("BIGQUERY_PROJECT", "unknown")
+            return self._bq_client_factory(project)
+
+        # Lazy import so the module stays usable without BQ installed.
+        try:
+            import google.cloud.bigquery as _bq_module  # noqa: PLC0415
+        except ImportError as exc:
+            # Surface the missing dependency as the engine's own error type,
+            # keeping the original ImportError as the explicit cause.
+            raise RemoteQueryError(
+                "google-cloud-bigquery is not installed. Install with: pip install google-cloud-bigquery",
+                error_type="bq_error",
+            ) from exc
+
+        project = os.environ.get("BIGQUERY_PROJECT")
+        if not project:
+            raise RemoteQueryError(
+                "BIGQUERY_PROJECT env var is not set.",
+                error_type="bq_error",
+            )
+        return _bq_module.Client(project=project)
--- a/tests/test_api.py
+++ b/tests/test_api.py
@ -385,7 +385,7 @@ class TestMetadataAPI:
        # 'orders' is not in table_registry — expect 404 or 400
        assert resp.status_code in (400, 404)

-    def test_push_keboola_table(self, seeded_client, monkeypatch):
+    def test_push_keboola_table(self, seeded_client, monkeypatch):  # noqa: F811
        client, admin_token, _ = seeded_client

        # 1. Register a keboola table
@ -451,3 +451,53 @@ class TestMetadataAPI:
        called_json = call_args.kwargs.get("json", {})
        assert called_json.get("provider") == "ai-metadata-enrichment"
        assert isinstance(called_json.get("metadata"), list)
+
+
+# ---- Hybrid Query ----
+
+class TestHybridQueryAPI:
+    """Endpoint-level checks for ``POST /api/query/hybrid``."""
+
+    @staticmethod
+    def _post_hybrid(client, token, payload):
+        """POST *payload* to the hybrid endpoint using *token* as bearer auth."""
+        return client.post(
+            "/api/query/hybrid",
+            json=payload,
+            headers={"Authorization": f"Bearer {token}"},
+        )
+
+    def test_hybrid_query_requires_admin(self, seeded_client):
+        """The analyst token from the fixture is rejected with 403."""
+        client, _, analyst_token = seeded_client
+        resp = self._post_hybrid(
+            client,
+            analyst_token,
+            {"sql": "SELECT 1 AS val", "register_bq": {}},
+        )
+        assert resp.status_code == 403
+
+    def test_hybrid_query_local_only(self, seeded_client):
+        """With no BQ registrations a plain SELECT returns its columns and rows."""
+        client, admin_token, _ = seeded_client
+        resp = self._post_hybrid(
+            client,
+            admin_token,
+            {"sql": "SELECT 1 AS val", "register_bq": {}},
+        )
+        assert resp.status_code == 200
+        body = resp.json()
+        for key in ("columns", "rows"):
+            assert key in body
+        assert body["columns"] == ["val"]
+        assert body["rows"] == [[1]]
+
+    def test_hybrid_query_blocked_sql(self, seeded_client):
+        """A blocked final SQL statement yields 400 with a query_error detail."""
+        client, admin_token, _ = seeded_client
+        resp = self._post_hybrid(
+            client,
+            admin_token,
+            {"sql": "DROP TABLE users", "register_bq": {}},
+        )
+        assert resp.status_code == 400
+        assert "query_error" in resp.json()["detail"]
+
+    def test_hybrid_query_blocked_bq_sql(self, seeded_client):
+        """A blocked BQ registration statement yields 400 with a query_error detail."""
+        client, admin_token, _ = seeded_client
+        payload = {
+            "sql": "SELECT 1",
+            "register_bq": {"bad_alias": "DROP TABLE sensitive"},
+        }
+        resp = self._post_hybrid(client, admin_token, payload)
+        assert resp.status_code == 400
+        assert "query_error" in resp.json()["detail"]
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -238,6 +238,17 @@ class TestAdminCommands:
            assert result.exit_code == 1


+class TestQueryHybrid:
+    """CLI help output for the ``query`` command."""
+
+    def test_register_bq_flag_help(self):
+        """``query --help`` exits with 0 and mentions the BigQuery registration option."""
+        outcome = runner.invoke(app, ["query", "--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.output
+        # Rich/Typer may insert ANSI escape codes within option names,
+        # so each fragment is looked up on its own.
+        for fragment in ("register", "bq", "BigQuery"):
+            assert fragment in help_text
+
+
 class TestMetricsHelp:
    def test_metrics_help(self):
        result = runner.invoke(app, ["metrics", "--help"])
--- a/tests/test_db.py
+++ b/tests/test_db.py
@ -462,6 +462,110 @@ class TestSchemaV4:
            conn2.close()


+class TestExtensionReattach:
+    """Resilience tests for _reattach_remote_extensions() called by get_analytics_db_readonly()."""
+
+    @staticmethod
+    def _reload_db_module(tmp_path, monkeypatch):
+        """Point DATA_DIR at *tmp_path* and return the reloaded ``src.db`` module."""
+        monkeypatch.setenv("DATA_DIR", str(tmp_path))
+        import importlib
+        import src.db as db_module
+        return importlib.reload(db_module)
+
+    def _make_analytics_db(self, tmp_path):
+        """Create an empty analytics server.duckdb so get_analytics_db_readonly() takes the read_only path."""
+        import duckdb as _duckdb
+        target_dir = tmp_path / "analytics"
+        target_dir.mkdir(parents=True, exist_ok=True)
+        # Opening and closing the connection is enough to create the file.
+        _duckdb.connect(str(target_dir / "server.duckdb")).close()
+
+    def _make_extract_db(self, tmp_path, source_name, with_remote_attach=True):
+        """Create a minimal extract.duckdb, optionally with a _remote_attach table."""
+        import duckdb as _duckdb
+        source_dir = tmp_path / "extracts" / source_name
+        source_dir.mkdir(parents=True, exist_ok=True)
+
+        statements = [
+            "CREATE TABLE _meta (table_name VARCHAR, description VARCHAR, rows BIGINT, "
+            "size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR)"
+        ]
+        if with_remote_attach:
+            statements.append(
+                "CREATE TABLE _remote_attach (alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR)"
+            )
+            # Use 'bigquery' which won't be installed in CI — tests resilience
+            statements.append(
+                "INSERT INTO _remote_attach VALUES ('bq', 'bigquery', 'project/dataset', '')"
+            )
+
+        extract = _duckdb.connect(str(source_dir / "extract.duckdb"))
+        try:
+            for statement in statements:
+                extract.execute(statement)
+        finally:
+            extract.close()
+
+    def test_reads_remote_attach_table(self, tmp_path, monkeypatch):
+        """get_analytics_db_readonly() doesn't crash even when LOAD fails for missing extension."""
+        db_module = self._reload_db_module(tmp_path, monkeypatch)
+
+        self._make_analytics_db(tmp_path)
+        self._make_extract_db(tmp_path, "mysource", with_remote_attach=True)
+
+        # Should not raise even though 'bigquery' extension is not installed
+        conn = db_module.get_analytics_db_readonly()
+        try:
+            # Connection must still be usable for local queries
+            row = conn.execute("SELECT 42 AS n").fetchone()
+            assert row[0] == 42
+        finally:
+            conn.close()
+
+    def test_reattach_attempts_load(self, tmp_path, monkeypatch):
+        """The extract is attached and its _remote_attach rows are readable through the connection."""
+        db_module = self._reload_db_module(tmp_path, monkeypatch)
+
+        self._make_analytics_db(tmp_path)
+        self._make_extract_db(tmp_path, "bqsource", with_remote_attach=True)
+
+        conn = db_module.get_analytics_db_readonly()
+        try:
+            # The extract database must show up among the attached databases
+            attached = conn.execute("SELECT database_name FROM duckdb_databases()").fetchall()
+            dbs = {entry[0] for entry in attached}
+            assert "bqsource" in dbs, f"bqsource should be attached, got: {dbs}"
+
+            # The _remote_attach table must be visible via table_catalog
+            has = conn.execute(
+                "SELECT 1 FROM information_schema.tables "
+                "WHERE table_catalog='bqsource' AND table_name='_remote_attach'"
+            ).fetchone()
+            assert has is not None, "_remote_attach table should be visible via table_catalog"
+
+            # The single seeded row must come back unchanged
+            rows = conn.execute(
+                "SELECT alias, extension, url FROM bqsource._remote_attach"
+            ).fetchall()
+            assert len(rows) == 1
+            seeded = rows[0]
+            assert seeded[0] == "bq"
+            assert seeded[1] == "bigquery"
+        finally:
+            conn.close()
+
+    def test_skips_missing_remote_attach(self, tmp_path, monkeypatch):
+        """get_analytics_db_readonly() works fine when _remote_attach table is absent."""
+        db_module = self._reload_db_module(tmp_path, monkeypatch)
+
+        self._make_analytics_db(tmp_path)
+        self._make_extract_db(tmp_path, "localsource", with_remote_attach=False)
+
+        conn = db_module.get_analytics_db_readonly()
+        try:
+            row = conn.execute("SELECT 'ok' AS status").fetchone()
+            assert row[0] == "ok"
+        finally:
+            conn.close()
+
+
 class TestGetAnalyticsDbReadonly:
    def test_analytics_readonly_rejects_malicious_dir_name(self, tmp_path, monkeypatch):
        """Directories with SQL-injection chars in their name are skipped."""
--- a/tests/test_remote_query.py
+++ b/tests/test_remote_query.py
@ -0,0 +1,290 @@
+"""Tests for RemoteQueryEngine — two-phase BQ registration + DuckDB execution."""
+
+import sys
+from datetime import date
+from decimal import Decimal
+from unittest.mock import MagicMock, patch
+
+import duckdb
+import pyarrow as pa
+import pytest
+
+from src.remote_query import RemoteQueryEngine, RemoteQueryError, _validate_bq_sql, _validate_sql
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def analytics_conn():
+    """Yield a fresh DuckDB connection seeded with a two-row ``orders`` table."""
+    db = duckdb.connect()
+    setup_statements = (
+        "CREATE TABLE orders (id INT, date DATE, amount DECIMAL(10,2))",
+        "INSERT INTO orders VALUES (1, '2026-01-01', 100.0), (2, '2026-01-15', 200.0)",
+    )
+    for statement in setup_statements:
+        db.execute(statement)
+    yield db
+    db.close()
+
+
+def _make_bq_mock(arrow_table, count_value=None):
+    """Build a minimal BQ client mock.
+
+    The mock's ``query()`` hands out two jobs in order: the first yields a
+    one-row count table from ``to_arrow()``, the second yields *arrow_table*.
+    When *count_value* is None the count defaults to ``arrow_table.num_rows``.
+    """
+    reported_count = arrow_table.num_rows if count_value is None else count_value
+    count_result = pa.table({"count": pa.array([reported_count], type=pa.int64())})
+
+    jobs = []
+    for payload in (count_result, arrow_table):
+        job = MagicMock()
+        job.to_arrow.return_value = payload
+        jobs.append(job)
+
+    client = MagicMock()
+    client.query.side_effect = jobs
+    return client
+
+
+# ---------------------------------------------------------------------------
+# TestRemoteQueryEngineRegister
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteQueryEngineRegister:
+    """Phase 1 tests: ``RemoteQueryEngine.register_bq`` with a mocked BQ client."""
+
+    def test_register_bq_success(self, analytics_conn):
+        """Mock BQ client returning an Arrow table; verify view is queryable."""
+        arrow_table = pa.table(
+            {
+                "order_id": pa.array([10, 20, 30], type=pa.int64()),
+                "revenue": pa.array([1.0, 2.0, 3.0], type=pa.float64()),
+            }
+        )
+        mock_client = _make_bq_mock(arrow_table)
+
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=lambda project: mock_client,
+            max_bq_registration_rows=500_000,
+        )
+
+        result = engine.register_bq("bq_orders", "SELECT order_id, revenue FROM bq.orders")
+
+        assert result["alias"] == "bq_orders"
+        assert result["rows"] == 3
+        assert result["columns"] == ["order_id", "revenue"]
+        assert result["memory_mb"] > 0
+
+        # The alias must be queryable from DuckDB
+        rows = analytics_conn.execute("SELECT COUNT(*) FROM bq_orders").fetchone()
+        assert rows[0] == 3
+
+    def test_register_bq_row_limit_exceeded(self, analytics_conn):
+        """COUNT pre-check returns a value exceeding the row limit → RemoteQueryError."""
+        arrow_table = pa.table({"x": pa.array([1], type=pa.int64())})
+        # count exceeds limit
+        mock_client = _make_bq_mock(arrow_table, count_value=1_000_000)
+
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=lambda project: mock_client,
+            max_bq_registration_rows=500_000,
+        )
+
+        with pytest.raises(RemoteQueryError) as exc_info:
+            engine.register_bq("bq_big", "SELECT * FROM bq.huge_table")
+
+        assert exc_info.value.error_type == "row_limit"
+        assert exc_info.value.details["count"] == 1_000_000
+
+    def test_register_bq_invalid_alias(self, analytics_conn):
+        """Invalid and reserved aliases are rejected; a valid alias is accepted."""
+        # The client is always mocked so this test can never reach a real
+        # BigQuery backend, whatever packages or env vars are present.
+        arrow_table = pa.table({"x": pa.array([1], type=pa.int64())})
+        mock_client = _make_bq_mock(arrow_table)
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=lambda project: mock_client,
+            max_bq_registration_rows=500_000,
+        )
+
+        # Space in alias — invalid identifier
+        with pytest.raises(RemoteQueryError) as exc_info:
+            engine.register_bq("bad alias", "SELECT 1")
+        assert exc_info.value.error_type == "query_error"
+
+        # Reserved alias — information_schema
+        with pytest.raises(RemoteQueryError) as exc_info:
+            engine.register_bq("information_schema", "SELECT 1")
+        assert exc_info.value.error_type == "query_error"
+
+        # Rejected aliases must fail before any BQ call is made
+        assert mock_client.query.call_count == 0
+
+        # Valid alias — passes alias validation and registers successfully
+        result = engine.register_bq("valid_name", "SELECT 1")
+        assert result["alias"] == "valid_name"
+        assert result["rows"] == 1
+
+    def test_register_bq_missing_package(self, analytics_conn):
+        """When google-cloud-bigquery is not installed, engine must raise RemoteQueryError."""
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            # No factory — will try to import google.cloud.bigquery
+            _bq_client_factory=None,
+            max_bq_registration_rows=500_000,
+        )
+
+        # A None entry in sys.modules makes the corresponding import fail.
+        with patch.dict(sys.modules, {"google": None, "google.cloud": None, "google.cloud.bigquery": None}):
+            with pytest.raises(RemoteQueryError, match="google-cloud-bigquery"):
+                engine.register_bq("bq_alias", "SELECT 1")
+
+
+# ---------------------------------------------------------------------------
+# TestRemoteQueryEngineExecute
+# ---------------------------------------------------------------------------
+
+
+class TestRemoteQueryEngineExecute:
+    """Phase 2 tests: ``RemoteQueryEngine.execute`` against the DuckDB connection."""
+
+    def test_execute_local_only(self, analytics_conn):
+        """A query on the local table returns the expected result structure."""
+        outcome = RemoteQueryEngine(analytics_conn).execute(
+            "SELECT id, amount FROM orders ORDER BY id"
+        )
+
+        assert outcome["columns"] == ["id", "amount"]
+        assert outcome["row_count"] == 2
+        assert outcome["truncated"] is False
+        assert len(outcome["rows"]) == 2
+        # Non-standard types (Decimal) must be serialized to str
+        allowed_types = (int, float, bool, str, type(None))
+        for record in outcome["rows"]:
+            for cell in record:
+                assert isinstance(cell, allowed_types)
+
+    def test_execute_with_registered_bq(self, analytics_conn):
+        """Register an Arrow table through a mocked BQ client, then JOIN it with local orders."""
+        labels = pa.table(
+            {
+                "id": pa.array([1, 2], type=pa.int64()),
+                "label": pa.array(["first", "second"], type=pa.utf8()),
+            }
+        )
+        fake_client = _make_bq_mock(labels)
+
+        engine = RemoteQueryEngine(
+            analytics_conn,
+            _bq_client_factory=lambda project: fake_client,
+            max_bq_registration_rows=500_000,
+        )
+        engine.register_bq("bq_labels", "SELECT id, label FROM bq.labels")
+
+        join_sql = (
+            "SELECT o.id, o.amount, b.label "
+            "FROM orders o JOIN bq_labels b ON o.id = b.id "
+            "ORDER BY o.id"
+        )
+        outcome = engine.execute(join_sql)
+
+        assert outcome["row_count"] == 2
+        assert "label" in outcome["columns"]
+
+    def test_execute_respects_max_result_rows(self, analytics_conn):
+        """When max_result_rows=1, result is truncated after 1 row."""
+        capped_engine = RemoteQueryEngine(analytics_conn, max_result_rows=1)
+        outcome = capped_engine.execute("SELECT id FROM orders ORDER BY id")
+
+        assert outcome["row_count"] == 1
+        assert outcome["truncated"] is True
+
+    def test_execute_invalid_sql(self, analytics_conn):
+        """DROP TABLE must be rejected with RemoteQueryError(error_type='query_error')."""
+        engine = RemoteQueryEngine(analytics_conn)
+
+        with pytest.raises(RemoteQueryError) as caught:
+            engine.execute("DROP TABLE orders")
+
+        assert caught.value.error_type == "query_error"
+
+
+# ---------------------------------------------------------------------------
+# _validate_sql unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestValidateSql:
+    """Unit tests for the DuckDB-side SQL validator ``_validate_sql``."""
+
+    # Statements the validator must reject.
+    _BLOCKED = [
+        "DROP TABLE foo",
+        "DELETE FROM foo",
+        "INSERT INTO foo VALUES (1)",
+        "UPDATE foo SET x=1",
+        "ALTER TABLE foo ADD COLUMN y INT",
+        "CREATE TABLE foo (x INT)",
+        "COPY foo TO '/tmp/out.csv'",
+        "ATTACH '/db.duckdb'",
+        "DETACH db",
+        "LOAD 'extension'",
+        "INSTALL httpfs",
+        "SELECT read_parquet('/data/file.parquet')",
+        "SELECT * FROM '../secret/file'",
+        "SELECT 1; DROP TABLE foo",
+    ]
+
+    # Statements the validator must accept.
+    _ALLOWED = [
+        "SELECT id FROM orders",
+        "WITH cte AS (SELECT 1 AS x) SELECT x FROM cte",
+        "select count(*) from orders",
+        "with t as (select 1) select * from t",
+    ]
+
+    @pytest.mark.parametrize("sql", _BLOCKED)
+    def test_blocked_sql(self, sql):
+        """Each blocked statement raises RemoteQueryError with error_type 'query_error'."""
+        with pytest.raises(RemoteQueryError) as caught:
+            _validate_sql(sql)
+        assert caught.value.error_type == "query_error"
+
+    @pytest.mark.parametrize("sql", _ALLOWED)
+    def test_allowed_sql(self, sql):
+        """Each allowed statement passes validation without raising."""
+        _validate_sql(sql)
+
+
+# ---------------------------------------------------------------------------
+# _validate_bq_sql unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestValidateBqSql:
+    """Unit tests for the BigQuery-side SQL validator ``_validate_bq_sql``."""
+
+    # Write/mutation statements the validator must reject.
+    _BLOCKED = [
+        "DROP TABLE x",
+        "INSERT INTO x VALUES (1)",
+        "DELETE FROM x",
+        "UPDATE x SET y=1",
+        "ALTER TABLE x ADD COLUMN z INT",
+        "CREATE TABLE x (y INT)",
+        "TRUNCATE TABLE x",
+        "MERGE INTO x USING y ON x.id=y.id WHEN MATCHED THEN UPDATE SET x.a=y.a",
+        "SELECT 1; DROP TABLE x",
+    ]
+
+    # Read-only statements the validator must accept.
+    _ALLOWED = [
+        "SELECT * FROM dataset.INFORMATION_SCHEMA.COLUMNS",
+        "SELECT id FROM project.dataset.table",
+        "WITH cte AS (SELECT 1 AS x) SELECT x FROM cte",
+    ]
+
+    def test_information_schema_is_allowed(self):
+        """INFORMATION_SCHEMA queries must pass BQ SQL validation."""
+        information_schema_sql = "SELECT * FROM dataset.INFORMATION_SCHEMA.COLUMNS"
+        # Passing means no exception is raised.
+        _validate_bq_sql(information_schema_sql)
+
+    @pytest.mark.parametrize("sql", _BLOCKED)
+    def test_blocked_bq_sql(self, sql):
+        """Write/mutation operations must be rejected."""
+        with pytest.raises(RemoteQueryError) as caught:
+            _validate_bq_sql(sql)
+        assert caught.value.error_type == "query_error"
+
+    @pytest.mark.parametrize("sql", _ALLOWED)
+    def test_allowed_bq_sql(self, sql):
+        """Valid read-only BQ queries must pass."""
+        _validate_bq_sql(sql)