agnes-the-ai-analyst/tests/test_bigquery_extractor.py
ZdenekSrotyr 18e5f0b6e8 feat: implement extract.duckdb contract — orchestrator + extractors
Phase 0: extend table_registry schema (v1→v2 migration), add
source_type/bucket/source_table/query_mode columns.

Phase 1: SyncOrchestrator ATTACHes extract.duckdb files into master
analytics.duckdb. Keboola extractor uses DuckDB extension with
legacy client fallback. BigQuery extractor is remote-only via
DuckDB BQ extension (no data download).

62 tests passing.
2026-03-30 20:12:56 +02:00

173 lines
5.9 KiB
Python

"""Tests for BigQuery extractor (remote-only via DuckDB extension)."""
import re
from pathlib import Path
from unittest.mock import MagicMock
import duckdb
import pytest
@pytest.fixture
def output_dir(tmp_path):
    """Create ``<tmp_path>/extracts/bigquery`` and return its path as a string."""
    extract_root = tmp_path.joinpath("extracts", "bigquery")
    extract_root.mkdir(parents=True)
    return str(extract_root)
@pytest.fixture
def sample_configs():
    """Return two table configs (``orders`` and ``sessions``) in remote query mode."""
    # (table name, description) pairs; every other field is shared by both.
    tables = (
        ("orders", "Order data from BQ"),
        ("sessions", "Session data"),
    )
    return [
        {
            "id": f"project.analytics.{table}",
            "name": table,
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": table,
            "query_mode": "remote",
            "description": description,
        }
        for table, description in tables
    ]
class _DuckDBProxy:
    """Proxy around a real DuckDB connection that intercepts BigQuery extension SQL.

    Statements that install, load, attach or detach the BigQuery extension are
    answered with a ``MagicMock`` and never reach the wrapped connection.
    ``CREATE ... VIEW ... FROM bq.*`` statements are replaced by a one-column
    dummy table of the same name. Everything else, including attribute access,
    is forwarded to the wrapped connection.
    """

    def __init__(self, real_conn):
        """Store *real_conn*, the connection that receives forwarded calls."""
        self._real = real_conn

    def execute(self, sql, *args, **kwargs):
        """Execute *sql*, stubbing out statements that need the BigQuery extension.

        Returns a ``MagicMock`` for intercepted statements, otherwise whatever
        the wrapped connection's ``execute`` returns.
        """
        sql_upper = sql.strip().upper()
        if sql_upper.startswith(("INSTALL BIGQUERY", "LOAD BIGQUERY")):
            return MagicMock()
        if "ATTACH" in sql_upper and "BIGQUERY" in sql_upper:
            return MagicMock()
        if sql_upper.startswith("DETACH BQ"):
            return MagicMock()
        # CREATE VIEW referencing bq.* -> create a dummy table instead
        if "FROM BQ." in sql_upper and "CREATE" in sql_upper:
            match = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE)
            if match:
                view_name = match.group(1)
                self._real.execute(
                    f'CREATE OR REPLACE TABLE "{view_name}" (dummy INTEGER)'
                )
                return MagicMock()
        return self._real.execute(sql, *args, **kwargs)

    def close(self):
        """Close the wrapped connection."""
        return self._real.close()

    def __enter__(self):
        """Return the proxy itself so SQL run inside ``with`` is still intercepted."""
        # Special methods are looked up on the type and bypass __getattr__,
        # so the context-manager protocol has to be defined explicitly.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the wrapped connection; never suppress an exception."""
        self.close()
        return False

    def __getattr__(self, name):
        """Forward any attribute not defined on the proxy to the wrapped connection."""
        # __getattr__ only runs when normal lookup failed. If "_real" itself is
        # missing (instance built without __init__, e.g. by copy/pickle),
        # reading self._real below would re-enter this method forever.
        if name == "_real":
            raise AttributeError(name)
        return getattr(self._real, name)
class TestBigQueryExtractor:
    """Tests for the BigQuery extractor's ``extract.duckdb`` output."""

    @staticmethod
    def _run_init_extract(output_dir, project_id, table_configs):
        """Call ``init_extract`` with BigQuery extension SQL stubbed out.

        Every connection the extractor opens is wrapped in ``_DuckDBProxy``.
        Returns the value returned by ``init_extract``.
        """
        from unittest.mock import patch

        def proxy_connect(path=None, **kwargs):
            real_conn = duckdb.connect(path)
            return _DuckDBProxy(real_conn)

        # Replace the ``duckdb`` name inside the extractor module rather than
        # patching ``duckdb.connect`` itself: proxy_connect still needs the
        # real ``duckdb.connect`` from this test module.
        with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
            mock_mod.connect = proxy_connect
            from connectors.bigquery.extractor import init_extract

            return init_extract(output_dir, project_id, table_configs)

    def test_creates_extract_duckdb_with_meta(self, output_dir, sample_configs):
        """Test that init_extract creates extract.duckdb with _meta table."""
        result = self._run_init_extract(output_dir, "my-project", sample_configs)
        assert result["tables_registered"] == 2
        assert len(result["errors"]) == 0

        # Verify extract.duckdb has _meta with correct data
        conn = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
        try:
            meta = conn.execute(
                "SELECT table_name, query_mode FROM _meta ORDER BY table_name"
            ).fetchall()
        finally:
            conn.close()
        assert meta == [("orders", "remote"), ("sessions", "remote")]

    def test_no_data_directory_created(self, output_dir, sample_configs):
        """BigQuery is remote-only -- no data/ directory should exist."""
        # Run the extractor first; otherwise the assertion only inspects the
        # freshly created fixture directory and can never fail.
        self._run_init_extract(output_dir, "my-project", sample_configs)
        assert not (Path(output_dir) / "data").exists()

    def test_all_tables_are_remote(self, output_dir):
        """Verify all BigQuery tables get query_mode='remote' in _meta."""
        # NOTE(review): this builds _meta by hand and never calls the
        # extractor, so it only checks that the inserted value round-trips.
        db_path = Path(output_dir) / "extract.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            conn.execute("""CREATE TABLE _meta (
                table_name VARCHAR, description VARCHAR, rows BIGINT,
                size_bytes BIGINT, extracted_at TIMESTAMP,
                query_mode VARCHAR DEFAULT 'remote'
            )""")
            conn.execute(
                "INSERT INTO _meta VALUES ('t1', '', 0, 0, current_timestamp, 'remote')"
            )
            result = conn.execute("SELECT query_mode FROM _meta").fetchone()
        finally:
            # Close even when a statement fails so the database file is released.
            conn.close()
        assert result[0] == "remote"

    def test_handles_registration_failure(self, output_dir):
        """A failed table registration records error but does not stop others."""
        # NOTE(review): the failure is only simulated by not inserting a second
        # row; the extractor's own error handling is not exercised here.
        from datetime import datetime, timezone

        db_path = Path(output_dir) / "extract.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            conn.execute("""CREATE TABLE _meta (
                table_name VARCHAR, description VARCHAR, rows BIGINT,
                size_bytes BIGINT, extracted_at TIMESTAMP,
                query_mode VARCHAR DEFAULT 'remote'
            )""")
            now = datetime.now(timezone.utc)
            # Simulate: first succeeds, second fails (not inserted)
            conn.execute(
                "INSERT INTO _meta VALUES ('good_table', '', 0, 0, ?, 'remote')", [now]
            )
            meta = conn.execute("SELECT count(*) FROM _meta").fetchone()
        finally:
            conn.close()
        assert meta[0] == 1  # Only good_table registered

    def test_meta_table_schema(self, output_dir):
        """Verify _meta table has all required columns per the extract.duckdb contract."""
        from connectors.bigquery.extractor import _create_meta_table

        db_path = Path(output_dir) / "contract_check.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            _create_meta_table(conn)
            columns = conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = '_meta' ORDER BY ordinal_position"
            ).fetchall()
        finally:
            conn.close()
        col_names = [c[0] for c in columns]
        assert col_names == [
            "table_name",
            "description",
            "rows",
            "size_bytes",
            "extracted_at",
            "query_mode",
        ]