agnes-the-ai-analyst/tests/test_bigquery_extractor_full.py
ZdenekSrotyr 3c653b6dc2 test: add connector test suite (Block D) — 5 files, 58 tests
Tests cover Keboola extractor (extension + legacy fallback, _remote_attach),
BigQuery extractor (remote views, contract validation), Jira service
(webhook processing, HMAC verification, HTTP mocking), Jira incremental
transform (upsert/delete, monthly parquet partitioning), and LLM providers
(factory, AnthropicExtractor retry/auth, OpenAICompatExtractor strategy
cascade, JSON extraction helpers). Also adds tests/helpers/factories.py
with WebhookEventFactory.
2026-04-12 11:12:50 +02:00

210 lines
8.3 KiB
Python

"""Full tests for the BigQuery extractor connector."""
import re
from pathlib import Path
from unittest.mock import MagicMock, patch
import duckdb
import pytest
from tests.helpers.contract import validate_extract_contract
@pytest.fixture
def output_dir(tmp_path):
    """Create ``extracts/bigquery`` under pytest's ``tmp_path`` and return it as a string."""
    target = tmp_path.joinpath("extracts", "bigquery")
    target.mkdir(parents=True)
    return str(target)
@pytest.fixture
def sample_configs():
    """Return two remote-mode BigQuery table configs (``orders`` and ``sessions``)."""
    # (id, table name, description) -- every other field is shared by both entries.
    tables = (
        ("proj.analytics.orders", "orders", "Order data from BQ"),
        ("proj.analytics.sessions", "sessions", "Session data from BQ"),
    )
    return [
        {
            "id": table_id,
            "name": table,
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": table,
            "query_mode": "remote",
            "description": summary,
        }
        for table_id, table, summary in tables
    ]
class _DuckDBProxy:
    """Proxy around a real DuckDB connection that intercepts BigQuery extension SQL.

    Statements that would need the BigQuery extension (``INSTALL``/``LOAD
    bigquery``, an ``ATTACH`` mentioning bigquery, ``DETACH bq``) are answered
    with a ``MagicMock`` and never reach the real connection.  A ``CREATE``
    statement selecting ``FROM bq.*`` is replaced by a one-column dummy table
    named after the view.  Every other statement, and every other attribute,
    is delegated to the wrapped connection.
    """

    # ATTACH must be a standalone keyword: a plain substring test would also
    # hit identifiers such as ``_remote_attach`` and silently swallow writes to
    # that table whenever the statement mentions "bigquery".
    _ATTACH_KEYWORD = re.compile(r"\bATTACH\b")

    def __init__(self, real_conn):
        """Wrap ``real_conn``, the connection that receives non-intercepted SQL."""
        self._real = real_conn

    def execute(self, sql, *args, **kwargs):
        """Run ``sql`` on the real connection unless it is BigQuery-specific.

        Returns a ``MagicMock`` for intercepted statements, otherwise whatever
        the wrapped connection's ``execute`` returns.
        """
        sql_upper = sql.strip().upper()
        if sql_upper.startswith("INSTALL BIGQUERY") or sql_upper.startswith("LOAD BIGQUERY"):
            return MagicMock()
        if self._ATTACH_KEYWORD.search(sql_upper) and "BIGQUERY" in sql_upper:
            return MagicMock()
        if sql_upper.startswith("DETACH BQ"):
            return MagicMock()
        # CREATE VIEW referencing bq.* -> create a dummy table instead
        if "FROM BQ." in sql_upper and "CREATE" in sql_upper:
            match = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE)
            if match:
                view_name = match.group(1)
                self._real.execute(f'CREATE OR REPLACE TABLE "{view_name}" (dummy INTEGER)')
            # The original statement is never forwarded, even if no view name
            # could be extracted.
            return MagicMock()
        return self._real.execute(sql, *args, **kwargs)

    def close(self):
        """Close the wrapped connection."""
        return self._real.close()

    def __getattr__(self, name):
        # Only called for attributes not found normally.  If ``_real`` itself is
        # missing (instance built without __init__), fail instead of recursing.
        if name == "_real":
            raise AttributeError(name)
        return getattr(self._real, name)
def _proxy_connect(path=None, **kwargs):
    """Stand-in for ``duckdb.connect`` that returns a ``_DuckDBProxy``.

    Opens a real DuckDB connection at ``path`` and wraps it.  Extra keyword
    arguments are accepted but not forwarded to ``duckdb.connect``.
    """
    return _DuckDBProxy(duckdb.connect(path))
class TestBigQueryExtractorFull:
    """Tests for ``connectors.bigquery.extractor`` run against real DuckDB files.

    ``init_extract`` is exercised with the extractor's ``duckdb`` module
    patched so that connections are wrapped in ``_DuckDBProxy``; results are
    then inspected through ordinary, unpatched DuckDB connections.
    """

    @staticmethod
    def _run_init_extract(output_dir, project_id, table_configs, connect=_proxy_connect):
        """Call ``init_extract`` with the extractor's ``duckdb.connect`` replaced.

        Args:
            output_dir: Directory passed to ``init_extract``.
            project_id: Project id passed to ``init_extract``.
            table_configs: List of table config dicts passed to ``init_extract``.
            connect: Replacement for ``duckdb.connect`` inside the extractor.

        Returns:
            Whatever ``init_extract`` returns.
        """
        with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
            mock_mod.connect = connect
            from connectors.bigquery.extractor import init_extract
            return init_extract(output_dir, project_id, table_configs)

    def test_init_extract_creates_contract_compliant_db(self, output_dir, sample_configs):
        """init_extract() creates extract.duckdb that passes contract validation."""
        result = self._run_init_extract(output_dir, "my-gcp-project", sample_configs)
        assert result["tables_registered"] == 2
        assert result["errors"] == []
        db_path = str(Path(output_dir) / "extract.duckdb")
        validate_extract_contract(db_path)

    def test_remote_attach_table_has_correct_values(self, output_dir, sample_configs):
        """_remote_attach row must have alias=bq, extension=bigquery, url=project=<id>, token_env=''."""
        self._run_init_extract(output_dir, "acme-project", sample_configs)
        # Context manager closes the connection even if the query raises.
        with duckdb.connect(str(Path(output_dir) / "extract.duckdb"), read_only=True) as conn:
            ra = conn.execute("SELECT alias, extension, url, token_env FROM _remote_attach").fetchone()
        assert ra is not None, "_remote_attach is empty"
        assert ra[0] == "bq"
        assert ra[1] == "bigquery"
        assert ra[2] == "project=acme-project"
        assert ra[3] == ""  # BigQuery uses GOOGLE_APPLICATION_CREDENTIALS, not token_env

    def test_all_tables_have_remote_query_mode(self, output_dir, sample_configs):
        """All BigQuery tables must have query_mode='remote' in _meta."""
        self._run_init_extract(output_dir, "my-project", sample_configs)
        with duckdb.connect(str(Path(output_dir) / "extract.duckdb"), read_only=True) as conn:
            modes = conn.execute("SELECT DISTINCT query_mode FROM _meta").fetchall()
        assert len(modes) == 1
        assert modes[0][0] == "remote"

    def test_no_data_directory_created(self, output_dir, sample_configs):
        """BigQuery is remote-only — no data/ directory should be created."""
        self._run_init_extract(output_dir, "my-project", sample_configs)
        assert not (Path(output_dir) / "data").exists()

    def test_meta_table_schema(self, output_dir):
        """_meta table must have the exact contract-required columns."""
        from connectors.bigquery.extractor import _create_meta_table
        db_path = Path(output_dir) / "schema_check.duckdb"
        with duckdb.connect(str(db_path)) as conn:
            _create_meta_table(conn)
            cols = conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name='_meta' ORDER BY ordinal_position"
            ).fetchall()
        assert [c[0] for c in cols] == [
            "table_name", "description", "rows", "size_bytes", "extracted_at", "query_mode"
        ]

    def test_remote_attach_table_schema(self, output_dir):
        """_remote_attach table must have the exact contract-required columns."""
        from connectors.bigquery.extractor import _create_remote_attach_table
        db_path = Path(output_dir) / "ra_check.duckdb"
        with duckdb.connect(str(db_path)) as conn:
            _create_remote_attach_table(conn, "test-project")
            cols = conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name='_remote_attach' ORDER BY ordinal_position"
            ).fetchall()
        assert [c[0] for c in cols] == ["alias", "extension", "url", "token_env"]

    def test_table_registration_failure_records_error(self, output_dir):
        """A failed table registration records the error but continues others."""
        configs = [
            {"name": "good", "bucket": "ds", "source_table": "good", "query_mode": "remote", "description": ""},
            {"name": "bad", "bucket": "ds", "source_table": "bad", "query_mode": "remote", "description": ""},
        ]
        # One-element list so the nested class can mutate the counter.
        call_count = [0]

        class FailingProxy(_DuckDBProxy):
            def execute(self, sql, *args, **kwargs):
                sql_upper = sql.strip().upper()
                # Intercept: fail view creation for 'bad'
                if "FROM BQ." in sql_upper and "CREATE" in sql_upper and '"bad"' in sql.lower():
                    call_count[0] += 1
                    raise Exception("Table not found in BigQuery")
                return super().execute(sql, *args, **kwargs)

        def failing_connect(path=None, **kwargs):
            return FailingProxy(duckdb.connect(path))

        result = self._run_init_extract(output_dir, "my-project", configs, connect=failing_connect)
        # Make sure the recorded error really comes from the injected failure.
        assert call_count[0] >= 1
        assert result["tables_registered"] == 1
        assert len(result["errors"]) == 1
        assert result["errors"][0]["table"] == "bad"

    def test_empty_table_list(self, output_dir):
        """init_extract with no tables still creates a valid (empty) extract.duckdb."""
        result = self._run_init_extract(output_dir, "my-project", [])
        assert result["tables_registered"] == 0
        assert result["errors"] == []
        db_path = Path(output_dir) / "extract.duckdb"
        assert db_path.exists()
        with duckdb.connect(str(db_path), read_only=True) as conn:
            count = conn.execute("SELECT count(*) FROM _meta").fetchone()[0]
        assert count == 0