Tests cover Keboola extractor (extension + legacy fallback, _remote_attach), BigQuery extractor (remote views, contract validation), Jira service (webhook processing, HMAC verification, HTTP mocking), Jira incremental transform (upsert/delete, monthly parquet partitioning), and LLM providers (factory, AnthropicExtractor retry/auth, OpenAICompatExtractor strategy cascade, JSON extraction helpers). Also adds tests/helpers/factories.py with WebhookEventFactory.
210 lines
8.3 KiB
Python
210 lines
8.3 KiB
Python
"""Full tests for the BigQuery extractor connector."""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
from tests.helpers.contract import validate_extract_contract
|
|
|
|
|
|
@pytest.fixture
|
|
def output_dir(tmp_path):
|
|
d = tmp_path / "extracts" / "bigquery"
|
|
d.mkdir(parents=True)
|
|
return str(d)
|
|
|
|
|
|
@pytest.fixture
|
|
def sample_configs():
|
|
return [
|
|
{
|
|
"id": "proj.analytics.orders",
|
|
"name": "orders",
|
|
"source_type": "bigquery",
|
|
"bucket": "analytics",
|
|
"source_table": "orders",
|
|
"query_mode": "remote",
|
|
"description": "Order data from BQ",
|
|
},
|
|
{
|
|
"id": "proj.analytics.sessions",
|
|
"name": "sessions",
|
|
"source_type": "bigquery",
|
|
"bucket": "analytics",
|
|
"source_table": "sessions",
|
|
"query_mode": "remote",
|
|
"description": "Session data from BQ",
|
|
},
|
|
]
|
|
|
|
|
|
class _DuckDBProxy:
|
|
"""Proxy around a real DuckDB connection that intercepts BigQuery extension SQL."""
|
|
|
|
def __init__(self, real_conn):
|
|
self._real = real_conn
|
|
|
|
def execute(self, sql, *args, **kwargs):
|
|
sql_upper = sql.strip().upper()
|
|
if sql_upper.startswith("INSTALL BIGQUERY") or sql_upper.startswith("LOAD BIGQUERY"):
|
|
return MagicMock()
|
|
if "ATTACH" in sql_upper and "BIGQUERY" in sql_upper:
|
|
return MagicMock()
|
|
if sql_upper.startswith("DETACH BQ"):
|
|
return MagicMock()
|
|
# CREATE VIEW referencing bq.* -> create a dummy table instead
|
|
if "FROM BQ." in sql_upper and "CREATE" in sql_upper:
|
|
match = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE)
|
|
if match:
|
|
view_name = match.group(1)
|
|
self._real.execute(f'CREATE OR REPLACE TABLE "{view_name}" (dummy INTEGER)')
|
|
return MagicMock()
|
|
return self._real.execute(sql, *args, **kwargs)
|
|
|
|
def close(self):
|
|
return self._real.close()
|
|
|
|
def __getattr__(self, name):
|
|
return getattr(self._real, name)
|
|
|
|
|
|
def _proxy_connect(path=None, **kwargs):
|
|
real_conn = duckdb.connect(path)
|
|
return _DuckDBProxy(real_conn)
|
|
|
|
|
|
class TestBigQueryExtractorFull:
|
|
def test_init_extract_creates_contract_compliant_db(self, output_dir, sample_configs):
|
|
"""init_extract() creates extract.duckdb that passes contract validation."""
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = _proxy_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
result = init_extract(output_dir, "my-gcp-project", sample_configs)
|
|
|
|
assert result["tables_registered"] == 2
|
|
assert result["errors"] == []
|
|
|
|
db_path = str(Path(output_dir) / "extract.duckdb")
|
|
validate_extract_contract(db_path)
|
|
|
|
def test_remote_attach_table_has_correct_values(self, output_dir, sample_configs):
|
|
"""_remote_attach row must have alias=bq, extension=bigquery, url=project=<id>, token_env=''."""
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = _proxy_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
init_extract(output_dir, "acme-project", sample_configs)
|
|
|
|
conn = duckdb.connect(str(Path(output_dir) / "extract.duckdb"), read_only=True)
|
|
ra = conn.execute("SELECT alias, extension, url, token_env FROM _remote_attach").fetchone()
|
|
conn.close()
|
|
|
|
assert ra[0] == "bq"
|
|
assert ra[1] == "bigquery"
|
|
assert ra[2] == "project=acme-project"
|
|
assert ra[3] == "" # BigQuery uses GOOGLE_APPLICATION_CREDENTIALS, not token_env
|
|
|
|
def test_all_tables_have_remote_query_mode(self, output_dir, sample_configs):
|
|
"""All BigQuery tables must have query_mode='remote' in _meta."""
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = _proxy_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
init_extract(output_dir, "my-project", sample_configs)
|
|
|
|
conn = duckdb.connect(str(Path(output_dir) / "extract.duckdb"), read_only=True)
|
|
modes = conn.execute("SELECT DISTINCT query_mode FROM _meta").fetchall()
|
|
conn.close()
|
|
|
|
assert len(modes) == 1
|
|
assert modes[0][0] == "remote"
|
|
|
|
def test_no_data_directory_created(self, output_dir, sample_configs):
|
|
"""BigQuery is remote-only — no data/ directory should be created."""
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = _proxy_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
init_extract(output_dir, "my-project", sample_configs)
|
|
|
|
assert not (Path(output_dir) / "data").exists()
|
|
|
|
def test_meta_table_schema(self, output_dir):
|
|
"""_meta table must have the exact contract-required columns."""
|
|
from connectors.bigquery.extractor import _create_meta_table
|
|
|
|
db_path = Path(output_dir) / "schema_check.duckdb"
|
|
conn = duckdb.connect(str(db_path))
|
|
_create_meta_table(conn)
|
|
cols = conn.execute(
|
|
"SELECT column_name FROM information_schema.columns "
|
|
"WHERE table_name='_meta' ORDER BY ordinal_position"
|
|
).fetchall()
|
|
conn.close()
|
|
|
|
assert [c[0] for c in cols] == [
|
|
"table_name", "description", "rows", "size_bytes", "extracted_at", "query_mode"
|
|
]
|
|
|
|
def test_remote_attach_table_schema(self, output_dir):
|
|
"""_remote_attach table must have the exact contract-required columns."""
|
|
from connectors.bigquery.extractor import _create_remote_attach_table
|
|
|
|
db_path = Path(output_dir) / "ra_check.duckdb"
|
|
conn = duckdb.connect(str(db_path))
|
|
_create_remote_attach_table(conn, "test-project")
|
|
cols = conn.execute(
|
|
"SELECT column_name FROM information_schema.columns "
|
|
"WHERE table_name='_remote_attach' ORDER BY ordinal_position"
|
|
).fetchall()
|
|
conn.close()
|
|
|
|
assert [c[0] for c in cols] == ["alias", "extension", "url", "token_env"]
|
|
|
|
def test_table_registration_failure_records_error(self, output_dir):
|
|
"""A failed table registration records the error but continues others."""
|
|
configs = [
|
|
{"name": "good", "bucket": "ds", "source_table": "good", "query_mode": "remote", "description": ""},
|
|
{"name": "bad", "bucket": "ds", "source_table": "bad", "query_mode": "remote", "description": ""},
|
|
]
|
|
|
|
call_count = [0]
|
|
|
|
class FailingProxy(_DuckDBProxy):
|
|
def execute(self, sql, *args, **kwargs):
|
|
sql_upper = sql.strip().upper()
|
|
# Intercept: fail view creation for 'bad'
|
|
if "FROM BQ." in sql_upper and "CREATE" in sql_upper and '"bad"' in sql.lower():
|
|
call_count[0] += 1
|
|
raise Exception("Table not found in BigQuery")
|
|
return super().execute(sql, *args, **kwargs)
|
|
|
|
def failing_connect(path=None, **kwargs):
|
|
real_conn = duckdb.connect(path)
|
|
return FailingProxy(real_conn)
|
|
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = failing_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
result = init_extract(output_dir, "my-project", configs)
|
|
|
|
assert result["tables_registered"] == 1
|
|
assert len(result["errors"]) == 1
|
|
assert result["errors"][0]["table"] == "bad"
|
|
|
|
def test_empty_table_list(self, output_dir):
|
|
"""init_extract with no tables still creates a valid (empty) extract.duckdb."""
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = _proxy_connect
|
|
from connectors.bigquery.extractor import init_extract
|
|
result = init_extract(output_dir, "my-project", [])
|
|
|
|
assert result["tables_registered"] == 0
|
|
assert result["errors"] == []
|
|
|
|
db_path = Path(output_dir) / "extract.duckdb"
|
|
assert db_path.exists()
|
|
conn = duckdb.connect(str(db_path), read_only=True)
|
|
count = conn.execute("SELECT count(*) FROM _meta").fetchone()[0]
|
|
conn.close()
|
|
assert count == 0
|