Partitioned sync: iterates day-by-day instead of loading full dataset. Each partition: query BQ -> stream to disk -> free RAM. Peak ~50 MB. New helpers: _sync_single_partition, _cleanup_old_partitions, _generate_partition_dates. Config: added partition_column_type (DATE/TIMESTAMP/DATETIME), query_mode (local/remote/hybrid). DuckDB manager: hybrid architecture support (local Parquet + remote BQ tables). Data sync: skips remote tables, filters by query_mode. Tests: 113 passing (adapter, client, config, data_sync, duckdb_manager).
1018 lines
42 KiB
Python
1018 lines
42 KiB
Python
"""Tests for the BigQuery client connector.
|
|
|
|
All external dependencies (google.cloud.bigquery, src.config) are mocked.
|
|
Tests cover initialization, metadata caching, schema building, query methods,
|
|
and connection testing.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, mock_open, patch
|
|
|
|
import pyarrow as pa
|
|
import pytest
|
|
|
|
# Pre-populate sys.modules with a mock google.cloud.bigquery if not installed,
|
|
# so the client module can be imported without the real SDK.
|
|
_bq_mock_installed = False
|
|
try:
|
|
from google.cloud import bigquery as _bq_test # noqa: F401
|
|
except ImportError:
|
|
_bq_mock_installed = True
|
|
_mock_bigquery = MagicMock()
|
|
# Expose commonly used classes as MagicMock so the client module
|
|
# can reference bigquery.Client, bigquery.QueryJobConfig, etc.
|
|
sys.modules.setdefault("google", MagicMock())
|
|
sys.modules.setdefault("google.cloud", MagicMock())
|
|
sys.modules.setdefault("google.cloud.bigquery", _mock_bigquery)
|
|
|
|
from connectors.bigquery.client import (
|
|
BIGQUERY_TO_PYARROW_TYPES,
|
|
BigQueryClient,
|
|
create_client,
|
|
)
|
|
|
|
# Import the real or mock bigquery reference used in the client module
|
|
from google.cloud import bigquery
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_bq_field(name: str, field_type: str, description: str = None):
|
|
"""Create a mock BigQuery SchemaField."""
|
|
field = MagicMock()
|
|
field.name = name
|
|
field.field_type = field_type
|
|
field.description = description
|
|
return field
|
|
|
|
|
|
def _make_table_ref(
|
|
table_id: str = "my-project.my_dataset.my_table",
|
|
schema=None,
|
|
num_rows: int = 1000,
|
|
num_bytes: int = 50000,
|
|
created: datetime = None,
|
|
modified: datetime = None,
|
|
time_partitioning=None,
|
|
):
|
|
"""Create a mock BigQuery Table reference object."""
|
|
table_ref = MagicMock()
|
|
table_ref.table_id = table_id.split(".")[-1]
|
|
table_ref.dataset_id = table_id.split(".")[1] if "." in table_id else "dataset"
|
|
table_ref.project = table_id.split(".")[0] if "." in table_id else "project"
|
|
table_ref.schema = schema or []
|
|
table_ref.num_rows = num_rows
|
|
table_ref.num_bytes = num_bytes
|
|
table_ref.created = created or datetime(2025, 1, 1, 12, 0, 0)
|
|
table_ref.modified = modified or datetime(2025, 6, 1, 12, 0, 0)
|
|
table_ref.time_partitioning = time_partitioning
|
|
return table_ref
|
|
|
|
|
|
@pytest.fixture
|
|
def mock_config(tmp_path):
|
|
"""Mock get_config() to return a config with metadata path in tmp_path."""
|
|
config = MagicMock()
|
|
metadata_dir = tmp_path / "metadata"
|
|
metadata_dir.mkdir(parents=True, exist_ok=True)
|
|
config.get_metadata_path.return_value = metadata_dir
|
|
return config
|
|
|
|
|
|
@pytest.fixture
|
|
def mock_bq_client():
|
|
"""Create a mock BigQuery Client."""
|
|
return MagicMock()
|
|
|
|
|
|
@pytest.fixture
|
|
def client(mock_config, mock_bq_client):
|
|
"""Create a BigQueryClient instance with mocked dependencies."""
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq_client),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "test-project"}),
|
|
):
|
|
bq_client = BigQueryClient()
|
|
return bq_client
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. Init validates BIGQUERY_PROJECT env var
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestInit:
|
|
def test_raises_value_error_when_project_not_set(self, mock_config):
|
|
"""Init raises ValueError if project_id is None and env var is missing."""
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client"),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {}, clear=True),
|
|
):
|
|
with pytest.raises(ValueError, match="BigQuery project ID not set"):
|
|
BigQueryClient()
|
|
|
|
def test_raises_value_error_when_project_empty_string(self, mock_config):
|
|
"""Init raises ValueError if BIGQUERY_PROJECT is set to empty string."""
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client"),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": ""}, clear=True),
|
|
):
|
|
with pytest.raises(ValueError, match="BigQuery project ID not set"):
|
|
BigQueryClient()
|
|
|
|
# -------------------------------------------------------------------
|
|
# 2. Init creates client with correct project_id
|
|
# -------------------------------------------------------------------
|
|
|
|
def test_creates_client_with_env_project_id(self, mock_config):
|
|
"""Client uses BIGQUERY_PROJECT from environment."""
|
|
mock_bq = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq) as bq_cls,
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "env-project-123"}),
|
|
):
|
|
client = BigQueryClient()
|
|
bq_cls.assert_called_once_with(project="env-project-123")
|
|
assert client.project_id == "env-project-123"
|
|
|
|
def test_creates_client_with_explicit_project_id(self, mock_config):
|
|
"""Explicit project_id argument takes precedence over env var."""
|
|
mock_bq = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq) as bq_cls,
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
):
|
|
client = BigQueryClient(project_id="explicit-project")
|
|
bq_cls.assert_called_once_with(project="explicit-project")
|
|
assert client.project_id == "explicit-project"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. get_table_metadata fetches and caches metadata correctly
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestGetTableMetadata:
|
|
def test_fetches_metadata_from_bigquery(self, client, mock_bq_client):
|
|
"""get_table_metadata calls client.get_table and returns correct dict."""
|
|
table_id = "proj.dataset.orders"
|
|
schema = [
|
|
_make_bq_field("order_id", "INTEGER"),
|
|
_make_bq_field("customer_name", "STRING", description="Full name"),
|
|
_make_bq_field("created_at", "TIMESTAMP"),
|
|
]
|
|
table_ref = _make_table_ref(
|
|
table_id=table_id,
|
|
schema=schema,
|
|
num_rows=5000,
|
|
num_bytes=120000,
|
|
)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
metadata = client.get_table_metadata(table_id, use_cache=False)
|
|
|
|
mock_bq_client.get_table.assert_called_once_with(table_id)
|
|
assert metadata["table_id"] == table_id
|
|
assert metadata["name"] == "orders"
|
|
assert metadata["dataset"] == "dataset"
|
|
assert metadata["project"] == "proj"
|
|
assert metadata["columns"] == ["order_id", "customer_name", "created_at"]
|
|
assert metadata["column_types"]["order_id"] == "INTEGER"
|
|
assert metadata["column_types"]["customer_name"] == "STRING"
|
|
assert metadata["column_types"]["created_at"] == "TIMESTAMP"
|
|
assert metadata["column_descriptions"]["customer_name"] == "Full name"
|
|
assert "order_id" not in metadata["column_descriptions"]
|
|
assert metadata["row_count"] == 5000
|
|
assert metadata["size_bytes"] == 120000
|
|
assert "_cached_at" in metadata
|
|
|
|
def test_caches_metadata_in_memory(self, client, mock_bq_client):
|
|
"""After first fetch, metadata is stored in the in-memory cache."""
|
|
table_id = "proj.dataset.tbl"
|
|
table_ref = _make_table_ref(table_id=table_id)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
client.get_table_metadata(table_id, use_cache=False)
|
|
|
|
assert table_id in client.metadata_cache
|
|
assert client.metadata_cache[table_id]["table_id"] == table_id
|
|
|
|
def test_captures_partitioning_info(self, client, mock_bq_client):
|
|
"""Partitioning metadata is captured when table is partitioned."""
|
|
table_id = "proj.dataset.events"
|
|
partition = MagicMock()
|
|
partition.type_ = "DAY"
|
|
partition.field = "event_date"
|
|
partition.expiration_ms = 7776000000
|
|
|
|
table_ref = _make_table_ref(table_id=table_id, time_partitioning=partition)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
metadata = client.get_table_metadata(table_id, use_cache=False)
|
|
|
|
assert metadata["partitioning"] is not None
|
|
assert metadata["partitioning"]["type"] == "DAY"
|
|
assert metadata["partitioning"]["field"] == "event_date"
|
|
assert metadata["partitioning"]["expiration_ms"] == 7776000000
|
|
|
|
def test_no_partitioning_when_absent(self, client, mock_bq_client):
|
|
"""Partitioning is None when table has no partitioning."""
|
|
table_id = "proj.dataset.simple"
|
|
table_ref = _make_table_ref(table_id=table_id, time_partitioning=None)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
metadata = client.get_table_metadata(table_id, use_cache=False)
|
|
assert metadata["partitioning"] is None
|
|
|
|
# -------------------------------------------------------------------
|
|
# 4. get_table_metadata uses cache when available (within TTL)
|
|
# -------------------------------------------------------------------
|
|
|
|
def test_uses_cache_within_ttl(self, client, mock_bq_client):
|
|
"""When cache is fresh (within TTL), BQ API is not called again."""
|
|
table_id = "proj.dataset.cached_tbl"
|
|
now = datetime.now()
|
|
client.metadata_cache[table_id] = {
|
|
"table_id": table_id,
|
|
"columns": ["a", "b"],
|
|
"column_types": {"a": "STRING", "b": "INTEGER"},
|
|
"_cached_at": now.isoformat(),
|
|
}
|
|
|
|
result = client.get_table_metadata(table_id, use_cache=True, cache_ttl_hours=24)
|
|
|
|
mock_bq_client.get_table.assert_not_called()
|
|
assert result["table_id"] == table_id
|
|
assert result["columns"] == ["a", "b"]
|
|
|
|
def test_refetches_when_cache_expired(self, client, mock_bq_client):
|
|
"""When cache is older than TTL, metadata is re-fetched from BQ."""
|
|
table_id = "proj.dataset.stale_tbl"
|
|
old_time = (datetime.now() - timedelta(hours=48)).isoformat()
|
|
client.metadata_cache[table_id] = {
|
|
"table_id": table_id,
|
|
"columns": ["old_col"],
|
|
"column_types": {"old_col": "STRING"},
|
|
"_cached_at": old_time,
|
|
}
|
|
|
|
table_ref = _make_table_ref(
|
|
table_id=table_id,
|
|
schema=[_make_bq_field("new_col", "INTEGER")],
|
|
)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
result = client.get_table_metadata(table_id, use_cache=True, cache_ttl_hours=24)
|
|
|
|
mock_bq_client.get_table.assert_called_once_with(table_id)
|
|
assert result["columns"] == ["new_col"]
|
|
|
|
def test_bypasses_cache_when_use_cache_false(self, client, mock_bq_client):
|
|
"""When use_cache=False, always fetches from BQ even if cache is fresh."""
|
|
table_id = "proj.dataset.force_fetch"
|
|
client.metadata_cache[table_id] = {
|
|
"table_id": table_id,
|
|
"columns": ["cached"],
|
|
"column_types": {"cached": "STRING"},
|
|
"_cached_at": datetime.now().isoformat(),
|
|
}
|
|
|
|
table_ref = _make_table_ref(
|
|
table_id=table_id,
|
|
schema=[_make_bq_field("fresh", "INTEGER")],
|
|
)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
result = client.get_table_metadata(table_id, use_cache=False)
|
|
mock_bq_client.get_table.assert_called_once()
|
|
assert result["columns"] == ["fresh"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 5. get_pyarrow_schema builds correct schema from BQ types
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestGetPyarrowSchema:
|
|
def test_builds_correct_schema(self, client, mock_bq_client):
|
|
"""Schema maps BQ types to correct PyArrow types."""
|
|
table_id = "proj.dataset.typed_tbl"
|
|
schema = [
|
|
_make_bq_field("id", "INT64"),
|
|
_make_bq_field("name", "STRING"),
|
|
_make_bq_field("price", "FLOAT64"),
|
|
_make_bq_field("active", "BOOLEAN"),
|
|
_make_bq_field("created", "DATE"),
|
|
_make_bq_field("updated_at", "TIMESTAMP"),
|
|
]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
pa_schema = client.get_pyarrow_schema(table_id)
|
|
|
|
assert pa_schema is not None
|
|
assert pa_schema.field("id").type == pa.int64()
|
|
assert pa_schema.field("name").type == pa.string()
|
|
assert pa_schema.field("price").type == pa.float64()
|
|
assert pa_schema.field("active").type == pa.bool_()
|
|
assert pa_schema.field("created").type == pa.date32()
|
|
assert pa_schema.field("updated_at").type == pa.timestamp("us", tz="UTC")
|
|
|
|
def test_returns_none_when_no_column_types(self, client):
|
|
"""Returns None when metadata has no column_types."""
|
|
table_id = "proj.dataset.empty_schema"
|
|
client.metadata_cache[table_id] = {
|
|
"table_id": table_id,
|
|
"columns": [],
|
|
"column_types": {},
|
|
"_cached_at": datetime.now().isoformat(),
|
|
}
|
|
|
|
result = client.get_pyarrow_schema(table_id)
|
|
assert result is None
|
|
|
|
def test_unknown_type_falls_back_to_string(self, client, mock_bq_client):
|
|
"""Unknown BQ types default to pa.string() in the schema."""
|
|
table_id = "proj.dataset.exotic_types"
|
|
schema = [_make_bq_field("exotic_col", "SOME_UNKNOWN_TYPE")]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
pa_schema = client.get_pyarrow_schema(table_id)
|
|
assert pa_schema.field("exotic_col").type == pa.string()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 6. get_date_columns returns only DATE columns
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestGetDateColumns:
|
|
def test_returns_only_date_columns(self, client, mock_bq_client):
|
|
"""Only columns with BQ type DATE are returned."""
|
|
table_id = "proj.dataset.mixed_dates"
|
|
schema = [
|
|
_make_bq_field("event_date", "DATE"),
|
|
_make_bq_field("created_at", "TIMESTAMP"),
|
|
_make_bq_field("name", "STRING"),
|
|
_make_bq_field("birth_date", "DATE"),
|
|
_make_bq_field("updated_ts", "DATETIME"),
|
|
]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
date_cols = client.get_date_columns(table_id)
|
|
assert sorted(date_cols) == ["birth_date", "event_date"]
|
|
|
|
def test_returns_empty_when_no_date_columns(self, client, mock_bq_client):
|
|
"""Returns empty list when no DATE columns exist."""
|
|
table_id = "proj.dataset.no_dates"
|
|
schema = [
|
|
_make_bq_field("id", "INTEGER"),
|
|
_make_bq_field("ts", "TIMESTAMP"),
|
|
]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq_client.get_table.return_value = table_ref
|
|
|
|
date_cols = client.get_date_columns(table_id)
|
|
assert date_cols == []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 7. query_to_arrow executes SQL and returns PyArrow table
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestQueryToArrow:
|
|
def test_executes_query_and_returns_arrow(self, client, mock_bq_client):
|
|
"""query_to_arrow passes SQL to BQ and returns the arrow result."""
|
|
expected_table = pa.table({"col1": [1, 2, 3]})
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = expected_table
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock(query_parameters=None)
|
|
client.client = mock_bq_client
|
|
|
|
result = client.query_to_arrow("SELECT * FROM `proj.dataset.tbl`")
|
|
|
|
mock_bq_client.query.assert_called_once()
|
|
call_args = mock_bq_client.query.call_args
|
|
assert call_args[0][0] == "SELECT * FROM `proj.dataset.tbl`"
|
|
assert result.equals(expected_table)
|
|
|
|
def test_passes_query_parameters(self, client, mock_bq_client):
|
|
"""query_to_arrow forwards BQ query parameters in job config."""
|
|
expected_table = pa.table({"col1": [10]})
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = expected_table
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
mock_job_config = MagicMock()
|
|
params = [MagicMock()] # Mock ScalarQueryParameter
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = mock_job_config
|
|
client.client = mock_bq_client
|
|
|
|
client.query_to_arrow("SELECT 1 WHERE x > @val", params=params)
|
|
|
|
# Verify params were set on the job config
|
|
assert mock_job_config.query_parameters == params
|
|
|
|
def test_no_params_does_not_set_query_parameters(self, client, mock_bq_client):
|
|
"""When no params given, query_parameters is not set on job config."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"x": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
mock_job_config = MagicMock(spec=[])
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = mock_job_config
|
|
client.client = mock_bq_client
|
|
|
|
client.query_to_arrow("SELECT 1")
|
|
|
|
# query_parameters should not have been set
|
|
assert not hasattr(mock_job_config, "query_parameters") or not getattr(
|
|
mock_job_config, "query_parameters", None
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 8. read_table builds correct SQL query
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReadTable:
|
|
def test_full_table_select_all(self, client, mock_bq_client):
|
|
"""read_table with no columns or filter generates SELECT *."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
client.read_table("proj.dataset.tbl")
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "SELECT *" in sql
|
|
assert "`proj.dataset.tbl`" in sql
|
|
assert "WHERE" not in sql
|
|
|
|
def test_select_specific_columns(self, client, mock_bq_client):
|
|
"""read_table with columns list generates SELECT with backtick-quoted names."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
client.read_table("proj.dataset.tbl", columns=["col_a", "col_b"])
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`col_a`" in sql
|
|
assert "`col_b`" in sql
|
|
assert "*" not in sql
|
|
|
|
def test_with_row_filter(self, client, mock_bq_client):
|
|
"""read_table with row_filter appends WHERE clause."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
client.read_table("proj.dataset.tbl", row_filter="status = 'active'")
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "WHERE status = 'active'" in sql
|
|
|
|
def test_columns_and_filter_combined(self, client, mock_bq_client):
|
|
"""read_table with both columns and filter generates correct SQL."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"x": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
client.read_table(
|
|
"proj.dataset.tbl",
|
|
columns=["id", "name"],
|
|
row_filter="id > 100",
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`id`, `name`" in sql
|
|
assert "WHERE id > 100" in sql
|
|
assert "`proj.dataset.tbl`" in sql
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 9. read_table_incremental builds parameterized WHERE clause
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReadTableIncremental:
|
|
def test_incremental_query_structure(self, client, mock_bq_client):
|
|
"""read_table_incremental builds WHERE col > @since_value with params."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_param = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = mock_param
|
|
# Re-assign the client's bq client (the fixture already set it up)
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_incremental(
|
|
table_id="proj.dataset.events",
|
|
incremental_column="updated_at",
|
|
since_value="2025-01-01T00:00:00Z",
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "SELECT *" in sql
|
|
assert "`proj.dataset.events`" in sql
|
|
assert "`updated_at` > @since_value" in sql
|
|
|
|
# Verify ScalarQueryParameter was constructed correctly
|
|
mock_bq_module.ScalarQueryParameter.assert_called_once_with(
|
|
"since_value", "TIMESTAMP", "2025-01-01T00:00:00Z"
|
|
)
|
|
|
|
def test_incremental_with_columns(self, client, mock_bq_client):
|
|
"""read_table_incremental with columns list selects specific columns."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_incremental(
|
|
table_id="proj.dataset.events",
|
|
incremental_column="updated_at",
|
|
since_value="2025-01-01T00:00:00Z",
|
|
columns=["id", "name"],
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`id`, `name`" in sql
|
|
assert "*" not in sql
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 10. read_table_partitioned builds correct range query
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReadTablePartitioned:
|
|
def test_partitioned_start_only(self, client, mock_bq_client):
|
|
"""With only start, generates >= @start_value without end clause."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_partitioned(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`event_date` >= @start_value" in sql
|
|
assert "@end_value" not in sql
|
|
|
|
# Only start_value parameter created
|
|
assert mock_bq_module.ScalarQueryParameter.call_count == 1
|
|
mock_bq_module.ScalarQueryParameter.assert_called_with(
|
|
"start_value", "TIMESTAMP", "2025-01-01"
|
|
)
|
|
|
|
def test_partitioned_start_and_end(self, client, mock_bq_client):
|
|
"""With start and end, generates >= @start_value AND < @end_value."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_partitioned(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
end="2025-06-01",
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`event_date` >= @start_value" in sql
|
|
assert "`event_date` < @end_value" in sql
|
|
|
|
# Both start_value and end_value parameters created
|
|
assert mock_bq_module.ScalarQueryParameter.call_count == 2
|
|
calls = mock_bq_module.ScalarQueryParameter.call_args_list
|
|
assert calls[0].args == ("start_value", "TIMESTAMP", "2025-01-01")
|
|
assert calls[1].args == ("end_value", "TIMESTAMP", "2025-06-01")
|
|
|
|
def test_partitioned_with_columns(self, client, mock_bq_client):
|
|
"""read_table_partitioned with columns selects specific columns."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_partitioned(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
columns=["id", "event_date", "value"],
|
|
)
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`id`, `event_date`, `value`" in sql
|
|
assert "*" not in sql
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 11. test_connection returns True on success, False on failure
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestTestConnection:
|
|
def test_returns_true_on_success(self, client, mock_bq_client):
|
|
"""test_connection returns True when SELECT 1 query succeeds."""
|
|
mock_job = MagicMock()
|
|
mock_job.result.return_value = iter([(1,)])
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
assert client.test_connection() is True
|
|
mock_bq_client.query.assert_called_once_with("SELECT 1")
|
|
|
|
def test_returns_false_on_failure(self, client, mock_bq_client):
|
|
"""test_connection returns False when the query raises an exception."""
|
|
mock_bq_client.query.side_effect = Exception("Connection refused")
|
|
|
|
assert client.test_connection() is False
|
|
|
|
def test_returns_false_when_result_fails(self, client, mock_bq_client):
|
|
"""test_connection returns False when result iteration fails."""
|
|
mock_job = MagicMock()
|
|
mock_job.result.side_effect = Exception("Timeout")
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
assert client.test_connection() is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 12. Type mapping completeness (all BQ types have PyArrow mapping)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestTypeMapping:
|
|
# All standard BigQuery types that should be mapped
|
|
EXPECTED_BQ_TYPES = [
|
|
"STRING", "BYTES", "INTEGER", "INT64",
|
|
"FLOAT", "FLOAT64", "NUMERIC", "BIGNUMERIC",
|
|
"BOOLEAN", "BOOL",
|
|
"TIMESTAMP", "DATE", "TIME", "DATETIME",
|
|
"GEOGRAPHY", "JSON",
|
|
"STRUCT", "RECORD", "ARRAY",
|
|
]
|
|
|
|
def test_all_standard_bq_types_are_mapped(self):
|
|
"""Every standard BigQuery type has an entry in BIGQUERY_TO_PYARROW_TYPES."""
|
|
for bq_type in self.EXPECTED_BQ_TYPES:
|
|
assert bq_type in BIGQUERY_TO_PYARROW_TYPES, (
|
|
f"Missing PyArrow mapping for BQ type: {bq_type}"
|
|
)
|
|
|
|
def test_all_mappings_produce_valid_pyarrow_types(self):
|
|
"""Every mapped value is a valid PyArrow DataType."""
|
|
for bq_type, pa_type in BIGQUERY_TO_PYARROW_TYPES.items():
|
|
assert isinstance(pa_type, pa.DataType), (
|
|
f"BQ type {bq_type} maps to non-DataType: {pa_type!r}"
|
|
)
|
|
|
|
def test_integer_types_map_to_int64(self):
|
|
"""Both INTEGER and INT64 map to pa.int64()."""
|
|
assert BIGQUERY_TO_PYARROW_TYPES["INTEGER"] == pa.int64()
|
|
assert BIGQUERY_TO_PYARROW_TYPES["INT64"] == pa.int64()
|
|
|
|
def test_float_types_map_to_float64(self):
|
|
"""FLOAT, FLOAT64, NUMERIC, BIGNUMERIC all map to pa.float64()."""
|
|
for t in ["FLOAT", "FLOAT64", "NUMERIC", "BIGNUMERIC"]:
|
|
assert BIGQUERY_TO_PYARROW_TYPES[t] == pa.float64()
|
|
|
|
def test_boolean_types_map_to_bool(self):
|
|
"""Both BOOLEAN and BOOL map to pa.bool_()."""
|
|
assert BIGQUERY_TO_PYARROW_TYPES["BOOLEAN"] == pa.bool_()
|
|
assert BIGQUERY_TO_PYARROW_TYPES["BOOL"] == pa.bool_()
|
|
|
|
def test_date_maps_to_date32(self):
|
|
"""DATE maps to pa.date32()."""
|
|
assert BIGQUERY_TO_PYARROW_TYPES["DATE"] == pa.date32()
|
|
|
|
def test_timestamp_has_utc_timezone(self):
|
|
"""TIMESTAMP maps to pa.timestamp with UTC timezone."""
|
|
ts_type = BIGQUERY_TO_PYARROW_TYPES["TIMESTAMP"]
|
|
assert ts_type == pa.timestamp("us", tz="UTC")
|
|
|
|
def test_datetime_has_no_timezone(self):
|
|
"""DATETIME maps to pa.timestamp without timezone."""
|
|
dt_type = BIGQUERY_TO_PYARROW_TYPES["DATETIME"]
|
|
assert dt_type == pa.timestamp("us")
|
|
|
|
def test_complex_types_map_to_string(self):
|
|
"""STRUCT, RECORD, ARRAY, GEOGRAPHY, JSON all serialize as string."""
|
|
for t in ["STRUCT", "RECORD", "ARRAY", "GEOGRAPHY", "JSON"]:
|
|
assert BIGQUERY_TO_PYARROW_TYPES[t] == pa.string()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 13. Metadata cache save/load from disk
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestMetadataCachePersistence:
|
|
def test_save_and_load_cache(self, tmp_path):
|
|
"""Metadata cache is persisted to disk and reloaded on new client init."""
|
|
metadata_dir = tmp_path / "metadata"
|
|
metadata_dir.mkdir(parents=True, exist_ok=True)
|
|
cache_file = metadata_dir / "bq_table_metadata.json"
|
|
|
|
mock_config = MagicMock()
|
|
mock_config.get_metadata_path.return_value = metadata_dir
|
|
|
|
# First client: fetch metadata and save to cache
|
|
mock_bq = MagicMock()
|
|
table_id = "proj.ds.tbl"
|
|
schema = [_make_bq_field("col1", "STRING")]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq.get_table.return_value = table_ref
|
|
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
|
|
):
|
|
client1 = BigQueryClient()
|
|
client1.get_table_metadata(table_id, use_cache=False)
|
|
|
|
# Verify the cache file was written
|
|
assert cache_file.exists()
|
|
saved_data = json.loads(cache_file.read_text())
|
|
assert table_id in saved_data
|
|
assert saved_data[table_id]["columns"] == ["col1"]
|
|
|
|
# Second client: loads cache from disk on init
|
|
mock_bq2 = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq2),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
|
|
):
|
|
client2 = BigQueryClient()
|
|
|
|
assert table_id in client2.metadata_cache
|
|
assert client2.metadata_cache[table_id]["columns"] == ["col1"]
|
|
|
|
def test_load_handles_corrupt_cache_file(self, tmp_path):
|
|
"""Client handles corrupt cache JSON gracefully without crashing."""
|
|
metadata_dir = tmp_path / "metadata"
|
|
metadata_dir.mkdir(parents=True, exist_ok=True)
|
|
cache_file = metadata_dir / "bq_table_metadata.json"
|
|
cache_file.write_text("{corrupt json!!!")
|
|
|
|
mock_config = MagicMock()
|
|
mock_config.get_metadata_path.return_value = metadata_dir
|
|
|
|
mock_bq = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
|
|
):
|
|
client = BigQueryClient()
|
|
|
|
# Cache should be empty after corrupt file
|
|
assert client.metadata_cache == {}
|
|
|
|
def test_load_handles_missing_cache_file(self, tmp_path):
|
|
"""Client initializes with empty cache when no cache file exists."""
|
|
metadata_dir = tmp_path / "metadata"
|
|
metadata_dir.mkdir(parents=True, exist_ok=True)
|
|
# No cache file created
|
|
|
|
mock_config = MagicMock()
|
|
mock_config.get_metadata_path.return_value = metadata_dir
|
|
|
|
mock_bq = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
|
|
):
|
|
client = BigQueryClient()
|
|
|
|
assert client.metadata_cache == {}
|
|
|
|
def test_save_creates_parent_directories(self, tmp_path):
|
|
"""_save_metadata_cache creates parent directories if they do not exist."""
|
|
# Use a nested path that does not yet exist
|
|
metadata_dir = tmp_path / "deep" / "nested" / "metadata"
|
|
# Do NOT create directories upfront
|
|
|
|
mock_config = MagicMock()
|
|
mock_config.get_metadata_path.return_value = metadata_dir
|
|
|
|
mock_bq = MagicMock()
|
|
table_id = "proj.ds.tbl"
|
|
schema = [_make_bq_field("x", "INTEGER")]
|
|
table_ref = _make_table_ref(table_id=table_id, schema=schema)
|
|
mock_bq.get_table.return_value = table_ref
|
|
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
|
|
):
|
|
client = BigQueryClient()
|
|
client.get_table_metadata(table_id, use_cache=False)
|
|
|
|
cache_file = metadata_dir / "bq_table_metadata.json"
|
|
assert cache_file.exists()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Factory function
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCreateClient:
|
|
def test_create_client_returns_bigquery_client(self, mock_config):
|
|
"""create_client() factory returns a BigQueryClient instance."""
|
|
mock_bq = MagicMock()
|
|
with (
|
|
patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq),
|
|
patch("connectors.bigquery.client.get_config", return_value=mock_config),
|
|
patch.dict("os.environ", {"BIGQUERY_PROJECT": "factory-project"}),
|
|
):
|
|
result = create_client()
|
|
|
|
assert isinstance(result, BigQueryClient)
|
|
assert result.project_id == "factory-project"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 14. read_table_partitioned_streaming yields RecordBatches
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReadTablePartitionedStreaming:
|
|
def test_streaming_yields_batches(self, client, mock_bq_client):
|
|
"""read_table_partitioned_streaming yields RecordBatches, not a Table."""
|
|
batch1 = pa.record_batch({"a": [1, 2]})
|
|
batch2 = pa.record_batch({"a": [3, 4]})
|
|
|
|
mock_query_job = MagicMock()
|
|
mock_row_iter = MagicMock()
|
|
mock_row_iter.to_arrow_iterable.return_value = iter([batch1, batch2])
|
|
mock_query_job.result.return_value = mock_row_iter
|
|
mock_bq_client.query.return_value = mock_query_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
client.bqstorage_client = None # disable Storage API for simplicity
|
|
|
|
batches = list(client.read_table_partitioned_streaming(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
))
|
|
|
|
assert len(batches) == 2
|
|
assert isinstance(batches[0], pa.RecordBatch)
|
|
assert isinstance(batches[1], pa.RecordBatch)
|
|
|
|
def test_streaming_with_date_column_type(self, client, mock_bq_client):
|
|
"""With column_type='DATE', ScalarQueryParameter uses 'DATE' type."""
|
|
batch = pa.record_batch({"a": [1]})
|
|
|
|
mock_query_job = MagicMock()
|
|
mock_row_iter = MagicMock()
|
|
mock_row_iter.to_arrow_iterable.return_value = iter([batch])
|
|
mock_query_job.result.return_value = mock_row_iter
|
|
mock_bq_client.query.return_value = mock_query_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
client.bqstorage_client = None
|
|
|
|
list(client.read_table_partitioned_streaming(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
column_type="DATE",
|
|
))
|
|
|
|
# Verify ScalarQueryParameter was called with "DATE" type
|
|
mock_bq_module.ScalarQueryParameter.assert_called_once_with(
|
|
"start_value", "DATE", "2025-01-01"
|
|
)
|
|
|
|
def test_streaming_start_and_end(self, client, mock_bq_client):
|
|
"""With start and end, both params are created with correct column_type."""
|
|
batch = pa.record_batch({"a": [1]})
|
|
|
|
mock_query_job = MagicMock()
|
|
mock_row_iter = MagicMock()
|
|
mock_row_iter.to_arrow_iterable.return_value = iter([batch])
|
|
mock_query_job.result.return_value = mock_row_iter
|
|
mock_bq_client.query.return_value = mock_query_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
client.bqstorage_client = None
|
|
|
|
list(client.read_table_partitioned_streaming(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
end="2025-06-01",
|
|
column_type="DATE",
|
|
))
|
|
|
|
sql = mock_bq_client.query.call_args[0][0]
|
|
assert "`event_date` >= @start_value" in sql
|
|
assert "`event_date` < @end_value" in sql
|
|
|
|
# Both parameters created with "DATE" type
|
|
assert mock_bq_module.ScalarQueryParameter.call_count == 2
|
|
calls = mock_bq_module.ScalarQueryParameter.call_args_list
|
|
assert calls[0].args == ("start_value", "DATE", "2025-01-01")
|
|
assert calls[1].args == ("end_value", "DATE", "2025-06-01")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 15. read_table_partitioned column_type parameter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestReadTablePartitionedColumnType:
|
|
def test_date_column_type(self, client, mock_bq_client):
|
|
"""read_table_partitioned with column_type='DATE' creates DATE params."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_partitioned(
|
|
table_id="proj.dataset.events",
|
|
partition_column="event_date",
|
|
start="2025-01-01",
|
|
end="2025-06-01",
|
|
column_type="DATE",
|
|
)
|
|
|
|
# Both parameters should use "DATE" type
|
|
assert mock_bq_module.ScalarQueryParameter.call_count == 2
|
|
calls = mock_bq_module.ScalarQueryParameter.call_args_list
|
|
assert calls[0].args == ("start_value", "DATE", "2025-01-01")
|
|
assert calls[1].args == ("end_value", "DATE", "2025-06-01")
|
|
|
|
def test_default_column_type_is_timestamp(self, client, mock_bq_client):
|
|
"""Default column_type is TIMESTAMP when not specified."""
|
|
mock_job = MagicMock()
|
|
mock_job.to_arrow.return_value = pa.table({"a": [1]})
|
|
mock_bq_client.query.return_value = mock_job
|
|
|
|
with patch("connectors.bigquery.client.bigquery") as mock_bq_module:
|
|
mock_bq_module.QueryJobConfig.return_value = MagicMock()
|
|
mock_bq_module.ScalarQueryParameter.return_value = MagicMock()
|
|
client.client = mock_bq_client
|
|
|
|
client.read_table_partitioned(
|
|
table_id="proj.dataset.events",
|
|
partition_column="created_at",
|
|
start="2025-01-01T00:00:00Z",
|
|
)
|
|
|
|
# Should default to "TIMESTAMP"
|
|
mock_bq_module.ScalarQueryParameter.assert_called_once_with(
|
|
"start_value", "TIMESTAMP", "2025-01-01T00:00:00Z"
|
|
)
|