Add BigQuery data source adapter

BigQuery connector that syncs BQ tables to local Parquet files via PyArrow (no CSV intermediate step). Supports full refresh, timestamp-based incremental (via incremental_column), and partition-based sync strategies. - connectors/bigquery/client.py: BQ API wrapper with ADC auth, parameterized queries, metadata cache, cross-project support (job project != data project) - connectors/bigquery/adapter.py: DataSource implementation with merge/dedup - src/config.py: Add incremental_column field to TableConfig - 72 unit tests (mocked, no GCP SDK required)
2026-03-11 13:56:12 +01:00 · 2026-03-11 13:56:12 +01:00 · 758910463b
commit 758910463b
parent eb5264b903
9 changed files with 2619 additions and 2 deletions
--- a/config/instance.yaml.example
+++ b/config/instance.yaml.example
@ -44,13 +44,38 @@ auth:
  google_client_id: "${GOOGLE_CLIENT_ID}"
  google_client_secret: "${GOOGLE_CLIENT_SECRET}"

+# --- Theme (optional) ---
+# Customize colors, fonts, and shape to match your brand.
+# All values are optional - defaults provide a clean blue theme.
+# See docs/theme-reference.html for a visual guide.
+theme:
+  # primary: "#0073D1"              # Main brand color (buttons, links, accents)
+  # primary_dark: "#005BA3"         # Hover/active state of primary
+  # primary_light: "rgba(0, 115, 209, 0.1)"  # Light tint backgrounds
+  # text_primary: "#1A253C"         # Main text color
+  # text_secondary: "#6B7280"       # Muted/secondary text
+  # background: "#F5F7FA"           # Page background
+  # surface: "#FFFFFF"              # Card/panel background
+  # border: "#E5E7EB"              # Borders and dividers
+  # font_primary: "'Inter', system-ui, sans-serif"
+  # font_url: "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap"
+  # radius: "6px"                   # Border radius (cards, buttons, inputs)
+  # success: "#10B77F"
+  # warning: "#F59F0A"
+  # error: "#EA580C"
+
 # --- Data source ---
 data_source:
-  type: "keboola"                 # keboola | csv (bigquery planned)
+  type: "keboola"                 # keboola | bigquery | local
  keboola:
    storage_token: "${KEBOOLA_STORAGE_TOKEN}"
    stack_url: ""                 # e.g., "https://connection.keboola.com"
    project_id: ""
+  bigquery:
+    project: "${BIGQUERY_PROJECT}"       # GCP project for job execution/billing
+    location: "${BIGQUERY_LOCATION}"     # BigQuery location (e.g., "us-central1", "US")
+    # Uses ADC (Application Default Credentials) - VM service account on GCP
+    # Data can live in a different project -- use fully-qualified table IDs in data_description.md

 # --- Email delivery (optional, for magic link auth) ---
 # Without SMTP, magic links are shown directly in browser (development mode).
--- a/connectors/bigquery/__init__.py
+++ b/connectors/bigquery/__init__.py
@ -0,0 +1,11 @@
+"""
+BigQuery connector - data source adapter for Google BigQuery.
+
+Syncs tables from BigQuery using the BigQuery Storage API,
+converting query results directly to Parquet files via PyArrow
+(no CSV intermediate step).
+
+Enable by setting data_source.type: "bigquery" in config/instance.yaml
+and providing BIGQUERY_PROJECT environment variable.
+Uses Application Default Credentials (ADC) for authentication.
+"""
--- a/connectors/bigquery/adapter.py
+++ b/connectors/bigquery/adapter.py
@ -0,0 +1,475 @@
+"""
+BigQuery data source adapter.
+
+Implements the DataSource interface for Google BigQuery.
+Reads tables via the BigQuery API, converts directly to Parquet files
+using PyArrow (no CSV intermediate step).
+"""
+
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from datetime import datetime, timedelta
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from src.config import get_config, TableConfig
+from src.data_sync import DataSource, SyncState, _get_uncompressed_size
+from src.parquet_manager import (
+    convert_date_columns_to_date32,
+    apply_schema_to_table,
+)
+from .client import create_client as create_bq_client
+
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryDataSource(DataSource):
+    """
+    Data source: Google BigQuery.
+
+    Downloads data directly from BigQuery via PyArrow (no CSV step),
+    writes to local Parquet files with schema enforcement.
+    """
+
+    def __init__(self):
+        """Load the application config and create the BigQuery client.
+
+        Nothing is validated in this method itself; the client comes from
+        ``create_bq_client`` (``.client.create_client``), which is presumably
+        where environment variables are checked -- confirm against that factory.
+        """
+        # Config is fetched before the client is built; keep this order.
+        self.config = get_config()
+        self.bq_client = create_bq_client()
+
+    def get_column_metadata(self, table_id: str) -> Optional[Dict[str, Any]]:
+        """Return BigQuery column metadata for schema generation.
+
+        Returns:
+            {"columns": {"col_name": {"source_type": "...", "description": "..."}}}
+            or None if metadata unavailable.
+        """
+        raw = self.bq_client.get_table_metadata(table_id)
+        column_types = raw.get("column_types", {})
+        column_descriptions = raw.get("column_descriptions", {})
+
+        if not column_types:
+            return None
+
+        result = {}
+        for col_name, bq_type in column_types.items():
+            entry = {"source_type": bq_type}
+            if col_name in column_descriptions:
+                entry["description"] = column_descriptions[col_name]
+            result[col_name] = entry
+
+        return {"columns": result}
+
+    def discover_tables(self) -> List[Dict[str, Any]]:
+        """Return the table listing reported by the BigQuery client.
+
+        This is a straight pass-through of ``bq_client.discover_all_tables()``;
+        the shape of each entry is defined by the client.
+        """
+        return self.bq_client.discover_all_tables()
+
+    def get_source_name(self) -> str:
+        """Return the human-readable name of this data source ("Google BigQuery")."""
+        return "Google BigQuery"
+
+    def sync_table(
+        self,
+        table_config: TableConfig,
+        sync_state: SyncState,
+    ) -> Dict[str, Any]:
+        """
+        Synchronize table from BigQuery.
+
+        Dispatches to the appropriate strategy based on table config.
+
+        Args:
+            table_config: Table configuration
+            sync_state: Sync state manager
+
+        Returns:
+            Dictionary with sync result
+        """
+        logger.info(f"Syncing BQ table: {table_config.name} ({table_config.sync_strategy})")
+
+        # Clear metadata cache for fresh types
+        if table_config.id in self.bq_client.metadata_cache:
+            del self.bq_client.metadata_cache[table_config.id]
+            logger.debug(f"Cleared BQ metadata cache for {table_config.id}")
+
+        try:
+            if table_config.sync_strategy == "full_refresh":
+                result = self._full_refresh(table_config)
+            elif table_config.sync_strategy == "incremental":
+                result = self._incremental_sync(table_config, sync_state)
+            elif table_config.sync_strategy == "partitioned":
+                result = self._partitioned_sync(table_config, sync_state)
+            else:
+                raise ValueError(f"Unknown sync strategy: {table_config.sync_strategy}")
+
+            # Update sync state
+            sync_state.update_sync(
+                table_id=table_config.id,
+                table_name=table_config.name,
+                strategy=table_config.sync_strategy,
+                rows=result["rows"],
+                file_size_bytes=result["file_size_bytes"],
+                columns=result.get("columns", 0),
+                uncompressed_bytes=result.get("uncompressed_bytes", 0),
+            )
+
+            return {
+                "success": True,
+                "rows": result["rows"],
+                "strategy": table_config.sync_strategy,
+                "file_size_mb": result["file_size_bytes"] / 1024 / 1024,
+            }
+
+        except Exception as e:
+            logger.error(f"Error syncing BQ table {table_config.name}: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "strategy": table_config.sync_strategy,
+            }
+
+    def _full_refresh(self, table_config: TableConfig) -> Dict[str, Any]:
+        """
+        Full refresh: read entire table and replace Parquet file.
+        """
+        logger.info(f"Full refresh: {table_config.name}")
+
+        parquet_path = self.config.get_parquet_path(table_config)
+        date_columns = self.bq_client.get_date_columns(table_config.id)
+        pyarrow_schema = self.bq_client.get_pyarrow_schema(table_config.id)
+
+        # Read full table from BigQuery -> PyArrow
+        arrow_table = self.bq_client.read_table(table_config.id)
+
+        # Apply schema enforcement
+        if date_columns:
+            arrow_table = convert_date_columns_to_date32(arrow_table, date_columns)
+        if pyarrow_schema:
+            arrow_table = apply_schema_to_table(arrow_table, pyarrow_schema)
+
+        # Write to Parquet
+        parquet_path.parent.mkdir(parents=True, exist_ok=True)
+        pq.write_table(arrow_table, parquet_path, compression="snappy")
+
+        file_size = parquet_path.stat().st_size
+        logger.info(
+            f"Full refresh complete: {arrow_table.num_rows} rows, "
+            f"{file_size / 1024 / 1024:.2f} MB"
+        )
+
+        return {
+            "rows": arrow_table.num_rows,
+            "columns": arrow_table.num_columns,
+            "file_size_bytes": file_size,
+            "uncompressed_bytes": _get_uncompressed_size(parquet_path),
+        }
+
+    def _incremental_sync(
+        self,
+        table_config: TableConfig,
+        sync_state: SyncState,
+    ) -> Dict[str, Any]:
+        """
+        Incremental sync: dispatch to column-based or partition-based strategy.
+        """
+        # If partition_by is set, use partitioned incremental
+        if table_config.partition_by:
+            return self._partitioned_sync(table_config, sync_state)
+
+        # If incremental_column is set, use timestamp-based incremental
+        if table_config.incremental_column:
+            return self._incremental_column_sync(table_config, sync_state)
+
+        # Fallback: full refresh (no incremental column configured)
+        logger.warning(
+            f"Table {table_config.name}: incremental strategy but no "
+            f"incremental_column or partition_by configured, falling back to full refresh"
+        )
+        return self._full_refresh(table_config)
+
+    def _incremental_column_sync(
+        self,
+        table_config: TableConfig,
+        sync_state: SyncState,
+    ) -> Dict[str, Any]:
+        """
+        Timestamp-based incremental sync using incremental_column.
+
+        Reads rows WHERE incremental_column > last_sync_value,
+        merges with existing Parquet (dedup on primary key).
+        """
+        logger.info(
+            f"Incremental column sync: {table_config.name} "
+            f"(column: {table_config.incremental_column})"
+        )
+
+        parquet_path = self.config.get_parquet_path(table_config)
+        date_columns = self.bq_client.get_date_columns(table_config.id)
+        pyarrow_schema = self.bq_client.get_pyarrow_schema(table_config.id)
+
+        # Determine since_value from last sync
+        last_sync = sync_state.get_last_sync(table_config.id)
+
+        if last_sync and parquet_path.exists():
+            # Apply window: go back incremental_window_days from last sync
+            last_sync_dt = datetime.fromisoformat(last_sync)
+            window_days = table_config.incremental_window_days or 7
+            since_dt = last_sync_dt - timedelta(days=window_days)
+            since_value = since_dt.isoformat()
+
+            logger.info(f"  -> Since: {since_value} (window: {window_days} days)")
+
+            # Read incremental data
+            new_data = self.bq_client.read_table_incremental(
+                table_id=table_config.id,
+                incremental_column=table_config.incremental_column,
+                since_value=since_value,
+            )
+
+            if new_data.num_rows == 0:
+                logger.info("  -> No new data since last sync")
+                existing_pf = pq.ParquetFile(parquet_path)
+                return {
+                    "rows": existing_pf.metadata.num_rows,
+                    "columns": len(existing_pf.schema_arrow),
+                    "file_size_bytes": parquet_path.stat().st_size,
+                    "uncompressed_bytes": _get_uncompressed_size(parquet_path),
+                }
+
+            # Merge with existing data
+            logger.info(f"  -> Merging {new_data.num_rows} new rows with existing data")
+            existing_table = pq.read_table(parquet_path)
+            merged = self._merge_arrow_tables(
+                existing_table, new_data, table_config.get_primary_key_columns()
+            )
+
+            # Apply schema enforcement
+            if date_columns:
+                merged = convert_date_columns_to_date32(merged, date_columns)
+            if pyarrow_schema:
+                merged = apply_schema_to_table(merged, pyarrow_schema)
+
+            pq.write_table(merged, parquet_path, compression="snappy")
+
+            file_size = parquet_path.stat().st_size
+            logger.info(
+                f"  -> Incremental sync complete: {merged.num_rows} total rows"
+            )
+
+            return {
+                "rows": merged.num_rows,
+                "columns": merged.num_columns,
+                "file_size_bytes": file_size,
+                "uncompressed_bytes": _get_uncompressed_size(parquet_path),
+            }
+
+        else:
+            # First sync or no existing file -- full read
+            logger.info("  -> First sync, reading all data")
+
+            if table_config.max_history_days:
+                since_dt = datetime.now() - timedelta(days=table_config.max_history_days)
+                arrow_table = self.bq_client.read_table_incremental(
+                    table_id=table_config.id,
+                    incremental_column=table_config.incremental_column,
+                    since_value=since_dt.isoformat(),
+                )
+            else:
+                arrow_table = self.bq_client.read_table(table_config.id)
+
+            # Apply schema enforcement
+            if date_columns:
+                arrow_table = convert_date_columns_to_date32(arrow_table, date_columns)
+            if pyarrow_schema:
+                arrow_table = apply_schema_to_table(arrow_table, pyarrow_schema)
+
+            parquet_path.parent.mkdir(parents=True, exist_ok=True)
+            pq.write_table(arrow_table, parquet_path, compression="snappy")
+
+            file_size = parquet_path.stat().st_size
+            return {
+                "rows": arrow_table.num_rows,
+                "columns": arrow_table.num_columns,
+                "file_size_bytes": file_size,
+                "uncompressed_bytes": _get_uncompressed_size(parquet_path),
+            }
+
+    def _partitioned_sync(
+        self,
+        table_config: TableConfig,
+        sync_state: SyncState,
+    ) -> Dict[str, Any]:
+        """
+        Partition-based sync: read data by partition range and write partition files.
+        """
+        import pandas as pd
+
+        partition_col = table_config.partition_by
+        if not partition_col and table_config.incremental_column:
+            partition_col = table_config.incremental_column
+
+        if not partition_col:
+            logger.warning(
+                f"Table {table_config.name}: partitioned strategy but no "
+                f"partition_by or incremental_column, falling back to full refresh"
+            )
+            return self._full_refresh(table_config)
+
+        granularity = table_config.partition_granularity or "month"
+        logger.info(
+            f"Partitioned sync: {table_config.name} "
+            f"(by {partition_col}, {granularity})"
+        )
+
+        partition_dir = self.config.get_parquet_path(table_config)
+        date_columns = self.bq_client.get_date_columns(table_config.id)
+        pyarrow_schema = self.bq_client.get_pyarrow_schema(table_config.id)
+
+        # Determine time range
+        last_sync = sync_state.get_last_sync(table_config.id)
+
+        if last_sync:
+            last_sync_dt = datetime.fromisoformat(last_sync)
+            window_days = table_config.incremental_window_days or 7
+            start_dt = last_sync_dt - timedelta(days=window_days)
+            logger.info(f"  -> Reading from {start_dt.isoformat()} (window: {window_days} days)")
+        else:
+            if table_config.max_history_days:
+                start_dt = datetime.now() - timedelta(days=table_config.max_history_days)
+                logger.info(f"  -> First sync, limited to last {table_config.max_history_days} days")
+            else:
+                start_dt = None
+                logger.info("  -> First sync, reading all data")
+
+        # Read data from BigQuery
+        if start_dt:
+            arrow_table = self.bq_client.read_table_partitioned(
+                table_id=table_config.id,
+                partition_column=partition_col,
+                start=start_dt.isoformat(),
+            )
+        else:
+            arrow_table = self.bq_client.read_table(table_config.id)
+
+        if arrow_table.num_rows == 0:
+            logger.info("  -> No data to sync")
+            return self._get_partition_totals(partition_dir)
+
+        logger.info(f"  -> Processing {arrow_table.num_rows} rows into partitions")
+
+        # Convert to pandas for partitioning
+        df = arrow_table.to_pandas()
+
+        # Ensure partition column is datetime
+        if not pd.api.types.is_datetime64_any_dtype(df[partition_col]):
+            df[partition_col] = pd.to_datetime(df[partition_col], format="ISO8601", utc=True)
+
+        # Create partition key
+        if granularity == "month":
+            df["_partition_key"] = df[partition_col].dt.strftime("%Y_%m")
+        elif granularity == "day":
+            df["_partition_key"] = df[partition_col].dt.strftime("%Y_%m_%d")
+        elif granularity == "year":
+            df["_partition_key"] = df[partition_col].dt.strftime("%Y")
+
+        primary_key_cols = table_config.get_primary_key_columns()
+        partitions_updated = set()
+
+        for partition_key, group_df in df.groupby("_partition_key"):
+            group_df = group_df.drop(columns=["_partition_key"])
+            partition_path = self.config.get_partition_path(table_config, partition_key)
+            partitions_updated.add(partition_key)
+
+            # Merge with existing partition if it exists
+            if partition_path.exists():
+                existing_df = pd.read_parquet(partition_path)
+                merged_df = pd.concat([existing_df, group_df], ignore_index=True)
+                merged_df = merged_df.drop_duplicates(subset=primary_key_cols, keep="last")
+            else:
+                merged_df = group_df
+
+            # Write partition
+            table = pa.Table.from_pandas(merged_df, preserve_index=False)
+            if date_columns:
+                table = convert_date_columns_to_date32(table, date_columns)
+            if pyarrow_schema:
+                table = apply_schema_to_table(table, pyarrow_schema)
+            pq.write_table(table, partition_path, compression="snappy")
+
+        logger.info(f"  -> Partitioned sync complete: {len(partitions_updated)} partitions updated")
+
+        return self._get_partition_totals(partition_dir)
+
+    def _merge_arrow_tables(
+        self,
+        existing: pa.Table,
+        new_data: pa.Table,
+        primary_key: List[str],
+    ) -> pa.Table:
+        """
+        Merge two Arrow tables with deduplication on primary key.
+
+        New data overwrites existing rows with the same primary key.
+
+        Args:
+            existing: Existing data
+            new_data: New/changed data
+            primary_key: List of PK column names
+
+        Returns:
+            Merged PyArrow Table
+        """
+        import pandas as pd
+
+        existing_df = existing.to_pandas()
+        new_df = new_data.to_pandas()
+
+        # Concat and dedup (keep last = new data wins)
+        merged_df = pd.concat([existing_df, new_df], ignore_index=True)
+        merged_df = merged_df.drop_duplicates(subset=primary_key, keep="last")
+
+        return pa.Table.from_pandas(merged_df, preserve_index=False)
+
+    def _get_partition_totals(self, partition_dir: Path) -> Dict[str, Any]:
+        """
+        Calculate totals from all partition files in a directory.
+        """
+        total_rows = 0
+        total_size = 0
+        total_uncompressed = 0
+        total_columns = 0
+
+        if not partition_dir.exists():
+            return {"rows": 0, "file_size_bytes": 0, "columns": 0, "uncompressed_bytes": 0}
+
+        all_partitions = list(partition_dir.glob("*.parquet"))
+
+        for part_path in all_partitions:
+            try:
+                pf = pq.ParquetFile(part_path)
+                meta = pf.metadata
+                total_rows += meta.num_rows
+                total_size += part_path.stat().st_size
+                if total_columns == 0:
+                    total_columns = len(pf.schema_arrow)
+                for rg_idx in range(meta.num_row_groups):
+                    rg = meta.row_group(rg_idx)
+                    for col_idx in range(rg.num_columns):
+                        total_uncompressed += rg.column(col_idx).total_uncompressed_size
+            except Exception as e:
+                logger.warning(f"Skipping corrupt partition {part_path.name}: {e}")
+
+        return {
+            "rows": total_rows,
+            "file_size_bytes": total_size,
+            "partitions": len(all_partitions),
+            "columns": total_columns,
+            "uncompressed_bytes": total_uncompressed,
+        }
+
+
+def create_data_source() -> BigQueryDataSource:
+    """Factory function for dynamic import compatibility.
+
+    Returns:
+        A new ``BigQueryDataSource`` built with its default constructor.
+    """
+    return BigQueryDataSource()
--- a/connectors/bigquery/client.py
+++ b/connectors/bigquery/client.py
@ -0,0 +1,469 @@
+"""
+Google BigQuery API Client
+
+Low-level wrapper for Google BigQuery with these functions:
+1. Authentication using Application Default Credentials (ADC)
+2. Query tables to PyArrow (no CSV intermediate step)
+3. Get table metadata (schema, columns, data types)
+4. Cache metadata for faster repeated use
+5. Incremental reads (timestamp-based and partition-based)
+
+Uses google-cloud-bigquery with native PyArrow support.
+"""
+
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from datetime import datetime, timedelta
+
+import pyarrow as pa
+from google.cloud import bigquery
+
+from src.config import get_config
+
+
+logger = logging.getLogger(__name__)
+
+
+# Mapping BigQuery types to PyArrow types.
+# Both legacy and standard type names are listed (INTEGER/INT64, FLOAT/FLOAT64,
+# BOOLEAN/BOOL). Types missing from this table are treated as strings by
+# get_pyarrow_schema().
+BIGQUERY_TO_PYARROW_TYPES = {
+    "STRING": pa.string(),
+    "BYTES": pa.binary(),
+    "INTEGER": pa.int64(),
+    "INT64": pa.int64(),
+    "FLOAT": pa.float64(),
+    "FLOAT64": pa.float64(),
+    # Decimal types are stored as float64, so precision beyond what a
+    # float64 can represent is not preserved.
+    "NUMERIC": pa.float64(),
+    "BIGNUMERIC": pa.float64(),
+    "BOOLEAN": pa.bool_(),
+    "BOOL": pa.bool_(),
+    # TIMESTAMP is timezone-aware (UTC); DATETIME carries no timezone.
+    "TIMESTAMP": pa.timestamp("us", tz="UTC"),
+    "DATE": pa.date32(),
+    "TIME": pa.string(),
+    "DATETIME": pa.timestamp("us"),
+    # Non-scalar and special types are declared as strings.
+    # NOTE(review): confirm that schema enforcement can actually convert
+    # nested STRUCT/ARRAY values to strings.
+    "GEOGRAPHY": pa.string(),
+    "JSON": pa.string(),
+    "STRUCT": pa.string(),
+    "RECORD": pa.string(),
+    "ARRAY": pa.string(),
+}
+
+
+class BigQueryClient:
+    """
+    Wrapper for Google BigQuery API.
+
+    Provides high-level methods for working with BigQuery tables:
+    - Query tables to PyArrow Tables (no CSV step)
+    - Get metadata (schema, columns)
+    - Incremental and partitioned reads
+    """
+
+    def __init__(
+        self,
+        project_id: Optional[str] = None,
+        location: Optional[str] = None,
+    ):
+        """
+        Initialize BigQuery client.
+
+        Args:
+            project_id: GCP project ID for job execution/billing.
+                        If None, reads from BIGQUERY_PROJECT env var.
+            location: BigQuery location for job execution (e.g., "us-central1").
+                      If None, reads from BIGQUERY_LOCATION env var.
+
+        Raises:
+            ValueError: If project_id is not provided and BIGQUERY_PROJECT is not set.
+        """
+        self.project_id = project_id or os.environ.get("BIGQUERY_PROJECT")
+
+        if not self.project_id:
+            raise ValueError(
+                "BigQuery project ID not set. "
+                "Set BIGQUERY_PROJECT environment variable."
+            )
+
+        self.location = location or os.environ.get("BIGQUERY_LOCATION")
+
+        # Initialize BigQuery client with ADC
+        # project_id is used for job execution and billing.
+        # Data can live in a different project -- table IDs in queries
+        # use fully-qualified format (project.dataset.table).
+        client_kwargs = {"project": self.project_id}
+        if self.location:
+            client_kwargs["location"] = self.location
+        self.client = bigquery.Client(**client_kwargs)
+
+        # Metadata cache
+        config = get_config()
+        self.metadata_cache: Dict[str, Dict[str, Any]] = {}
+        self.metadata_cache_path = config.get_metadata_path() / "bq_table_metadata.json"
+
+        # Load cache from disk if exists
+        self._load_metadata_cache()
+
+        logger.info(
+            f"BigQuery client initialized: project={self.project_id}, "
+            f"location={self.location or 'auto'}"
+        )
+
+    def _load_metadata_cache(self):
+        """Load metadata cache from disk."""
+        if self.metadata_cache_path.exists():
+            try:
+                with open(self.metadata_cache_path, "r") as f:
+                    self.metadata_cache = json.load(f)
+                logger.info(
+                    f"BQ metadata cache loaded: {len(self.metadata_cache)} tables"
+                )
+            except Exception as e:
+                logger.warning(f"Error loading BQ metadata cache: {e}")
+                self.metadata_cache = {}
+
+    def _save_metadata_cache(self):
+        """Save metadata cache to disk."""
+        try:
+            self.metadata_cache_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(self.metadata_cache_path, "w") as f:
+                json.dump(self.metadata_cache, f, indent=2)
+            logger.debug("BQ metadata cache saved")
+        except Exception as e:
+            logger.warning(f"Error saving BQ metadata cache: {e}")
+
+    def get_table_metadata(
+        self,
+        table_id: str,
+        use_cache: bool = True,
+        cache_ttl_hours: int = 24,
+    ) -> Dict[str, Any]:
+        """
+        Get table metadata from BigQuery.
+
+        Args:
+            table_id: Full table ID (e.g., "project.dataset.table")
+            use_cache: Use cache if available
+            cache_ttl_hours: Cache TTL in hours (default 24h)
+
+        Returns:
+            Dictionary with metadata including columns, types, descriptions, row count.
+        """
+        # Check cache
+        if use_cache and table_id in self.metadata_cache:
+            cached = self.metadata_cache[table_id]
+            cached_time = datetime.fromisoformat(cached.get("_cached_at", "2000-01-01"))
+            cache_age = datetime.now() - cached_time
+
+            if cache_age < timedelta(hours=cache_ttl_hours):
+                logger.debug(f"Using BQ metadata cache for {table_id}")
+                return cached
+
+        logger.info(f"Fetching metadata for BQ table: {table_id}")
+
+        try:
+            table_ref = self.client.get_table(table_id)
+
+            # Build column metadata
+            columns = []
+            column_types = {}
+            column_descriptions = {}
+            for field in table_ref.schema:
+                columns.append(field.name)
+                column_types[field.name] = field.field_type
+                if field.description:
+                    column_descriptions[field.name] = field.description
+
+            metadata = {
+                "table_id": table_id,
+                "name": table_ref.table_id,
+                "dataset": table_ref.dataset_id,
+                "project": table_ref.project,
+                "columns": columns,
+                "column_types": column_types,
+                "column_descriptions": column_descriptions,
+                "row_count": table_ref.num_rows,
+                "size_bytes": table_ref.num_bytes,
+                "created": table_ref.created.isoformat() if table_ref.created else None,
+                "modified": table_ref.modified.isoformat() if table_ref.modified else None,
+                "partitioning": None,
+                "_cached_at": datetime.now().isoformat(),
+            }
+
+            # Capture partitioning info
+            if table_ref.time_partitioning:
+                metadata["partitioning"] = {
+                    "type": table_ref.time_partitioning.type_,
+                    "field": table_ref.time_partitioning.field,
+                    "expiration_ms": table_ref.time_partitioning.expiration_ms,
+                }
+
+            # Save to cache
+            self.metadata_cache[table_id] = metadata
+            self._save_metadata_cache()
+
+            return metadata
+
+        except Exception as e:
+            logger.error(f"Error getting metadata for {table_id}: {e}")
+            raise
+
+    def get_pyarrow_schema(self, table_id: str) -> Optional[pa.Schema]:
+        """
+        Build PyArrow schema from BigQuery table schema.
+
+        Args:
+            table_id: Full table ID
+
+        Returns:
+            PyArrow schema or None if metadata unavailable
+        """
+        metadata = self.get_table_metadata(table_id)
+        column_types = metadata.get("column_types", {})
+
+        if not column_types:
+            logger.warning(f"No column types for {table_id}, schema will not be applied")
+            return None
+
+        fields = []
+        for col_name in metadata.get("columns", []):
+            bq_type = column_types.get(col_name, "STRING")
+            pa_type = BIGQUERY_TO_PYARROW_TYPES.get(bq_type, pa.string())
+            fields.append(pa.field(col_name, pa_type))
+
+        return pa.schema(fields)
+
+    def get_date_columns(self, table_id: str) -> List[str]:
+        """
+        Get list of DATE-only columns for a table.
+
+        Args:
+            table_id: Full table ID
+
+        Returns:
+            List of column names that have DATE type in BigQuery
+        """
+        metadata = self.get_table_metadata(table_id)
+        column_types = metadata.get("column_types", {})
+
+        return [
+            col_name for col_name, bq_type in column_types.items()
+            if bq_type == "DATE"
+        ]
+
+    def query_to_arrow(
+        self,
+        sql: str,
+        params: Optional[List[bigquery.ScalarQueryParameter]] = None,
+    ) -> pa.Table:
+        """
+        Execute SQL query and return results as PyArrow Table.
+
+        Args:
+            sql: SQL query string (use @param_name for parameterized values)
+            params: List of BigQuery query parameters
+
+        Returns:
+            PyArrow Table with query results
+        """
+        job_config = bigquery.QueryJobConfig()
+        if params:
+            job_config.query_parameters = params
+
+        logger.debug(f"Executing BQ query: {sql[:200]}...")
+
+        query_job = self.client.query(sql, job_config=job_config)
+        arrow_table = query_job.to_arrow()
+
+        logger.debug(f"Query returned {arrow_table.num_rows} rows, {arrow_table.num_columns} columns")
+        return arrow_table
+
+    def read_table(
+        self,
+        table_id: str,
+        columns: Optional[List[str]] = None,
+        row_filter: Optional[str] = None,
+    ) -> pa.Table:
+        """
+        Fetch a whole table, or a projected/filtered part of it, as Arrow.
+
+        Args:
+            table_id: Full table ID (e.g., "project.dataset.table")
+            columns: Columns to select; None or empty selects every column
+            row_filter: SQL predicate placed after WHERE (omit the keyword)
+
+        Returns:
+            PyArrow Table with the selected data
+
+        Note:
+            table_id and columns are inserted as backtick-quoted identifiers
+            and row_filter is inserted verbatim, so none of them may come
+            from untrusted input.
+        """
+        if columns:
+            projection = ", ".join(f"`{c}`" for c in columns)
+        else:
+            projection = "*"
+
+        statement_parts = [f"SELECT {projection} FROM `{table_id}`"]
+        if row_filter:
+            statement_parts.append(f" WHERE {row_filter}")
+
+        logger.info(f"Reading BQ table: {table_id} (filter: {row_filter or 'none'})")
+        return self.query_to_arrow("".join(statement_parts))
+
+    def read_table_incremental(
+        self,
+        table_id: str,
+        incremental_column: str,
+        since_value: str,
+        columns: Optional[List[str]] = None,
+    ) -> pa.Table:
+        """
+        Fetch the rows whose incremental_column is strictly greater than since_value.
+
+        since_value is sent as a bound query parameter named since_value.
+        table_id, incremental_column and columns are interpolated into the
+        statement as backtick-quoted identifiers.
+
+        Args:
+            table_id: Full table ID
+            incremental_column: Column name for incremental filter
+            since_value: ISO timestamp string - only later rows are fetched
+            columns: Columns to select; None or empty selects every column
+
+        Returns:
+            PyArrow Table with the rows newer than since_value
+        """
+        projection = "*"
+        if columns:
+            projection = ", ".join(f"`{c}`" for c in columns)
+
+        sql = (
+            f"SELECT {projection} FROM `{table_id}` "
+            f"WHERE `{incremental_column}` > @since_value"
+        )
+
+        # NOTE(review): the parameter type is fixed to TIMESTAMP; presumably
+        # incremental_column is expected to be a TIMESTAMP column - confirm
+        # behaviour for DATE/DATETIME columns.
+        since_param = bigquery.ScalarQueryParameter("since_value", "TIMESTAMP", since_value)
+
+        logger.info(
+            f"Incremental read: {table_id} WHERE {incremental_column} > {since_value}"
+        )
+        return self.query_to_arrow(sql, params=[since_param])
+
+    def read_table_partitioned(
+        self,
+        table_id: str,
+        partition_column: str,
+        start: str,
+        end: Optional[str] = None,
+        columns: Optional[List[str]] = None,
+    ) -> pa.Table:
+        """
+        Read data within a partition range.
+
+        The range bounds are sent as bound query parameters. Their declared
+        type follows the partition column: DATE when get_date_columns()
+        reports the column as DATE, TIMESTAMP otherwise. BigQuery does not
+        implicitly coerce a TIMESTAMP parameter for comparison with a DATE
+        column, so a fixed TIMESTAMP type makes queries on DATE-partitioned
+        tables fail.
+
+        Args:
+            table_id: Full table ID
+            partition_column: Partition column name
+            start: Start date/timestamp (inclusive)
+            end: End date/timestamp (exclusive). If None or empty, no upper
+                bound is applied.
+            columns: Optional list of columns to select
+
+        Returns:
+            PyArrow Table with partition range data
+
+        Note:
+            For DATE columns only the leading YYYY-MM-DD part of a string
+            bound is used (bounds are assumed to be ISO formatted).
+            table_id, partition_column and columns are interpolated as
+            backtick-quoted identifiers; only the bounds are bound parameters.
+        """
+        select_cols = ", ".join(f"`{c}`" for c in columns) if columns else "*"
+
+        # Best effort: when the column type cannot be determined, keep the
+        # historical TIMESTAMP behaviour instead of failing the read.
+        try:
+            is_date_column = partition_column in self.get_date_columns(table_id)
+        except Exception as e:
+            logger.warning(
+                f"Could not determine type of {partition_column} in {table_id}, "
+                f"assuming TIMESTAMP: {e}"
+            )
+            is_date_column = False
+        param_type = "DATE" if is_date_column else "TIMESTAMP"
+
+        def _bound(value):
+            # A DATE parameter must not carry a time-of-day component
+            if is_date_column and isinstance(value, str):
+                return value[:10]
+            return value
+
+        sql = (
+            f"SELECT {select_cols} FROM `{table_id}` "
+            f"WHERE `{partition_column}` >= @start_value"
+        )
+        params = [
+            bigquery.ScalarQueryParameter("start_value", param_type, _bound(start)),
+        ]
+
+        if end:
+            sql += f" AND `{partition_column}` < @end_value"
+            params.append(
+                bigquery.ScalarQueryParameter("end_value", param_type, _bound(end)),
+            )
+
+        logger.info(
+            f"Partitioned read: {table_id} [{start} .. {end or 'now'})"
+        )
+        return self.query_to_arrow(sql, params=params)
+
+    def discover_all_tables(self, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Enumerate tables visible to the client, optionally within one dataset.
+
+        Datasets whose tables cannot be listed, and tables whose details
+        cannot be fetched, are logged as warnings and left out of the result.
+
+        Args:
+            dataset_id: Dataset to inspect; when falsy, every dataset returned
+                by the client's list_datasets() is inspected
+
+        Returns:
+            One dict per table with the keys id, name, bucket_id, bucket_name,
+            columns, row_count, size_bytes, primary_key, last_change and
+            last_import
+        """
+        logger.info(f"Discovering BQ tables (dataset={dataset_id or 'all'})...")
+
+        discovered = []
+
+        datasets = (
+            [self.client.get_dataset(dataset_id)]
+            if dataset_id
+            else list(self.client.list_datasets())
+        )
+
+        for dataset in datasets:
+            if hasattr(dataset, "reference"):
+                ds_ref = dataset.reference
+            else:
+                ds_ref = dataset.dataset_id
+            ds_id = str(ds_ref)
+
+            try:
+                listed_tables = list(self.client.list_tables(ds_ref))
+            except Exception as e:
+                logger.warning(f"Could not list tables in dataset {ds_id}: {e}")
+                continue
+
+            for item in listed_tables:
+                full_id = f"{item.project}.{item.dataset_id}.{item.table_id}"
+
+                try:
+                    # The list item is a summary; schema, sizes and the
+                    # modification time are read from the full table object
+                    detail = self.client.get_table(full_id)
+                    column_names = [field.name for field in detail.schema]
+                    modified = detail.modified
+
+                    entry = {
+                        "id": full_id,
+                        "name": item.table_id,
+                        # The dataset fills both bucket fields
+                        "bucket_id": item.dataset_id,
+                        "bucket_name": item.dataset_id,
+                        "columns": column_names,
+                        "row_count": detail.num_rows or 0,
+                        "size_bytes": detail.num_bytes or 0,
+                        "primary_key": [],
+                        "last_change": modified.isoformat() if modified else None,
+                        "last_import": None,
+                    }
+                    discovered.append(entry)
+                except Exception as e:
+                    logger.warning(f"Could not get details for {full_id}: {e}")
+
+        logger.info(f"Discovered {len(discovered)} BQ tables")
+        return discovered
+
+    def test_connection(self) -> bool:
+        """
+        Check that a trivial query can be executed against BigQuery.
+
+        Runs "SELECT 1" and consumes its result. Any exception is logged as
+        an error and reported through the return value instead of raised.
+
+        Returns:
+            True if the query completed, False if anything raised
+        """
+        try:
+            list(self.client.query("SELECT 1").result())
+            logger.info(f"BigQuery connection OK (project: {self.project_id})")
+        except Exception as e:
+            logger.error(f"BigQuery connection test failed: {e}")
+            return False
+        return True
+
+
+def create_client() -> BigQueryClient:
+    """
+    Build a BigQueryClient using its constructor defaults.
+
+    Returns:
+        A newly constructed BigQueryClient
+    """
+    client = BigQueryClient()
+    return client
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,7 @@
 # Data source adapters (install only what you need)
 kbcstorage>=0.9.0          # For Keboola adapter
+google-cloud-bigquery>=3.0.0           # For BigQuery adapter
+google-cloud-bigquery-storage>=2.0.0   # For BigQuery adapter (fast Arrow transfer)

 # Data processing
 # pandas - core tabular data processing library
--- a/src/config.py
+++ b/src/config.py
@ -101,6 +101,7 @@ class TableConfig:
    max_history_days: Optional[int] = None
    dataset: Optional[str] = None
    initial_load_chunk_days: int = 30
+    incremental_column: Optional[str] = None  # Column for timestamp-based incremental sync (BigQuery)

    def __post_init__(self):
        """Validate configuration after initialization."""
@ -429,6 +430,7 @@ class Config:
                max_history_days=table_data.get("max_history_days"),
                dataset=table_data.get("dataset"),
                initial_load_chunk_days=table_data.get("initial_load_chunk_days", 30),
+                incremental_column=table_data.get("incremental_column"),
            )
            table_configs.append(config)

--- a/src/data_sync.py
+++ b/src/data_sync.py
@ -511,7 +511,7 @@ def create_data_source(source_type: str = None) -> DataSource:

    raise ValueError(
        f"Unknown data source: '{source_type}'. "
-        f"Available connectors: keboola. "
+        f"Available connectors: keboola, bigquery. "
        f"Create connectors/{source_type}/adapter.py to add a new one."
    )

--- a/tests/test_bigquery_adapter.py
+++ b/tests/test_bigquery_adapter.py
@ -0,0 +1,763 @@
+"""
+Comprehensive unit tests for the BigQuery data source adapter.
+
+Tests the BigQueryDataSource class from connectors/bigquery/adapter.py
+with all external dependencies (BigQueryClient, config, parquet_manager) mocked.
+
+The google-cloud-bigquery package is not installed in test environments,
+so we install stub modules in sys.modules before importing the adapter.
+"""
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pytest
+
+# ---------------------------------------------------------------------------
+# Stub google.cloud.bigquery before any connector import
+#
+# A single MagicMock is registered under all three module names. setdefault
+# only fills names that are missing from sys.modules, so a module that has
+# already been imported under one of these names is left in place.
+# ---------------------------------------------------------------------------
+_bq_stub = MagicMock()
+sys.modules.setdefault("google", _bq_stub)
+sys.modules.setdefault("google.cloud", _bq_stub)
+sys.modules.setdefault("google.cloud.bigquery", _bq_stub)
+
+from src.config import TableConfig  # noqa: E402
+from src.data_sync import SyncState  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def tmp_parquet_dir(tmp_path):
+    """Create <tmp_path>/parquet/test_bucket and return it as the Parquet output directory."""
+    bucket_dir = tmp_path.joinpath("parquet", "test_bucket")
+    bucket_dir.mkdir(parents=True)
+    return bucket_dir
+
+
+@pytest.fixture
+def mock_config(tmp_parquet_dir):
+    """Build a MagicMock standing in for Config.
+
+    get_metadata_path() returns the "metadata" directory next to
+    tmp_parquet_dir. get_parquet_path and get_partition_path are bare mocks
+    that each test configures for itself.
+    """
+    cfg = MagicMock()
+    cfg.get_metadata_path.return_value = tmp_parquet_dir.parent / "metadata"
+    cfg.get_partition_path = MagicMock()
+    cfg.get_parquet_path = MagicMock()
+    return cfg
+
+
+@pytest.fixture
+def mock_bq_client():
+    """Build a MagicMock BigQueryClient: empty metadata cache, no DATE columns, no schema."""
+    bq = MagicMock()
+    bq.get_pyarrow_schema.return_value = None
+    bq.get_date_columns.return_value = []
+    bq.metadata_cache = {}
+    return bq
+
+
+@pytest.fixture
+def sync_state(tmp_path):
+    """Return a real SyncState pointing at <tmp_path>/metadata/sync_state.json."""
+    metadata_dir = tmp_path / "metadata"
+    metadata_dir.mkdir(parents=True, exist_ok=True)
+    return SyncState(metadata_dir / "sync_state.json")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_table_config(
+    *,
+    table_id: str = "project.dataset.orders",
+    name: str = "orders",
+    primary_key: str = "id",
+    sync_strategy: str = "full_refresh",
+    incremental_column: str | None = None,
+    incremental_window_days: int | None = None,
+    partition_by: str | None = None,
+    partition_granularity: str | None = None,
+    max_history_days: int | None = None,
+) -> TableConfig:
+    """Construct a TableConfig for tests; every argument is keyword-only and optional.
+
+    The description is always "Test table"; table_id is passed on as the
+    TableConfig id.
+    """
+    settings = dict(
+        id=table_id,
+        name=name,
+        description="Test table",
+        primary_key=primary_key,
+        sync_strategy=sync_strategy,
+        incremental_column=incremental_column,
+        incremental_window_days=incremental_window_days,
+        partition_by=partition_by,
+        partition_granularity=partition_granularity,
+        max_history_days=max_history_days,
+    )
+    return TableConfig(**settings)
+
+
+def _sample_arrow_table(ids: list[int], names: list[str]) -> pa.Table:
+    """Return a two-column PyArrow Table: "id" from ids and "name" from names."""
+    column_data = {"id": ids, "name": names}
+    return pa.table(column_data)
+
+
+def _create_adapter(mock_config, mock_bq_client):
+    """Build a BigQueryDataSource wired to the given mocks.
+
+    While the adapter is constructed, get_config and create_bq_client in
+    connectors.bigquery.adapter are patched to return mock_config and
+    mock_bq_client.
+    """
+    config_patch = patch("connectors.bigquery.adapter.get_config", return_value=mock_config)
+    client_patch = patch(
+        "connectors.bigquery.adapter.create_bq_client", return_value=mock_bq_client
+    )
+    with config_patch, client_patch:
+        # Imported inside the patched scope, as close to use as possible
+        from connectors.bigquery.adapter import BigQueryDataSource
+        return BigQueryDataSource()
+
+
+# ---------------------------------------------------------------------------
+# 1. full_refresh writes valid Parquet file from Arrow table
+# ---------------------------------------------------------------------------
+
+class TestFullRefresh:
+    """sync_table behaviour when sync_strategy is full_refresh."""
+
+    def test_writes_valid_parquet(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """A full refresh leaves a readable Parquet file mirroring the source rows."""
+        target = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target
+        mock_bq_client.read_table.return_value = _sample_arrow_table(
+            [1, 2, 3], ["Alice", "Bob", "Charlie"]
+        )
+        cfg = _make_table_config(sync_strategy="full_refresh")
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+        assert outcome["rows"] == 3
+        assert target.exists()
+
+        # Read the file back to confirm row count and column order
+        stored = pq.read_table(target)
+        assert stored.num_rows == 3
+        assert stored.column_names == ["id", "name"]
+
+    def test_applies_date_columns(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """When the client reports DATE columns, the date32 converter receives them."""
+        source = _sample_arrow_table([1], ["Alice"])
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table.return_value = source
+        mock_bq_client.get_date_columns.return_value = ["created_at"]
+        cfg = _make_table_config()
+
+        with patch(
+            "connectors.bigquery.adapter.convert_date_columns_to_date32",
+            return_value=source,
+        ) as converter:
+            _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        converter.assert_called_once_with(source, ["created_at"])
+
+    def test_applies_pyarrow_schema(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """When the client supplies a schema, apply_schema_to_table receives it."""
+        source = _sample_arrow_table([1], ["Alice"])
+        expected_schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())])
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table.return_value = source
+        mock_bq_client.get_pyarrow_schema.return_value = expected_schema
+        cfg = _make_table_config()
+
+        with patch(
+            "connectors.bigquery.adapter.apply_schema_to_table",
+            return_value=source,
+        ) as applier:
+            _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        applier.assert_called_once_with(source, expected_schema)
+
+
+# ---------------------------------------------------------------------------
+# 2. incremental_column_sync merges correctly (dedup on PK, new data wins)
+# ---------------------------------------------------------------------------
+
+class TestIncrementalColumnSync:
+    """Merging of incrementally fetched rows into an existing Parquet file."""
+
+    def test_merge_dedup_new_data_wins(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """Fetched rows replace stored rows sharing a primary key; other rows are kept."""
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="updated_at",
+            incremental_window_days=7,
+        )
+        target = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target
+
+        # Seed the file an earlier sync would have produced
+        pq.write_table(_sample_arrow_table([1, 2], ["Alice", "Bob"]), target)
+
+        # Record that earlier sync in the state
+        sync_state.update_sync(
+            table_id=cfg.id,
+            table_name=cfg.name,
+            strategy="incremental",
+            rows=2,
+            file_size_bytes=100,
+        )
+
+        # Incoming batch: id=2 carries a changed name, id=3 is brand new
+        mock_bq_client.read_table_incremental.return_value = _sample_arrow_table(
+            [2, 3], ["Bob_Updated", "Charlie"]
+        )
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+        assert outcome["rows"] == 3  # Alice + Bob_Updated + Charlie
+
+        frame = pq.read_table(target).to_pandas()
+        assert set(frame["id"].tolist()) == {1, 2, 3}
+        # The stored row for id=2 must carry the new name
+        updated_row = frame[frame["id"] == 2].iloc[0]
+        assert updated_row["name"] == "Bob_Updated"
+
+
+# ---------------------------------------------------------------------------
+# 3. incremental_column_sync with no new data returns existing file info
+# ---------------------------------------------------------------------------
+
+class TestIncrementalNoNewData:
+    """Incremental sync when the source returns zero rows."""
+
+    def test_returns_existing_file_info(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """An empty incremental batch still succeeds and reports the stored row count."""
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="updated_at",
+            incremental_window_days=7,
+        )
+        target = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target
+
+        # Seed the file an earlier sync would have produced
+        pq.write_table(_sample_arrow_table([1, 2, 3], ["A", "B", "C"]), target)
+
+        # Record that earlier sync in the state
+        sync_state.update_sync(
+            table_id=cfg.id,
+            table_name=cfg.name,
+            strategy="incremental",
+            rows=3,
+            file_size_bytes=100,
+        )
+
+        # The source has nothing new: typed but empty columns
+        no_rows = pa.table({
+            "id": pa.array([], type=pa.int64()),
+            "name": pa.array([], type=pa.string()),
+        })
+        mock_bq_client.read_table_incremental.return_value = no_rows
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+        assert outcome["rows"] == 3  # existing row count preserved
+
+
+# ---------------------------------------------------------------------------
+# 4. partitioned_sync creates partition files
+# ---------------------------------------------------------------------------
+
+class TestPartitionedSync:
+    """Partitioned sync output layout."""
+
+    def test_creates_partition_files(self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state):
+        """Rows from two different months end up in two monthly Parquet files."""
+        import pandas as pd
+
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="created_at",
+            partition_by="created_at",
+            partition_granularity="month",
+            incremental_window_days=7,
+        )
+
+        # A partitioned table maps to a directory rather than a single file
+        partition_dir = tmp_parquet_dir / "orders"
+        partition_dir.mkdir(parents=True, exist_ok=True)
+        mock_config.get_parquet_path.return_value = partition_dir
+
+        # Each partition key maps to <partition_dir>/<key>.parquet
+        mock_config.get_partition_path.side_effect = (
+            lambda tc, key: partition_dir / f"{key}.parquet"
+        )
+
+        # One row in January 2026, one in February 2026
+        stamps = [
+            pd.Timestamp("2026-01-15 10:00:00", tz="UTC"),
+            pd.Timestamp("2026-02-20 14:00:00", tz="UTC"),
+        ]
+        mock_bq_client.read_table.return_value = pa.table({
+            "id": [1, 2],
+            "name": ["Jan_Order", "Feb_Order"],
+            "created_at": pa.array(stamps, type=pa.timestamp("us", tz="UTC")),
+        })
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+
+        # Exactly one file per month
+        stems = sorted(path.stem for path in partition_dir.glob("*.parquet"))
+        assert len(stems) == 2
+        assert "2026_01" in stems
+        assert "2026_02" in stems
+
+
+# ---------------------------------------------------------------------------
+# 5. discover_tables delegates to BigQueryClient.discover_all_tables()
+# ---------------------------------------------------------------------------
+
+class TestDiscoverTables:
+    """discover_tables pass-through to the client."""
+
+    def test_delegates_to_client(self, mock_config, mock_bq_client):
+        """discover_tables calls discover_all_tables once and returns its result."""
+        tables = [{"id": "proj.ds.t1", "name": "t1", "columns": ["a", "b"]}]
+        mock_bq_client.discover_all_tables.return_value = tables
+
+        found = _create_adapter(mock_config, mock_bq_client).discover_tables()
+
+        mock_bq_client.discover_all_tables.assert_called_once()
+        assert found == tables
+
+
+# ---------------------------------------------------------------------------
+# 6. get_source_name returns "Google BigQuery"
+# ---------------------------------------------------------------------------
+
+class TestGetSourceName:
+    """Human-readable name reported by the adapter."""
+
+    def test_returns_google_bigquery(self, mock_config, mock_bq_client):
+        """get_source_name returns the literal "Google BigQuery"."""
+        source_name = _create_adapter(mock_config, mock_bq_client).get_source_name()
+        assert source_name == "Google BigQuery"
+
+
+# ---------------------------------------------------------------------------
+# 7. get_column_metadata returns correct format
+# ---------------------------------------------------------------------------
+
+class TestGetColumnMetadata:
+    """Shape of the dict returned by get_column_metadata."""
+
+    def test_returns_correct_format(self, mock_config, mock_bq_client):
+        """Each column gets source_type, plus description only when one exists."""
+        raw_metadata = {
+            "column_types": {"id": "INT64", "name": "STRING", "email": "STRING"},
+            "column_descriptions": {"id": "Primary key", "email": "User email address"},
+        }
+        mock_bq_client.get_table_metadata.return_value = raw_metadata
+
+        adapter = _create_adapter(mock_config, mock_bq_client)
+        described = adapter.get_column_metadata("project.dataset.users")
+
+        assert "columns" in described
+        by_column = described["columns"]
+        assert by_column["id"] == {"source_type": "INT64", "description": "Primary key"}
+        # "name" has no description, so only the type is present
+        assert by_column["name"] == {"source_type": "STRING"}
+        assert by_column["email"] == {
+            "source_type": "STRING",
+            "description": "User email address",
+        }
+
+    def test_returns_none_when_no_column_types(self, mock_config, mock_bq_client):
+        """Empty column_types in the raw metadata yields None."""
+        raw_metadata = {
+            "column_types": {},
+            "column_descriptions": {},
+        }
+        mock_bq_client.get_table_metadata.return_value = raw_metadata
+
+        adapter = _create_adapter(mock_config, mock_bq_client)
+
+        assert adapter.get_column_metadata("project.dataset.users") is None
+
+
+# ---------------------------------------------------------------------------
+# 8. Error handling (query failure -> {success: False, error: ...})
+# ---------------------------------------------------------------------------
+
+class TestErrorHandling:
+    """Failures are reported through the result dict."""
+
+    def test_query_failure_returns_error_dict(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """A raising read_table yields success=False with the error text and strategy."""
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table.side_effect = RuntimeError("BigQuery API timeout")
+        cfg = _make_table_config()
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is False
+        assert "BigQuery API timeout" in outcome["error"]
+        assert outcome["strategy"] == "full_refresh"
+
+    def test_unknown_strategy_returns_error(self, mock_config, mock_bq_client, sync_state):
+        """A sync_strategy the dispatcher does not know produces an error result."""
+        # The TableConfig constructor validates the strategy, so the invalid
+        # value is written onto an already constructed instance.
+        cfg = _make_table_config()
+        cfg.sync_strategy = "magic_sync"
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is False
+        assert "Unknown sync strategy" in outcome["error"]
+
+
+# ---------------------------------------------------------------------------
+# 9. incremental_column config is used in WHERE clause
+# ---------------------------------------------------------------------------
+
+class TestIncrementalColumnUsedInWhere:
+    """Arguments the adapter hands to read_table_incremental."""
+
+    def test_incremental_column_passed_to_client(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """incremental_column and table_id reach read_table_incremental as keywords."""
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="modified_at",
+            incremental_window_days=14,
+        )
+        target = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target
+
+        # An existing file plus a recorded sync steer the adapter onto the
+        # incremental path
+        pq.write_table(_sample_arrow_table([1], ["Alice"]), target)
+        sync_state.update_sync(
+            table_id=cfg.id,
+            table_name=cfg.name,
+            strategy="incremental",
+            rows=1,
+            file_size_bytes=100,
+        )
+
+        # An empty batch keeps the merge step out of this test
+        no_rows = pa.table({
+            "id": pa.array([], type=pa.int64()),
+            "name": pa.array([], type=pa.string()),
+        })
+        mock_bq_client.read_table_incremental.return_value = no_rows
+
+        _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        passed = mock_bq_client.read_table_incremental.call_args.kwargs
+        assert passed["incremental_column"] == "modified_at"
+        assert passed["table_id"] == "project.dataset.orders"
+        # Only the presence of since_value is checked, not its format
+        assert "since_value" in passed
+
+
+# ---------------------------------------------------------------------------
+# 10. First sync without existing file downloads all data
+# ---------------------------------------------------------------------------
+
+class TestFirstSyncDownloadsAll:
+    """Incremental strategy on a table that has never been synced."""
+
+    def test_first_sync_reads_full_table(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """With no file and no recorded sync, the whole table is read via read_table."""
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="updated_at",
+            incremental_window_days=7,
+        )
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+
+        # Nothing is written to disk and sync_state is left untouched
+        mock_bq_client.read_table.return_value = _sample_arrow_table(
+            [1, 2, 3], ["A", "B", "C"]
+        )
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+        assert outcome["rows"] == 3
+        # The full read is used; the incremental read is never touched
+        mock_bq_client.read_table.assert_called_once_with(cfg.id)
+        mock_bq_client.read_table_incremental.assert_not_called()
+
+    def test_first_sync_with_max_history_days(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """With max_history_days set, the first sync goes through read_table_incremental."""
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="updated_at",
+            incremental_window_days=7,
+            max_history_days=90,
+        )
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table_incremental.return_value = _sample_arrow_table(
+            [1, 2], ["A", "B"]
+        )
+
+        outcome = _create_adapter(mock_config, mock_bq_client).sync_table(cfg, sync_state)
+
+        assert outcome["success"] is True
+        # max_history_days routes the first load through the incremental read
+        mock_bq_client.read_table_incremental.assert_called_once()
+        passed = mock_bq_client.read_table_incremental.call_args.kwargs
+        assert passed["incremental_column"] == "updated_at"
+        mock_bq_client.read_table.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# 11. sync_table dispatches to correct strategy based on sync_strategy
+# ---------------------------------------------------------------------------
+
+class TestSyncTableDispatch:
+
+    def test_dispatches_full_refresh(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """sync_strategy='full_refresh' should call _full_refresh."""
+        table_config = _make_table_config(sync_strategy="full_refresh")
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table.return_value = _sample_arrow_table([1], ["A"])
+
+        adapter = _create_adapter(mock_config, mock_bq_client)
+
+        with patch.object(adapter, "_full_refresh", wraps=adapter._full_refresh) as spy:
+            adapter.sync_table(table_config, sync_state)
+            spy.assert_called_once_with(table_config)
+
+    def test_dispatches_incremental(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """sync_strategy='incremental' should call _incremental_sync."""
+        table_config = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="updated_at",
+            incremental_window_days=7,
+        )
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        mock_bq_client.read_table.return_value = _sample_arrow_table([1], ["A"])
+
+        adapter = _create_adapter(mock_config, mock_bq_client)
+
+        with patch.object(adapter, "_incremental_sync", wraps=adapter._incremental_sync) as spy:
+            adapter.sync_table(table_config, sync_state)
+            spy.assert_called_once_with(table_config, sync_state)
+
+    def test_dispatches_partitioned(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """An incremental table that also sets partition_by is routed to _partitioned_sync.
+
+        The mocked client returns a single row carrying a UTC timestamp in
+        the ``created_at`` column, which is the configured partition column.
+        """
+        import pandas as pd
+
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column="created_at",
+            partition_by="created_at",
+            partition_granularity="month",
+            incremental_window_days=7,
+        )
+
+        # Point the config at a directory and derive one parquet file per partition key.
+        table_dir = tmp_parquet_dir / "orders"
+        table_dir.mkdir(parents=True, exist_ok=True)
+        mock_config.get_parquet_path.return_value = table_dir
+        mock_config.get_partition_path.side_effect = (
+            lambda tc, key: table_dir / f"{key}.parquet"
+        )
+
+        created_at = pa.array(
+            [pd.Timestamp("2026-01-15 10:00:00", tz="UTC")],
+            type=pa.timestamp("us", tz="UTC"),
+        )
+        mock_bq_client.read_table.return_value = pa.table(
+            {"id": [1], "name": ["A"], "created_at": created_at}
+        )
+
+        source = _create_adapter(mock_config, mock_bq_client)
+        real_sync = source._partitioned_sync
+
+        with patch.object(source, "_partitioned_sync", wraps=real_sync) as recorder:
+            source.sync_table(cfg, sync_state)
+
+        recorder.assert_called_once()
+
+    def test_incremental_without_column_falls_back_to_full_refresh(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """With neither incremental_column nor partition_by, an incremental table uses _full_refresh.
+
+        The sync is also expected to report success.
+        """
+        cfg = _make_table_config(
+            sync_strategy="incremental",
+            incremental_column=None,
+            partition_by=None,
+            incremental_window_days=7,
+        )
+        target_file = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target_file
+        mock_bq_client.read_table.return_value = _sample_arrow_table([1], ["A"])
+
+        source = _create_adapter(mock_config, mock_bq_client)
+        real_refresh = source._full_refresh
+
+        with patch.object(source, "_full_refresh", wraps=real_refresh) as recorder:
+            outcome = source.sync_table(cfg, sync_state)
+
+        recorder.assert_called_once()
+        assert outcome["success"] is True
+
+
+# ---------------------------------------------------------------------------
+# 12. _merge_arrow_tables deduplicates correctly
+# ---------------------------------------------------------------------------
+
+class TestMergeArrowTables:
+    """Primary-key based merging of an existing Arrow table with newly fetched rows."""
+
+    @staticmethod
+    def _empty_id_val_table():
+        """Return a zero-row table with an int64 ``id`` and a string ``val`` column."""
+        return pa.table({
+            "id": pa.array([], type=pa.int64()),
+            "val": pa.array([], type=pa.string()),
+        })
+
+    def test_dedup_on_single_pk(self, mock_config, mock_bq_client):
+        """Rows sharing an ``id`` collapse to one, keeping the value from the new data."""
+        source = _create_adapter(mock_config, mock_bq_client)
+
+        stored = pa.table({"id": [1, 2, 3], "val": ["a", "b", "c"]})
+        incoming = pa.table({"id": [2, 4], "val": ["B_new", "d"]})
+
+        result = source._merge_arrow_tables(stored, incoming, primary_key=["id"])
+
+        # Sort by key first so the comparison does not depend on merge output order.
+        frame = result.to_pandas().sort_values("id").reset_index(drop=True)
+        assert frame["id"].tolist() == [1, 2, 3, 4]
+        assert frame["val"].tolist() == ["a", "B_new", "c", "d"]
+
+    def test_dedup_on_composite_pk(self, mock_config, mock_bq_client):
+        """With a two-column key, only rows matching on both columns are replaced."""
+        source = _create_adapter(mock_config, mock_bq_client)
+
+        stored = pa.table({
+            "pk1": [1, 1, 2],
+            "pk2": ["a", "b", "a"],
+            "val": ["old_1a", "old_1b", "old_2a"],
+        })
+        incoming = pa.table({
+            "pk1": [1, 2],
+            "pk2": ["a", "a"],
+            "val": ["new_1a", "new_2a"],
+        })
+
+        result = source._merge_arrow_tables(stored, incoming, primary_key=["pk1", "pk2"])
+        frame = result.to_pandas().sort_values(["pk1", "pk2"]).reset_index(drop=True)
+
+        assert len(frame) == 3
+
+        def val_for(pk1, pk2):
+            """Return ``val`` of the first row matching the given composite key."""
+            mask = (frame["pk1"] == pk1) & (frame["pk2"] == pk2)
+            return frame[mask].iloc[0]["val"]
+
+        # (1, a) and (2, a) are present in the new data; (1, b) is not.
+        assert val_for(1, "a") == "new_1a"
+        assert val_for(1, "b") == "old_1b"
+        assert val_for(2, "a") == "new_2a"
+
+    def test_merge_with_empty_new_data(self, mock_config, mock_bq_client):
+        """A zero-row new table leaves the row count of the existing data at two."""
+        source = _create_adapter(mock_config, mock_bq_client)
+
+        stored = pa.table({"id": [1, 2], "val": ["a", "b"]})
+        no_rows = self._empty_id_val_table()
+
+        result = source._merge_arrow_tables(stored, no_rows, primary_key=["id"])
+        assert result.num_rows == 2
+
+    def test_merge_with_empty_existing(self, mock_config, mock_bq_client):
+        """A zero-row existing table yields a result with the two new rows."""
+        source = _create_adapter(mock_config, mock_bq_client)
+
+        no_rows = self._empty_id_val_table()
+        incoming = pa.table({"id": [1, 2], "val": ["a", "b"]})
+
+        result = source._merge_arrow_tables(no_rows, incoming, primary_key=["id"])
+        assert result.num_rows == 2
+
+
+# ---------------------------------------------------------------------------
+# Additional edge cases
+# ---------------------------------------------------------------------------
+
+class TestMetadataCacheClearing:
+    """Interaction between sync_table and the client's metadata cache."""
+
+    def test_clears_metadata_cache_before_sync(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """A cache entry seeded for the table's id is gone once sync_table has run."""
+        cfg = _make_table_config()
+        target_file = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target_file
+        mock_bq_client.read_table.return_value = _sample_arrow_table([1], ["A"])
+
+        # Seed an entry keyed by the table id so its removal can be observed.
+        cache = mock_bq_client.metadata_cache
+        cache[cfg.id] = {"some": "cached_data"}
+
+        source = _create_adapter(mock_config, mock_bq_client)
+        source.sync_table(cfg, sync_state)
+
+        assert cfg.id not in mock_bq_client.metadata_cache
+
+
+class TestSyncStateUpdate:
+    """What sync_table records in the sync state on success and on failure."""
+
+    def test_sync_state_updated_after_success(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """A successful two-row sync stores row count, strategy, table name and a last_sync entry."""
+        cfg = _make_table_config()
+        target_file = tmp_parquet_dir / "orders.parquet"
+        mock_config.get_parquet_path.return_value = target_file
+        mock_bq_client.read_table.return_value = _sample_arrow_table([1, 2], ["A", "B"])
+
+        source = _create_adapter(mock_config, mock_bq_client)
+        source.sync_table(cfg, sync_state)
+
+        recorded = sync_state.get_table_state(cfg.id)
+        for key, expected in (
+            ("rows", 2),
+            ("strategy", "full_refresh"),
+            ("table_name", "orders"),
+        ):
+            assert recorded[key] == expected
+        assert "last_sync" in recorded
+
+    def test_sync_state_not_updated_on_failure(
+        self, mock_config, mock_bq_client, tmp_parquet_dir, sync_state
+    ):
+        """When the read raises, the table's sync state stays an empty dict."""
+        cfg = _make_table_config()
+        mock_config.get_parquet_path.return_value = tmp_parquet_dir / "orders.parquet"
+        # Make the mocked read blow up to drive the failure path.
+        mock_bq_client.read_table.side_effect = RuntimeError("boom")
+
+        source = _create_adapter(mock_config, mock_bq_client)
+        source.sync_table(cfg, sync_state)
+
+        assert sync_state.get_table_state(cfg.id) == {}
+
+
+class TestCreateDataSourceFactory:
+    """Module-level factory exposed by the BigQuery adapter."""
+
+    def test_factory_returns_adapter_instance(self, mock_config, mock_bq_client):
+        """With config lookup and client creation patched, the factory yields a BigQueryDataSource."""
+        config_patch = patch(
+            "connectors.bigquery.adapter.get_config", return_value=mock_config
+        )
+        client_patch = patch(
+            "connectors.bigquery.adapter.create_bq_client", return_value=mock_bq_client
+        )
+        with config_patch, client_patch:
+            from connectors.bigquery.adapter import BigQueryDataSource, create_data_source
+
+            assert isinstance(create_data_source(), BigQueryDataSource)
--- a/tests/test_bigquery_client.py
+++ b/tests/test_bigquery_client.py
@ -0,0 +1,870 @@
+"""Tests for the BigQuery client connector.
+
+All external dependencies (google.cloud.bigquery, src.config) are mocked.
+Tests cover initialization, metadata caching, schema building, query methods,
+and connection testing.
+"""
+
+import json
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+from unittest.mock import MagicMock, mock_open, patch
+
+import pyarrow as pa
+import pytest
+
+# Pre-populate sys.modules with a mock google.cloud.bigquery if not installed,
+# so the client module can be imported without the real SDK.
+# This has to run before the `connectors.bigquery.client` import further down.
+# _bq_mock_installed records whether the stub path below was taken.
+_bq_mock_installed = False
+try:
+    from google.cloud import bigquery as _bq_test  # noqa: F401
+except ImportError:
+    _bq_mock_installed = True
+    _mock_bigquery = MagicMock()
+    # Expose commonly used classes as MagicMock so the client module
+    # can reference bigquery.Client, bigquery.QueryJobConfig, etc.
+    # setdefault only registers a stub when no module of that name is already
+    # present, so any real `google` / `google.cloud` package is left untouched.
+    # NOTE(review): when `google.cloud` itself is a MagicMock, the later
+    # `from google.cloud import bigquery` may resolve to that mock's
+    # auto-created `bigquery` attribute rather than `_mock_bigquery` - confirm
+    # before configuring `_mock_bigquery` directly in a test.
+    sys.modules.setdefault("google", MagicMock())
+    sys.modules.setdefault("google.cloud", MagicMock())
+    sys.modules.setdefault("google.cloud.bigquery", _mock_bigquery)
+
+from connectors.bigquery.client import (
+    BIGQUERY_TO_PYARROW_TYPES,
+    BigQueryClient,
+    create_client,
+)
+
+# Import the real or mock bigquery reference used in the client module
+from google.cloud import bigquery
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_bq_field(name: str, field_type: str, description: str = None):
+    """Build a MagicMock standing in for a BigQuery SchemaField.
+
+    Args:
+        name: Value exposed as the mock's ``name`` attribute.
+        field_type: Value exposed as ``field_type`` (e.g. ``"STRING"``).
+        description: Value exposed as ``description``; ``None`` by default.
+
+    Returns:
+        A MagicMock with the three attributes above set.
+    """
+    stub = MagicMock()
+    # ``name`` is set after construction: passing it to the MagicMock
+    # constructor would name the mock itself instead of adding an attribute.
+    stub.configure_mock(name=name, field_type=field_type, description=description)
+    return stub
+
+
+def _make_table_ref(
+    table_id: str = "my-project.my_dataset.my_table",
+    schema=None,
+    num_rows: int = 1000,
+    num_bytes: int = 50000,
+    created: datetime = None,
+    modified: datetime = None,
+    time_partitioning=None,
+):
+    """Create a mock BigQuery Table reference object.
+
+    The dotted ``table_id`` is parsed from the right: the last component is
+    the table name, the one before it the dataset, and everything before that
+    the project. Missing components fall back to ``"dataset"`` and
+    ``"project"``, so a two-part ``dataset.table`` id no longer has its
+    dataset reported as the project.
+
+    Args:
+        table_id: ``project.dataset.table``, ``dataset.table`` or bare ``table``.
+        schema: List of (mock) schema fields; an empty list when omitted.
+        num_rows: Value exposed as ``num_rows``.
+        num_bytes: Value exposed as ``num_bytes``.
+        created: Creation time; defaults to 2025-01-01 12:00:00.
+        modified: Modification time; defaults to 2025-06-01 12:00:00.
+        time_partitioning: Value exposed as ``time_partitioning`` (may be None).
+
+    Returns:
+        A MagicMock carrying the attributes listed above.
+    """
+    parts = table_id.split(".")
+
+    table_ref = MagicMock()
+    table_ref.table_id = parts[-1]
+    # Index from the right so shorter ids fill table first, then dataset.
+    table_ref.dataset_id = parts[-2] if len(parts) >= 2 else "dataset"
+    table_ref.project = ".".join(parts[:-2]) if len(parts) >= 3 else "project"
+    table_ref.schema = schema or []
+    table_ref.num_rows = num_rows
+    table_ref.num_bytes = num_bytes
+    table_ref.created = created or datetime(2025, 1, 1, 12, 0, 0)
+    table_ref.modified = modified or datetime(2025, 6, 1, 12, 0, 0)
+    table_ref.time_partitioning = time_partitioning
+    return table_ref
+
+
+@pytest.fixture
+def mock_config(tmp_path):
+    """Provide a stand-in config whose metadata path is a fresh directory under tmp_path.
+
+    The ``metadata`` directory is created up front and returned by the
+    stub's ``get_metadata_path()``.
+    """
+    metadata_dir = tmp_path / "metadata"
+    metadata_dir.mkdir(parents=True, exist_ok=True)
+
+    stub = MagicMock()
+    stub.get_metadata_path.return_value = metadata_dir
+    return stub
+
+
+@pytest.fixture
+def mock_bq_client():
+    """Provide a bare MagicMock standing in for the BigQuery SDK client."""
+    sdk_client = MagicMock()
+    return sdk_client
+
+
+@pytest.fixture
+def client(mock_config, mock_bq_client):
+    """Build a BigQueryClient with the SDK client class, config lookup and project env var stubbed.
+
+    The patches are only active while the constructor runs; the instance is
+    handed to the test after they have been undone.
+    """
+    env = {"BIGQUERY_PROJECT": "test-project"}
+    with (
+        patch("connectors.bigquery.client.bigquery.Client", return_value=mock_bq_client),
+        patch("connectors.bigquery.client.get_config", return_value=mock_config),
+        patch.dict("os.environ", env),
+    ):
+        return BigQueryClient()
+
+
+# ---------------------------------------------------------------------------
+# 1. Init validates BIGQUERY_PROJECT env var
+# ---------------------------------------------------------------------------
+
+class TestInit:
+    """Project-id resolution performed by the BigQueryClient constructor."""
+
+    def test_raises_value_error_when_project_not_set(self, mock_config):
+        """No project_id argument plus an emptied environment raises ValueError."""
+        with (
+            patch("connectors.bigquery.client.bigquery.Client"),
+            patch("connectors.bigquery.client.get_config", return_value=mock_config),
+            patch.dict("os.environ", {}, clear=True),
+            pytest.raises(ValueError, match="BigQuery project ID not set"),
+        ):
+            BigQueryClient()
+
+    def test_raises_value_error_when_project_empty_string(self, mock_config):
+        """An empty-string BIGQUERY_PROJECT is rejected with the same ValueError."""
+        with (
+            patch("connectors.bigquery.client.bigquery.Client"),
+            patch("connectors.bigquery.client.get_config", return_value=mock_config),
+            patch.dict("os.environ", {"BIGQUERY_PROJECT": ""}, clear=True),
+            pytest.raises(ValueError, match="BigQuery project ID not set"),
+        ):
+            BigQueryClient()
+
+    # -------------------------------------------------------------------
+    # 2. Init creates client with correct project_id
+    # -------------------------------------------------------------------
+
+    def test_creates_client_with_env_project_id(self, mock_config):
+        """Without an explicit argument, the project comes from BIGQUERY_PROJECT."""
+        sdk_client = MagicMock()
+        env = {"BIGQUERY_PROJECT": "env-project-123"}
+        with (
+            patch("connectors.bigquery.client.bigquery.Client", return_value=sdk_client) as client_cls,
+            patch("connectors.bigquery.client.get_config", return_value=mock_config),
+            patch.dict("os.environ", env),
+        ):
+            instance = BigQueryClient()
+            client_cls.assert_called_once_with(project="env-project-123")
+            assert instance.project_id == "env-project-123"
+
+    def test_creates_client_with_explicit_project_id(self, mock_config):
+        """A project_id passed to the constructor is forwarded to the SDK client and stored."""
+        sdk_client = MagicMock()
+        with (
+            patch("connectors.bigquery.client.bigquery.Client", return_value=sdk_client) as client_cls,
+            patch("connectors.bigquery.client.get_config", return_value=mock_config),
+        ):
+            instance = BigQueryClient(project_id="explicit-project")
+            client_cls.assert_called_once_with(project="explicit-project")
+            assert instance.project_id == "explicit-project"
+
+
+# ---------------------------------------------------------------------------
+# 3. get_table_metadata fetches and caches metadata correctly
+# ---------------------------------------------------------------------------
+
+class TestGetTableMetadata:
+    """get_table_metadata: fetching from BigQuery, result shape, and cache behaviour."""
+
+    @staticmethod
+    def _seed_cache(client, table_id, columns, column_types, cached_at):
+        """Place a metadata entry for ``table_id`` directly into the in-memory cache."""
+        client.metadata_cache[table_id] = {
+            "table_id": table_id,
+            "columns": columns,
+            "column_types": column_types,
+            "_cached_at": cached_at,
+        }
+
+    def test_fetches_metadata_from_bigquery(self, client, mock_bq_client):
+        """A cache-bypassing call hits get_table once and returns the expected metadata dict."""
+        table_id = "proj.dataset.orders"
+        fields = [
+            _make_bq_field("order_id", "INTEGER"),
+            _make_bq_field("customer_name", "STRING", description="Full name"),
+            _make_bq_field("created_at", "TIMESTAMP"),
+        ]
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id,
+            schema=fields,
+            num_rows=5000,
+            num_bytes=120000,
+        )
+
+        meta = client.get_table_metadata(table_id, use_cache=False)
+
+        mock_bq_client.get_table.assert_called_once_with(table_id)
+
+        # Table identity.
+        assert meta["table_id"] == table_id
+        assert meta["name"] == "orders"
+        assert meta["dataset"] == "dataset"
+        assert meta["project"] == "proj"
+
+        # Column listing, types and descriptions.
+        assert meta["columns"] == ["order_id", "customer_name", "created_at"]
+        for column, bq_type in (
+            ("order_id", "INTEGER"),
+            ("customer_name", "STRING"),
+            ("created_at", "TIMESTAMP"),
+        ):
+            assert meta["column_types"][column] == bq_type
+        descriptions = meta["column_descriptions"]
+        assert descriptions["customer_name"] == "Full name"
+        assert "order_id" not in descriptions
+
+        # Size statistics and cache timestamp.
+        assert meta["row_count"] == 5000
+        assert meta["size_bytes"] == 120000
+        assert "_cached_at" in meta
+
+    def test_caches_metadata_in_memory(self, client, mock_bq_client):
+        """A fetch leaves an entry for the table in client.metadata_cache."""
+        table_id = "proj.dataset.tbl"
+        mock_bq_client.get_table.return_value = _make_table_ref(table_id=table_id)
+
+        client.get_table_metadata(table_id, use_cache=False)
+
+        cache = client.metadata_cache
+        assert table_id in cache
+        assert cache[table_id]["table_id"] == table_id
+
+    def test_captures_partitioning_info(self, client, mock_bq_client):
+        """type_, field and expiration_ms of time_partitioning appear under 'partitioning'."""
+        table_id = "proj.dataset.events"
+        partitioning = MagicMock(
+            type_="DAY",
+            field="event_date",
+            expiration_ms=7776000000,
+        )
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id, time_partitioning=partitioning
+        )
+
+        meta = client.get_table_metadata(table_id, use_cache=False)
+
+        info = meta["partitioning"]
+        assert info is not None
+        assert info["type"] == "DAY"
+        assert info["field"] == "event_date"
+        assert info["expiration_ms"] == 7776000000
+
+    def test_no_partitioning_when_absent(self, client, mock_bq_client):
+        """A table whose time_partitioning is None reports 'partitioning' as None."""
+        table_id = "proj.dataset.simple"
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id, time_partitioning=None
+        )
+
+        meta = client.get_table_metadata(table_id, use_cache=False)
+
+        assert meta["partitioning"] is None
+
+    # -------------------------------------------------------------------
+    # 4. get_table_metadata uses cache when available (within TTL)
+    # -------------------------------------------------------------------
+
+    def test_uses_cache_within_ttl(self, client, mock_bq_client):
+        """An entry cached just now is served without calling get_table."""
+        table_id = "proj.dataset.cached_tbl"
+        self._seed_cache(
+            client,
+            table_id,
+            columns=["a", "b"],
+            column_types={"a": "STRING", "b": "INTEGER"},
+            cached_at=datetime.now().isoformat(),
+        )
+
+        served = client.get_table_metadata(table_id, use_cache=True, cache_ttl_hours=24)
+
+        mock_bq_client.get_table.assert_not_called()
+        assert served["table_id"] == table_id
+        assert served["columns"] == ["a", "b"]
+
+    def test_refetches_when_cache_expired(self, client, mock_bq_client):
+        """A 48-hour-old entry with a 24-hour TTL triggers a fresh get_table call."""
+        table_id = "proj.dataset.stale_tbl"
+        stale_stamp = (datetime.now() - timedelta(hours=48)).isoformat()
+        self._seed_cache(
+            client,
+            table_id,
+            columns=["old_col"],
+            column_types={"old_col": "STRING"},
+            cached_at=stale_stamp,
+        )
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id,
+            schema=[_make_bq_field("new_col", "INTEGER")],
+        )
+
+        served = client.get_table_metadata(table_id, use_cache=True, cache_ttl_hours=24)
+
+        mock_bq_client.get_table.assert_called_once_with(table_id)
+        assert served["columns"] == ["new_col"]
+
+    def test_bypasses_cache_when_use_cache_false(self, client, mock_bq_client):
+        """use_cache=False fetches from BigQuery even though a fresh entry is cached."""
+        table_id = "proj.dataset.force_fetch"
+        self._seed_cache(
+            client,
+            table_id,
+            columns=["cached"],
+            column_types={"cached": "STRING"},
+            cached_at=datetime.now().isoformat(),
+        )
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id,
+            schema=[_make_bq_field("fresh", "INTEGER")],
+        )
+
+        served = client.get_table_metadata(table_id, use_cache=False)
+
+        mock_bq_client.get_table.assert_called_once()
+        assert served["columns"] == ["fresh"]
+
+
+# ---------------------------------------------------------------------------
+# 5. get_pyarrow_schema builds correct schema from BQ types
+# ---------------------------------------------------------------------------
+
+class TestGetPyarrowSchema:
+    """get_pyarrow_schema: translation of BigQuery column types into a PyArrow schema."""
+
+    def test_builds_correct_schema(self, client, mock_bq_client):
+        """Each listed BigQuery type maps to the expected PyArrow type."""
+        table_id = "proj.dataset.typed_tbl"
+        # (column name, BigQuery type, expected PyArrow type)
+        cases = [
+            ("id", "INT64", pa.int64()),
+            ("name", "STRING", pa.string()),
+            ("price", "FLOAT64", pa.float64()),
+            ("active", "BOOLEAN", pa.bool_()),
+            ("created", "DATE", pa.date32()),
+            ("updated_at", "TIMESTAMP", pa.timestamp("us", tz="UTC")),
+        ]
+        fields = [_make_bq_field(column, bq_type) for column, bq_type, _ in cases]
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id, schema=fields
+        )
+
+        arrow_schema = client.get_pyarrow_schema(table_id)
+
+        assert arrow_schema is not None
+        for column, _, expected_type in cases:
+            assert arrow_schema.field(column).type == expected_type
+
+    def test_returns_none_when_no_column_types(self, client):
+        """A cached entry with an empty column_types mapping yields None."""
+        table_id = "proj.dataset.empty_schema"
+        cached_entry = {
+            "table_id": table_id,
+            "columns": [],
+            "column_types": {},
+            "_cached_at": datetime.now().isoformat(),
+        }
+        client.metadata_cache[table_id] = cached_entry
+
+        assert client.get_pyarrow_schema(table_id) is None
+
+    def test_unknown_type_falls_back_to_string(self, client, mock_bq_client):
+        """A BigQuery type name with no known mapping is represented as pa.string()."""
+        table_id = "proj.dataset.exotic_types"
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id,
+            schema=[_make_bq_field("exotic_col", "SOME_UNKNOWN_TYPE")],
+        )
+
+        arrow_schema = client.get_pyarrow_schema(table_id)
+
+        assert arrow_schema.field("exotic_col").type == pa.string()
+
+
+# ---------------------------------------------------------------------------
+# 6. get_date_columns returns only DATE columns
+# ---------------------------------------------------------------------------
+
+class TestGetDateColumns:
+    """get_date_columns: selection of columns whose BigQuery type is DATE."""
+
+    def test_returns_only_date_columns(self, client, mock_bq_client):
+        """TIMESTAMP, DATETIME and STRING columns are left out; both DATE columns are returned."""
+        table_id = "proj.dataset.mixed_dates"
+        typed_columns = [
+            ("event_date", "DATE"),
+            ("created_at", "TIMESTAMP"),
+            ("name", "STRING"),
+            ("birth_date", "DATE"),
+            ("updated_ts", "DATETIME"),
+        ]
+        fields = [_make_bq_field(column, bq_type) for column, bq_type in typed_columns]
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id, schema=fields
+        )
+
+        found = client.get_date_columns(table_id)
+
+        # Compare sorted so the check does not depend on the returned order.
+        assert sorted(found) == ["birth_date", "event_date"]
+
+    def test_returns_empty_when_no_date_columns(self, client, mock_bq_client):
+        """A table with only INTEGER and TIMESTAMP columns yields an empty list."""
+        table_id = "proj.dataset.no_dates"
+        fields = [
+            _make_bq_field("id", "INTEGER"),
+            _make_bq_field("ts", "TIMESTAMP"),
+        ]
+        mock_bq_client.get_table.return_value = _make_table_ref(
+            table_id=table_id, schema=fields
+        )
+
+        assert client.get_date_columns(table_id) == []
+
+
+# ---------------------------------------------------------------------------
+# 7. query_to_arrow executes SQL and returns PyArrow table
+# ---------------------------------------------------------------------------
+
+class TestQueryToArrow:
+    """query_to_arrow: SQL pass-through and query-parameter handling."""
+
+    @staticmethod
+    def _stub_job(mock_bq_client, arrow_table):
+        """Make ``mock_bq_client.query`` return a job whose ``to_arrow()`` yields ``arrow_table``."""
+        job = MagicMock()
+        job.to_arrow.return_value = arrow_table
+        mock_bq_client.query.return_value = job
+
+    def test_executes_query_and_returns_arrow(self, client, mock_bq_client):
+        """The SQL string is the first positional argument to query() and the job's arrow table is returned."""
+        expected = pa.table({"col1": [1, 2, 3]})
+        self._stub_job(mock_bq_client, expected)
+
+        with patch("connectors.bigquery.client.bigquery") as bq_module:
+            bq_module.QueryJobConfig.return_value = MagicMock(query_parameters=None)
+            client.client = mock_bq_client
+
+            result = client.query_to_arrow("SELECT * FROM `proj.dataset.tbl`")
+
+        query_mock = mock_bq_client.query
+        query_mock.assert_called_once()
+        assert query_mock.call_args[0][0] == "SELECT * FROM `proj.dataset.tbl`"
+        assert result.equals(expected)
+
+    def test_passes_query_parameters(self, client, mock_bq_client):
+        """Supplied params end up as query_parameters on the job config."""
+        self._stub_job(mock_bq_client, pa.table({"col1": [10]}))
+
+        job_config = MagicMock()
+        params = [MagicMock()]  # Mock ScalarQueryParameter
+
+        with patch("connectors.bigquery.client.bigquery") as bq_module:
+            bq_module.QueryJobConfig.return_value = job_config
+            client.client = mock_bq_client
+
+            client.query_to_arrow("SELECT 1 WHERE x > @val", params=params)
+
+        # The config object handed out by QueryJobConfig() must now carry the params.
+        assert job_config.query_parameters == params
+
+    def test_no_params_does_not_set_query_parameters(self, client, mock_bq_client):
+        """Without params, the job config has no truthy query_parameters attribute."""
+        self._stub_job(mock_bq_client, pa.table({"x": [1]}))
+
+        # spec=[] gives the mock no attributes of its own, so reading
+        # query_parameters only works if something assigned it.
+        job_config = MagicMock(spec=[])
+        with patch("connectors.bigquery.client.bigquery") as bq_module:
+            bq_module.QueryJobConfig.return_value = job_config
+            client.client = mock_bq_client
+
+            client.query_to_arrow("SELECT 1")
+
+        # Either the attribute is absent or it holds a falsy value.
+        assert not getattr(job_config, "query_parameters", None)
+
+
+# ---------------------------------------------------------------------------
+# 8. read_table builds correct SQL query
+# ---------------------------------------------------------------------------
+
+class TestReadTable:
+    """read_table: shape of the SQL text handed to the BigQuery client."""
+
+    @staticmethod
+    def _stub_single_row(mock_bq_client, column):
+        """Make ``query()`` return a job whose arrow result is one row in ``column``."""
+        job = MagicMock()
+        job.to_arrow.return_value = pa.table({column: [1]})
+        mock_bq_client.query.return_value = job
+
+    @staticmethod
+    def _issued_sql(mock_bq_client):
+        """Return the first positional argument of the most recent ``query()`` call."""
+        return mock_bq_client.query.call_args[0][0]
+
+    def test_full_table_select_all(self, client, mock_bq_client):
+        """No columns and no filter: SELECT * on the backtick-quoted table, no WHERE."""
+        self._stub_single_row(mock_bq_client, "a")
+
+        client.read_table("proj.dataset.tbl")
+
+        sql = self._issued_sql(mock_bq_client)
+        assert "SELECT *" in sql
+        assert "`proj.dataset.tbl`" in sql
+        assert "WHERE" not in sql
+
+    def test_select_specific_columns(self, client, mock_bq_client):
+        """A columns list is rendered as backtick-quoted names and no star."""
+        self._stub_single_row(mock_bq_client, "a")
+
+        client.read_table("proj.dataset.tbl", columns=["col_a", "col_b"])
+
+        sql = self._issued_sql(mock_bq_client)
+        for quoted in ("`col_a`", "`col_b`"):
+            assert quoted in sql
+        assert "*" not in sql
+
+    def test_with_row_filter(self, client, mock_bq_client):
+        """row_filter text is appended verbatim after WHERE."""
+        self._stub_single_row(mock_bq_client, "a")
+
+        client.read_table("proj.dataset.tbl", row_filter="status = 'active'")
+
+        assert "WHERE status = 'active'" in self._issued_sql(mock_bq_client)
+
+    def test_columns_and_filter_combined(self, client, mock_bq_client):
+        """Columns and filter together: quoted column list, WHERE clause and quoted table."""
+        self._stub_single_row(mock_bq_client, "x")
+
+        client.read_table(
+            "proj.dataset.tbl",
+            columns=["id", "name"],
+            row_filter="id > 100",
+        )
+
+        sql = self._issued_sql(mock_bq_client)
+        assert "`id`, `name`" in sql
+        assert "WHERE id > 100" in sql
+        assert "`proj.dataset.tbl`" in sql
+
+
+# ---------------------------------------------------------------------------
+# 9. read_table_incremental builds parameterized WHERE clause
+# ---------------------------------------------------------------------------
+
+class TestReadTableIncremental:
+    """read_table_incremental: parameterized ``col > @since_value`` queries."""
+
+    @staticmethod
+    def _stub_single_row(mock_bq_client):
+        """Make ``query()`` return a job whose arrow result is a one-row table."""
+        job = MagicMock()
+        job.to_arrow.return_value = pa.table({"a": [1]})
+        mock_bq_client.query.return_value = job
+
+    def test_incremental_query_structure(self, client, mock_bq_client):
+        """SELECT * with a ``> @since_value`` filter and a matching TIMESTAMP parameter."""
+        self._stub_single_row(mock_bq_client)
+
+        with patch("connectors.bigquery.client.bigquery") as bq_module:
+            bq_module.QueryJobConfig.return_value = MagicMock()
+            since_param = MagicMock()
+            bq_module.ScalarQueryParameter.return_value = since_param
+            # Re-assign the client's bq client (the fixture already set it up)
+            client.client = mock_bq_client
+
+            client.read_table_incremental(
+                table_id="proj.dataset.events",
+                incremental_column="updated_at",
+                since_value="2025-01-01T00:00:00Z",
+            )
+
+            sql = mock_bq_client.query.call_args[0][0]
+            for fragment in (
+                "SELECT *",
+                "`proj.dataset.events`",
+                "`updated_at` > @since_value",
+            ):
+                assert fragment in sql
+
+            # The since value travels as a named TIMESTAMP parameter, not inline SQL.
+            bq_module.ScalarQueryParameter.assert_called_once_with(
+                "since_value", "TIMESTAMP", "2025-01-01T00:00:00Z"
+            )
+
+    def test_incremental_with_columns(self, client, mock_bq_client):
+        """A columns list replaces the star with backtick-quoted column names."""
+        self._stub_single_row(mock_bq_client)
+
+        with patch("connectors.bigquery.client.bigquery") as bq_module:
+            bq_module.QueryJobConfig.return_value = MagicMock()
+            bq_module.ScalarQueryParameter.return_value = MagicMock()
+            client.client = mock_bq_client
+
+            client.read_table_incremental(
+                table_id="proj.dataset.events",
+                incremental_column="updated_at",
+                since_value="2025-01-01T00:00:00Z",
+                columns=["id", "name"],
+            )
+
+            sql = mock_bq_client.query.call_args[0][0]
+            assert "`id`, `name`" in sql
+            assert "*" not in sql
+
+
+# ---------------------------------------------------------------------------
+# 10. read_table_partitioned builds correct range query
+# ---------------------------------------------------------------------------
+
+class TestReadTablePartitioned:
+    """Checks the SQL text and query parameters built by ``read_table_partitioned``."""
+
+    @staticmethod
+    def _run_partitioned(client, mock_bq_client, **call_kwargs):
+        """Call ``read_table_partitioned`` against stubs and report what it issued.
+
+        Returns a ``(sql, param_ctor)`` pair: the first positional argument passed
+        to ``query`` and the patched ``ScalarQueryParameter`` mock.
+        """
+        arrow_job = MagicMock()
+        arrow_job.to_arrow.return_value = pa.table({"a": [1]})
+        mock_bq_client.query.return_value = arrow_job
+
+        with patch("connectors.bigquery.client.bigquery") as bq_stub:
+            bq_stub.QueryJobConfig.return_value = MagicMock()
+            bq_stub.ScalarQueryParameter.return_value = MagicMock()
+            client.client = mock_bq_client
+            client.read_table_partitioned(
+                table_id="proj.dataset.events",
+                partition_column="event_date",
+                **call_kwargs,
+            )
+
+        # Mocks keep their call history after the patch context has exited.
+        return mock_bq_client.query.call_args.args[0], bq_stub.ScalarQueryParameter
+
+    def test_partitioned_start_only(self, client, mock_bq_client):
+        """Only ``start`` given: lower bound present, no end clause, one parameter."""
+        sql, param_ctor = self._run_partitioned(
+            client, mock_bq_client, start="2025-01-01"
+        )
+
+        assert "`event_date` >= @start_value" in sql
+        assert "@end_value" not in sql
+
+        # Exactly one parameter object was built, and it is the start bound.
+        assert param_ctor.call_count == 1
+        param_ctor.assert_called_with("start_value", "TIMESTAMP", "2025-01-01")
+
+    def test_partitioned_start_and_end(self, client, mock_bq_client):
+        """``start`` and ``end`` given: both bounds appear, two parameters in order."""
+        sql, param_ctor = self._run_partitioned(
+            client, mock_bq_client, start="2025-01-01", end="2025-06-01"
+        )
+
+        assert "`event_date` >= @start_value" in sql
+        assert "`event_date` < @end_value" in sql
+
+        # Start parameter is constructed first, end parameter second.
+        assert param_ctor.call_count == 2
+        start_call, end_call = param_ctor.call_args_list
+        assert start_call.args == ("start_value", "TIMESTAMP", "2025-01-01")
+        assert end_call.args == ("end_value", "TIMESTAMP", "2025-06-01")
+
+    def test_partitioned_with_columns(self, client, mock_bq_client):
+        """An explicit ``columns`` list yields a backticked projection and no ``*``."""
+        sql, _ = self._run_partitioned(
+            client,
+            mock_bq_client,
+            start="2025-01-01",
+            columns=["id", "event_date", "value"],
+        )
+
+        assert "`id`, `event_date`, `value`" in sql
+        assert "*" not in sql
+
+
+# ---------------------------------------------------------------------------
+# 11. test_connection returns True on success, False on failure
+# ---------------------------------------------------------------------------
+
+class TestTestConnection:
+    """``test_connection`` reports the probe outcome as a boolean instead of raising."""
+
+    def test_returns_true_on_success(self, client, mock_bq_client):
+        """A ``SELECT 1`` probe whose result is available yields ``True``."""
+        probe_job = MagicMock(**{"result.return_value": iter([(1,)])})
+        mock_bq_client.query.return_value = probe_job
+
+        outcome = client.test_connection()
+
+        assert outcome is True
+        mock_bq_client.query.assert_called_once_with("SELECT 1")
+
+    def test_returns_false_on_failure(self, client, mock_bq_client):
+        """An exception raised by ``query`` itself is reported as ``False``."""
+        mock_bq_client.query.side_effect = Exception("Connection refused")
+
+        outcome = client.test_connection()
+
+        assert outcome is False
+
+    def test_returns_false_when_result_fails(self, client, mock_bq_client):
+        """An exception raised while fetching the job result is reported as ``False``."""
+        failing_job = MagicMock(**{"result.side_effect": Exception("Timeout")})
+        mock_bq_client.query.return_value = failing_job
+
+        outcome = client.test_connection()
+
+        assert outcome is False
+
+
+# ---------------------------------------------------------------------------
+# 12. Type mapping completeness (all BQ types have PyArrow mapping)
+# ---------------------------------------------------------------------------
+
+class TestTypeMapping:
+    """Sanity checks on the ``BIGQUERY_TO_PYARROW_TYPES`` lookup table."""
+
+    # BigQuery type names that are required to have a mapping entry.
+    EXPECTED_BQ_TYPES = [
+        "STRING",
+        "BYTES",
+        "INTEGER",
+        "INT64",
+        "FLOAT",
+        "FLOAT64",
+        "NUMERIC",
+        "BIGNUMERIC",
+        "BOOLEAN",
+        "BOOL",
+        "TIMESTAMP",
+        "DATE",
+        "TIME",
+        "DATETIME",
+        "GEOGRAPHY",
+        "JSON",
+        "STRUCT",
+        "RECORD",
+        "ARRAY",
+    ]
+
+    def test_all_standard_bq_types_are_mapped(self):
+        """Each name in ``EXPECTED_BQ_TYPES`` is a key of the mapping."""
+        mapped_names = BIGQUERY_TO_PYARROW_TYPES
+        for bq_type in self.EXPECTED_BQ_TYPES:
+            assert bq_type in mapped_names, (
+                f"Missing PyArrow mapping for BQ type: {bq_type}"
+            )
+
+    def test_all_mappings_produce_valid_pyarrow_types(self):
+        """Each value in the mapping is a ``pa.DataType`` instance."""
+        for bq_type in BIGQUERY_TO_PYARROW_TYPES:
+            pa_type = BIGQUERY_TO_PYARROW_TYPES[bq_type]
+            assert isinstance(pa_type, pa.DataType), (
+                f"BQ type {bq_type} maps to non-DataType: {pa_type!r}"
+            )
+
+    def test_integer_types_map_to_int64(self):
+        """``INTEGER`` and ``INT64`` both resolve to ``pa.int64()``."""
+        for name in ("INTEGER", "INT64"):
+            assert BIGQUERY_TO_PYARROW_TYPES[name] == pa.int64()
+
+    def test_float_types_map_to_float64(self):
+        """``FLOAT``, ``FLOAT64``, ``NUMERIC`` and ``BIGNUMERIC`` resolve to ``pa.float64()``."""
+        assert BIGQUERY_TO_PYARROW_TYPES["FLOAT"] == pa.float64()
+        assert BIGQUERY_TO_PYARROW_TYPES["FLOAT64"] == pa.float64()
+        assert BIGQUERY_TO_PYARROW_TYPES["NUMERIC"] == pa.float64()
+        assert BIGQUERY_TO_PYARROW_TYPES["BIGNUMERIC"] == pa.float64()
+
+    def test_boolean_types_map_to_bool(self):
+        """``BOOLEAN`` and ``BOOL`` both resolve to ``pa.bool_()``."""
+        for name in ("BOOLEAN", "BOOL"):
+            assert BIGQUERY_TO_PYARROW_TYPES[name] == pa.bool_()
+
+    def test_date_maps_to_date32(self):
+        """``DATE`` resolves to ``pa.date32()``."""
+        date_type = BIGQUERY_TO_PYARROW_TYPES["DATE"]
+        assert date_type == pa.date32()
+
+    def test_timestamp_has_utc_timezone(self):
+        """``TIMESTAMP`` resolves to a microsecond timestamp tagged with UTC."""
+        assert BIGQUERY_TO_PYARROW_TYPES["TIMESTAMP"] == pa.timestamp("us", tz="UTC")
+
+    def test_datetime_has_no_timezone(self):
+        """``DATETIME`` resolves to a microsecond timestamp with no timezone."""
+        assert BIGQUERY_TO_PYARROW_TYPES["DATETIME"] == pa.timestamp("us")
+
+    def test_complex_types_map_to_string(self):
+        """``STRUCT``, ``RECORD``, ``ARRAY``, ``GEOGRAPHY`` and ``JSON`` resolve to ``pa.string()``."""
+        assert BIGQUERY_TO_PYARROW_TYPES["STRUCT"] == pa.string()
+        assert BIGQUERY_TO_PYARROW_TYPES["RECORD"] == pa.string()
+        assert BIGQUERY_TO_PYARROW_TYPES["ARRAY"] == pa.string()
+        assert BIGQUERY_TO_PYARROW_TYPES["GEOGRAPHY"] == pa.string()
+        assert BIGQUERY_TO_PYARROW_TYPES["JSON"] == pa.string()
+
+
+# ---------------------------------------------------------------------------
+# 13. Metadata cache save/load from disk
+# ---------------------------------------------------------------------------
+
+class TestMetadataCachePersistence:
+    """Exercises writing and reading the on-disk table-metadata cache file."""
+
+    # File name the tests expect inside the configured metadata directory.
+    CACHE_FILENAME = "bq_table_metadata.json"
+
+    @staticmethod
+    def _config_for(metadata_dir):
+        """Return a config stub whose ``get_metadata_path()`` yields *metadata_dir*."""
+        cfg = MagicMock()
+        cfg.get_metadata_path.return_value = metadata_dir
+        return cfg
+
+    @staticmethod
+    def _patchers(cfg, bq_stub):
+        """Build the three patchers every test here enters, in entry order.
+
+        They replace the BigQuery ``Client`` class, the ``get_config`` lookup and
+        the ``BIGQUERY_PROJECT`` environment variable. Nothing is patched until
+        the caller enters them in a ``with`` statement.
+        """
+        return (
+            patch("connectors.bigquery.client.bigquery.Client", return_value=bq_stub),
+            patch("connectors.bigquery.client.get_config", return_value=cfg),
+            patch.dict("os.environ", {"BIGQUERY_PROJECT": "proj"}),
+        )
+
+    def test_save_and_load_cache(self, tmp_path):
+        """Metadata fetched by one client is written to disk and visible to the next."""
+        metadata_dir = tmp_path / "metadata"
+        metadata_dir.mkdir(parents=True, exist_ok=True)
+        cache_file = metadata_dir / self.CACHE_FILENAME
+        cfg = self._config_for(metadata_dir)
+
+        # Writer: a client whose backend returns a one-column table.
+        table_id = "proj.ds.tbl"
+        writer_bq = MagicMock()
+        writer_bq.get_table.return_value = _make_table_ref(
+            table_id=table_id, schema=[_make_bq_field("col1", "STRING")]
+        )
+
+        bq_patch, cfg_patch, env_patch = self._patchers(cfg, writer_bq)
+        with bq_patch, cfg_patch, env_patch:
+            writer = BigQueryClient()
+            writer.get_table_metadata(table_id, use_cache=False)
+
+        # The fetch above must have left a JSON file keyed by table id.
+        assert cache_file.exists()
+        on_disk = json.loads(cache_file.read_text())
+        assert table_id in on_disk
+        assert on_disk[table_id]["columns"] == ["col1"]
+
+        # Reader: a fresh client with an unconfigured backend, same directory.
+        bq_patch, cfg_patch, env_patch = self._patchers(cfg, MagicMock())
+        with bq_patch, cfg_patch, env_patch:
+            reader = BigQueryClient()
+
+        assert table_id in reader.metadata_cache
+        assert reader.metadata_cache[table_id]["columns"] == ["col1"]
+
+    def test_load_handles_corrupt_cache_file(self, tmp_path):
+        """An unparseable cache file leaves the client with an empty cache."""
+        metadata_dir = tmp_path / "metadata"
+        metadata_dir.mkdir(parents=True, exist_ok=True)
+        (metadata_dir / self.CACHE_FILENAME).write_text("{corrupt json!!!")
+
+        cfg = self._config_for(metadata_dir)
+        bq_patch, cfg_patch, env_patch = self._patchers(cfg, MagicMock())
+        with bq_patch, cfg_patch, env_patch:
+            loaded = BigQueryClient()
+
+        assert loaded.metadata_cache == {}
+
+    def test_load_handles_missing_cache_file(self, tmp_path):
+        """With no cache file in the directory the client starts with an empty cache."""
+        metadata_dir = tmp_path / "metadata"
+        metadata_dir.mkdir(parents=True, exist_ok=True)  # directory only, no file
+
+        cfg = self._config_for(metadata_dir)
+        bq_patch, cfg_patch, env_patch = self._patchers(cfg, MagicMock())
+        with bq_patch, cfg_patch, env_patch:
+            loaded = BigQueryClient()
+
+        assert loaded.metadata_cache == {}
+
+    def test_save_creates_parent_directories(self, tmp_path):
+        """Fetching metadata produces the cache file even when its directory is absent."""
+        # Deliberately left uncreated so the client has to make the whole chain.
+        metadata_dir = tmp_path / "deep" / "nested" / "metadata"
+        cfg = self._config_for(metadata_dir)
+
+        table_id = "proj.ds.tbl"
+        bq_stub = MagicMock()
+        bq_stub.get_table.return_value = _make_table_ref(
+            table_id=table_id, schema=[_make_bq_field("x", "INTEGER")]
+        )
+
+        bq_patch, cfg_patch, env_patch = self._patchers(cfg, bq_stub)
+        with bq_patch, cfg_patch, env_patch:
+            fetching_client = BigQueryClient()
+            fetching_client.get_table_metadata(table_id, use_cache=False)
+
+        assert (metadata_dir / self.CACHE_FILENAME).exists()
+
+
+# ---------------------------------------------------------------------------
+# Factory function
+# ---------------------------------------------------------------------------
+
+class TestCreateClient:
+    """Covers the module-level ``create_client`` factory."""
+
+    def test_create_client_returns_bigquery_client(self, mock_config):
+        """The factory builds a ``BigQueryClient`` bound to ``BIGQUERY_PROJECT``."""
+        # Patchers are named first and entered together, in the same order as listed.
+        bq_patch = patch(
+            "connectors.bigquery.client.bigquery.Client", return_value=MagicMock()
+        )
+        cfg_patch = patch(
+            "connectors.bigquery.client.get_config", return_value=mock_config
+        )
+        env_patch = patch.dict("os.environ", {"BIGQUERY_PROJECT": "factory-project"})
+
+        with bq_patch, cfg_patch, env_patch:
+            built = create_client()
+
+        assert isinstance(built, BigQueryClient)
+        assert built.project_id == "factory-project"