"""Tests for BigQuery extractor (remote-only via DuckDB extension).""" import re from pathlib import Path from unittest.mock import MagicMock import duckdb import pytest from connectors.bigquery.extractor import _detect_table_type from tests.helpers.contract import validate_extract_contract @pytest.fixture def output_dir(tmp_path): d = tmp_path / "extracts" / "bigquery" d.mkdir(parents=True) return str(d) @pytest.fixture def sample_configs(): return [ { "id": "project.analytics.orders", "name": "orders", "source_type": "bigquery", "bucket": "analytics", "source_table": "orders", "query_mode": "remote", "description": "Order data from BQ", }, { "id": "project.analytics.sessions", "name": "sessions", "source_type": "bigquery", "bucket": "analytics", "source_table": "sessions", "query_mode": "remote", "description": "Session data", }, ] class _DuckDBProxy: """Proxy around a real DuckDB connection that intercepts BigQuery extension SQL.""" def __init__(self, real_conn): self._real = real_conn def execute(self, sql, *args, **kwargs): sql_upper = sql.strip().upper() if sql_upper.startswith("INSTALL BIGQUERY") or sql_upper.startswith( "LOAD BIGQUERY" ): return MagicMock() if sql_upper.startswith("CREATE SECRET"): return MagicMock() if "ATTACH" in sql_upper and "BIGQUERY" in sql_upper: return MagicMock() if sql_upper.startswith("DETACH BQ"): return MagicMock() # CREATE VIEW referencing bq.* -> create a dummy table instead if "FROM BQ." in sql_upper and "CREATE" in sql_upper: match = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE) if match: view_name = match.group(1) self._real.execute( f'CREATE OR REPLACE TABLE "{view_name}" (dummy INTEGER)' ) return MagicMock() return self._real.execute(sql, *args, **kwargs) def close(self): return self._real.close() def __getattr__(self, name): return getattr(self._real, name) class TestBigQueryExtractor: def test_creates_extract_duckdb_with_meta(self, output_dir, sample_configs, monkeypatch): """Test that init_extract creates extract.duckdb with _meta and _remote_attach.""" from unittest.mock import patch # Mock metadata-token auth + entity type detection so the test runs offline. monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "BASE TABLE", ) def proxy_connect(path=None, **kwargs): real_conn = duckdb.connect(path) return _DuckDBProxy(real_conn) with patch("connectors.bigquery.extractor.duckdb") as mock_mod: mock_mod.connect = proxy_connect from connectors.bigquery.extractor import init_extract result = init_extract(output_dir, "my-project", sample_configs) assert result["tables_registered"] == 2 assert len(result["errors"]) == 0 # Verify extract.duckdb has _meta with correct data conn = duckdb.connect(str(Path(output_dir) / "extract.duckdb")) try: meta = conn.execute( "SELECT table_name, query_mode FROM _meta ORDER BY table_name" ).fetchall() assert len(meta) == 2 assert meta[0][0] == "orders" assert meta[0][1] == "remote" assert meta[1][0] == "sessions" assert meta[1][1] == "remote" # Verify _remote_attach table for orchestrator re-ATTACH ra = conn.execute( "SELECT alias, extension, url, token_env FROM _remote_attach" ).fetchone() assert ra[0] == "bq" assert ra[1] == "bigquery" assert ra[2] == "project=my-project" assert ra[3] == "" # BQ handles auth via env automatically finally: conn.close() validate_extract_contract(str(Path(output_dir) / "extract.duckdb")) def test_no_data_directory_created(self, output_dir, sample_configs): """BigQuery is remote-only -- no data/ directory should exist.""" assert not (Path(output_dir) / "data").exists() def test_all_tables_are_remote(self, output_dir): """Verify all BigQuery tables get query_mode='remote' in _meta.""" db_path = Path(output_dir) / "extract.duckdb" conn = duckdb.connect(str(db_path)) conn.execute("""CREATE TABLE _meta ( table_name VARCHAR, description VARCHAR, rows BIGINT, size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR DEFAULT 'remote' )""") conn.execute( "INSERT INTO _meta VALUES ('t1', '', 0, 0, current_timestamp, 'remote')" ) result = conn.execute("SELECT query_mode FROM _meta").fetchone() assert result[0] == "remote" conn.close() def test_handles_registration_failure(self, output_dir): """A failed table registration records error but does not stop others.""" db_path = Path(output_dir) / "extract.duckdb" conn = duckdb.connect(str(db_path)) conn.execute("""CREATE TABLE _meta ( table_name VARCHAR, description VARCHAR, rows BIGINT, size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR DEFAULT 'remote' )""") from datetime import datetime, timezone now = datetime.now(timezone.utc) # Simulate: first succeeds, second fails (not inserted) conn.execute( "INSERT INTO _meta VALUES ('good_table', '', 0, 0, ?, 'remote')", [now] ) meta = conn.execute("SELECT count(*) FROM _meta").fetchone() assert meta[0] == 1 # Only good_table registered conn.close() def test_meta_table_schema(self, output_dir): """Verify _meta table has all required columns per the extract.duckdb contract.""" from connectors.bigquery.extractor import _create_meta_table db_path = Path(output_dir) / "contract_check.duckdb" conn = duckdb.connect(str(db_path)) _create_meta_table(conn) columns = conn.execute( "SELECT column_name FROM information_schema.columns " "WHERE table_name = '_meta' ORDER BY ordinal_position" ).fetchall() col_names = [c[0] for c in columns] assert col_names == [ "table_name", "description", "rows", "size_bytes", "extracted_at", "query_mode", ] conn.close() class TestDetectTableType: """Detect whether a BQ entity is a base table or a view.""" def test_base_table_returns_table(self): conn = MagicMock() conn.execute.return_value.fetchone.return_value = ("BASE TABLE",) result = _detect_table_type(conn, "proj", "ds", "tbl") assert result == "BASE TABLE" def test_view_returns_view(self): conn = MagicMock() conn.execute.return_value.fetchone.return_value = ("VIEW",) result = _detect_table_type(conn, "proj", "ds", "tbl") assert result == "VIEW" def test_missing_returns_none(self): conn = MagicMock() conn.execute.return_value.fetchone.return_value = None result = _detect_table_type(conn, "proj", "ds", "tbl") assert result is None def test_query_uses_bigquery_query_function(self): """Detection must use bigquery_query() table function (works on views via jobs API).""" conn = MagicMock() conn.execute.return_value.fetchone.return_value = ("VIEW",) _detect_table_type(conn, "my-proj", "my_ds", "my_tbl") # SQL must use the bigquery_query() table function (not direct ref) sql = conn.execute.call_args[0][0] assert "bigquery_query" in sql.lower() # The inner BQ SQL is passed as a parameter, not f-stringed in. # Verify both project and the BQ SQL appear in the bound params. params = conn.execute.call_args[0][1] assert "my-proj" in params, f"expected project in params, got: {params}" # The inner BQ SQL is one of the params; it should reference INFORMATION_SCHEMA.TABLES bq_sql_param = next( (p for p in params if isinstance(p, str) and "INFORMATION_SCHEMA.TABLES" in p), None, ) assert bq_sql_param is not None, f"inner BQ SQL not found in params: {params}" assert "my_ds" in bq_sql_param # dataset is f-stringed into the BQ SQL identifier path # Table name should NOT be inline in the BQ SQL — it goes through the param chain assert "my_tbl" in params, f"table name should be a separate param, got: {params}" class _CapturingProxy: """Wraps a real DuckDB connection, captures all SQL, stubs BQ-specific calls. DuckDBPyConnection.execute is a C-level read-only attribute, so we can't patch the method directly on the connection — we have to wrap with a proxy. """ def __init__(self, real_conn, captured: list): self._real = real_conn self._captured = captured def execute(self, sql, *args, **kwargs): self._captured.append(sql) stripped_u = sql.strip().upper() # Stub only commands that would talk to BQ; CREATE TABLE / INSERT etc. # must pass through to the real DuckDB so _meta + _remote_attach persist. if stripped_u.startswith(("INSTALL ", "LOAD ", "CREATE SECRET")): return MagicMock() if stripped_u.startswith("ATTACH ") and "BIGQUERY" in stripped_u: return MagicMock() if stripped_u.startswith("DETACH "): return MagicMock() if 'FROM bq.' in sql or 'FROM bigquery_query' in sql: return MagicMock() return self._real.execute(sql, *args, **kwargs) def close(self): return self._real.close() def __getattr__(self, name): return getattr(self._real, name) class TestViewVsTableTemplates: """init_extract must pick the right view template based on entity type.""" def test_base_table_uses_direct_attach_ref(self, tmp_path, monkeypatch): """For BASE TABLE, generated DuckDB view references bq.dataset.table directly.""" from connectors.bigquery.extractor import init_extract monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "BASE TABLE", ) captured = [] real_connect = duckdb.connect def spy_connect(*a, **kw): real_conn = real_connect(*a, **kw) return _CapturingProxy(real_conn, captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", spy_connect) init_extract( str(tmp_path), "my-project", [{"name": "orders", "bucket": "my_ds", "source_table": "orders", "description": ""}], ) view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() or 'CREATE VIEW' in s.upper()] assert any('FROM bq."my_ds"."orders"' in s for s in view_sqls), \ f"expected direct bq.dataset.table ref for BASE TABLE; got: {view_sqls}" assert not any("bigquery_query(" in s for s in view_sqls), \ "BASE TABLE should not use bigquery_query() function" def test_view_uses_bigquery_query_function(self, tmp_path, monkeypatch): """For VIEW with legacy_wrap_views=True, generated DuckDB view wraps bigquery_query() (jobs API path).""" from connectors.bigquery.extractor import init_extract monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW", ) captured = [] real_connect = duckdb.connect def spy_connect(*a, **kw): real_conn = real_connect(*a, **kw) return _CapturingProxy(real_conn, captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", spy_connect) # Enable legacy toggle so this test verifies the old wrap-view path still works. monkeypatch.setattr( "connectors.bigquery.extractor.get_value", lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default, ) init_extract( str(tmp_path), "my-project", [{"name": "session_view", "bucket": "my_ds", "source_table": "session_view", "description": ""}], ) view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper() or 'CREATE VIEW' in s.upper()] view_create = next((s for s in view_sqls if '"session_view"' in s), None) assert view_create is not None, f"no CREATE VIEW for session_view; got: {view_sqls}" assert "bigquery_query(" in view_create assert "my-project" in view_create assert "`my-project.my_ds.session_view`" in view_create, \ f"expected backtick-quoted full path; got: {view_create}" class TestRemoteAttachForBQ: """For BQ source, _remote_attach must signal metadata-auth (empty token_env).""" def test_remote_attach_token_env_is_empty_for_bq(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "BASE TABLE", ) captured = [] real_connect = duckdb.connect def spy_connect(*a, **kw): real_conn = real_connect(*a, **kw) return _CapturingProxy(real_conn, captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", spy_connect) init_extract( str(tmp_path), "my-project", [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}], ) c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True) rows = c.execute( "SELECT alias, extension, url, token_env FROM _remote_attach" ).fetchall() c.close() assert len(rows) == 1 alias, extension, url, token_env = rows[0] assert alias == "bq" assert extension == "bigquery" assert url == "project=my-project" assert token_env == "", \ "BQ uses metadata auth — token_env must be empty so orchestrator triggers metadata path" class TestInitExtractAuthFailure: """init_extract must abort cleanly if metadata token fetch fails.""" def test_returns_error_when_metadata_unreachable(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract from connectors.bigquery.auth import BQMetadataAuthError def boom(): raise BQMetadataAuthError("metadata server unreachable: simulated") monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", boom, ) result = init_extract( str(tmp_path), "my-project", [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}], ) # No partial extract.duckdb — auth failure aborts before any DB writes assert not (tmp_path / "extract.duckdb").exists(), \ "extract.duckdb should not be created when auth fails" assert result["tables_registered"] == 0 assert any("metadata" in e.get("error", "").lower() for e in result["errors"]) class TestIdentifierValidation: """init_extract must reject unsafe identifiers before any SQL construction.""" def test_rejects_unsafe_dataset_name(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "BASE TABLE", ) # Stub all DuckDB BQ-extension calls so the test stays offline captured = [] real_connect = duckdb.connect def safe_connect(*a, **kw): return _CapturingProxy(real_connect(*a, **kw), captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) result = init_extract( str(tmp_path), "my-project", [{ "name": "t", "bucket": 'evil"; DROP TABLE foo; --', "source_table": "t", "description": "", }], ) assert result["tables_registered"] == 0 assert any("dataset" in e.get("error", "").lower() for e in result["errors"]) def test_rejects_unsafe_source_table_name(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token", ) monkeypatch.setattr( "connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "BASE TABLE", ) captured = [] real_connect = duckdb.connect def safe_connect(*a, **kw): return _CapturingProxy(real_connect(*a, **kw), captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) result = init_extract( str(tmp_path), "my-project", [{ "name": "t", "bucket": "ds", "source_table": "evil`name", "description": "", }], ) assert result["tables_registered"] == 0 assert any("source_table" in e.get("error", "").lower() for e in result["errors"]) class TestExtractorMainModule: """Standalone `python -m connectors.bigquery.extractor` reads config correctly.""" def test_main_reads_data_source_bigquery_project(self, tmp_path, monkeypatch): """__main__ must read project from data_source.bigquery.project (matches yaml example). Runs the production __main__ block via runpy and captures the project_id passed to init_extract. If __main__ ever regresses to reading config.get("bigquery", {}).get("project_id"), the captured value will be empty and the assertion will fail. """ from unittest.mock import MagicMock captured: dict = {} def fake_init_extract(out, project_id, tables): captured["project"] = project_id captured["tables"] = tables return {"tables_registered": len(tables), "errors": []} # Patch every external dependency the __main__ block touches. # Targets are at the module path the __main__ block imports from, # because runpy re-executes the module under __name__ == "__main__". monkeypatch.setattr( "config.loader.load_instance_config", lambda: { "data_source": { "type": "bigquery", "bigquery": {"project": "my-test-project", "location": "US"}, } }, ) fake_repo = MagicMock() fake_repo.list_by_source.return_value = [ {"name": "t1", "bucket": "ds", "source_table": "t1", "description": ""}, ] monkeypatch.setattr( "src.repositories.table_registry.TableRegistryRepository", lambda c: fake_repo, ) monkeypatch.setattr( "src.db.get_system_db", lambda: MagicMock(close=lambda: None), ) # __main__ looks up init_extract via the cached connectors.bigquery.extractor # module (sys.modules), so patching its attribute survives runpy's reimport. monkeypatch.setattr( "connectors.bigquery.extractor.init_extract", fake_init_extract, ) monkeypatch.setenv("DATA_DIR", str(tmp_path)) import runpy runpy.run_module("connectors.bigquery.extractor", run_name="__main__") assert captured.get("project") == "my-test-project", \ f"expected __main__ to pass project='my-test-project' to init_extract; got {captured!r}" assert captured.get("tables", [{}])[0].get("name") == "t1" def test_main_exits_when_project_missing(self, tmp_path, monkeypatch): """__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing.""" monkeypatch.setattr( "config.loader.load_instance_config", lambda: {"data_source": {"type": "bigquery"}}, # no .bigquery.project ) monkeypatch.setenv("DATA_DIR", str(tmp_path)) import runpy with pytest.raises(SystemExit) as exc_info: runpy.run_module("connectors.bigquery.extractor", run_name="__main__") assert exc_info.value.code == 2 class TestDropWrapViewForBQViews: def test_view_entity_does_not_create_master_view_by_default(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW") # Stub BQ extension calls to avoid hitting real BQ real_connect = duckdb.connect captured = [] def safe_connect(*a, **kw): return _CapturingProxy(real_connect(*a, **kw), captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) # legacy toggle is OFF by default → expect no CREATE VIEW for the BQ view monkeypatch.setattr( "connectors.bigquery.extractor.get_value", lambda *args, default=None, **kw: False if "legacy_wrap_views" in args else default, raising=False, ) init_extract( str(tmp_path), "my-project", [{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}], ) # Confirm extract.duckdb has _meta + _remote_attach but NO master view for myview c = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True) try: views = c.execute( "SELECT view_name FROM duckdb_views() WHERE view_name='myview'" ).fetchall() assert views == [], f"expected no wrap view for VIEW entity by default; got {views}" meta = c.execute("SELECT table_name FROM _meta").fetchall() assert ("myview",) in meta, "_meta must still record the view" finally: c.close() def test_legacy_wrap_views_toggle_restores_old_behavior(self, tmp_path, monkeypatch): from connectors.bigquery.extractor import init_extract monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok") monkeypatch.setattr("connectors.bigquery.extractor._detect_table_type", lambda *a, **kw: "VIEW") real_connect = duckdb.connect captured = [] def safe_connect(*a, **kw): return _CapturingProxy(real_connect(*a, **kw), captured) monkeypatch.setattr("connectors.bigquery.extractor.duckdb.connect", safe_connect) # legacy toggle ON → should still create the wrap view monkeypatch.setattr( "connectors.bigquery.extractor.get_value", lambda *args, default=None, **kw: True if "legacy_wrap_views" in args else default, raising=False, ) init_extract( str(tmp_path), "my-project", [{"name": "myview", "bucket": "ds", "source_table": "myview", "description": ""}], ) # With legacy ON the wrap view SQL should have been emitted view_sqls = [s for s in captured if "CREATE OR REPLACE VIEW" in s.upper()] myview_sqls = [s for s in view_sqls if '"myview"' in s] assert myview_sqls != [], \ f"expected wrap view SQL for VIEW entity when legacy_wrap_views=True; captured={captured}" assert any("bigquery_query(" in s for s in myview_sqls), \ f"legacy wrap view should use bigquery_query(); got: {myview_sqls}" class TestInitExtractProjectIdValidation: """init_extract must reject unsafe project_id before any auth or DB work.""" def test_rejects_unsafe_project_id_with_quote(self, tmp_path): """Project IDs containing SQL metacharacters must be rejected before any token fetch or DuckDB work.""" from connectors.bigquery.extractor import init_extract result = init_extract( str(tmp_path), "evil'; DROP TABLE foo; --", [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}], ) assert result["tables_registered"] == 0 assert any("project_id" in e.get("error", "").lower() for e in result["errors"]), \ f"expected error mentioning project_id; got: {result['errors']}" # No partial extract.duckdb on rejection assert not (tmp_path / "extract.duckdb").exists() def test_rejects_uppercase_project_id(self, tmp_path): """GCP project IDs are lowercase-only.""" from connectors.bigquery.extractor import init_extract result = init_extract( str(tmp_path), "MY-PROJECT", [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}], ) assert result["tables_registered"] == 0 assert any("project_id" in e.get("error", "").lower() for e in result["errors"]) def test_valid_project_id_passes_validation(self, tmp_path, monkeypatch): """A well-formed project_id must pass validation. We stub the metadata fetch to fail right after, which produces a different error shape — that confirms validation didn't reject the project_id itself.""" from connectors.bigquery.extractor import init_extract from connectors.bigquery.auth import BQMetadataAuthError def fail_metadata(): raise BQMetadataAuthError("simulated — beyond validation") monkeypatch.setattr( "connectors.bigquery.extractor.get_metadata_token", fail_metadata, ) result = init_extract( str(tmp_path), "my-valid-project", [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}], ) assert result["tables_registered"] == 0 errors = result["errors"] assert errors, "expected metadata-stub error" assert all("project_id" not in e.get("error", "").lower() for e in errors), \ f"valid project_id should not trip the validator; got: {errors}"