Now that VIEW/MATERIALIZED_VIEW always wrap via bigquery_query() (the prior `legacy_wrap_views=True` branch behavior, made unconditional in the previous commit), the toggle has no semantic meaning and is removed across the codebase.

Production code:
- app/api/admin.py: drop the field from _OPTIONAL_FIELDS["data_source"]["bigquery"]["fields"] and from _BQ_OPTIONAL_FIELD_DEFAULTS, plus the comment block above the defaults dict.
- config/instance.yaml.example: drop the example snippet.
- src/orchestrator.py: update the inner-objects skip-branch comment to reflect the new BQ behavior (the skip itself stays — keboola use_extension=False still inserts _meta rows without inner views).
- app/web/templates/admin_tables.html: rewrite operator copy in the register and edit forms to reflect always-wrap.

Tests:
- tests/test_admin_server_config.py (TestServerConfigBigQueryFields): flip assertions from "field IS present" to "field NOT present" on legacy_wrap_views. Drop the test_post_persists_legacy_wrap_views test since the field no longer exists.
- tests/test_admin_server_config_known_fields.py: same flip on the known-fields registry assertion.
- tests/test_bigquery_extractor.py: drop the obsolete test_view_entity_does_not_create_master_view_by_default (asserted the bug we fixed) and test_legacy_wrap_views_toggle_restores_old_behavior (toggle no longer meaningful). Update remaining test docstrings.

Operators with `legacy_wrap_views: true` set in their overlay get the new (equivalent) behavior automatically — the unrecognized key is silently ignored by the YAML loader. Operators with `false` get the issue-#160 fix as a behavior change, not a regression.

Spec gate updated: the production-code grep gate `grep -rn 'legacy_wrap_views' connectors app src config cli` must return zero. tests/ is excluded — historical "removed in #160" breadcrumbs and `assert "X" not in fields` regression guards are retained as anti-regression signals.
944 lines
37 KiB
Python
944 lines
37 KiB
Python
"""Tests for BigQuery extractor (remote-only via DuckDB extension)."""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
from connectors.bigquery.extractor import _detect_table_type
|
|
from tests.helpers.contract import validate_extract_contract
|
|
|
|
|
|
@pytest.fixture
def output_dir(tmp_path):
    """Create ``extracts/bigquery`` under pytest's ``tmp_path`` and return its path as a string."""
    extract_root = tmp_path.joinpath("extracts", "bigquery")
    extract_root.mkdir(parents=True)
    return str(extract_root)
|
|
|
|
|
|
@pytest.fixture
def sample_configs():
    """Return two remote-mode BigQuery table configs (``orders`` and ``sessions``).

    Both entries live in the ``analytics`` bucket and differ only in table
    name and description.
    """
    descriptions = {
        "orders": "Order data from BQ",
        "sessions": "Session data",
    }
    return [
        {
            "id": f"project.analytics.{table}",
            "name": table,
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": table,
            "query_mode": "remote",
            "description": description,
        }
        for table, description in descriptions.items()
    ]
|
|
|
|
|
|
class _DuckDBProxy:
    """Stand-in for a DuckDB connection that keeps BigQuery-extension SQL offline.

    Statements that would need the BigQuery extension are answered with a
    ``MagicMock``; everything else is forwarded to the wrapped connection.
    """

    def __init__(self, real_conn):
        self._real = real_conn

    def execute(self, sql, *args, **kwargs):
        """Run ``sql`` on the wrapped connection unless it targets BigQuery.

        INSTALL/LOAD BIGQUERY, CREATE SECRET, DETACH BQ and any statement
        mentioning both ATTACH and BIGQUERY are swallowed. A CREATE statement
        selecting ``FROM bq.`` is replaced by a one-column dummy table named
        after the view, so the name still exists in the real database.
        """
        normalized = sql.strip().upper()

        is_extension_setup = normalized.startswith(
            ("INSTALL BIGQUERY", "LOAD BIGQUERY", "CREATE SECRET", "DETACH BQ")
        )
        is_bq_attach = "ATTACH" in normalized and "BIGQUERY" in normalized
        if is_extension_setup or is_bq_attach:
            return MagicMock()

        # CREATE VIEW referencing bq.* -> create a dummy table instead
        if "FROM BQ." in normalized and "CREATE" in normalized:
            found = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE)
            if found is not None:
                self._real.execute(
                    f'CREATE OR REPLACE TABLE "{found.group(1)}" (dummy INTEGER)'
                )
            return MagicMock()

        return self._real.execute(sql, *args, **kwargs)

    def close(self):
        """Close the wrapped connection."""
        return self._real.close()

    def __getattr__(self, name):
        # Anything not defined on the proxy is served by the real connection.
        return getattr(self._real, name)
|
|
|
|
|
|
class TestBigQueryExtractor:
    """End-to-end init_extract checks plus standalone checks of the ``_meta`` layout."""

    # Hand-written copy of the _meta layout used by the schema-only tests below.
    _META_DDL = """CREATE TABLE _meta (
        table_name VARCHAR, description VARCHAR, rows BIGINT,
        size_bytes BIGINT, extracted_at TIMESTAMP,
        query_mode VARCHAR DEFAULT 'remote'
    )"""

    @staticmethod
    def _init_extract_offline(output_dir, configs, monkeypatch):
        """Run ``init_extract`` for ``my-project`` with every BigQuery call stubbed.

        The metadata token fetch and entity-type detection are monkeypatched
        (every entity reports ``BASE TABLE``), and the extractor's ``duckdb``
        module is swapped for a mock whose ``connect`` hands back a
        ``_DuckDBProxy`` around a real connection.

        Returns:
            The result dict produced by ``init_extract``.
        """
        from unittest.mock import patch

        # Mock metadata-token auth + entity type detection so the test runs offline.
        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            lambda: "test-token",
        )
        monkeypatch.setattr(
            "connectors.bigquery.extractor._detect_table_type",
            lambda *a, **kw: "BASE TABLE",
        )

        def proxy_connect(path=None, **kwargs):
            real_conn = duckdb.connect(path)
            return _DuckDBProxy(real_conn)

        with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
            mock_mod.connect = proxy_connect
            from connectors.bigquery.extractor import init_extract

            return init_extract(output_dir, "my-project", configs)

    def test_creates_extract_duckdb_with_meta(self, output_dir, sample_configs, monkeypatch):
        """Test that init_extract creates extract.duckdb with _meta and _remote_attach."""
        result = self._init_extract_offline(output_dir, sample_configs, monkeypatch)

        assert result["tables_registered"] == 2
        assert len(result["errors"]) == 0

        # Verify extract.duckdb has _meta with correct data
        conn = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
        try:
            meta = conn.execute(
                "SELECT table_name, query_mode FROM _meta ORDER BY table_name"
            ).fetchall()
            assert len(meta) == 2
            assert meta[0][0] == "orders"
            assert meta[0][1] == "remote"
            assert meta[1][0] == "sessions"
            assert meta[1][1] == "remote"

            # Verify _remote_attach table for orchestrator re-ATTACH
            ra = conn.execute(
                "SELECT alias, extension, url, token_env FROM _remote_attach"
            ).fetchone()
            assert ra[0] == "bq"
            assert ra[1] == "bigquery"
            assert ra[2] == "project=my-project"
            assert ra[3] == ""  # BQ handles auth via env automatically
        finally:
            conn.close()

        validate_extract_contract(str(Path(output_dir) / "extract.duckdb"))

    def test_no_data_directory_created(self, output_dir, sample_configs, monkeypatch):
        """BigQuery is remote-only -- no data/ directory should exist after init_extract.

        The extractor is actually run first; asserting on an untouched,
        freshly created directory would pass no matter what the extractor does.
        """
        result = self._init_extract_offline(output_dir, sample_configs, monkeypatch)

        # Guard against a silently skipped run making the check below vacuous.
        assert result["tables_registered"] == 2
        assert not (Path(output_dir) / "data").exists()

    def test_all_tables_are_remote(self, output_dir):
        """A row stored in a hand-built ``_meta`` table reads back query_mode='remote'.

        NOTE(review): this exercises only the hand-written DDL above, not the
        extractor; the end-to-end check of query_mode lives in
        test_creates_extract_duckdb_with_meta.
        """
        db_path = Path(output_dir) / "extract.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            conn.execute(self._META_DDL)
            conn.execute(
                "INSERT INTO _meta VALUES ('t1', '', 0, 0, current_timestamp, 'remote')"
            )

            result = conn.execute("SELECT query_mode FROM _meta").fetchone()
            assert result[0] == "remote"
        finally:
            # Close even when an assertion fails so the temp DB file is released.
            conn.close()

    def test_handles_registration_failure(self, output_dir):
        """A failed table registration records error but does not stop others.

        NOTE(review): the failure is only simulated by never inserting the
        second row; the extractor itself is not invoked here.
        """
        from datetime import datetime, timezone

        db_path = Path(output_dir) / "extract.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            conn.execute(self._META_DDL)

            now = datetime.now(timezone.utc)
            # Simulate: first succeeds, second fails (not inserted)
            conn.execute(
                "INSERT INTO _meta VALUES ('good_table', '', 0, 0, ?, 'remote')", [now]
            )

            meta = conn.execute("SELECT count(*) FROM _meta").fetchone()
            assert meta[0] == 1  # Only good_table registered
        finally:
            conn.close()

    def test_meta_table_schema(self, output_dir):
        """Verify _meta table has all required columns per the extract.duckdb contract."""
        from connectors.bigquery.extractor import _create_meta_table

        db_path = Path(output_dir) / "contract_check.duckdb"
        conn = duckdb.connect(str(db_path))
        try:
            _create_meta_table(conn)

            columns = conn.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = '_meta' ORDER BY ordinal_position"
            ).fetchall()
            col_names = [c[0] for c in columns]
            assert col_names == [
                "table_name",
                "description",
                "rows",
                "size_bytes",
                "extracted_at",
                "query_mode",
            ]
        finally:
            conn.close()
|
|
|
|
|
|
class TestDetectTableType:
    """``_detect_table_type`` reports the entity type returned by the metadata query."""

    @staticmethod
    def _conn_returning(row):
        """Build a mock connection whose ``execute(...).fetchone()`` yields ``row``."""
        fake_conn = MagicMock()
        fake_conn.execute.return_value.fetchone.return_value = row
        return fake_conn

    def test_base_table_returns_table(self):
        fake_conn = self._conn_returning(("BASE TABLE",))
        assert _detect_table_type(fake_conn, "proj", "ds", "tbl") == "BASE TABLE"

    def test_view_returns_view(self):
        fake_conn = self._conn_returning(("VIEW",))
        assert _detect_table_type(fake_conn, "proj", "ds", "tbl") == "VIEW"

    def test_missing_returns_none(self):
        fake_conn = self._conn_returning(None)
        assert _detect_table_type(fake_conn, "proj", "ds", "tbl") is None

    def test_query_uses_bigquery_query_function(self):
        """Detection must go through the bigquery_query() table function (works on views via jobs API)."""
        fake_conn = self._conn_returning(("VIEW",))
        _detect_table_type(fake_conn, "my-proj", "my_ds", "my_tbl")

        positional = fake_conn.execute.call_args[0]

        # The outer SQL calls the bigquery_query() table function rather than a direct ref.
        outer_sql = positional[0]
        assert "bigquery_query" in outer_sql.lower()

        # The inner BQ SQL travels as a bound parameter instead of being
        # formatted into the outer statement, so both the project and the
        # BQ SQL must show up among the params.
        params = positional[1]
        assert "my-proj" in params, f"expected project in params, got: {params}"

        # One of the params is the inner BQ SQL, recognisable by INFORMATION_SCHEMA.TABLES.
        inner_candidates = [
            p for p in params
            if isinstance(p, str) and "INFORMATION_SCHEMA.TABLES" in p
        ]
        bq_sql_param = inner_candidates[0] if inner_candidates else None
        assert bq_sql_param is not None, f"inner BQ SQL not found in params: {params}"
        # The dataset is formatted into the identifier path of the BQ SQL.
        assert "my_ds" in bq_sql_param
        # The table name is passed on its own through the param chain.
        assert "my_tbl" in params, f"table name should be a separate param, got: {params}"
|
|
|
|
|
|
class _CapturingProxy:
    """Record every SQL statement sent to a real DuckDB connection and stub BQ calls.

    DuckDBPyConnection.execute is a C-level read-only attribute, so the method
    cannot be patched on the connection itself; wrapping the connection in a
    proxy is the workaround.
    """

    def __init__(self, real_conn, captured: list):
        self._real = real_conn
        self._captured = captured

    def execute(self, sql, *args, **kwargs):
        """Append ``sql`` to the capture list, then stub it or forward it.

        Only statements that would talk to BigQuery are stubbed with a
        ``MagicMock``. CREATE TABLE / INSERT and the like reach the real
        connection so ``_meta`` and ``_remote_attach`` are persisted.
        """
        self._captured.append(sql)
        head = sql.strip().upper()
        needs_stub = (
            head.startswith(("INSTALL ", "LOAD ", "CREATE SECRET", "DETACH "))
            or (head.startswith("ATTACH ") and "BIGQUERY" in head)
            # These two checks are case-sensitive on the raw SQL text.
            or "FROM bq." in sql
            or "FROM bigquery_query" in sql
        )
        if needs_stub:
            return MagicMock()
        return self._real.execute(sql, *args, **kwargs)

    def close(self):
        """Close the wrapped connection."""
        return self._real.close()

    def __getattr__(self, name):
        # Anything not defined on the proxy is served by the real connection.
        return getattr(self._real, name)
|
|
|
|
|
|
class TestViewVsTableTemplates:
    """init_extract must choose its view template from the detected entity type."""

    @staticmethod
    def _view_ddl_for(tmp_path, monkeypatch, entity_type, table_name):
        """Run init_extract for one ``my_ds`` table and return the captured view DDL.

        The metadata token and entity-type detection are stubbed, and
        ``duckdb.connect`` is replaced by a spy that wraps each real
        connection in a ``_CapturingProxy``.
        """
        from connectors.bigquery.extractor import init_extract

        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            lambda: "test-token",
        )
        monkeypatch.setattr(
            "connectors.bigquery.extractor._detect_table_type",
            lambda *a, **kw: entity_type,
        )

        statements = []
        # Grab the genuine connect before the attribute is replaced below.
        real_connect = duckdb.connect
        monkeypatch.setattr(
            "connectors.bigquery.extractor.duckdb.connect",
            lambda *a, **kw: _CapturingProxy(real_connect(*a, **kw), statements),
        )

        init_extract(
            str(tmp_path),
            "my-project",
            [{"name": table_name, "bucket": "my_ds", "source_table": table_name, "description": ""}],
        )

        markers = ("CREATE OR REPLACE VIEW", "CREATE VIEW")
        return [s for s in statements if any(m in s.upper() for m in markers)]

    def test_base_table_uses_direct_attach_ref(self, tmp_path, monkeypatch):
        """A BASE TABLE gets a DuckDB view that references bq.dataset.table directly."""
        view_sqls = self._view_ddl_for(tmp_path, monkeypatch, "BASE TABLE", "orders")

        has_direct_ref = any('FROM bq."my_ds"."orders"' in s for s in view_sqls)
        assert has_direct_ref, \
            f"expected direct bq.dataset.table ref for BASE TABLE; got: {view_sqls}"
        assert all("bigquery_query(" not in s for s in view_sqls), \
            "BASE TABLE should not use bigquery_query() function"

    def test_view_uses_bigquery_query_function(self, tmp_path, monkeypatch):
        """A VIEW entity gets a master view that wraps bigquery_query() (jobs API path).

        This is the same SQL form as the prior `legacy_wrap_views=True`
        branch, which is now unconditional per #160.
        """
        view_sqls = self._view_ddl_for(tmp_path, monkeypatch, "VIEW", "session_view")

        matching = [s for s in view_sqls if '"session_view"' in s]
        view_create = matching[0] if matching else None
        assert view_create is not None, f"no CREATE VIEW for session_view; got: {view_sqls}"
        assert "bigquery_query(" in view_create
        assert "my-project" in view_create
        assert "`my-project.my_ds.session_view`" in view_create, \
            f"expected backtick-quoted full path; got: {view_create}"
|
|
|
|
|
|
class TestRemoteAttachForBQ:
    """The ``_remote_attach`` row for BigQuery must carry an empty ``token_env`` (metadata auth)."""

    def test_remote_attach_token_env_is_empty_for_bq(self, tmp_path, monkeypatch):
        """After init_extract, exactly one ``_remote_attach`` row exists with the BQ values."""
        from connectors.bigquery.extractor import init_extract

        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            lambda: "test-token",
        )
        monkeypatch.setattr(
            "connectors.bigquery.extractor._detect_table_type",
            lambda *a, **kw: "BASE TABLE",
        )

        statements = []
        # Grab the genuine connect before the attribute is replaced below.
        original_connect = duckdb.connect
        monkeypatch.setattr(
            "connectors.bigquery.extractor.duckdb.connect",
            lambda *a, **kw: _CapturingProxy(original_connect(*a, **kw), statements),
        )

        table = {"name": "t", "bucket": "ds", "source_table": "t", "description": ""}
        init_extract(str(tmp_path), "my-project", [table])

        reader = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
        rows = reader.execute(
            "SELECT alias, extension, url, token_env FROM _remote_attach"
        ).fetchall()
        reader.close()

        assert len(rows) == 1
        (alias, extension, url, token_env), = rows
        assert alias == "bq"
        assert extension == "bigquery"
        assert url == "project=my-project"
        assert token_env == "", \
            "BQ uses metadata auth — token_env must be empty so orchestrator triggers metadata path"
|
|
|
|
|
|
class TestInitExtractAuthFailure:
    """A failing metadata-token fetch must make init_extract stop cleanly."""

    def test_returns_error_when_metadata_unreachable(self, tmp_path, monkeypatch):
        """The auth error is reported in the result and no extract.duckdb is left behind."""
        from connectors.bigquery.extractor import init_extract
        from connectors.bigquery.auth import BQMetadataAuthError

        def raise_unreachable():
            raise BQMetadataAuthError("metadata server unreachable: simulated")

        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            raise_unreachable,
        )

        table = {"name": "t", "bucket": "ds", "source_table": "t", "description": ""}
        result = init_extract(str(tmp_path), "my-project", [table])

        # Auth failure aborts before any DB writes, so no partial file may exist.
        db_file = tmp_path / "extract.duckdb"
        assert not db_file.exists(), \
            "extract.duckdb should not be created when auth fails"
        assert result["tables_registered"] == 0
        error_texts = [e.get("error", "").lower() for e in result["errors"]]
        assert any("metadata" in text for text in error_texts)
|
|
|
|
|
|
class TestIdentifierValidation:
    """Unsafe identifiers must be rejected by init_extract before SQL is built from them."""

    @staticmethod
    def _register_single(tmp_path, monkeypatch, *, bucket, source_table):
        """Run init_extract offline for one table named ``t`` and return its result dict."""
        from connectors.bigquery.extractor import init_extract

        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            lambda: "test-token",
        )
        monkeypatch.setattr(
            "connectors.bigquery.extractor._detect_table_type",
            lambda *a, **kw: "BASE TABLE",
        )

        # Stub all DuckDB BQ-extension calls so the test stays offline
        statements = []
        real_connect = duckdb.connect
        monkeypatch.setattr(
            "connectors.bigquery.extractor.duckdb.connect",
            lambda *a, **kw: _CapturingProxy(real_connect(*a, **kw), statements),
        )

        return init_extract(
            str(tmp_path),
            "my-project",
            [{
                "name": "t",
                "bucket": bucket,
                "source_table": source_table,
                "description": "",
            }],
        )

    def test_rejects_unsafe_dataset_name(self, tmp_path, monkeypatch):
        """A dataset (bucket) containing quote/semicolon characters is refused."""
        result = self._register_single(
            tmp_path,
            monkeypatch,
            bucket='evil"; DROP TABLE foo; --',
            source_table="t",
        )
        assert result["tables_registered"] == 0
        error_texts = [e.get("error", "").lower() for e in result["errors"]]
        assert any("dataset" in text for text in error_texts)

    def test_rejects_unsafe_source_table_name(self, tmp_path, monkeypatch):
        """A source_table containing a backtick is refused."""
        result = self._register_single(
            tmp_path,
            monkeypatch,
            bucket="ds",
            source_table="evil`name",
        )
        assert result["tables_registered"] == 0
        error_texts = [e.get("error", "").lower() for e in result["errors"]]
        assert any("source_table" in text for text in error_texts)
|
|
|
|
|
|
class TestExtractorMainModule:
    """Standalone `python -m connectors.bigquery.extractor` reads config correctly."""

    @pytest.fixture(autouse=True)
    def _isolated_instance_config(self):
        """Clear the app.instance_config cache before and after every test here.

        The cache survives across tests. Without a reset, a config loaded by
        one test is served to the next one instead of that test's mocked
        ``load_instance_config`` (e.g. `_resolve_bq_project_id` returns the
        stale project), which makes the tests order-dependent.
        """
        from app.instance_config import reset_cache

        reset_cache()
        yield
        # Do not leak this test's mocked config into whatever runs next.
        reset_cache()

    def test_main_reads_data_source_bigquery_project(self, tmp_path, monkeypatch):
        """__main__ must read project from data_source.bigquery.project (matches yaml example).

        Runs the production __main__ block via runpy and captures the project_id
        passed to init_extract. If __main__ ever regresses to reading
        config.get("bigquery", {}).get("project_id"), the captured value will
        be empty and the assertion will fail.
        """
        import runpy

        captured: dict = {}

        def fake_init_extract(out, project_id, tables):
            captured["project"] = project_id
            captured["tables"] = tables
            return {"tables_registered": len(tables), "errors": []}

        # Patch every external dependency the __main__ block touches.
        # Targets are at the module path the __main__ block imports from,
        # because runpy re-executes the module under __name__ == "__main__".
        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {
                "data_source": {
                    "type": "bigquery",
                    "bigquery": {"project": "my-test-project", "location": "US"},
                }
            },
        )
        fake_repo = MagicMock()
        fake_repo.list_by_source.return_value = [
            {"name": "t1", "bucket": "ds", "source_table": "t1", "description": ""},
        ]
        monkeypatch.setattr(
            "src.repositories.table_registry.TableRegistryRepository",
            lambda c: fake_repo,
        )
        monkeypatch.setattr(
            "src.db.get_system_db",
            lambda: MagicMock(close=lambda: None),
        )
        # __main__ looks up init_extract via the cached connectors.bigquery.extractor
        # module (sys.modules), so patching its attribute survives runpy's reimport.
        monkeypatch.setattr(
            "connectors.bigquery.extractor.init_extract",
            fake_init_extract,
        )
        monkeypatch.setenv("DATA_DIR", str(tmp_path))

        runpy.run_module("connectors.bigquery.extractor", run_name="__main__")

        assert captured.get("project") == "my-test-project", \
            f"expected __main__ to pass project='my-test-project' to init_extract; got {captured!r}"
        assert captured.get("tables", [{}])[0].get("name") == "t1"

    def test_main_exits_when_project_missing(self, tmp_path, monkeypatch):
        """__main__ must SystemExit(2) when data_source.bigquery.project is empty/missing."""
        import runpy

        monkeypatch.setattr(
            "config.loader.load_instance_config",
            lambda: {"data_source": {"type": "bigquery"}},  # no .bigquery.project
        )
        monkeypatch.setenv("DATA_DIR", str(tmp_path))

        with pytest.raises(SystemExit) as exc_info:
            runpy.run_module("connectors.bigquery.extractor", run_name="__main__")
        assert exc_info.value.code == 2
|
|
|
|
|
|
class TestWrapViewForBQViews:
    """Issue #160: a query_mode='remote' BQ row whose entity is a VIEW or
    MATERIALIZED_VIEW must get a master view built on bigquery_query(). Any
    other entity type without proven runtime support gets neither a master
    view nor a _meta row."""

    @staticmethod
    def _captured_sql_for(tmp_path, monkeypatch, entity_type, table_name, bucket):
        """Run init_extract offline for one table and return every SQL statement issued.

        Resets ``app.instance_config._instance_config`` to ``None``, stubs the
        metadata token and entity-type detection, and wraps each DuckDB
        connection in a ``_CapturingProxy``.
        """
        from connectors.bigquery.extractor import init_extract
        import app.instance_config as _ic

        monkeypatch.setattr(_ic, "_instance_config", None, raising=False)
        monkeypatch.setattr("connectors.bigquery.extractor.get_metadata_token", lambda: "tok")
        monkeypatch.setattr(
            "connectors.bigquery.extractor._detect_table_type",
            lambda *a, **kw: entity_type,
        )

        # Grab the genuine connect before the attribute is replaced below.
        real_connect = duckdb.connect
        statements = []
        monkeypatch.setattr(
            "connectors.bigquery.extractor.duckdb.connect",
            lambda *a, **kw: _CapturingProxy(real_connect(*a, **kw), statements),
        )

        init_extract(
            str(tmp_path),
            "my-project",
            [{"name": table_name, "bucket": bucket, "source_table": table_name, "description": ""}],
        )
        return statements

    @staticmethod
    def _view_ddl_naming(statements, quoted_name):
        """Return the CREATE OR REPLACE VIEW statements that mention ``quoted_name``."""
        return [
            s for s in statements
            if "CREATE OR REPLACE VIEW" in s.upper() and quoted_name in s
        ]

    def test_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
        """A VIEW entity gets a bigquery_query() wrap view. This used to be
        the opt-in path under `legacy_wrap_views=True` and is now
        unconditional. Closes #160."""
        captured = self._captured_sql_for(tmp_path, monkeypatch, "VIEW", "ue", "finance")

        view_sqls = self._view_ddl_naming(captured, '"ue"')
        assert view_sqls, \
            f"VIEW entity must produce a wrap view by default; captured={captured}"
        assert any("bigquery_query(" in s for s in view_sqls), \
            f"VIEW wrap view must use bigquery_query(); got: {view_sqls}"
        assert any("`my-project.finance.ue`" in s for s in view_sqls), \
            f"wrap view must reference full project.dataset.table path; got: {view_sqls}"

    def test_materialized_view_creates_wrap_view_with_default_config(self, tmp_path, monkeypatch):
        """A MATERIALIZED_VIEW entity gets a bigquery_query() wrap view by default."""
        captured = self._captured_sql_for(
            tmp_path, monkeypatch, "MATERIALIZED_VIEW", "mv", "ds"
        )

        view_sqls = self._view_ddl_naming(captured, '"mv"')
        assert view_sqls, \
            f"MATERIALIZED_VIEW must produce a wrap view by default; captured={captured}"
        assert any("bigquery_query(" in s for s in view_sqls)

    def test_unsupported_entity_type_skips_meta_and_view(self, tmp_path, monkeypatch):
        """Entity types without proven runtime support (EXTERNAL, SNAPSHOT,
        CLONE, future types) must get neither a master view nor a _meta row.

        A _meta row without an inner view would leave the orchestrator with
        an entry pointing at a non-existent view and the operator with a
        registered-but-unqueryable name.
        """
        captured = self._captured_sql_for(tmp_path, monkeypatch, "EXTERNAL", "ext_tbl", "ds")

        # No CREATE VIEW for ext_tbl
        view_sqls = self._view_ddl_naming(captured, '"ext_tbl"')
        assert not view_sqls, \
            f"unsupported entity_type must NOT produce a wrap view; got {view_sqls}"

        # The _meta row is skipped as well: nothing registered under ext_tbl.
        reader = duckdb.connect(str(tmp_path / "extract.duckdb"), read_only=True)
        try:
            meta = reader.execute("SELECT table_name FROM _meta").fetchall()
            assert ("ext_tbl",) not in meta, \
                f"unsupported entity_type must NOT insert _meta row; got {meta}"
        finally:
            reader.close()
|
|
|
|
|
|
class TestInitExtractProjectIdValidation:
    """An unsafe project_id must be refused by init_extract ahead of auth and DB work."""

    @staticmethod
    def _single_table():
        """Return a fresh one-table config list so tests never share a mutable object."""
        return [{"name": "t", "bucket": "ds", "source_table": "t", "description": ""}]

    @staticmethod
    def _error_texts(result):
        """Lower-cased ``error`` strings from an init_extract result."""
        return [entry.get("error", "").lower() for entry in result["errors"]]

    def test_rejects_unsafe_project_id_with_quote(self, tmp_path):
        """A project ID containing SQL metacharacters is rejected before any
        token fetch or DuckDB work."""
        from connectors.bigquery.extractor import init_extract

        result = init_extract(str(tmp_path), "evil'; DROP TABLE foo; --", self._single_table())

        assert result["tables_registered"] == 0
        assert any("project_id" in text for text in self._error_texts(result)), \
            f"expected error mentioning project_id; got: {result['errors']}"
        # No partial extract.duckdb on rejection
        assert not (tmp_path / "extract.duckdb").exists()

    def test_rejects_uppercase_project_id(self, tmp_path):
        """GCP project IDs are lowercase-only."""
        from connectors.bigquery.extractor import init_extract

        result = init_extract(str(tmp_path), "MY-PROJECT", self._single_table())

        assert result["tables_registered"] == 0
        assert any("project_id" in text for text in self._error_texts(result))

    def test_valid_project_id_passes_validation(self, tmp_path, monkeypatch):
        """A well-formed project_id gets past validation.

        The metadata fetch is stubbed to fail immediately afterwards, which
        yields an error of a different shape; seeing only that error shows
        the project_id itself was not rejected.
        """
        from connectors.bigquery.extractor import init_extract
        from connectors.bigquery.auth import BQMetadataAuthError

        def fail_after_validation():
            raise BQMetadataAuthError("simulated — beyond validation")

        monkeypatch.setattr(
            "connectors.bigquery.extractor.get_metadata_token",
            fail_after_validation,
        )

        result = init_extract(str(tmp_path), "my-valid-project", self._single_table())

        assert result["tables_registered"] == 0
        errors = result["errors"]
        assert errors, "expected metadata-stub error"
        assert not any("project_id" in e.get("error", "").lower() for e in errors), \
            f"valid project_id should not trip the validator; got: {errors}"
|
|
|
|
|
|
class TestBigQueryExtractorFailureModes:
|
|
"""Failure-mode tests for the BigQuery extractor — corrupted DB, partial
|
|
writes, network timeout, unsafe identifiers, atomic swap."""
|
|
|
|
def test_corrupted_extract_duckdb_orchestrator_skips(self, output_dir, monkeypatch):
|
|
"""A corrupted extract.duckdb should be skipped by the orchestrator
|
|
without crashing."""
|
|
from src.orchestrator import SyncOrchestrator
|
|
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.extractor.get_metadata_token",
|
|
lambda: "test-token",
|
|
)
|
|
|
|
# Create a corrupted extract.duckdb
|
|
db_path = Path(output_dir) / "extract.duckdb"
|
|
db_path.write_bytes(b"this is not a valid duckdb file!!!")
|
|
|
|
analytics_db = str(Path(output_dir) / "analytics.duckdb")
|
|
orch = SyncOrchestrator(analytics_db_path=analytics_db)
|
|
# The rebuild should complete (possibly with warnings) but not raise
|
|
result = orch.rebuild()
|
|
# The corrupted source should not appear in results
|
|
assert "bigquery" not in result
|
|
|
|
def test_partial_data_write_incomplete_extract(self, output_dir, monkeypatch):
|
|
"""When init_extract fails partway through (e.g. one view creation
|
|
fails), the extract.duckdb is still created atomically and the
|
|
successful tables are preserved."""
|
|
from connectors.bigquery.extractor import init_extract
|
|
from unittest.mock import patch
|
|
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.extractor.get_metadata_token",
|
|
lambda: "test-token",
|
|
)
|
|
monkeypatch.setattr(
|
|
"connectors.bigquery.extractor._detect_table_type",
|
|
lambda *a, **kw: "BASE TABLE",
|
|
)
|
|
|
|
configs = [
|
|
{
|
|
"name": "good_table",
|
|
"bucket": "analytics",
|
|
"source_table": "good_table",
|
|
"query_mode": "remote",
|
|
"description": "OK",
|
|
},
|
|
{
|
|
"name": "bad-table", # hyphen → unsafe identifier
|
|
"bucket": "analytics",
|
|
"source_table": "bad_table",
|
|
"query_mode": "remote",
|
|
"description": "Will fail validation",
|
|
},
|
|
]
|
|
|
|
def proxy_connect(path=None, **kwargs):
|
|
real_conn = duckdb.connect(path)
|
|
return _DuckDBProxy(real_conn)
|
|
|
|
with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
|
|
mock_mod.connect = proxy_connect
|
|
result = init_extract(output_dir, "my-project", configs)
|
|
|
|
# good_table registered, bad-table skipped
|
|
assert result["tables_registered"] == 1
|
|
assert len(result["errors"]) == 1
|
|
|
|
def test_network_timeout_during_extraction(self, output_dir, monkeypatch):
    """A timeout while attaching BigQuery is reported instead of propagated.

    Every connection handed to the extractor raises ``TimeoutError`` when
    asked to execute a statement containing both ``ATTACH`` and
    ``BIGQUERY``.  ``init_extract`` is expected to return normally with no
    registered tables and at least one recorded error.
    """
    from unittest.mock import patch

    from connectors.bigquery.extractor import init_extract

    monkeypatch.setattr(
        "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token"
    )

    table_configs = [
        dict(
            name="timeout_table",
            bucket="analytics",
            source_table="timeout_table",
            query_mode="remote",
            description="Will timeout",
        ),
    ]

    def proxy_connect_timeout(path=None, **kwargs):
        wrapped = _DuckDBProxy(duckdb.connect(path))
        # Keep a handle on the proxy's own execute so every other statement
        # still runs through it.
        passthrough = wrapped.execute

        def timeout_execute(sql, *args, **kwargs):
            normalized = sql.strip().upper()
            if "ATTACH" not in normalized or "BIGQUERY" not in normalized:
                return passthrough(sql, *args, **kwargs)
            raise TimeoutError("BigQuery connection timed out")

        # Shadow execute on this one proxy instance only.
        wrapped.execute = timeout_execute
        return wrapped

    with patch("connectors.bigquery.extractor.duckdb") as fake_duckdb:
        fake_duckdb.connect = proxy_connect_timeout
        stats = init_extract(output_dir, "my-project", table_configs)

    # The timeout should be caught — no tables registered, error recorded
    assert stats["tables_registered"] == 0
    assert len(stats["errors"]) >= 1
|
|
|
|
def test_all_tables_fail_returns_errors(self, output_dir, monkeypatch):
    """If no table can be registered, every failure is still reported.

    Both configs use hyphenated names, so the expected stats are zero
    registered tables and two errors, with no exception raised.
    """
    from unittest.mock import patch

    from connectors.bigquery.extractor import init_extract

    monkeypatch.setattr(
        "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token"
    )

    table_configs = [
        dict(
            name="bad-1",
            bucket="ds",
            source_table="t1",
            query_mode="remote",
            description="",
        ),
        dict(
            name="bad-2",
            bucket="ds",
            source_table="t2",
            query_mode="remote",
            description="",
        ),
    ]

    def proxied_connect(path=None, **kwargs):
        # Real DuckDB connection, wrapped in the file's _DuckDBProxy.
        return _DuckDBProxy(duckdb.connect(path))

    with patch("connectors.bigquery.extractor.duckdb") as fake_duckdb:
        fake_duckdb.connect = proxied_connect
        stats = init_extract(output_dir, "my-project", table_configs)

    # Both have unsafe identifiers (hyphens)
    assert stats["tables_registered"] == 0
    assert len(stats["errors"]) == 2
|
|
|
|
def test_unsafe_identifier_skipped_not_crashed(self, output_dir, monkeypatch):
    """An unsafe table name yields an error entry rather than an exception.

    Of the two configs, only ``good_name`` should be registered.  The single
    recorded error must mention "unsafe" (compared case-insensitively).
    """
    from unittest.mock import patch

    from connectors.bigquery.extractor import init_extract

    monkeypatch.setattr(
        "connectors.bigquery.extractor.get_metadata_token", lambda: "test-token"
    )
    monkeypatch.setattr(
        "connectors.bigquery.extractor._detect_table_type",
        lambda *a, **kw: "BASE TABLE",
    )

    table_configs = [
        dict(
            name="bad-name",
            bucket="dataset",
            source_table="t",
            query_mode="remote",
            description="hyphen not allowed",
        ),
        dict(
            name="good_name",
            bucket="dataset",
            source_table="t",
            query_mode="remote",
            description="OK",
        ),
    ]

    def proxied_connect(path=None, **kwargs):
        # Real DuckDB connection, wrapped in the file's _DuckDBProxy.
        return _DuckDBProxy(duckdb.connect(path))

    with patch("connectors.bigquery.extractor.duckdb") as fake_duckdb:
        fake_duckdb.connect = proxied_connect
        stats = init_extract(output_dir, "my-project", table_configs)

    assert stats["tables_registered"] == 1
    recorded_errors = stats["errors"]
    assert len(recorded_errors) == 1
    assert "unsafe" in recorded_errors[0]["error"].lower()
|
|
|
|
def test_atomic_swap_prevents_corruption_on_crash(self, output_dir):
    """A leftover, incomplete temp file must not affect the live extract.

    The extractor is described as writing to a temp file and then swapping
    it into place atomically.  This test does not invoke the extractor: it
    simulates the aftermath of a crash mid-write by placing garbage at
    ``extract.duckdb.tmp`` next to a valid ``extract.duckdb``, then checks
    that the valid database still opens and still holds its one ``_meta``
    row.

    Connections and the temp file are released in ``finally`` blocks so a
    failing query or assertion cannot leave open handles or stray files in
    ``output_dir``.
    """
    # Create a valid existing extract.duckdb
    db_path = Path(output_dir) / "extract.duckdb"
    conn = duckdb.connect(str(db_path))
    try:
        conn.execute("""CREATE TABLE _meta (
            table_name VARCHAR, description VARCHAR, rows BIGINT,
            size_bytes BIGINT, extracted_at TIMESTAMP,
            query_mode VARCHAR DEFAULT 'remote'
        )""")
        conn.execute("INSERT INTO _meta VALUES ('existing', '', 0, 0, current_timestamp, 'remote')")
    finally:
        conn.close()

    # Simulate a crash: the tmp file exists but is incomplete
    tmp_path = Path(output_dir) / "extract.duckdb.tmp"
    tmp_path.write_bytes(b"incomplete garbage")
    try:
        # The existing extract.duckdb should still be valid
        conn2 = duckdb.connect(str(db_path))
        try:
            rows = conn2.execute("SELECT table_name FROM _meta").fetchall()
        finally:
            # Close before asserting so a failed assertion cannot leak
            # the handle.
            conn2.close()

        assert len(rows) == 1
        assert rows[0][0] == "existing"
    finally:
        # Clean up
        tmp_path.unlink()
|