fix(bq-materialize): code-review follow-ups for d8a22996

- tests/test_bq_cost_guardrail.py: assert fail-open warning is logged
  (test previously only proved fail-open doesn't crash; review note:
  warning is the only operator-visible signal of the silent failure).
- extractor._wrap_admin_sql_for_jobs_api: docstring no longer claims
  DuckDB-flavor SQL is rejected — the function performs no inner-SQL
  validation; the v24 migration + register-time validator are the
  real enforcement points.
- extractor.materialize_query: safe_path uses _escape_sql_string_literal
  instead of inline replace, for one-place-to-update consistency.
- extractor: import hashlib hoisted to module-level imports.
This commit is contained in:
ZdenekSrotyr 2026-05-04 16:52:18 +02:00
parent d8a2299633
commit a2afcfe59a
2 changed files with 22 additions and 17 deletions

View file

@ -3,6 +3,7 @@
No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH.
"""
import hashlib
import logging
import os
import re
@ -89,10 +90,11 @@ def _wrap_admin_sql_for_jobs_api(billing_project: str, inner_sql: str) -> str:
should fail closed not silently lose budget to the wrong
project).
inner_sql: BigQuery-flavor SQL the admin registered as
``source_query``. Must use BQ syntax (backticks for dashed
identifiers, native function calls). DuckDB-flavor `bq."ds"."t"`
is NOT acceptable here the v24 migration converts existing
rows; new registrations are validated upstream.
``source_query``. Should be BigQuery-native; DuckDB-flavor
`bq."ds"."t"` references are not enforced here but will fail at
COPY time inside the BQ jobs API. Existing rows are converted by
the v24 schema migration; new rows are validated upstream at
register/PUT.
Returns:
A DuckDB-parseable SQL fragment suitable as the operand of
@ -471,7 +473,7 @@ def materialize_query(
)
try:
safe_path = str(tmp_path).replace("'", "''")
safe_path = _escape_sql_string_literal(str(tmp_path))
conn.execute(
f"COPY ({wrapped_sql}) TO '{safe_path}' (FORMAT PARQUET)"
)
@ -489,7 +491,6 @@ def materialize_query(
# thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
# requests. Hashing here keeps the open-file handle hot from the COPY
# round and removes the second read. Devil's-advocate review item.
import hashlib
h = hashlib.md5()
with open(tmp_path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):

View file

@ -128,22 +128,26 @@ def test_zero_max_bytes_skips_dry_run(tmp_path):
assert stats["rows"] == 1
def test_dry_run_failure_is_fail_open(tmp_path):
def test_dry_run_failure_is_fail_open(tmp_path, caplog):
"""If the dry-run errors (DuckDB syntax, missing google lib, transient
upstream failure) we don't block — log + proceed with COPY. Operators
who need hard-fail watch logs for the warning."""
import logging
out = tmp_path / "extracts" / "bigquery"
out.mkdir(parents=True)
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
with patch(
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
):
stats = materialize_query(
table_id="t1",
sql="SELECT * FROM bq.test.tiny",
bq=bq,
output_dir=str(out),
max_bytes=10 * 2**30,
)
with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"):
with patch(
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
):
stats = materialize_query(
table_id="t1",
sql="SELECT * FROM bq.test.tiny",
bq=bq,
output_dir=str(out),
max_bytes=10 * 2**30,
)
assert stats["rows"] == 1
assert "fail-open" in caplog.text