fix(bq-materialize): code-review follow-ups for d8a22996

- tests/test_bq_cost_guardrail.py: assert that the fail-open warning is
  logged. The test previously only proved that fail-open doesn't crash;
  per review, the warning is the only operator-visible signal of the
  otherwise silent dry-run failure.
- extractor._wrap_admin_sql_for_jobs_api: docstring no longer claims
  DuckDB-flavor SQL is rejected — the function performs no inner-SQL
  validation; the v24 migration + register-time validator are the
  real enforcement points.
- extractor.materialize_query: safe_path now uses
  _escape_sql_string_literal instead of an inline str.replace, so the
  quote-escaping logic lives in one place.
- extractor: import hashlib hoisted to module-level imports.
This commit is contained in:
ZdenekSrotyr 2026-05-04 16:52:18 +02:00
parent d8a2299633
commit a2afcfe59a
2 changed files with 22 additions and 17 deletions

View file

@ -3,6 +3,7 @@
No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH. No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH.
""" """
import hashlib
import logging import logging
import os import os
import re import re
@ -89,10 +90,11 @@ def _wrap_admin_sql_for_jobs_api(billing_project: str, inner_sql: str) -> str:
should fail closed not silently lose budget to the wrong should fail closed not silently lose budget to the wrong
project). project).
inner_sql: BigQuery-flavor SQL the admin registered as inner_sql: BigQuery-flavor SQL the admin registered as
``source_query``. Must use BQ syntax (backticks for dashed ``source_query``. Should be BigQuery-native; DuckDB-flavor
identifiers, native function calls). DuckDB-flavor `bq."ds"."t"` `bq."ds"."t"` references are not enforced here but will fail at
is NOT acceptable here the v24 migration converts existing COPY time inside the BQ jobs API. Existing rows are converted by
rows; new registrations are validated upstream. the v24 schema migration; new rows are validated upstream at
register/PUT.
Returns: Returns:
A DuckDB-parseable SQL fragment suitable as the operand of A DuckDB-parseable SQL fragment suitable as the operand of
@ -471,7 +473,7 @@ def materialize_query(
) )
try: try:
safe_path = str(tmp_path).replace("'", "''") safe_path = _escape_sql_string_literal(str(tmp_path))
conn.execute( conn.execute(
f"COPY ({wrapped_sql}) TO '{safe_path}' (FORMAT PARQUET)" f"COPY ({wrapped_sql}) TO '{safe_path}' (FORMAT PARQUET)"
) )
@ -489,7 +491,6 @@ def materialize_query(
# thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other # thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
# requests. Hashing here keeps the open-file handle hot from the COPY # requests. Hashing here keeps the open-file handle hot from the COPY
# round and removes the second read. Devil's-advocate review item. # round and removes the second read. Devil's-advocate review item.
import hashlib
h = hashlib.md5() h = hashlib.md5()
with open(tmp_path, "rb") as f: with open(tmp_path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""): for chunk in iter(lambda: f.read(8192), b""):

View file

@ -128,14 +128,17 @@ def test_zero_max_bytes_skips_dry_run(tmp_path):
assert stats["rows"] == 1 assert stats["rows"] == 1
def test_dry_run_failure_is_fail_open(tmp_path): def test_dry_run_failure_is_fail_open(tmp_path, caplog):
"""If the dry-run errors (DuckDB syntax, missing google lib, transient """If the dry-run errors (DuckDB syntax, missing google lib, transient
upstream failure) we don't block — log + proceed with COPY. Operators upstream failure) we don't block — log + proceed with COPY. Operators
who need hard-fail watch logs for the warning.""" who need hard-fail watch logs for the warning."""
import logging
out = tmp_path / "extracts" / "bigquery" out = tmp_path / "extracts" / "bigquery"
out.mkdir(parents=True) out.mkdir(parents=True)
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"}) bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"):
with patch( with patch(
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom") "app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
): ):
@ -147,3 +150,4 @@ def test_dry_run_failure_is_fail_open(tmp_path):
max_bytes=10 * 2**30, max_bytes=10 * 2**30,
) )
assert stats["rows"] == 1 assert stats["rows"] == 1
assert "fail-open" in caplog.text