fix(bq-materialize): code-review follow-ups for d8a22996
- tests/test_bq_cost_guardrail.py: assert fail-open warning is logged (test previously only proved fail-open doesn't crash; review note: warning is the only operator-visible signal of the silent failure). - extractor._wrap_admin_sql_for_jobs_api: docstring no longer claims DuckDB-flavor SQL is rejected — the function performs no inner-SQL validation; the v24 migration + register-time validator are the real enforcement points. - extractor.materialize_query: safe_path uses _escape_sql_string_literal instead of inline replace, for one-place-to-update consistency. - extractor: import hashlib hoisted to module-level imports.
This commit is contained in:
parent
d8a2299633
commit
a2afcfe59a
2 changed files with 22 additions and 17 deletions
|
|
@ -3,6 +3,7 @@
|
||||||
No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH.
|
No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
@ -89,10 +90,11 @@ def _wrap_admin_sql_for_jobs_api(billing_project: str, inner_sql: str) -> str:
|
||||||
should fail closed not silently lose budget to the wrong
|
should fail closed not silently lose budget to the wrong
|
||||||
project).
|
project).
|
||||||
inner_sql: BigQuery-flavor SQL the admin registered as
|
inner_sql: BigQuery-flavor SQL the admin registered as
|
||||||
``source_query``. Must use BQ syntax (backticks for dashed
|
``source_query``. Should be BigQuery-native; DuckDB-flavor
|
||||||
identifiers, native function calls). DuckDB-flavor `bq."ds"."t"`
|
`bq."ds"."t"` references are not enforced here but will fail at
|
||||||
is NOT acceptable here — the v24 migration converts existing
|
COPY time inside the BQ jobs API. Existing rows are converted by
|
||||||
rows; new registrations are validated upstream.
|
the v24 schema migration; new rows are validated upstream at
|
||||||
|
register/PUT.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A DuckDB-parseable SQL fragment suitable as the operand of
|
A DuckDB-parseable SQL fragment suitable as the operand of
|
||||||
|
|
@ -471,7 +473,7 @@ def materialize_query(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
safe_path = str(tmp_path).replace("'", "''")
|
safe_path = _escape_sql_string_literal(str(tmp_path))
|
||||||
conn.execute(
|
conn.execute(
|
||||||
f"COPY ({wrapped_sql}) TO '{safe_path}' (FORMAT PARQUET)"
|
f"COPY ({wrapped_sql}) TO '{safe_path}' (FORMAT PARQUET)"
|
||||||
)
|
)
|
||||||
|
|
@ -489,7 +491,6 @@ def materialize_query(
|
||||||
# thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
|
# thread — a 10 GiB parquet means 50+ seconds of disk I/O blocking other
|
||||||
# requests. Hashing here keeps the open-file handle hot from the COPY
|
# requests. Hashing here keeps the open-file handle hot from the COPY
|
||||||
# round and removes the second read. Devil's-advocate review item.
|
# round and removes the second read. Devil's-advocate review item.
|
||||||
import hashlib
|
|
||||||
h = hashlib.md5()
|
h = hashlib.md5()
|
||||||
with open(tmp_path, "rb") as f:
|
with open(tmp_path, "rb") as f:
|
||||||
for chunk in iter(lambda: f.read(8192), b""):
|
for chunk in iter(lambda: f.read(8192), b""):
|
||||||
|
|
|
||||||
|
|
@ -128,22 +128,26 @@ def test_zero_max_bytes_skips_dry_run(tmp_path):
|
||||||
assert stats["rows"] == 1
|
assert stats["rows"] == 1
|
||||||
|
|
||||||
|
|
||||||
def test_dry_run_failure_is_fail_open(tmp_path):
|
def test_dry_run_failure_is_fail_open(tmp_path, caplog):
|
||||||
"""If the dry-run errors (DuckDB syntax, missing google lib, transient
|
"""If the dry-run errors (DuckDB syntax, missing google lib, transient
|
||||||
upstream failure) we don't block — log + proceed with COPY. Operators
|
upstream failure) we don't block — log + proceed with COPY. Operators
|
||||||
who need hard-fail watch logs for the warning."""
|
who need hard-fail watch logs for the warning."""
|
||||||
|
import logging
|
||||||
|
|
||||||
out = tmp_path / "extracts" / "bigquery"
|
out = tmp_path / "extracts" / "bigquery"
|
||||||
out.mkdir(parents=True)
|
out.mkdir(parents=True)
|
||||||
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
bq = _bq_with_seed({"bq.test.tiny": "SELECT 1 AS n"})
|
||||||
|
|
||||||
with patch(
|
with caplog.at_level(logging.WARNING, logger="connectors.bigquery.extractor"):
|
||||||
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
|
with patch(
|
||||||
):
|
"app.api.v2_scan._bq_dry_run_bytes", side_effect=RuntimeError("boom")
|
||||||
stats = materialize_query(
|
):
|
||||||
table_id="t1",
|
stats = materialize_query(
|
||||||
sql="SELECT * FROM bq.test.tiny",
|
table_id="t1",
|
||||||
bq=bq,
|
sql="SELECT * FROM bq.test.tiny",
|
||||||
output_dir=str(out),
|
bq=bq,
|
||||||
max_bytes=10 * 2**30,
|
output_dir=str(out),
|
||||||
)
|
max_bytes=10 * 2**30,
|
||||||
|
)
|
||||||
assert stats["rows"] == 1
|
assert stats["rows"] == 1
|
||||||
|
assert "fail-open" in caplog.text
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue