fix(bq-materialize): code-review follow-ups for 16eaf7a3

- extractor._try_acquire_file_lock: close fd and re-raise on non-
  BlockingIOError from fcntl.flock (read-only fs, unsupported flock,
  fd exhaustion). Pre-fix the fd leaked silently and the underlying
  OSError still propagated past the caller.
- extractor: reorder module-level layout so logger is bound before
  the new lock-related helpers reference it. Deferred import of
  app.instance_config inside _get_lock_ttl_seconds documented inline.
- extractor: comment _table_locks unbounded-by-design rationale.
- tests: docstring + monkeypatch-target rationale for the two
  concurrency tests where the contract isn't obvious from the body.
This commit is contained in:
ZdenekSrotyr 2026-05-04 17:52:52 +02:00
parent 16eaf7a399
commit dc7e27082d
2 changed files with 38 additions and 12 deletions

View file

@ -17,6 +17,15 @@ from typing import List, Dict, Any, Optional
import duckdb
from connectors.bigquery.auth import get_metadata_token, BQMetadataAuthError
from src.sql_safe import (
validate_identifier as _validate_identifier,
validate_project_id as _validate_project_id,
)
from src.identifier_validation import validate_identifier, validate_quoted_identifier
logger = logging.getLogger(__name__)
# Serializes the body of `init_extract` across threads so two concurrent
# materialize calls (e.g. the synchronous timeout-fallback BackgroundTask
# kicking in while the original daemon thread is still running) can't both
@ -46,6 +55,10 @@ class MaterializeInFlightError(Exception):
)
# Unbounded by design — each registered table_id gets one Lock for the
# process lifetime. Per-Lock cost is ~56 bytes; a deployment with even
# 10k registered tables holds <1 MB. No cleanup logic — clean would
# need ref-counting and risks freeing a Lock currently held by a worker.
_table_locks: dict[str, threading.Lock] = {}
_table_locks_registry: threading.Lock = threading.Lock()
@ -72,6 +85,9 @@ def _get_lock_ttl_seconds() -> int:
longer is a stuck process or a hung BQ session, both of which warrant
reclaim on next attempt."""
try:
# Deferred import: keeps the connectors module importable in
# contexts where the app layer isn't bootstrapped (e.g. unit tests
# that exercise extractor helpers without the FastAPI app).
from app.instance_config import get_value
v = get_value(
"materialize", "lock_ttl_seconds",
@ -97,13 +113,25 @@ def _try_acquire_file_lock(lock_path: Path):
lock_path.parent.mkdir(parents=True, exist_ok=True)
def _try_open_and_flock():
# Open in 'w' mode so the file's mtime updates on every successful
# acquisition — the mtime is the TTL signal for the next caller.
# Content is intentionally empty; the fd exists only to anchor flock.
f = open(lock_path, "w")
try:
fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
return f
except BlockingIOError:
# Another holder owns the lock — return None so the caller can
# decide between TTL-reclaim and propagating MaterializeInFlightError.
f.close()
return None
except OSError:
# Anything else (read-only fs, unsupported, fd exhaustion) is a
# platform / config error, not a contention signal. Close the fd
# and re-raise so the caller (and operator) sees the real failure
# instead of a silent leak.
f.close()
raise
holder = _try_open_and_flock()
if holder is not None:
@ -128,16 +156,6 @@ def _try_acquire_file_lock(lock_path: Path):
return None
from connectors.bigquery.auth import get_metadata_token, BQMetadataAuthError
from app.instance_config import get_value
from src.sql_safe import (
validate_identifier as _validate_identifier,
validate_project_id as _validate_project_id,
)
from src.identifier_validation import validate_identifier, validate_quoted_identifier
logger = logging.getLogger(__name__)
def _detect_table_type(
conn: duckdb.DuckDBPyConnection,

View file

@ -141,8 +141,11 @@ def test_different_ids_run_in_parallel(tmp_path):
def test_stale_file_lock_is_reclaimed_after_ttl(tmp_path, monkeypatch):
"""Force a stale .lock file (mtime older than TTL) and verify a new
call reclaims it instead of raising MaterializeInFlightError."""
"""Verify a stale, unheld .lock file (old mtime, no live flock holder) does NOT
cause `MaterializeInFlightError`. The reclaim branch in `_try_acquire_file_lock`
is technically not reached here (the first `_try_open_and_flock` succeeds because
nobody holds the lock), but exercising the in-flight-by-mtime-only mistake is what
this test guards against."""
bq = _slow_bq(stall_seconds=0.05)
lock_path = Path(tmp_path) / "data" / "t1.parquet.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
@ -187,6 +190,11 @@ def test_fresh_file_lock_blocks_with_in_flight_error(tmp_path, monkeypatch):
def test_lock_ttl_reads_from_instance_config(tmp_path, monkeypatch):
"""When `materialize.lock_ttl_seconds` is set in instance.yaml, that
value overrides the default."""
# Patches `app.instance_config.get_value` directly. This works because
# `_get_lock_ttl_seconds` re-imports `get_value` on every call (see
# extractor.py for the deferred-import rationale). If a future change
# hoists the import to module-level, this patch must change to target
# `connectors.bigquery.extractor.get_value` instead.
monkeypatch.setattr(
"app.instance_config.get_value",
lambda *args, **kw: 60 if args == ("materialize", "lock_ttl_seconds") else kw.get("default"),