agnes-the-ai-analyst/tests/test_bq_materialize_concurrency.py
ZdenekSrotyr dc7e27082d fix(bq-materialize): code-review follow-ups for 16eaf7a3
- extractor._try_acquire_file_lock: close fd and re-raise on non-
  BlockingIOError from fcntl.flock (read-only fs, unsupported flock,
  fd exhaustion). Pre-fix the fd leaked silently and the underlying
  OSError still propagated past the caller.
- extractor: reorder module-level layout so logger is bound before
  the new lock-related helpers reference it. Deferred import of
  app.instance_config inside _get_lock_ttl_seconds documented inline.
- extractor: comment _table_locks unbounded-by-design rationale.
- tests: docstring + monkeypatch-target rationale for the two
  concurrency tests where the contract isn't obvious from the body.
2026-05-04 17:59:21 +02:00

204 lines
7 KiB
Python

"""Per-table_id concurrency: in-process mutex + advisory file lock with
TTL reclaim. Two overlapping materialize_query calls for the same id
must NOT corrupt each other's parquet."""
from __future__ import annotations
import os
import threading
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from connectors.bigquery.extractor import (
materialize_query,
MaterializeInFlightError,
_get_table_lock,
_LOCK_TTL_DEFAULT_SECONDS,
)
@pytest.fixture(autouse=True)
def reset_locks(monkeypatch):
    """Start every test with an empty ``_table_locks`` mapping.

    The extractor module's ``_table_locks`` attribute is swapped for a new
    dict so lock state created by one test cannot leak into the next.
    ``monkeypatch`` puts the original attribute back at teardown.
    """
    import connectors.bigquery.extractor as extractor_module

    fresh_registry: dict = {}
    monkeypatch.setattr(extractor_module, "_table_locks", fresh_registry)
    yield
def _slow_bq(stall_seconds: float = 1.0):
"""Build a fake BqAccess whose duckdb_session COPY blocks for
`stall_seconds` so we can race a second call against it."""
bq = MagicMock()
bq.projects.billing = "prj-billing"
bq.projects.data = "prj-data"
class _Session:
def __enter__(self):
return self
def __exit__(self, *a):
return False
def execute(self, sql):
if sql.startswith("SELECT database_name"):
class _R:
def fetchall(self):
return [("memory",)]
return _R()
if sql.startswith("ATTACH"):
return MagicMock()
if sql.startswith("COPY"):
# Simulate a long-running COPY by writing a stub parquet
# then sleeping so a second call can race us.
# Extract the path from the COPY statement.
import re
m = re.search(r"TO '([^']+)'", sql)
assert m
Path(m.group(1)).write_bytes(b"PARQUET_STUB_HEADER" + b"\x00" * 200)
time.sleep(stall_seconds)
return MagicMock()
if sql.startswith("SELECT count"):
class _R:
def fetchone(self):
return (42,)
return _R()
return MagicMock()
bq.duckdb_session.return_value = _Session()
return bq
def test_concurrent_calls_for_same_id_raise_in_flight(tmp_path):
    """Two overlapping calls for one table_id: one succeeds, one is rejected.

    The first thread starts a materialize whose COPY stalls for 2 s; the
    second thread starts 0.2 s later against the same id and is expected to
    get ``MaterializeInFlightError``.
    """
    fake_bq = _slow_bq(stall_seconds=2.0)
    target_dir = str(tmp_path)
    captured: list = []

    def worker(label):
        # Record how the call ended: "ok", "in_flight", or "err".
        try:
            result = materialize_query(
                table_id="t1",
                sql="SELECT 1",
                bq=fake_bq,
                output_dir=target_dir,
                max_bytes=None,
            )
        except MaterializeInFlightError as exc:
            captured.append(("in_flight", label, str(exc)))
        except Exception as exc:
            captured.append(("err", label, str(exc)))
        else:
            captured.append(("ok", label, result))

    first = threading.Thread(target=worker, args=("first",))
    second = threading.Thread(target=worker, args=("second",))
    first.start()
    # Give the first thread time to take the lock before racing it.
    time.sleep(0.2)
    second.start()
    for thread in (first, second):
        thread.join()

    outcomes = [entry[0] for entry in captured]
    assert outcomes.count("ok") == 1, f"expected exactly one success, got {captured}"
    assert outcomes.count("in_flight") == 1
def test_sequential_calls_for_same_id_both_succeed(tmp_path):
    """Back-to-back calls for the same table_id must both complete.

    The second call starts only after the first has returned, so it must
    not be rejected as in-flight.
    """
    call_kwargs = {
        "table_id": "t1",
        "sql": "SELECT 1",
        "bq": _slow_bq(stall_seconds=0.05),
        "output_dir": str(tmp_path),
        "max_bytes": None,
    }

    # Run both calls before asserting, so a failure in the second call
    # surfaces as its own exception rather than being masked.
    first = materialize_query(**call_kwargs)
    second = materialize_query(**call_kwargs)

    assert first["rows"] == 42
    assert second["rows"] == 42
def test_different_ids_run_in_parallel(tmp_path):
    """Calls for distinct table_ids must not serialize on each other.

    Three threads each materialize a different id against a fake whose COPY
    stalls for 1 s. Run serially that would take >= 3 s; the test requires
    the whole batch to finish in under 2 s and every call to report 42 rows.
    """
    bq = _slow_bq(stall_seconds=1.0)
    out_dir = str(tmp_path)
    captured: list = []

    def runner(tid):
        try:
            r = materialize_query(
                table_id=tid, sql="SELECT 1",
                bq=bq, output_dir=out_dir, max_bytes=None,
            )
            captured.append((tid, r["rows"]))
        except Exception as e:
            # Keep the failure detail so the assertion messages below show
            # why a worker failed instead of a bare "ERROR" marker.
            captured.append((tid, f"ERROR: {e!r}"))

    threads = [threading.Thread(target=runner, args=(f"tab_{i}",)) for i in range(3)]
    # Monotonic clock: immune to wall-clock adjustments during the run.
    start = time.monotonic()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    elapsed = time.monotonic() - start

    # If they were serialized, would take >= 3s. Parallel: ~1s.
    assert elapsed < 2.0, f"expected parallel, elapsed={elapsed:.2f}s"
    assert len(captured) == 3, f"expected 3 results, got {captured}"
    assert all(c[1] == 42 for c in captured), f"unexpected results: {captured}"
def test_stale_file_lock_is_reclaimed_after_ttl(tmp_path, monkeypatch):
    """A stale, unheld .lock file must not cause ``MaterializeInFlightError``.

    A lock file with an mtime older than the default TTL and no live flock
    holder is planted before the call. Per the original author's note, the
    reclaim branch in ``_try_acquire_file_lock`` is technically not reached
    here (the first ``_try_open_and_flock`` succeeds because nobody holds
    the lock); what this test guards against is treating a lock as in-flight
    on mtime alone.
    """
    bq = _slow_bq(stall_seconds=0.05)
    lock_path = tmp_path / "data" / "t1.parquet.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    lock_path.write_text("")

    # Age the file one hour past the default TTL. Deriving the age from the
    # constant keeps the lock stale if the default changes; with the 24h
    # default noted in the original comment this is the same 25h as before.
    old_ts = time.time() - (_LOCK_TTL_DEFAULT_SECONDS + 3600)
    os.utime(lock_path, (old_ts, old_ts))

    r = materialize_query(
        table_id="t1", sql="SELECT 1",
        bq=bq, output_dir=str(tmp_path), max_bytes=None,
    )
    assert r["rows"] == 42
def test_fresh_file_lock_blocks_with_in_flight_error(tmp_path, monkeypatch):
    """A held, fresh .lock file makes a new call raise rather than reclaim.

    The test creates the lock file (so its mtime is within the TTL) and
    holds an exclusive ``fcntl`` lock on it while calling
    ``materialize_query``, which must raise ``MaterializeInFlightError``.
    """
    import fcntl

    bq = _slow_bq(stall_seconds=0.05)
    lock_path = Path(tmp_path) / "data" / "t1.parquet.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)

    # Open the lock file and HOLD a fcntl exclusive lock so the materialize
    # call's flock(LOCK_NB) sees a real conflicting lock — relying on
    # mtime-only would let the test pass even if flock acquisition was
    # broken. The ``with`` block closes the handle even if taking the
    # lock itself fails.
    with open(lock_path, "w") as holder:
        fcntl.flock(holder.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        try:
            with pytest.raises(MaterializeInFlightError):
                materialize_query(
                    table_id="t1", sql="SELECT 1",
                    bq=bq, output_dir=str(tmp_path), max_bytes=None,
                )
        finally:
            fcntl.flock(holder.fileno(), fcntl.LOCK_UN)
def test_lock_ttl_reads_from_instance_config(tmp_path, monkeypatch):
    """A configured ``materialize.lock_ttl_seconds`` overrides the default."""

    def fake_get_value(*args, **kw):
        # Answer 60 only for the lock-TTL key; otherwise echo the caller's
        # ``default`` keyword (None when absent).
        if args == ("materialize", "lock_ttl_seconds"):
            return 60
        return kw.get("default")

    # The patch targets `app.instance_config.get_value` itself. That works
    # because `_get_lock_ttl_seconds` re-imports `get_value` on each call
    # (extractor.py documents the deferred-import rationale). Should the
    # import ever be hoisted to module level, patch
    # `connectors.bigquery.extractor.get_value` instead.
    monkeypatch.setattr("app.instance_config.get_value", fake_get_value)

    from connectors.bigquery.extractor import _get_lock_ttl_seconds

    assert _get_lock_ttl_seconds() == 60