agnes-the-ai-analyst/tests/test_bq_materialize_concurrency.py
ZdenekSrotyr c432e90f62 fix(bq-materialize): TTL reclaim was dead code (Devin Review on extractor.py:166)
`_try_acquire_file_lock` opened the lock file with `open(mode='w')`
BEFORE the mtime check, which truncated the file and refreshed mtime
to now. The subsequent age check always saw ~0, so the TTL reclaim
branch was never reachable and `materialize.lock_ttl_seconds` was
a silently no-op config knob.

Repro:
  before open(w): mtime age = 100000s
  after  open(w): mtime age = 0s

Fix: stat the lock path BEFORE any open(). If pre-probe mtime is
older than TTL, unlink (forcing a fresh inode for the open + flock
that follows). Order is now stat-then-decide-then-probe, not
probe-then-stat-then-decide.

Two regression tests added in tests/test_bq_materialize_concurrency.py:
- test_stale_held_lock_is_reclaimed_despite_live_holder — exercises
  the full reclaim path with a still-living fcntl holder. Pre-fix
  this returned None (in_flight forever); post-fix returns a holder
  fd on a new inode.
- test_failed_probe_does_not_self_refresh_lock_mtime — sister test
  pins that a failed acquisition's mode='w' truncate doesn't
  pathologically loop.

Residual cross-process risk (genuinely overrunning materialize past
TTL races a fresh attempt — both write to the same parquet.tmp,
inode-level flock independence means new acquisition succeeds while
old holder is still alive) stays documented in the helper docstring.
In-process threading.Lock keyed on table_id blocks the single-process
race; cross-process protection relies on TTL being well above
longest plausible COPY (24h default).
2026-05-04 22:36:56 +02:00

301 lines
11 KiB
Python

"""Per-table_id concurrency: in-process mutex + advisory file lock with
TTL reclaim. Two overlapping materialize_query calls for the same id
must NOT corrupt each other's parquet."""
from __future__ import annotations
import os
import threading
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from connectors.bigquery.extractor import (
materialize_query,
MaterializeInFlightError,
_get_table_lock,
_LOCK_TTL_DEFAULT_SECONDS,
)
@pytest.fixture(autouse=True)
def reset_locks(monkeypatch):
# Tests must not share lock state across runs.
import connectors.bigquery.extractor as mod
monkeypatch.setattr(mod, "_table_locks", {})
yield
def _slow_bq(stall_seconds: float = 1.0):
"""Build a fake BqAccess whose duckdb_session COPY blocks for
`stall_seconds` so we can race a second call against it."""
bq = MagicMock()
bq.projects.billing = "prj-billing"
bq.projects.data = "prj-data"
class _Session:
def __enter__(self):
return self
def __exit__(self, *a):
return False
def execute(self, sql):
if sql.startswith("SELECT database_name"):
class _R:
def fetchall(self):
return [("memory",)]
return _R()
if sql.startswith("ATTACH"):
return MagicMock()
if sql.startswith("COPY"):
# Simulate a long-running COPY by writing a stub parquet
# then sleeping so a second call can race us.
# Extract the path from the COPY statement.
import re
m = re.search(r"TO '([^']+)'", sql)
assert m
Path(m.group(1)).write_bytes(b"PARQUET_STUB_HEADER" + b"\x00" * 200)
time.sleep(stall_seconds)
return MagicMock()
if sql.startswith("SELECT count"):
class _R:
def fetchone(self):
return (42,)
return _R()
return MagicMock()
bq.duckdb_session.return_value = _Session()
return bq
def test_concurrent_calls_for_same_id_raise_in_flight(tmp_path):
bq = _slow_bq(stall_seconds=2.0)
out_dir = str(tmp_path)
captured: list = []
def runner(tag):
try:
r = materialize_query(
table_id="t1", sql="SELECT 1",
bq=bq, output_dir=out_dir, max_bytes=None,
)
captured.append(("ok", tag, r))
except MaterializeInFlightError as e:
captured.append(("in_flight", tag, str(e)))
except Exception as e:
captured.append(("err", tag, str(e)))
t1 = threading.Thread(target=runner, args=("first",))
t2 = threading.Thread(target=runner, args=("second",))
t1.start()
time.sleep(0.2) # let t1 acquire the lock
t2.start()
t1.join()
t2.join()
outcomes = [c[0] for c in captured]
assert outcomes.count("ok") == 1, f"expected exactly one success, got {captured}"
assert outcomes.count("in_flight") == 1
def test_sequential_calls_for_same_id_both_succeed(tmp_path):
bq = _slow_bq(stall_seconds=0.05)
out_dir = str(tmp_path)
r1 = materialize_query(
table_id="t1", sql="SELECT 1",
bq=bq, output_dir=out_dir, max_bytes=None,
)
r2 = materialize_query(
table_id="t1", sql="SELECT 1",
bq=bq, output_dir=out_dir, max_bytes=None,
)
assert r1["rows"] == 42
assert r2["rows"] == 42
def test_different_ids_run_in_parallel(tmp_path):
bq = _slow_bq(stall_seconds=1.0)
out_dir = str(tmp_path)
captured: list = []
def runner(tid):
try:
r = materialize_query(
table_id=tid, sql="SELECT 1",
bq=bq, output_dir=out_dir, max_bytes=None,
)
captured.append((tid, r["rows"]))
except Exception as e:
captured.append((tid, "ERROR"))
threads = [threading.Thread(target=runner, args=(f"tab_{i}",)) for i in range(3)]
start = time.time()
for t in threads: t.start()
for t in threads: t.join()
elapsed = time.time() - start
# If they were serialized, would take >= 3s. Parallel: ~1s.
assert elapsed < 2.0, f"expected parallel, elapsed={elapsed:.2f}s"
assert len(captured) == 3
assert all(c[1] == 42 for c in captured)
def test_stale_file_lock_is_reclaimed_after_ttl(tmp_path, monkeypatch):
"""Verify a stale, unheld .lock file (old mtime, no live flock holder) does NOT
cause `MaterializeInFlightError`. The reclaim branch in `_try_acquire_file_lock`
is technically not reached here (the first `_try_open_and_flock` succeeds because
nobody holds the lock), but exercising the in-flight-by-mtime-only mistake is what
this test guards against."""
bq = _slow_bq(stall_seconds=0.05)
lock_path = Path(tmp_path) / "data" / "t1.parquet.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text("")
# Set mtime to 25h ago (> default 24h TTL).
old_ts = time.time() - 25 * 3600
os.utime(lock_path, (old_ts, old_ts))
r = materialize_query(
table_id="t1", sql="SELECT 1",
bq=bq, output_dir=str(tmp_path), max_bytes=None,
)
assert r["rows"] == 42
def test_fresh_file_lock_blocks_with_in_flight_error(tmp_path, monkeypatch):
"""Force a fresh .lock file (mtime within TTL) and verify a new
call raises rather than reclaims."""
bq = _slow_bq(stall_seconds=0.05)
lock_path = Path(tmp_path) / "data" / "t1.parquet.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
# Open the lock file and HOLD a fcntl exclusive lock so the materialize
# call's flock(LOCK_NB) sees a real conflicting lock — relying on
# mtime-only would let the test pass even if flock acquisition was
# broken.
import fcntl
holder = open(lock_path, "w")
fcntl.flock(holder.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
try:
with pytest.raises(MaterializeInFlightError):
materialize_query(
table_id="t1", sql="SELECT 1",
bq=bq, output_dir=str(tmp_path), max_bytes=None,
)
finally:
fcntl.flock(holder.fileno(), fcntl.LOCK_UN)
holder.close()
def test_stale_held_lock_is_reclaimed_despite_live_holder(tmp_path, monkeypatch):
"""Regression for Devin Review on extractor.py:166. The TTL reclaim
path used to be dead code: `_try_acquire_file_lock` opened the lock
file with `open(mode="w")` BEFORE checking mtime, which truncated
the file and refreshed mtime to now on every call. Subsequent
`time.time() - lock_path.stat().st_mtime` always saw age ~0, so
`age > TTL` never fired, so `materialize.lock_ttl_seconds` was
silently a no-op.
This test exercises the actual reclaim path: an OLD-mtime lock file
held by a still-living fcntl holder. Pre-fix path: failed probe
refreshes mtime → age check sees ~0 → never reclaims → caller
raises MaterializeInFlightError forever. Post-fix path: stat first,
see old mtime, unlink (creates new inode), open + flock new inode
succeeds (the live holder's flock is on the now-orphan old inode,
no inode-level conflict).
"""
import fcntl
import os
# Use the helper directly — exercising it through `materialize_query`
# would also work but obscures which acquisition we're testing.
from connectors.bigquery.extractor import _try_acquire_file_lock
lock_path = Path(tmp_path) / "t1.parquet.lock"
# Live holder: open + flock. Holder stays alive for the duration
# of the test (we close it in finally).
holder = open(lock_path, "w")
fcntl.flock(holder.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
# Backdate the mtime past the default 24h TTL. This is the
# condition the reclaim should detect.
old_ts = time.time() - 25 * 3600
os.utime(lock_path, (old_ts, old_ts))
try:
# Pre-fix this call returns None (failed probe refreshed mtime,
# age check < TTL, no reclaim). Post-fix: stat first → old mtime
# → unlink + new inode → flock succeeds → returns a holder fd.
new_holder = _try_acquire_file_lock(lock_path)
assert new_holder is not None, (
"TTL reclaim is dead code: the old-mtime lock should have "
"been unlinked and a new inode acquired"
)
assert new_holder.fileno() != holder.fileno()
new_holder.close()
finally:
fcntl.flock(holder.fileno(), fcntl.LOCK_UN)
holder.close()
def test_failed_probe_does_not_self_refresh_lock_mtime(tmp_path, monkeypatch):
"""Sister test: the pre-probe stat must read the REAL mtime, not a
value contaminated by the call's own `open(mode='w')`. Pre-fix the
probe ran first and truncated the file, so any subsequent caller —
including this test's own followup stat — saw ~now mtime. After
fix, a failed acquisition should NOT update mtime if the file
wasn't already due for reclaim.
Setup: lock file exists with mtime FRESH (within TTL) AND held by
a live holder. New call probes → fails → returns None. Assertion:
mtime after the failed call is no more than ~1 s newer than the
pre-call mtime — the failed probe's `open('w')` does still touch
the file (mode='w' inherently truncates on open), and we accept
that as documented behavior. But mtime must NOT have jumped from
"old fresh" to "way fresher" by some pathological refresh loop.
"""
import fcntl
from connectors.bigquery.extractor import _try_acquire_file_lock
lock_path = Path(tmp_path) / "t1.parquet.lock"
holder = open(lock_path, "w")
fcntl.flock(holder.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
pre_call_mtime = lock_path.stat().st_mtime
try:
# Probe fails — no reclaim because mtime is fresh.
result = _try_acquire_file_lock(lock_path)
assert result is None, "fresh held lock must block, not reclaim"
post_call_mtime = lock_path.stat().st_mtime
# The probe still opens with mode='w', which DOES update mtime.
# That's documented in the helper's docstring as the
# "last-attempted-acquire" signal. We're not asserting "mtime
# unchanged" — just that the operation is bounded (no runaway).
assert post_call_mtime - pre_call_mtime < 5, (
"failed probe shifted mtime by more than 5s — implausible "
"unless the helper looped"
)
finally:
fcntl.flock(holder.fileno(), fcntl.LOCK_UN)
holder.close()
def test_lock_ttl_reads_from_instance_config(tmp_path, monkeypatch):
"""When `materialize.lock_ttl_seconds` is set in instance.yaml, that
value overrides the default."""
# Patches `app.instance_config.get_value` directly. This works because
# `_get_lock_ttl_seconds` re-imports `get_value` on every call (see
# extractor.py for the deferred-import rationale). If a future change
# hoists the import to module-level, this patch must change to target
# `connectors.bigquery.extractor.get_value` instead.
monkeypatch.setattr(
"app.instance_config.get_value",
lambda *args, **kw: 60 if args == ("materialize", "lock_ttl_seconds") else kw.get("default"),
)
from connectors.bigquery.extractor import _get_lock_ttl_seconds
assert _get_lock_ttl_seconds() == 60