agnes-the-ai-analyst/connectors/jira/tests/test_parquet_lock.py
Petr 86edd27655 Extract Jira into connectors/jira module
Move all Jira-specific code into a self-contained connector module:
- 22 files moved via git mv (transform, service, webhook, scripts,
  systemd units, tests, docs, bin helper)
- All imports updated to use connectors.jira.* paths
- Jira is now conditional: auto-detected via JIRA_DOMAIN env var
- Webapp registers Jira blueprint only when available
- Health service monitors Jira timers only when enabled
- Profiler loads Jira tables dynamically from filesystem
- Sync settings uses config-driven dependency validation
- Renamed keboola_platform_url -> custom_url in transform
- Updated deploy.sh, sudoers-deploy, backfill_gap.sh paths
- Fixed pytest.ini to skip live tests by default
2026-03-09 11:17:50 +01:00

392 lines
14 KiB
Python

"""Tests for per-month Parquet advisory file locking (connectors/jira/file_lock.py).
Verifies that parquet_month_lock correctly:
- Acquires and releases locks via context manager
- Auto-creates the .locks/ directory
- Provides mutual exclusion for the same month key (threading)
- Allows concurrent locks on different month keys
- Integration: N threads calling transform_single_issue with different
  issues in the same month produce a Parquet file containing all issues
"""
import json
import threading
import time
from pathlib import Path
import pandas as pd
import pytest
from connectors.jira.file_lock import parquet_month_lock
# ---------------------------------------------------------------------------
# Helpers for integration test
# ---------------------------------------------------------------------------
def _make_minimal_issue_json(issue_key: str, created_at: str) -> dict:
"""Build a minimal raw Jira JSON that passes through transform_issue()."""
return {
"key": issue_key,
"id": issue_key.replace("-", ""),
"fields": {
"summary": f"Test issue {issue_key}",
"description": None,
"issuetype": {"name": "Bug"},
"status": {"name": "Open", "statusCategory": {"name": "To Do"}},
"priority": {"name": "Medium"},
"resolution": None,
"project": {"key": "TEST", "name": "Test Project"},
"creator": None,
"reporter": None,
"assignee": None,
"created": created_at,
"updated": created_at,
"resolutiondate": None,
"duedate": None,
"labels": [],
"attachment": [],
"comment": {"total": 0, "comments": []},
"issuelinks": [],
"customfield_10010": None,
"customfield_10004": None,
"customfield_10323": None,
"customfield_10511": None,
"customfield_10156": None,
"customfield_10002": None,
"customfield_10365": None,
"customfield_10330": None,
"customfield_10325": None,
"customfield_10350": None,
"customfield_10676": None,
"customfield_10475": None,
"customfield_10157": None,
"customfield_10328": None,
"customfield_10161": None,
"customfield_11831": None,
},
"changelog": {"histories": []},
"_remote_links": [],
"_synced_at": "2025-01-15T12:00:00Z",
}
# ===========================================================================
# TestBasicParquetLock
# ===========================================================================
class TestBasicParquetLock:
    """Single-threaded checks that the parquet month lock is acquired and released cleanly."""

    def test_lock_creates_lock_file(self, tmp_path: Path) -> None:
        """While the lock is held, its lock file is present under .locks/."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        expected = parquet_dir.joinpath(".locks", "parquet-2025-01.lock")

        with parquet_month_lock(parquet_dir, "2025-01"):
            assert expected.exists(), "Lock file should exist while lock is held"

    def test_lock_file_persists_after_release(self, tmp_path: Path) -> None:
        """Leaving the context manager does not remove the lock file."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        expected = parquet_dir.joinpath(".locks", "parquet-2025-06.lock")

        with parquet_month_lock(parquet_dir, "2025-06"):
            pass

        assert expected.exists(), "Lock file should persist after release"

    def test_lock_can_be_reacquired(self, tmp_path: Path) -> None:
        """The same month key can be locked again once it has been released."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()

        # The second pass should not block: the first lock is already released.
        for _attempt in range(2):
            with parquet_month_lock(parquet_dir, "2025-03"):
                pass

    def test_locks_dir_created_when_missing(self, tmp_path: Path) -> None:
        """The .locks/ directory is created on demand inside the output dir."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        locks_dir = parquet_dir / ".locks"
        assert not locks_dir.exists()

        with parquet_month_lock(parquet_dir, "2025-01"):
            assert locks_dir.is_dir()

    def test_nested_parent_creation(self, tmp_path: Path) -> None:
        """Locking works even when the output dir and its parents do not exist yet."""
        # Deliberately not created beforehand.
        parquet_dir = tmp_path.joinpath("deep", "nested", "parquet")

        with parquet_month_lock(parquet_dir, "2025-12"):
            assert parquet_dir.joinpath(".locks").is_dir()
# ===========================================================================
# TestConcurrentParquetLock
# ===========================================================================
class TestConcurrentParquetLock:
    """Two threads cannot hold the same month lock simultaneously."""

    def test_mutual_exclusion_same_month(self, tmp_path: Path) -> None:
        """Prove two threads locking the same month do not overlap."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        order: list[str] = []
        errors: list[str] = []
        barrier = threading.Barrier(2)

        def worker(name: str) -> None:
            # Exceptions raised in a thread never reach the test body, so
            # record them explicitly and assert on them afterwards.
            try:
                # Timeout so a peer that failed to start cannot hang the test.
                barrier.wait(timeout=5)
                with parquet_month_lock(output_dir, "2025-01"):
                    order.append(f"enter:{name}")
                    # Hold the lock long enough for the other thread to contend.
                    time.sleep(0.1)
                    order.append(f"exit:{name}")
            except Exception as e:
                errors.append(f"Exception in worker {name}: {e!r}")

        # Daemon threads: a worker stuck on the lock must not keep the
        # test process alive after the failure has been reported.
        threads = [
            threading.Thread(target=worker, args=(name,), daemon=True)
            for name in ("A", "B")
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=5)

        # join(timeout=...) returns silently on timeout; check explicitly.
        assert not any(t.is_alive() for t in threads), (
            f"Worker threads did not finish within timeout; events so far: {order}"
        )
        assert not errors, f"Worker errors: {errors}"

        assert len(order) == 4, f"Expected 4 events, got {order}"
        first = order[0].split(":")[1]
        second = order[2].split(":")[1]
        # Strict enter/exit pairing means the critical sections never interleaved.
        assert order[0] == f"enter:{first}"
        assert order[1] == f"exit:{first}"
        assert order[2] == f"enter:{second}"
        assert order[3] == f"exit:{second}"
        assert first != second

    def test_counter_integrity_under_contention(self, tmp_path: Path) -> None:
        """Multiple threads incrementing a shared counter must not lose updates."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()
        counter_file = tmp_path / "counter.txt"
        counter_file.write_text("0")

        num_threads = 4
        increments_per_thread = 20
        errors: list[str] = []
        barrier = threading.Barrier(num_threads)

        def increment_worker() -> None:
            try:
                barrier.wait(timeout=10)
                for _ in range(increments_per_thread):
                    # Unprotected read-modify-write; only the lock keeps it atomic.
                    with parquet_month_lock(output_dir, "2025-01"):
                        value = int(counter_file.read_text())
                        value += 1
                        counter_file.write_text(str(value))
            except Exception as e:
                errors.append(f"Exception in increment worker: {e!r}")

        threads = [
            threading.Thread(target=increment_worker, daemon=True)
            for _ in range(num_threads)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=30)

        # Without this check a hung worker would be misreported as lost updates.
        assert not any(t.is_alive() for t in threads), (
            "Increment workers did not finish within timeout"
        )
        assert not errors, f"Worker errors: {errors}"

        expected = num_threads * increments_per_thread
        actual = int(counter_file.read_text())
        assert actual == expected, (
            f"Counter should be {expected} but got {actual} "
            f"(indicates lost updates due to missing mutual exclusion)"
        )
# ===========================================================================
# TestDifferentMonthsNotBlocked
# ===========================================================================
class TestDifferentMonthsNotBlocked:
    """Locks on different month keys do not block each other."""

    def test_different_months_lock_concurrently(self, tmp_path: Path) -> None:
        """Two threads holding different month locks are inside them at the same time."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        timings: dict[str, dict[str, float]] = {}
        errors: list[str] = []
        barrier = threading.Barrier(2)

        def worker(month_key: str) -> None:
            # Exceptions raised in a thread never reach the test body, so
            # record them explicitly and assert on them afterwards.
            try:
                # Timeout so a peer that failed to start cannot hang the test.
                barrier.wait(timeout=5)
                with parquet_month_lock(output_dir, month_key):
                    entered = time.monotonic()
                    time.sleep(0.15)
                    # Publish both timestamps at once so the main thread never
                    # observes an entry that is missing its "exit" value.
                    timings[month_key] = {
                        "enter": entered,
                        "exit": time.monotonic(),
                    }
            except Exception as e:
                errors.append(f"Exception in worker {month_key}: {e!r}")

        # Daemon threads: a stuck worker must not keep the test process alive.
        threads = [
            threading.Thread(target=worker, args=(month_key,), daemon=True)
            for month_key in ("2025-01", "2025-02")
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=5)

        # join(timeout=...) returns silently on timeout; check explicitly.
        assert not any(t.is_alive() for t in threads), (
            f"Worker threads did not finish within timeout; timings so far: {timings}"
        )
        assert not errors, f"Worker errors: {errors}"

        assert "2025-01" in timings and "2025-02" in timings
        jan = timings["2025-01"]
        feb = timings["2025-02"]
        # Interval overlap: each lock was entered before the other was exited.
        overlap = jan["enter"] < feb["exit"] and feb["enter"] < jan["exit"]
        assert overlap, (
            f"Different month locks should allow concurrent access. "
            f"Jan: {jan}, Feb: {feb}"
        )

    def test_separate_lock_files_created(self, tmp_path: Path) -> None:
        """Each month key gets its own lock file inside .locks/."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        # Holding both locks in one thread also shows they are independent.
        with parquet_month_lock(output_dir, "2025-01"), parquet_month_lock(
            output_dir, "2025-02"
        ):
            locks_dir = output_dir / ".locks"
            lock_files = sorted(f.name for f in locks_dir.iterdir())
            assert "parquet-2025-01.lock" in lock_files
            assert "parquet-2025-02.lock" in lock_files
# ===========================================================================
# TestParquetLockIntegration
# ===========================================================================
class TestParquetLockIntegration:
    """Integration test: concurrent transform_single_issue calls preserve all data.

    This is the key test that reproduces the race condition from issue #205.
    N threads call transform_single_issue() with different issues that all
    belong to the same month. The resulting Parquet file must contain ALL issues.
    """

    @staticmethod
    def _make_dirs(tmp_path: Path) -> tuple[Path, Path, Path, Path]:
        """Create the directory layout used by the tests.

        Returns:
            ``(raw_dir, issues_dir, attachments_dir, output_dir)``.
        """
        raw_dir = tmp_path / "raw"
        issues_dir = raw_dir / "issues"
        issues_dir.mkdir(parents=True)
        attachments_dir = raw_dir / "attachments"
        attachments_dir.mkdir(parents=True)
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()
        return raw_dir, issues_dir, attachments_dir, output_dir

    @staticmethod
    def _write_raw_issues(issues_dir: Path, keys: list[str], created_at: str) -> None:
        """Write one minimal raw issue JSON file per key into *issues_dir*."""
        for key in keys:
            raw_json = _make_minimal_issue_json(key, created_at)
            (issues_dir / f"{key}.json").write_text(
                json.dumps(raw_json), encoding="utf-8"
            )

    @staticmethod
    def _run_concurrent_transforms(
        issue_keys: list[str],
        raw_dir: Path,
        output_dir: Path,
        attachments_dir: Path,
        *,
        barrier_timeout: float = 10,
        join_timeout: float = 60,
    ) -> list[str]:
        """Run transform_single_issue for every key, one thread per key.

        All threads are released together through a barrier to maximise
        contention.

        Returns:
            Human-readable error strings: falsy transform results, exceptions
            raised inside a worker, and workers still running after
            *join_timeout* seconds. Empty when everything succeeded.
        """
        # Imported lazily, as in the original tests, so the lock-only tests in
        # this module do not depend on the transform module being importable.
        from connectors.jira.incremental_transform import transform_single_issue

        barrier = threading.Barrier(len(issue_keys))
        errors: list[str] = []

        def transform_worker(issue_key: str) -> None:
            # Exceptions raised in a thread never reach the test body, so
            # they are collected and asserted on by the caller.
            try:
                barrier.wait(timeout=barrier_timeout)
                result = transform_single_issue(
                    issue_key=issue_key,
                    raw_dir=raw_dir,
                    output_dir=output_dir,
                    attachments_dir=attachments_dir,
                )
                if not result:
                    errors.append(f"transform_single_issue returned False for {issue_key}")
            except Exception as e:
                errors.append(f"Exception for {issue_key}: {e}")

        # Daemon threads: a worker stuck on the lock must not keep the test
        # process alive once the failure has been reported.
        workers = [
            (key, threading.Thread(target=transform_worker, args=(key,), daemon=True))
            for key in issue_keys
        ]
        for _, thread in workers:
            thread.start()
        for key, thread in workers:
            thread.join(timeout=join_timeout)
            # join(timeout=...) returns silently on timeout; a still-running
            # transform would otherwise be misreported as missing data.
            if thread.is_alive():
                errors.append(
                    f"Transform thread for {key} did not finish within {join_timeout}s"
                )
        return errors

    def test_concurrent_transforms_no_data_loss(self, tmp_path: Path) -> None:
        """Simulate concurrent webhook transforms for same month."""
        raw_dir, issues_dir, attachments_dir, output_dir = self._make_dirs(tmp_path)

        # All issues created in January 2025 → same month partition
        num_issues = 8
        issue_keys = [f"TEST-{i}" for i in range(1, num_issues + 1)]
        self._write_raw_issues(issues_dir, issue_keys, "2025-01-15T10:00:00.000+0000")

        errors = self._run_concurrent_transforms(
            issue_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify: all issues must be present in the Parquet file
        issues_parquet = output_dir / "issues" / "2025-01.parquet"
        assert issues_parquet.exists(), "Issues Parquet file should exist"
        df = pd.read_parquet(issues_parquet)
        found_keys = set(df["issue_key"].tolist())
        assert found_keys == set(issue_keys), (
            f"Expected all {num_issues} issues in Parquet but found {len(found_keys)}. "
            f"Missing: {set(issue_keys) - found_keys}"
        )

    def test_concurrent_transforms_different_months_independent(self, tmp_path: Path) -> None:
        """Issues in different months should not interfere with each other."""
        raw_dir, issues_dir, attachments_dir, output_dir = self._make_dirs(tmp_path)

        # 2 issues in Jan, 2 in Feb
        jan_keys = ["JAN-1", "JAN-2"]
        feb_keys = ["FEB-1", "FEB-2"]
        self._write_raw_issues(issues_dir, jan_keys, "2025-01-10T10:00:00.000+0000")
        self._write_raw_issues(issues_dir, feb_keys, "2025-02-10T10:00:00.000+0000")

        errors = self._run_concurrent_transforms(
            jan_keys + feb_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify Jan
        jan_df = pd.read_parquet(output_dir / "issues" / "2025-01.parquet")
        assert set(jan_df["issue_key"].tolist()) == set(jan_keys)
        # Verify Feb
        feb_df = pd.read_parquet(output_dir / "issues" / "2025-02.parquet")
        assert set(feb_df["issue_key"].tolist()) == set(feb_keys)