Open-source AI data analyst platform extracted from internal repo. Includes data sync engine, Keboola adapter, Flask web portal, server deployment scripts, and configuration templates.
392 lines
14 KiB
Python
392 lines
14 KiB
Python
"""Tests for per-month Parquet advisory file locking (src/jira_file_lock.py).

Verifies that parquet_month_lock correctly:
- Acquires and releases locks via context manager
- Auto-creates the .locks/ directory
- Provides mutual exclusion for the same month key (threading)
- Allows concurrent locks on different month keys
- Integration: N threads calling transform_single_issue with different
  issues in the same month produce a Parquet file with all issues
"""
|
|
|
|
import json
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.jira_file_lock import parquet_month_lock
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers for integration test
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_minimal_issue_json(issue_key: str, created_at: str) -> dict:
    """Assemble a bare-bones raw Jira issue payload for the transform tests.

    The payload is intended to pass through transform_issue().

    Args:
        issue_key: Jira key such as ``"TEST-1"``. It is stored as ``key``,
            embedded in the summary, and (with dashes removed) used as ``id``.
        created_at: Timestamp string stored as both ``created`` and
            ``updated``.

    Returns:
        A freshly built dict on every call; optional and custom fields are
        ``None`` or empty containers.
    """
    # Custom fields that are present in the payload but always null here.
    null_custom_fields = (
        "customfield_10010",
        "customfield_10004",
        "customfield_10323",
        "customfield_10511",
        "customfield_10156",
        "customfield_10002",
        "customfield_10365",
        "customfield_10330",
        "customfield_10325",
        "customfield_10350",
        "customfield_10676",
        "customfield_10475",
        "customfield_10157",
        "customfield_10328",
        "customfield_10161",
        "customfield_11831",
    )

    fields: dict = {
        "summary": f"Test issue {issue_key}",
        "description": None,
        "issuetype": {"name": "Bug"},
        "status": {"name": "Open", "statusCategory": {"name": "To Do"}},
        "priority": {"name": "Medium"},
        "resolution": None,
        "project": {"key": "TEST", "name": "Test Project"},
        "creator": None,
        "reporter": None,
        "assignee": None,
        "created": created_at,
        "updated": created_at,
        "resolutiondate": None,
        "duedate": None,
        "labels": [],
        "attachment": [],
        "comment": {"total": 0, "comments": []},
        "issuelinks": [],
    }
    # Appended after the standard fields so the key order stays unchanged.
    fields.update(dict.fromkeys(null_custom_fields))

    issue_id = issue_key.replace("-", "")
    return {
        "key": issue_key,
        "id": issue_id,
        "fields": fields,
        "changelog": {"histories": []},
        "_remote_links": [],
        "_synced_at": "2025-01-15T12:00:00Z",
    }
|
|
|
|
|
|
# ===========================================================================
|
|
# TestBasicParquetLock
|
|
# ===========================================================================
|
|
|
|
|
|
class TestBasicParquetLock:
    """Single-threaded checks of the parquet_month_lock context manager."""

    def test_lock_creates_lock_file(self, tmp_path: Path) -> None:
        """The month's lock file is on disk while the lock is held."""
        parquet_root = tmp_path / "parquet"
        parquet_root.mkdir()
        expected_lock = parquet_root / ".locks" / "parquet-2025-01.lock"

        with parquet_month_lock(parquet_root, "2025-01"):
            assert expected_lock.exists(), "Lock file should exist while lock is held"

    def test_lock_file_persists_after_release(self, tmp_path: Path) -> None:
        """Leaving the context manager does not remove the lock file."""
        parquet_root = tmp_path / "parquet"
        parquet_root.mkdir()
        expected_lock = parquet_root / ".locks" / "parquet-2025-06.lock"

        with parquet_month_lock(parquet_root, "2025-06"):
            pass

        assert expected_lock.exists(), "Lock file should persist after release"

    def test_lock_can_be_reacquired(self, tmp_path: Path) -> None:
        """The same month key can be locked again once it has been released."""
        parquet_root = tmp_path / "parquet"
        parquet_root.mkdir()

        # The second pass must not block: the first one has already released.
        for _ in range(2):
            with parquet_month_lock(parquet_root, "2025-03"):
                pass

    def test_locks_dir_created_when_missing(self, tmp_path: Path) -> None:
        """Acquiring a lock creates the .locks/ directory if it is absent."""
        parquet_root = tmp_path / "parquet"
        parquet_root.mkdir()
        lock_home = parquet_root / ".locks"

        assert not lock_home.exists()

        with parquet_month_lock(parquet_root, "2025-01"):
            assert lock_home.is_dir()

    def test_nested_parent_creation(self, tmp_path: Path) -> None:
        """Locking works when the output directory itself does not exist yet."""
        # Deliberately not created beforehand.
        parquet_root = tmp_path / "deep" / "nested" / "parquet"
        lock_home = parquet_root / ".locks"

        with parquet_month_lock(parquet_root, "2025-12"):
            assert lock_home.is_dir()
|
|
|
|
|
|
# ===========================================================================
|
|
# TestConcurrentParquetLock
|
|
# ===========================================================================
|
|
|
|
|
|
class TestConcurrentParquetLock:
    """Two threads cannot hold the same month lock simultaneously."""

    def test_mutual_exclusion_same_month(self, tmp_path: Path) -> None:
        """Prove two threads locking the same month do not overlap.

        Each worker records an ``enter`` and an ``exit`` event while holding
        the lock. With mutual exclusion the recorded sequence must be
        enter/exit of one worker followed by enter/exit of the other.
        """
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        order: list[str] = []
        barrier = threading.Barrier(2)

        def worker(name: str) -> None:
            # Line both workers up so they contend for the lock together.
            barrier.wait()
            with parquet_month_lock(output_dir, "2025-01"):
                order.append(f"enter:{name}")
                time.sleep(0.1)
                order.append(f"exit:{name}")

        # daemon=True: a worker stuck on the lock must not keep the
        # interpreter alive after the test has failed.
        threads = [
            threading.Thread(target=worker, args=(name,), name=f"worker-{name}", daemon=True)
            for name in ("A", "B")
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=5)

        # join() returns None even on timeout, so liveness has to be checked
        # explicitly to tell a hang apart from an ordering failure.
        hung = [t.name for t in threads if t.is_alive()]
        assert not hung, f"Worker threads did not finish within timeout: {hung}"

        assert len(order) == 4, f"Expected 4 events, got {order}"

        first = order[0].split(":")[1]
        second = order[2].split(":")[1]
        assert order[0] == f"enter:{first}"
        assert order[1] == f"exit:{first}"
        assert order[2] == f"enter:{second}"
        assert order[3] == f"exit:{second}"
        assert first != second

    def test_counter_integrity_under_contention(self, tmp_path: Path) -> None:
        """Multiple threads incrementing a shared counter must not lose updates.

        The counter lives in a file and is updated with an unprotected
        read-modify-write, so only the month lock prevents lost updates.
        """
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        counter_file = tmp_path / "counter.txt"
        counter_file.write_text("0")

        num_threads = 4
        increments_per_thread = 20
        barrier = threading.Barrier(num_threads)

        def increment_worker() -> None:
            barrier.wait()
            for _ in range(increments_per_thread):
                with parquet_month_lock(output_dir, "2025-01"):
                    value = int(counter_file.read_text())
                    value += 1
                    counter_file.write_text(str(value))

        threads = [
            threading.Thread(target=increment_worker, daemon=True)
            for _ in range(num_threads)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=30)

        # A still-running worker would otherwise surface as a bogus
        # "lost update" below (and could still be writing the counter file).
        hung = [t.name for t in threads if t.is_alive()]
        assert not hung, f"Worker threads did not finish within timeout: {hung}"

        expected = num_threads * increments_per_thread
        actual = int(counter_file.read_text())
        assert actual == expected, (
            f"Counter should be {expected} but got {actual} "
            f"(indicates lost updates due to missing mutual exclusion)"
        )
|
|
|
|
|
|
# ===========================================================================
|
|
# TestDifferentMonthsNotBlocked
|
|
# ===========================================================================
|
|
|
|
|
|
class TestDifferentMonthsNotBlocked:
    """Locks on different month keys do not block each other."""

    def test_different_months_lock_concurrently(self, tmp_path: Path) -> None:
        """Two threads holding locks for different months overlap in time.

        Each worker records monotonic enter/exit timestamps while holding its
        lock; the two intervals must intersect.
        """
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        timings: dict[str, dict[str, float]] = {}
        barrier = threading.Barrier(2)

        def worker(month_key: str) -> None:
            # Start both workers together so their hold periods can overlap.
            barrier.wait()
            with parquet_month_lock(output_dir, month_key):
                timings[month_key] = {"enter": time.monotonic()}
                time.sleep(0.15)
                timings[month_key]["exit"] = time.monotonic()

        # daemon=True: a stuck worker must not block interpreter shutdown.
        threads = [
            threading.Thread(target=worker, args=(month,), name=f"worker-{month}", daemon=True)
            for month in ("2025-01", "2025-02")
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=5)

        # join() gives no indication of a timeout; without this check a hung
        # worker would show up as a KeyError on the missing "exit" timestamp.
        hung = [t.name for t in threads if t.is_alive()]
        assert not hung, f"Worker threads did not finish within timeout: {hung}"

        assert "2025-01" in timings and "2025-02" in timings

        jan = timings["2025-01"]
        feb = timings["2025-02"]

        # Standard interval-intersection test on the two hold periods.
        overlap = jan["enter"] < feb["exit"] and feb["enter"] < jan["exit"]
        assert overlap, (
            f"Different month locks should allow concurrent access. "
            f"Jan: {jan}, Feb: {feb}"
        )

    def test_separate_lock_files_created(self, tmp_path: Path) -> None:
        """Holding two month locks at once yields one lock file per month."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        with parquet_month_lock(output_dir, "2025-01"):
            with parquet_month_lock(output_dir, "2025-02"):
                locks_dir = output_dir / ".locks"
                lock_files = sorted(f.name for f in locks_dir.iterdir())
                assert "parquet-2025-01.lock" in lock_files
                assert "parquet-2025-02.lock" in lock_files
|
|
|
|
|
|
# ===========================================================================
|
|
# TestParquetLockIntegration
|
|
# ===========================================================================
|
|
|
|
|
|
class TestParquetLockIntegration:
    """Integration test: concurrent transform_single_issue calls preserve all data.

    This is the key test that reproduces the race condition from issue #205.
    N threads call transform_single_issue() with different issues that all
    belong to the same month. The resulting Parquet file must contain ALL issues.
    """

    @staticmethod
    def _prepare_dirs(tmp_path: Path) -> tuple[Path, Path, Path, Path]:
        """Create the raw/issues, raw/attachments and parquet directories.

        Returns:
            ``(raw_dir, issues_dir, attachments_dir, output_dir)``.
        """
        raw_dir = tmp_path / "raw"
        issues_dir = raw_dir / "issues"
        issues_dir.mkdir(parents=True)
        attachments_dir = raw_dir / "attachments"
        attachments_dir.mkdir(parents=True)

        output_dir = tmp_path / "parquet"
        output_dir.mkdir()
        return raw_dir, issues_dir, attachments_dir, output_dir

    @staticmethod
    def _run_concurrent_transforms(
        issue_keys: list[str],
        raw_dir: Path,
        output_dir: Path,
        attachments_dir: Path,
    ) -> list[str]:
        """Run transform_single_issue for every key, one thread per key.

        All threads are released together through a barrier to maximise
        contention.

        Returns:
            Human-readable error strings: one per transform that returned a
            falsy result, raised, or did not finish within the join timeout.
            An empty list means every transform completed successfully.
        """
        # Imported lazily, as in the original tests, so the lock-only tests in
        # this module do not depend on the transform module being importable.
        from src.incremental_jira_transform import transform_single_issue

        barrier = threading.Barrier(len(issue_keys))
        errors: list[str] = []

        def transform_worker(issue_key: str) -> None:
            # Broad catch is deliberate: an exception in a worker thread would
            # otherwise be lost instead of failing the test.
            try:
                barrier.wait(timeout=10)
                result = transform_single_issue(
                    issue_key=issue_key,
                    raw_dir=raw_dir,
                    output_dir=output_dir,
                    attachments_dir=attachments_dir,
                )
                if not result:
                    errors.append(f"transform_single_issue returned False for {issue_key}")
            except Exception as e:
                errors.append(f"Exception for {issue_key}: {e}")

        # daemon=True: a transform stuck on a lock must not keep the
        # interpreter alive once the test has reported the failure.
        threads = [
            threading.Thread(target=transform_worker, args=(key,), daemon=True)
            for key in issue_keys
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=60)

        # join() returns None on timeout as well, so a hung transform has to be
        # detected explicitly; otherwise it is misreported as missing data.
        errors.extend(
            f"Transform thread for {key} did not finish within timeout"
            for key, t in zip(issue_keys, threads)
            if t.is_alive()
        )
        return errors

    def test_concurrent_transforms_no_data_loss(self, tmp_path: Path) -> None:
        """Simulate concurrent webhook transforms for same month."""
        raw_dir, issues_dir, attachments_dir, output_dir = self._prepare_dirs(tmp_path)

        # All issues created in January 2025 → same month partition
        num_issues = 8
        issue_keys = [f"TEST-{i}" for i in range(1, num_issues + 1)]

        for key in issue_keys:
            raw_json = _make_minimal_issue_json(key, "2025-01-15T10:00:00.000+0000")
            json_path = issues_dir / f"{key}.json"
            json_path.write_text(json.dumps(raw_json))

        errors = self._run_concurrent_transforms(
            issue_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify: all issues must be present in the Parquet file
        issues_parquet = output_dir / "issues" / "2025-01.parquet"
        assert issues_parquet.exists(), "Issues Parquet file should exist"

        df = pd.read_parquet(issues_parquet)
        found_keys = set(df["issue_key"].tolist())

        assert found_keys == set(issue_keys), (
            f"Expected all {num_issues} issues in Parquet but found {len(found_keys)}. "
            f"Missing: {set(issue_keys) - found_keys}"
        )

    def test_concurrent_transforms_different_months_independent(self, tmp_path: Path) -> None:
        """Issues in different months should not interfere with each other."""
        raw_dir, issues_dir, attachments_dir, output_dir = self._prepare_dirs(tmp_path)

        # 2 issues in Jan, 2 in Feb
        jan_keys = ["JAN-1", "JAN-2"]
        feb_keys = ["FEB-1", "FEB-2"]

        for key in jan_keys:
            raw_json = _make_minimal_issue_json(key, "2025-01-10T10:00:00.000+0000")
            (issues_dir / f"{key}.json").write_text(json.dumps(raw_json))

        for key in feb_keys:
            raw_json = _make_minimal_issue_json(key, "2025-02-10T10:00:00.000+0000")
            (issues_dir / f"{key}.json").write_text(json.dumps(raw_json))

        all_keys = jan_keys + feb_keys
        errors = self._run_concurrent_transforms(
            all_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify Jan
        jan_df = pd.read_parquet(output_dir / "issues" / "2025-01.parquet")
        assert set(jan_df["issue_key"].tolist()) == set(jan_keys)

        # Verify Feb
        feb_df = pd.read_parquet(output_dir / "issues" / "2025-02.parquet")
        assert set(feb_df["issue_key"].tolist()) == set(feb_keys)
|