Move all Jira-specific code into a self-contained connector module:
- 22 files moved via git mv (transform, service, webhook, scripts, systemd units, tests, docs, bin helper)
- All imports updated to use connectors.jira.* paths
- Jira is now conditional: auto-detected via JIRA_DOMAIN env var
- Webapp registers Jira blueprint only when available
- Health service monitors Jira timers only when enabled
- Profiler loads Jira tables dynamically from filesystem
- Sync settings uses config-driven dependency validation
- Renamed keboola_platform_url -> custom_url in transform
- Updated deploy.sh, sudoers-deploy, backfill_gap.sh paths
- Fixed pytest.ini to skip live tests by default
392 lines
14 KiB
Python
392 lines
14 KiB
Python
"""Tests for per-month Parquet advisory file locking (connectors/jira/file_lock.py).
|
|
|
|
Verifies that parquet_month_lock correctly:
|
|
- Acquires and releases locks via context manager
|
|
- Auto-creates the .locks/ directory
|
|
- Provides mutual exclusion for the same month key (threading)
|
|
- Allows concurrent locks on different month keys
|
|
- Integration: N threads calling transform_single_issue with different
|
|
issues in the same month produce a Parquet file with all issues
|
|
"""
|
|
|
|
import json
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from connectors.jira.file_lock import parquet_month_lock
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers for integration test
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_minimal_issue_json(issue_key: str, created_at: str) -> dict:
|
|
"""Build a minimal raw Jira JSON that passes through transform_issue()."""
|
|
return {
|
|
"key": issue_key,
|
|
"id": issue_key.replace("-", ""),
|
|
"fields": {
|
|
"summary": f"Test issue {issue_key}",
|
|
"description": None,
|
|
"issuetype": {"name": "Bug"},
|
|
"status": {"name": "Open", "statusCategory": {"name": "To Do"}},
|
|
"priority": {"name": "Medium"},
|
|
"resolution": None,
|
|
"project": {"key": "TEST", "name": "Test Project"},
|
|
"creator": None,
|
|
"reporter": None,
|
|
"assignee": None,
|
|
"created": created_at,
|
|
"updated": created_at,
|
|
"resolutiondate": None,
|
|
"duedate": None,
|
|
"labels": [],
|
|
"attachment": [],
|
|
"comment": {"total": 0, "comments": []},
|
|
"issuelinks": [],
|
|
"customfield_10010": None,
|
|
"customfield_10004": None,
|
|
"customfield_10323": None,
|
|
"customfield_10511": None,
|
|
"customfield_10156": None,
|
|
"customfield_10002": None,
|
|
"customfield_10365": None,
|
|
"customfield_10330": None,
|
|
"customfield_10325": None,
|
|
"customfield_10350": None,
|
|
"customfield_10676": None,
|
|
"customfield_10475": None,
|
|
"customfield_10157": None,
|
|
"customfield_10328": None,
|
|
"customfield_10161": None,
|
|
"customfield_11831": None,
|
|
},
|
|
"changelog": {"histories": []},
|
|
"_remote_links": [],
|
|
"_synced_at": "2025-01-15T12:00:00Z",
|
|
}
|
|
|
|
|
|
# ===========================================================================
|
|
# TestBasicParquetLock
|
|
# ===========================================================================
|
|
|
|
|
|
class TestBasicParquetLock:
    """Single-threaded checks of the ``parquet_month_lock`` context manager."""

    def test_lock_creates_lock_file(self, tmp_path: Path) -> None:
        """The per-month lock file is present while the lock is held."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        expected_lock = parquet_dir / ".locks" / "parquet-2025-01.lock"

        with parquet_month_lock(parquet_dir, "2025-01"):
            assert expected_lock.exists(), "Lock file should exist while lock is held"

    def test_lock_file_persists_after_release(self, tmp_path: Path) -> None:
        """Leaving the context does not remove the lock file."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        expected_lock = parquet_dir / ".locks" / "parquet-2025-06.lock"

        with parquet_month_lock(parquet_dir, "2025-06"):
            pass

        assert expected_lock.exists(), "Lock file should persist after release"

    def test_lock_can_be_reacquired(self, tmp_path: Path) -> None:
        """Taking the same month lock twice in a row must succeed."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()

        # The second pass should not block: the first has already released.
        for _attempt in range(2):
            with parquet_month_lock(parquet_dir, "2025-03"):
                pass

    def test_locks_dir_created_when_missing(self, tmp_path: Path) -> None:
        """The ``.locks`` directory is created on demand."""
        parquet_dir = tmp_path / "parquet"
        parquet_dir.mkdir()
        lock_root = parquet_dir / ".locks"

        # Precondition: nothing has created the directory yet.
        assert not lock_root.exists()

        with parquet_month_lock(parquet_dir, "2025-01"):
            assert lock_root.is_dir()

    def test_nested_parent_creation(self, tmp_path: Path) -> None:
        """Locking works even when the output directory itself does not exist."""
        # Deliberately not created beforehand.
        parquet_dir = tmp_path / "deep" / "nested" / "parquet"
        lock_root = parquet_dir / ".locks"

        with parquet_month_lock(parquet_dir, "2025-12"):
            assert lock_root.is_dir()
|
|
|
|
|
|
# ===========================================================================
|
|
# TestConcurrentParquetLock
|
|
# ===========================================================================
|
|
|
|
|
|
class TestConcurrentParquetLock:
    """Two threads cannot hold the same month lock simultaneously."""

    @staticmethod
    def _run_workers(threads: list, join_timeout: float) -> None:
        """Start every thread, wait for each, and fail if any is still running.

        ``Thread.join(timeout=...)`` returns silently when the timeout expires,
        so liveness is checked explicitly to report a deadlocked worker
        instead of letting a later assertion fail for an unrelated reason.
        """
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join(timeout=join_timeout)
        stuck = [thread.name for thread in threads if thread.is_alive()]
        assert not stuck, f"Worker threads did not finish within {join_timeout}s: {stuck}"

    def test_mutual_exclusion_same_month(self, tmp_path: Path) -> None:
        """Prove two threads locking the same month do not overlap."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        order: list[str] = []
        barrier = threading.Barrier(2)

        def worker(name: str) -> None:
            # Bounded wait: if the peer never arrives this thread errors out
            # instead of blocking forever.
            barrier.wait(timeout=10)
            with parquet_month_lock(output_dir, "2025-01"):
                order.append(f"enter:{name}")
                # Hold the lock long enough for the other thread to contend.
                time.sleep(0.1)
                order.append(f"exit:{name}")

        # Daemon threads so a deadlocked worker cannot keep the process alive.
        threads = [
            threading.Thread(target=worker, args=(name,), daemon=True)
            for name in ("A", "B")
        ]
        self._run_workers(threads, join_timeout=5)

        assert len(order) == 4, f"Expected 4 events, got {order}"

        # Events must be strictly enter/exit of one thread, then the other.
        first = order[0].split(":")[1]
        second = order[2].split(":")[1]
        assert order[0] == f"enter:{first}"
        assert order[1] == f"exit:{first}"
        assert order[2] == f"enter:{second}"
        assert order[3] == f"exit:{second}"
        assert first != second

    def test_counter_integrity_under_contention(self, tmp_path: Path) -> None:
        """Multiple threads incrementing a shared counter must not lose updates."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        counter_file = tmp_path / "counter.txt"
        counter_file.write_text("0")

        num_threads = 4
        increments_per_thread = 20
        barrier = threading.Barrier(num_threads)

        def increment_worker() -> None:
            barrier.wait(timeout=10)
            for _ in range(increments_per_thread):
                # Read-modify-write on a file: only correct if the lock
                # serialises the whole sequence.
                with parquet_month_lock(output_dir, "2025-01"):
                    value = int(counter_file.read_text())
                    value += 1
                    counter_file.write_text(str(value))

        threads = [
            threading.Thread(target=increment_worker, daemon=True)
            for _ in range(num_threads)
        ]
        self._run_workers(threads, join_timeout=30)

        expected = num_threads * increments_per_thread
        actual = int(counter_file.read_text())
        assert actual == expected, (
            f"Counter should be {expected} but got {actual} "
            f"(indicates lost updates due to missing mutual exclusion)"
        )
|
|
|
|
|
|
# ===========================================================================
|
|
# TestDifferentMonthsNotBlocked
|
|
# ===========================================================================
|
|
|
|
|
|
class TestDifferentMonthsNotBlocked:
    """Locks on different month keys do not block each other."""

    def test_different_months_lock_concurrently(self, tmp_path: Path) -> None:
        """Two threads holding different month locks are inside them at the same time."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        timings: dict[str, dict[str, float]] = {}
        barrier = threading.Barrier(2)

        def worker(month_key: str) -> None:
            # Bounded wait so a missing peer cannot hang this thread forever.
            barrier.wait(timeout=10)
            with parquet_month_lock(output_dir, month_key):
                timings[month_key] = {"enter": time.monotonic()}
                # Hold the lock long enough for the two critical sections to overlap.
                time.sleep(0.15)
                timings[month_key]["exit"] = time.monotonic()

        month_keys = ("2025-01", "2025-02")
        # Daemon threads so a deadlocked worker cannot keep the process alive.
        threads = [
            threading.Thread(target=worker, args=(month_key,), daemon=True)
            for month_key in month_keys
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join(timeout=5)

        # join() returns silently on timeout; check liveness so a hung worker
        # is reported directly rather than as a KeyError on a missing "exit".
        stuck = [thread.name for thread in threads if thread.is_alive()]
        assert not stuck, f"Worker threads did not finish: {stuck}"

        assert "2025-01" in timings and "2025-02" in timings

        jan = timings["2025-01"]
        feb = timings["2025-02"]

        # Standard interval-overlap check: each section starts before the other ends.
        overlap = jan["enter"] < feb["exit"] and feb["enter"] < jan["exit"]
        assert overlap, (
            f"Different month locks should allow concurrent access. "
            f"Jan: {jan}, Feb: {feb}"
        )

    def test_separate_lock_files_created(self, tmp_path: Path) -> None:
        """Each month key gets its own lock file under ``.locks``."""
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()

        # Holding both locks at once in a single thread also shows that
        # different keys do not block one another.
        with parquet_month_lock(output_dir, "2025-01"), parquet_month_lock(output_dir, "2025-02"):
            locks_dir = output_dir / ".locks"
            lock_files = {entry.name for entry in locks_dir.iterdir()}
            assert "parquet-2025-01.lock" in lock_files
            assert "parquet-2025-02.lock" in lock_files
|
|
|
|
|
|
# ===========================================================================
|
|
# TestParquetLockIntegration
|
|
# ===========================================================================
|
|
|
|
|
|
class TestParquetLockIntegration:
    """Integration test: concurrent transform_single_issue calls preserve all data.

    This is the key test that reproduces the race condition from issue #205.
    N threads call transform_single_issue() with different issues that all
    belong to the same month. The resulting Parquet file must contain ALL issues.
    """

    @staticmethod
    def _make_workspace(tmp_path: Path) -> tuple:
        """Create the directory layout used by the tests.

        Returns:
            ``(raw_dir, attachments_dir, output_dir)``; ``raw_dir / "issues"``
            is created as well.
        """
        raw_dir = tmp_path / "raw"
        (raw_dir / "issues").mkdir(parents=True)
        attachments_dir = raw_dir / "attachments"
        attachments_dir.mkdir(parents=True)
        output_dir = tmp_path / "parquet"
        output_dir.mkdir()
        return raw_dir, attachments_dir, output_dir

    @staticmethod
    def _write_raw_issues(raw_dir: Path, issue_keys: list, created_at: str) -> None:
        """Write one minimal raw issue JSON file per key into ``raw_dir / "issues"``."""
        for key in issue_keys:
            raw_json = _make_minimal_issue_json(key, created_at)
            (raw_dir / "issues" / f"{key}.json").write_text(json.dumps(raw_json))

    @staticmethod
    def _transform_concurrently(
        issue_keys: list,
        raw_dir: Path,
        output_dir: Path,
        attachments_dir: Path,
        join_timeout: float = 60,
    ) -> list:
        """Run transform_single_issue() for every key, one thread per key.

        All workers are released together by a barrier to maximise contention.

        Returns:
            Human-readable error strings: one per falsy result, per raised
            exception, and per worker still running after ``join_timeout``.
        """
        # Imported lazily, as in the original tests, so the module can be
        # collected without importing the transform.
        from connectors.jira.incremental_transform import transform_single_issue

        barrier = threading.Barrier(len(issue_keys))
        errors: list[str] = []

        def transform_worker(issue_key: str) -> None:
            try:
                barrier.wait(timeout=10)
                result = transform_single_issue(
                    issue_key=issue_key,
                    raw_dir=raw_dir,
                    output_dir=output_dir,
                    attachments_dir=attachments_dir,
                )
                if not result:
                    errors.append(f"transform_single_issue returned False for {issue_key}")
            except Exception as e:
                # Thread boundary: record the failure so the main thread can assert on it.
                errors.append(f"Exception for {issue_key}: {e}")

        # Daemon threads so a deadlocked transform cannot keep the process alive.
        workers = {
            key: threading.Thread(target=transform_worker, args=(key,), daemon=True)
            for key in issue_keys
        }
        for thread in workers.values():
            thread.start()
        for thread in workers.values():
            thread.join(timeout=join_timeout)

        # join() returns silently on timeout; report hung workers explicitly.
        errors.extend(
            f"Transform for {key} still running after {join_timeout}s"
            for key, thread in workers.items()
            if thread.is_alive()
        )
        return errors

    def test_concurrent_transforms_no_data_loss(self, tmp_path: Path) -> None:
        """Simulate concurrent webhook transforms for same month."""
        raw_dir, attachments_dir, output_dir = self._make_workspace(tmp_path)

        # All issues created in January 2025 → same month partition
        num_issues = 8
        issue_keys = [f"TEST-{i}" for i in range(1, num_issues + 1)]
        self._write_raw_issues(raw_dir, issue_keys, "2025-01-15T10:00:00.000+0000")

        errors = self._transform_concurrently(
            issue_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify: all issues must be present in the Parquet file
        issues_parquet = output_dir / "issues" / "2025-01.parquet"
        assert issues_parquet.exists(), "Issues Parquet file should exist"

        df = pd.read_parquet(issues_parquet)
        found_keys = set(df["issue_key"].tolist())

        assert found_keys == set(issue_keys), (
            f"Expected all {num_issues} issues in Parquet but found {len(found_keys)}. "
            f"Missing: {set(issue_keys) - found_keys}"
        )

    def test_concurrent_transforms_different_months_independent(self, tmp_path: Path) -> None:
        """Issues in different months should not interfere with each other."""
        raw_dir, attachments_dir, output_dir = self._make_workspace(tmp_path)

        # 2 issues in Jan, 2 in Feb
        jan_keys = ["JAN-1", "JAN-2"]
        feb_keys = ["FEB-1", "FEB-2"]
        self._write_raw_issues(raw_dir, jan_keys, "2025-01-10T10:00:00.000+0000")
        self._write_raw_issues(raw_dir, feb_keys, "2025-02-10T10:00:00.000+0000")

        errors = self._transform_concurrently(
            jan_keys + feb_keys, raw_dir, output_dir, attachments_dir
        )
        assert not errors, f"Transform errors: {errors}"

        # Verify Jan
        jan_df = pd.read_parquet(output_dir / "issues" / "2025-01.parquet")
        assert set(jan_df["issue_key"].tolist()) == set(jan_keys)

        # Verify Feb
        feb_df = pd.read_parquet(output_dir / "issues" / "2025-02.parquet")
        assert set(feb_df["issue_key"].tolist()) == set(feb_keys)
|