* fix(jira): harden _remote_links fetch — transient API failure no longer wipes parquet rows Pre-fix, all three fetch_remote_links call sites (service.py, scripts/backfill.py, scripts/backfill_remote_links.py) silently returned [] on 401/403/429/5xx or httpx.RequestError. Callers overlaid that [] onto cached issue JSON, and transform_remote_links interpreted the empty list as 'issue legitimately has no remote links — delete existing rows', so a transient Jira auth blip permanently wiped remote-link history. Now: - Every fetch site raises JiraFetchError on non-200/non-404 status, on httpx.RequestError, and on the 'service not configured' path. - Overlay sites skip the _remote_links key when fetch raises, leaving it ABSENT (not present-but-empty). - transform_remote_links returns None for absent/null keys (preserve existing rows) vs [] (legitimate empty — wipe). - Both consumers (batch transform_all, incremental transform_single_issue) honor the new contract. - End-to-end tests test_incremental_preserves_remote_links_when_overlay_absent and test_incremental_wipes_remote_links_when_overlay_present_but_empty lock both halves. Adversarial-review fixes bundled: - service.py: unconfigured-service path now raises JiraFetchError instead of returning [] (a webhook can arrive while API creds are missing — HMAC verification uses a separate JIRA_WEBHOOK_SECRET). Regression guard test_raises_when_unconfigured added. - consistency_check.py: AUTO_FIX_THRESHOLD bumped 10 -> 20 to cover typical SLA-poller hiccups before escalating to ERROR. - CLAUDE.md: connectors/jira/transform.py removed from 'Files NOT to modify' (overlay-contract change required touching it; module remains sensitive but is no longer off-limits). * release: 0.54.19 — jira remote_links hardening (transient API failure no longer wipes parquet rows)
314 lines
12 KiB
Python
314 lines
12 KiB
Python
"""Tests for incremental Jira parquet transform (upsert_dataframe and friends)."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import duckdb
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from connectors.jira.incremental_transform import (
|
|
load_parquet_month,
|
|
save_parquet_month,
|
|
transform_single_issue,
|
|
upsert_dataframe,
|
|
)
|
|
from connectors.jira.transform import REMOTE_LINKS_SCHEMA
|
|
|
|
|
|
# Minimal schema compatible with ISSUES_SCHEMA for testing purposes
|
|
_SIMPLE_SCHEMA = {
|
|
"issue_key": "string",
|
|
"summary": "string",
|
|
}
|
|
|
|
|
|
@pytest.fixture
|
|
def parquet_dir(tmp_path):
|
|
d = tmp_path / "parquet_data"
|
|
d.mkdir()
|
|
return d
|
|
|
|
|
|
def _make_df(rows: list[dict]) -> pd.DataFrame:
|
|
return pd.DataFrame(rows)
|
|
|
|
|
|
class TestUpsertDataframe:
|
|
def test_insert_into_empty(self):
|
|
"""Upserting into None/empty creates a new DataFrame."""
|
|
new_records = [{"issue_key": "PROJ-1", "summary": "Bug A"}]
|
|
result = upsert_dataframe(None, new_records, "issue_key", "PROJ-1")
|
|
assert len(result) == 1
|
|
assert result.iloc[0]["issue_key"] == "PROJ-1"
|
|
|
|
def test_insert_new_issue(self):
|
|
"""Upserting a new issue_key adds a new row."""
|
|
existing = _make_df([{"issue_key": "PROJ-1", "summary": "Existing"}])
|
|
new_records = [{"issue_key": "PROJ-2", "summary": "New issue"}]
|
|
result = upsert_dataframe(existing, new_records, "issue_key", "PROJ-2")
|
|
assert len(result) == 2
|
|
keys = set(result["issue_key"].tolist())
|
|
assert keys == {"PROJ-1", "PROJ-2"}
|
|
|
|
def test_update_existing_issue(self):
|
|
"""Upserting an existing issue_key replaces the old row."""
|
|
existing = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "Old summary"},
|
|
{"issue_key": "PROJ-2", "summary": "Other issue"},
|
|
])
|
|
new_records = [{"issue_key": "PROJ-1", "summary": "Updated summary"}]
|
|
result = upsert_dataframe(existing, new_records, "issue_key", "PROJ-1")
|
|
assert len(result) == 2
|
|
proj1 = result[result["issue_key"] == "PROJ-1"]
|
|
assert proj1.iloc[0]["summary"] == "Updated summary"
|
|
|
|
def test_delete_issue(self):
|
|
"""Upserting with empty records removes the issue (deletion case)."""
|
|
existing = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "To be deleted"},
|
|
{"issue_key": "PROJ-2", "summary": "Keep this"},
|
|
])
|
|
result = upsert_dataframe(existing, [], "issue_key", "PROJ-1")
|
|
assert len(result) == 1
|
|
assert result.iloc[0]["issue_key"] == "PROJ-2"
|
|
|
|
def test_upsert_empty_existing_df(self):
|
|
"""Upserting into an empty (non-None) DataFrame works correctly."""
|
|
existing = pd.DataFrame(columns=["issue_key", "summary"])
|
|
new_records = [{"issue_key": "PROJ-5", "summary": "First issue"}]
|
|
result = upsert_dataframe(existing, new_records, "issue_key", "PROJ-5")
|
|
assert len(result) == 1
|
|
assert result.iloc[0]["issue_key"] == "PROJ-5"
|
|
|
|
def test_upsert_multiple_records_same_issue(self):
|
|
"""Multiple records for the same issue_key are all replaced."""
|
|
existing = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "Comment 1"},
|
|
{"issue_key": "PROJ-1", "summary": "Comment 2"},
|
|
{"issue_key": "PROJ-2", "summary": "Other"},
|
|
])
|
|
new_records = [{"issue_key": "PROJ-1", "summary": "Updated comment"}]
|
|
result = upsert_dataframe(existing, new_records, "issue_key", "PROJ-1")
|
|
proj1_rows = result[result["issue_key"] == "PROJ-1"]
|
|
assert len(proj1_rows) == 1 # Only the updated record
|
|
assert proj1_rows.iloc[0]["summary"] == "Updated comment"
|
|
|
|
|
|
class TestParquetMonthlyPartitioning:
|
|
def test_save_and_load_parquet(self, parquet_dir):
|
|
"""save_parquet_month writes and load_parquet_month reads correctly."""
|
|
df = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "Test issue"},
|
|
])
|
|
save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
|
|
loaded = load_parquet_month(parquet_dir, "2026-04")
|
|
assert loaded is not None
|
|
assert len(loaded) == 1
|
|
assert loaded.iloc[0]["issue_key"] == "PROJ-1"
|
|
|
|
def test_load_nonexistent_returns_none(self, parquet_dir):
|
|
"""load_parquet_month returns None if the file doesn't exist."""
|
|
result = load_parquet_month(parquet_dir, "2099-01")
|
|
assert result is None
|
|
|
|
def test_save_empty_df_removes_file(self, parquet_dir):
|
|
"""save_parquet_month with empty df removes existing parquet file."""
|
|
# First write a file
|
|
df = _make_df([{"issue_key": "PROJ-1", "summary": "Test"}])
|
|
save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-01")
|
|
assert (parquet_dir / "2026-01.parquet").exists()
|
|
|
|
# Save empty df — file should be removed
|
|
empty = pd.DataFrame()
|
|
save_parquet_month(empty, _SIMPLE_SCHEMA, parquet_dir, "2026-01")
|
|
assert not (parquet_dir / "2026-01.parquet").exists()
|
|
|
|
def test_separate_months_independent_files(self, parquet_dir):
|
|
"""Different month_keys write to separate parquet files."""
|
|
df_april = _make_df([{"issue_key": "PROJ-A", "summary": "April issue"}])
|
|
df_may = _make_df([{"issue_key": "PROJ-B", "summary": "May issue"}])
|
|
|
|
save_parquet_month(df_april, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
|
|
save_parquet_month(df_may, _SIMPLE_SCHEMA, parquet_dir, "2026-05")
|
|
|
|
assert (parquet_dir / "2026-04.parquet").exists()
|
|
assert (parquet_dir / "2026-05.parquet").exists()
|
|
|
|
april_loaded = load_parquet_month(parquet_dir, "2026-04")
|
|
may_loaded = load_parquet_month(parquet_dir, "2026-05")
|
|
|
|
assert april_loaded.iloc[0]["issue_key"] == "PROJ-A"
|
|
assert may_loaded.iloc[0]["issue_key"] == "PROJ-B"
|
|
|
|
def test_parquet_readable_by_duckdb(self, parquet_dir):
|
|
"""Parquet files written by save_parquet_month are readable by DuckDB."""
|
|
df = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "DuckDB readable"},
|
|
{"issue_key": "PROJ-2", "summary": "Also readable"},
|
|
])
|
|
save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
|
|
|
|
pq_file = str(parquet_dir / "2026-04.parquet")
|
|
conn = duckdb.connect()
|
|
rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_file}')").fetchone()
|
|
conn.close()
|
|
assert rows[0] == 2
|
|
|
|
def test_upsert_round_trip_with_real_parquet(self, parquet_dir):
|
|
"""Full upsert round trip: write, load, upsert, save, verify."""
|
|
# Initial write
|
|
initial = _make_df([
|
|
{"issue_key": "PROJ-1", "summary": "Original"},
|
|
{"issue_key": "PROJ-2", "summary": "Keep"},
|
|
])
|
|
save_parquet_month(initial, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
|
|
|
|
# Load existing
|
|
existing = load_parquet_month(parquet_dir, "2026-04")
|
|
|
|
# Upsert update for PROJ-1
|
|
updated = upsert_dataframe(
|
|
existing,
|
|
[{"issue_key": "PROJ-1", "summary": "Updated"}],
|
|
"issue_key",
|
|
"PROJ-1",
|
|
)
|
|
|
|
# Save back
|
|
save_parquet_month(updated, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
|
|
|
|
# Reload and verify
|
|
final = load_parquet_month(parquet_dir, "2026-04")
|
|
assert len(final) == 2
|
|
proj1 = final[final["issue_key"] == "PROJ-1"]
|
|
assert proj1.iloc[0]["summary"] == "Updated"
|
|
proj2 = final[final["issue_key"] == "PROJ-2"]
|
|
assert proj2.iloc[0]["summary"] == "Keep"
|
|
|
|
|
|
def _seed_remote_links_parquet(parquet_root, month_key, rows):
|
|
"""Write a starter remote_links parquet so we can assert preservation."""
|
|
df = pd.DataFrame(rows)
|
|
target = parquet_root / "remote_links"
|
|
target.mkdir(parents=True, exist_ok=True)
|
|
save_parquet_month(df, REMOTE_LINKS_SCHEMA, target, month_key)
|
|
|
|
|
|
def _write_raw_issue(raw_dir, issue_key, payload):
|
|
"""Seed raw issue JSON at the path transform_single_issue reads from."""
|
|
issues_dir = raw_dir / "issues"
|
|
issues_dir.mkdir(parents=True, exist_ok=True)
|
|
(issues_dir / f"{issue_key}.json").write_text(json.dumps(payload))
|
|
|
|
|
|
def test_incremental_preserves_remote_links_when_overlay_absent(tmp_path):
|
|
"""When the _remote_links key is absent from the raw JSON (the writer
|
|
skipped the overlay due to a Jira fetch failure), transform_single_issue
|
|
must NOT wipe existing parquet rows for that issue. Existing rows must
|
|
remain untouched and the function must report success."""
|
|
raw_dir = tmp_path / "raw"
|
|
output_dir = tmp_path / "parquet"
|
|
attachments_dir = tmp_path / "attachments"
|
|
output_dir.mkdir()
|
|
attachments_dir.mkdir()
|
|
|
|
# Pre-seed an existing remote-link row for PROJ-1 in month 2026-05.
|
|
_seed_remote_links_parquet(output_dir, "2026-05", [{
|
|
"issue_key": "PROJ-1",
|
|
"remote_link_id": "rl-existing",
|
|
"url": "https://example.com/old",
|
|
"title": "Pre-existing link",
|
|
"application_name": "X",
|
|
"application_type": "x",
|
|
}])
|
|
|
|
# Raw issue WITHOUT _remote_links key — overlay was skipped upstream.
|
|
_write_raw_issue(raw_dir, "PROJ-1", {
|
|
"key": "PROJ-1",
|
|
"id": "10001",
|
|
"fields": {
|
|
"summary": "test",
|
|
"status": {"name": "Open"},
|
|
"issuetype": {"name": "Bug"},
|
|
"attachment": [],
|
|
"comment": {"comments": []},
|
|
"created": "2026-05-15T00:00:00.000+0000",
|
|
"updated": "2026-05-15T00:00:00.000+0000",
|
|
},
|
|
# NOTE: no _remote_links key — that is the test condition.
|
|
})
|
|
|
|
ok = transform_single_issue(
|
|
issue_key="PROJ-1",
|
|
raw_dir=raw_dir,
|
|
output_dir=output_dir,
|
|
attachments_dir=attachments_dir,
|
|
)
|
|
assert ok is True
|
|
|
|
df = load_parquet_month(output_dir / "remote_links", "2026-05")
|
|
assert df is not None and len(df) == 1, \
|
|
"Existing remote-link row was wiped — overlay-absent signal not honored"
|
|
assert df.iloc[0]["remote_link_id"] == "rl-existing"
|
|
|
|
|
|
def test_incremental_wipes_remote_links_when_overlay_present_but_empty(tmp_path):
|
|
"""The mirror-image case: when _remote_links IS present but the list is
|
|
empty, that's a successful fetch confirming the issue legitimately has no
|
|
remote links right now. The transform MUST wipe any existing parquet rows
|
|
for that issue — keeping them would be stale data.
|
|
|
|
Together with test_incremental_preserves_remote_links_when_overlay_absent,
|
|
this locks the absent-vs-empty contract end-to-end. A future regression
|
|
that 'simplifies' the transform to treat [] the same as absent (i.e.,
|
|
skip upsert) would be caught here."""
|
|
raw_dir = tmp_path / "raw"
|
|
output_dir = tmp_path / "parquet"
|
|
attachments_dir = tmp_path / "attachments"
|
|
output_dir.mkdir()
|
|
attachments_dir.mkdir()
|
|
|
|
# Pre-seed a stale row that should be wiped.
|
|
_seed_remote_links_parquet(output_dir, "2026-05", [{
|
|
"issue_key": "PROJ-2",
|
|
"remote_link_id": "rl-stale",
|
|
"url": "https://example.com/stale",
|
|
"title": "Stale link to be wiped",
|
|
"application_name": "X",
|
|
"application_type": "x",
|
|
}])
|
|
|
|
# Raw issue WITH _remote_links: [] — fresh fetch confirmed empty.
|
|
_write_raw_issue(raw_dir, "PROJ-2", {
|
|
"key": "PROJ-2",
|
|
"id": "10002",
|
|
"fields": {
|
|
"summary": "test",
|
|
"status": {"name": "Open"},
|
|
"issuetype": {"name": "Bug"},
|
|
"attachment": [],
|
|
"comment": {"comments": []},
|
|
"created": "2026-05-15T00:00:00.000+0000",
|
|
"updated": "2026-05-15T00:00:00.000+0000",
|
|
},
|
|
"_remote_links": [],
|
|
})
|
|
|
|
ok = transform_single_issue(
|
|
issue_key="PROJ-2",
|
|
raw_dir=raw_dir,
|
|
output_dir=output_dir,
|
|
attachments_dir=attachments_dir,
|
|
)
|
|
assert ok is True
|
|
|
|
df = load_parquet_month(output_dir / "remote_links", "2026-05")
|
|
# Either the file was unlinked (df is None) or the row was removed.
|
|
# Both outcomes satisfy the contract — the stale row must not survive.
|
|
if df is not None:
|
|
remaining = df[df["issue_key"] == "PROJ-2"]
|
|
assert len(remaining) == 0, \
|
|
"Stale remote-link row survived a successful empty-list fetch — " \
|
|
"the empty-list (legitimate) signal was misinterpreted as preserve"
|