# agnes-the-ai-analyst/tests/test_jira_incremental.py

"""Tests for incremental Jira parquet transform (upsert_dataframe and friends)."""

import json
from pathlib import Path
from unittest.mock import patch

import duckdb
import pandas as pd
import pytest

from connectors.jira.incremental_transform import (
    load_parquet_month,
    save_parquet_month,
    transform_single_issue,
    upsert_dataframe,
)
from connectors.jira.transform import REMOTE_LINKS_SCHEMA


# Minimal schema compatible with ISSUES_SCHEMA for testing purposes
_SIMPLE_SCHEMA = {
    "issue_key": "string",
    "summary": "string",
}


@pytest.fixture
def parquet_dir(tmp_path):
    """Return a newly created ``parquet_data`` directory inside ``tmp_path``."""
    target = tmp_path.joinpath("parquet_data")
    target.mkdir()
    return target


def _make_df(rows: list[dict]) -> pd.DataFrame:
    return pd.DataFrame(rows)


class TestUpsertDataframe:
    """Checks for ``upsert_dataframe`` operating on in-memory DataFrames."""

    def test_insert_into_empty(self):
        """With no existing frame (None), the result holds just the new record."""
        incoming = [{"issue_key": "PROJ-1", "summary": "Bug A"}]
        merged = upsert_dataframe(None, incoming, "issue_key", "PROJ-1")
        assert len(merged) == 1
        first_row = merged.iloc[0]
        assert first_row["issue_key"] == "PROJ-1"

    def test_insert_new_issue(self):
        """A previously unseen issue_key is appended next to the existing row."""
        current = _make_df([{"issue_key": "PROJ-1", "summary": "Existing"}])
        incoming = [{"issue_key": "PROJ-2", "summary": "New issue"}]
        merged = upsert_dataframe(current, incoming, "issue_key", "PROJ-2")
        assert len(merged) == 2
        assert set(merged["issue_key"].tolist()) == {"PROJ-1", "PROJ-2"}

    def test_update_existing_issue(self):
        """An issue_key that already exists has its row replaced, not duplicated."""
        current = _make_df([
            {"issue_key": "PROJ-1", "summary": "Old summary"},
            {"issue_key": "PROJ-2", "summary": "Other issue"},
        ])
        incoming = [{"issue_key": "PROJ-1", "summary": "Updated summary"}]
        merged = upsert_dataframe(current, incoming, "issue_key", "PROJ-1")
        assert len(merged) == 2
        is_proj1 = merged["issue_key"] == "PROJ-1"
        assert merged[is_proj1].iloc[0]["summary"] == "Updated summary"

    def test_delete_issue(self):
        """An empty record list for a key removes that key's row (deletion)."""
        current = _make_df([
            {"issue_key": "PROJ-1", "summary": "To be deleted"},
            {"issue_key": "PROJ-2", "summary": "Keep this"},
        ])
        merged = upsert_dataframe(current, [], "issue_key", "PROJ-1")
        assert len(merged) == 1
        survivor = merged.iloc[0]
        assert survivor["issue_key"] == "PROJ-2"

    def test_upsert_empty_existing_df(self):
        """An empty but non-None DataFrame is accepted as the existing frame."""
        current = pd.DataFrame(columns=["issue_key", "summary"])
        incoming = [{"issue_key": "PROJ-5", "summary": "First issue"}]
        merged = upsert_dataframe(current, incoming, "issue_key", "PROJ-5")
        assert len(merged) == 1
        first_row = merged.iloc[0]
        assert first_row["issue_key"] == "PROJ-5"

    def test_upsert_multiple_records_same_issue(self):
        """Every pre-existing row sharing the issue_key is dropped on upsert."""
        current = _make_df([
            {"issue_key": "PROJ-1", "summary": "Comment 1"},
            {"issue_key": "PROJ-1", "summary": "Comment 2"},
            {"issue_key": "PROJ-2", "summary": "Other"},
        ])
        incoming = [{"issue_key": "PROJ-1", "summary": "Updated comment"}]
        merged = upsert_dataframe(current, incoming, "issue_key", "PROJ-1")
        is_proj1 = merged["issue_key"] == "PROJ-1"
        proj1_only = merged[is_proj1]
        # Both old PROJ-1 rows must be gone; only the fresh record remains.
        assert len(proj1_only) == 1
        assert proj1_only.iloc[0]["summary"] == "Updated comment"


class TestParquetMonthlyPartitioning:
    """Round-trip tests for ``save_parquet_month`` / ``load_parquet_month``.

    Each test writes to the per-test ``parquet_dir`` fixture, so the files
    created by one test are never visible to another.
    """

    def test_save_and_load_parquet(self, parquet_dir):
        """save_parquet_month writes and load_parquet_month reads correctly."""
        df = _make_df([
            {"issue_key": "PROJ-1", "summary": "Test issue"},
        ])
        save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
        loaded = load_parquet_month(parquet_dir, "2026-04")
        assert loaded is not None
        assert len(loaded) == 1
        assert loaded.iloc[0]["issue_key"] == "PROJ-1"

    def test_load_nonexistent_returns_none(self, parquet_dir):
        """load_parquet_month returns None if the file doesn't exist."""
        result = load_parquet_month(parquet_dir, "2099-01")
        assert result is None

    def test_save_empty_df_removes_file(self, parquet_dir):
        """save_parquet_month with empty df removes existing parquet file."""
        month_file = parquet_dir / "2026-01.parquet"

        # First write a file so there is something to remove.
        df = _make_df([{"issue_key": "PROJ-1", "summary": "Test"}])
        save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-01")
        assert month_file.exists()

        # Save empty df — file should be removed
        empty = pd.DataFrame()
        save_parquet_month(empty, _SIMPLE_SCHEMA, parquet_dir, "2026-01")
        assert not month_file.exists()

    def test_separate_months_independent_files(self, parquet_dir):
        """Different month_keys write to separate parquet files."""
        df_april = _make_df([{"issue_key": "PROJ-A", "summary": "April issue"}])
        df_may = _make_df([{"issue_key": "PROJ-B", "summary": "May issue"}])

        save_parquet_month(df_april, _SIMPLE_SCHEMA, parquet_dir, "2026-04")
        save_parquet_month(df_may, _SIMPLE_SCHEMA, parquet_dir, "2026-05")

        assert (parquet_dir / "2026-04.parquet").exists()
        assert (parquet_dir / "2026-05.parquet").exists()

        april_loaded = load_parquet_month(parquet_dir, "2026-04")
        may_loaded = load_parquet_month(parquet_dir, "2026-05")

        # Fail with a clear assertion rather than an error on ``None.iloc``
        # if either partition unexpectedly cannot be loaded.
        assert april_loaded is not None
        assert may_loaded is not None
        assert april_loaded.iloc[0]["issue_key"] == "PROJ-A"
        assert may_loaded.iloc[0]["issue_key"] == "PROJ-B"

    def test_parquet_readable_by_duckdb(self, parquet_dir):
        """Parquet files written by save_parquet_month are readable by DuckDB."""
        df = _make_df([
            {"issue_key": "PROJ-1", "summary": "DuckDB readable"},
            {"issue_key": "PROJ-2", "summary": "Also readable"},
        ])
        save_parquet_month(df, _SIMPLE_SCHEMA, parquet_dir, "2026-04")

        pq_file = str(parquet_dir / "2026-04.parquet")
        # The context manager closes the connection even when the query
        # raises, and binding the path as a parameter keeps the SQL valid
        # for temp paths that contain quote characters.
        with duckdb.connect() as conn:
            row = conn.execute(
                "SELECT count(*) FROM read_parquet(?)", [pq_file]
            ).fetchone()
        assert row is not None
        assert row[0] == 2

    def test_upsert_round_trip_with_real_parquet(self, parquet_dir):
        """Full upsert round trip: write, load, upsert, save, verify."""
        # Initial write
        initial = _make_df([
            {"issue_key": "PROJ-1", "summary": "Original"},
            {"issue_key": "PROJ-2", "summary": "Keep"},
        ])
        save_parquet_month(initial, _SIMPLE_SCHEMA, parquet_dir, "2026-04")

        # Load existing
        existing = load_parquet_month(parquet_dir, "2026-04")
        assert existing is not None

        # Upsert update for PROJ-1
        updated = upsert_dataframe(
            existing,
            [{"issue_key": "PROJ-1", "summary": "Updated"}],
            "issue_key",
            "PROJ-1",
        )

        # Save back
        save_parquet_month(updated, _SIMPLE_SCHEMA, parquet_dir, "2026-04")

        # Reload and verify: PROJ-1 changed, PROJ-2 untouched.
        final = load_parquet_month(parquet_dir, "2026-04")
        assert final is not None
        assert len(final) == 2
        proj1 = final[final["issue_key"] == "PROJ-1"]
        assert proj1.iloc[0]["summary"] == "Updated"
        proj2 = final[final["issue_key"] == "PROJ-2"]
        assert proj2.iloc[0]["summary"] == "Keep"


def _seed_remote_links_parquet(parquet_root, month_key, rows):
    """Pre-populate ``<parquet_root>/remote_links`` with *rows* for *month_key*.

    Creates the ``remote_links`` directory when missing and writes the rows
    through ``save_parquet_month`` with ``REMOTE_LINKS_SCHEMA``, giving a test
    an existing partition whose survival (or removal) it can assert on.
    """
    seed_frame = pd.DataFrame(rows)
    links_dir = parquet_root.joinpath("remote_links")
    links_dir.mkdir(parents=True, exist_ok=True)
    save_parquet_month(seed_frame, REMOTE_LINKS_SCHEMA, links_dir, month_key)


def _write_raw_issue(raw_dir, issue_key, payload):
    """Seed raw issue JSON at the path transform_single_issue reads from."""
    issues_dir = raw_dir / "issues"
    issues_dir.mkdir(parents=True, exist_ok=True)
    (issues_dir / f"{issue_key}.json").write_text(json.dumps(payload))


def test_incremental_preserves_remote_links_when_overlay_absent(tmp_path):
    """A raw issue lacking the ``_remote_links`` key must leave parquet alone.

    A missing key means the writer skipped the overlay (the Jira fetch
    failed), so transform_single_issue must NOT treat it as "no links":
    rows already stored for the issue stay untouched and the call still
    reports success.
    """
    raw_dir, output_dir, attachments_dir = (
        tmp_path / name for name in ("raw", "parquet", "attachments")
    )
    output_dir.mkdir()
    attachments_dir.mkdir()

    # An existing remote-link row for PROJ-1, stored in month 2026-05.
    existing_link = {
        "issue_key": "PROJ-1",
        "remote_link_id": "rl-existing",
        "url": "https://example.com/old",
        "title": "Pre-existing link",
        "application_name": "X",
        "application_type": "x",
    }
    _seed_remote_links_parquet(output_dir, "2026-05", [existing_link])

    # The raw issue deliberately has NO _remote_links key — that absence is
    # the condition under test (overlay skipped upstream).
    issue_fields = {
        "summary": "test",
        "status": {"name": "Open"},
        "issuetype": {"name": "Bug"},
        "attachment": [],
        "comment": {"comments": []},
        "created": "2026-05-15T00:00:00.000+0000",
        "updated": "2026-05-15T00:00:00.000+0000",
    }
    raw_issue = {"key": "PROJ-1", "id": "10001", "fields": issue_fields}
    _write_raw_issue(raw_dir, "PROJ-1", raw_issue)

    succeeded = transform_single_issue(
        issue_key="PROJ-1",
        raw_dir=raw_dir,
        output_dir=output_dir,
        attachments_dir=attachments_dir,
    )
    assert succeeded is True

    links = load_parquet_month(output_dir / "remote_links", "2026-05")
    assert links is not None and len(links) == 1, \
        "Existing remote-link row was wiped — overlay-absent signal not honored"
    surviving_row = links.iloc[0]
    assert surviving_row["remote_link_id"] == "rl-existing"


def test_incremental_wipes_remote_links_when_overlay_present_but_empty(tmp_path):
    """A present-but-empty ``_remote_links`` list must clear stored rows.

    This mirrors test_incremental_preserves_remote_links_when_overlay_absent:
    an empty list is a successful fetch saying the issue currently has no
    remote links, so any parquet rows kept for it would be stale and MUST be
    removed.

    The two tests together pin the absent-vs-empty contract end-to-end; a
    later "simplification" that handles [] like an absent key (i.e. skips the
    upsert) fails here.
    """
    raw_dir, output_dir, attachments_dir = (
        tmp_path / name for name in ("raw", "parquet", "attachments")
    )
    output_dir.mkdir()
    attachments_dir.mkdir()

    # A stale row that the transform is expected to remove.
    stale_link = {
        "issue_key": "PROJ-2",
        "remote_link_id": "rl-stale",
        "url": "https://example.com/stale",
        "title": "Stale link to be wiped",
        "application_name": "X",
        "application_type": "x",
    }
    _seed_remote_links_parquet(output_dir, "2026-05", [stale_link])

    # Raw issue carrying _remote_links: [] — a fresh fetch confirmed "none".
    issue_fields = {
        "summary": "test",
        "status": {"name": "Open"},
        "issuetype": {"name": "Bug"},
        "attachment": [],
        "comment": {"comments": []},
        "created": "2026-05-15T00:00:00.000+0000",
        "updated": "2026-05-15T00:00:00.000+0000",
    }
    raw_issue = {
        "key": "PROJ-2",
        "id": "10002",
        "fields": issue_fields,
        "_remote_links": [],
    }
    _write_raw_issue(raw_dir, "PROJ-2", raw_issue)

    succeeded = transform_single_issue(
        issue_key="PROJ-2",
        raw_dir=raw_dir,
        output_dir=output_dir,
        attachments_dir=attachments_dir,
    )
    assert succeeded is True

    links = load_parquet_month(output_dir / "remote_links", "2026-05")
    # Two outcomes honor the contract: the month file was unlinked (None),
    # or the file remains without any PROJ-2 row. Only survival is a failure.
    if links is None:
        return
    leftovers = links[links["issue_key"] == "PROJ-2"]
    assert len(leftovers) == 0, \
        "Stale remote-link row survived a successful empty-list fetch — " \
        "the empty-list (legitimate) signal was misinterpreted as preserve"