agnes-the-ai-analyst/tests/test_session_collector.py

"""Tests for session_collector.collector."""

from pathlib import Path

import pytest

from services.session_collector.collector import copy_session_file, find_session_files


class TestCopySessionFile:
    def test_skips_if_target_exists(self, tmp_path):
        """Returns False and does not overwrite if target already exists."""
        source = tmp_path / "session.jsonl"
        source.write_text('{"event": "start"}')
        target = tmp_path / "dest" / "session.jsonl"
        target.parent.mkdir(parents=True)
        target.write_text("existing content")

        result = copy_session_file(source, target)
        assert result is False
        # Target content should not be overwritten
        assert target.read_text() == "existing content"

    def test_copies_new_file(self, tmp_path):
        """Returns True and creates the target when it does not exist."""
        source = tmp_path / "session.jsonl"
        source.write_text('{"event": "start"}')
        target = tmp_path / "dest" / "session.jsonl"

        result = copy_session_file(source, target)
        assert result is True
        assert target.exists()
        assert target.read_text() == '{"event": "start"}'

    def test_dry_run_returns_true_without_copying(self, tmp_path):
        """In dry_run mode, returns True but does not create the file."""
        source = tmp_path / "session.jsonl"
        source.write_text('{"event": "start"}')
        target = tmp_path / "dest" / "session.jsonl"

        result = copy_session_file(source, target, dry_run=True)
        assert result is True
        assert not target.exists()

    def test_creates_parent_directory(self, tmp_path):
        """Parent directories are created automatically."""
        source = tmp_path / "session.jsonl"
        source.write_text("data")
        target = tmp_path / "a" / "b" / "c" / "session.jsonl"

        copy_session_file(source, target)
        assert target.exists()

    def test_dry_run_skips_existing_target(self, tmp_path):
        """dry_run still returns False if target already exists."""
        source = tmp_path / "session.jsonl"
        source.write_text("data")
        target = tmp_path / "session.jsonl"
        target.write_text("old")

        result = copy_session_file(source, target, dry_run=True)
        assert result is False


class TestFindSessionFiles:
    def test_finds_jsonl_files(self, tmp_path):
        """find_session_files yields .jsonl files from user/sessions/."""
        user_home = tmp_path / "alice"
        sessions_dir = user_home / "user" / "sessions"
        sessions_dir.mkdir(parents=True)
        f1 = sessions_dir / "session1.jsonl"
        f2 = sessions_dir / "session2.jsonl"
        f1.write_text("{}")
        f2.write_text("{}")

        found = list(find_session_files(user_home))
        assert len(found) == 2
        assert all(f.suffix == ".jsonl" for f in found)

    def test_ignores_non_jsonl_files(self, tmp_path):
        """Non-.jsonl files are not returned."""
        user_home = tmp_path / "bob"
        sessions_dir = user_home / "user" / "sessions"
        sessions_dir.mkdir(parents=True)
        (sessions_dir / "notes.txt").write_text("ignore me")
        (sessions_dir / "session.jsonl").write_text("{}")

        found = list(find_session_files(user_home))
        assert len(found) == 1
        assert found[0].name == "session.jsonl"

    def test_returns_empty_when_no_sessions_dir(self, tmp_path):
        """Returns empty iterator when user/sessions/ doesn't exist."""
        user_home = tmp_path / "carol"
        user_home.mkdir()

        found = list(find_session_files(user_home))
        assert found == []


class TestRunHelper:
    """Argv-free run() entry point — regression for #179 review (SystemExit bug)."""

    def test_run_does_not_call_argparse(self, monkeypatch, tmp_path):
        """run() must not parse sys.argv — uvicorn's argv would SystemExit(2) the worker.

        Regression: app/api/admin.py:run_session_collector previously called
        collector.main() which did argparse.parse_args() on uvicorn's argv.
        """
        from services.session_collector import collector

        monkeypatch.setattr(
            "sys.argv",
            ["app.main:app", "--host", "0.0.0.0", "--port", "8000",
             "--proxy-headers", "--forwarded-allow-ips=*"],
        )
        monkeypatch.setattr(collector, "TARGET_BASE", tmp_path / "user_sessions")
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc, stats = collector.run(dry_run=True, verbose=False)
        assert rc == 0
        assert stats == {"users_processed": 0, "files_copied": 0, "files_skipped": 0}

    def test_run_returns_stats_tuple(self, monkeypatch, tmp_path):
        """run() returns (exit_code, stats_dict) so the admin endpoint can audit."""
        from services.session_collector import collector

        monkeypatch.setattr(collector, "TARGET_BASE", tmp_path / "user_sessions")
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc, stats = collector.run()
        assert rc == 0
        assert set(stats.keys()) == {"users_processed", "files_copied", "files_skipped"}

    def test_main_still_delegates_to_run(self, monkeypatch, tmp_path):
        """The CLI main() must continue to work — argparse + delegate."""
        from services.session_collector import collector

        monkeypatch.setattr("sys.argv", ["session_collector", "--dry-run"])
        monkeypatch.setattr(collector, "TARGET_BASE", tmp_path / "user_sessions")
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc = collector.main()
        assert rc == 0


class TestRunSkipEnvVar:
    """AGNES_SKIP_LEGACY_COLLECTOR=1 short-circuits the run before any FS or
    grp lookups. Used in the Docker layout where /home/*/user/sessions/ is
    empty by design — keeps logs quiet without auto-detect logic that would
    mask real bare-VM mis-deploys.
    """

    def test_collector_run_skips_when_env_set(self, monkeypatch, tmp_path):
        """AGNES_SKIP_LEGACY_COLLECTOR=1 → return early with skipped=True."""
        from services.session_collector import collector

        monkeypatch.setenv("AGNES_SKIP_LEGACY_COLLECTOR", "1")
        # Point TARGET_BASE at tmp_path so even if the skip didn't fire we
        # wouldn't touch /data — but the assertion below is that mkdir
        # was NOT called on it.
        target = tmp_path / "user_sessions"
        monkeypatch.setattr(collector, "TARGET_BASE", target)

        # If the skip didn't fire, find_user_home_dirs would be called.
        called = []

        def _spy():
            called.append(True)
            return iter([])

        monkeypatch.setattr(collector, "find_user_home_dirs", _spy)

        rc, stats = collector.run()
        assert rc == 0
        assert stats.get("skipped") is True
        assert stats["files_copied"] == 0
        assert stats["users_processed"] == 0
        assert stats["files_skipped"] == 0
        # Skip path must NOT touch the target directory or call into the
        # /home scanner — those are exactly the operations we're avoiding.
        assert not target.exists(), "TARGET_BASE.mkdir should not have run"
        assert called == [], "find_user_home_dirs should not have been called"

    @pytest.mark.parametrize("val", ["1", "true", "TRUE"])
    def test_collector_run_skips_for_truthy_values(self, monkeypatch, tmp_path, val):
        """The accepted truthy spellings are 1 / true / TRUE. Anything else
        (including '0', 'false', 'yes') falls through to the normal pass."""
        from services.session_collector import collector

        monkeypatch.setenv("AGNES_SKIP_LEGACY_COLLECTOR", val)
        monkeypatch.setattr(collector, "TARGET_BASE", tmp_path / "user_sessions")
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc, stats = collector.run()
        assert rc == 0
        assert stats.get("skipped") is True

    def test_collector_run_full_pass_when_env_unset(self, monkeypatch, tmp_path):
        """No env var → existing scan path runs (returns stats without 'skipped')."""
        from services.session_collector import collector

        monkeypatch.delenv("AGNES_SKIP_LEGACY_COLLECTOR", raising=False)
        target = tmp_path / "user_sessions"
        monkeypatch.setattr(collector, "TARGET_BASE", target)
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc, stats = collector.run()
        assert rc == 0
        # Bare-VM path: we ran, even if no users were scanned.
        assert "skipped" not in stats
        # mkdir should have happened.
        assert target.exists()

    def test_collector_run_full_pass_for_falsy_values(self, monkeypatch, tmp_path):
        """AGNES_SKIP_LEGACY_COLLECTOR='0' should NOT skip — only the explicit
        truthy spellings short-circuit."""
        from services.session_collector import collector

        monkeypatch.setenv("AGNES_SKIP_LEGACY_COLLECTOR", "0")
        monkeypatch.setattr(collector, "TARGET_BASE", tmp_path / "user_sessions")
        monkeypatch.setattr(collector, "find_user_home_dirs", lambda: iter([]))

        rc, stats = collector.run()
        assert rc == 0
        assert "skipped" not in stats