agnes-the-ai-analyst/tests/test_tagger.py

"""Tests for services/corporate_memory/tagger.py — auto topic tagging."""

import pytest
from services.corporate_memory.tagger import (
    TOPIC_VOCABULARY,
    auto_tag_items,
    _build_prompt,
)
from connectors.llm.exceptions import LLMError


class _FakeExtractor:
    """Controllable fake extractor for testing."""

    def __init__(self, response=None, raises=None):
        self._response = response
        self._raises = raises
        self.calls: list[dict] = []

    def extract_json(self, prompt, max_tokens, json_schema, schema_name):
        self.calls.append({"prompt": prompt, "max_tokens": max_tokens})
        if self._raises:
            raise self._raises
        return self._response or {}


class TestTopicVocabulary:
    def test_vocabulary_is_non_empty(self):
        assert len(TOPIC_VOCABULARY) >= 5

    def test_vocabulary_contains_expected_topics(self):
        for topic in ("data", "automation", "reports", "metrics"):
            assert topic in TOPIC_VOCABULARY, f"'{topic}' should be in TOPIC_VOCABULARY"

    def test_vocabulary_has_no_duplicates(self):
        assert len(TOPIC_VOCABULARY) == len(set(TOPIC_VOCABULARY))


class TestBuildPrompt:
    def test_includes_all_vocab_terms(self):
        items = [{"id": "a", "title": "T", "content": "C"}]
        prompt = _build_prompt(items, TOPIC_VOCABULARY)
        for term in TOPIC_VOCABULARY:
            assert term in prompt

    def test_includes_item_id(self):
        items = [{"id": "km_abc123", "title": "My title", "content": "My content"}]
        prompt = _build_prompt(items, TOPIC_VOCABULARY)
        assert "km_abc123" in prompt

    def test_truncates_long_content(self):
        long_content = "x" * 500
        items = [{"id": "a", "title": "T", "content": long_content}]
        prompt = _build_prompt(items, TOPIC_VOCABULARY)
        # Only first 200 chars of content should appear
        assert "x" * 200 in prompt
        assert "x" * 300 not in prompt


class TestAutoTagItems:
    def test_returns_empty_for_empty_input(self):
        extractor = _FakeExtractor()
        result = auto_tag_items([], extractor)
        assert result == {}
        assert extractor.calls == []

    def test_parses_valid_response(self):
        extractor = _FakeExtractor(response={
            "assignments": [
                {"id": "item1", "topics": ["data", "queries"]},
                {"id": "item2", "topics": ["automation"]},
            ]
        })
        items = [
            {"id": "item1", "title": "SQL tips", "content": "use indexes"},
            {"id": "item2", "title": "CI setup", "content": "automate builds"},
        ]
        result = auto_tag_items(items, extractor)
        assert result["item1"] == ["data", "queries"]
        assert result["item2"] == ["automation"]

    def test_filters_out_vocabulary_hallucinations(self):
        """Topics not in TOPIC_VOCABULARY must be dropped."""
        extractor = _FakeExtractor(response={
            "assignments": [
                {"id": "x", "topics": ["data", "MADE_UP_TOPIC", "queries"]},
            ]
        })
        result = auto_tag_items([{"id": "x", "title": "T", "content": "C"}], extractor)
        assert "MADE_UP_TOPIC" not in result.get("x", [])
        assert "data" in result.get("x", [])

    def test_skips_entries_without_id(self):
        extractor = _FakeExtractor(response={
            "assignments": [
                {"id": "", "topics": ["data"]},
                {"id": "good", "topics": ["reports"]},
            ]
        })
        result = auto_tag_items([{"id": "good", "title": "T", "content": "C"}], extractor)
        assert "" not in result
        assert result.get("good") == ["reports"]

    def test_returns_empty_on_llm_error(self):
        extractor = _FakeExtractor(raises=LLMError("boom"))
        result = auto_tag_items([{"id": "a", "title": "T", "content": "C"}], extractor)
        assert result == {}

    def test_returns_empty_on_unexpected_exception(self):
        extractor = _FakeExtractor(raises=RuntimeError("network down"))
        result = auto_tag_items([{"id": "a", "title": "T", "content": "C"}], extractor)
        assert result == {}

    def test_returns_empty_when_assignments_missing(self):
        extractor = _FakeExtractor(response={})
        result = auto_tag_items([{"id": "a", "title": "T", "content": "C"}], extractor)
        assert result == {}

    def test_handles_empty_topics_list(self):
        extractor = _FakeExtractor(response={
            "assignments": [{"id": "a", "topics": []}]
        })
        result = auto_tag_items([{"id": "a", "title": "T", "content": "C"}], extractor)
        assert result.get("a") == []