agnes-the-ai-analyst/services/verification_detector/duplicates.py

"""Duplicate-candidate detection hook for the verification detector.

Issue #62 — when a new knowledge item lands via the verification detector
pipeline, look for already-stored items in the same ``domain`` whose
``entities`` set overlaps significantly. A heuristic-only detector — no LLM
call — so it stays cheap to run inline after every item create.

Heuristic, per design decisions in issue #62:
  - Both items must share the same ``domain`` (NULL domain → no candidates).
  - Entity overlap >= ``MIN_ENTITY_OVERLAP`` (default 2). Below this the
    signal is dominated by generic terms and noise.
  - Similarity score = Jaccard ratio = |A ∩ B| / |A ∪ B| over the two
    entity sets. Persisted on the relation row for downstream sorting.

Personal items are excluded by the repository helper unconditionally — even
though the detector path itself only writes non-personal items today, the
``find_*`` helper enforces the privacy boundary so future callers can't
accidentally bypass it.
"""

import logging
from typing import Optional

from src.repositories.knowledge import KnowledgeRepository

# Module logger; receives a warning when a relation write fails.
logger = logging.getLogger(__name__)

# Minimum number of shared entities for a duplicate-candidate hint.
# 2 is the lowest threshold where signal-to-noise stays acceptable on the
# 2-4-entity outputs the verification detector typically produces.
MIN_ENTITY_OVERLAP = 2

# ``relation_type`` value stamped on every relation this hook creates.
RELATION_TYPE = "likely_duplicate"


def _record_duplicate_candidates(
    repo: KnowledgeRepository,
    new_item: dict,
) -> int:
    """Link ``new_item`` to stored items that look like duplicates of it.

    Asks ``repo`` for candidates matching the item's ``domain`` and
    ``entities`` (with ``MIN_ENTITY_OVERLAP`` as the minimum overlap), then
    writes one ``RELATION_TYPE`` relation per candidate that has an ``id``,
    passing the candidate's ``jaccard`` value along as the score.

    Args:
        repo: Repository used for the candidate lookup and relation writes.
        new_item: The freshly created item. Its ``id``, ``entities`` and
            ``domain`` keys are read; ``entities`` may be a list or a
            JSON-encoded string of one.

    Returns:
        The number of relations written. ``0`` — without any lookup — when
        the item has no ``id``, no non-empty entity list, or no ``domain``.
        A relation write that raises is logged as a warning and not counted;
        the remaining candidates are still processed.
    """
    source_id: Optional[str] = new_item.get("id")
    if not source_id:
        return 0

    entity_list = new_item.get("entities")
    if isinstance(entity_list, str):
        # ``entities`` can arrive JSON-encoded after a repo round-trip, so
        # both the decoded list and the raw string are accepted here.
        import json

        try:
            entity_list = json.loads(entity_list)
        except json.JSONDecodeError:
            # Unparseable payload: handled exactly like missing entities.
            entity_list = None

    # Only a non-empty list can take part in the overlap heuristic.
    if not (entity_list and isinstance(entity_list, list)):
        return 0

    item_domain = new_item.get("domain")
    if not item_domain:
        return 0

    matches = repo.find_duplicate_candidates_by_entities(
        new_item_id=source_id,
        entities=entity_list,
        domain=item_domain,
        min_overlap=MIN_ENTITY_OVERLAP,
    )

    written = 0
    for match in matches:
        other_id = match.get("id")
        if other_id:
            try:
                repo.create_relation(
                    item_a_id=source_id,
                    item_b_id=other_id,
                    relation_type=RELATION_TYPE,
                    score=match.get("jaccard"),
                )
            except Exception as exc:  # pragma: no cover - defensive, ON CONFLICT swallows dups
                # Best effort: one failed write must not stop the others.
                logger.warning(
                    "Failed to record duplicate-candidate relation %s <-> %s: %s",
                    source_id, other_id, exc,
                )
            else:
                written += 1
    return written