# Issue #62. Tree view with cross-axis filtering, duplicate-candidate hints
# (Jaccard score on entity overlap), bulk-edit endpoints
# (PATCH /api/memory/admin/{id} + POST /api/memory/admin/bulk-update),
# schema v17 (knowledge_item_relations), full CLI parity
# (da admin memory tree/edit/bulk-edit/duplicates list/resolve).
#
# File info: 91 lines, 3.1 KiB, Python.
"""Duplicate-candidate detection hook for the verification detector.
|
||
|
||
Issue #62 — when a new knowledge item lands via the verification detector
|
||
pipeline, look for already-stored items in the same ``domain`` whose
|
||
``entities`` set overlaps significantly. A heuristic-only detector — no LLM
|
||
call — so it stays cheap to run inline after every item create.
|
||
|
||
Heuristic, per design decisions in issue #62:
|
||
- Both items must share the same ``domain`` (NULL domain → no candidates).
|
||
- Entity overlap >= ``MIN_ENTITY_OVERLAP`` (default 2). Below this the
|
||
signal is dominated by generic terms and noise.
|
||
- Similarity score = Jaccard ratio = |A ∩ B| / |A ∪ B| over the two
|
||
entity sets. Persisted on the relation row for downstream sorting.
|
||
|
||
Personal items are excluded by the repository helper unconditionally — even
|
||
though the detector path itself only writes non-personal items today, the
|
||
``find_*`` helper enforces the privacy boundary so future callers can't
|
||
accidentally bypass it.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Optional
|
||
|
||
from src.repositories.knowledge import KnowledgeRepository
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Minimum number of shared entities for a duplicate-candidate hint.
# 2 is the lowest threshold where signal-to-noise stays acceptable on the
# 2-4-entity outputs the verification detector typically produces.
# Passed as ``min_overlap`` to the repository's candidate lookup.
MIN_ENTITY_OVERLAP = 2

# ``relation_type`` value written on every relation recorded by
# ``_record_duplicate_candidates``.
RELATION_TYPE = "likely_duplicate"
|
||
|
||
|
||
def _record_duplicate_candidates(
    repo: KnowledgeRepository,
    new_item: dict,
) -> int:
    """Link ``new_item`` to stored items that look like duplicates of it.

    Asks ``repo.find_duplicate_candidates_by_entities`` for candidates in the
    item's ``domain`` (passing ``MIN_ENTITY_OVERLAP`` as the minimum overlap),
    then writes one ``RELATION_TYPE`` relation per candidate through
    ``repo.create_relation``, forwarding the candidate's ``"jaccard"`` value
    as the relation score.

    Returns the number of ``create_relation`` calls that completed without
    raising. Returns 0 without querying the repository when ``new_item`` has
    no ``id``, no usable ``entities`` list, or no ``domain`` — such items
    can't take part in the entity-overlap heuristic.
    """
    source_id: Optional[str] = new_item.get("id")
    if not source_id:
        return 0

    entity_list = new_item.get("entities")
    if isinstance(entity_list, str):
        # ``entities`` may arrive JSON-encoded (the repo round-trips it as
        # JSON) or already decoded; accept both shapes.
        import json

        try:
            entity_list = json.loads(entity_list)
        except json.JSONDecodeError:
            # Undecodable payload: treat the item as having no entities.
            entity_list = None

    # Only a non-empty list can feed the overlap heuristic.
    if not (entity_list and isinstance(entity_list, list)):
        return 0

    item_domain = new_item.get("domain")
    if not item_domain:
        return 0

    matches = repo.find_duplicate_candidates_by_entities(
        new_item_id=source_id,
        entities=entity_list,
        domain=item_domain,
        min_overlap=MIN_ENTITY_OVERLAP,
    )

    created = 0
    for match in matches:
        other_id = match.get("id")
        if not other_id:
            # A candidate without an id cannot be linked; skip it.
            continue
        try:
            repo.create_relation(
                item_a_id=source_id,
                item_b_id=other_id,
                relation_type=RELATION_TYPE,
                score=match.get("jaccard"),
            )
        except Exception as exc:  # pragma: no cover - defensive, ON CONFLICT swallows dups
            # Best-effort: one failed relation must not abort the rest.
            logger.warning(
                "Failed to record duplicate-candidate relation %s <-> %s: %s",
                source_id, other_id, exc,
            )
        else:
            created += 1
    return created
|