* feat(memory): DuckDB FTS BM25 search for knowledge items (#121) Replaces `title ILIKE '%q%' OR content ILIKE '%q%'` ranked by insertion order with BM25 relevance ranking via the DuckDB `fts` extension. Czech queries like `cesky` match documents containing `česky` (`strip_accents=1` + `lower=1`). Architecture: - src/fts.py — ensure_fts_loaded / ensure_knowledge_fts_index helpers. The extension is per-connection (INSTALL persisted at engine level, LOAD per-conn). Both helpers are idempotent and soft-fail on unavailability with a logged WARNING. - Schema v47 (_v46_to_v47) — builds the initial BM25 index over knowledge_items(title, content) keyed by id. Migration is best-effort against ANY exception (not just duckdb.Error) so the schema bump cannot get stuck on v46 if a non-DuckDB error escapes the helper. - KnowledgeRepository.search — FTS-or-ILIKE dichotomy with execute- time fallback. Same filter surface (statuses / category / domain / source_type / personal / audience / dismissed) either way. ensure_fts_loaded() returning True only guarantees the extension is loadable, NOT that the index exists — migration soft-fail or a concurrent overwrite=1 rebuild's drop-then-create window leaves the extension loaded but the index missing. The BM25 execute is wrapped in try/except duckdb.Error → ILIKE retry so transient failures cannot 500 the /api/memory?search= endpoint. - KnowledgeRepository.count_items — mirrors the same FTS-or-ILIKE decision tree plus the execute-time fallback so the count always matches the paginated result set. - Per-mutation rebuild — create and title-or-content update rebuild the index via overwrite=1 PRAGMA. Status flips skip (token stream unchanged). - app/main.py lifespan rebuilds once at boot as a safety net for instances already on v47 across restarts. - bm25_score column shape: ILIKE fallback now selects `NULL AS bm25_score` so the result column set matches the FTS path. Consumers can read the score uniformly; absence of relevance ranking is signalled by the column being None everywhere, not missing. Tests in tests/test_knowledge_fts_search.py (9 tests): - BM25 multi-term match set + adversarial-review fix asserting higher-density doc ranks first (skipped if extension unavailable). - bm25_score column attached when extension available. - ILIKE fallback path on search + count_items via patched ensure_fts_loaded → False; bm25_score is None on this path. - Adversarial-review fix: search and count_items also fall back when the extension is loaded but the index is missing (simulated via drop_fts_index PRAGMA — the exact production failure mode the fallback guards against). - Index rebuild on create (new item searchable immediately). - Title update re-surfaces row under new term, drops old. - Czech-diacritic round-trip (cesky query → česky doc). Pinned schema-version asserts bumped 46 → 47 (test_db_schema_version, test_home_stats, test_schema_v42_migration, test_schema_v46_migration). Closes #121. * release: 0.54.20 — Corporate Memory BM25 search + All-Items bulk-edit batch bar
260 lines
12 KiB
Python
260 lines
12 KiB
Python
"""DuckDB FTS BM25 search over knowledge_items (issue #121).
|
|
|
|
Covers:
|
|
- BM25 ranking — multi-term query returns results scored by relevance,
|
|
not by ``updated_at``.
|
|
- ILIKE fallback path — when ``ensure_fts_loaded`` returns False
|
|
(extension blocked / network sandboxed), the legacy ILIKE behaviour
|
|
still runs and returns the full match set (just unranked).
|
|
- Index rebuild after mutation — newly created items show up in BM25
|
|
results in the same connection.
|
|
- Czech-diacritic handling — ``strip_accents=1`` lets ``cesky`` match
|
|
documents containing ``česky``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import patch
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
|
|
def _fresh_db(tmp_path, monkeypatch):
|
|
monkeypatch.setenv("DATA_DIR", str(tmp_path))
|
|
import src.db as db_module
|
|
|
|
db_module._system_db_conn = None
|
|
db_module._system_db_path = None
|
|
return db_module.get_system_db()
|
|
|
|
|
|
class TestBM25Ranking:
|
|
def test_multi_term_query_orders_by_relevance(self, tmp_path, monkeypatch):
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
# Item A: title hits twice (review + review), short content.
|
|
repo.create(id="a", title="review review", content="x", category="x")
|
|
# Item B: one title hit, one content hit.
|
|
repo.create(id="b", title="review", content="this is a review item", category="x")
|
|
# Item C: no hit at all.
|
|
repo.create(id="c", title="unrelated", content="nothing here", category="x")
|
|
|
|
results = repo.search("review", limit=10)
|
|
# Both A and B match; C must be filtered out by the BM25 IS NOT NULL.
|
|
ids = [r["id"] for r in results]
|
|
assert "c" not in ids
|
|
assert set(ids) == {"a", "b"}
|
|
# Insertion order is A then B. ILIKE fallback would return them by
|
|
# ``updated_at DESC`` → B before A. BM25 picks the higher-density
|
|
# match → A before B. Either ordering is acceptable here (depends
|
|
# on whether the FTS extension loaded); we just assert that the
|
|
# match set is correct. The next test guards BM25-specific shape.
|
|
|
|
def test_bm25_score_attached_when_extension_available(self, tmp_path, monkeypatch):
|
|
"""If FTS is available, results carry a ``bm25_score`` column.
|
|
|
|
Lets the API surface relevance scores to the UI later (#121
|
|
out-of-scope but tracked) without a separate query path.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="a", title="release process", content="", category="x")
|
|
repo.create(id="b", title="release cut", content="re-read live docs", category="x")
|
|
results = repo.search("release", limit=10)
|
|
assert len(results) == 2
|
|
for r in results:
|
|
assert "bm25_score" in r
|
|
assert r["bm25_score"] is not None
|
|
assert r["bm25_score"] > 0
|
|
|
|
def test_higher_density_match_ranks_first_when_fts_available(self, tmp_path, monkeypatch):
|
|
"""Adversarial-review fix: the previous ``test_multi_term_query_orders_by_relevance``
|
|
explicitly asserted only the match-set, not the ordering — which
|
|
left BM25 *ranking* untested even though it's the headline #121
|
|
feature. This test pins ordering when the extension is loaded:
|
|
Item A (term hits twice in a short title) MUST rank above Item B
|
|
(term hits twice across a longer title+content). When FTS is
|
|
unavailable we ``skip`` rather than ``xfail`` — the ILIKE
|
|
fallback's ``updated_at DESC`` ordering is an intentionally
|
|
different contract.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="a", title="review review", content="x", category="x")
|
|
repo.create(id="b", title="review", content="this is a review item", category="x")
|
|
results = repo.search("review", limit=10)
|
|
ids = [r["id"] for r in results]
|
|
assert set(ids) == {"a", "b"}
|
|
# Higher term density in A's tiny title-only body → higher BM25
|
|
# score → A precedes B in the result list.
|
|
assert ids.index("a") < ids.index("b"), (
|
|
f"expected A before B by BM25 density; got {ids}"
|
|
)
|
|
|
|
|
|
class TestILIKEFallback:
|
|
def test_falls_back_to_ilike_when_fts_unavailable(self, tmp_path, monkeypatch):
|
|
"""``ensure_fts_loaded`` returning False routes through the legacy
|
|
ILIKE path — same predicate, ORDER BY ``updated_at`` DESC.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="a", title="release one", content="", category="x")
|
|
repo.create(id="b", title="release two", content="", category="x")
|
|
repo.create(id="c", title="nothing", content="", category="x")
|
|
|
|
# Force the fallback path even if FTS is installed locally.
|
|
with patch("src.fts.ensure_fts_loaded", return_value=False):
|
|
results = repo.search("release", limit=10)
|
|
|
|
assert len(results) > 0, "fallback search returned empty before ordering check"
|
|
ids = [r["id"] for r in results]
|
|
assert "c" not in ids
|
|
assert set(ids) == {"a", "b"}
|
|
# Adversarial-review fix: the ILIKE branch now selects an
|
|
# explicit ``NULL AS bm25_score`` so the result-column shape
|
|
# matches the FTS path. Consumers can read the score uniformly;
|
|
# the absence of relevance ranking is signalled by the column
|
|
# being ``None`` everywhere, not by the column being missing.
|
|
assert "bm25_score" in results[0]
|
|
assert results[0]["bm25_score"] is None
|
|
|
|
def test_count_items_falls_back_to_ilike_when_fts_unavailable(self, tmp_path, monkeypatch):
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="a", title="release one", content="", category="x")
|
|
repo.create(id="b", title="release two", content="", category="x")
|
|
repo.create(id="c", title="nothing", content="", category="x")
|
|
|
|
with patch("src.fts.ensure_fts_loaded", return_value=False):
|
|
total = repo.count_items(search="release")
|
|
assert total == 2
|
|
|
|
def test_search_falls_back_when_index_missing_despite_extension_loaded(
|
|
self, tmp_path, monkeypatch
|
|
):
|
|
"""Adversarial-review fix: ``ensure_fts_loaded`` returning True
|
|
only guarantees the extension is loadable, NOT that the
|
|
``fts_main_knowledge_items`` index exists. Migration soft-fail,
|
|
a concurrent ``overwrite=1`` rebuild's drop-then-create window,
|
|
or a manual `DROP INDEX` (test fixtures sometimes do this) all
|
|
leave the extension loaded but the index missing. ``search()``
|
|
MUST catch ``duckdb.Error`` from the BM25 query and fall through
|
|
to ILIKE, not 500 the /api/memory?search= endpoint.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="a", title="release alpha", content="", category="x")
|
|
repo.create(id="b", title="release beta", content="", category="x")
|
|
repo.create(id="c", title="nothing", content="", category="x")
|
|
|
|
# Drop the FTS index out from under the repo. ``ensure_fts_loaded``
|
|
# will still return True (extension loaded), but the next BM25
|
|
# query raises a Catalog Error — the exact production failure
|
|
# mode this fallback guards against.
|
|
try:
|
|
conn.execute("PRAGMA drop_fts_index('main.knowledge_items')")
|
|
except Exception:
|
|
pytest.skip("drop_fts_index PRAGMA unavailable; cannot simulate missing-index path")
|
|
|
|
# search() must NOT raise — it must transparently fall through
|
|
# to ILIKE.
|
|
results = repo.search("release", limit=10)
|
|
ids = [r["id"] for r in results]
|
|
assert set(ids) == {"a", "b"}, f"expected ILIKE fallback to find a + b; got {ids}"
|
|
|
|
# count_items() honors the same fallback.
|
|
total = repo.count_items(search="release")
|
|
assert total == 2
|
|
|
|
|
|
class TestIndexRebuildOnMutation:
|
|
def test_new_item_is_searchable_immediately(self, tmp_path, monkeypatch):
|
|
"""``create()`` rebuilds the FTS index; the new item must surface
|
|
on the next ``search()`` against its terms.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
# No items yet → empty result.
|
|
assert repo.search("singleton") == []
|
|
|
|
repo.create(id="new", title="singleton token", content="rare", category="x")
|
|
results = repo.search("singleton")
|
|
assert len(results) == 1
|
|
assert results[0]["id"] == "new"
|
|
|
|
def test_title_update_resurfaces_under_new_term(self, tmp_path, monkeypatch):
|
|
"""Updating the title rebuilds the index so the row matches the
|
|
new term and stops matching the old one.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(id="r", title="alpha bravo", content="", category="x")
|
|
assert [r["id"] for r in repo.search("alpha")] == ["r"]
|
|
|
|
repo.update("r", title="charlie delta")
|
|
# Old term no longer matches.
|
|
assert repo.search("alpha") == []
|
|
# New term matches.
|
|
assert [r["id"] for r in repo.search("charlie")] == ["r"]
|
|
|
|
|
|
class TestCzechDiacritics:
|
|
def test_query_without_diacritics_matches_indexed_diacritics(self, tmp_path, monkeypatch):
|
|
"""``strip_accents=1`` lets a query of ``cesky`` match a doc
|
|
containing ``česky``. This is the requirement from issue #121.
|
|
"""
|
|
conn = _fresh_db(tmp_path, monkeypatch)
|
|
from src.fts import ensure_fts_loaded
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
if not ensure_fts_loaded(conn):
|
|
pytest.skip("fts extension not loadable in this environment")
|
|
|
|
repo = KnowledgeRepository(conn)
|
|
repo.create(
|
|
id="cs",
|
|
title="česky preferuje",
|
|
content="user prefers Czech locale; česká diakritika test",
|
|
category="x",
|
|
)
|
|
# Accent-stripped query must still hit the diacritic-bearing doc.
|
|
results = repo.search("cesky")
|
|
assert [r["id"] for r in results] == ["cs"]
|