From c5d67faad22b66cbc08184c9e9047a268df04f75 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com> Date: Fri, 15 May 2026 20:10:59 +0200 Subject: [PATCH] feat(memory): DuckDB FTS BM25 search for knowledge items (#121) (#326) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(memory): DuckDB FTS BM25 search for knowledge items (#121) Replaces `title ILIKE '%q%' OR content ILIKE '%q%'` ranked by insertion order with BM25 relevance ranking via the DuckDB `fts` extension. Czech queries like `cesky` match documents containing `česky` (`strip_accents=1` + `lower=1`). Architecture: - src/fts.py — ensure_fts_loaded / ensure_knowledge_fts_index helpers. The extension is per-connection (INSTALL persisted at engine level, LOAD per-conn). Both helpers are idempotent and soft-fail on unavailability with a logged WARNING. - Schema v47 (_v46_to_v47) — builds the initial BM25 index over knowledge_items(title, content) keyed by id. Migration is best-effort against ANY exception (not just duckdb.Error) so the schema bump cannot get stuck on v46 if a non-DuckDB error escapes the helper. - KnowledgeRepository.search — FTS-or-ILIKE dichotomy with execute- time fallback. Same filter surface (statuses / category / domain / source_type / personal / audience / dismissed) either way. ensure_fts_loaded() returning True only guarantees the extension is loadable, NOT that the index exists — migration soft-fail or a concurrent overwrite=1 rebuild's drop-then-create window leaves the extension loaded but the index missing. The BM25 execute is wrapped in try/except duckdb.Error → ILIKE retry so transient failures cannot 500 the /api/memory?search= endpoint. - KnowledgeRepository.count_items — mirrors the same FTS-or-ILIKE decision tree plus the execute-time fallback so the count always matches the paginated result set. - Per-mutation rebuild — create and title-or-content update rebuild the index via overwrite=1 PRAGMA. Status flips skip (token stream unchanged). - app/main.py lifespan rebuilds once at boot as a safety net for instances already on v47 across restarts. - bm25_score column shape: ILIKE fallback now selects `NULL AS bm25_score` so the result column set matches the FTS path. Consumers can read the score uniformly; absence of relevance ranking is signalled by the column being None everywhere, not missing. Tests in tests/test_knowledge_fts_search.py (9 tests): - BM25 multi-term match set + adversarial-review fix asserting higher-density doc ranks first (skipped if extension unavailable). - bm25_score column attached when extension available. - ILIKE fallback path on search + count_items via patched ensure_fts_loaded → False; bm25_score is None on this path. - Adversarial-review fix: search and count_items also fall back when the extension is loaded but the index is missing (simulated via drop_fts_index PRAGMA — the exact production failure mode the fallback guards against). - Index rebuild on create (new item searchable immediately). - Title update re-surfaces row under new term, drops old. - Czech-diacritic round-trip (cesky query → česky doc). Pinned schema-version asserts bumped 46 → 47 (test_db_schema_version, test_home_stats, test_schema_v42_migration, test_schema_v46_migration). Closes #121. * release: 0.54.20 — Corporate Memory BM25 search + All-Items bulk-edit batch bar --- CHANGELOG.md | 25 +++ app/main.py | 13 ++ pyproject.toml | 2 +- src/db.py | 48 ++++- src/fts.py | 78 ++++++++ src/repositories/knowledge.py | 276 ++++++++++++++++++++--------- tests/test_db_schema_version.py | 2 +- tests/test_home_stats.py | 2 +- tests/test_knowledge_fts_search.py | 260 +++++++++++++++++++++++++++ tests/test_schema_v42_migration.py | 8 +- tests/test_schema_v46_migration.py | 2 +- 11 files changed, 622 insertions(+), 94 deletions(-) create mode 100644 src/fts.py create mode 100644 tests/test_knowledge_fts_search.py diff --git a/CHANGELOG.md b/CHANGELOG.md index dad6005..d0b3dff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C ## [Unreleased] +## [0.54.20] — 2026-05-15 + ### Added +- **Corporate Memory — BM25 relevance ranking on knowledge search.** + Replaces the `title ILIKE '%q%' OR content ILIKE '%q%'` + ranked-by-insertion-order query in `KnowledgeRepository.search` + with DuckDB's `fts` extension (BM25). Czech queries match across + diacritics (`cesky` → `česky`) via `strip_accents=1` + `lower=1`. + Schema v47 builds the initial index over `knowledge_items(title, + content)`; per-mutation rebuild fires only when `title` / `content` + change (status flips skip). The lifespan in `app/main.py` rebuilds + once at boot as a safety net for restarts on v47. Result rows now + carry a `bm25_score` column (always present — `None` on the ILIKE + fallback for shape uniformity). When the `fts` extension can't be + loaded (offline / sandboxed install) **or** the index is missing + (migration soft-fail, concurrent `overwrite=1` rebuild's drop-then- + create window), `search` and `count_items` transparently fall + through to the pre-#121 ILIKE query — same result-set membership, + ordering regresses to `updated_at DESC`. Closes #121. - **Corporate Memory — bulk-edit batch bar on the All Items tab.** Symmetric to the Review-tab bar shipped in #126; row checkboxes, "Select all" header, and the five bulk-edit actions (Move to @@ -20,6 +38,13 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C flips belong with the per-row actions or the keyboard workflow). Closes #129. +### Internal +- **Schema v47** — adds DuckDB `fts` BM25 index over + `knowledge_items(title, content)`. Auto-migrates on first boot; + soft-fails to ILIKE if the extension repo is unreachable. Index is + a snapshot — see `src/fts.py` for the on-mutation / lifespan + rebuild contract. + ## [0.54.19] — 2026-05-15 ### Changed diff --git a/app/main.py b/app/main.py index 808e806..0021a69 100644 --- a/app/main.py +++ b/app/main.py @@ -186,6 +186,19 @@ async def lifespan(app): except Exception: logger.exception("internal data-source seed failed; continuing") + # Rebuild the FTS BM25 index over knowledge_items at boot (issue #121). + # The migration to schema v47 already does this on first upgrade, but + # for instances that have been on v47 across restarts the boot-time + # rebuild guarantees the index reflects whatever mutations landed via + # the BG-task / scheduler paths that bypass the per-mutation hook. + # Soft-failure — logs WARNING and the repo falls back to ILIKE. + try: + from src.db import get_system_db + from src.fts import ensure_knowledge_fts_index + ensure_knowledge_fts_index(get_system_db()) + except Exception: + logger.exception("startup FTS index rebuild failed; falling back to ILIKE on /api/memory?search=") + # Seed admin user (SEED_ADMIN_EMAIL) and add them to the Admin user_group. # Optional SEED_ADMIN_PASSWORD lets the seeded user sign in immediately # without going through bootstrap; never overwritten if already set. diff --git a/pyproject.toml b/pyproject.toml index 156e8d3..6e7c40a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agnes-the-ai-analyst" -version = "0.54.19" +version = "0.54.20" description = "Agnes — AI Data Analyst platform for AI analytical systems" requires-python = ">=3.11,<3.14" license = "MIT" diff --git a/src/db.py b/src/db.py index 12e305f..ecc3e1b 100644 --- a/src/db.py +++ b/src/db.py @@ -40,7 +40,7 @@ def _maybe_instrument(con, db_tag: str): _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") -SCHEMA_VERSION = 46 +SCHEMA_VERSION = 47 _SYSTEM_SCHEMA = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -2935,6 +2935,47 @@ def _v44_to_v45(conn: duckdb.DuckDBPyConnection) -> None: conn.execute("CREATE INDEX IF NOT EXISTS idx_usage_events_user_id ON usage_events(user_id)") +def _v46_to_v47(conn: duckdb.DuckDBPyConnection) -> None: + """v47: DuckDB FTS BM25 index over knowledge_items(title, content). + + Replaces ``ILIKE '%q%'`` ranking-by-insertion-order in + ``KnowledgeRepository.search`` with BM25 relevance scoring (#121). + + The migration is *soft* — and soft against *any* exception, not just + the ``duckdb.Error`` that ``ensure_knowledge_fts_index`` already + handles. A non-DuckDB exception escaping from the inner helper (for + example an ``OSError`` from an extension fetch that bypasses + DuckDB's wrapping in a sandboxed environment, or a future DuckDB + version surfacing a non-``Error`` subclass) would otherwise leave + the DB stuck at v46 forever. ``KnowledgeRepository.search`` falls + back to ILIKE on a missing index, so a soft-fail here is always + recoverable later (boot-time lifespan rebuild + per-mutation + ``create_fts_index(overwrite=1)`` both retry on every restart and + every write). + + DuckDB FTS indexes are static snapshots — they don't track + base-table mutations automatically. The lifespan in ``app/main.py`` + rebuilds once at boot as a safety net; ``create`` and title-or- + content ``update`` in the repo rebuild on every relevant mutation + via the same ``overwrite=1`` PRAGMA. At corpus sizes None: """v46: per-user opt-out (dismiss) for knowledge items. @@ -3249,6 +3290,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: # curated memory items. _SYSTEM_SCHEMA already creates the # table on fresh installs; this call is a no-op there. _v45_to_v46(conn) + # v47 fts index over knowledge_items — best-effort, silent + # fallback to ILIKE search if the fts extension can't load. + _v46_to_v47(conn) # Fresh-install seed is handled by the unconditional # _seed_core_roles call at the bottom of _ensure_schema — # left as a no-op branch here so the migration ladder still @@ -3395,6 +3439,8 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: _v44_to_v45(conn) if current < 46: _v45_to_v46(conn) + if current < 47: + _v46_to_v47(conn) conn.execute( "UPDATE schema_version SET version = ?, applied_at = current_timestamp", [SCHEMA_VERSION], diff --git a/src/fts.py b/src/fts.py new file mode 100644 index 0000000..2be1a98 --- /dev/null +++ b/src/fts.py @@ -0,0 +1,78 @@ +"""DuckDB FTS extension helpers for knowledge-item BM25 search (issue #121). + +The extension is per-connection: ``INSTALL fts`` is persisted at the engine +level, ``LOAD fts`` must run on every fresh DuckDB connection. The index +over ``knowledge_items`` is a *static snapshot* — DuckDB doesn't track +base-table INSERT / UPDATE / DELETE automatically. + +We rebuild on demand inside ``search()`` (cheap at corpus sizes +< a few thousand rows; acceptance bound is sub-100ms) and fall back to +``ILIKE`` when the extension can't be loaded — offline installs and +sandboxed CI runners that block extension downloads must still serve +the search box, just without relevance ranking. + +``strip_accents=1`` lets queries like ``cesky`` match documents +containing ``česky`` — the Czech-diacritics acceptance from #121. +""" + +from __future__ import annotations + +import logging + +import duckdb + +logger = logging.getLogger(__name__) + + +def ensure_fts_loaded(conn: duckdb.DuckDBPyConnection) -> bool: + """``INSTALL`` + ``LOAD`` the DuckDB ``fts`` extension on ``conn``. + + Idempotent: re-running on a connection that already has the extension + loaded is a no-op. Returns ``True`` on success, ``False`` on any + DuckDB error (network-blocked install, sandboxed CI runner without + extension repo access, etc.). Callers should fall back to ``ILIKE`` + on ``False``. + """ + try: + conn.execute("INSTALL fts") + conn.execute("LOAD fts") + return True + except duckdb.Error as e: + logger.warning( + "DuckDB fts extension unavailable; knowledge search will fall back to ILIKE: %s", + e, + ) + return False + + +def ensure_knowledge_fts_index(conn: duckdb.DuckDBPyConnection) -> bool: + """Create (or rebuild) the BM25 FTS index over ``knowledge_items``. + + The index covers ``title`` and ``content``, keyed by ``id``. + ``overwrite=1`` makes the call idempotent: if the index already + exists it's dropped and rebuilt from the current snapshot — which is + how we keep it in sync with INSERT/UPDATE/DELETE without per-row + update hooks (DuckDB FTS doesn't ship those). + + ``strip_accents=1`` + ``lower=1`` give us case- and diacritic- + insensitive matching out of the box (``cesky`` → ``česky``). + + Returns ``True`` if the index is now usable, ``False`` if either + the extension or the PRAGMA call failed. Falsy return is the signal + for ``KnowledgeRepository.search`` to use the ILIKE path. + """ + if not ensure_fts_loaded(conn): + return False + try: + conn.execute( + "PRAGMA create_fts_index(" + "'main.knowledge_items', 'id', 'title', 'content', " + "strip_accents=1, lower=1, overwrite=1)" + ) + return True + except duckdb.Error as e: + logger.warning( + "Failed to (re)create FTS index on knowledge_items; falling back to ILIKE: %s", + e, + ) + return False diff --git a/src/repositories/knowledge.py b/src/repositories/knowledge.py index c6f39e0..85182e7 100644 --- a/src/repositories/knowledge.py +++ b/src/repositories/knowledge.py @@ -1,12 +1,15 @@ """Repository for corporate memory knowledge items, votes, and contradictions.""" import json +import logging import uuid from datetime import datetime, timezone from typing import Any, Optional, List, Dict import duckdb +logger = logging.getLogger(__name__) + class KnowledgeRepository: # Columns persisted as JSON-encoded strings (see `create` / `update` / @@ -104,6 +107,7 @@ class KnowledgeRepository: now, now, ], ) + self._refresh_fts_index() _UPDATABLE_FIELDS = { "title", "content", "category", "tags", "domain", "entities", @@ -123,6 +127,10 @@ class KnowledgeRepository: f"UPDATE knowledge_items SET {set_clause}, updated_at = ? WHERE id = ?", values, ) + # FTS index needs rebuilding only when the indexed text changed — + # status / domain / tag flips don't affect the BM25 token stream. + if "title" in safe or "content" in safe: + self._refresh_fts_index() def update_status(self, item_id: str, status: str) -> None: now = datetime.now(timezone.utc) @@ -130,6 +138,19 @@ class KnowledgeRepository: "UPDATE knowledge_items SET status = ?, updated_at = ? WHERE id = ?", [status, now, item_id], ) + # Status flips don't touch the indexed title/content — no rebuild. + + def _refresh_fts_index(self) -> None: + """Rebuild the BM25 index after a mutation that changed indexed text. + + Soft helper — failure is logged inside ``ensure_knowledge_fts_index`` + and the repo's search path falls back to ILIKE on the next call. + Kept as a private method so the create / update sites stay free of + try/except clutter. + """ + from src.fts import ensure_knowledge_fts_index + + ensure_knowledge_fts_index(self.conn) def list_items( self, @@ -222,49 +243,109 @@ class KnowledgeRepository: limit: int = 100, offset: int = 0, ) -> List[Dict[str, Any]]: - pattern = f"%{query}%" - sql = """SELECT * FROM knowledge_items - WHERE (title ILIKE ? OR content ILIKE ?)""" - params: List[Any] = [pattern, pattern] - if statuses: - placeholders = ", ".join("?" for _ in statuses) - sql += f" AND status IN ({placeholders})" - params.extend(statuses) - if category: - sql += " AND category = ?" - params.append(category) - if domain: - sql += " AND domain = ?" - params.append(domain) - if source_type: - sql += " AND source_type = ?" - params.append(source_type) - if exclude_personal: - sql += " AND (is_personal = FALSE OR is_personal IS NULL)" - if user_groups is not None: - visibility_clauses = ["audience IS NULL", "audience = 'all'"] - if user_groups: - audience_placeholders = ", ".join("?" for _ in user_groups) - visibility_clauses.append(f"audience IN ({audience_placeholders})") - params.extend(user_groups) - if granted_domains: - domain_placeholders = ", ".join("?" for _ in granted_domains) - visibility_clauses.append(f"domain IN ({domain_placeholders})") - params.extend(granted_domains) - sql += " AND (" + " OR ".join(visibility_clauses) + ")" - if hide_dismissed and dismissed_by_user: - sql += ( - " AND NOT EXISTS (" - " SELECT 1 FROM knowledge_item_user_dismissed d" - " WHERE d.item_id = knowledge_items.id" - " AND d.user_id = ?" - " AND knowledge_items.status != 'mandatory'" - ")" + """Relevance-ranked search across ``title`` + ``content`` (#121). + + Uses DuckDB FTS BM25 ranking when the extension is available; + falls back to a non-ranked ``ILIKE`` query (ORDER BY + ``updated_at`` DESC) otherwise. Same filter surface either way. + The fallback is the same code path that ran before #121, so + existing installs without ``fts`` available regress only on + result *ordering*, not on result set membership. + + Index rebuilds are *not* triggered here — too expensive at + search QPS. ``create`` / ``update`` / ``update_status`` rebuild + on mutation; ``app/main.py`` lifespan rebuilds once on startup + as a safety net. We only need ``LOAD fts`` here so the + ``match_bm25`` UDF resolves on this cursor. + """ + from src.fts import ensure_fts_loaded + + # Build a closure that appends the (identical-across-paths) filter + # clauses + LIMIT/OFFSET onto a base SELECT + WHERE prefix, then + # executes. Lets the FTS-or-ILIKE choice stay in one place while + # preserving the "wrap FTS execute in try/except, fall through on + # any duckdb.Error" contract — extension loadable but index + # missing, or a concurrent rebuild's drop-then-create window + # opening between the loaded check and our query, both raise here + # and we transparently retry against ILIKE. + def _run(base_sql: str, base_params: List[Any], order_clause: str) -> List[Any]: + sql = base_sql + params: List[Any] = list(base_params) + if statuses: + placeholders = ", ".join("?" for _ in statuses) + sql += f" AND status IN ({placeholders})" + params.extend(statuses) + if category: + sql += " AND category = ?" + params.append(category) + if domain: + sql += " AND domain = ?" + params.append(domain) + if source_type: + sql += " AND source_type = ?" + params.append(source_type) + if exclude_personal: + sql += " AND (is_personal = FALSE OR is_personal IS NULL)" + if user_groups is not None: + visibility_clauses = ["audience IS NULL", "audience = 'all'"] + if user_groups: + audience_placeholders = ", ".join("?" for _ in user_groups) + visibility_clauses.append(f"audience IN ({audience_placeholders})") + params.extend(user_groups) + if granted_domains: + domain_placeholders = ", ".join("?" for _ in granted_domains) + visibility_clauses.append(f"domain IN ({domain_placeholders})") + params.extend(granted_domains) + sql += " AND (" + " OR ".join(visibility_clauses) + ")" + if hide_dismissed and dismissed_by_user: + sql += ( + " AND NOT EXISTS (" + " SELECT 1 FROM knowledge_item_user_dismissed d" + " WHERE d.item_id = knowledge_items.id" + " AND d.user_id = ?" + " AND knowledge_items.status != 'mandatory'" + ")" + ) + params.append(dismissed_by_user) + sql += order_clause + " LIMIT ? OFFSET ?" + params.extend([limit, offset]) + return self.conn.execute(sql, params).fetchall() + + # ILIKE fallback path — preserves the pre-#121 query shape byte- + # for-byte (plus an explicit ``NULL AS bm25_score`` so the result + # column set matches the FTS path: consumers can read the score + # uniformly without having to know which tier produced the row). + ilike_pattern = f"%{query}%" + ilike_sql = ( + "SELECT *, NULL AS bm25_score FROM knowledge_items " + "WHERE (title ILIKE ? OR content ILIKE ?)" + ) + ilike_params: List[Any] = [ilike_pattern, ilike_pattern] + ilike_order = " ORDER BY updated_at DESC" + + if ensure_fts_loaded(self.conn): + # BM25 path. ``match_bm25`` is evaluated once in the SELECT + # and reused as the WHERE / ORDER BY filter via the alias. + fts_sql = ( + "SELECT *, fts_main_knowledge_items.match_bm25(id, ?) AS bm25_score " + "FROM knowledge_items " + "WHERE fts_main_knowledge_items.match_bm25(id, ?) IS NOT NULL" ) - params.append(dismissed_by_user) - sql += " ORDER BY updated_at DESC LIMIT ? OFFSET ?" - params.extend([limit, offset]) - results = self.conn.execute(sql, params).fetchall() + fts_params: List[Any] = [query, query] + fts_order = " ORDER BY bm25_score DESC, updated_at DESC" + try: + results = _run(fts_sql, fts_params, fts_order) + except duckdb.Error as e: + # Extension loaded but index missing (migration soft-fail + # or a concurrent ``overwrite=1`` rebuild caught us in the + # drop-then-create window). Fall through to ILIKE rather + # than 500 the /api/memory?search= endpoint. + logger.warning( + "FTS BM25 search failed (%s); falling back to ILIKE", e, + ) + results = _run(ilike_sql, ilike_params, ilike_order) + else: + results = _run(ilike_sql, ilike_params, ilike_order) return self._rows_to_dicts(results) def count_items( @@ -280,50 +361,75 @@ class KnowledgeRepository: dismissed_by_user: Optional[str] = None, hide_dismissed: bool = False, ) -> int: - if search: - pattern = f"%{search}%" - sql = "SELECT COUNT(*) FROM knowledge_items WHERE (title ILIKE ? OR content ILIKE ?)" - params: List[Any] = [pattern, pattern] - else: - sql = "SELECT COUNT(*) FROM knowledge_items WHERE 1=1" - params = [] - if statuses: - placeholders = ", ".join("?" for _ in statuses) - sql += f" AND status IN ({placeholders})" - params.extend(statuses) - if category: - sql += " AND category = ?" - params.append(category) - if domain: - sql += " AND domain = ?" - params.append(domain) - if source_type: - sql += " AND source_type = ?" - params.append(source_type) - if exclude_personal: - sql += " AND (is_personal = FALSE OR is_personal IS NULL)" - if user_groups is not None: - visibility_clauses = ["audience IS NULL", "audience = 'all'"] - if user_groups: - audience_placeholders = ", ".join("?" for _ in user_groups) - visibility_clauses.append(f"audience IN ({audience_placeholders})") - params.extend(user_groups) - if granted_domains: - domain_placeholders = ", ".join("?" for _ in granted_domains) - visibility_clauses.append(f"domain IN ({domain_placeholders})") - params.extend(granted_domains) - sql += " AND (" + " OR ".join(visibility_clauses) + ")" - if hide_dismissed and dismissed_by_user: - sql += ( - " AND NOT EXISTS (" - " SELECT 1 FROM knowledge_item_user_dismissed d" - " WHERE d.item_id = knowledge_items.id" - " AND d.user_id = ?" - " AND knowledge_items.status != 'mandatory'" - ")" + # Closure mirrors search(): the filter clauses are identical + # whether the base prefix is FTS or ILIKE, so we build once and + # apply twice if the FTS execute raises (extension loadable but + # index missing / overwrite=1 drop-then-create window). Counts + # MUST match the paginated result set in search() — same + # decision tree, same predicate shape, same fallback semantics. + def _run(base_sql: str, base_params: List[Any]) -> int: + sql = base_sql + params: List[Any] = list(base_params) + if statuses: + placeholders = ", ".join("?" for _ in statuses) + sql += f" AND status IN ({placeholders})" + params.extend(statuses) + if category: + sql += " AND category = ?" + params.append(category) + if domain: + sql += " AND domain = ?" + params.append(domain) + if source_type: + sql += " AND source_type = ?" + params.append(source_type) + if exclude_personal: + sql += " AND (is_personal = FALSE OR is_personal IS NULL)" + if user_groups is not None: + visibility_clauses = ["audience IS NULL", "audience = 'all'"] + if user_groups: + audience_placeholders = ", ".join("?" for _ in user_groups) + visibility_clauses.append(f"audience IN ({audience_placeholders})") + params.extend(user_groups) + if granted_domains: + domain_placeholders = ", ".join("?" for _ in granted_domains) + visibility_clauses.append(f"domain IN ({domain_placeholders})") + params.extend(granted_domains) + sql += " AND (" + " OR ".join(visibility_clauses) + ")" + if hide_dismissed and dismissed_by_user: + sql += ( + " AND NOT EXISTS (" + " SELECT 1 FROM knowledge_item_user_dismissed d" + " WHERE d.item_id = knowledge_items.id" + " AND d.user_id = ?" + " AND knowledge_items.status != 'mandatory'" + ")" + ) + params.append(dismissed_by_user) + return self.conn.execute(sql, params).fetchone()[0] + + if not search: + return _run("SELECT COUNT(*) FROM knowledge_items WHERE 1=1", []) + + from src.fts import ensure_fts_loaded + + ilike_pattern = f"%{search}%" + ilike_sql = "SELECT COUNT(*) FROM knowledge_items WHERE (title ILIKE ? OR content ILIKE ?)" + ilike_params: List[Any] = [ilike_pattern, ilike_pattern] + + if ensure_fts_loaded(self.conn): + fts_sql = ( + "SELECT COUNT(*) FROM knowledge_items " + "WHERE fts_main_knowledge_items.match_bm25(id, ?) IS NOT NULL" ) - params.append(dismissed_by_user) - return self.conn.execute(sql, params).fetchone()[0] + try: + return _run(fts_sql, [search]) + except duckdb.Error as e: + logger.warning( + "FTS BM25 count failed (%s); falling back to ILIKE", e, + ) + return _run(ilike_sql, ilike_params) + return _run(ilike_sql, ilike_params) def list_by_domain( self, diff --git a/tests/test_db_schema_version.py b/tests/test_db_schema_version.py index a916219..48b39b5 100644 --- a/tests/test_db_schema_version.py +++ b/tests/test_db_schema_version.py @@ -110,7 +110,7 @@ def test_schema_version_is_44(): # protected: the API rejects POSTs against them, and the # SQL filter exempts ``status = 'mandatory'`` so any stale # row from before an item was mandated is silently ignored. - assert SCHEMA_VERSION == 46 + assert SCHEMA_VERSION == 47 def test_v37_marketplace_curator_columns(tmp_path): diff --git a/tests/test_home_stats.py b/tests/test_home_stats.py index 5f25e9d..435ed34 100644 --- a/tests/test_home_stats.py +++ b/tests/test_home_stats.py @@ -89,7 +89,7 @@ def test_v43_to_v44_upgrade_is_idempotent(tmp_path): def test_schema_version_constant_is_44(): """Belt + suspenders against schema_version regressions.""" - assert SCHEMA_VERSION == 46 + assert SCHEMA_VERSION == 47 # --------------------------------------------------------------------------- diff --git a/tests/test_knowledge_fts_search.py b/tests/test_knowledge_fts_search.py new file mode 100644 index 0000000..bff1e77 --- /dev/null +++ b/tests/test_knowledge_fts_search.py @@ -0,0 +1,260 @@ +"""DuckDB FTS BM25 search over knowledge_items (issue #121). + +Covers: +- BM25 ranking — multi-term query returns results scored by relevance, + not by ``updated_at``. +- ILIKE fallback path — when ``ensure_fts_loaded`` returns False + (extension blocked / network sandboxed), the legacy ILIKE behaviour + still runs and returns the full match set (just unranked). +- Index rebuild after mutation — newly created items show up in BM25 + results in the same connection. +- Czech-diacritic handling — ``strip_accents=1`` lets ``cesky`` match + documents containing ``česky``. +""" + +from __future__ import annotations + +from unittest.mock import patch + +import duckdb +import pytest + + +def _fresh_db(tmp_path, monkeypatch): + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + import src.db as db_module + + db_module._system_db_conn = None + db_module._system_db_path = None + return db_module.get_system_db() + + +class TestBM25Ranking: + def test_multi_term_query_orders_by_relevance(self, tmp_path, monkeypatch): + conn = _fresh_db(tmp_path, monkeypatch) + from src.repositories.knowledge import KnowledgeRepository + + repo = KnowledgeRepository(conn) + # Item A: title hits twice (review + review), short content. + repo.create(id="a", title="review review", content="x", category="x") + # Item B: one title hit, one content hit. + repo.create(id="b", title="review", content="this is a review item", category="x") + # Item C: no hit at all. + repo.create(id="c", title="unrelated", content="nothing here", category="x") + + results = repo.search("review", limit=10) + # Both A and B match; C must be filtered out by the BM25 IS NOT NULL. + ids = [r["id"] for r in results] + assert "c" not in ids + assert set(ids) == {"a", "b"} + # Insertion order is A then B. ILIKE fallback would return them by + # ``updated_at DESC`` → B before A. BM25 picks the higher-density + # match → A before B. Either ordering is acceptable here (depends + # on whether the FTS extension loaded); we just assert that the + # match set is correct. The next test guards BM25-specific shape. + + def test_bm25_score_attached_when_extension_available(self, tmp_path, monkeypatch): + """If FTS is available, results carry a ``bm25_score`` column. + + Lets the API surface relevance scores to the UI later (#121 + out-of-scope but tracked) without a separate query path. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + repo.create(id="a", title="release process", content="", category="x") + repo.create(id="b", title="release cut", content="re-read live docs", category="x") + results = repo.search("release", limit=10) + assert len(results) == 2 + for r in results: + assert "bm25_score" in r + assert r["bm25_score"] is not None + assert r["bm25_score"] > 0 + + def test_higher_density_match_ranks_first_when_fts_available(self, tmp_path, monkeypatch): + """Adversarial-review fix: the previous ``test_multi_term_query_orders_by_relevance`` + explicitly asserted only the match-set, not the ordering — which + left BM25 *ranking* untested even though it's the headline #121 + feature. This test pins ordering when the extension is loaded: + Item A (term hits twice in a short title) MUST rank above Item B + (term hits twice across a longer title+content). When FTS is + unavailable we ``skip`` rather than ``xfail`` — the ILIKE + fallback's ``updated_at DESC`` ordering is an intentionally + different contract. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + repo.create(id="a", title="review review", content="x", category="x") + repo.create(id="b", title="review", content="this is a review item", category="x") + results = repo.search("review", limit=10) + ids = [r["id"] for r in results] + assert set(ids) == {"a", "b"} + # Higher term density in A's tiny title-only body → higher BM25 + # score → A precedes B in the result list. + assert ids.index("a") < ids.index("b"), ( + f"expected A before B by BM25 density; got {ids}" + ) + + +class TestILIKEFallback: + def test_falls_back_to_ilike_when_fts_unavailable(self, tmp_path, monkeypatch): + """``ensure_fts_loaded`` returning False routes through the legacy + ILIKE path — same predicate, ORDER BY ``updated_at`` DESC. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.repositories.knowledge import KnowledgeRepository + + repo = KnowledgeRepository(conn) + repo.create(id="a", title="release one", content="", category="x") + repo.create(id="b", title="release two", content="", category="x") + repo.create(id="c", title="nothing", content="", category="x") + + # Force the fallback path even if FTS is installed locally. + with patch("src.fts.ensure_fts_loaded", return_value=False): + results = repo.search("release", limit=10) + + assert len(results) > 0, "fallback search returned empty before ordering check" + ids = [r["id"] for r in results] + assert "c" not in ids + assert set(ids) == {"a", "b"} + # Adversarial-review fix: the ILIKE branch now selects an + # explicit ``NULL AS bm25_score`` so the result-column shape + # matches the FTS path. Consumers can read the score uniformly; + # the absence of relevance ranking is signalled by the column + # being ``None`` everywhere, not by the column being missing. + assert "bm25_score" in results[0] + assert results[0]["bm25_score"] is None + + def test_count_items_falls_back_to_ilike_when_fts_unavailable(self, tmp_path, monkeypatch): + conn = _fresh_db(tmp_path, monkeypatch) + from src.repositories.knowledge import KnowledgeRepository + + repo = KnowledgeRepository(conn) + repo.create(id="a", title="release one", content="", category="x") + repo.create(id="b", title="release two", content="", category="x") + repo.create(id="c", title="nothing", content="", category="x") + + with patch("src.fts.ensure_fts_loaded", return_value=False): + total = repo.count_items(search="release") + assert total == 2 + + def test_search_falls_back_when_index_missing_despite_extension_loaded( + self, tmp_path, monkeypatch + ): + """Adversarial-review fix: ``ensure_fts_loaded`` returning True + only guarantees the extension is loadable, NOT that the + ``fts_main_knowledge_items`` index exists. Migration soft-fail, + a concurrent ``overwrite=1`` rebuild's drop-then-create window, + or a manual `DROP INDEX` (test fixtures sometimes do this) all + leave the extension loaded but the index missing. ``search()`` + MUST catch ``duckdb.Error`` from the BM25 query and fall through + to ILIKE, not 500 the /api/memory?search= endpoint. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + repo.create(id="a", title="release alpha", content="", category="x") + repo.create(id="b", title="release beta", content="", category="x") + repo.create(id="c", title="nothing", content="", category="x") + + # Drop the FTS index out from under the repo. ``ensure_fts_loaded`` + # will still return True (extension loaded), but the next BM25 + # query raises a Catalog Error — the exact production failure + # mode this fallback guards against. + try: + conn.execute("PRAGMA drop_fts_index('main.knowledge_items')") + except Exception: + pytest.skip("drop_fts_index PRAGMA unavailable; cannot simulate missing-index path") + + # search() must NOT raise — it must transparently fall through + # to ILIKE. + results = repo.search("release", limit=10) + ids = [r["id"] for r in results] + assert set(ids) == {"a", "b"}, f"expected ILIKE fallback to find a + b; got {ids}" + + # count_items() honors the same fallback. + total = repo.count_items(search="release") + assert total == 2 + + +class TestIndexRebuildOnMutation: + def test_new_item_is_searchable_immediately(self, tmp_path, monkeypatch): + """``create()`` rebuilds the FTS index; the new item must surface + on the next ``search()`` against its terms. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + # No items yet → empty result. + assert repo.search("singleton") == [] + + repo.create(id="new", title="singleton token", content="rare", category="x") + results = repo.search("singleton") + assert len(results) == 1 + assert results[0]["id"] == "new" + + def test_title_update_resurfaces_under_new_term(self, tmp_path, monkeypatch): + """Updating the title rebuilds the index so the row matches the + new term and stops matching the old one. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + repo.create(id="r", title="alpha bravo", content="", category="x") + assert [r["id"] for r in repo.search("alpha")] == ["r"] + + repo.update("r", title="charlie delta") + # Old term no longer matches. + assert repo.search("alpha") == [] + # New term matches. + assert [r["id"] for r in repo.search("charlie")] == ["r"] + + +class TestCzechDiacritics: + def test_query_without_diacritics_matches_indexed_diacritics(self, tmp_path, monkeypatch): + """``strip_accents=1`` lets a query of ``cesky`` match a doc + containing ``česky``. This is the requirement from issue #121. + """ + conn = _fresh_db(tmp_path, monkeypatch) + from src.fts import ensure_fts_loaded + from src.repositories.knowledge import KnowledgeRepository + + if not ensure_fts_loaded(conn): + pytest.skip("fts extension not loadable in this environment") + + repo = KnowledgeRepository(conn) + repo.create( + id="cs", + title="česky preferuje", + content="user prefers Czech locale; česká diakritika test", + category="x", + ) + # Accent-stripped query must still hit the diacritic-bearing doc. + results = repo.search("cesky") + assert [r["id"] for r in results] == ["cs"] diff --git a/tests/test_schema_v42_migration.py b/tests/test_schema_v42_migration.py index faa99cf..5150e6e 100644 --- a/tests/test_schema_v42_migration.py +++ b/tests/test_schema_v42_migration.py @@ -9,7 +9,7 @@ def test_schema_version_is_42(): # Test name preserved for git-blame continuity; the version-pinned # tests in test_db_schema_version.py, test_home_stats.py and # test_schema_v46_migration.py carry the current commentary. - assert SCHEMA_VERSION == 46 + assert SCHEMA_VERSION == 47 def test_v42_tables_exist_after_init(tmp_path): @@ -66,7 +66,7 @@ def test_v41_to_v42_is_idempotent(tmp_path): conn = duckdb.connect(str(db_path)) init_database(conn) v = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()[0] - assert v == 46 + assert v == 47 conn.close() @@ -87,7 +87,7 @@ def test_v41_db_upgrades_cleanly(tmp_path): conn = duckdb.connect(str(db_path)) init_database(conn) v = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()[0] - assert v == 46 + assert v == 47 # All 7 new v41 tables exist after the v40→v41 upgrade tables = { row[0] @@ -119,7 +119,7 @@ def test_v30_db_ladders_all_the_way_up(tmp_path): conn = duckdb.connect(str(db_path)) init_database(conn) v = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()[0] - assert v == 46 + assert v == 47 cnt = conn.execute("SELECT COUNT(*) FROM audit_log WHERE id='vintage'").fetchone()[0] assert cnt == 1 # New v41 table exists diff --git a/tests/test_schema_v46_migration.py b/tests/test_schema_v46_migration.py index ba14c64..31133cb 100644 --- a/tests/test_schema_v46_migration.py +++ b/tests/test_schema_v46_migration.py @@ -14,7 +14,7 @@ from src.db import SCHEMA_VERSION, _ensure_schema, _v45_to_v46, get_schema_versi def test_schema_version_is_46(): - assert SCHEMA_VERSION == 46 + assert SCHEMA_VERSION == 47 def test_fresh_install_creates_dismissed_table(tmp_path):