Cuts release 0.23.0.
## Highlights
- Single-item Edit button on every memory item card (modal hits PATCH /api/memory/admin/{id}).
- MEMORY_DOMAIN RBAC resource type — admins grant user_groups access to specific domains via /admin/access. Composes with existing audience filter (OR semantics, no-op when no grants).
- ai: section editable in /admin/server-config — admins can set ANTHROPIC_API_KEY / model / provider / base_url for the corporate-memory extractor without editing instance.yaml directly. api_key auto-masked.
## Devin findings addressed
- Modal NULL→empty fix (so audience visibility no longer breaks).
- Stats endpoint granted_domains parity with list endpoint.
- Documented intentional MEMORY_DOMAIN→audience bypass.
- Documented conscious ai.base_url SSRF exclusion (legit internal LiteLLM/vLLM proxies).
See CHANGELOG [0.23.0] for full notes.
792 lines
31 KiB
Python
792 lines
31 KiB
Python
"""Repository for corporate memory knowledge items, votes, and contradictions."""
|
|
|
|
import json
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Optional, List, Dict
|
|
|
|
import duckdb
|
|
|
|
|
|
class KnowledgeRepository:
|
|
def __init__(self, conn: duckdb.DuckDBPyConnection):
    """Store the DuckDB connection that the repository's queries run on."""
    self.conn = conn
|
|
|
|
def _row_to_dict(self, row) -> Optional[Dict[str, Any]]:
    """Turn one fetched row into a ``{column_name: value}`` mapping.

    Column names are read from ``self.conn.description`` at call time.
    A falsy ``row`` (e.g. ``None`` from ``fetchone()``) yields ``None``.
    """
    if row:
        names = (col[0] for col in self.conn.description)
        return dict(zip(names, row))
    return None
|
|
|
|
def _rows_to_dicts(self, rows) -> List[Dict[str, Any]]:
    """Turn a list of fetched rows into ``{column_name: value}`` mappings.

    Column names are read from ``self.conn.description`` at call time.
    A falsy ``rows`` value yields an empty list.
    """
    if rows:
        names = [col[0] for col in self.conn.description]
        return [dict(zip(names, values)) for values in rows]
    return []
|
|
|
|
def get_by_id(self, item_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one knowledge item by id; ``None`` when no row matches."""
    cursor = self.conn.execute("SELECT * FROM knowledge_items WHERE id = ?", [item_id])
    return self._row_to_dict(cursor.fetchone())
|
|
|
|
def get_by_ids(self, item_ids: List[str]) -> Dict[str, Any]:
    """Load several items with a single ``IN (...)`` query.

    Returns a mapping of item id to item dict. Ids with no matching row are
    simply absent from the result; an empty ``item_ids`` short-circuits to
    ``{}`` without touching the database.
    """
    if not item_ids:
        return {}
    marks = ", ".join("?" * len(item_ids))
    fetched = self.conn.execute(
        f"SELECT * FROM knowledge_items WHERE id IN ({marks})",
        item_ids,
    ).fetchall()
    by_id: Dict[str, Any] = {}
    for record in self._rows_to_dicts(fetched):
        by_id[record["id"]] = record
    return by_id
|
|
|
|
def create(
    self,
    id: str,
    title: str,
    content: str,
    category: str,
    source_user: Optional[str] = None,
    tags: Optional[List[str]] = None,
    status: str = "pending",
    confidence: Optional[float] = None,
    domain: Optional[str] = None,
    entities: Optional[List[str]] = None,
    source_type: str = "claude_local_md",
    source_ref: Optional[str] = None,
    valid_from: Optional[datetime] = None,
    valid_until: Optional[datetime] = None,
    supersedes: Optional[str] = None,
    sensitivity: str = "internal",
    is_personal: bool = False,
) -> None:
    """Insert a new row into ``knowledge_items``.

    ``tags`` and ``entities`` are serialized to JSON text; a missing or
    empty list is stored as NULL. ``created_at`` and ``updated_at`` are both
    set to the same current-UTC timestamp.
    """
    timestamp = datetime.now(timezone.utc)
    tags_json = json.dumps(tags) if tags else None
    entities_json = json.dumps(entities) if entities else None
    # Order must match the column list in the INSERT statement below.
    row_values = [
        id, title, content, category, source_user,
        tags_json, status,
        confidence, domain,
        entities_json,
        source_type, source_ref,
        valid_from, valid_until, supersedes, sensitivity, is_personal,
        timestamp, timestamp,
    ]
    self.conn.execute(
        """INSERT INTO knowledge_items (
            id, title, content, category, source_user, tags, status,
            confidence, domain, entities, source_type, source_ref,
            valid_from, valid_until, supersedes, sensitivity, is_personal,
            created_at, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        row_values,
    )
|
|
|
|
# Whitelist of column names that ``update`` is allowed to write; keyword
# arguments outside this set are dropped before the UPDATE is built.
_UPDATABLE_FIELDS = {
    "title",
    "content",
    "category",
    "tags",
    "domain",
    "entities",
    "source_type",
    "source_ref",
    "source_user",
    "audience",
    "confidence",
    "status",
    "sensitivity",
    "is_personal",
    "valid_from",
    "valid_until",
    "supersedes",
}
|
|
|
|
def update(self, item_id: str, **fields) -> None:
    """Update whitelisted columns of one knowledge item.

    Keyword arguments whose name is not in ``_UPDATABLE_FIELDS`` are
    silently dropped. If nothing remains the call is a no-op and
    ``updated_at`` is left untouched; otherwise ``updated_at`` is set to the
    current UTC time.

    ``tags`` / ``entities`` passed as a list or tuple are JSON-encoded
    (empty -> NULL), the same way ``create`` stores them. Values that are
    already JSON strings, or ``None``, are bound unchanged.
    """
    safe = {k: v for k, v in fields.items() if k in self._UPDATABLE_FIELDS}
    if not safe:
        return
    # These columns hold JSON text; binding a raw Python list would not
    # produce the format create() writes and count_by_tag() reads.
    for json_column in ("tags", "entities"):
        value = safe.get(json_column)
        if isinstance(value, (list, tuple)):
            safe[json_column] = json.dumps(list(value)) if value else None
    now = datetime.now(timezone.utc)
    # Column names come only from the whitelist above, so interpolating
    # them into the SQL text is safe; values are always bound parameters.
    set_clause = ", ".join(f"{k} = ?" for k in safe)
    values = [*safe.values(), now, item_id]
    self.conn.execute(
        f"UPDATE knowledge_items SET {set_clause}, updated_at = ? WHERE id = ?",
        values,
    )
|
|
|
|
def update_status(self, item_id: str, status: str) -> None:
    """Set an item's ``status`` and bump ``updated_at`` to the current UTC time."""
    bound = [status, datetime.now(timezone.utc), item_id]
    self.conn.execute(
        "UPDATE knowledge_items SET status = ?, updated_at = ? WHERE id = ?",
        bound,
    )
|
|
|
|
def list_items(
    self,
    statuses: Optional[List[str]] = None,
    category: Optional[str] = None,
    domain: Optional[str] = None,
    source_type: Optional[str] = None,
    exclude_personal: bool = False,
    user_groups: Optional[List[str]] = None,
    granted_domains: Optional[List[str]] = None,
    limit: int = 100,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """Return a page of knowledge items, newest ``updated_at`` first.

    Each falsy filter argument is skipped. Visibility is only enforced when
    ``user_groups`` is not ``None``: an item is then visible if its
    ``audience`` is NULL, ``'all'``, or one of ``user_groups``, OR if its
    ``domain`` is one of ``granted_domains`` (MEMORY_DOMAIN grants). With no
    granted domains the domain branch is omitted, which preserves the
    pre-RBAC audience-only behaviour.
    """
    sql_parts = ["SELECT * FROM knowledge_items WHERE 1=1"]
    bound: List[Any] = []

    if statuses:
        marks = ", ".join("?" * len(statuses))
        sql_parts.append(f" AND status IN ({marks})")
        bound += statuses

    # Simple equality filters, applied in a fixed order so that the bound
    # parameters line up with the placeholders.
    equality_filters = (
        ("category", category),
        ("domain", domain),
        ("source_type", source_type),
    )
    for column, wanted in equality_filters:
        if wanted:
            sql_parts.append(f" AND {column} = ?")
            bound.append(wanted)

    if exclude_personal:
        sql_parts.append(" AND (is_personal = FALSE OR is_personal IS NULL)")

    if user_groups is not None:
        visible = ["audience IS NULL", "audience = 'all'"]
        for column, allowed in (("audience", user_groups), ("domain", granted_domains)):
            if allowed:
                marks = ", ".join("?" * len(allowed))
                visible.append(f"{column} IN ({marks})")
                bound += allowed
        sql_parts.append(" AND (" + " OR ".join(visible) + ")")

    sql_parts.append(" ORDER BY updated_at DESC LIMIT ? OFFSET ?")
    bound += [limit, offset]

    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return self._rows_to_dicts(fetched)
|
|
|
|
def search(
    self,
    query: str,
    exclude_personal: bool = False,
    user_groups: Optional[List[str]] = None,
    granted_domains: Optional[List[str]] = None,
    statuses: Optional[List[str]] = None,
    category: Optional[str] = None,
    domain: Optional[str] = None,
    source_type: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """Substring search over ``title`` and ``content`` via ``ILIKE``.

    ``query`` is wrapped as ``%query%``. The remaining filters and the
    visibility rule (audience match OR granted domain, enforced only when
    ``user_groups`` is not ``None``) behave as in ``list_items``. Results are
    ordered by ``updated_at`` descending and paged with ``limit``/``offset``.

    NOTE(review): ``%`` and ``_`` inside ``query`` are not escaped, so they
    act as LIKE wildcards.
    """
    like = f"%{query}%"
    sql_parts = [
        """SELECT * FROM knowledge_items
        WHERE (title ILIKE ? OR content ILIKE ?)"""
    ]
    bound: List[Any] = [like, like]

    if statuses:
        marks = ", ".join("?" * len(statuses))
        sql_parts.append(f" AND status IN ({marks})")
        bound += statuses

    for column, wanted in (("category", category), ("domain", domain), ("source_type", source_type)):
        if wanted:
            sql_parts.append(f" AND {column} = ?")
            bound.append(wanted)

    if exclude_personal:
        sql_parts.append(" AND (is_personal = FALSE OR is_personal IS NULL)")

    if user_groups is not None:
        visible = ["audience IS NULL", "audience = 'all'"]
        for column, allowed in (("audience", user_groups), ("domain", granted_domains)):
            if allowed:
                marks = ", ".join("?" * len(allowed))
                visible.append(f"{column} IN ({marks})")
                bound += allowed
        sql_parts.append(" AND (" + " OR ".join(visible) + ")")

    sql_parts.append(" ORDER BY updated_at DESC LIMIT ? OFFSET ?")
    bound += [limit, offset]

    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return self._rows_to_dicts(fetched)
|
|
|
|
def count_items(
    self,
    search: Optional[str] = None,
    statuses: Optional[List[str]] = None,
    category: Optional[str] = None,
    domain: Optional[str] = None,
    source_type: Optional[str] = None,
    exclude_personal: bool = False,
    user_groups: Optional[List[str]] = None,
    granted_domains: Optional[List[str]] = None,
) -> int:
    """Count items matching the same filters as ``list_items`` / ``search``.

    When ``search`` is truthy the count is restricted to rows whose title or
    content matches ``%search%`` (``ILIKE``); otherwise all rows are
    candidates. The visibility rule (audience match OR granted domain) is
    applied only when ``user_groups`` is not ``None``.
    """
    if search:
        like = f"%{search}%"
        sql_parts = ["SELECT COUNT(*) FROM knowledge_items WHERE (title ILIKE ? OR content ILIKE ?)"]
        bound: List[Any] = [like, like]
    else:
        sql_parts = ["SELECT COUNT(*) FROM knowledge_items WHERE 1=1"]
        bound = []

    if statuses:
        marks = ", ".join("?" * len(statuses))
        sql_parts.append(f" AND status IN ({marks})")
        bound += statuses

    for column, wanted in (("category", category), ("domain", domain), ("source_type", source_type)):
        if wanted:
            sql_parts.append(f" AND {column} = ?")
            bound.append(wanted)

    if exclude_personal:
        sql_parts.append(" AND (is_personal = FALSE OR is_personal IS NULL)")

    if user_groups is not None:
        visible = ["audience IS NULL", "audience = 'all'"]
        for column, allowed in (("audience", user_groups), ("domain", granted_domains)):
            if allowed:
                marks = ", ".join("?" * len(allowed))
                visible.append(f"{column} IN ({marks})")
                bound += allowed
        sql_parts.append(" AND (" + " OR ".join(visible) + ")")

    count_row = self.conn.execute("".join(sql_parts), bound).fetchone()
    return count_row[0]
|
|
|
|
def list_by_domain(
    self,
    domain: str,
    statuses: Optional[List[str]] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List items in one domain, newest ``updated_at`` first.

    ``statuses`` optionally restricts the result to the given status
    values; ``limit`` caps the number of rows. No visibility or personal
    filter is applied here.
    """
    sql_parts = ["SELECT * FROM knowledge_items WHERE domain = ?"]
    bound: List[Any] = [domain]
    if statuses:
        marks = ", ".join("?" * len(statuses))
        sql_parts.append(f" AND status IN ({marks})")
        bound += statuses
    sql_parts.append(" ORDER BY updated_at DESC LIMIT ?")
    bound.append(limit)
    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return self._rows_to_dicts(fetched)
|
|
|
|
def get_user_contributions(self, source_user: str) -> List[Dict[str, Any]]:
    """Return every item whose ``source_user`` matches, newest update first."""
    cursor = self.conn.execute(
        "SELECT * FROM knowledge_items WHERE source_user = ? ORDER BY updated_at DESC",
        [source_user],
    )
    return self._rows_to_dicts(cursor.fetchall())
|
|
|
|
def set_personal(self, item_id: str, is_personal: bool) -> None:
    """Set the ``is_personal`` flag on an item and refresh ``updated_at``."""
    bound = [is_personal, datetime.now(timezone.utc), item_id]
    self.conn.execute(
        "UPDATE knowledge_items SET is_personal = ?, updated_at = ? WHERE id = ?",
        bound,
    )
|
|
|
|
# --- Votes ---
|
|
|
|
def vote(self, item_id: str, user_id: str, vote: int) -> None:
    """Record a user's vote on an item (upsert).

    A second vote by the same user on the same item replaces the earlier
    vote value and its ``voted_at`` timestamp.
    """
    cast_at = datetime.now(timezone.utc)
    upsert_sql = """INSERT INTO knowledge_votes (item_id, user_id, vote, voted_at)
        VALUES (?, ?, ?, ?)
        ON CONFLICT (item_id, user_id) DO UPDATE SET vote = excluded.vote, voted_at = excluded.voted_at"""
    self.conn.execute(upsert_sql, [item_id, user_id, vote, cast_at])
|
|
|
|
def unvote(self, item_id: str, user_id: str) -> None:
    """Delete the given user's vote on the given item, if one exists."""
    key = [item_id, user_id]
    self.conn.execute(
        "DELETE FROM knowledge_votes WHERE item_id = ? AND user_id = ?",
        key,
    )
|
|
|
|
def get_votes(self, item_id: str) -> Dict[str, int]:
    """Tally votes for an item.

    Rows with ``vote > 0`` count as upvotes and rows with ``vote < 0`` as
    downvotes; a zero vote counts as neither. Both tallies are 0 when the
    item has no votes.
    """
    tally_sql = """SELECT
            COALESCE(SUM(CASE WHEN vote > 0 THEN 1 ELSE 0 END), 0) as upvotes,
            COALESCE(SUM(CASE WHEN vote < 0 THEN 1 ELSE 0 END), 0) as downvotes
        FROM knowledge_votes WHERE item_id = ?"""
    tally = self.conn.execute(tally_sql, [item_id]).fetchone()
    return {"upvotes": tally[0], "downvotes": tally[1]}
|
|
|
|
# --- Contradictions ---
|
|
|
|
def create_contradiction(
    self,
    item_a_id: str,
    item_b_id: str,
    explanation: str,
    severity: Optional[str] = None,
    suggested_resolution: Optional[Any] = None,
) -> str:
    """Insert a contradiction row and return its generated id.

    ``suggested_resolution`` is accepted either as a free-form string
    (legacy callers) or as a dict (the structured shape produced by Haiku,
    see ADR Decision 4). A dict is JSON-encoded into the existing TEXT
    column, so no schema migration is needed; readers decode it back.
    Any other value is bound as-is.

    The id has the form ``kc_`` + 12 hex characters of a random UUID.
    """
    stored_resolution: Optional[str] = (
        json.dumps(suggested_resolution)
        if isinstance(suggested_resolution, dict)
        else suggested_resolution
    )
    new_id = f"kc_{uuid.uuid4().hex[:12]}"
    row_values = [new_id, item_a_id, item_b_id, explanation, severity, stored_resolution]
    self.conn.execute(
        """INSERT INTO knowledge_contradictions (
            id, item_a_id, item_b_id, explanation, severity, suggested_resolution
        ) VALUES (?, ?, ?, ?, ?, ?)""",
        row_values,
    )
    return new_id
|
|
|
|
@staticmethod
def _decode_suggested_resolution(row: Dict[str, Any]) -> Dict[str, Any]:
    """Decode a JSON-object ``suggested_resolution`` in place.

    Only string values whose first non-whitespace character is ``{`` are
    parsed. Anything else (legacy plain strings, ``None``, text that fails
    to parse) is left untouched. The same ``row`` object is returned.
    """
    raw = row.get("suggested_resolution")
    looks_like_json_object = isinstance(raw, str) and raw.lstrip().startswith("{")
    if not looks_like_json_object:
        return row
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not valid JSON after all - keep the legacy string as stored.
        return row
    row["suggested_resolution"] = decoded
    return row
|
|
|
|
def list_contradictions(
    self,
    resolved: Optional[bool] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List contradictions, most recently detected first.

    ``resolved`` filters on the ``resolved`` column when it is ``True`` or
    ``False``; ``None`` returns both. JSON ``suggested_resolution`` values
    are decoded to dicts on the way out.
    """
    sql_parts = ["SELECT * FROM knowledge_contradictions WHERE 1=1"]
    bound: List[Any] = []
    if resolved is not None:
        sql_parts.append(" AND resolved = ?")
        bound.append(resolved)
    sql_parts.append(" ORDER BY detected_at DESC LIMIT ?")
    bound.append(limit)
    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return [
        self._decode_suggested_resolution(record)
        for record in self._rows_to_dicts(fetched)
    ]
|
|
|
|
def resolve_contradiction(
    self,
    contradiction_id: str,
    resolved_by: str,
    resolution: str,
) -> None:
    """Mark a contradiction resolved, recording who, when (UTC) and how."""
    resolved_at = datetime.now(timezone.utc)
    resolve_sql = """UPDATE knowledge_contradictions
        SET resolved = TRUE, resolved_by = ?, resolved_at = ?, resolution = ?
        WHERE id = ?"""
    self.conn.execute(resolve_sql, [resolved_by, resolved_at, resolution, contradiction_id])
|
|
|
|
def get_contradiction(self, contradiction_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one contradiction by id, decoding a JSON ``suggested_resolution``.

    Returns ``None`` when no row matches.
    """
    cursor = self.conn.execute(
        "SELECT * FROM knowledge_contradictions WHERE id = ?",
        [contradiction_id],
    )
    record = self._row_to_dict(cursor.fetchone())
    if record is None:
        return record
    # The decoder mutates the dict in place.
    self._decode_suggested_resolution(record)
    return record
|
|
|
|
# --- Verification Evidence ---
|
|
|
|
def create_evidence(
    self,
    item_id: str,
    source_user: Optional[str] = None,
    source_ref: Optional[str] = None,
    detection_type: Optional[str] = None,
    user_quote: Optional[str] = None,
) -> str:
    """Insert one verification-evidence row for an item and return its id.

    An item is expected to accumulate several evidence rows - one per
    analyst confirmation or correction. ``user_quote`` and
    ``detection_type`` are kept as the raw signal for future Bayesian
    re-calibration.

    The id has the form ``ev_`` + 12 hex characters of a random UUID.
    """
    new_id = "ev_" + uuid.uuid4().hex[:12]
    row_values = [new_id, item_id, source_user, source_ref, detection_type, user_quote]
    self.conn.execute(
        """INSERT INTO verification_evidence (
            id, item_id, source_user, source_ref, detection_type, user_quote
        ) VALUES (?, ?, ?, ?, ?, ?)""",
        row_values,
    )
    return new_id
|
|
|
|
def list_evidence(self, item_id: str) -> List[Dict[str, Any]]:
    """Return all evidence rows for an item, oldest ``created_at`` first."""
    evidence_sql = """SELECT * FROM verification_evidence
        WHERE item_id = ?
        ORDER BY created_at ASC"""
    fetched = self.conn.execute(evidence_sql, [item_id]).fetchall()
    return self._rows_to_dicts(fetched)
|
|
|
|
# --- Session Extraction State ---
|
|
|
|
def mark_session_processed(
    self,
    session_file: str,
    username: str,
    items_extracted: int = 0,
    file_hash: Optional[str] = None,
) -> None:
    """Record that a session file has been processed (upsert).

    If a row for ``session_file`` already exists, its ``processed_at``,
    ``items_extracted`` and ``file_hash`` are overwritten; the stored
    ``username`` is not part of the conflict update and stays as it was.
    """
    processed_at = datetime.now(timezone.utc)
    upsert_sql = """INSERT INTO session_extraction_state (session_file, username, processed_at, items_extracted, file_hash)
        VALUES (?, ?, ?, ?, ?)
        ON CONFLICT (session_file) DO UPDATE
        SET processed_at = excluded.processed_at,
            items_extracted = excluded.items_extracted,
            file_hash = excluded.file_hash"""
    self.conn.execute(
        upsert_sql,
        [session_file, username, processed_at, items_extracted, file_hash],
    )
|
|
|
|
def is_session_processed(self, session_file: str) -> bool:
    """Return ``True`` if a state row exists for ``session_file``."""
    hit = self.conn.execute(
        "SELECT 1 FROM session_extraction_state WHERE session_file = ?",
        [session_file],
    ).fetchone()
    return hit is not None
|
|
|
|
# --- Item relations (duplicate-candidate hints, etc.) ---
|
|
|
|
@staticmethod
def _canonical_pair(a: str, b: str) -> tuple[str, str]:
    """Order a pair of ids as (smaller, larger) so each unordered pair has one key."""
    if b < a:
        return (b, a)
    return (a, b)
|
|
|
|
def create_relation(
    self,
    item_a_id: str,
    item_b_id: str,
    relation_type: str,
    score: Optional[float] = None,
) -> None:
    """Insert a relation between two distinct items; repeat calls are no-ops.

    The pair is stored in canonical (min, max) order, so calling with the
    arguments reversed targets the same primary key and ``ON CONFLICT DO
    NOTHING`` keeps the existing row (including its original ``score``).

    Raises:
        ValueError: if ``item_a_id`` equals ``item_b_id``.
    """
    if item_a_id == item_b_id:
        raise ValueError("Cannot create relation between an item and itself")
    low_id, high_id = self._canonical_pair(item_a_id, item_b_id)
    insert_sql = """INSERT INTO knowledge_item_relations
            (item_a_id, item_b_id, relation_type, score)
        VALUES (?, ?, ?, ?)
        ON CONFLICT (item_a_id, item_b_id, relation_type) DO NOTHING"""
    self.conn.execute(insert_sql, [low_id, high_id, relation_type, score])
|
|
|
|
def list_relations(
    self,
    relation_type: Optional[str] = None,
    resolved: Optional[bool] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List relation rows, newest ``created_at`` first.

    ``relation_type`` and ``resolved`` each add an equality filter when
    they are not ``None`` (so ``resolved=False`` does filter).
    """
    sql_parts = ["SELECT * FROM knowledge_item_relations WHERE 1=1"]
    bound: List[Any] = []
    for column, wanted in (("relation_type", relation_type), ("resolved", resolved)):
        if wanted is not None:
            sql_parts.append(f" AND {column} = ?")
            bound.append(wanted)
    sql_parts.append(" ORDER BY created_at DESC LIMIT ?")
    bound.append(limit)
    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return self._rows_to_dicts(fetched)
|
|
|
|
def resolve_relation(
    self,
    item_a_id: str,
    item_b_id: str,
    relation_type: str,
    resolved_by: str,
    resolution: str,
) -> int:
    """Mark one relation row resolved.

    Returns 1 when the row existed and was updated, 0 when no such row was
    found. The pair is canonicalized first, so argument order is irrelevant.
    """
    low_id, high_id = self._canonical_pair(item_a_id, item_b_id)
    resolved_at = datetime.now(timezone.utc)
    pair_key = [low_id, high_id, relation_type]

    # DuckDB doesn't expose UPDATE rowcount via the cursor API uniformly,
    # so probe for the row first; callers can then map 0 to a 404.
    found = self.conn.execute(
        """SELECT 1 FROM knowledge_item_relations
        WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?""",
        pair_key,
    ).fetchone()
    if not found:
        return 0

    self.conn.execute(
        """UPDATE knowledge_item_relations
        SET resolved = TRUE,
            resolved_by = ?,
            resolved_at = ?,
            resolution = ?
        WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?""",
        [resolved_by, resolved_at, resolution, *pair_key],
    )
    return 1
|
|
|
|
def get_relation(
    self,
    item_a_id: str,
    item_b_id: str,
    relation_type: str,
) -> Optional[Dict[str, Any]]:
    """Fetch the relation row for an unordered item pair and type, or ``None``."""
    low_id, high_id = self._canonical_pair(item_a_id, item_b_id)
    lookup_sql = """SELECT * FROM knowledge_item_relations
        WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?"""
    found = self.conn.execute(lookup_sql, [low_id, high_id, relation_type]).fetchone()
    return self._row_to_dict(found)
|
|
|
|
def find_duplicate_candidates_by_entities(
    self,
    new_item_id: str,
    entities: Optional[List[str]],
    domain: Optional[str],
    min_overlap: int,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """Find same-domain items sharing at least ``min_overlap`` entities.

    Each returned item dict is augmented with ``overlap_count`` (size of
    the entity intersection) and ``jaccard`` (intersection / union).

    Personal items are excluded (privacy boundary - see the ADR Decision 1
    precedent in ``find_contradiction_candidates``), as is ``new_item_id``
    itself. Domain is a hard SQL conjunct: with no ``domain`` or no
    ``entities`` the result is empty, matching the verification-detector
    skip-empty contract. The overlap is computed in Python because DuckDB
    lacks a portable JSON intersection helper; SQL narrows the candidate
    set to the domain (capped at ``limit`` most recently updated rows)
    before the Python pass.
    """
    if not entities or not domain:
        return []

    wanted = set(entities)
    candidate_sql = """
        SELECT * FROM knowledge_items
        WHERE status IN ('approved', 'mandatory', 'pending')
          AND (is_personal = FALSE OR is_personal IS NULL)
          AND domain = ?
          AND id != ?
          AND entities IS NOT NULL
        ORDER BY updated_at DESC
        LIMIT ?
    """
    fetched = self.conn.execute(candidate_sql, [domain, new_item_id, limit]).fetchall()

    matches: List[Dict[str, Any]] = []
    for candidate in self._rows_to_dicts(fetched):
        stored = candidate.get("entities")
        if isinstance(stored, str):
            try:
                stored = json.loads(stored)
            except json.JSONDecodeError:
                continue  # unreadable entity list - cannot compare
        if not (isinstance(stored, list) and stored):
            continue
        theirs = set(stored)
        shared = len(wanted & theirs)
        if shared < min_overlap:
            continue
        combined = len(wanted | theirs)
        candidate["overlap_count"] = shared
        candidate["jaccard"] = shared / combined if combined else 0.0
        matches.append(candidate)
    return matches
|
|
|
|
# --- Bulk update + tag/audience aggregations (issue #62) ---
|
|
|
|
def bulk_update(
    self,
    item_ids: List[str],
    updates: Dict[str, Any],
) -> Dict[str, str]:
    """Apply ``updates`` to every id in ``item_ids``, tolerating per-item failure.

    ``updates`` may contain any ``_UPDATABLE_FIELDS`` key plus the
    bulk-only ``tags_add`` / ``tags_remove`` lists. A non-``None`` ``tags``
    value is a hard set (same semantics as ``update``) and takes precedence
    over add/remove; otherwise add/remove are merged with each item's
    existing tags so callers don't have to fetch first.

    Returns a map of item id to ``"updated"``, ``"not_found"`` or
    ``"error: <message>"``. An item for which there is nothing to change is
    reported as ``"updated"``.
    """
    outcome: Dict[str, str] = {}
    if not item_ids:
        return outcome

    shared_fields = {
        name: value
        for name, value in updates.items()
        if name != "tags" and name in self._UPDATABLE_FIELDS
    }
    # ``None`` here means "no hard set requested" (key absent or null).
    explicit_tags = updates.get("tags")
    tags_add = updates.get("tags_add") or []
    tags_remove = updates.get("tags_remove") or []

    def merged_tags(item: Dict[str, Any]) -> List[Any]:
        """Existing tags of ``item`` with ``tags_add`` appended and ``tags_remove`` dropped."""
        current = item.get("tags") or []
        if isinstance(current, str):
            try:
                current = json.loads(current)
            except json.JSONDecodeError:
                current = []
        if not isinstance(current, list):
            current = []
        combined = list(current)
        for tag in tags_add:
            if tag not in combined:
                combined.append(tag)
        if tags_remove:
            unwanted = set(tags_remove)
            combined = [tag for tag in combined if tag not in unwanted]
        return combined

    for item_id in item_ids:
        try:
            item = self.get_by_id(item_id)
            if not item:
                outcome[item_id] = "not_found"
                continue

            changes: Dict[str, Any] = dict(shared_fields)
            if explicit_tags is not None:
                changes["tags"] = explicit_tags
            elif tags_add or tags_remove:
                changes["tags"] = merged_tags(item)

            if not changes:
                outcome[item_id] = "updated"  # nothing to do, treat as success
                continue

            # JSON-encode list columns before handing off to update()
            # (mirrors create(): an empty list is stored as NULL).
            if "tags" in changes:
                changes["tags"] = json.dumps(changes["tags"]) if changes["tags"] else None
            if "entities" in changes and isinstance(changes["entities"], list):
                changes["entities"] = json.dumps(changes["entities"]) if changes["entities"] else None

            self.update(item_id, **changes)
            outcome[item_id] = "updated"
        except Exception as e:  # pragma: no cover - defensive
            outcome[item_id] = f"error: {e}"
    return outcome
|
|
|
|
def count_by_tag(
    self,
    exclude_personal: bool = False,
    user_groups: Optional[List[str]] = None,
    granted_domains: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Aggregate item counts per tag (one tag may belong to many items).

    Uses DuckDB ``json_each`` to unnest the JSON tag list; items with no
    tags don't contribute. The visibility filter mirrors ``count_items``
    (audience match OR MEMORY_DOMAIN grant) and is applied only when
    ``user_groups`` is not ``None``.

    Returns a ``{tag: count}`` dict ordered by count, descending. Tag values
    that arrive as quoted JSON strings are JSON-decoded, so escaped quotes,
    backslashes and ``\\uXXXX`` sequences come back as the original text.
    """
    where = ["tags IS NOT NULL"]
    params: List[Any] = []
    if exclude_personal:
        where.append("(is_personal = FALSE OR is_personal IS NULL)")
    if user_groups is not None:
        visibility = ["audience IS NULL", "audience = 'all'"]
        if user_groups:
            ph = ",".join(["?"] * len(user_groups))
            visibility.append(f"audience IN ({ph})")
            params.extend(user_groups)
        if granted_domains:
            dph = ",".join(["?"] * len(granted_domains))
            visibility.append(f"domain IN ({dph})")
            params.extend(granted_domains)
        where.append("(" + " OR ".join(visibility) + ")")
    where_sql = " WHERE " + " AND ".join(where)
    sql = (
        "SELECT t.value AS tag, COUNT(*) AS cnt "
        "FROM knowledge_items, json_each(knowledge_items.tags) AS t "
        f"{where_sql} "
        "GROUP BY t.value ORDER BY cnt DESC"
    )
    rows = self.conn.execute(sql, params).fetchall()

    out: Dict[str, int] = {}
    for tag, cnt in rows:
        key = tag if isinstance(tag, str) else str(tag)
        # json_each yields the raw JSON scalar, so string tags arrive
        # wrapped in quotes with JSON escapes. Decode them properly instead
        # of slicing; the length guard keeps a lone '"' from matching both
        # ends and collapsing to an empty key.
        if len(key) >= 2 and key.startswith('"') and key.endswith('"'):
            try:
                decoded = json.loads(key)
            except json.JSONDecodeError:
                decoded = None
            # Fall back to plain quote-stripping if it wasn't valid JSON.
            key = decoded if isinstance(decoded, str) else key[1:-1]
        # Sum rather than overwrite in case two raw values decode to the
        # same tag text.
        out[key] = out.get(key, 0) + cnt
    return out
|
|
|
|
def count_by_audience(
    self,
    exclude_personal: bool = False,
    user_groups: Optional[List[str]] = None,
    granted_domains: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Aggregate item counts per audience bucket.

    ``audience`` is a free-form column whose canonical values are ``NULL``,
    ``'all'`` and ``'group:<name>'``. NULL is folded into the ``'all'``
    bucket so the chip-filter UI doesn't need a separate "no audience"
    affordance. The visibility filter mirrors ``count_items`` (audience
    match OR MEMORY_DOMAIN grant) and is applied only when ``user_groups``
    is not ``None``.
    """
    conditions: List[str] = []
    bound: List[Any] = []

    if exclude_personal:
        conditions.append("(is_personal = FALSE OR is_personal IS NULL)")

    if user_groups is not None:
        visible = ["audience IS NULL", "audience = 'all'"]
        for column, allowed in (("audience", user_groups), ("domain", granted_domains)):
            if allowed:
                marks = ",".join("?" * len(allowed))
                visible.append(f"{column} IN ({marks})")
                bound += allowed
        conditions.append("(" + " OR ".join(visible) + ")")

    where_sql = ""
    if conditions:
        where_sql = " WHERE " + " AND ".join(conditions)

    sql = (
        "SELECT COALESCE(audience, 'all') AS aud, COUNT(*) AS cnt "
        f"FROM knowledge_items{where_sql} "
        "GROUP BY aud ORDER BY cnt DESC"
    )
    counts: Dict[str, int] = {}
    for bucket in self.conn.execute(sql, bound).fetchall():
        counts[bucket[0]] = bucket[1]
    return counts
|
|
|
|
def find_contradiction_candidates(
    self,
    new_item_id: str,
    domain: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """Return candidate items for LLM-based contradiction judgment.

    The only narrowing done in SQL is by status (approved / mandatory /
    pending), exclusion of ``new_item_id`` itself, and - when ``domain`` is
    truthy - an exact domain match. Topic / content matching is delegated
    to the LLM judge in
    ``services.corporate_memory.contradiction.find_and_judge()`` (ADR
    Decision 4). The keyword-substring layer that used to live here was
    removed: it had recall holes (synonyms, paraphrases) and the domain
    conjunct alone is enough as a hard ACL.

    Personal items (``is_personal = TRUE``) are excluded unconditionally.
    The LLM call is a read site that sends content to an external API, so
    ADR Decision 1 ("hard privacy boundary, not a UI hint") applies.
    Without this filter, personal content would be serialized into every
    contradiction prompt and could be paraphrased into
    ``knowledge_contradictions.suggested_resolution.merged_content``,
    bypassing the contributor-only visibility rule.

    Results are the ``limit`` most recently updated matches.
    """
    sql_parts = [
        """
        SELECT * FROM knowledge_items
        WHERE status IN ('approved', 'mandatory', 'pending')
          AND (is_personal = FALSE OR is_personal IS NULL)
          AND id != ?
        """
    ]
    bound: List[Any] = [new_item_id]
    if domain:
        sql_parts.append(" AND domain = ?")
        bound.append(domain)
    sql_parts.append(" ORDER BY updated_at DESC LIMIT ?")
    bound.append(limit)
    fetched = self.conn.execute("".join(sql_parts), bound).fetchall()
    return self._rows_to_dicts(fetched)
|