agnes-the-ai-analyst/src/repositories/knowledge.py
ZdenekSrotyr 70672204fe
feat(memory): admin Edit + MEMORY_DOMAIN RBAC + ai-section UI (#141)
Cuts release 0.23.0.

## Highlights
- Single-item Edit button on every memory item card (modal hits PATCH /api/memory/admin/{id}).
- MEMORY_DOMAIN RBAC resource type — admins grant user_groups access to specific domains via /admin/access. Composes with existing audience filter (OR semantics, no-op when no grants).
- ai: section editable in /admin/server-config — admins can set ANTHROPIC_API_KEY / model / provider / base_url for the corporate-memory extractor without editing instance.yaml directly. api_key auto-masked.

## Devin findings addressed
- Modal NULL→empty fix (audience visibility wouldn't break).
- Stats endpoint granted_domains parity with list endpoint.
- Documented intentional MEMORY_DOMAIN→audience bypass.
- Documented conscious ai.base_url SSRF exclusion (legit internal LiteLLM/vLLM proxies).

See CHANGELOG [0.23.0] for full notes.
2026-04-30 11:04:41 +02:00

792 lines
31 KiB
Python

"""Repository for corporate memory knowledge items, votes, and contradictions."""
import json
import uuid
from datetime import datetime, timezone
from typing import Any, Optional, List, Dict
import duckdb
class KnowledgeRepository:
def __init__(self, conn: duckdb.DuckDBPyConnection):
    """Bind the repository to a DuckDB connection.

    The connection is stored as ``self.conn``; every other method issues
    its SQL through it.
    """
    self.conn = conn
def _row_to_dict(self, row) -> Optional[Dict[str, Any]]:
if not row:
return None
columns = [desc[0] for desc in self.conn.description]
return dict(zip(columns, row))
def _rows_to_dicts(self, rows) -> List[Dict[str, Any]]:
if not rows:
return []
columns = [desc[0] for desc in self.conn.description]
return [dict(zip(columns, row)) for row in rows]
def get_by_id(self, item_id: str) -> Optional[Dict[str, Any]]:
result = self.conn.execute("SELECT * FROM knowledge_items WHERE id = ?", [item_id]).fetchone()
return self._row_to_dict(result)
def get_by_ids(self, item_ids: List[str]) -> Dict[str, Any]:
"""Fetch multiple items by ID in one query. Returns dict keyed by id."""
if not item_ids:
return {}
placeholders = ", ".join("?" for _ in item_ids)
rows = self.conn.execute(
f"SELECT * FROM knowledge_items WHERE id IN ({placeholders})",
item_ids,
).fetchall()
items = self._rows_to_dicts(rows)
return {item["id"]: item for item in items}
def create(
self,
id: str,
title: str,
content: str,
category: str,
source_user: Optional[str] = None,
tags: Optional[List[str]] = None,
status: str = "pending",
confidence: Optional[float] = None,
domain: Optional[str] = None,
entities: Optional[List[str]] = None,
source_type: str = "claude_local_md",
source_ref: Optional[str] = None,
valid_from: Optional[datetime] = None,
valid_until: Optional[datetime] = None,
supersedes: Optional[str] = None,
sensitivity: str = "internal",
is_personal: bool = False,
) -> None:
now = datetime.now(timezone.utc)
self.conn.execute(
"""INSERT INTO knowledge_items (
id, title, content, category, source_user, tags, status,
confidence, domain, entities, source_type, source_ref,
valid_from, valid_until, supersedes, sensitivity, is_personal,
created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
[
id, title, content, category, source_user,
json.dumps(tags) if tags else None, status,
confidence, domain,
json.dumps(entities) if entities else None,
source_type, source_ref,
valid_from, valid_until, supersedes, sensitivity, is_personal,
now, now,
],
)
# Whitelist of column names that ``update`` is allowed to write. Keyword
# arguments outside this set are dropped by ``update``. The names are
# interpolated into the UPDATE statement text, so only trusted column
# identifiers may ever be added here.
_UPDATABLE_FIELDS = {
    "title", "content", "category", "tags", "domain", "entities",
    "source_type", "source_ref", "source_user", "audience",
    "confidence", "status", "sensitivity", "is_personal",
    "valid_from", "valid_until", "supersedes",
}
def update(self, item_id: str, **fields) -> None:
safe = {k: v for k, v in fields.items() if k in self._UPDATABLE_FIELDS}
if not safe:
return
now = datetime.now(timezone.utc)
set_clause = ", ".join(f"{k} = ?" for k in safe)
values = list(safe.values()) + [now, item_id]
self.conn.execute(
f"UPDATE knowledge_items SET {set_clause}, updated_at = ? WHERE id = ?",
values,
)
def update_status(self, item_id: str, status: str) -> None:
    """Set an item's ``status`` and refresh ``updated_at`` to the current UTC time."""
    changed_at = datetime.now(timezone.utc)
    statement = "UPDATE knowledge_items SET status = ?, updated_at = ? WHERE id = ?"
    self.conn.execute(statement, [status, changed_at, item_id])
def list_items(
self,
statuses: Optional[List[str]] = None,
category: Optional[str] = None,
domain: Optional[str] = None,
source_type: Optional[str] = None,
exclude_personal: bool = False,
user_groups: Optional[List[str]] = None,
granted_domains: Optional[List[str]] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
query = "SELECT * FROM knowledge_items WHERE 1=1"
params: List[Any] = []
if statuses:
placeholders = ", ".join("?" for _ in statuses)
query += f" AND status IN ({placeholders})"
params.extend(statuses)
if category:
query += " AND category = ?"
params.append(category)
if domain:
query += " AND domain = ?"
params.append(domain)
if source_type:
query += " AND source_type = ?"
params.append(source_type)
if exclude_personal:
query += " AND (is_personal = FALSE OR is_personal IS NULL)"
if user_groups is not None:
# Visibility: audience-string match (null/all/group:X) OR
# caller has been granted access to the item's domain via
# resource_grants (MEMORY_DOMAIN). When ``granted_domains`` is
# falsy the OR clause collapses, preserving pre-RBAC behaviour.
visibility_clauses = ["audience IS NULL", "audience = 'all'"]
if user_groups:
audience_placeholders = ", ".join("?" for _ in user_groups)
visibility_clauses.append(f"audience IN ({audience_placeholders})")
params.extend(user_groups)
if granted_domains:
domain_placeholders = ", ".join("?" for _ in granted_domains)
visibility_clauses.append(f"domain IN ({domain_placeholders})")
params.extend(granted_domains)
query += " AND (" + " OR ".join(visibility_clauses) + ")"
query += " ORDER BY updated_at DESC LIMIT ? OFFSET ?"
params.extend([limit, offset])
return self._rows_to_dicts(self.conn.execute(query, params).fetchall())
def search(
self,
query: str,
exclude_personal: bool = False,
user_groups: Optional[List[str]] = None,
granted_domains: Optional[List[str]] = None,
statuses: Optional[List[str]] = None,
category: Optional[str] = None,
domain: Optional[str] = None,
source_type: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
pattern = f"%{query}%"
sql = """SELECT * FROM knowledge_items
WHERE (title ILIKE ? OR content ILIKE ?)"""
params: List[Any] = [pattern, pattern]
if statuses:
placeholders = ", ".join("?" for _ in statuses)
sql += f" AND status IN ({placeholders})"
params.extend(statuses)
if category:
sql += " AND category = ?"
params.append(category)
if domain:
sql += " AND domain = ?"
params.append(domain)
if source_type:
sql += " AND source_type = ?"
params.append(source_type)
if exclude_personal:
sql += " AND (is_personal = FALSE OR is_personal IS NULL)"
if user_groups is not None:
visibility_clauses = ["audience IS NULL", "audience = 'all'"]
if user_groups:
audience_placeholders = ", ".join("?" for _ in user_groups)
visibility_clauses.append(f"audience IN ({audience_placeholders})")
params.extend(user_groups)
if granted_domains:
domain_placeholders = ", ".join("?" for _ in granted_domains)
visibility_clauses.append(f"domain IN ({domain_placeholders})")
params.extend(granted_domains)
sql += " AND (" + " OR ".join(visibility_clauses) + ")"
sql += " ORDER BY updated_at DESC LIMIT ? OFFSET ?"
params.extend([limit, offset])
results = self.conn.execute(sql, params).fetchall()
return self._rows_to_dicts(results)
def count_items(
self,
search: Optional[str] = None,
statuses: Optional[List[str]] = None,
category: Optional[str] = None,
domain: Optional[str] = None,
source_type: Optional[str] = None,
exclude_personal: bool = False,
user_groups: Optional[List[str]] = None,
granted_domains: Optional[List[str]] = None,
) -> int:
if search:
pattern = f"%{search}%"
sql = "SELECT COUNT(*) FROM knowledge_items WHERE (title ILIKE ? OR content ILIKE ?)"
params: List[Any] = [pattern, pattern]
else:
sql = "SELECT COUNT(*) FROM knowledge_items WHERE 1=1"
params = []
if statuses:
placeholders = ", ".join("?" for _ in statuses)
sql += f" AND status IN ({placeholders})"
params.extend(statuses)
if category:
sql += " AND category = ?"
params.append(category)
if domain:
sql += " AND domain = ?"
params.append(domain)
if source_type:
sql += " AND source_type = ?"
params.append(source_type)
if exclude_personal:
sql += " AND (is_personal = FALSE OR is_personal IS NULL)"
if user_groups is not None:
visibility_clauses = ["audience IS NULL", "audience = 'all'"]
if user_groups:
audience_placeholders = ", ".join("?" for _ in user_groups)
visibility_clauses.append(f"audience IN ({audience_placeholders})")
params.extend(user_groups)
if granted_domains:
domain_placeholders = ", ".join("?" for _ in granted_domains)
visibility_clauses.append(f"domain IN ({domain_placeholders})")
params.extend(granted_domains)
sql += " AND (" + " OR ".join(visibility_clauses) + ")"
return self.conn.execute(sql, params).fetchone()[0]
def list_by_domain(
self,
domain: str,
statuses: Optional[List[str]] = None,
limit: int = 100,
) -> List[Dict[str, Any]]:
query = "SELECT * FROM knowledge_items WHERE domain = ?"
params: List[Any] = [domain]
if statuses:
placeholders = ", ".join("?" for _ in statuses)
query += f" AND status IN ({placeholders})"
params.extend(statuses)
query += " ORDER BY updated_at DESC LIMIT ?"
params.append(limit)
return self._rows_to_dicts(self.conn.execute(query, params).fetchall())
def get_user_contributions(self, source_user: str) -> List[Dict[str, Any]]:
results = self.conn.execute(
"SELECT * FROM knowledge_items WHERE source_user = ? ORDER BY updated_at DESC",
[source_user],
).fetchall()
return self._rows_to_dicts(results)
def set_personal(self, item_id: str, is_personal: bool) -> None:
    """Set an item's ``is_personal`` flag and refresh ``updated_at`` (UTC)."""
    changed_at = datetime.now(timezone.utc)
    statement = "UPDATE knowledge_items SET is_personal = ?, updated_at = ? WHERE id = ?"
    self.conn.execute(statement, [is_personal, changed_at, item_id])
# --- Votes ---
def vote(self, item_id: str, user_id: str, vote: int) -> None:
    """Record a user's vote on an item, replacing any earlier vote.

    The statement is an upsert on ``(item_id, user_id)``: a repeated vote
    overwrites both the stored value and ``voted_at``.
    """
    upsert_sql = """INSERT INTO knowledge_votes (item_id, user_id, vote, voted_at)
VALUES (?, ?, ?, ?)
ON CONFLICT (item_id, user_id) DO UPDATE SET vote = excluded.vote, voted_at = excluded.voted_at"""
    voted_at = datetime.now(timezone.utc)
    self.conn.execute(upsert_sql, [item_id, user_id, vote, voted_at])
def unvote(self, item_id: str, user_id: str) -> None:
    """Delete the vote this user cast on this item, if one exists."""
    statement = "DELETE FROM knowledge_votes WHERE item_id = ? AND user_id = ?"
    self.conn.execute(statement, [item_id, user_id])
def get_votes(self, item_id: str) -> Dict[str, int]:
result = self.conn.execute(
"""SELECT
COALESCE(SUM(CASE WHEN vote > 0 THEN 1 ELSE 0 END), 0) as upvotes,
COALESCE(SUM(CASE WHEN vote < 0 THEN 1 ELSE 0 END), 0) as downvotes
FROM knowledge_votes WHERE item_id = ?""",
[item_id],
).fetchone()
return {"upvotes": result[0], "downvotes": result[1]}
# --- Contradictions ---
def create_contradiction(
self,
item_a_id: str,
item_b_id: str,
explanation: str,
severity: Optional[str] = None,
suggested_resolution: Optional[Any] = None,
) -> str:
"""Persist a contradiction.
``suggested_resolution`` may be either a free-form string (legacy
callers) or a dict (the structured shape produced by Haiku — see ADR
Decision 4). Dicts are JSON-encoded into the existing TEXT column so
no schema migration is needed; the read side decodes back to dict.
"""
if isinstance(suggested_resolution, dict):
suggested_resolution_db: Optional[str] = json.dumps(suggested_resolution)
else:
suggested_resolution_db = suggested_resolution
contradiction_id = f"kc_{uuid.uuid4().hex[:12]}"
self.conn.execute(
"""INSERT INTO knowledge_contradictions (
id, item_a_id, item_b_id, explanation, severity, suggested_resolution
) VALUES (?, ?, ?, ?, ?, ?)""",
[contradiction_id, item_a_id, item_b_id, explanation, severity, suggested_resolution_db],
)
return contradiction_id
@staticmethod
def _decode_suggested_resolution(row: Dict[str, Any]) -> Dict[str, Any]:
"""If the stored suggested_resolution is JSON, decode it to a dict.
Plain strings (legacy rows) are returned unchanged.
"""
raw = row.get("suggested_resolution")
if isinstance(raw, str) and raw and raw.lstrip().startswith("{"):
try:
row["suggested_resolution"] = json.loads(raw)
except json.JSONDecodeError:
pass
return row
def list_contradictions(
self,
resolved: Optional[bool] = None,
limit: int = 100,
) -> List[Dict[str, Any]]:
query = "SELECT * FROM knowledge_contradictions WHERE 1=1"
params: List[Any] = []
if resolved is not None:
query += " AND resolved = ?"
params.append(resolved)
query += " ORDER BY detected_at DESC LIMIT ?"
params.append(limit)
rows = self._rows_to_dicts(self.conn.execute(query, params).fetchall())
return [self._decode_suggested_resolution(r) for r in rows]
def resolve_contradiction(
    self,
    contradiction_id: str,
    resolved_by: str,
    resolution: str,
) -> None:
    """Mark a contradiction resolved.

    Sets ``resolved`` to TRUE and stores who resolved it, the current UTC
    time and the resolution text. Nothing is returned, so a missing id is
    not reported.
    """
    resolved_at = datetime.now(timezone.utc)
    update_sql = """UPDATE knowledge_contradictions
SET resolved = TRUE, resolved_by = ?, resolved_at = ?, resolution = ?
WHERE id = ?"""
    self.conn.execute(
        update_sql,
        [resolved_by, resolved_at, resolution, contradiction_id],
    )
def get_contradiction(self, contradiction_id: str) -> Optional[Dict[str, Any]]:
result = self.conn.execute(
"SELECT * FROM knowledge_contradictions WHERE id = ?",
[contradiction_id],
).fetchone()
row = self._row_to_dict(result)
if row is not None:
self._decode_suggested_resolution(row)
return row
# --- Verification Evidence ---
def create_evidence(
self,
item_id: str,
source_user: Optional[str] = None,
source_ref: Optional[str] = None,
detection_type: Optional[str] = None,
user_quote: Optional[str] = None,
) -> str:
"""Persist one verification evidence row for a knowledge item.
Multiple evidence rows per item are expected — each new analyst
confirmation/correction adds one. user_quote and detection_type are the
raw signal future Bayesian re-calibration consumes.
"""
evidence_id = f"ev_{uuid.uuid4().hex[:12]}"
self.conn.execute(
"""INSERT INTO verification_evidence (
id, item_id, source_user, source_ref, detection_type, user_quote
) VALUES (?, ?, ?, ?, ?, ?)""",
[evidence_id, item_id, source_user, source_ref, detection_type, user_quote],
)
return evidence_id
def list_evidence(self, item_id: str) -> List[Dict[str, Any]]:
results = self.conn.execute(
"""SELECT * FROM verification_evidence
WHERE item_id = ?
ORDER BY created_at ASC""",
[item_id],
).fetchall()
return self._rows_to_dicts(results)
# --- Session Extraction State ---
def mark_session_processed(
    self,
    session_file: str,
    username: str,
    items_extracted: int = 0,
    file_hash: Optional[str] = None,
) -> None:
    """Upsert the extraction state for one session file.

    On a conflict on ``session_file`` the existing row keeps its
    ``username``; only ``processed_at``, ``items_extracted`` and
    ``file_hash`` are overwritten.
    """
    processed_at = datetime.now(timezone.utc)
    upsert_sql = """INSERT INTO session_extraction_state (session_file, username, processed_at, items_extracted, file_hash)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT (session_file) DO UPDATE
SET processed_at = excluded.processed_at,
items_extracted = excluded.items_extracted,
file_hash = excluded.file_hash"""
    self.conn.execute(
        upsert_sql,
        [session_file, username, processed_at, items_extracted, file_hash],
    )
def is_session_processed(self, session_file: str) -> bool:
result = self.conn.execute(
"SELECT 1 FROM session_extraction_state WHERE session_file = ?",
[session_file],
).fetchone()
return result is not None
# --- Item relations (duplicate-candidate hints, etc.) ---
@staticmethod
def _canonical_pair(a: str, b: str) -> tuple[str, str]:
"""Return (min(a,b), max(a,b)) — every unordered pair maps to one row."""
return (a, b) if a <= b else (b, a)
def create_relation(
self,
item_a_id: str,
item_b_id: str,
relation_type: str,
score: Optional[float] = None,
) -> None:
"""Persist a relation row. Idempotent on (item_a_id, item_b_id, relation_type).
The PK is canonicalized to (min, max) so duplicate calls with reversed
arguments don't create a second row. Self-relations (a == b) are
rejected — a pair must reference two distinct items.
"""
if item_a_id == item_b_id:
raise ValueError("Cannot create relation between an item and itself")
a, b = self._canonical_pair(item_a_id, item_b_id)
self.conn.execute(
"""INSERT INTO knowledge_item_relations
(item_a_id, item_b_id, relation_type, score)
VALUES (?, ?, ?, ?)
ON CONFLICT (item_a_id, item_b_id, relation_type) DO NOTHING""",
[a, b, relation_type, score],
)
def list_relations(
self,
relation_type: Optional[str] = None,
resolved: Optional[bool] = None,
limit: int = 100,
) -> List[Dict[str, Any]]:
sql = "SELECT * FROM knowledge_item_relations WHERE 1=1"
params: List[Any] = []
if relation_type is not None:
sql += " AND relation_type = ?"
params.append(relation_type)
if resolved is not None:
sql += " AND resolved = ?"
params.append(resolved)
sql += " ORDER BY created_at DESC LIMIT ?"
params.append(limit)
return self._rows_to_dicts(self.conn.execute(sql, params).fetchall())
def resolve_relation(
self,
item_a_id: str,
item_b_id: str,
relation_type: str,
resolved_by: str,
resolution: str,
) -> int:
"""Mark a relation row resolved. Returns rowcount (0 if not found)."""
a, b = self._canonical_pair(item_a_id, item_b_id)
now = datetime.now(timezone.utc)
# DuckDB doesn't expose UPDATE rowcount via the cursor API uniformly;
# do an existence check first so callers can report 404 vs success.
existing = self.conn.execute(
"""SELECT 1 FROM knowledge_item_relations
WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?""",
[a, b, relation_type],
).fetchone()
if not existing:
return 0
self.conn.execute(
"""UPDATE knowledge_item_relations
SET resolved = TRUE,
resolved_by = ?,
resolved_at = ?,
resolution = ?
WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?""",
[resolved_by, now, resolution, a, b, relation_type],
)
return 1
def get_relation(
self,
item_a_id: str,
item_b_id: str,
relation_type: str,
) -> Optional[Dict[str, Any]]:
a, b = self._canonical_pair(item_a_id, item_b_id)
result = self.conn.execute(
"""SELECT * FROM knowledge_item_relations
WHERE item_a_id = ? AND item_b_id = ? AND relation_type = ?""",
[a, b, relation_type],
).fetchone()
return self._row_to_dict(result)
def find_duplicate_candidates_by_entities(
self,
new_item_id: str,
entities: Optional[List[str]],
domain: Optional[str],
min_overlap: int,
limit: int = 100,
) -> List[Dict[str, Any]]:
"""Same-domain candidates whose ``entities`` set shares >= ``min_overlap``
members with ``entities``.
Personal items are excluded (privacy boundary — see ADR Decision 1
precedent in ``find_contradiction_candidates``). Self-id is excluded.
Domain is a hard SQL conjunct: a NULL-domain item produces no
candidates (matches the verification-detector skip-empty contract).
Jaccard is computed in Python because DuckDB lacks a portable JSON
intersection helper; the SQL layer trims the candidate set to the
same domain so the Python loop scales linearly with that.
"""
if not entities or not domain:
return []
new_set = set(entities)
sql = """
SELECT * FROM knowledge_items
WHERE status IN ('approved', 'mandatory', 'pending')
AND (is_personal = FALSE OR is_personal IS NULL)
AND domain = ?
AND id != ?
AND entities IS NOT NULL
ORDER BY updated_at DESC
LIMIT ?
"""
rows = self._rows_to_dicts(
self.conn.execute(sql, [domain, new_item_id, limit]).fetchall()
)
out: List[Dict[str, Any]] = []
for row in rows:
cand_entities = row.get("entities")
if isinstance(cand_entities, str):
try:
cand_entities = json.loads(cand_entities)
except json.JSONDecodeError:
continue
if not isinstance(cand_entities, list) or not cand_entities:
continue
cand_set = set(cand_entities)
overlap = new_set & cand_set
if len(overlap) < min_overlap:
continue
union = new_set | cand_set
jaccard = len(overlap) / len(union) if union else 0.0
row["overlap_count"] = len(overlap)
row["jaccard"] = jaccard
out.append(row)
return out
# --- Bulk update + tag/audience aggregations (issue #62) ---
def bulk_update(
self,
item_ids: List[str],
updates: Dict[str, Any],
) -> Dict[str, str]:
"""Apply ``updates`` to each item id; partial-failure tolerant.
``updates`` may include the standard ``_UPDATABLE_FIELDS`` keys plus
the bulk-only ``tags_add`` / ``tags_remove`` lists. Tag mutations are
merged with the item's existing tags so callers don't have to fetch
first. Returns a per-id status map: ``"updated"`` / ``"not_found"`` /
an error message.
"""
results: Dict[str, str] = {}
if not item_ids:
return results
plain_fields = {
k: v for k, v in updates.items()
if k in self._UPDATABLE_FIELDS and k != "tags"
}
# If the caller passed an explicit ``tags`` list, treat it as a hard
# set (same semantics as repo.update). Add/remove are applied per item.
explicit_tags = updates.get("tags") if "tags" in updates else None
tags_add = updates.get("tags_add") or []
tags_remove = updates.get("tags_remove") or []
for item_id in item_ids:
try:
item = self.get_by_id(item_id)
if not item:
results[item_id] = "not_found"
continue
per_item: Dict[str, Any] = dict(plain_fields)
if explicit_tags is not None:
per_item["tags"] = explicit_tags
elif tags_add or tags_remove:
existing = item.get("tags") or []
if isinstance(existing, str):
try:
existing = json.loads(existing)
except json.JSONDecodeError:
existing = []
if not isinstance(existing, list):
existing = []
new_tags = list(existing)
for t in tags_add:
if t not in new_tags:
new_tags.append(t)
if tags_remove:
rm = set(tags_remove)
new_tags = [t for t in new_tags if t not in rm]
per_item["tags"] = new_tags
if not per_item:
results[item_id] = "updated" # nothing to do, treat as success
continue
# JSON-encode tags before passing to .update (mirrors create()).
if "tags" in per_item:
per_item["tags"] = (
json.dumps(per_item["tags"]) if per_item["tags"] else None
)
if "entities" in per_item and isinstance(per_item["entities"], list):
per_item["entities"] = json.dumps(per_item["entities"]) if per_item["entities"] else None
self.update(item_id, **per_item)
results[item_id] = "updated"
except Exception as e: # pragma: no cover - defensive
results[item_id] = f"error: {e}"
return results
def count_by_tag(
self,
exclude_personal: bool = False,
user_groups: Optional[List[str]] = None,
granted_domains: Optional[List[str]] = None,
) -> Dict[str, int]:
"""Aggregate item counts per tag (one tag may belong to many items).
Uses DuckDB ``json_each`` to unnest the JSON tag list. Items with no
tags don't contribute. Visibility filter mirrors ``count_items``
(audience OR MEMORY_DOMAIN grant).
"""
where = ["tags IS NOT NULL"]
params: List[Any] = []
if exclude_personal:
where.append("(is_personal = FALSE OR is_personal IS NULL)")
if user_groups is not None:
visibility = ["audience IS NULL", "audience = 'all'"]
if user_groups:
ph = ",".join(["?"] * len(user_groups))
visibility.append(f"audience IN ({ph})")
params.extend(user_groups)
if granted_domains:
dph = ",".join(["?"] * len(granted_domains))
visibility.append(f"domain IN ({dph})")
params.extend(granted_domains)
where.append("(" + " OR ".join(visibility) + ")")
where_sql = " WHERE " + " AND ".join(where)
sql = (
"SELECT t.value AS tag, COUNT(*) AS cnt "
"FROM knowledge_items, json_each(knowledge_items.tags) AS t "
f"{where_sql} "
"GROUP BY t.value ORDER BY cnt DESC"
)
rows = self.conn.execute(sql, params).fetchall()
out: Dict[str, int] = {}
for tag, cnt in rows:
# json_each returns the raw scalar; strip wrapping quotes if needed.
key = tag if isinstance(tag, str) else str(tag)
if key.startswith('"') and key.endswith('"'):
key = key[1:-1]
out[key] = cnt
return out
def count_by_audience(
self,
exclude_personal: bool = False,
user_groups: Optional[List[str]] = None,
granted_domains: Optional[List[str]] = None,
) -> Dict[str, int]:
"""Aggregate item counts per audience bucket.
``audience`` is a free-form column whose canonical values are
``NULL`` / ``'all'`` / ``'group:<name>'``. NULL is bucketed as
``'all'`` so the chip-filter UI doesn't need a separate "no audience"
affordance. Visibility filter mirrors ``count_items`` (audience OR
MEMORY_DOMAIN grant).
"""
where: List[str] = []
params: List[Any] = []
if exclude_personal:
where.append("(is_personal = FALSE OR is_personal IS NULL)")
if user_groups is not None:
visibility = ["audience IS NULL", "audience = 'all'"]
if user_groups:
ph = ",".join(["?"] * len(user_groups))
visibility.append(f"audience IN ({ph})")
params.extend(user_groups)
if granted_domains:
dph = ",".join(["?"] * len(granted_domains))
visibility.append(f"domain IN ({dph})")
params.extend(granted_domains)
where.append("(" + " OR ".join(visibility) + ")")
where_sql = (" WHERE " + " AND ".join(where)) if where else ""
sql = (
"SELECT COALESCE(audience, 'all') AS aud, COUNT(*) AS cnt "
f"FROM knowledge_items{where_sql} "
"GROUP BY aud ORDER BY cnt DESC"
)
rows = self.conn.execute(sql, params).fetchall()
return {r[0]: r[1] for r in rows}
def find_contradiction_candidates(
self,
new_item_id: str,
domain: Optional[str] = None,
limit: int = 100,
) -> List[Dict[str, Any]]:
"""Same-domain candidates for LLM-based contradiction judgment.
Domain is the only narrowing applied at the SQL layer. Topic / content
matching is delegated to the LLM judge in
services.corporate_memory.contradiction.find_and_judge() — see ADR
Decision 4. The brittle keyword-substring layer that used to live here
was removed; it had recall holes (synonyms, paraphrases) and the
domain conjunct alone is enough as a hard ACL.
Personal items (`is_personal = TRUE`) are excluded unconditionally —
the LLM call is a read site (and exfiltrates content to the external
API), so ADR Decision 1 ("hard privacy boundary, not a UI hint")
applies. Without this filter, personal item content would be
serialized into every contradiction prompt and could be paraphrased
into `knowledge_contradictions.suggested_resolution.merged_content`
— bypassing the contributor-only visibility rule.
"""
sql = """
SELECT * FROM knowledge_items
WHERE status IN ('approved', 'mandatory', 'pending')
AND (is_personal = FALSE OR is_personal IS NULL)
AND id != ?
"""
params: List[Any] = [new_item_id]
if domain:
sql += " AND domain = ?"
params.append(domain)
sql += " ORDER BY updated_at DESC LIMIT ?"
params.append(limit)
return self._rows_to_dicts(self.conn.execute(sql, params).fetchall())