* fix(security): RBAC filter for agnes_sessions matches both email local-part and user_id
The upload API (POST /api/upload/sessions) stores session files under
user_sessions/{user_id}/ (UUID), while the session collector uses the
OS username (email local-part). The session pipeline writes the directory
name verbatim into usage_session_summary.username, so the column can
contain either value depending on the ingestion path.
The RBAC filter in build_filter_clause previously only matched the email
local-part, missing sessions uploaded via the API. The fix adds an OR
condition so non-admin users see rows where username matches either their
email local-part or their user_id.
Closes #293
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(security): RBAC filter uses stable user_id instead of mutable email local-part
Closes #293
Previous fix used OR condition matching both email local-part and user_id
in the username column. This was fragile: email changes would break
filtering. This commit introduces a dedicated user_id column populated
by the session pipeline via resolve_user_id(), and switches the RBAC
filter to use it exclusively.
Changes:
- Schema v45: add user_id column to usage_session_summary and usage_events
- UsageProcessor: accept and store user_id in both tables
- runner.py: resolve_user_id() maps directory name to users.id UUID
(exact match for UUID dirs, email LIKE for local-part dirs)
- INTERNAL_TABLES: agnes_sessions/agnes_telemetry filter on user_id column
- build_filter_clause: simplified to WHERE user_id = '<uuid>' (no OR)
- me.py/admin_user_sessions.py: query by user_id OR username for
backward compatibility during transition
- USAGE_PROCESSOR_VERSION bumped 2→3 to trigger reprocessing/backfill
- Tests updated: 27 pass including new email-change resilience test
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(tests): bump schema version assertions 44→45
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(docs): correct resolve_user_id docstring, add TypeError comment
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(security): address review — backward-compat OR, LIKE escaping, narrower TypeError
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(security): address code review — eliminate TypeError hack, add resolve_user_id tests
Co-Authored-By: zdenek.srotyr <zdenek.srotyr@keboola.com>
* fix(db): create user_id indexes in _v44_to_v45, not _SYSTEM_SCHEMA
_SYSTEM_SCHEMA runs before the migration ladder. On an upgrade from
v42/v43/v44, usage_events / usage_session_summary already exist without
the user_id column (CREATE TABLE IF NOT EXISTS is a no-op), so the
CREATE INDEX ... (user_id) lines in _SYSTEM_SCHEMA failed to bind and
aborted _ensure_schema — the app would not start post-upgrade. Move the
index creation to _v44_to_v45, which ADDs the column first. Same pattern
as the v41 audit_log indices.
* fix(usage): bump USAGE_PROCESSOR_VERSION 3→4 for user_id backfill
#303 shipped USAGE_PROCESSOR_VERSION=3 (release 0.54.12) for its
<command-name> slash extraction. This PR's 2→3 bump collided with it
on rebase, so the reprocess loop would not re-trigger to backfill the
new user_id column on deployments already running v3. Bump to 4.
* release: 0.54.13 — RBAC filter uses stable user_id (#293)
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
178 lines
6.9 KiB
Python
178 lines
6.9 KiB
Python
"""VerificationProcessor — first plugin of the session-pipeline framework.
|
|
|
|
Wraps the body of the pre-refactor `verification_detector.detector.run()`
|
|
inner loop so the LLM extraction + persist behavior is unchanged after the
|
|
framework refactor. Tests in `tests/test_corporate_memory_v1.py` are the
|
|
regression contract.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import duckdb
|
|
|
|
from connectors.llm import StructuredExtractor
|
|
from connectors.llm.exceptions import LLMError
|
|
from services.corporate_memory import contradiction as contradiction_module
|
|
from services.corporate_memory.confidence import compute_confidence
|
|
from services.session_pipeline.contract import ProcessorResult
|
|
from services.session_pipeline.lib import parse_jsonl
|
|
from services.verification_detector.duplicates import _record_duplicate_candidates
|
|
from services.verification_detector.detector import (
|
|
_generate_id,
|
|
extract_verifications,
|
|
)
|
|
from src.repositories.knowledge import KnowledgeRepository
|
|
|
|
# Module-level logger, named after this module so its output can be filtered
# per component through the standard logging hierarchy.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class VerificationProcessor:
    """Session-pipeline processor that persists verifications found in a session.

    For each session transcript it asks the extractor for verifications,
    stores every new one as a pending knowledge item together with an
    evidence row, and then runs duplicate-candidate and contradiction
    detection on the freshly created item. Verifications whose id already
    exists only contribute an additional evidence row.
    """

    # Identifier of this processor.
    name: str = "verification"
    # Cadence in minutes — presumably how often the pipeline schedules this
    # processor; confirm against the session-pipeline framework.
    cadence_minutes: int = 15

    def __init__(self, extractor: StructuredExtractor):
        """Store the extractor used for extraction and contradiction checks."""
        self.extractor = extractor

    def process_session(
        self,
        session_path: Path,
        username: str,
        session_key: str,
        conn: duckdb.DuckDBPyConnection,
        **kwargs: object,
    ) -> ProcessorResult:
        """Extract verifications from one session file and persist them.

        Args:
            session_path: JSONL session file; its stem becomes part of the
                source reference recorded on items and evidence.
            username: Recorded as ``source_user`` on items and evidence.
            session_key: Only used in log messages.
            conn: DuckDB connection handed to ``KnowledgeRepository``.
            **kwargs: Accepted and ignored.

        Returns:
            ``ProcessorResult`` whose ``items_count`` is the number of newly
            created items (0 for an empty session). Evidence rows added to
            already-existing items are not counted.
        """
        knowledge = KnowledgeRepository(conn)
        source_ref = f"session-{session_path.stem}-{username}"

        transcript = parse_jsonl(session_path)
        if not transcript:
            logger.info("Empty session: %s", session_key)
            return ProcessorResult(items_count=0)

        findings = extract_verifications(self.extractor, username, source_ref, transcript)

        created = 0
        for finding in findings:
            item_id = _generate_id(finding["title"], finding["content"])

            if knowledge.get_by_id(item_id):
                # Same (title, content) hash as a stored item: another
                # verification event for an already-known fact. ADR
                # Decision 3 wants one evidence row per distinct event, so
                # the evidence is still written even though item creation
                # and the contradiction path are skipped. Dropping it would
                # lose this verifier's user_quote / detection_type and keep
                # the "additional verifiers" boost from accumulating.
                logger.info("Duplicate item — recording evidence on existing: %s", item_id)
                knowledge.create_evidence(
                    item_id=item_id,
                    source_user=username,
                    source_ref=source_ref,
                    detection_type=finding.get("detection_type"),
                    user_quote=finding.get("user_quote"),
                )
                continue

            # Confidence is derived in code from (source_type, detection_type);
            # the LLM does not get to rate its own credibility — see Q3 in
            # docs/pd-ps-comments.md and the ADR.
            kind = finding.get("detection_type")
            try:
                score = compute_confidence("user_verification", kind)
            except ValueError:
                # The LLM returned a detection_type the lookup does not know;
                # use the lookup's "confirmation" entry as the default.
                score = compute_confidence("user_verification", "confirmation")

            knowledge.create(
                id=item_id,
                title=finding["title"],
                content=finding["content"],
                category="business_logic",
                source_user=username,
                tags=finding.get("entities", []),
                status="pending",
                confidence=score,
                domain=finding.get("domain"),
                entities=finding.get("entities"),
                source_type="user_verification",
                source_ref=source_ref,
                sensitivity="internal",
            )
            # Keep the raw signal (user_quote + detection_type) as evidence;
            # Bayesian re-calibration will need it later (Q3).
            knowledge.create_evidence(
                item_id=item_id,
                source_user=username,
                source_ref=source_ref,
                detection_type=kind,
                user_quote=finding.get("user_quote"),
            )
            created += 1

            # Duplicate-candidate hints (issue #62): heuristic-only, no LLM
            # call, and strictly best-effort — a failure here is logged and
            # never aborts the session.
            try:
                stored = knowledge.get_by_id(item_id)
                if stored is not None:
                    _record_duplicate_candidates(knowledge, stored)
            except Exception as err:
                logger.warning("Duplicate-candidate detection failed for %s: %s", item_id, err)

            # Contradiction detection is best-effort as well: a failing LLM
            # judge is logged and processing moves on to the next item.
            try:
                stored = knowledge.get_by_id(item_id)
                if stored is not None:
                    contradiction_module.detect_and_record(self.extractor, stored, knowledge)
            except LLMError as err:
                logger.warning("Contradiction check failed for %s: %s", item_id, err)
            except Exception as err:
                logger.warning(
                    "Unexpected error during contradiction check for %s: %s", item_id, err
                )

        logger.info(
            "Processed %s: %d verifications, %d items created",
            session_key,
            len(findings),
            created,
        )
        return ProcessorResult(items_count=created)
|
|
|
|
|
|
def build_verification_processor() -> VerificationProcessor:
    """Factory that constructs the LLM extractor from instance config + env.

    The ``ai`` section of the instance config is passed to
    ``create_extractor_from_env_or_config`` when it can be read. If the
    config cannot be loaded for any reason, ``None`` is passed instead and
    the reason is logged rather than silently discarded.

    Mirrors the pattern in services/verification_detector/__main__.py and
    app/api/admin.py:run_verification_detector — both built the extractor
    lazily at call time.

    Returns:
        A ``VerificationProcessor`` wrapping the newly created extractor.

    Raises:
        Whatever ``create_extractor_from_env_or_config`` raises — that call
        is deliberately not guarded, so this raises if the LLM isn't
        configured.
    """
    from connectors.llm import create_extractor_from_env_or_config

    # Reading the instance config is best-effort: any failure leaves
    # ai_config as None and the extractor factory decides what to do.
    ai_config = None
    try:
        from app.instance_config import load_instance_config

        try:
            config = load_instance_config()
        except (ValueError, FileNotFoundError):
            # Missing or invalid config file: treated as "no config".
            config = {}
        ai_config = config.get("ai") if config else None
    except ImportError as exc:
        # The app package (or one of its dependencies) is not importable
        # in this context; continue without instance config.
        logger.debug("Instance config not importable, continuing without it: %s", exc)
    except Exception:
        # Still best-effort, but an unexpected failure should be visible
        # to operators instead of vanishing without a trace.
        logger.warning(
            "Could not read instance config; continuing without the 'ai' section",
            exc_info=True,
        )

    extractor = create_extractor_from_env_or_config(ai_config)
    return VerificationProcessor(extractor=extractor)
|