"""LLM-side helpers for the verification detector. After the session-pipeline refactor, the orchestration loop (scan unprocessed → parse jsonl → mark processed) lives in services/session_pipeline/, and the per-session persistence flow lives in services/session_processors/verification.py (VerificationProcessor). This module retains only the pieces specific to LLM extraction — prompt formatting, the structured-output call, and the deterministic-id helper — which both the new processor and the legacy __main__.py CLI shim still import. """ import hashlib import logging from connectors.llm import StructuredExtractor from connectors.llm.exceptions import LLMError from .prompts import VERIFICATION_EXTRACT_PROMPT from .schemas import VERIFICATION_SCHEMA logger = logging.getLogger(__name__) MAX_TURNS_PER_SESSION = 100 def _generate_id(title: str, content: str) -> str: """Generate deterministic ID from title + content (same pattern as corporate memory collector).""" raw = f"{title}:{content}" return "kv_" + hashlib.sha256(raw.encode()).hexdigest()[:12] def _format_turns(turns: list[dict]) -> str: """Format conversation turns as a parseable, prompt-injection-hardened block. Session transcripts are heavily user-influenced (anything the analyst typed lands here). Each turn is wrapped in `` tags with `` neutralized inside the content so a crafted message cannot break out of the wrapper. The trust-boundary instruction in VERIFICATION_EXTRACT_PROMPT tells the LLM to treat content inside `` as data, not directives. """ lines: list[str] = [] for turn in turns: role = turn.get("role", "unknown") content = (turn.get("content") or "").replace("", "</turn>") lines.append(f'{content}') return "\n".join(lines) def extract_verifications( extractor: StructuredExtractor, username: str, session_id: str, turns: list[dict], max_turns: int = MAX_TURNS_PER_SESSION, ) -> list[dict]: """Send conversation turns to LLM for verification detection.""" if not turns: return [] # Truncate to last N turns if too long if len(turns) > max_turns: turns = turns[-max_turns:] conversation_text = _format_turns(turns) prompt = VERIFICATION_EXTRACT_PROMPT.format( username=username, session_id=session_id, conversation=conversation_text, ) try: result = extractor.extract_json( prompt=prompt, max_tokens=4096, json_schema=VERIFICATION_SCHEMA, schema_name="verification_extract", ) return result.get("verifications", []) except LLMError as e: logger.error("LLM extraction failed for session %s: %s", session_id, e) return []