* Extract session pipeline framework, refactor verification, add UsageProcessor skeleton Pluggable framework under services/session_pipeline/ (contract + lib + per-processor runner) so multiple processors can read /data/user_sessions/<key>/*.jsonl on their own cadence with full failure isolation. Verification flow becomes the first plugin; a no-op UsageProcessor reserves the second slot pending a separate brainstorm on extraction logic + storage shape. Schema v28→v29: rename session_extraction_state → session_processor_state with composite PK (processor_name, session_file). Existing rows copied over with processor_name='verification'; legacy table dropped. Migration is idempotent and no-ops the copy step on fresh installs that came up at the new schema. Endpoint: /api/admin/run-verification-detector replaced by parametrized /api/admin/run-session-processor?processor=<name>. Audit action format follows. Scheduler JOBS: verification-detector entry split into session-processor:verification + session-processor:usage. SCHEDULER_VERIFICATION_DETECTOR_INTERVAL retained for operator compatibility (drives both cadence and health-check grace window); SCHEDULER_USAGE_PROCESSOR_INTERVAL added. * Address PR #232 review: scan dead branch + per-processor lock - `SessionProcessorStateRepository.scan_unprocessed_for` dead else: both branches surfaced every jsonl, the SELECT was unused, runner MD5-rehashed every stable session per tick. Replaced with an mtime precheck — stable sessions (mtime <= processed_at) are filtered at scan; modified files still surface for the runner's authoritative `file_hash` invalidation. Naive-local comparison matches the existing health-check idiom (DuckDB TIMESTAMP strips tz on storage). - Per-processor advisory lock around `_run_processor` in `/api/admin/run-session-processor`. 
Scheduler tick + manual admin POST could otherwise both run, both call create_evidence on overlapping detections, and accumulate duplicate verification_evidence rows (the dedup short-circuit only covers create+contradiction, not evidence per ADR Decision 3). Non-blocking acquire → 409 Conflict on concurrent invocation; release in finally so a runner exception doesn't wedge the processor. Tests: two new scan unit tests (mtime filter + post-mark mtime bump), 409 endpoint test, lock-released-on-exception test. Two existing tests updated for the new "filtered at scan" stat shape (previously asserted skipped == 1, now scanned == 0). * Address PR #232 review #2: parallel scheduler tick + last_run on terminal state Two pre-existing scaffold bugs in services/scheduler/__main__.py amplified by adding more session-pipeline jobs: 1. Serial for-loop over jobs with synchronous httpx.post(timeout=900) — a 10-minute verification run blocked every other job (data-refresh, health-check, usage, corporate-memory) for the whole window. The PR's stated isolation guarantee held inside the runner but broke at the scheduler dispatch layer. 2. last_run advanced only when _call_api returned True. Permanent-failure jobs hot-looped on every tick (30s) instead of cadence (15min). Fix: ThreadPoolExecutor.submit per due job + per-job in_flight set so a long-running job can't be re-launched on subsequent ticks. last_run advances unconditionally in finally; errors still surface via _call_api logging + audit_log on the receiving side. _run_job extracted to module-level for unit testing. New tests: - TestRunJobBookkeeping: advances on success / failure / unhandled raise - TestRunLoopParallelism: in_flight protection prevents duplicate launches across ticks for a single slow job --------- Co-authored-by: Minas Arustamyan <arustamyan.minas@gmail.com>
138 lines
6.1 KiB
Python
138 lines
6.1 KiB
Python
"""Repository for session_processor_state — per-(processor, session) bookkeeping
for the session pipeline framework (services/session_pipeline/).

Composite PK (processor_name, session_file) lets each processor track its own
processed-set independently. file_hash invalidates the row when a session jsonl
grows (Claude Code appending live to an active session) so processors reprocess
the new content rather than treating the first hash as final.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import duckdb
|
|
|
|
|
|
class SessionProcessorStateRepository:
    """Bookkeeping over the ``session_processor_state`` table.

    Rows are keyed by (processor_name, session_file), so each processor
    tracks its own processed-set independently. Every method runs its SQL
    on the connection supplied at construction time.
    """

    def __init__(self, conn: "duckdb.DuckDBPyConnection"):
        """Keep *conn*; every query in this repository executes on it."""
        self.conn = conn

    def is_processed(
        self,
        processor_name: str,
        session_file: str,
        file_hash: str,
    ) -> bool:
        """Tell whether *session_file* is up to date for *processor_name*.

        Args:
            processor_name: Processor whose state row is looked up.
            session_file: Session key stored in the table.
            file_hash: Hash of the file's current content.

        Returns:
            True iff a state row exists for (processor_name, session_file)
            AND its stored file_hash equals *file_hash*. A missing row or a
            hash mismatch (e.g. the session jsonl grew since the last run)
            is reported as unprocessed so the processor reprocesses it.
        """
        row = self.conn.execute(
            """SELECT file_hash FROM session_processor_state
               WHERE processor_name = ? AND session_file = ?""",
            [processor_name, session_file],
        ).fetchone()
        return row is not None and row[0] == file_hash

    def mark_processed(
        self,
        processor_name: str,
        session_file: str,
        username: str,
        items_count: int,
        file_hash: str,
    ) -> None:
        """UPSERT the state row for (processor_name, session_file).

        An existing row has processed_at, items_extracted, file_hash and
        username overwritten; processed_at is set to the current time.

        Args:
            processor_name: Processor that handled the session.
            session_file: Session key stored in the table.
            username: Owner recorded on the row.
            items_count: Value written to ``items_extracted``.
            file_hash: Hash of the content that was processed.
        """
        # Written tz-aware (UTC). scan_unprocessed_for's comparison relies on
        # DuckDB storing this as a naive local-time TIMESTAMP.
        now = datetime.now(timezone.utc)
        self.conn.execute(
            """INSERT INTO session_processor_state
               (processor_name, session_file, username, processed_at, items_extracted, file_hash)
               VALUES (?, ?, ?, ?, ?, ?)
               ON CONFLICT (processor_name, session_file) DO UPDATE
               SET processed_at = excluded.processed_at,
                   items_extracted = excluded.items_extracted,
                   file_hash = excluded.file_hash,
                   username = excluded.username""",
            [processor_name, session_file, username, now, items_count, file_hash],
        )

    def scan_unprocessed_for(
        self,
        processor_name: str,
        session_dir: Path,
    ) -> list[tuple[str, Path]]:
        """List the session files this processor needs to (re)process.

        Walks ``session_dir/<username>/*.jsonl`` and returns the files that
        have no state row, or whose mtime is newer than the stored
        processed_at (file modified since last run — likely a live-append
        from an active Claude Code session).

        The mtime precheck is a cheap stat-only optimization: stable
        sessions (mtime <= processed_at) are skipped without reading the
        file. Files that survive the precheck still go through the runner's
        per-file ``is_processed(file_hash)`` check for authoritative
        hash-based invalidation. Without this filter the runner would
        MD5-rehash every stable session on every scheduler tick.

        Args:
            processor_name: Processor whose state rows are consulted.
            session_dir: Root directory holding one sub-directory per user.

        Returns:
            ``(username, jsonl_path)`` pairs, ordered by user directory and
            then by file path. Empty when *session_dir* does not exist.
        """
        if not session_dir.exists():
            return []

        # One query per scan, not per file. Fetching processed_at (not
        # file_hash) because mtime is the cheap precheck — the file_hash
        # compare lives in the runner, where the IO cost of hashing is
        # already being paid.
        rows = self.conn.execute(
            """SELECT session_file, processed_at FROM session_processor_state
               WHERE processor_name = ?""",
            [processor_name],
        ).fetchall()
        known: dict[str, Optional[datetime]] = {
            session_file: processed_at for session_file, processed_at in rows
        }

        results: list[tuple[str, Path]] = []
        # Sorted so the scan order does not depend on the OS's directory
        # listing order.
        for user_dir in sorted(session_dir.iterdir()):
            if not user_dir.is_dir():
                continue
            username = user_dir.name
            for jsonl_file in sorted(user_dir.glob("*.jsonl")):
                # State rows are looked up as "<username>/<file name>".
                key = f"{username}/{jsonl_file.name}"
                # known.get() yields None both for "no state row" and for a
                # row without processed_at; either way the file is surfaced.
                if self._modified_since(jsonl_file, known.get(key)):
                    results.append((username, jsonl_file))
                # else: stable session, skip without hashing.
        return results

    @staticmethod
    def _modified_since(jsonl_file: Path, processed_at: Optional[datetime]) -> bool:
        """Return True when *jsonl_file* must be surfaced to the runner.

        That is the case when there is no usable processed_at, when the
        file cannot be stat'ed, or when its mtime is strictly newer than
        *processed_at*.
        """
        if processed_at is None:
            # No state row, or (defensively) a row without processed_at —
            # mark_processed always sets it, but surface the file if not.
            return True
        try:
            mtime_epoch = jsonl_file.stat().st_mtime
        except OSError:
            # Stat failure: surface for the runner — it'll fail the hash
            # compute next and report a clean error in stats rather than
            # the file being silently dropped here.
            return True
        # Compare in naive-local: DuckDB TIMESTAMP strips tz on storage and
        # converts tz-aware writes to local time before storing (see
        # app/api/health.py:_check_session_pipeline for the same idiom).
        # `datetime.fromtimestamp(epoch)` without `tz=` returns naive-local.
        if processed_at.tzinfo is not None:
            # Convert to local time BEFORE dropping tzinfo. A bare
            # replace(tzinfo=None) keeps the wall-clock fields of whatever
            # zone the value carries, skewing the comparison by the UTC
            # offset for any non-local aware value.
            processed_at = processed_at.astimezone().replace(tzinfo=None)
        # NOTE(review): a file appended after the runner hashed it but
        # before mark_processed ran would have mtime <= processed_at and
        # stay filtered until its next modification — confirm whether the
        # runner guards against that window.
        # A touched file may have unchanged content (some editors
        # rewrite-without-change); the runner's hash compare skips those.
        return datetime.fromtimestamp(mtime_epoch) > processed_at
|