# agnes-the-ai-analyst/src/repositories/session_processor_state.py

"""Repository for session_processor_state — per-(processor, session) bookkeeping
for the session pipeline framework (services/session_pipeline/).

Composite PK (processor_name, session_file) lets each processor track its own
processed-set independently. file_hash invalidates the row when a session jsonl
grows (Claude Code appending live to an active session) so processors reprocess
the new content rather than treating the first hash as final.
"""

from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import duckdb


class SessionProcessorStateRepository:
    """Read/write access to ``session_processor_state`` rows.

    Each row is keyed by (processor_name, session_file) and records when that
    processor last handled the session file, how many items it extracted, and
    the file hash it saw at the time.
    """

    def __init__(self, conn: duckdb.DuckDBPyConnection):
        # The connection is supplied by the caller; this class only issues
        # statements on it and never closes it.
        self.conn = conn

    def is_processed(
        self,
        processor_name: str,
        session_file: str,
        file_hash: str,
    ) -> bool:
        """Return True iff this processor already handled this exact content.

        True requires a state row for (processor_name, session_file) AND a
        stored file_hash equal to *file_hash*. A hash mismatch (e.g. the
        session jsonl grew since the last run) counts as unprocessed so the
        processor reprocesses on the next tick.
        """
        result = self.conn.execute(
            """SELECT file_hash FROM session_processor_state
                WHERE processor_name = ? AND session_file = ?""",
            [processor_name, session_file],
        ).fetchone()
        return result is not None and result[0] == file_hash

    def mark_processed(
        self,
        processor_name: str,
        session_file: str,
        username: str,
        items_count: int,
        file_hash: str,
    ) -> None:
        """UPSERT the state row for (processor_name, session_file).

        An existing row has its processed_at, items_extracted, file_hash and
        username overwritten. processed_at is always set to the current UTC
        time at the moment of this call.
        """
        now = datetime.now(timezone.utc)
        self.conn.execute(
            """INSERT INTO session_processor_state
                (processor_name, session_file, username, processed_at, items_extracted, file_hash)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT (processor_name, session_file) DO UPDATE
                SET processed_at = excluded.processed_at,
                    items_extracted = excluded.items_extracted,
                    file_hash = excluded.file_hash,
                    username = excluded.username""",
            [processor_name, session_file, username, now, items_count, file_hash],
        )

    def scan_unprocessed_for(
        self,
        processor_name: str,
        session_dir: Path,
    ) -> list[tuple[str, Path]]:
        """Return (username, jsonl_path) pairs this processor must (re)process.

        *session_dir* is expected to contain one sub-directory per user, each
        holding ``*.jsonl`` session files; the sub-directory name is reported
        as the username. A file is returned when any of these holds:

        * there is no state row for ``"<username>/<file name>"``;
        * the state row has no processed_at;
        * the file cannot be stat'ed;
        * the file's mtime is newer than the stored processed_at (file
          modified since last run — likely a live-append from an active
          Claude Code session).

        The mtime precheck is a cheap stat-only optimization: stable sessions
        (mtime <= processed_at) are skipped without reading the file. Files
        that survive still go through the runner's per-file
        ``is_processed(file_hash)`` check for authoritative hash-based
        invalidation. Without this filter, the runner would MD5-rehash every
        stable session on every scheduler tick.

        Results are ordered by user directory, then by file name. A missing
        *session_dir*, or one that is not a directory, yields an empty list.
        """
        if not session_dir.is_dir():
            return []

        # One query per scan, not per file. Only processed_at is fetched (not
        # file_hash) because mtime is the cheap precheck — the file_hash
        # compare lives in the runner where it's already paying the IO cost
        # to hash.
        rows = self.conn.execute(
            """SELECT session_file, processed_at FROM session_processor_state
                WHERE processor_name = ?""",
            [processor_name],
        ).fetchall()
        known: dict[str, Optional[datetime]] = {sf: pa for sf, pa in rows}

        results: list[tuple[str, Path]] = []
        # Sorted so the output order does not depend on filesystem
        # enumeration order.
        for user_dir in sorted(session_dir.iterdir()):
            if not user_dir.is_dir():
                continue
            username = user_dir.name
            for jsonl_file in sorted(user_dir.glob("*.jsonl")):
                # A missing row and a row without processed_at are handled
                # the same way: the latter shouldn't happen (mark_processed
                # always sets it), but if it does, surface for the runner.
                processed_at = known.get(f"{username}/{jsonl_file.name}")
                if processed_at is None or self._modified_since(
                    jsonl_file, processed_at
                ):
                    results.append((username, jsonl_file))
                # else: stable session, skip without hashing.
        return results

    @staticmethod
    def _modified_since(path: Path, processed_at: datetime) -> bool:
        """Return True if *path* looks modified after *processed_at*.

        A stat failure also returns True so the runner fails the hash compute
        next and reports a clean error, rather than the file being silently
        dropped here.

        NOTE(review): processed_at is the time mark_processed ran, not the
        time the file was hashed; an append landing between the two is not
        detectable by this precheck — confirm the runner tolerates that.
        """
        try:
            mtime_epoch = path.stat().st_mtime
        except OSError:
            return True

        if processed_at.utcoffset() is not None:
            # Aware value (e.g. a TIMESTAMPTZ column): compare as instants.
            # Dropping tzinfo here would discard the offset and skew the
            # result by the gap between that zone and the local one.
            return datetime.fromtimestamp(mtime_epoch, tz=timezone.utc) > processed_at

        # Naive value: DuckDB TIMESTAMP strips tz on storage and converts
        # tz-aware writes to local time first (same idiom as
        # app/api/health.py:_check_session_pipeline), so compare against a
        # naive-local mtime — `fromtimestamp` without `tz=` returns exactly
        # that.
        return datetime.fromtimestamp(mtime_epoch) > processed_at