"""Repository for ``store_submissions`` — flea-market guardrail audit trail. Every POST/PUT to ``/api/store/entities`` writes a row here capturing the inline-check verdicts and (asynchronously) the LLM security review outcome. Powers ``/admin/store/submissions`` and the override workflow. See ``src/store_guardrails/`` for the check pipeline that fills these rows. """ from __future__ import annotations import json import uuid from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Tuple import duckdb VALID_STATUSES = { "pending_inline", "blocked_inline", "pending_llm", "approved", "blocked_llm", "review_error", "overridden", # 'deleted' is set when the linked entity is hard-deleted (admin DELETE # ?hard=true). The entity row is gone, so the JOIN-based filter can't # reach it — explicit marker required so the Deleted chip can surface # the row. "deleted", # 'archived' is DEPRECATED in writes. Post-v35, archive lifecycle is # read live from `store_entities.visibility_status` via LEFT JOIN # rather than denormalized onto submissions. Kept in the validator # only to preserve historical rows from instances that ran the prior # denormalized path (`mark_archived_for_entity`, removed in v36). # No new code path writes this value. "archived", } class StoreSubmissionsRepository: def __init__(self, conn: duckdb.DuckDBPyConnection): self.conn = conn @staticmethod def _row_to_dict(columns: List[str], row: tuple) -> Dict[str, Any]: d = dict(zip(columns, row)) for k in ("inline_checks", "llm_findings"): v = d.get(k) if isinstance(v, str): try: d[k] = json.loads(v) if v else None except (ValueError, TypeError): d[k] = None return d def create( self, *, submitter_id: str, submitter_email: Optional[str], type: str, name: str, version: Optional[str], status: str, entity_id: Optional[str] = None, inline_checks: Optional[Dict[str, Any]] = None, llm_findings: Optional[Dict[str, Any]] = None, file_size: Optional[int] = None, bundle_sha256: Optional[str] = None, ) -> str: if status not in VALID_STATUSES: raise ValueError(f"invalid submission status: {status!r}") sub_id = uuid.uuid4().hex now = datetime.now(timezone.utc) self.conn.execute( """INSERT INTO store_submissions (id, entity_id, submitter_id, submitter_email, type, name, version, status, inline_checks, llm_findings, file_size, bundle_sha256, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", [ sub_id, entity_id, submitter_id, submitter_email, type, name, version, status, json.dumps(inline_checks) if inline_checks is not None else None, json.dumps(llm_findings) if llm_findings is not None else None, int(file_size) if file_size is not None else None, bundle_sha256, now, now, ], ) return sub_id # mark_archived_for_entity removed in v36 — lifecycle is read live # via JOIN on store_entities.visibility_status. See list_for_admin. def mark_deleted_for_entity(self, entity_id: str) -> int: """Mark every submission row linked to ``entity_id`` as ``status='deleted'`` after a hard delete. ``entity_id`` is preserved as a tombstone pointer — the ``store_entities`` row is gone, but the linkage lets the admin detail page resolve the activity timeline by querying ``audit_log`` for ``store_entity:{entity_id}`` even after the live row is dropped. UUID collision risk is negligible. Submission row + sha256 + size + verdict survive — admin can still see what was hard-deleted under the "Deleted" filter chip. Bundle bytes are gone (mirrors the TTL purge contract). """ before = self.conn.execute( "SELECT COUNT(*) FROM store_submissions WHERE entity_id = ?", [entity_id], ).fetchone()[0] self.conn.execute( "UPDATE store_submissions " " SET status = 'deleted', updated_at = ? " "WHERE entity_id = ?", [datetime.now(timezone.utc), entity_id], ) return int(before) def mark_bundle_purged(self, id: str) -> None: """TTL job hook: bundle bytes have been removed from disk; persist the timestamp so the detail UI can render *"Bundle purged on …"* instead of leaving Download greyed with no explanation. Submission row + sha256 stay intact for forensics. """ self.conn.execute( """UPDATE store_submissions SET bundle_purged_at = ?, entity_id = NULL, updated_at = ? WHERE id = ?""", [datetime.now(timezone.utc), datetime.now(timezone.utc), id], ) def count_blocked_for_submitter_since( self, submitter_id: str, since, ) -> int: """Spam-quota helper. Counts submissions by ``submitter_id`` whose verdict is one of the rejected/error states (``blocked_inline | blocked_llm | review_error``) newer than ``since`` (a ``datetime`` — typically now - 24h). Called from the POST entry point; refusal bounds disk growth from a single bot looping on malformed/risky ZIPs. Pre-fix this counted ONLY ``blocked_inline``. A bad-actor submitter who triggered ten ``blocked_llm`` verdicts was unbounded. All three states represent rejected uploads — count them together. """ row = self.conn.execute( "SELECT COUNT(*) FROM store_submissions " "WHERE submitter_id = ? " " AND status IN ('blocked_inline', 'blocked_llm', 'review_error') " " AND created_at >= ?", [submitter_id, since], ).fetchone() return int(row[0]) if row else 0 # Backward-compat alias — still used in some operator scripts. # Routes to the broader counter post-#9. count_blocked_inline_for_submitter_since = count_blocked_for_submitter_since def update_status( self, id: str, *, status: str, llm_findings: Optional[Dict[str, Any]] = None, reviewed_by_model: Optional[str] = None, ) -> None: if status not in VALID_STATUSES: raise ValueError(f"invalid submission status: {status!r}") sets = ["status = ?", "updated_at = ?"] params: List[Any] = [status, datetime.now(timezone.utc)] if llm_findings is not None: sets.append("llm_findings = ?") params.append(json.dumps(llm_findings)) if reviewed_by_model is not None: sets.append("reviewed_by_model = ?") params.append(reviewed_by_model) params.append(id) self.conn.execute( f"UPDATE store_submissions SET {', '.join(sets)} WHERE id = ?", params, ) def set_override( self, id: str, *, admin_user_id: str, reason: str, ) -> None: """Mark a previously-blocked submission as admin-overridden. Visibility flip on the linked store_entities row is the caller's responsibility — the override path in ``app/api/admin.py`` calls ``StoreEntitiesRepository.set_visibility(entity_id, 'approved')`` in the same transaction. """ self.conn.execute( """UPDATE store_submissions SET status = 'overridden', override_by = ?, override_reason = ?, updated_at = ? WHERE id = ?""", [admin_user_id, reason, datetime.now(timezone.utc), id], ) def count_for_submitter(self, submitter_id: str, exclude_id: Optional[str] = None) -> int: """Number of submissions by a single user. Used by the detail page footer to render *"N other attempts by alice@x.com"* — pass the current submission's id as ``exclude_id`` to exclude it from the count so the link reads naturally as "others". """ if exclude_id: row = self.conn.execute( "SELECT COUNT(*) FROM store_submissions WHERE submitter_id = ? AND id != ?", [submitter_id, exclude_id], ).fetchone() else: row = self.conn.execute( "SELECT COUNT(*) FROM store_submissions WHERE submitter_id = ?", [submitter_id], ).fetchone() return int(row[0]) if row else 0 def get(self, id: str) -> Optional[Dict[str, Any]]: rows = self.conn.execute( "SELECT * FROM store_submissions WHERE id = ?", [id] ).fetchall() if not rows: return None columns = [d[0] for d in self.conn.description] return self._row_to_dict(columns, rows[0]) def latest_for_entity(self, entity_id: str) -> Optional[Dict[str, Any]]: rows = self.conn.execute( """SELECT * FROM store_submissions WHERE entity_id = ? ORDER BY created_at DESC LIMIT 1""", [entity_id], ).fetchall() if not rows: return None columns = [d[0] for d in self.conn.description] return self._row_to_dict(columns, rows[0]) # Whitelisted column names for the click-to-sort UI. ``status`` and # ``name`` get NULL-safe wrapping so the sort is deterministic across # legacy rows; epoch() bypass on ``created_at`` mirrors the bug # workaround in the default-order branch below. # # Mapping is sort-key → fully qualified SQL expression (already # disambiguated against the LEFT JOIN). Bad input raises 400 at the # API edge — see ``list_for_admin`` below. Pre-fix the qualification # used a chain of ``str.replace(...)`` calls that risked partial # replacement when one column name was a substring of another; # the explicit dict eliminates the footgun. _SORT_COLUMNS: Dict[str, str] = { "created_at": "epoch(s.created_at)", "file_size": "COALESCE(s.file_size, 0)", "status": "s.status", "name": "LOWER(s.name)", } def list_for_admin( self, *, status: Optional[List[str]] = None, submitter_id: Optional[str] = None, type_: Optional[str] = None, name_substr: Optional[str] = None, version_substr: Optional[str] = None, sort_by: Optional[str] = None, sort_order: Optional[str] = None, lifecycle: Optional[str] = None, limit: int = 100, skip: int = 0, ) -> Tuple[List[Dict[str, Any]], int]: """Filtered + paginated listing for /admin/store/submissions. v36+ architecture: ``status`` is the verdict (immutable, set at review time). The entity's ``visibility_status`` is the live lifecycle (archived / approved / hidden / pending) — read live via LEFT JOIN. Filtering by lifecycle (Archived / Deleted chips) goes through the JOIN; filtering by verdict (Pending / Needs review / Approved / Overridden) hits ``status`` directly. Adversarial review verdict (chip → SQL truth table): * default (no chip) : exclude lifecycle-end states * Pending : verdict = pending_* AND not archived * Needs review : verdict = blocked/error AND not archived * Approved : verdict = approved AND lifecycle = approved * Overridden : verdict = overridden AND not archived * Archived : entity.visibility_status = 'archived' * Deleted : status = 'deleted' ``status`` parameter retains the comma-separated ``status IN ()`` semantics for backward compat with admin scripts; the chip translation lives in the calling layer (``app/api/admin.py``) which builds the right combination of ``status`` + the new ``lifecycle`` filter param below. """ # Substring + scalar filters are AND-composed onto a base set # of clauses; lifecycle handling is its own branch below. clauses: List[str] = [] params: List[Any] = [] # Verdict filter: pass-through ``status IN (...)`` if explicitly # set. When the caller sets *only* lifecycle (e.g. archived # chip), they pass status=None; we don't over-filter on status. if status: placeholders = ",".join("?" for _ in status) clauses.append(f"s.status IN ({placeholders})") params.extend(status) if submitter_id: clauses.append("s.submitter_id = ?") params.append(submitter_id) if type_: clauses.append("s.type = ?") params.append(type_) if name_substr: clauses.append("LOWER(s.name) LIKE ?") params.append(f"%{name_substr.lower()}%") if version_substr: clauses.append("LOWER(COALESCE(s.version, '')) LIKE ?") params.append(f"%{version_substr.lower()}%") # Lifecycle filter — chip-driven, replaces the legacy # `status='archived'` / `status='deleted'` denormalization. # 'archived' reads live from entity.visibility_status; 'deleted' # uses the submission terminal marker (entity row is gone). if lifecycle == "archived": clauses.append("e.visibility_status = 'archived'") elif lifecycle == "deleted": clauses.append("s.status = 'deleted'") elif not status: # Default view: hide both lifecycle-end states so the queue # stays focused on actionable rows. Chip routing opts back # in by passing lifecycle='archived' or 'deleted'. clauses.append( "(e.visibility_status IS NULL OR e.visibility_status != 'archived')" ) clauses.append("s.status != 'deleted'") where = ("WHERE " + " AND ".join(clauses)) if clauses else "" # COUNT and SELECT both go through the LEFT JOIN so paging # totals match the items list under any filter. Index on # store_submissions(entity_id) (idx_store_submissions_entity) # already covers the JOIN key — no schema change needed. total_row = self.conn.execute( f"SELECT COUNT(*) FROM store_submissions s " f"LEFT JOIN store_entities e ON e.id = s.entity_id " f"{where}", params, ).fetchone() total = int(total_row[0]) if total_row else 0 # Whitelist lookup — values are already JOIN-qualified in # _SORT_COLUMNS. Unknown sort_by raises ValueError; the API # caller maps that to a 400. sort_key = sort_by or "created_at" if sort_key not in self._SORT_COLUMNS: raise ValueError(f"invalid_sort_key: {sort_key!r}") col_expr = self._SORT_COLUMNS[sort_key] order = "ASC" if (sort_order or "desc").lower() == "asc" else "DESC" sql = ( f"SELECT s.*, e.visibility_status AS entity_visibility_status " f"FROM store_submissions s " f"LEFT JOIN store_entities e ON e.id = s.entity_id " f"{where} " f"ORDER BY {col_expr} {order}, s.id " f"LIMIT {int(limit)} OFFSET {int(skip)}" ) rows = self.conn.execute(sql, params).fetchall() if not rows: return [], int(total) columns = [d[0] for d in self.conn.description] items = [self._row_to_dict(columns, r) for r in rows] return items, int(total)