agnes-the-ai-analyst/src/store_guardrails/_frontmatter.py

"""YAML-ish frontmatter parsing shared between the upload endpoint and the
content guardrail.

Lives in ``src/`` so the guardrail module (which has no app dependency) can
import it without creating an ``src/ → app/`` cycle. The upload endpoint
re-imports from here.
"""

from __future__ import annotations

import re

# Matches a frontmatter block at the very start of the text: an opening
# ``---`` (optionally followed by whitespace) and a newline, then the shortest
# possible run of text (group 1; DOTALL lets it span lines) up to the first
# newline that is immediately followed by ``---``. The match ends right after
# that closing ``---``; the closing line's own terminator is not consumed.
_FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)


def parse_frontmatter(text: str) -> dict[str, str]:
    """Parse the leading ``---`` frontmatter block into a flat string mapping.

    This is a deliberately small YAML-ish reader, not a YAML parser. Every
    line of the block that is neither blank nor a ``#`` comment and that
    contains a colon is split at its first colon into a key and a value.
    Indentation is ignored, so nested entries are flattened, and a repeated
    key overwrites the earlier value.

    Args:
        text: Full document text; frontmatter is only recognised when the
            text starts with a ``---`` line.

    Returns:
        Mapping of whitespace-stripped keys to whitespace-stripped values,
        with one pair of matching surrounding quotes (``"..."`` or
        ``'...'``) removed from each value. Empty when the text has no
        frontmatter block.
    """
    m = _FRONTMATTER_RE.match(text)
    if m is None:
        return {}

    out: dict[str, str] = {}
    for line in m.group(1).splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        key, sep, value = line.partition(":")
        if not sep:
            # No colon on this line: not a key/value entry, ignore it.
            continue
        value = value.strip()
        # Unquote only when the value is wrapped in one matching pair of
        # quotes. Stripping quote characters from each end independently
        # would truncate values that legitimately end in an apostrophe
        # (``the Joneses'``) or that contain quotes inside a quoted string.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        out[key.strip()] = value
    return out


def frontmatter_body_offset(text: str) -> int:
    """Return the character offset where the body starts.

    The body starts on the line after the closing ``---`` delimiter. Used by
    content_check to slice off the metadata block before measuring body
    length.

    Args:
        text: Full document text, possibly starting with a frontmatter block.

    Returns:
        Index of the first character after the closing ``---`` and its line
        terminator (LF or CRLF), or ``0`` if the document has no
        frontmatter. If the closing ``---`` is not directly followed by a
        line terminator (for example at end of text), the index just after
        ``---`` is returned.
    """
    m = _FRONTMATTER_RE.match(text)
    if m is None:
        return 0

    end = m.end()
    # The pattern stops right after the closing ``---``; step over that
    # line's terminator so the delimiter's newline is not counted as body.
    for terminator in ("\r\n", "\n"):
        if text.startswith(terminator, end):
            return end + len(terminator)
    return end