"""HTML sanitizer for the admin-edited news entity.

The /home news perex and the /news full body are admin-authored HTML that is
rendered to every authenticated user, so this sanitizer is the security
boundary. nh3 (Rust-backed ammonia) runs in allowlist mode: anything not on
the explicit per-tag attribute list is dropped.

Iframe support is gated to a small list of video providers (YouTube, Vimeo,
Loom). A pre-pass strips any iframe whose `src` is missing or not in the
allowlist BEFORE the document is handed to nh3 — nh3's own `attribute_filter`
can drop attributes but not whole elements, so a pre-pass is the simplest way
to enforce "iframe only when src is YouTube/Vimeo/Loom."

The sanitizer is invoked once on save (in the repository's `save_draft`)
before the row is written. Templates render with `{{ x | safe }}` and trust
the stored content — there is no second sanitization pass on read.
"""

from __future__ import annotations

import re
from urllib.parse import urlparse

import nh3

# Tag allowlist for nh3, grouped by role for readability.
_ALLOWED_TAGS: set[str] = {
    # Paragraph-level structure and headings.
    "p", "br", "hr",
    "h1", "h2", "h3", "h4", "h5", "h6",
    # Lists.
    "ul", "ol", "li",
    # Inline emphasis.
    "strong", "em", "b", "i", "u", "s",
    # Code and quotations.
    "code", "pre", "blockquote",
    # Links and media.
    "a", "img",
    # Generic containers.
    "span", "div", "section",
    # Tables.
    "table", "thead", "tbody", "tr", "th", "td",
    # Disclosure widgets and figures.
    "details", "summary",
    "figure", "figcaption",
    # Video embeds (further restricted by the iframe `src` allowlist).
    "iframe",
}

# Tags that may carry a `class` attribute.
_ATTR_CLASS_TARGETS = {
    "span", "div", "section", "p",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "table", "td", "th",
    "blockquote", "a",
}

# Per-tag attribute allowlist. Anything not listed here is stripped by nh3.
_ALLOWED_ATTRIBUTES: dict[str, set[str]] = {
    # `rel` is managed by nh3's `link_rel="noopener noreferrer"` and must
    # NOT appear in this list (nh3 raises ValueError otherwise).
    "a": {"href", "title", "target", "class"},
    "img": {"src", "alt", "width", "height"},
    "iframe": {
        "src",
        "title",
        "width",
        "height",
        "allow",
        "allowfullscreen",
        "frameborder",
    },
}

# Merge `class` into the per-tag allowlist, creating an entry for tags that
# have no other permitted attribute.
for _tag in _ATTR_CLASS_TARGETS:
    if _tag not in _ALLOWED_ATTRIBUTES:
        _ALLOWED_ATTRIBUTES[_tag] = set()
    _ALLOWED_ATTRIBUTES[_tag].add("class")

# URL scheme allowlist applied to URL attributes such as <a href> / <img src>.
_ALLOWED_URL_SCHEMES: set[str] = {"http", "https", "mailto"} # Iframe host allowlist — `src` must start with one of these prefixes # (scheme + host + the leading path segment). Pre-pass drops the whole # iframe element if `src` is missing or fails this check. _IFRAME_SRC_PREFIXES: tuple[str, ...] = ( "https://www.youtube.com/embed/", "https://youtube.com/embed/", "https://www.youtube-nocookie.com/embed/", "https://youtube-nocookie.com/embed/", "https://player.vimeo.com/video/", "https://www.loom.com/embed/", "https://www.loom.com/share/", ) # Pre-pass regex matching opening ` blocks whose src is not in the video-host allowlist. nh3 then sees only the surviving iframes plus the rest of the document untouched. The walk is destructive (rewrites the string position by position) rather than re.sub-based so we can match the close tag cleanly even when iframes contain inner whitespace / nested children (rare but legal in HTML5).""" out_parts: list[str] = [] i = 0 while True: m = _IFRAME_OPEN_RE.search(html, i) if not m: out_parts.append(html[i:]) break # Emit text before the iframe. out_parts.append(html[i:m.start()]) open_end = m.end() # Find the matching (case-insensitive). HTML5 disallows # nesting iframes, so the next close tag is the matching one. close_re = re.compile(r"", re.IGNORECASE) close_m = close_re.search(html, open_end) if close_m: inner_close_end = close_m.end() else: # Unclosed iframe — drop the rest of the document defensively. inner_close_end = len(html) if _iframe_src_allowed(m.group(0)): out_parts.append(html[m.start():inner_close_end]) # else: drop the whole iframe element (open tag + body + close tag). i = inner_close_end return "".join(out_parts) def sanitize(html: str | None) -> str: """Sanitize `html` against the news allowlist. Returns "" for None / "". The two-stage pipeline is: (1) strip non-allowlisted iframes via regex pre-pass, (2) hand the survivors to nh3 with the tag / attribute / url-scheme allowlists. 
nh3 enforces every other rule — event handlers stripped, javascript:/data: schemes blocked, unknown tags removed, comments stripped. """ if not html: return "" pre = _strip_disallowed_iframes(html) return nh3.clean( pre, tags=_ALLOWED_TAGS, attributes=_ALLOWED_ATTRIBUTES, url_schemes=_ALLOWED_URL_SCHEMES, link_rel="noopener noreferrer", strip_comments=True, ) def stripped_text(html: str | None, limit: int = 120) -> str: """Return a plain-text preview of `html` clamped to `limit` chars. Used by the admin UI's versions table where each row shows a short preview of the intro + body. Strips ALL tags, then collapses whitespace and truncates with an ellipsis. """ if not html: return "" plain = nh3.clean(html, tags=set(), attributes={}, strip_comments=True) plain = " ".join(plain.split()).strip() if len(plain) > limit: return plain[: limit - 1].rstrip() + "…" return plain __all__ = ["sanitize", "stripped_text"]