"""HTML sanitizer for the admin-edited news entity.
The /home news perex + /news full body are admin-authored HTML rendered
to every authenticated user, so the sanitizer is the security boundary.
nh3 (Rust-backed ammonia) is used in allowlist mode: anything not on
the explicit per-tag attribute list is dropped.
Iframe support is gated to a small list of video providers (YouTube,
Vimeo, Loom). The pre-pass strips any iframe whose `src` is missing or
not in the allowlist BEFORE handing to nh3 — nh3's own `attribute_filter`
can drop attributes but not whole elements, so a pre-pass is the
simplest way to enforce "iframe only when src is YouTube/Vimeo/Loom."
The sanitizer is invoked once on save (in the repository's `save_draft`)
before the row is written. Templates render with `{{ x | safe }}` and
trust the stored content — no second-pass sanitization on read.
"""
from __future__ import annotations
import re
from urllib.parse import urlparse
import nh3
# Elements nh3 is allowed to keep; this set is passed as `tags=` to
# `nh3.clean` in `sanitize`.
_ALLOWED_TAGS: set[str] = {
    # Paragraph-level structure and separators.
    "p",
    "br",
    "hr",
    # Headings.
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    # Lists.
    "ul",
    "ol",
    "li",
    # Inline emphasis / text decoration.
    "strong",
    "em",
    "b",
    "i",
    "u",
    "s",
    # Code and quotations.
    "code",
    "pre",
    "blockquote",
    # Links and images.
    "a",
    "img",
    # Generic containers.
    "span",
    "div",
    "section",
    # Tables.
    "table",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
    # Disclosure widgets.
    "details",
    "summary",
    # Figures.
    "figure",
    "figcaption",
    # Video embeds; `src` is additionally restricted to an allowlist of
    # provider URL prefixes.
    "iframe",
}
# Tags on which the `class` attribute is permitted. They are merged into
# `_ALLOWED_ATTRIBUTES` by the loop below.
_ATTR_CLASS_TARGETS = {
    "a",
    "blockquote",
    "div",
    "p",
    "section",
    "span",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "table",
    "td",
    "th",
}

# Per-tag attribute allowlist. Anything not listed here is stripped by nh3.
_ALLOWED_ATTRIBUTES: dict[str, set[str]] = {
    # `rel` is managed by nh3's `link_rel="noopener noreferrer"` and must
    # NOT appear in this list (nh3 raises ValueError otherwise).
    "a": {"href", "title", "target", "class"},
    "img": {"src", "alt", "width", "height"},
    "iframe": {
        "src",
        "title",
        "width",
        "height",
        "allow",
        "allowfullscreen",
        "frameborder",
    },
}

# Grant `class` to every class-capable tag, creating the per-tag entry
# first when the tag has no other allowed attributes.
for _tag in _ATTR_CLASS_TARGETS:
    if _tag not in _ALLOWED_ATTRIBUTES:
        _ALLOWED_ATTRIBUTES[_tag] = set()
    _ALLOWED_ATTRIBUTES[_tag].add("class")
# URL scheme allowlist applied to `<a href>` / `<img src>`.
_ALLOWED_URL_SCHEMES: set[str] = {
    "http",
    "https",
    "mailto",
}

# Iframe `src` allowlist. Each entry is a full prefix (scheme + host +
# leading path segment) that `src` must start with. The pre-pass removes
# the entire iframe element when `src` is absent or matches none of them.
_IFRAME_SRC_PREFIXES: tuple[str, ...] = (
    # YouTube, including the privacy-enhanced "nocookie" hosts.
    "https://www.youtube.com/embed/",
    "https://youtube.com/embed/",
    "https://www.youtube-nocookie.com/embed/",
    "https://youtube-nocookie.com/embed/",
    # Vimeo.
    "https://player.vimeo.com/video/",
    # Loom.
    "https://www.loom.com/embed/",
    "https://www.loom.com/share/",
)
# Pre-pass regex matching opening ", re.IGNORECASE)
close_m = close_re.search(html, open_end)
if close_m:
inner_close_end = close_m.end()
else:
# Unclosed iframe — drop the rest of the document defensively.
inner_close_end = len(html)
if _iframe_src_allowed(m.group(0)):
out_parts.append(html[m.start():inner_close_end])
# else: drop the whole iframe element (open tag + body + close tag).
i = inner_close_end
return "".join(out_parts)
def _enforce_iframe_src_allowlist(tag: str, attr: str, value: str) -> str | None:
    """nh3 `attribute_filter` hook: drop iframe `src` values off the allowlist.

    nh3 calls this for each attribute with the element name, attribute
    name and attribute value. Returning `None` removes the attribute;
    returning a string keeps it with that value.

    Only `<iframe src=...>` is inspected; every other attribute is
    returned unchanged. Surrounding ASCII whitespace is ignored for the
    prefix comparison, but an accepted value is returned as-is.
    """
    if tag == "iframe" and attr == "src":
        candidate = value.strip(" \t\n\r\f")
        if not candidate.startswith(_IFRAME_SRC_PREFIXES):
            return None
    return value


def sanitize(html: str | None) -> str:
    """Sanitize `html` against the news allowlist. Returns "" for None / "".

    Pipeline:

    1. `_strip_disallowed_iframes` removes whole iframe elements whose
       `src` is missing or not allowlisted (regex pre-pass on the raw
       markup).
    2. `nh3.clean` applies the tag / attribute / URL-scheme allowlists,
       sets `rel="noopener noreferrer"` on links and strips comments.
    3. During step 2, `_enforce_iframe_src_allowlist` re-checks every
       iframe `src` that nh3's own parser sees.

    Step 3 is needed because the pre-pass works on raw text and does not
    necessarily see the same elements as nh3's HTML parser. For example,
    an allowlisted-looking `<iframe ...>` opener inside an HTML comment
    can make the pre-pass keep a following real iframe verbatim. Since
    nh3 alone would accept any http(s) `src` on `<iframe>`, the host
    allowlist must also be enforced at the parser level. An iframe that
    fails this second check keeps its element but loses `src`.
    """
    if not html:
        return ""
    pre = _strip_disallowed_iframes(html)
    return nh3.clean(
        pre,
        tags=_ALLOWED_TAGS,
        attributes=_ALLOWED_ATTRIBUTES,
        attribute_filter=_enforce_iframe_src_allowlist,
        url_schemes=_ALLOWED_URL_SCHEMES,
        link_rel="noopener noreferrer",
        strip_comments=True,
    )
def stripped_text(html: str | None, limit: int = 120) -> str:
    """Return a tag-free preview of `html` clamped to `limit` chars.

    Used by the admin UI's versions table where each row shows a short
    preview of the intro + body.

    Args:
        html: Markup to preview; `None` or "" yields "".
        limit: Maximum length of the result, including the trailing
            ellipsis when the text is truncated. A non-positive limit
            yields "".

    Returns:
        The text with all tags and comments removed by nh3, runs of
        whitespace collapsed to single spaces, and, if longer than
        `limit`, cut to `limit - 1` characters plus "…".
    """
    # Guard non-positive limits up front: `text[: limit - 1]` with
    # limit <= 0 is a negative slice that would return almost the whole
    # text instead of clamping it.
    if not html or limit <= 0:
        return ""
    # An empty tag allowlist makes nh3 drop every element.
    # NOTE(review): the result is still nh3 output, so characters such as
    # `&` are presumably entity-escaped and truncation may cut through an
    # entity. Confirm how the template renders this value before
    # unescaping here.
    text_only = nh3.clean(html, tags=set(), attributes={}, strip_comments=True)
    # split() with no arguments also discards leading/trailing whitespace.
    collapsed = " ".join(text_only.split())
    if len(collapsed) <= limit:
        return collapsed
    return collapsed[: limit - 1].rstrip() + "…"
__all__ = ["sanitize", "stripped_text"]