Strip HTML tags from OpenMetadata descriptions in YAML export

OpenMetadata stores descriptions as rich HTML (<p>, <strong>, &nbsp;, etc.).
Add strip_html() to the transformer; it converts these descriptions to clean
plain text for the YAML files consumed by the Claude Code agent. It is applied
to metric descriptions, table descriptions, and column descriptions. The webapp
display dict keeps the raw HTML since the modal renders it correctly.
This commit is contained in:
Petr 2026-03-15 01:57:04 +01:00
parent 985f47cdb7
commit ab1a93ed67
2 changed files with 110 additions and 4 deletions

View file

@ -14,6 +14,7 @@ Extracts metadata from OpenMetadata tag conventions:
- Unit.* -> unit of measurement
"""
import html
import logging
import re
from typing import Any, Dict, List, Optional
@ -193,6 +194,51 @@ def extract_tag_names(tags: List[Dict[str, Any]]) -> List[str]:
return result
def strip_html(text: str) -> str:
    """
    Strip HTML tags and decode entities from OpenMetadata descriptions.

    OpenMetadata stores descriptions as rich HTML. This converts to clean
    plain text suitable for YAML files and agent consumption.

    Handles:
    - HTML tags (<p>, <strong>, <em>, <ul>, <li>, etc.), matched
      case-insensitively
    - HTML entities (&nbsp;, &amp;, etc.)
    - List items (converted to "- " prefix)
    - Line breaks (<br>, with or without attributes or a self-closing slash)
    - Table cells (separated by a space so adjacent cells do not fuse)
    - Excessive whitespace from tag removal

    Args:
        text: Raw HTML string from OpenMetadata. Falsy values (empty
            string, None) yield an empty string.

    Returns:
        Clean plain text string. Runs of whitespace inside a line are
        collapsed to single spaces and empty lines are dropped.
    """
    if not text:
        return ""

    flags = re.IGNORECASE

    # Convert <li> to list items before stripping tags. The \b keeps the
    # pattern from also matching other tags that merely start with "li"
    # (e.g. <link ...>), which would otherwise produce a bogus bullet.
    result = re.sub(r"<li\b[^>]*>", "\n- ", text, flags=flags)

    # Line breaks, including forms that carry attributes (<br class="x"/>).
    result = re.sub(r"<br\b[^>]*>", "\n", result, flags=flags)

    # Closing block-level tags end the current line.
    result = re.sub(
        r"</(?:p|div|h[1-6]|tr|ul|ol|li)\s*>", "\n", result, flags=flags
    )

    # Keep neighbouring table cells apart; the extra space is collapsed or
    # trimmed by the whitespace clean-up below when it is not needed.
    result = re.sub(r"</t[dh]\s*>", " ", result, flags=flags)

    # Remove all remaining HTML tags.
    result = re.sub(r"<[^>]+>", "", result)

    # Decode HTML entities (&nbsp; -> space, &amp; -> &, etc.). This runs
    # after tag removal so that escaped markup such as "&lt;p&gt;" survives
    # as literal text instead of being stripped as a tag.
    result = html.unescape(result)

    # Clean up whitespace: collapse runs of whitespace within each line
    # (str.split() also treats the non-breaking space from &nbsp; as
    # whitespace) and drop lines that end up empty.
    collapsed = (" ".join(line.split()) for line in result.split("\n"))
    return "\n".join(line for line in collapsed if line)
def sanitize_filename(name: str) -> str:
"""
Convert metric/entity name to safe filesystem name.
@ -247,7 +293,7 @@ def metric_to_yaml_dict(raw_metric: Dict[str, Any]) -> Dict[str, Any]:
"time_column": "",
"table": "",
"expression": extract_expression(raw_metric),
"description": (raw_metric.get("description", "") or "").strip(),
"description": strip_html(raw_metric.get("description", "") or ""),
"dimensions": extract_dimensions(tags),
"notes": notes,
"synonyms": [],
@ -315,7 +361,7 @@ def metric_to_detail_dict(raw_metric: Dict[str, Any], category_colors: Optional[
"time_column": "",
},
"overview": {
"description": description.strip(),
"description": strip_html(description),
"key_insights": [],
},
"validation": None,
@ -356,7 +402,7 @@ def table_to_yaml_dict(raw_table: Dict[str, Any]) -> Dict[str, Any]:
"""
fqn = raw_table.get("fullyQualifiedName", "")
name = raw_table.get("name", "")
description = raw_table.get("description", "") or ""
description = strip_html(raw_table.get("description", "") or "")
tags = raw_table.get("tags", [])
# Parse columns
@ -365,7 +411,7 @@ def table_to_yaml_dict(raw_table: Dict[str, Any]) -> Dict[str, Any]:
col_entry = {
"name": col.get("name", ""),
"type": col.get("dataType", ""),
"description": (col.get("description", "") or "").strip(),
"description": strip_html(col.get("description", "") or ""),
}
columns.append(col_entry)

View file

@ -19,6 +19,7 @@ from connectors.openmetadata.transformer import (
metric_to_display_dict,
metric_to_yaml_dict,
sanitize_filename,
strip_html,
table_to_yaml_dict,
)
@ -368,6 +369,65 @@ class TestExtractTagNames:
# ===========================================================================
# strip_html
# ===========================================================================
class TestStripHtml:
    """Tests for strip_html(): tag removal, entity decoding and line layout."""

    def test_strip_simple_tags(self):
        assert strip_html("<p>Hello world</p>") == "Hello world"

    def test_strip_nested_tags(self):
        result = strip_html("<p><strong>Bold</strong> and <em>italic</em></p>")
        assert result == "Bold and italic"

    def test_decode_html_entities(self):
        # &nbsp; decodes to a non-breaking space, which the whitespace
        # clean-up then normalises to a regular single space.
        result = strip_html("price&nbsp;&amp;&nbsp;value")
        assert result == "price & value"

    def test_list_items(self):
        # Each <li> (with or without attributes) becomes its own "- " line.
        result = strip_html('<ul><li class="x">First</li><li>Second</li></ul>')
        assert result == "- First\n- Second"

    def test_empty_string(self):
        assert strip_html("") == ""

    def test_none_like(self):
        # Callers guard with `or ""`, but the function itself must also
        # tolerate a missing description passed through as None.
        assert strip_html(None) == ""

    def test_plain_text_unchanged(self):
        assert strip_html("No HTML here") == "No HTML here"

    def test_real_openmetadata_description(self):
        """Test with actual OpenMetadata HTML output."""
        html_desc = (
            '<p><strong>Business name: </strong>Live Deals</p>'
            '<p><strong>Purpose:</strong></p>'
            '<p>The&nbsp;<em>Live deals</em>&nbsp;metric measures the&nbsp;breadth '
            'of active, purchasable supply on Groupon.</p>'
        )
        result = strip_html(html_desc)
        assert "<" not in result
        assert "&nbsp;" not in result
        # One output line per paragraph, with inline markup removed.
        assert result == (
            "Business name: Live Deals\n"
            "Purpose:\n"
            "The Live deals metric measures the breadth "
            "of active, purchasable supply on Groupon."
        )

    def test_collapses_whitespace(self):
        result = strip_html("<p>  too   many    spaces  </p>")
        assert result == "too many spaces"

    def test_br_tags(self):
        # Both <br/> and <br> must produce a line break, not just vanish.
        result = strip_html("line1<br/>line2<br>line3")
        assert result == "line1\nline2\nline3"
# sanitize_filename
# ===========================================================================