Strip HTML tags from OpenMetadata descriptions in YAML export
OpenMetadata stores descriptions as rich HTML (<p>, <strong>, &nbsp;, etc.). Add strip_html() to the transformer, which converts that HTML to clean plain text for the YAML files consumed by the Claude Code agent. It is applied to metric descriptions, table descriptions, and column descriptions. The webapp display dict keeps the raw HTML, since the modal renders it correctly.
This commit is contained in:
parent
985f47cdb7
commit
ab1a93ed67
2 changed files with 110 additions and 4 deletions
|
|
@ -14,6 +14,7 @@ Extracts metadata from OpenMetadata tag conventions:
|
|||
- Unit.* -> unit of measurement
|
||||
"""
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
|
@ -193,6 +194,51 @@ def extract_tag_names(tags: List[Dict[str, Any]]) -> List[str]:
|
|||
return result
|
||||
|
||||
|
||||
def strip_html(text: str) -> str:
    """
    Strip HTML tags and decode entities from OpenMetadata descriptions.

    OpenMetadata stores descriptions as rich HTML. This converts to clean
    plain text suitable for YAML files and agent consumption.

    Handles:
    - HTML tags (<p>, <strong>, <em>, <ul>, <li>, etc.), case-insensitively
    - HTML comments (<!-- ... -->)
    - HTML entities (&nbsp;, &amp;, etc.)
    - List items (converted to "- " prefix)
    - Excessive whitespace from tag removal

    Only sequences that look like real markup (``<`` immediately followed by
    a tag name, optionally preceded by ``/``, ``!`` or ``?``) are removed, so
    plain-text comparisons such as ``x < 5 and y > 3`` are left intact.

    Args:
        text: Raw HTML string from OpenMetadata. Falsy values (``""``,
            ``None``) are accepted and yield an empty string.

    Returns:
        Clean plain text string with one non-empty, whitespace-normalized
        line per block/list item, joined by ``"\\n"``.
    """
    if not text:
        return ""

    # Convert <li> to list items before stripping tags. The lookahead makes
    # sure the tag name ends right after "li", so <link> is not a bullet.
    result = re.sub(r"<li(?=[\s/>])[^>]*>", "\n- ", text, flags=re.IGNORECASE)

    # Convert line breaks and closing block-level tags to newlines
    result = re.sub(r"<br(?=[\s/>])[^>]*>", "\n", result, flags=re.IGNORECASE)
    result = re.sub(
        r"</(?:p|div|h[1-6]|tr|ul|ol)\s*>", "\n", result, flags=re.IGNORECASE
    )

    # Remove comments first (they may contain ">"), then every remaining
    # tag. Requiring a letter after "<" keeps literal "<"/">" in prose.
    result = re.sub(r"<!--.*?-->", "", result, flags=re.DOTALL)
    result = re.sub(r"<[/!?]?[a-zA-Z][^>]*>", "", result)

    # Decode HTML entities (&nbsp; -> non-breaking space, &amp; -> &, etc.).
    # This runs after tag removal so that escaped markup such as &lt;b&gt;
    # survives as literal text instead of being stripped.
    result = html.unescape(result)

    # Clean up whitespace: collapse runs of whitespace (str.split() also
    # treats the non-breaking space as whitespace) and drop empty lines.
    normalized = (" ".join(line.split()) for line in result.split("\n"))
    return "\n".join(line for line in normalized if line)
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
|
||||
"""
|
||||
Convert metric/entity name to safe filesystem name.
|
||||
|
|
@ -247,7 +293,7 @@ def metric_to_yaml_dict(raw_metric: Dict[str, Any]) -> Dict[str, Any]:
|
|||
"time_column": "",
|
||||
"table": "",
|
||||
"expression": extract_expression(raw_metric),
|
||||
"description": (raw_metric.get("description", "") or "").strip(),
|
||||
"description": strip_html(raw_metric.get("description", "") or ""),
|
||||
"dimensions": extract_dimensions(tags),
|
||||
"notes": notes,
|
||||
"synonyms": [],
|
||||
|
|
@ -315,7 +361,7 @@ def metric_to_detail_dict(raw_metric: Dict[str, Any], category_colors: Optional[
|
|||
"time_column": "",
|
||||
},
|
||||
"overview": {
|
||||
"description": description.strip(),
|
||||
"description": strip_html(description),
|
||||
"key_insights": [],
|
||||
},
|
||||
"validation": None,
|
||||
|
|
@ -356,7 +402,7 @@ def table_to_yaml_dict(raw_table: Dict[str, Any]) -> Dict[str, Any]:
|
|||
"""
|
||||
fqn = raw_table.get("fullyQualifiedName", "")
|
||||
name = raw_table.get("name", "")
|
||||
description = raw_table.get("description", "") or ""
|
||||
description = strip_html(raw_table.get("description", "") or "")
|
||||
tags = raw_table.get("tags", [])
|
||||
|
||||
# Parse columns
|
||||
|
|
@ -365,7 +411,7 @@ def table_to_yaml_dict(raw_table: Dict[str, Any]) -> Dict[str, Any]:
|
|||
col_entry = {
|
||||
"name": col.get("name", ""),
|
||||
"type": col.get("dataType", ""),
|
||||
"description": (col.get("description", "") or "").strip(),
|
||||
"description": strip_html(col.get("description", "") or ""),
|
||||
}
|
||||
columns.append(col_entry)
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from connectors.openmetadata.transformer import (
|
|||
metric_to_display_dict,
|
||||
metric_to_yaml_dict,
|
||||
sanitize_filename,
|
||||
strip_html,
|
||||
table_to_yaml_dict,
|
||||
)
|
||||
|
||||
|
|
@ -368,6 +369,65 @@ class TestExtractTagNames:
|
|||
|
||||
|
||||
# ===========================================================================
|
||||
# strip_html
|
||||
# ===========================================================================
|
||||
|
||||
class TestStripHtml:
    """Unit tests for strip_html(): tag removal, entity decoding, whitespace."""

    def test_strip_simple_tags(self):
        assert strip_html("<p>Hello world</p>") == "Hello world"

    def test_strip_nested_tags(self):
        result = strip_html("<p><strong>Bold</strong> and <em>italic</em></p>")
        assert result == "Bold and italic"

    def test_decode_html_entities(self):
        # Input uses encoded entities; the output must contain the decoded
        # characters and none of the original entity spellings.
        result = strip_html("price&nbsp;&amp;&nbsp;value")
        assert "price" in result
        assert "&" in result
        assert "value" in result
        assert "&nbsp;" not in result
        assert "&amp;" not in result

    def test_list_items(self):
        result = strip_html('<ul><li class="x">First</li><li>Second</li></ul>')
        assert "- First" in result
        assert "- Second" in result

    def test_empty_string(self):
        assert strip_html("") == ""

    def test_none_like(self):
        # Descriptions can be missing entirely; None must behave like "".
        assert strip_html(None) == ""

    def test_plain_text_unchanged(self):
        assert strip_html("No HTML here") == "No HTML here"

    def test_real_openmetadata_description(self):
        """Test with actual OpenMetadata HTML output."""
        html_desc = (
            '<p><strong>Business name: </strong>Live Deals</p>'
            '<p><strong>Purpose:</strong></p>'
            '<p>The <em>Live deals</em> metric measures the breadth '
            'of active, purchasable supply on Groupon.</p>'
        )
        result = strip_html(html_desc)
        assert "<" not in result
        assert "&nbsp;" not in result
        assert "Live Deals" in result
        assert "Live deals" in result
        assert "purchasable supply" in result

    def test_collapses_whitespace(self):
        result = strip_html("<p>  too   many    spaces  </p>")
        assert result == "too many spaces"

    def test_br_tags(self):
        result = strip_html("line1<br/>line2<br>line3")
        assert "line1" in result
        assert "line2" in result
        assert "line3" in result
|
||||
|
||||
|
||||
# sanitize_filename
|
||||
# ===========================================================================
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue