""" Tests for OpenMetadata transformer. All transformer functions are pure (dict in -> dict/str/list out), so no mocks needed. """ import pytest from connectors.openmetadata.transformer import ( extract_category, extract_dimensions, extract_expression, extract_grain, extract_metric_type, extract_owners, extract_tag_names, extract_unit, has_tag, metric_to_detail_dict, metric_to_display_dict, metric_to_yaml_dict, sanitize_filename, strip_html, table_to_yaml_dict, ) # --------------------------------------------------------------------------- # Helper: build a tag dict the way OpenMetadata returns them # --------------------------------------------------------------------------- def _tag(fqn: str, name: str = "") -> dict: """Build a minimal OpenMetadata tag dict.""" tag = {"tagFQN": fqn} if name: tag["name"] = name return tag # =========================================================================== # extract_category # =========================================================================== class TestExtractCategory: def test_extract_category_from_metric_category_tag(self): """MetricCategory.finance tag -> 'finance'.""" tags = [_tag("MetricCategory.finance")] assert extract_category(tags) == "finance" def test_extract_category_from_category_tag(self): """Category.marketing tag -> 'marketing'.""" tags = [_tag("Category.marketing")] assert extract_category(tags) == "marketing" def test_extract_category_default(self): """No matching tags -> 'general'.""" tags = [_tag("SomeOther.tag"), _tag("Tier.Tier1")] assert extract_category(tags) == "general" def test_extract_category_empty_tags(self): """Empty tag list -> 'general'.""" assert extract_category([]) == "general" def test_extract_category_metric_category_takes_priority(self): """MetricCategory.* is checked before Category.* (iteration order).""" tags = [_tag("MetricCategory.finance"), _tag("Category.marketing")] assert extract_category(tags) == "finance" def 
test_extract_category_category_fallback_if_no_metric_category(self): """Category.* is used when MetricCategory.* is absent.""" tags = [_tag("Tier.Tier1"), _tag("Category.operations")] assert extract_category(tags) == "operations" def test_extract_category_with_nested_dot_in_value(self): """MetricCategory.sub.area -> 'sub.area' (split on first dot only).""" tags = [_tag("MetricCategory.sub.area")] assert extract_category(tags) == "sub.area" def test_extract_category_missing_tagfqn_key(self): """Tag dict without tagFQN key is safely skipped.""" tags = [{"name": "orphan"}] assert extract_category(tags) == "general" # =========================================================================== # extract_grain # =========================================================================== class TestExtractGrain: def test_extract_grain_from_field(self): """granularity field takes priority over tags.""" raw = { "granularity": "Daily", "tags": [_tag("Grain.monthly")], } assert extract_grain(raw) == "daily" def test_extract_grain_from_tag(self): """Grain.monthly tag used when granularity field is absent.""" raw = {"tags": [_tag("Grain.monthly")]} assert extract_grain(raw) == "monthly" def test_extract_grain_empty(self): """No grain info -> empty string.""" raw = {"tags": [_tag("Category.finance")]} assert extract_grain(raw) == "" def test_extract_grain_no_tags_no_field(self): """Completely empty metric -> empty string.""" assert extract_grain({}) == "" def test_extract_grain_field_is_none(self): """granularity=None should fall through to tags.""" raw = {"granularity": None, "tags": [_tag("Grain.weekly")]} assert extract_grain(raw) == "weekly" def test_extract_grain_field_is_empty_string(self): """granularity='' should fall through to tags.""" raw = {"granularity": "", "tags": [_tag("Grain.yearly")]} assert extract_grain(raw) == "yearly" def test_extract_grain_tag_lowercased(self): """Grain tag value is lowercased.""" raw = {"tags": [_tag("Grain.QUARTERLY")]} assert 
extract_grain(raw) == "quarterly" # =========================================================================== # extract_dimensions # =========================================================================== class TestExtractDimensions: def test_extract_dimensions(self): """Multiple Dimension.* tags -> list of dimension names.""" tags = [ _tag("Dimension.economic_area"), _tag("Dimension.country"), _tag("Category.finance"), ] result = extract_dimensions(tags) assert result == ["economic_area", "country"] def test_extract_dimensions_empty(self): """No Dimension tags -> empty list.""" tags = [_tag("Category.finance"), _tag("Tier.Tier1")] assert extract_dimensions(tags) == [] def test_extract_dimensions_empty_list(self): """Empty tag list -> empty list.""" assert extract_dimensions([]) == [] def test_extract_dimensions_preserves_order(self): """Dimensions are returned in tag order.""" tags = [_tag("Dimension.z_last"), _tag("Dimension.a_first")] assert extract_dimensions(tags) == ["z_last", "a_first"] # =========================================================================== # extract_expression # =========================================================================== class TestExtractExpression: def test_extract_expression_dict(self): """metricExpression as dict with 'expression' key.""" raw = {"metricExpression": {"expression": "SUM(revenue_usd)"}} assert extract_expression(raw) == "SUM(revenue_usd)" def test_extract_expression_string(self): """metricExpression as plain string.""" raw = {"metricExpression": "COUNT(DISTINCT order_id)"} assert extract_expression(raw) == "COUNT(DISTINCT order_id)" def test_extract_expression_empty(self): """No metricExpression -> empty string.""" raw = {"name": "some_metric"} assert extract_expression(raw) == "" def test_extract_expression_dict_missing_key(self): """Dict without 'expression' key -> empty string.""" raw = {"metricExpression": {"formula": "x + y"}} assert extract_expression(raw) == "" def 
test_extract_expression_dict_none_value(self): """Dict with expression=None -> empty string.""" raw = {"metricExpression": {"expression": None}} assert extract_expression(raw) == "" def test_extract_expression_none(self): """metricExpression=None -> empty string (default {} from .get()).""" raw = {"metricExpression": None} # None is not dict and not str, so returns "" assert extract_expression(raw) == "" def test_extract_expression_empty_dict(self): """metricExpression={} -> empty string.""" raw = {"metricExpression": {}} assert extract_expression(raw) == "" # =========================================================================== # extract_owners # =========================================================================== class TestExtractOwners: def test_extract_owners(self): """owners list with name/displayName.""" raw = { "owners": [ {"name": "alice", "displayName": "Alice Smith"}, {"name": "bob"}, ] } assert extract_owners(raw) == ["alice", "bob"] def test_extract_owners_display_name_fallback(self): """displayName is used when name is absent.""" raw = { "owners": [ {"displayName": "Charlie Brown"}, ] } assert extract_owners(raw) == ["Charlie Brown"] def test_extract_owners_empty(self): """No owners key -> empty list.""" raw = {"name": "something"} assert extract_owners(raw) == [] def test_extract_owners_empty_list(self): """Empty owners list -> empty list.""" raw = {"owners": []} assert extract_owners(raw) == [] def test_extract_owners_skips_empty_names(self): """Owners with no name or displayName are skipped.""" raw = { "owners": [ {"email": "no-name@example.com"}, {"name": "", "displayName": ""}, {"name": "valid_user"}, ] } assert extract_owners(raw) == ["valid_user"] def test_extract_owners_name_none_falls_to_display_name(self): """name=None falls back to displayName.""" raw = { "owners": [{"name": None, "displayName": "Fallback Name"}] } assert extract_owners(raw) == ["Fallback Name"] # 
# ===========================================================================
# extract_metric_type
# ===========================================================================


class TestExtractMetricType:
    """Expectations for ``extract_metric_type`` over raw metric dicts."""

    def test_extract_metric_type_from_field(self):
        """The 'metricType' field beats a MetricType.* tag and is lowercased."""
        metric = {
            "metricType": "SUM",
            "tags": [_tag("MetricType.count")],
        }
        metric_type = extract_metric_type(metric)
        assert metric_type == "sum"

    def test_extract_metric_type_from_tag(self):
        """Without the field, the MetricType.* tag supplies the type."""
        metric = {"tags": [_tag("MetricType.ratio")]}
        assert extract_metric_type(metric) == "ratio"

    def test_extract_metric_type_empty(self):
        """Neither field nor MetricType.* tag present -> empty string."""
        metric = {"tags": [_tag("Category.finance")]}
        assert extract_metric_type(metric) == ""

    def test_extract_metric_type_field_none(self):
        """metricType=None is treated as absent; the tag is used instead."""
        metric = {"metricType": None, "tags": [_tag("MetricType.average")]}
        assert extract_metric_type(metric) == "average"

    def test_extract_metric_type_lowercased(self):
        """A type taken from the field comes back lowercased."""
        metric = {"metricType": "COUNT", "tags": []}
        assert extract_metric_type(metric) == "count"

    def test_extract_metric_type_tag_lowercased(self):
        """A type taken from a tag comes back lowercased."""
        metric = {"tags": [_tag("MetricType.PERCENTAGE")]}
        assert extract_metric_type(metric) == "percentage"


# ===========================================================================
# extract_unit
# ===========================================================================


class TestExtractUnit:
    """Expectations for ``extract_unit`` over raw metric dicts."""

    def test_extract_unit_from_field(self):
        """The 'unitOfMeasurement' field beats a Unit.* tag."""
        metric = {
            "unitOfMeasurement": "USD",
            "tags": [_tag("Unit.EUR")],
        }
        unit = extract_unit(metric)
        assert unit == "USD"

    def test_extract_unit_from_tag(self):
        """Without the field, the Unit.* tag supplies the unit."""
        metric = {"tags": [_tag("Unit.count")]}
        assert extract_unit(metric) == "count"

    def test_extract_unit_empty(self):
        """Neither field nor Unit.* tag present -> empty string."""
        metric = {"tags": [_tag("Category.finance")]}
        assert extract_unit(metric) == ""

    def test_extract_unit_field_none(self):
        """unitOfMeasurement=None is treated as absent; the tag is used."""
        metric = {"unitOfMeasurement": None, "tags": [_tag("Unit.percent")]}
        assert extract_unit(metric) == "percent"

    def test_extract_unit_field_empty_string(self):
        """unitOfMeasurement='' is treated as absent; the tag is used."""
        metric = {"unitOfMeasurement": "", "tags": [_tag("Unit.GBP")]}
        assert extract_unit(metric) == "GBP"

    def test_extract_unit_preserves_case(self):
        """Unlike the metric type, a unit from the field keeps its case."""
        metric = {"unitOfMeasurement": "USD", "tags": []}
        assert extract_unit(metric) == "USD"


# ===========================================================================
# has_tag
# ===========================================================================


class TestHasTag:
    """Expectations for ``has_tag``: membership by tagFQN."""

    def test_has_tag_present(self):
        """True when some tag carries the requested FQN."""
        tag_list = [
            {"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"},
            {"tagFQN": "Tier.Tier1"},
        ]
        found = has_tag(tag_list, "AIAgent.FoundryAI")
        assert found is True

    def test_has_tag_absent(self):
        """False when no tag carries the requested FQN."""
        tag_list = [{"tagFQN": "Tier.Tier2"}]
        found = has_tag(tag_list, "AIAgent.FoundryAI")
        assert found is False

    def test_has_tag_empty_list(self):
        """False for an empty tag list."""
        found = has_tag([], "AIAgent.FoundryAI")
        assert found is False

    def test_has_tag_partial_match(self):
        """An FQN that merely starts with the requested one does not count."""
        tag_list = [{"tagFQN": "AIAgent.FoundryAI_v2"}]
        found = has_tag(tag_list, "AIAgent.FoundryAI")
        assert found is False


# ===========================================================================
# extract_tag_names
# ===========================================================================


class TestExtractTagNames:
    """Expectations for ``extract_tag_names`` over lists of tag dicts."""

    def test_extract_tag_names_with_name_field(self):
        """When 'name' is present it is the value reported."""
        tag_list = [
            {"name": "finance", "tagFQN": "Category.finance"},
            {"name": "Tier1", "tagFQN": "Tier.Tier1"},
        ]
        names = extract_tag_names(tag_list)
        assert names == ["finance", "Tier1"]

    def test_extract_tag_names_from_fqn(self):
        """Without 'name', the last dot-separated segment of tagFQN is used."""
        tag_list = [
            {"tagFQN": "Category.finance"},
            {"tagFQN": "Tier.Tier1"},
        ]
        names = extract_tag_names(tag_list)
        assert names == ["finance", "Tier1"]

    def test_extract_tag_names_empty(self):
        """Empty input -> empty list."""
        no_tags = []
        assert extract_tag_names(no_tags) == []

    def test_extract_tag_names_mixed(self):
        """Named and unnamed tags can be mixed; 'name' wins where present."""
        tag_list = [
            {"name": "explicit_name", "tagFQN": "Category.something_else"},
            {"tagFQN": "Dimension.country"},
        ]
        expected = ["explicit_name", "country"]
        assert extract_tag_names(tag_list) == expected

    def test_extract_tag_names_no_name_no_fqn(self):
        """A tag with neither 'name' nor 'tagFQN' contributes nothing."""
        # Presumably the extractor derives an empty name for such a tag and
        # drops it as falsy; only the resulting empty list is asserted here.
        tag_list = [{"description": "orphan tag"}]
        assert extract_tag_names(tag_list) == []


# NOTE(review): the next line is the collapsed head of TestStripHtml, kept
# verbatim because that definition continues on the following lines.
# =========================================================================== # strip_html # =========================================================================== class TestStripHtml: def test_strip_simple_tags(self): assert strip_html("
Hello world
") == "Hello world" def test_strip_nested_tags(self): result = strip_html("Bold and italic
") assert result == "Bold and italic" def test_decode_html_entities(self): result = strip_html("price & value") assert "price" in result assert "&" in result assert "value" in result assert " " not in result assert "&" not in result def test_list_items(self): result = strip_html('Business name: Live Deals
' 'Purpose:
' 'The Live deals metric measures the breadth ' 'of active, purchasable supply on Groupon.
' ) result = strip_html(html_desc) assert "<" not in result assert " " not in result assert "Live Deals" in result assert "Live deals" in result assert "purchasable supply" in result def test_collapses_whitespace(self): result = strip_html("too many spaces
") assert result == "too many spaces" def test_br_tags(self): result = strip_html("line1