Add catalog export: generate YAML metrics and tables from OpenMetadata
- New `connectors/openmetadata/transformer.py` with shared parsing logic for extracting categories, grain, dimensions, expressions from OM tags - New `src/catalog_export.py` script (python -m src.catalog_export) that fetches metrics/tables from OpenMetadata API and writes YAML files to /data/docs/metrics/ and /data/docs/tables/ for agent consumption - Refactor webapp/app.py to delegate to transformer (with inline fallback) - Add `fields` parameter to client.get_metrics() and get_metric_by_fqn() for fetching tags+owners in a single API call - Fix pre-existing mock bug in test_openmetadata_enricher (base_url) - 101 new tests (80 transformer + 21 export), all passing
This commit is contained in:
parent
e17dd85504
commit
985f47cdb7
7 changed files with 2085 additions and 66 deletions
|
|
@ -87,22 +87,24 @@ class OpenMetadataClient:
|
||||||
|
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
def get_metrics(self, limit: int = 100) -> List[Dict[str, Any]]:
|
def get_metrics(self, limit: int = 100, fields: str = "tags,owners") -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Fetch list of available metrics from OpenMetadata (Phase 2).
|
Fetch list of available metrics from OpenMetadata.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
limit: Maximum number of metrics to return
|
limit: Maximum number of metrics to return
|
||||||
|
fields: Comma-separated list of fields to include (e.g., "tags,owners")
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of metric dictionaries with:
|
List of metric dictionaries with:
|
||||||
- id, name, fullyQualifiedName
|
- id, name, fullyQualifiedName
|
||||||
- description
|
- description
|
||||||
- metricExpression: metric calculation SQL/formula
|
- metricExpression: metric calculation SQL/formula
|
||||||
- owners, tags
|
- owners, tags (when requested via fields)
|
||||||
"""
|
"""
|
||||||
params = {
|
params = {
|
||||||
"limit": limit,
|
"limit": limit,
|
||||||
|
"fields": fields,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self._client.get("/api/v1/metrics", params=params)
|
response = self._client.get("/api/v1/metrics", params=params)
|
||||||
|
|
@ -111,25 +113,27 @@ class OpenMetadataClient:
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return data.get("data", [])
|
return data.get("data", [])
|
||||||
|
|
||||||
def get_metric_by_fqn(self, fqn: str) -> Dict[str, Any]:
|
def get_metric_by_fqn(self, fqn: str, fields: str = "tags,owners") -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Fetch a specific metric by FQN from OpenMetadata.
|
Fetch a specific metric by FQN from OpenMetadata.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
fqn: Fully qualified name (e.g., "Active2 Customers")
|
fqn: Fully qualified name (e.g., "Active2 Customers")
|
||||||
|
fields: Comma-separated list of fields to include
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with metric metadata:
|
Dictionary with metric metadata:
|
||||||
- id, name, fullyQualifiedName
|
- id, name, fullyQualifiedName
|
||||||
- description, metricExpression
|
- description, metricExpression
|
||||||
- owners, tags
|
- owners, tags (when requested via fields)
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
httpx.HTTPStatusError: If request fails (non-2xx status)
|
httpx.HTTPStatusError: If request fails (non-2xx status)
|
||||||
"""
|
"""
|
||||||
url = f"/api/v1/metrics/name/{fqn}"
|
url = f"/api/v1/metrics/name/{fqn}"
|
||||||
|
params = {"fields": fields}
|
||||||
|
|
||||||
response = self._client.get(url)
|
response = self._client.get(url, params=params)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
|
||||||
392
connectors/openmetadata/transformer.py
Normal file
392
connectors/openmetadata/transformer.py
Normal file
|
|
@ -0,0 +1,392 @@
|
||||||
|
"""
|
||||||
|
OpenMetadata data transformer.
|
||||||
|
|
||||||
|
Shared logic for parsing OpenMetadata API responses into structured dicts
|
||||||
|
suitable for YAML export and webapp display. Used by:
|
||||||
|
- src/catalog_export.py (YAML file generation)
|
||||||
|
- webapp/app.py (metric list and detail display)
|
||||||
|
|
||||||
|
Extracts metadata from OpenMetadata tag conventions:
|
||||||
|
- MetricCategory.* or Category.* -> category
|
||||||
|
- Grain.* -> grain/granularity
|
||||||
|
- Dimension.* -> dimensions list
|
||||||
|
- MetricType.* -> metric type
|
||||||
|
- Unit.* -> unit of measurement
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_category(tags: List[Dict[str, Any]]) -> str:
|
||||||
|
"""
|
||||||
|
Extract metric category from OpenMetadata tags.
|
||||||
|
|
||||||
|
Looks for tagFQN prefixed with "MetricCategory." or "Category.".
|
||||||
|
Returns the first match found, or "general" as fallback.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tags: List of tag dicts from OpenMetadata (each with "tagFQN" key)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Category string (e.g., "finance", "marketing")
|
||||||
|
"""
|
||||||
|
for tag in tags:
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("MetricCategory."):
|
||||||
|
return tag_fqn.split(".", 1)[1]
|
||||||
|
if tag_fqn.startswith("Category."):
|
||||||
|
return tag_fqn.split(".", 1)[1]
|
||||||
|
return "general"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_grain(raw_metric: Dict[str, Any]) -> str:
|
||||||
|
"""
|
||||||
|
Extract metric granularity from OpenMetadata metric data.
|
||||||
|
|
||||||
|
Checks the "granularity" field first, then falls back to Grain.* tags.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Grain string (e.g., "monthly", "daily"), lowercase. Empty string if not found.
|
||||||
|
"""
|
||||||
|
grain = raw_metric.get("granularity", "") or ""
|
||||||
|
if grain:
|
||||||
|
return grain.lower()
|
||||||
|
|
||||||
|
for tag in raw_metric.get("tags", []):
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("Grain."):
|
||||||
|
return tag_fqn.split(".", 1)[1].lower()
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_dimensions(tags: List[Dict[str, Any]]) -> List[str]:
|
||||||
|
"""
|
||||||
|
Extract dimension names from OpenMetadata tags.
|
||||||
|
|
||||||
|
Looks for tagFQN prefixed with "Dimension.".
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tags: List of tag dicts from OpenMetadata
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of dimension names (e.g., ["economic_area", "merchant_country"])
|
||||||
|
"""
|
||||||
|
dimensions = []
|
||||||
|
for tag in tags:
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("Dimension."):
|
||||||
|
dimensions.append(tag_fqn.split(".", 1)[1])
|
||||||
|
return dimensions
|
||||||
|
|
||||||
|
|
||||||
|
def extract_expression(raw_metric: Dict[str, Any]) -> str:
|
||||||
|
"""
|
||||||
|
Extract metric SQL expression from OpenMetadata metric data.
|
||||||
|
|
||||||
|
Handles both dict format ({"expression": "..."}) and plain string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
SQL expression string, or empty string if not found.
|
||||||
|
"""
|
||||||
|
metric_expr = raw_metric.get("metricExpression", {})
|
||||||
|
if isinstance(metric_expr, dict):
|
||||||
|
return metric_expr.get("expression", "") or ""
|
||||||
|
if isinstance(metric_expr, str):
|
||||||
|
return metric_expr
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_owners(raw: Dict[str, Any]) -> List[str]:
|
||||||
|
"""
|
||||||
|
Extract owner names from OpenMetadata entity data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw: Raw entity dict with optional "owners" list
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of owner name strings
|
||||||
|
"""
|
||||||
|
names = []
|
||||||
|
for owner in raw.get("owners", []):
|
||||||
|
name = owner.get("name") or owner.get("displayName", "")
|
||||||
|
if name:
|
||||||
|
names.append(name)
|
||||||
|
return names
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metric_type(raw_metric: Dict[str, Any]) -> str:
|
||||||
|
"""
|
||||||
|
Extract metric type from OpenMetadata metric data.
|
||||||
|
|
||||||
|
Checks "metricType" field first, then MetricType.* tags.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Metric type string (e.g., "sum", "count"), lowercase.
|
||||||
|
"""
|
||||||
|
metric_type = raw_metric.get("metricType", "") or ""
|
||||||
|
if metric_type:
|
||||||
|
return metric_type.lower()
|
||||||
|
|
||||||
|
for tag in raw_metric.get("tags", []):
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("MetricType."):
|
||||||
|
return tag_fqn.split(".", 1)[1].lower()
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_unit(raw_metric: Dict[str, Any]) -> str:
|
||||||
|
"""
|
||||||
|
Extract unit of measurement from OpenMetadata metric data.
|
||||||
|
|
||||||
|
Checks "unitOfMeasurement" field first, then Unit.* tags.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Unit string (e.g., "USD", "count").
|
||||||
|
"""
|
||||||
|
unit = raw_metric.get("unitOfMeasurement", "") or ""
|
||||||
|
if unit:
|
||||||
|
return unit
|
||||||
|
|
||||||
|
for tag in raw_metric.get("tags", []):
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("Unit."):
|
||||||
|
return tag_fqn.split(".", 1)[1]
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tag_names(tags: List[Dict[str, Any]]) -> List[str]:
|
||||||
|
"""
|
||||||
|
Extract simple tag names from OpenMetadata tag list.
|
||||||
|
|
||||||
|
Uses "name" field if present, otherwise extracts last segment of "tagFQN".
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tags: List of tag dicts from OpenMetadata
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of tag name strings
|
||||||
|
"""
|
||||||
|
result = []
|
||||||
|
for tag in tags:
|
||||||
|
name = tag.get("name") or tag.get("tagFQN", "").split(".")[-1]
|
||||||
|
if name:
|
||||||
|
result.append(name)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_filename(name: str) -> str:
|
||||||
|
"""
|
||||||
|
Convert metric/entity name to safe filesystem name.
|
||||||
|
|
||||||
|
Replaces non-alphanumeric characters with underscores, collapses
|
||||||
|
consecutive underscores, strips leading/trailing underscores, lowercases.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Raw entity name (e.g., "M1 Operational Margin")
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Safe filename (e.g., "m1_operational_margin")
|
||||||
|
"""
|
||||||
|
safe = re.sub(r"[^a-zA-Z0-9]+", "_", name)
|
||||||
|
safe = re.sub(r"_+", "_", safe)
|
||||||
|
return safe.strip("_").lower()
|
||||||
|
|
||||||
|
|
||||||
|
def metric_to_yaml_dict(raw_metric: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Transform raw OpenMetadata metric into YAML-compatible dict.
|
||||||
|
|
||||||
|
Output format is compatible with MetricParser._structure_metric_data()
|
||||||
|
and can be written directly as YAML for Claude Code agent consumption.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: name, display_name, category, type, unit, grain,
|
||||||
|
time_column, table, expression, description, dimensions, notes, synonyms
|
||||||
|
"""
|
||||||
|
tags = raw_metric.get("tags", [])
|
||||||
|
name = raw_metric.get("name", "")
|
||||||
|
display_name = raw_metric.get("displayName", name)
|
||||||
|
fqn = raw_metric.get("fullyQualifiedName", "")
|
||||||
|
|
||||||
|
owner_names = extract_owners(raw_metric)
|
||||||
|
notes = []
|
||||||
|
if fqn:
|
||||||
|
notes.append(f"Source: OpenMetadata catalog (FQN: {fqn})")
|
||||||
|
if owner_names:
|
||||||
|
notes.append(f"Owners: {', '.join(owner_names)}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"name": sanitize_filename(name),
|
||||||
|
"display_name": display_name,
|
||||||
|
"category": extract_category(tags),
|
||||||
|
"type": extract_metric_type(raw_metric),
|
||||||
|
"unit": extract_unit(raw_metric),
|
||||||
|
"grain": extract_grain(raw_metric),
|
||||||
|
"time_column": "",
|
||||||
|
"table": "",
|
||||||
|
"expression": extract_expression(raw_metric),
|
||||||
|
"description": (raw_metric.get("description", "") or "").strip(),
|
||||||
|
"dimensions": extract_dimensions(tags),
|
||||||
|
"notes": notes,
|
||||||
|
"synonyms": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def metric_to_display_dict(raw_metric: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Parse raw OpenMetadata metric for metric list display in webapp.
|
||||||
|
|
||||||
|
Returns a lightweight dict for listing metrics (not full detail).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: name, display_name, description, grain, category, path
|
||||||
|
"""
|
||||||
|
fqn = raw_metric.get("fullyQualifiedName", "")
|
||||||
|
name = raw_metric.get("name", "")
|
||||||
|
display_name = raw_metric.get("displayName", name)
|
||||||
|
description = raw_metric.get("description", "") or ""
|
||||||
|
tags = raw_metric.get("tags", [])
|
||||||
|
|
||||||
|
return {
|
||||||
|
"name": name,
|
||||||
|
"display_name": display_name,
|
||||||
|
"description": description,
|
||||||
|
"grain": extract_grain(raw_metric),
|
||||||
|
"category": extract_category(tags),
|
||||||
|
"path": f"catalog:{fqn}",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def metric_to_detail_dict(raw_metric: Dict[str, Any], category_colors: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Convert raw OpenMetadata metric into MetricParser-compatible detail dict for modal display.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_metric: Raw metric dict from OpenMetadata API
|
||||||
|
category_colors: Optional mapping of category -> CSS color hex
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict matching MetricParser._structure_metric_data() output format
|
||||||
|
"""
|
||||||
|
if category_colors is None:
|
||||||
|
category_colors = {}
|
||||||
|
|
||||||
|
tags = raw_metric.get("tags", [])
|
||||||
|
name = raw_metric.get("name", "")
|
||||||
|
display_name = raw_metric.get("displayName", name)
|
||||||
|
description = raw_metric.get("description", "") or ""
|
||||||
|
category = extract_category(tags)
|
||||||
|
expression = extract_expression(raw_metric)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"name": name,
|
||||||
|
"display_name": display_name,
|
||||||
|
"category": category,
|
||||||
|
"category_color": category_colors.get(category, "#6B7280"),
|
||||||
|
"metadata": {
|
||||||
|
"type": extract_metric_type(raw_metric),
|
||||||
|
"unit": extract_unit(raw_metric),
|
||||||
|
"grain": extract_grain(raw_metric),
|
||||||
|
"time_column": "",
|
||||||
|
},
|
||||||
|
"overview": {
|
||||||
|
"description": description.strip(),
|
||||||
|
"key_insights": [],
|
||||||
|
},
|
||||||
|
"validation": None,
|
||||||
|
"dimensions": extract_dimensions(tags),
|
||||||
|
"notes": {
|
||||||
|
"all": [],
|
||||||
|
"key_insights": [],
|
||||||
|
},
|
||||||
|
"sql_examples": {
|
||||||
|
"expression": {
|
||||||
|
"title": "Metric Expression",
|
||||||
|
"query": expression,
|
||||||
|
"complexity": "simple",
|
||||||
|
}
|
||||||
|
} if expression else {},
|
||||||
|
"technical": {
|
||||||
|
"table": "",
|
||||||
|
"expression": expression,
|
||||||
|
"synonyms": [],
|
||||||
|
"data_sources": [],
|
||||||
|
},
|
||||||
|
"special_sections": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def table_to_yaml_dict(raw_table: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Transform raw OpenMetadata table response into YAML-compatible dict.
|
||||||
|
|
||||||
|
Extracts table description, column metadata, owners, tags, and tier.
|
||||||
|
Reuses parsing logic from CatalogEnricher._parse_table_response().
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_table: Raw table dict from OpenMetadata /api/v1/tables/name/{fqn}
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: name, fqn, description, owners, tags, tier, columns
|
||||||
|
"""
|
||||||
|
fqn = raw_table.get("fullyQualifiedName", "")
|
||||||
|
name = raw_table.get("name", "")
|
||||||
|
description = raw_table.get("description", "") or ""
|
||||||
|
tags = raw_table.get("tags", [])
|
||||||
|
|
||||||
|
# Parse columns
|
||||||
|
columns = []
|
||||||
|
for col in raw_table.get("columns", []):
|
||||||
|
col_entry = {
|
||||||
|
"name": col.get("name", ""),
|
||||||
|
"type": col.get("dataType", ""),
|
||||||
|
"description": (col.get("description", "") or "").strip(),
|
||||||
|
}
|
||||||
|
columns.append(col_entry)
|
||||||
|
|
||||||
|
# Parse tier from tags (Tier.Tier1 etc.) or extension
|
||||||
|
tier = None
|
||||||
|
extension = raw_table.get("extension", {})
|
||||||
|
if extension:
|
||||||
|
tier = extension.get("tier") or extension.get("Tier")
|
||||||
|
if not tier:
|
||||||
|
for tag in tags:
|
||||||
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
if tag_fqn.startswith("Tier."):
|
||||||
|
tier = tag_fqn.split(".", 1)[1]
|
||||||
|
break
|
||||||
|
|
||||||
|
return {
|
||||||
|
"name": name,
|
||||||
|
"fqn": fqn,
|
||||||
|
"description": description.strip(),
|
||||||
|
"owners": extract_owners(raw_table),
|
||||||
|
"tags": extract_tag_names(tags),
|
||||||
|
"tier": tier or "",
|
||||||
|
"columns": columns,
|
||||||
|
}
|
||||||
424
src/catalog_export.py
Normal file
424
src/catalog_export.py
Normal file
|
|
@ -0,0 +1,424 @@
|
||||||
|
"""
|
||||||
|
Catalog Export - generate YAML metric and table definitions from OpenMetadata.
|
||||||
|
|
||||||
|
Fetches metrics and table metadata from the OpenMetadata catalog API,
|
||||||
|
transforms them into YAML files compatible with the AI Data Analyst agent
|
||||||
|
(MetricParser format), and writes them to the docs directory.
|
||||||
|
|
||||||
|
Output structure:
|
||||||
|
/data/docs/metrics/{category}/{metric_name}.yml - individual metric files
|
||||||
|
/data/docs/metrics/metrics.yml - summary index
|
||||||
|
/data/docs/tables/{table_name}.yml - table column descriptions
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m src.catalog_export
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from config.loader import load_instance_config
|
||||||
|
from connectors.openmetadata.client import OpenMetadataClient
|
||||||
|
from connectors.openmetadata.transformer import (
|
||||||
|
extract_category,
|
||||||
|
extract_grain,
|
||||||
|
metric_to_yaml_dict,
|
||||||
|
sanitize_filename,
|
||||||
|
table_to_yaml_dict,
|
||||||
|
)
|
||||||
|
from src.config import Config
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Logging
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Constants
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
AUTO_GENERATED_MARKER = "# AUTO-GENERATED from OpenMetadata catalog"
|
||||||
|
CATALOG_SOURCE_LINE = "# Source: {url}"
|
||||||
|
GENERATED_LINE = "# Generated: {timestamp}"
|
||||||
|
DO_NOT_EDIT_LINE = "# DO NOT EDIT - changes will be overwritten on next catalog export"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_docs_dir() -> Path:
|
||||||
|
"""
|
||||||
|
Derive docs output directory from DATA_DIR environment variable.
|
||||||
|
|
||||||
|
DATA_DIR is typically /data/src_data -> docs dir is /data/docs.
|
||||||
|
Falls back to ./docs if DATA_DIR is not set.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to docs directory (created if needed)
|
||||||
|
"""
|
||||||
|
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
|
||||||
|
# /data/src_data -> /data/docs
|
||||||
|
docs_dir = data_dir.parent / "docs"
|
||||||
|
return docs_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _yaml_header(catalog_url: str, fqn: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Generate AUTO-GENERATED header comment for YAML files.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
catalog_url: OpenMetadata instance URL
|
||||||
|
fqn: Optional entity FQN for reference
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Multi-line comment string
|
||||||
|
"""
|
||||||
|
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
lines = [
|
||||||
|
AUTO_GENERATED_MARKER,
|
||||||
|
CATALOG_SOURCE_LINE.format(url=catalog_url),
|
||||||
|
GENERATED_LINE.format(timestamp=timestamp),
|
||||||
|
DO_NOT_EDIT_LINE,
|
||||||
|
]
|
||||||
|
if fqn:
|
||||||
|
lines.append(f"# FQN: {fqn}")
|
||||||
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def _is_auto_generated(file_path: Path) -> bool:
|
||||||
|
"""
|
||||||
|
Check if a YAML file was auto-generated by this script.
|
||||||
|
|
||||||
|
Reads the first line and checks for the AUTO-GENERATED marker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to YAML file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if file starts with AUTO-GENERATED marker
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
|
first_line = f.readline().strip()
|
||||||
|
return first_line == AUTO_GENERATED_MARKER
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def export_metrics(
|
||||||
|
client: OpenMetadataClient,
|
||||||
|
docs_dir: Path,
|
||||||
|
catalog_url: str,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Export metrics from OpenMetadata to YAML files.
|
||||||
|
|
||||||
|
For each metric:
|
||||||
|
1. Fetches all metrics from catalog API
|
||||||
|
2. Transforms each to YAML-compatible dict
|
||||||
|
3. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
|
||||||
|
4. Writes index file: {docs_dir}/metrics/metrics.yml
|
||||||
|
5. Cleans up stale auto-generated files
|
||||||
|
|
||||||
|
Args:
|
||||||
|
client: Initialized OpenMetadata API client
|
||||||
|
docs_dir: Base docs directory (e.g., /data/docs)
|
||||||
|
catalog_url: Catalog URL for header comments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of metrics exported
|
||||||
|
"""
|
||||||
|
metrics_dir = docs_dir / "metrics"
|
||||||
|
metrics_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Fetch all metrics with tags and owners
|
||||||
|
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
|
||||||
|
if not raw_metrics:
|
||||||
|
logger.warning("No metrics returned from catalog - preserving existing files")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
|
||||||
|
|
||||||
|
# Track which files we write (for cleanup)
|
||||||
|
written_files: set[Path] = set()
|
||||||
|
index_entries: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
|
for raw in raw_metrics:
|
||||||
|
try:
|
||||||
|
yaml_dict = metric_to_yaml_dict(raw)
|
||||||
|
fqn = raw.get("fullyQualifiedName", "")
|
||||||
|
category = yaml_dict["category"]
|
||||||
|
metric_name = yaml_dict["name"]
|
||||||
|
|
||||||
|
if not metric_name:
|
||||||
|
logger.warning(f"Skipping metric with empty name: {fqn}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create category directory
|
||||||
|
cat_dir = metrics_dir / category
|
||||||
|
cat_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Write individual metric YAML
|
||||||
|
file_path = cat_dir / f"{metric_name}.yml"
|
||||||
|
header = _yaml_header(catalog_url, fqn)
|
||||||
|
# Wrap in list to match MetricParser expected format
|
||||||
|
yaml_content = yaml.dump(
|
||||||
|
[yaml_dict],
|
||||||
|
default_flow_style=False,
|
||||||
|
allow_unicode=True,
|
||||||
|
sort_keys=False,
|
||||||
|
)
|
||||||
|
file_path.write_text(header + "\n" + yaml_content, encoding="utf-8")
|
||||||
|
written_files.add(file_path)
|
||||||
|
|
||||||
|
# Add to index
|
||||||
|
index_entries.append({
|
||||||
|
"name": metric_name,
|
||||||
|
"display_name": yaml_dict["display_name"],
|
||||||
|
"category": category,
|
||||||
|
"grain": yaml_dict["grain"],
|
||||||
|
"file": f"{category}/{metric_name}.yml",
|
||||||
|
})
|
||||||
|
|
||||||
|
logger.debug(f"Wrote metric: {file_path}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to export metric {raw.get('name', '?')}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Write metrics index
|
||||||
|
if index_entries:
|
||||||
|
_write_metrics_index(metrics_dir, index_entries, catalog_url)
|
||||||
|
written_files.add(metrics_dir / "metrics.yml")
|
||||||
|
|
||||||
|
# Cleanup stale auto-generated metric files
|
||||||
|
_cleanup_stale_files(metrics_dir, written_files, "*.yml")
|
||||||
|
|
||||||
|
logger.info(f"Exported {len(index_entries)} metrics to {metrics_dir}")
|
||||||
|
return len(index_entries)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_metrics_index(
|
||||||
|
metrics_dir: Path,
|
||||||
|
entries: List[Dict[str, Any]],
|
||||||
|
catalog_url: str,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Write metrics.yml summary index file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
metrics_dir: Path to metrics directory
|
||||||
|
entries: List of index entry dicts (name, display_name, category, grain, file)
|
||||||
|
catalog_url: Catalog URL for header
|
||||||
|
"""
|
||||||
|
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
header_lines = [
|
||||||
|
AUTO_GENERATED_MARKER,
|
||||||
|
f"# Index of all available metrics from OpenMetadata catalog ({catalog_url})",
|
||||||
|
f"# Generated: {timestamp}",
|
||||||
|
DO_NOT_EDIT_LINE,
|
||||||
|
"#",
|
||||||
|
"# Usage: Read specific metric file for full details before calculating.",
|
||||||
|
'# Example: cat server/docs/metrics/finance/m1.yml',
|
||||||
|
]
|
||||||
|
|
||||||
|
index_data = {"metrics": entries}
|
||||||
|
yaml_content = yaml.dump(
|
||||||
|
index_data,
|
||||||
|
default_flow_style=False,
|
||||||
|
allow_unicode=True,
|
||||||
|
sort_keys=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
index_path = metrics_dir / "metrics.yml"
|
||||||
|
index_path.write_text("\n".join(header_lines) + "\n\n" + yaml_content, encoding="utf-8")
|
||||||
|
logger.debug(f"Wrote metrics index: {index_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def export_tables(
|
||||||
|
client: OpenMetadataClient,
|
||||||
|
config: Config,
|
||||||
|
docs_dir: Path,
|
||||||
|
catalog_url: str,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Export table metadata from OpenMetadata to YAML files.
|
||||||
|
|
||||||
|
For each table defined in data_description.md:
|
||||||
|
1. Derives the OpenMetadata FQN
|
||||||
|
2. Fetches table metadata (columns, owners, tags, description)
|
||||||
|
3. Transforms to YAML dict
|
||||||
|
4. Writes to {docs_dir}/tables/{table_name}.yml
|
||||||
|
|
||||||
|
Args:
|
||||||
|
client: Initialized OpenMetadata API client
|
||||||
|
config: Application config with table definitions
|
||||||
|
docs_dir: Base docs directory
|
||||||
|
catalog_url: Catalog URL for header comments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of tables exported
|
||||||
|
"""
|
||||||
|
tables_dir = docs_dir / "tables"
|
||||||
|
tables_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
written_files: set[Path] = set()
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for table_config in config.tables:
|
||||||
|
try:
|
||||||
|
# Derive FQN: explicit override or auto-derive
|
||||||
|
fqn = table_config.catalog_fqn or f"bigquery.{table_config.id}"
|
||||||
|
|
||||||
|
logger.debug(f"Fetching table metadata: {fqn}")
|
||||||
|
raw_table = client.get_table(fqn)
|
||||||
|
|
||||||
|
yaml_dict = table_to_yaml_dict(raw_table)
|
||||||
|
|
||||||
|
# Write table YAML
|
||||||
|
file_path = tables_dir / f"{table_config.name}.yml"
|
||||||
|
header = _yaml_header(catalog_url, fqn)
|
||||||
|
yaml_content = yaml.dump(
|
||||||
|
yaml_dict,
|
||||||
|
default_flow_style=False,
|
||||||
|
allow_unicode=True,
|
||||||
|
sort_keys=False,
|
||||||
|
)
|
||||||
|
file_path.write_text(header + "\n" + yaml_content, encoding="utf-8")
|
||||||
|
written_files.add(file_path)
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
logger.info(f"Exported table: {table_config.name} ({len(yaml_dict.get('columns', []))} columns)")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to export table {table_config.name}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Cleanup stale auto-generated table files
|
||||||
|
_cleanup_stale_files(tables_dir, written_files, "*.yml")
|
||||||
|
|
||||||
|
logger.info(f"Exported {count} tables to {tables_dir}")
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_stale_files(
|
||||||
|
directory: Path,
|
||||||
|
current_files: set[Path],
|
||||||
|
glob_pattern: str,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Remove auto-generated files that are no longer in the current export set.
|
||||||
|
|
||||||
|
Only removes files with the AUTO-GENERATED marker in the first line.
|
||||||
|
Manual files (without the marker) are never touched.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
directory: Directory to scan
|
||||||
|
current_files: Set of file paths written in this export run
|
||||||
|
glob_pattern: Glob pattern to match (e.g., "*.yml")
|
||||||
|
"""
|
||||||
|
for file_path in directory.rglob(glob_pattern):
|
||||||
|
if file_path in current_files:
|
||||||
|
continue
|
||||||
|
if file_path.name == "metrics.yml" and file_path.parent == directory:
|
||||||
|
# Index file managed separately
|
||||||
|
continue
|
||||||
|
if _is_auto_generated(file_path):
|
||||||
|
logger.info(f"Removing stale auto-generated file: {file_path}")
|
||||||
|
file_path.unlink()
|
||||||
|
|
||||||
|
# Remove empty parent directory (category dir)
|
||||||
|
parent = file_path.parent
|
||||||
|
if parent != directory and not any(parent.iterdir()):
|
||||||
|
parent.rmdir()
|
||||||
|
logger.debug(f"Removed empty directory: {parent}")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_sync_state(docs_dir: Path, metrics_count: int, tables_count: int) -> None:
|
||||||
|
"""
|
||||||
|
Write sync state file for tracking last export.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs_dir: Base docs directory
|
||||||
|
metrics_count: Number of metrics exported
|
||||||
|
tables_count: Number of tables exported
|
||||||
|
"""
|
||||||
|
state = {
|
||||||
|
"last_export": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"metrics_count": metrics_count,
|
||||||
|
"tables_count": tables_count,
|
||||||
|
}
|
||||||
|
state_path = docs_dir / ".catalog_sync_state.json"
|
||||||
|
state_path.write_text(json.dumps(state, indent=2), encoding="utf-8")
|
||||||
|
logger.debug(f"Wrote sync state: {state_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
"""
|
||||||
|
Main entry point for catalog export.
|
||||||
|
|
||||||
|
Loads instance config, initializes OpenMetadata client,
|
||||||
|
exports metrics and tables to YAML, writes sync state.
|
||||||
|
Exits gracefully (code 0) on catalog unavailability.
|
||||||
|
"""
|
||||||
|
logger.info("=== Catalog Export starting ===")
|
||||||
|
|
||||||
|
try:
|
||||||
|
instance_config = load_instance_config()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to load instance config: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Get OpenMetadata config
|
||||||
|
om_config = instance_config.get("openmetadata", {})
|
||||||
|
catalog_url = om_config.get("url", "").strip()
|
||||||
|
token = om_config.get("token", "").strip()
|
||||||
|
|
||||||
|
if not catalog_url or not token:
|
||||||
|
logger.warning("OpenMetadata not configured (url or token missing) - skipping export")
|
||||||
|
return
|
||||||
|
|
||||||
|
docs_dir = _get_docs_dir()
|
||||||
|
logger.info(f"Output directory: {docs_dir}")
|
||||||
|
|
||||||
|
# Initialize client
|
||||||
|
try:
|
||||||
|
client = OpenMetadataClient(base_url=catalog_url, token=token)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to initialize OpenMetadata client: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Export metrics
|
||||||
|
metrics_count = export_metrics(client, docs_dir, catalog_url)
|
||||||
|
|
||||||
|
# Export tables
|
||||||
|
try:
|
||||||
|
config = Config()
|
||||||
|
tables_count = export_tables(client, config, docs_dir, catalog_url)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Table export skipped (config error): {e}")
|
||||||
|
tables_count = 0
|
||||||
|
|
||||||
|
# Write sync state
|
||||||
|
_write_sync_state(docs_dir, metrics_count, tables_count)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"=== Catalog Export complete: {metrics_count} metrics, {tables_count} tables ==="
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Catalog export failed: {e}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
439
tests/test_catalog_export.py
Normal file
439
tests/test_catalog_export.py
Normal file
|
|
@ -0,0 +1,439 @@
|
||||||
|
"""
|
||||||
|
Tests for src/catalog_export.py
|
||||||
|
|
||||||
|
Covers YAML header generation, auto-generated file detection, metric/table
|
||||||
|
export to YAML files, stale file cleanup, sync state writing, and docs dir
|
||||||
|
resolution.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from src.catalog_export import (
|
||||||
|
AUTO_GENERATED_MARKER,
|
||||||
|
_get_docs_dir,
|
||||||
|
_is_auto_generated,
|
||||||
|
_write_sync_state,
|
||||||
|
_yaml_header,
|
||||||
|
export_metrics,
|
||||||
|
export_tables,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers / fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class FakeTableConfig:
|
||||||
|
"""Minimal stand-in for src.config.TableConfig with attributes used by
|
||||||
|
export_tables()."""
|
||||||
|
|
||||||
|
name: str
|
||||||
|
id: str
|
||||||
|
catalog_fqn: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _make_raw_metric(
|
||||||
|
name: str = "Total Revenue",
|
||||||
|
fqn: str = "metrics.total_revenue",
|
||||||
|
category_tag: str = "MetricCategory.finance",
|
||||||
|
description: str = "Sum of order revenue",
|
||||||
|
expression: str = "SUM(grs_revenue_plan_local)",
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Build a realistic raw metric dict as returned by the catalog API."""
|
||||||
|
return {
|
||||||
|
"id": f"id-{name}",
|
||||||
|
"name": name,
|
||||||
|
"fullyQualifiedName": fqn,
|
||||||
|
"displayName": name,
|
||||||
|
"description": description,
|
||||||
|
"metricExpression": {"expression": expression},
|
||||||
|
"metricType": "sum",
|
||||||
|
"unitOfMeasurement": "USD",
|
||||||
|
"granularity": "monthly",
|
||||||
|
"tags": [{"tagFQN": category_tag}],
|
||||||
|
"owners": [{"name": "Data Team"}],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _make_raw_table(
|
||||||
|
name: str = "order_economics",
|
||||||
|
fqn: str = "bigquery.project.dataset.order_economics",
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Build a realistic raw table dict as returned by the catalog API."""
|
||||||
|
return {
|
||||||
|
"id": "tbl-1",
|
||||||
|
"name": name,
|
||||||
|
"fullyQualifiedName": fqn,
|
||||||
|
"description": "Order-level economics table",
|
||||||
|
"columns": [
|
||||||
|
{"name": "order_id", "dataType": "STRING", "description": "PK"},
|
||||||
|
{"name": "grs_revenue", "dataType": "FLOAT", "description": "Revenue"},
|
||||||
|
],
|
||||||
|
"tags": [{"tagFQN": "Tier.Tier1"}],
|
||||||
|
"owners": [{"name": "Finance"}],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_client():
|
||||||
|
"""Return a MagicMock that behaves like OpenMetadataClient."""
|
||||||
|
client = MagicMock()
|
||||||
|
client.get_metrics.return_value = [_make_raw_metric()]
|
||||||
|
client.get_table.return_value = _make_raw_table()
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_config():
|
||||||
|
"""Return a mock Config with a single table."""
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.tables = [
|
||||||
|
FakeTableConfig(
|
||||||
|
name="order_economics",
|
||||||
|
id="prj.dataset.order_economics",
|
||||||
|
catalog_fqn="bigquery.prj.dataset.order_economics",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
return cfg
|
||||||
|
|
||||||
|
|
||||||
|
CATALOG_URL = "https://catalog.example.com"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 1. _yaml_header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestYamlHeader:
|
||||||
|
def test_yaml_header_contains_marker(self):
|
||||||
|
"""_yaml_header() output starts with AUTO-GENERATED marker."""
|
||||||
|
header = _yaml_header(CATALOG_URL)
|
||||||
|
assert header.startswith(AUTO_GENERATED_MARKER)
|
||||||
|
|
||||||
|
def test_yaml_header_contains_url(self):
|
||||||
|
"""Header includes catalog URL."""
|
||||||
|
header = _yaml_header(CATALOG_URL)
|
||||||
|
assert CATALOG_URL in header
|
||||||
|
|
||||||
|
def test_yaml_header_contains_fqn(self):
|
||||||
|
"""Header includes FQN when provided."""
|
||||||
|
fqn = "metrics.total_revenue"
|
||||||
|
header = _yaml_header(CATALOG_URL, fqn=fqn)
|
||||||
|
assert f"# FQN: {fqn}" in header
|
||||||
|
|
||||||
|
def test_yaml_header_no_fqn_line_when_empty(self):
|
||||||
|
"""Header omits FQN line when fqn argument is empty."""
|
||||||
|
header = _yaml_header(CATALOG_URL, fqn="")
|
||||||
|
assert "# FQN:" not in header
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 2. _is_auto_generated
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsAutoGenerated:
|
||||||
|
def test_is_auto_generated_true(self, tmp_path: Path):
|
||||||
|
"""File starting with AUTO-GENERATED marker returns True."""
|
||||||
|
f = tmp_path / "metric.yml"
|
||||||
|
f.write_text(AUTO_GENERATED_MARKER + "\nsome: data\n")
|
||||||
|
assert _is_auto_generated(f) is True
|
||||||
|
|
||||||
|
def test_is_auto_generated_false(self, tmp_path: Path):
|
||||||
|
"""File without marker returns False."""
|
||||||
|
f = tmp_path / "manual.yml"
|
||||||
|
f.write_text("# Manually written metric\nname: custom\n")
|
||||||
|
assert _is_auto_generated(f) is False
|
||||||
|
|
||||||
|
def test_is_auto_generated_missing_file(self, tmp_path: Path):
|
||||||
|
"""Non-existent file returns False."""
|
||||||
|
f = tmp_path / "does_not_exist.yml"
|
||||||
|
assert _is_auto_generated(f) is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3. export_metrics
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportMetrics:
|
||||||
|
def test_export_metrics_writes_files(self, tmp_path: Path, mock_client):
|
||||||
|
"""Creates category dirs and .yml files for each metric."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
count = export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
assert count == 1
|
||||||
|
# Category directory should exist
|
||||||
|
category_dir = docs / "metrics" / "finance"
|
||||||
|
assert category_dir.is_dir()
|
||||||
|
|
||||||
|
# Metric file should exist
|
||||||
|
metric_files = list(category_dir.glob("*.yml"))
|
||||||
|
assert len(metric_files) == 1
|
||||||
|
assert metric_files[0].name == "total_revenue.yml"
|
||||||
|
|
||||||
|
def test_export_metrics_writes_index(self, tmp_path: Path, mock_client):
|
||||||
|
"""Creates metrics.yml index with correct structure."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
index_path = docs / "metrics" / "metrics.yml"
|
||||||
|
assert index_path.exists()
|
||||||
|
|
||||||
|
# Skip header comments, parse YAML body
|
||||||
|
content = index_path.read_text()
|
||||||
|
# Remove all comment lines to parse pure YAML
|
||||||
|
yaml_lines = [
|
||||||
|
line for line in content.splitlines() if not line.startswith("#")
|
||||||
|
]
|
||||||
|
parsed = yaml.safe_load("\n".join(yaml_lines))
|
||||||
|
|
||||||
|
assert "metrics" in parsed
|
||||||
|
assert len(parsed["metrics"]) == 1
|
||||||
|
entry = parsed["metrics"][0]
|
||||||
|
assert entry["name"] == "total_revenue"
|
||||||
|
assert entry["category"] == "finance"
|
||||||
|
assert "file" in entry
|
||||||
|
|
||||||
|
def test_export_metrics_yaml_parseable(self, tmp_path: Path, mock_client):
|
||||||
|
"""Output metric YAML is valid and parseable by yaml.safe_load."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
metric_file = docs / "metrics" / "finance" / "total_revenue.yml"
|
||||||
|
content = metric_file.read_text()
|
||||||
|
|
||||||
|
# Strip header comments before parsing
|
||||||
|
yaml_lines = [
|
||||||
|
line for line in content.splitlines() if not line.startswith("#")
|
||||||
|
]
|
||||||
|
parsed = yaml.safe_load("\n".join(yaml_lines))
|
||||||
|
|
||||||
|
assert isinstance(parsed, list)
|
||||||
|
assert len(parsed) == 1
|
||||||
|
assert parsed[0]["name"] == "total_revenue"
|
||||||
|
assert parsed[0]["expression"] == "SUM(grs_revenue_plan_local)"
|
||||||
|
|
||||||
|
def test_export_metrics_preserves_manual_files(
|
||||||
|
self, tmp_path: Path, mock_client
|
||||||
|
):
|
||||||
|
"""Files without AUTO-GENERATED marker are never deleted."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
metrics_dir = docs / "metrics" / "custom"
|
||||||
|
metrics_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
manual = metrics_dir / "hand_crafted.yml"
|
||||||
|
manual.write_text("# My custom metric\nname: hand_crafted\n")
|
||||||
|
|
||||||
|
export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
# Manual file must survive
|
||||||
|
assert manual.exists()
|
||||||
|
assert manual.read_text().startswith("# My custom metric")
|
||||||
|
|
||||||
|
def test_export_metrics_cleans_stale(self, tmp_path: Path, mock_client):
|
||||||
|
"""Old auto-generated files removed when metric no longer in catalog."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
stale_dir = docs / "metrics" / "old_category"
|
||||||
|
stale_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
stale = stale_dir / "gone_metric.yml"
|
||||||
|
stale.write_text(AUTO_GENERATED_MARKER + "\nname: gone\n")
|
||||||
|
|
||||||
|
export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
# Stale auto-generated file should be removed
|
||||||
|
assert not stale.exists()
|
||||||
|
|
||||||
|
def test_export_metrics_zero_results_preserves(
|
||||||
|
self, tmp_path: Path, mock_client
|
||||||
|
):
|
||||||
|
"""When API returns 0 metrics, existing files are untouched."""
|
||||||
|
mock_client.get_metrics.return_value = []
|
||||||
|
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
metrics_dir = docs / "metrics" / "finance"
|
||||||
|
metrics_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
existing = metrics_dir / "existing.yml"
|
||||||
|
existing.write_text(AUTO_GENERATED_MARKER + "\nname: existing\n")
|
||||||
|
|
||||||
|
count = export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
assert count == 0
|
||||||
|
# Existing file untouched (early return before cleanup)
|
||||||
|
assert existing.exists()
|
||||||
|
|
||||||
|
def test_export_metrics_multiple(self, tmp_path: Path, mock_client):
|
||||||
|
"""Multiple metrics across categories are all exported."""
|
||||||
|
mock_client.get_metrics.return_value = [
|
||||||
|
_make_raw_metric(
|
||||||
|
name="Total Revenue",
|
||||||
|
fqn="metrics.total_revenue",
|
||||||
|
category_tag="MetricCategory.finance",
|
||||||
|
),
|
||||||
|
_make_raw_metric(
|
||||||
|
name="Active Users",
|
||||||
|
fqn="metrics.active_users",
|
||||||
|
category_tag="MetricCategory.product",
|
||||||
|
expression="COUNT(DISTINCT user_id)",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
count = export_metrics(mock_client, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
assert count == 2
|
||||||
|
assert (docs / "metrics" / "finance" / "total_revenue.yml").exists()
|
||||||
|
assert (docs / "metrics" / "product" / "active_users.yml").exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 4. export_tables
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExportTables:
|
||||||
|
def test_export_tables_writes_files(
|
||||||
|
self, tmp_path: Path, mock_client, mock_config
|
||||||
|
):
|
||||||
|
"""Creates table YAML with columns."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
count = export_tables(mock_client, mock_config, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
assert count == 1
|
||||||
|
|
||||||
|
table_file = docs / "tables" / "order_economics.yml"
|
||||||
|
assert table_file.exists()
|
||||||
|
|
||||||
|
# Parse YAML content (skip header comments)
|
||||||
|
content = table_file.read_text()
|
||||||
|
yaml_lines = [
|
||||||
|
line for line in content.splitlines() if not line.startswith("#")
|
||||||
|
]
|
||||||
|
parsed = yaml.safe_load("\n".join(yaml_lines))
|
||||||
|
|
||||||
|
assert parsed["name"] == "order_economics"
|
||||||
|
assert len(parsed["columns"]) == 2
|
||||||
|
assert parsed["columns"][0]["name"] == "order_id"
|
||||||
|
|
||||||
|
def test_export_tables_handles_api_error(
|
||||||
|
self, tmp_path: Path, mock_client, mock_config
|
||||||
|
):
|
||||||
|
"""Continues on per-table errors, exports remaining tables."""
|
||||||
|
# Two tables: first will fail, second succeeds
|
||||||
|
mock_config.tables = [
|
||||||
|
FakeTableConfig(
|
||||||
|
name="broken_table",
|
||||||
|
id="prj.dataset.broken",
|
||||||
|
catalog_fqn="bigquery.prj.dataset.broken",
|
||||||
|
),
|
||||||
|
FakeTableConfig(
|
||||||
|
name="good_table",
|
||||||
|
id="prj.dataset.good",
|
||||||
|
catalog_fqn="bigquery.prj.dataset.good",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def side_effect(fqn):
|
||||||
|
if "broken" in fqn:
|
||||||
|
raise RuntimeError("API unreachable")
|
||||||
|
return _make_raw_table(name="good_table", fqn=fqn)
|
||||||
|
|
||||||
|
mock_client.get_table.side_effect = side_effect
|
||||||
|
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
count = export_tables(mock_client, mock_config, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
# Only the good table should succeed
|
||||||
|
assert count == 1
|
||||||
|
assert not (docs / "tables" / "broken_table.yml").exists()
|
||||||
|
assert (docs / "tables" / "good_table.yml").exists()
|
||||||
|
|
||||||
|
def test_export_tables_uses_catalog_fqn(
|
||||||
|
self, tmp_path: Path, mock_client, mock_config
|
||||||
|
):
|
||||||
|
"""Uses explicit catalog_fqn when set on table config."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
export_tables(mock_client, mock_config, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
mock_client.get_table.assert_called_once_with(
|
||||||
|
"bigquery.prj.dataset.order_economics"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_export_tables_derives_fqn_from_id(
|
||||||
|
self, tmp_path: Path, mock_client
|
||||||
|
):
|
||||||
|
"""When catalog_fqn is None, derives FQN as 'bigquery.{id}'."""
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.tables = [
|
||||||
|
FakeTableConfig(
|
||||||
|
name="my_table",
|
||||||
|
id="project.dataset.my_table",
|
||||||
|
catalog_fqn=None,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
export_tables(mock_client, cfg, docs, CATALOG_URL)
|
||||||
|
|
||||||
|
mock_client.get_table.assert_called_once_with(
|
||||||
|
"bigquery.project.dataset.my_table"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 5. _write_sync_state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestWriteSyncState:
|
||||||
|
def test_write_sync_state(self, tmp_path: Path):
|
||||||
|
"""Writes .catalog_sync_state.json with counts and timestamp."""
|
||||||
|
docs = tmp_path / "docs"
|
||||||
|
docs.mkdir()
|
||||||
|
|
||||||
|
_write_sync_state(docs, metrics_count=5, tables_count=2)
|
||||||
|
|
||||||
|
state_path = docs / ".catalog_sync_state.json"
|
||||||
|
assert state_path.exists()
|
||||||
|
|
||||||
|
state = json.loads(state_path.read_text())
|
||||||
|
assert state["metrics_count"] == 5
|
||||||
|
assert state["tables_count"] == 2
|
||||||
|
assert "last_export" in state
|
||||||
|
|
||||||
|
# Timestamp should be ISO format
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
datetime.fromisoformat(state["last_export"]) # raises on bad format
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 6. _get_docs_dir
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetDocsDir:
|
||||||
|
def test_get_docs_dir_from_env(self, monkeypatch):
|
||||||
|
"""DATA_DIR env var is used to derive docs directory."""
|
||||||
|
monkeypatch.setenv("DATA_DIR", "/data/src_data")
|
||||||
|
result = _get_docs_dir()
|
||||||
|
assert result == Path("/data/docs")
|
||||||
|
|
||||||
|
def test_get_docs_dir_default(self, monkeypatch):
|
||||||
|
"""Defaults to ./data/../docs when DATA_DIR is not set."""
|
||||||
|
monkeypatch.delenv("DATA_DIR", raising=False)
|
||||||
|
result = _get_docs_dir()
|
||||||
|
# ./data -> parent is "." -> docs is "./docs"
|
||||||
|
assert result == Path("docs")
|
||||||
|
|
@ -235,7 +235,8 @@ def test_derive_fqn_explicit_override():
|
||||||
|
|
||||||
def test_parse_table_response(sample_om_response):
|
def test_parse_table_response(sample_om_response):
|
||||||
"""Test parsing OpenMetadata table response."""
|
"""Test parsing OpenMetadata table response."""
|
||||||
with patch("connectors.openmetadata.enricher.OpenMetadataClient"):
|
with patch("connectors.openmetadata.enricher.OpenMetadataClient") as mock_client_cls:
|
||||||
|
mock_client_cls.return_value.base_url = "https://catalog.example.com"
|
||||||
enricher = CatalogEnricher(
|
enricher = CatalogEnricher(
|
||||||
{
|
{
|
||||||
"openmetadata": {
|
"openmetadata": {
|
||||||
|
|
|
||||||
783
tests/test_openmetadata_transformer.py
Normal file
783
tests/test_openmetadata_transformer.py
Normal file
|
|
@ -0,0 +1,783 @@
|
||||||
|
"""
|
||||||
|
Tests for OpenMetadata transformer.
|
||||||
|
|
||||||
|
All transformer functions are pure (dict in -> dict/str/list out), so no mocks needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from connectors.openmetadata.transformer import (
|
||||||
|
extract_category,
|
||||||
|
extract_dimensions,
|
||||||
|
extract_expression,
|
||||||
|
extract_grain,
|
||||||
|
extract_metric_type,
|
||||||
|
extract_owners,
|
||||||
|
extract_tag_names,
|
||||||
|
extract_unit,
|
||||||
|
metric_to_detail_dict,
|
||||||
|
metric_to_display_dict,
|
||||||
|
metric_to_yaml_dict,
|
||||||
|
sanitize_filename,
|
||||||
|
table_to_yaml_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper: build a tag dict the way OpenMetadata returns them
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _tag(fqn: str, name: str = "") -> dict:
|
||||||
|
"""Build a minimal OpenMetadata tag dict."""
|
||||||
|
tag = {"tagFQN": fqn}
|
||||||
|
if name:
|
||||||
|
tag["name"] = name
|
||||||
|
return tag
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_category
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractCategory:
|
||||||
|
def test_extract_category_from_metric_category_tag(self):
|
||||||
|
"""MetricCategory.finance tag -> 'finance'."""
|
||||||
|
tags = [_tag("MetricCategory.finance")]
|
||||||
|
assert extract_category(tags) == "finance"
|
||||||
|
|
||||||
|
def test_extract_category_from_category_tag(self):
|
||||||
|
"""Category.marketing tag -> 'marketing'."""
|
||||||
|
tags = [_tag("Category.marketing")]
|
||||||
|
assert extract_category(tags) == "marketing"
|
||||||
|
|
||||||
|
def test_extract_category_default(self):
|
||||||
|
"""No matching tags -> 'general'."""
|
||||||
|
tags = [_tag("SomeOther.tag"), _tag("Tier.Tier1")]
|
||||||
|
assert extract_category(tags) == "general"
|
||||||
|
|
||||||
|
def test_extract_category_empty_tags(self):
|
||||||
|
"""Empty tag list -> 'general'."""
|
||||||
|
assert extract_category([]) == "general"
|
||||||
|
|
||||||
|
def test_extract_category_metric_category_takes_priority(self):
|
||||||
|
"""MetricCategory.* is checked before Category.* (iteration order)."""
|
||||||
|
tags = [_tag("MetricCategory.finance"), _tag("Category.marketing")]
|
||||||
|
assert extract_category(tags) == "finance"
|
||||||
|
|
||||||
|
def test_extract_category_category_fallback_if_no_metric_category(self):
|
||||||
|
"""Category.* is used when MetricCategory.* is absent."""
|
||||||
|
tags = [_tag("Tier.Tier1"), _tag("Category.operations")]
|
||||||
|
assert extract_category(tags) == "operations"
|
||||||
|
|
||||||
|
def test_extract_category_with_nested_dot_in_value(self):
|
||||||
|
"""MetricCategory.sub.area -> 'sub.area' (split on first dot only)."""
|
||||||
|
tags = [_tag("MetricCategory.sub.area")]
|
||||||
|
assert extract_category(tags) == "sub.area"
|
||||||
|
|
||||||
|
def test_extract_category_missing_tagfqn_key(self):
|
||||||
|
"""Tag dict without tagFQN key is safely skipped."""
|
||||||
|
tags = [{"name": "orphan"}]
|
||||||
|
assert extract_category(tags) == "general"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_grain
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractGrain:
|
||||||
|
def test_extract_grain_from_field(self):
|
||||||
|
"""granularity field takes priority over tags."""
|
||||||
|
raw = {
|
||||||
|
"granularity": "Daily",
|
||||||
|
"tags": [_tag("Grain.monthly")],
|
||||||
|
}
|
||||||
|
assert extract_grain(raw) == "daily"
|
||||||
|
|
||||||
|
def test_extract_grain_from_tag(self):
|
||||||
|
"""Grain.monthly tag used when granularity field is absent."""
|
||||||
|
raw = {"tags": [_tag("Grain.monthly")]}
|
||||||
|
assert extract_grain(raw) == "monthly"
|
||||||
|
|
||||||
|
def test_extract_grain_empty(self):
|
||||||
|
"""No grain info -> empty string."""
|
||||||
|
raw = {"tags": [_tag("Category.finance")]}
|
||||||
|
assert extract_grain(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_grain_no_tags_no_field(self):
|
||||||
|
"""Completely empty metric -> empty string."""
|
||||||
|
assert extract_grain({}) == ""
|
||||||
|
|
||||||
|
def test_extract_grain_field_is_none(self):
|
||||||
|
"""granularity=None should fall through to tags."""
|
||||||
|
raw = {"granularity": None, "tags": [_tag("Grain.weekly")]}
|
||||||
|
assert extract_grain(raw) == "weekly"
|
||||||
|
|
||||||
|
def test_extract_grain_field_is_empty_string(self):
|
||||||
|
"""granularity='' should fall through to tags."""
|
||||||
|
raw = {"granularity": "", "tags": [_tag("Grain.yearly")]}
|
||||||
|
assert extract_grain(raw) == "yearly"
|
||||||
|
|
||||||
|
def test_extract_grain_tag_lowercased(self):
|
||||||
|
"""Grain tag value is lowercased."""
|
||||||
|
raw = {"tags": [_tag("Grain.QUARTERLY")]}
|
||||||
|
assert extract_grain(raw) == "quarterly"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_dimensions
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractDimensions:
|
||||||
|
def test_extract_dimensions(self):
|
||||||
|
"""Multiple Dimension.* tags -> list of dimension names."""
|
||||||
|
tags = [
|
||||||
|
_tag("Dimension.economic_area"),
|
||||||
|
_tag("Dimension.country"),
|
||||||
|
_tag("Category.finance"),
|
||||||
|
]
|
||||||
|
result = extract_dimensions(tags)
|
||||||
|
assert result == ["economic_area", "country"]
|
||||||
|
|
||||||
|
def test_extract_dimensions_empty(self):
|
||||||
|
"""No Dimension tags -> empty list."""
|
||||||
|
tags = [_tag("Category.finance"), _tag("Tier.Tier1")]
|
||||||
|
assert extract_dimensions(tags) == []
|
||||||
|
|
||||||
|
def test_extract_dimensions_empty_list(self):
|
||||||
|
"""Empty tag list -> empty list."""
|
||||||
|
assert extract_dimensions([]) == []
|
||||||
|
|
||||||
|
def test_extract_dimensions_preserves_order(self):
|
||||||
|
"""Dimensions are returned in tag order."""
|
||||||
|
tags = [_tag("Dimension.z_last"), _tag("Dimension.a_first")]
|
||||||
|
assert extract_dimensions(tags) == ["z_last", "a_first"]
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_expression
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractExpression:
|
||||||
|
def test_extract_expression_dict(self):
|
||||||
|
"""metricExpression as dict with 'expression' key."""
|
||||||
|
raw = {"metricExpression": {"expression": "SUM(revenue_usd)"}}
|
||||||
|
assert extract_expression(raw) == "SUM(revenue_usd)"
|
||||||
|
|
||||||
|
def test_extract_expression_string(self):
|
||||||
|
"""metricExpression as plain string."""
|
||||||
|
raw = {"metricExpression": "COUNT(DISTINCT order_id)"}
|
||||||
|
assert extract_expression(raw) == "COUNT(DISTINCT order_id)"
|
||||||
|
|
||||||
|
def test_extract_expression_empty(self):
|
||||||
|
"""No metricExpression -> empty string."""
|
||||||
|
raw = {"name": "some_metric"}
|
||||||
|
assert extract_expression(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_expression_dict_missing_key(self):
|
||||||
|
"""Dict without 'expression' key -> empty string."""
|
||||||
|
raw = {"metricExpression": {"formula": "x + y"}}
|
||||||
|
assert extract_expression(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_expression_dict_none_value(self):
|
||||||
|
"""Dict with expression=None -> empty string."""
|
||||||
|
raw = {"metricExpression": {"expression": None}}
|
||||||
|
assert extract_expression(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_expression_none(self):
|
||||||
|
"""metricExpression=None -> empty string (default {} from .get())."""
|
||||||
|
raw = {"metricExpression": None}
|
||||||
|
# None is not dict and not str, so returns ""
|
||||||
|
assert extract_expression(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_expression_empty_dict(self):
|
||||||
|
"""metricExpression={} -> empty string."""
|
||||||
|
raw = {"metricExpression": {}}
|
||||||
|
assert extract_expression(raw) == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_owners
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractOwners:
|
||||||
|
def test_extract_owners(self):
|
||||||
|
"""owners list with name/displayName."""
|
||||||
|
raw = {
|
||||||
|
"owners": [
|
||||||
|
{"name": "alice", "displayName": "Alice Smith"},
|
||||||
|
{"name": "bob"},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
assert extract_owners(raw) == ["alice", "bob"]
|
||||||
|
|
||||||
|
def test_extract_owners_display_name_fallback(self):
|
||||||
|
"""displayName is used when name is absent."""
|
||||||
|
raw = {
|
||||||
|
"owners": [
|
||||||
|
{"displayName": "Charlie Brown"},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
assert extract_owners(raw) == ["Charlie Brown"]
|
||||||
|
|
||||||
|
def test_extract_owners_empty(self):
|
||||||
|
"""No owners key -> empty list."""
|
||||||
|
raw = {"name": "something"}
|
||||||
|
assert extract_owners(raw) == []
|
||||||
|
|
||||||
|
def test_extract_owners_empty_list(self):
|
||||||
|
"""Empty owners list -> empty list."""
|
||||||
|
raw = {"owners": []}
|
||||||
|
assert extract_owners(raw) == []
|
||||||
|
|
||||||
|
def test_extract_owners_skips_empty_names(self):
|
||||||
|
"""Owners with no name or displayName are skipped."""
|
||||||
|
raw = {
|
||||||
|
"owners": [
|
||||||
|
{"email": "no-name@example.com"},
|
||||||
|
{"name": "", "displayName": ""},
|
||||||
|
{"name": "valid_user"},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
assert extract_owners(raw) == ["valid_user"]
|
||||||
|
|
||||||
|
def test_extract_owners_name_none_falls_to_display_name(self):
|
||||||
|
"""name=None falls back to displayName."""
|
||||||
|
raw = {
|
||||||
|
"owners": [{"name": None, "displayName": "Fallback Name"}]
|
||||||
|
}
|
||||||
|
assert extract_owners(raw) == ["Fallback Name"]
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_metric_type
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractMetricType:
|
||||||
|
def test_extract_metric_type_from_field(self):
|
||||||
|
"""metricType field takes priority."""
|
||||||
|
raw = {
|
||||||
|
"metricType": "SUM",
|
||||||
|
"tags": [_tag("MetricType.count")],
|
||||||
|
}
|
||||||
|
assert extract_metric_type(raw) == "sum"
|
||||||
|
|
||||||
|
def test_extract_metric_type_from_tag(self):
|
||||||
|
"""MetricType.* tag used when field is absent."""
|
||||||
|
raw = {"tags": [_tag("MetricType.ratio")]}
|
||||||
|
assert extract_metric_type(raw) == "ratio"
|
||||||
|
|
||||||
|
def test_extract_metric_type_empty(self):
|
||||||
|
"""No metric type info -> empty string."""
|
||||||
|
raw = {"tags": [_tag("Category.finance")]}
|
||||||
|
assert extract_metric_type(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_metric_type_field_none(self):
|
||||||
|
"""metricType=None falls through to tags."""
|
||||||
|
raw = {"metricType": None, "tags": [_tag("MetricType.average")]}
|
||||||
|
assert extract_metric_type(raw) == "average"
|
||||||
|
|
||||||
|
def test_extract_metric_type_lowercased(self):
|
||||||
|
"""Metric type from field is lowercased."""
|
||||||
|
raw = {"metricType": "COUNT", "tags": []}
|
||||||
|
assert extract_metric_type(raw) == "count"
|
||||||
|
|
||||||
|
def test_extract_metric_type_tag_lowercased(self):
|
||||||
|
"""Metric type from tag is lowercased."""
|
||||||
|
raw = {"tags": [_tag("MetricType.PERCENTAGE")]}
|
||||||
|
assert extract_metric_type(raw) == "percentage"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_unit
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractUnit:
|
||||||
|
def test_extract_unit_from_field(self):
|
||||||
|
"""unitOfMeasurement field takes priority."""
|
||||||
|
raw = {
|
||||||
|
"unitOfMeasurement": "USD",
|
||||||
|
"tags": [_tag("Unit.EUR")],
|
||||||
|
}
|
||||||
|
assert extract_unit(raw) == "USD"
|
||||||
|
|
||||||
|
def test_extract_unit_from_tag(self):
|
||||||
|
"""Unit.* tag used when field is absent."""
|
||||||
|
raw = {"tags": [_tag("Unit.count")]}
|
||||||
|
assert extract_unit(raw) == "count"
|
||||||
|
|
||||||
|
def test_extract_unit_empty(self):
|
||||||
|
"""No unit info -> empty string."""
|
||||||
|
raw = {"tags": [_tag("Category.finance")]}
|
||||||
|
assert extract_unit(raw) == ""
|
||||||
|
|
||||||
|
def test_extract_unit_field_none(self):
|
||||||
|
"""unitOfMeasurement=None falls through to tags."""
|
||||||
|
raw = {"unitOfMeasurement": None, "tags": [_tag("Unit.percent")]}
|
||||||
|
assert extract_unit(raw) == "percent"
|
||||||
|
|
||||||
|
def test_extract_unit_field_empty_string(self):
|
||||||
|
"""unitOfMeasurement='' falls through to tags."""
|
||||||
|
raw = {"unitOfMeasurement": "", "tags": [_tag("Unit.GBP")]}
|
||||||
|
assert extract_unit(raw) == "GBP"
|
||||||
|
|
||||||
|
def test_extract_unit_preserves_case(self):
|
||||||
|
"""Unit value from field is NOT lowercased (unlike metric_type)."""
|
||||||
|
raw = {"unitOfMeasurement": "USD", "tags": []}
|
||||||
|
assert extract_unit(raw) == "USD"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# extract_tag_names
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestExtractTagNames:
|
||||||
|
def test_extract_tag_names_with_name_field(self):
|
||||||
|
"""Tags with 'name' field use that value."""
|
||||||
|
tags = [
|
||||||
|
{"name": "finance", "tagFQN": "Category.finance"},
|
||||||
|
{"name": "Tier1", "tagFQN": "Tier.Tier1"},
|
||||||
|
]
|
||||||
|
assert extract_tag_names(tags) == ["finance", "Tier1"]
|
||||||
|
|
||||||
|
def test_extract_tag_names_from_fqn(self):
|
||||||
|
"""Tags without 'name' extract last segment of tagFQN."""
|
||||||
|
tags = [
|
||||||
|
{"tagFQN": "Category.finance"},
|
||||||
|
{"tagFQN": "Tier.Tier1"},
|
||||||
|
]
|
||||||
|
assert extract_tag_names(tags) == ["finance", "Tier1"]
|
||||||
|
|
||||||
|
def test_extract_tag_names_empty(self):
|
||||||
|
"""Empty tag list -> empty list."""
|
||||||
|
assert extract_tag_names([]) == []
|
||||||
|
|
||||||
|
def test_extract_tag_names_mixed(self):
|
||||||
|
"""Mix of tags with and without 'name' field."""
|
||||||
|
tags = [
|
||||||
|
{"name": "explicit_name", "tagFQN": "Category.something_else"},
|
||||||
|
{"tagFQN": "Dimension.country"},
|
||||||
|
]
|
||||||
|
result = extract_tag_names(tags)
|
||||||
|
assert result == ["explicit_name", "country"]
|
||||||
|
|
||||||
|
def test_extract_tag_names_no_name_no_fqn(self):
|
||||||
|
"""Tag without name or tagFQN is skipped (empty string)."""
|
||||||
|
tags = [{"description": "orphan tag"}]
|
||||||
|
# tagFQN defaults to "" -> split(".")[-1] is "" -> falsy, skipped
|
||||||
|
assert extract_tag_names(tags) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# sanitize_filename
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestSanitizeFilename:
|
||||||
|
def test_sanitize_filename(self):
|
||||||
|
"""Spaces and mixed case -> underscores and lowercase."""
|
||||||
|
assert sanitize_filename("M1 Operational Margin") == "m1_operational_margin"
|
||||||
|
|
||||||
|
def test_sanitize_filename_special_chars(self):
|
||||||
|
"""Special characters replaced with underscores."""
|
||||||
|
assert sanitize_filename("Revenue (USD) - Net") == "revenue_usd_net"
|
||||||
|
|
||||||
|
def test_sanitize_filename_multiple_underscores_collapsed(self):
|
||||||
|
"""Consecutive underscores are collapsed."""
|
||||||
|
assert sanitize_filename("foo---bar___baz") == "foo_bar_baz"
|
||||||
|
|
||||||
|
def test_sanitize_filename_leading_trailing_stripped(self):
|
||||||
|
"""Leading and trailing underscores are stripped."""
|
||||||
|
assert sanitize_filename("__hello__") == "hello"
|
||||||
|
|
||||||
|
def test_sanitize_filename_already_clean(self):
|
||||||
|
"""Already clean name passes through unchanged."""
|
||||||
|
assert sanitize_filename("total_revenue") == "total_revenue"
|
||||||
|
|
||||||
|
def test_sanitize_filename_numbers(self):
|
||||||
|
"""Numbers are preserved."""
|
||||||
|
assert sanitize_filename("M1+VFM Margin 2024") == "m1_vfm_margin_2024"
|
||||||
|
|
||||||
|
def test_sanitize_filename_empty_string(self):
|
||||||
|
"""Empty string -> empty string."""
|
||||||
|
assert sanitize_filename("") == ""
|
||||||
|
|
||||||
|
def test_sanitize_filename_only_special_chars(self):
|
||||||
|
"""String of only special chars -> empty string."""
|
||||||
|
assert sanitize_filename("@#$%") == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# metric_to_yaml_dict
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestMetricToYamlDict:
|
||||||
|
def test_metric_to_yaml_dict(self):
|
||||||
|
"""Full transformation with all fields populated."""
|
||||||
|
raw = {
|
||||||
|
"name": "M1 Operational Margin",
|
||||||
|
"displayName": "M1 Operational Margin",
|
||||||
|
"fullyQualifiedName": "catalog.metrics.m1_margin",
|
||||||
|
"description": " Gross margin after operational costs ",
|
||||||
|
"granularity": "Monthly",
|
||||||
|
"metricType": "ratio",
|
||||||
|
"unitOfMeasurement": "USD",
|
||||||
|
"metricExpression": {"expression": "SUM(m1_margin_usd)"},
|
||||||
|
"tags": [
|
||||||
|
_tag("MetricCategory.finance"),
|
||||||
|
_tag("Dimension.economic_area"),
|
||||||
|
_tag("Dimension.country"),
|
||||||
|
],
|
||||||
|
"owners": [
|
||||||
|
{"name": "alice", "displayName": "Alice Smith"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = metric_to_yaml_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "m1_operational_margin"
|
||||||
|
assert result["display_name"] == "M1 Operational Margin"
|
||||||
|
assert result["category"] == "finance"
|
||||||
|
assert result["type"] == "ratio"
|
||||||
|
assert result["unit"] == "USD"
|
||||||
|
assert result["grain"] == "monthly"
|
||||||
|
assert result["time_column"] == ""
|
||||||
|
assert result["table"] == ""
|
||||||
|
assert result["expression"] == "SUM(m1_margin_usd)"
|
||||||
|
assert result["description"] == "Gross margin after operational costs"
|
||||||
|
assert result["dimensions"] == ["economic_area", "country"]
|
||||||
|
assert result["synonyms"] == []
|
||||||
|
# Notes contain FQN and owner info
|
||||||
|
assert any("catalog.metrics.m1_margin" in n for n in result["notes"])
|
||||||
|
assert any("alice" in n for n in result["notes"])
|
||||||
|
|
||||||
|
def test_metric_to_yaml_dict_minimal(self):
|
||||||
|
"""Minimal metric with empty/missing fields."""
|
||||||
|
raw = {"name": "simple"}
|
||||||
|
result = metric_to_yaml_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "simple"
|
||||||
|
assert result["display_name"] == "simple"
|
||||||
|
assert result["category"] == "general"
|
||||||
|
assert result["type"] == ""
|
||||||
|
assert result["unit"] == ""
|
||||||
|
assert result["grain"] == ""
|
||||||
|
assert result["expression"] == ""
|
||||||
|
assert result["description"] == ""
|
||||||
|
assert result["dimensions"] == []
|
||||||
|
assert result["synonyms"] == []
|
||||||
|
# No FQN -> no source note; no owners -> no owners note
|
||||||
|
assert result["notes"] == []
|
||||||
|
|
||||||
|
def test_metric_to_yaml_dict_notes_with_fqn_only(self):
|
||||||
|
"""Notes include FQN source but no owners when owners missing."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"fullyQualifiedName": "catalog.test",
|
||||||
|
}
|
||||||
|
result = metric_to_yaml_dict(raw)
|
||||||
|
assert len(result["notes"]) == 1
|
||||||
|
assert "FQN: catalog.test" in result["notes"][0]
|
||||||
|
|
||||||
|
def test_metric_to_yaml_dict_description_stripped(self):
|
||||||
|
"""Description whitespace is stripped."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"description": "\n Some description with spaces \n",
|
||||||
|
}
|
||||||
|
result = metric_to_yaml_dict(raw)
|
||||||
|
assert result["description"] == "Some description with spaces"
|
||||||
|
|
||||||
|
def test_metric_to_yaml_dict_description_none(self):
|
||||||
|
"""description=None -> empty string."""
|
||||||
|
raw = {"name": "test", "description": None}
|
||||||
|
result = metric_to_yaml_dict(raw)
|
||||||
|
assert result["description"] == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# metric_to_display_dict
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestMetricToDisplayDict:
|
||||||
|
def test_metric_to_display_dict(self):
|
||||||
|
"""Full display dict with all fields."""
|
||||||
|
raw = {
|
||||||
|
"name": "total_revenue",
|
||||||
|
"displayName": "Total Revenue",
|
||||||
|
"fullyQualifiedName": "catalog.metrics.total_revenue",
|
||||||
|
"description": "Total revenue in USD",
|
||||||
|
"granularity": "Daily",
|
||||||
|
"tags": [_tag("MetricCategory.finance")],
|
||||||
|
}
|
||||||
|
result = metric_to_display_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "total_revenue"
|
||||||
|
assert result["display_name"] == "Total Revenue"
|
||||||
|
assert result["description"] == "Total revenue in USD"
|
||||||
|
assert result["grain"] == "daily"
|
||||||
|
assert result["category"] == "finance"
|
||||||
|
assert result["path"] == "catalog:catalog.metrics.total_revenue"
|
||||||
|
|
||||||
|
def test_metric_to_display_dict_minimal(self):
|
||||||
|
"""Minimal metric produces valid display dict."""
|
||||||
|
raw = {"name": "bare"}
|
||||||
|
result = metric_to_display_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "bare"
|
||||||
|
assert result["display_name"] == "bare"
|
||||||
|
assert result["description"] == ""
|
||||||
|
assert result["grain"] == ""
|
||||||
|
assert result["category"] == "general"
|
||||||
|
assert result["path"] == "catalog:"
|
||||||
|
|
||||||
|
def test_metric_to_display_dict_display_name_fallback(self):
|
||||||
|
"""displayName falls back to name when absent."""
|
||||||
|
raw = {"name": "revenue_net"}
|
||||||
|
assert metric_to_display_dict(raw)["display_name"] == "revenue_net"
|
||||||
|
|
||||||
|
def test_metric_to_display_dict_description_none(self):
|
||||||
|
"""description=None -> empty string."""
|
||||||
|
raw = {"name": "test", "description": None}
|
||||||
|
assert metric_to_display_dict(raw)["description"] == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# metric_to_detail_dict
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestMetricToDetailDict:
|
||||||
|
def _full_raw_metric(self) -> dict:
|
||||||
|
"""Build a fully-populated raw metric for reuse."""
|
||||||
|
return {
|
||||||
|
"name": "m1_margin",
|
||||||
|
"displayName": "M1 Margin",
|
||||||
|
"fullyQualifiedName": "catalog.metrics.m1_margin",
|
||||||
|
"description": "M1 operational margin in USD",
|
||||||
|
"granularity": "Monthly",
|
||||||
|
"metricType": "ratio",
|
||||||
|
"unitOfMeasurement": "USD",
|
||||||
|
"metricExpression": {"expression": "SUM(m1_margin_usd)"},
|
||||||
|
"tags": [
|
||||||
|
_tag("MetricCategory.finance"),
|
||||||
|
_tag("Dimension.economic_area"),
|
||||||
|
_tag("Dimension.country"),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict(self):
|
||||||
|
"""Full detail dict with all sections populated."""
|
||||||
|
raw = self._full_raw_metric()
|
||||||
|
result = metric_to_detail_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "m1_margin"
|
||||||
|
assert result["display_name"] == "M1 Margin"
|
||||||
|
assert result["category"] == "finance"
|
||||||
|
# Default color when no category_colors provided
|
||||||
|
assert result["category_color"] == "#6B7280"
|
||||||
|
|
||||||
|
# metadata section
|
||||||
|
assert result["metadata"]["type"] == "ratio"
|
||||||
|
assert result["metadata"]["unit"] == "USD"
|
||||||
|
assert result["metadata"]["grain"] == "monthly"
|
||||||
|
assert result["metadata"]["time_column"] == ""
|
||||||
|
|
||||||
|
# overview section
|
||||||
|
assert result["overview"]["description"] == "M1 operational margin in USD"
|
||||||
|
assert result["overview"]["key_insights"] == []
|
||||||
|
|
||||||
|
# dimensions
|
||||||
|
assert result["dimensions"] == ["economic_area", "country"]
|
||||||
|
|
||||||
|
# sql_examples (expression present)
|
||||||
|
assert "expression" in result["sql_examples"]
|
||||||
|
assert result["sql_examples"]["expression"]["query"] == "SUM(m1_margin_usd)"
|
||||||
|
assert result["sql_examples"]["expression"]["title"] == "Metric Expression"
|
||||||
|
assert result["sql_examples"]["expression"]["complexity"] == "simple"
|
||||||
|
|
||||||
|
# technical
|
||||||
|
assert result["technical"]["expression"] == "SUM(m1_margin_usd)"
|
||||||
|
assert result["technical"]["table"] == ""
|
||||||
|
assert result["technical"]["synonyms"] == []
|
||||||
|
assert result["technical"]["data_sources"] == []
|
||||||
|
|
||||||
|
# other sections
|
||||||
|
assert result["validation"] is None
|
||||||
|
assert result["notes"] == {"all": [], "key_insights": []}
|
||||||
|
assert result["special_sections"] == {}
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict_with_colors(self):
|
||||||
|
"""category_colors parameter maps category to color."""
|
||||||
|
raw = self._full_raw_metric()
|
||||||
|
colors = {"finance": "#10B981", "marketing": "#F59E0B"}
|
||||||
|
result = metric_to_detail_dict(raw, category_colors=colors)
|
||||||
|
|
||||||
|
assert result["category_color"] == "#10B981"
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict_color_fallback(self):
|
||||||
|
"""Unknown category falls back to default gray."""
|
||||||
|
raw = self._full_raw_metric()
|
||||||
|
colors = {"marketing": "#F59E0B"}
|
||||||
|
result = metric_to_detail_dict(raw, category_colors=colors)
|
||||||
|
|
||||||
|
assert result["category_color"] == "#6B7280"
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict_no_expression(self):
|
||||||
|
"""sql_examples is empty dict when no expression."""
|
||||||
|
raw = {"name": "test", "tags": []}
|
||||||
|
result = metric_to_detail_dict(raw)
|
||||||
|
|
||||||
|
assert result["sql_examples"] == {}
|
||||||
|
assert result["technical"]["expression"] == ""
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict_minimal(self):
|
||||||
|
"""Minimal metric produces valid detail dict with all sections."""
|
||||||
|
raw = {"name": "bare"}
|
||||||
|
result = metric_to_detail_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "bare"
|
||||||
|
assert result["display_name"] == "bare"
|
||||||
|
assert result["category"] == "general"
|
||||||
|
assert result["category_color"] == "#6B7280"
|
||||||
|
assert result["metadata"]["type"] == ""
|
||||||
|
assert result["metadata"]["unit"] == ""
|
||||||
|
assert result["metadata"]["grain"] == ""
|
||||||
|
assert result["overview"]["description"] == ""
|
||||||
|
assert result["dimensions"] == []
|
||||||
|
assert result["sql_examples"] == {}
|
||||||
|
|
||||||
|
def test_metric_to_detail_dict_description_stripped(self):
|
||||||
|
"""Description whitespace is stripped in detail dict."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"description": " leading and trailing spaces ",
|
||||||
|
"tags": [],
|
||||||
|
}
|
||||||
|
result = metric_to_detail_dict(raw)
|
||||||
|
assert result["overview"]["description"] == "leading and trailing spaces"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# table_to_yaml_dict
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestTableToYamlDict:
|
||||||
|
def test_table_to_yaml_dict(self):
|
||||||
|
"""Full table with columns, owners, tags, tier."""
|
||||||
|
raw = {
|
||||||
|
"name": "order_economics",
|
||||||
|
"fullyQualifiedName": "bigquery.prj.dataset.order_economics",
|
||||||
|
"description": " Order-level economics data ",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"name": "order_id",
|
||||||
|
"dataType": "STRING",
|
||||||
|
"description": "Unique order identifier",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "revenue_usd",
|
||||||
|
"dataType": "FLOAT64",
|
||||||
|
"description": " Revenue in USD ",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "created_at",
|
||||||
|
"dataType": "TIMESTAMP",
|
||||||
|
"description": None,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"tags": [
|
||||||
|
{"name": "FoundryAI", "tagFQN": "AIAgent.FoundryAI"},
|
||||||
|
{"tagFQN": "Tier.Tier1"},
|
||||||
|
],
|
||||||
|
"owners": [
|
||||||
|
{"name": "data_team", "displayName": "Data Team"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "order_economics"
|
||||||
|
assert result["fqn"] == "bigquery.prj.dataset.order_economics"
|
||||||
|
assert result["description"] == "Order-level economics data"
|
||||||
|
assert result["owners"] == ["data_team"]
|
||||||
|
assert result["tags"] == ["FoundryAI", "Tier1"]
|
||||||
|
assert result["tier"] == "Tier1"
|
||||||
|
|
||||||
|
# Columns
|
||||||
|
assert len(result["columns"]) == 3
|
||||||
|
assert result["columns"][0] == {
|
||||||
|
"name": "order_id",
|
||||||
|
"type": "STRING",
|
||||||
|
"description": "Unique order identifier",
|
||||||
|
}
|
||||||
|
assert result["columns"][1] == {
|
||||||
|
"name": "revenue_usd",
|
||||||
|
"type": "FLOAT64",
|
||||||
|
"description": "Revenue in USD",
|
||||||
|
}
|
||||||
|
# description=None -> empty string after strip
|
||||||
|
assert result["columns"][2]["description"] == ""
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_minimal(self):
|
||||||
|
"""Minimal table with empty fields."""
|
||||||
|
raw = {"name": "empty_table"}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
|
||||||
|
assert result["name"] == "empty_table"
|
||||||
|
assert result["fqn"] == ""
|
||||||
|
assert result["description"] == ""
|
||||||
|
assert result["owners"] == []
|
||||||
|
assert result["tags"] == []
|
||||||
|
assert result["tier"] == ""
|
||||||
|
assert result["columns"] == []
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_tier_from_extension(self):
|
||||||
|
"""Tier extracted from extension field (priority over tags)."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"extension": {"tier": "Gold"},
|
||||||
|
"tags": [{"tagFQN": "Tier.Silver"}],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["tier"] == "Gold"
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_tier_from_extension_capital(self):
|
||||||
|
"""Tier extracted from extension with capital 'Tier' key."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"extension": {"Tier": "Platinum"},
|
||||||
|
"tags": [],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["tier"] == "Platinum"
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_tier_from_tag_fallback(self):
|
||||||
|
"""Tier from tag when extension is absent."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"tags": [{"tagFQN": "Tier.Tier2"}],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["tier"] == "Tier2"
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_no_tier(self):
|
||||||
|
"""No tier info -> empty string."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"tags": [{"tagFQN": "Category.finance"}],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["tier"] == ""
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_column_missing_fields(self):
|
||||||
|
"""Columns with missing fields get empty defaults."""
|
||||||
|
raw = {
|
||||||
|
"name": "test",
|
||||||
|
"columns": [{}],
|
||||||
|
}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["columns"] == [
|
||||||
|
{"name": "", "type": "", "description": ""},
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_table_to_yaml_dict_description_none(self):
|
||||||
|
"""description=None -> empty string."""
|
||||||
|
raw = {"name": "test", "description": None}
|
||||||
|
result = table_to_yaml_dict(raw)
|
||||||
|
assert result["description"] == ""
|
||||||
|
|
@ -62,6 +62,16 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
MetricParser = None
|
MetricParser = None
|
||||||
|
|
||||||
|
# Shared OpenMetadata transformer (catalog -> dict)
|
||||||
|
try:
|
||||||
|
from connectors.openmetadata.transformer import (
|
||||||
|
metric_to_detail_dict as _transformer_metric_detail,
|
||||||
|
metric_to_display_dict as _transformer_metric_display,
|
||||||
|
)
|
||||||
|
_TRANSFORMER_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
_TRANSFORMER_AVAILABLE = False
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
|
|
@ -645,31 +655,29 @@ def _parse_om_metric(raw_metric: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Parse raw OpenMetadata metric dict into format for metric list display.
|
Parse raw OpenMetadata metric dict into format for metric list display.
|
||||||
|
|
||||||
Extracts category, grain from tags with standard prefixes:
|
Delegates to shared transformer module for consistent parsing across
|
||||||
- Category: tagFQN like "MetricCategory.finance" or "Category.marketing"
|
webapp and catalog export.
|
||||||
- Grain: tagFQN like "Grain.monthly"
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
raw_metric: Raw metric dict from OpenMetadata (id, fullyQualifiedName, description, tags, etc.)
|
raw_metric: Raw metric dict from OpenMetadata
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict with keys: name, display_name, description, grain, path
|
Dict with keys: name, display_name, description, grain, category, path
|
||||||
(path = "catalog:{fullyQualifiedName}" for JS routing)
|
|
||||||
"""
|
"""
|
||||||
|
if _TRANSFORMER_AVAILABLE:
|
||||||
|
return _transformer_metric_display(raw_metric)
|
||||||
|
|
||||||
|
# Inline fallback if transformer module not available
|
||||||
fqn = raw_metric.get("fullyQualifiedName", "")
|
fqn = raw_metric.get("fullyQualifiedName", "")
|
||||||
name = raw_metric.get("name", "")
|
name = raw_metric.get("name", "")
|
||||||
display_name = raw_metric.get("displayName", name)
|
display_name = raw_metric.get("displayName", name)
|
||||||
description = raw_metric.get("description", "") or ""
|
description = raw_metric.get("description", "") or ""
|
||||||
|
|
||||||
# Extract category from tags, grain from granularity field
|
|
||||||
tags = raw_metric.get("tags", [])
|
tags = raw_metric.get("tags", [])
|
||||||
category = "general"
|
category = "general"
|
||||||
grain = raw_metric.get("granularity", "") or ""
|
grain = raw_metric.get("granularity", "") or ""
|
||||||
|
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
tag_fqn = tag.get("tagFQN", "")
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
|
||||||
# Extract category from MetricCategory.* or Category.* tags
|
|
||||||
if tag_fqn.startswith("MetricCategory."):
|
if tag_fqn.startswith("MetricCategory."):
|
||||||
category = tag_fqn.split(".", 1)[1]
|
category = tag_fqn.split(".", 1)[1]
|
||||||
elif tag_fqn.startswith("Category."):
|
elif tag_fqn.startswith("Category."):
|
||||||
|
|
@ -679,9 +687,9 @@ def _parse_om_metric(raw_metric: dict) -> dict:
|
||||||
"name": name,
|
"name": name,
|
||||||
"display_name": display_name,
|
"display_name": display_name,
|
||||||
"description": description,
|
"description": description,
|
||||||
"grain": grain.lower() if grain else "", # Normalize to lowercase
|
"grain": grain.lower() if grain else "",
|
||||||
"category": category,
|
"category": category,
|
||||||
"path": f"catalog:{fqn}", # Special prefix for JS routing
|
"path": f"catalog:{fqn}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -761,8 +769,7 @@ def _build_om_metric_detail(raw_metric: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Convert raw OpenMetadata metric into MetricParser-compatible JSON for modal.
|
Convert raw OpenMetadata metric into MetricParser-compatible JSON for modal.
|
||||||
|
|
||||||
Maps OpenMetadata fields to MetricParser structure (name, display_name, category, metadata, etc.).
|
Delegates to shared transformer module for consistent parsing.
|
||||||
Extracts type, unit, grain from OpenMetadata fields (metricType, unitOfMeasurement, granularity).
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
raw_metric: Raw metric dict from OpenMetadata
|
raw_metric: Raw metric dict from OpenMetadata
|
||||||
|
|
@ -770,12 +777,17 @@ def _build_om_metric_detail(raw_metric: dict) -> dict:
|
||||||
Returns:
|
Returns:
|
||||||
Dict matching MetricParser._structure_metric_data() format
|
Dict matching MetricParser._structure_metric_data() format
|
||||||
"""
|
"""
|
||||||
|
category_colors = MetricParser.CATEGORY_COLORS if MetricParser else {}
|
||||||
|
|
||||||
|
if _TRANSFORMER_AVAILABLE:
|
||||||
|
return _transformer_metric_detail(raw_metric, category_colors=category_colors)
|
||||||
|
|
||||||
|
# Inline fallback if transformer module not available
|
||||||
fqn = raw_metric.get("fullyQualifiedName", "")
|
fqn = raw_metric.get("fullyQualifiedName", "")
|
||||||
name = raw_metric.get("name", "")
|
name = raw_metric.get("name", "")
|
||||||
display_name = raw_metric.get("displayName", name)
|
display_name = raw_metric.get("displayName", name)
|
||||||
description = raw_metric.get("description", "") or ""
|
description = raw_metric.get("description", "") or ""
|
||||||
|
|
||||||
# OpenMetadata uses metricExpression instead of expression
|
|
||||||
expression = ""
|
expression = ""
|
||||||
metric_expr = raw_metric.get("metricExpression", {})
|
metric_expr = raw_metric.get("metricExpression", {})
|
||||||
if isinstance(metric_expr, dict):
|
if isinstance(metric_expr, dict):
|
||||||
|
|
@ -783,67 +795,31 @@ def _build_om_metric_detail(raw_metric: dict) -> dict:
|
||||||
elif isinstance(metric_expr, str):
|
elif isinstance(metric_expr, str):
|
||||||
expression = metric_expr
|
expression = metric_expr
|
||||||
|
|
||||||
owners = raw_metric.get("owners", [])
|
|
||||||
|
|
||||||
# Extract metadata from OpenMetadata fields and tags
|
|
||||||
metric_type = raw_metric.get("metricType", "") or ""
|
metric_type = raw_metric.get("metricType", "") or ""
|
||||||
unit = raw_metric.get("unitOfMeasurement", "") or ""
|
unit = raw_metric.get("unitOfMeasurement", "") or ""
|
||||||
grain = raw_metric.get("granularity", "") or ""
|
grain = raw_metric.get("granularity", "") or ""
|
||||||
category = "general"
|
category = "general"
|
||||||
dimensions = []
|
dimensions = []
|
||||||
|
|
||||||
# Also check tags for category and dimensions
|
for tag in raw_metric.get("tags", []):
|
||||||
tags = raw_metric.get("tags", [])
|
|
||||||
for tag in tags:
|
|
||||||
tag_fqn = tag.get("tagFQN", "")
|
tag_fqn = tag.get("tagFQN", "")
|
||||||
|
|
||||||
if tag_fqn.startswith("MetricCategory."):
|
if tag_fqn.startswith("MetricCategory."):
|
||||||
category = tag_fqn.split(".", 1)[1]
|
category = tag_fqn.split(".", 1)[1]
|
||||||
elif tag_fqn.startswith("Dimension."):
|
elif tag_fqn.startswith("Dimension."):
|
||||||
dimensions.append(tag_fqn.split(".", 1)[1])
|
dimensions.append(tag_fqn.split(".", 1)[1])
|
||||||
|
|
||||||
# Extract owner names
|
|
||||||
owner_names = []
|
|
||||||
for owner in owners:
|
|
||||||
name_val = owner.get("name") or owner.get("displayName", "")
|
|
||||||
if name_val:
|
|
||||||
owner_names.append(name_val)
|
|
||||||
|
|
||||||
# Build MetricParser-compatible structure
|
|
||||||
return {
|
return {
|
||||||
"name": name,
|
"name": name,
|
||||||
"display_name": display_name,
|
"display_name": display_name,
|
||||||
"category": category,
|
"category": category,
|
||||||
"category_color": MetricParser.CATEGORY_COLORS.get(category, "#6B7280"),
|
"category_color": category_colors.get(category, "#6B7280"),
|
||||||
"metadata": {
|
"metadata": {"type": metric_type, "unit": unit, "grain": grain, "time_column": ""},
|
||||||
"type": metric_type,
|
"overview": {"description": description.strip(), "key_insights": []},
|
||||||
"unit": unit,
|
"validation": None,
|
||||||
"grain": grain,
|
|
||||||
"time_column": "", # Not available in OpenMetadata
|
|
||||||
},
|
|
||||||
"overview": {
|
|
||||||
"description": description.strip(),
|
|
||||||
"key_insights": [], # Not available in OpenMetadata
|
|
||||||
},
|
|
||||||
"validation": None, # Not available in OpenMetadata
|
|
||||||
"dimensions": dimensions,
|
"dimensions": dimensions,
|
||||||
"notes": {
|
"notes": {"all": [], "key_insights": []},
|
||||||
"all": [], # Not available in OpenMetadata
|
"sql_examples": {"expression": {"title": "Metric Expression", "query": expression, "complexity": "simple"}} if expression else {},
|
||||||
"key_insights": [],
|
"technical": {"table": "", "expression": expression, "synonyms": [], "data_sources": []},
|
||||||
},
|
|
||||||
"sql_examples": {
|
|
||||||
"expression": {
|
|
||||||
"title": "Metric Expression",
|
|
||||||
"query": expression,
|
|
||||||
"complexity": "simple",
|
|
||||||
}
|
|
||||||
} if expression else {},
|
|
||||||
"technical": {
|
|
||||||
"table": "", # Not available in OpenMetadata
|
|
||||||
"expression": expression,
|
|
||||||
"synonyms": [],
|
|
||||||
"data_sources": [],
|
|
||||||
},
|
|
||||||
"special_sections": {},
|
"special_sections": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue