Add data product discovery, fix remove-analyst script
- client.py: add search_by_data_product() for OpenMetadata search API - catalog_export.py: prefer data product discovery over tag filtering (finds all 16 metrics in FoundryAIDataModel vs 3 with tag filter) - remove-analyst: fix GROUPS bash variable collision, improve messaging
This commit is contained in:
parent
ab99f0af92
commit
fb63a72a98
3 changed files with 107 additions and 30 deletions
|
|
@ -8,6 +8,7 @@ Low-level HTTP wrapper for OpenMetadata REST API with these functions:
|
|||
4. Proper error handling and logging
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any
|
||||
import warnings
|
||||
|
|
@ -138,6 +139,50 @@ class OpenMetadataClient:
|
|||
|
||||
return response.json()
|
||||
|
||||
def search_by_data_product(
|
||||
self,
|
||||
data_product_name: str,
|
||||
entity_type: str = "",
|
||||
limit: int = 50,
|
||||
fields: str = "tags,owners",
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Search for entities belonging to a data product.
|
||||
|
||||
Uses OpenMetadata search API with queryFilter to find all assets
|
||||
(metrics, tables, etc.) that are part of the specified data product.
|
||||
|
||||
Args:
|
||||
data_product_name: Name of the data product (e.g., "FoundryAIDataModel")
|
||||
entity_type: Filter by entity type (e.g., "metric", "table"). Empty = all types.
|
||||
limit: Maximum number of results
|
||||
fields: Comma-separated list of fields to include
|
||||
|
||||
Returns:
|
||||
List of entity dictionaries
|
||||
"""
|
||||
must_clauses = [
|
||||
{"term": {"dataProducts.name.keyword": data_product_name}},
|
||||
]
|
||||
if entity_type:
|
||||
must_clauses.append({"term": {"entityType": entity_type}})
|
||||
|
||||
query_filter = json.dumps({"bool": {"must": must_clauses}})
|
||||
|
||||
params = {
|
||||
"q": "*",
|
||||
"index": "all",
|
||||
"size": limit,
|
||||
"queryFilter": query_filter,
|
||||
}
|
||||
|
||||
response = self._client.get("/api/v1/search/query", params=params)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
hits = data.get("hits", {}).get("hits", [])
|
||||
return [hit.get("_source", {}) for hit in hits]
|
||||
|
||||
def close(self):
|
||||
"""Close HTTP client session."""
|
||||
self._client.close()
|
||||
|
|
|
|||
|
|
@ -47,12 +47,15 @@ if [[ "$USERNAME" == "$CURRENT_USER" ]]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
# Get user groups for info (safe extraction, no pipefail issues)
|
||||
GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || GROUPS="(unknown)"
|
||||
# Get user info (avoid using GROUPS - it's a bash special variable for current user's GIDs)
|
||||
USER_GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || USER_GROUPS="(unknown)"
|
||||
HOME_DIR="/home/$USERNAME"
|
||||
HOME_EXISTS=false
|
||||
[[ -d "$HOME_DIR" ]] && HOME_EXISTS=true
|
||||
|
||||
echo "Removing user: $USERNAME"
|
||||
echo " Groups: $GROUPS"
|
||||
echo " Home: /home/$USERNAME"
|
||||
echo " Groups: $USER_GROUPS"
|
||||
echo " Home: $HOME_DIR ($([ "$HOME_EXISTS" = true ] && echo "exists" || echo "already missing"))"
|
||||
|
||||
if [[ "$FORCE" != true ]]; then
|
||||
read -p "Are you sure? [y/N] " -n 1 -r
|
||||
|
|
@ -65,16 +68,23 @@ fi
|
|||
|
||||
# Remove user and home directory
|
||||
echo " Deleting OS user..."
|
||||
if userdel -r "$USERNAME" 2>/dev/null; then
|
||||
echo " User and home directory removed"
|
||||
USERDEL_ERR=$(userdel -r "$USERNAME" 2>&1)
|
||||
USERDEL_EXIT=$?
|
||||
if [[ $USERDEL_EXIT -eq 0 ]]; then
|
||||
if [[ "$HOME_EXISTS" = true ]]; then
|
||||
echo " User and home directory removed"
|
||||
else
|
||||
echo " User removed (home directory was already missing)"
|
||||
fi
|
||||
elif userdel "$USERNAME" 2>/dev/null; then
|
||||
echo " User removed (userdel -r failed, cleaning up home manually)"
|
||||
if [[ -d "/home/$USERNAME" ]]; then
|
||||
rm -rf "/home/$USERNAME"
|
||||
echo " Home directory /home/$USERNAME removed"
|
||||
echo " User removed (userdel -r failed: $USERDEL_ERR)"
|
||||
if [[ -d "$HOME_DIR" ]]; then
|
||||
rm -rf "$HOME_DIR"
|
||||
echo " Home directory $HOME_DIR removed"
|
||||
fi
|
||||
else
|
||||
echo "Error: Failed to remove user '$USERNAME'"
|
||||
echo " userdel error: $USERDEL_ERR"
|
||||
echo " Check if processes are running as this user: ps -u $USERNAME"
|
||||
exit 1
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -123,23 +123,24 @@ def export_metrics(
|
|||
docs_dir: Path,
|
||||
catalog_url: str,
|
||||
filter_tag: str = "",
|
||||
data_product: str = "",
|
||||
) -> int:
|
||||
"""
|
||||
Export metrics from OpenMetadata to YAML files.
|
||||
|
||||
For each metric:
|
||||
1. Fetches all metrics from catalog API
|
||||
2. Filters by required tag (if configured)
|
||||
3. Transforms each to YAML-compatible dict
|
||||
4. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
|
||||
5. Writes index file: {docs_dir}/metrics/metrics.yml
|
||||
6. Cleans up stale auto-generated files
|
||||
1. Discovers metrics via data product (preferred) or fetches all + filters by tag
|
||||
2. Transforms each to YAML-compatible dict
|
||||
3. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
|
||||
4. Writes index file: {docs_dir}/metrics/metrics.yml
|
||||
5. Cleans up stale auto-generated files
|
||||
|
||||
Args:
|
||||
client: Initialized OpenMetadata API client
|
||||
docs_dir: Base docs directory (e.g., /data/docs)
|
||||
catalog_url: Catalog URL for header comments
|
||||
filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.FoundryAI")
|
||||
data_product: If set, discover metrics via data product assets (preferred over filter_tag)
|
||||
|
||||
Returns:
|
||||
Number of metrics exported
|
||||
|
|
@ -147,21 +148,38 @@ def export_metrics(
|
|||
metrics_dir = docs_dir / "metrics"
|
||||
metrics_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Fetch all metrics with tags and owners
|
||||
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
|
||||
raw_metrics: List[Dict[str, Any]] = []
|
||||
|
||||
# Strategy 1: Discover metrics via data product (preferred)
|
||||
if data_product:
|
||||
try:
|
||||
raw_metrics = client.search_by_data_product(
|
||||
data_product_name=data_product,
|
||||
entity_type="metric",
|
||||
limit=200,
|
||||
)
|
||||
logger.info(
|
||||
f"Data product '{data_product}': found {len(raw_metrics)} metrics"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Data product search failed, falling back to tag filter: {e}")
|
||||
raw_metrics = []
|
||||
|
||||
# Strategy 2: Fallback to tag-based filter
|
||||
if not raw_metrics:
|
||||
logger.warning("No metrics returned from catalog - preserving existing files")
|
||||
return 0
|
||||
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
|
||||
if not raw_metrics:
|
||||
logger.warning("No metrics returned from catalog - preserving existing files")
|
||||
return 0
|
||||
|
||||
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
|
||||
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
|
||||
|
||||
# Filter by tag if configured
|
||||
if filter_tag:
|
||||
filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)]
|
||||
logger.info(
|
||||
f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched"
|
||||
)
|
||||
raw_metrics = filtered
|
||||
if filter_tag:
|
||||
filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)]
|
||||
logger.info(
|
||||
f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched"
|
||||
)
|
||||
raw_metrics = filtered
|
||||
|
||||
# Track which files we write (for cleanup)
|
||||
written_files: set[Path] = set()
|
||||
|
|
@ -412,12 +430,16 @@ def main() -> None:
|
|||
logger.warning(f"Failed to initialize OpenMetadata client: {e}")
|
||||
return
|
||||
|
||||
# Optional tag filter (only export metrics with this tag)
|
||||
# Discovery config: data product (preferred) or tag filter (fallback)
|
||||
filter_tag = om_config.get("filter_tag", "").strip()
|
||||
data_product = om_config.get("data_product", "").strip()
|
||||
|
||||
try:
|
||||
# Export metrics
|
||||
metrics_count = export_metrics(client, docs_dir, catalog_url, filter_tag=filter_tag)
|
||||
metrics_count = export_metrics(
|
||||
client, docs_dir, catalog_url,
|
||||
filter_tag=filter_tag, data_product=data_product,
|
||||
)
|
||||
|
||||
# Export tables
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in a new issue