Add data product discovery, fix remove-analyst script

- client.py: add search_by_data_product() for OpenMetadata search API
- catalog_export.py: prefer data product discovery over tag filtering
  (finds all 16 metrics in FoundryAIDataModel vs 3 with tag filter)
- remove-analyst: fix GROUPS bash variable collision, improve messaging
This commit is contained in:
Petr 2026-03-18 12:52:41 +01:00
parent ab99f0af92
commit fb63a72a98
3 changed files with 107 additions and 30 deletions

View file

@ -8,6 +8,7 @@ Low-level HTTP wrapper for OpenMetadata REST API with these functions:
4. Proper error handling and logging
"""
import json
import logging
from typing import Dict, List, Optional, Any
import warnings
@ -138,6 +139,50 @@ class OpenMetadataClient:
return response.json()
def search_by_data_product(
    self,
    data_product_name: str,
    entity_type: str = "",
    limit: int = 50,
    fields: str = "tags,owners",
) -> List[Dict[str, Any]]:
    """
    Search for entities belonging to a data product.

    Issues a single GET to ``/api/v1/search/query`` with a bool/must filter
    on ``dataProducts.name.keyword`` (plus ``entityType`` when requested)
    and returns the ``_source`` document of every hit. Only one page of at
    most ``limit`` results is requested; there is no pagination.

    Args:
        data_product_name: Name of the data product (e.g., "FoundryAIDataModel")
        entity_type: Filter by entity type (e.g., "metric", "table"). Empty = all types.
        limit: Maximum number of results (sent as the ``size`` parameter)
        fields: Currently unused - accepted for signature compatibility but
            not sent with the request.

    Returns:
        List of entity dictionaries. Empty when the response carries no
        hits; a hit without a ``_source`` document yields an empty dict.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error
        HTTP status.
    """
    filters: List[Dict[str, Any]] = [
        {"term": {"dataProducts.name.keyword": data_product_name}},
    ]
    if entity_type:
        filters.append({"term": {"entityType": entity_type}})

    # NOTE(review): confirm the filter parameter name ("queryFilter") and the
    # expected filter shape against the search API reference of the deployed
    # OpenMetadata version - TODO verify.
    params = {
        "q": "*",
        "index": "all",
        "size": limit,
        "queryFilter": json.dumps({"bool": {"must": filters}}),
    }

    response = self._client.get("/api/v1/search/query", params=params)
    response.raise_for_status()

    # Use `or` fallbacks rather than .get() defaults: a default only covers a
    # missing key, while an explicit JSON null would otherwise surface as
    # None and break the chained lookup / the declared return type.
    payload = response.json() or {}
    hits = (payload.get("hits") or {}).get("hits") or []
    return [hit.get("_source") or {} for hit in hits]
def close(self) -> None:
    """
    Close HTTP client session.

    Delegates to ``self._client.close()``; this method releases nothing
    else itself.
    """
    self._client.close()

View file

@ -47,12 +47,15 @@ if [[ "$USERNAME" == "$CURRENT_USER" ]]; then
exit 1
fi
# Get user groups for info (safe extraction, no pipefail issues)
GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || GROUPS="(unknown)"
# Get user info (avoid using GROUPS - it's a bash special variable for current user's GIDs)
USER_GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || USER_GROUPS="(unknown)"
HOME_DIR="/home/$USERNAME"
HOME_EXISTS=false
[[ -d "$HOME_DIR" ]] && HOME_EXISTS=true
echo "Removing user: $USERNAME"
echo " Groups: $GROUPS"
echo " Home: /home/$USERNAME"
echo " Groups: $USER_GROUPS"
echo " Home: $HOME_DIR ($([ "$HOME_EXISTS" = true ] && echo "exists" || echo "already missing"))"
if [[ "$FORCE" != true ]]; then
read -p "Are you sure? [y/N] " -n 1 -r
@ -65,16 +68,23 @@ fi
# Remove user and home directory
echo " Deleting OS user..."
if userdel -r "$USERNAME" 2>/dev/null; then
echo " User and home directory removed"
USERDEL_ERR=$(userdel -r "$USERNAME" 2>&1)
USERDEL_EXIT=$?
if [[ $USERDEL_EXIT -eq 0 ]]; then
if [[ "$HOME_EXISTS" = true ]]; then
echo " User and home directory removed"
else
echo " User removed (home directory was already missing)"
fi
elif userdel "$USERNAME" 2>/dev/null; then
echo " User removed (userdel -r failed, cleaning up home manually)"
if [[ -d "/home/$USERNAME" ]]; then
rm -rf "/home/$USERNAME"
echo " Home directory /home/$USERNAME removed"
echo " User removed (userdel -r failed: $USERDEL_ERR)"
if [[ -d "$HOME_DIR" ]]; then
rm -rf "$HOME_DIR"
echo " Home directory $HOME_DIR removed"
fi
else
echo "Error: Failed to remove user '$USERNAME'"
echo " userdel error: $USERDEL_ERR"
echo " Check if processes are running as this user: ps -u $USERNAME"
exit 1
fi

View file

@ -123,23 +123,24 @@ def export_metrics(
docs_dir: Path,
catalog_url: str,
filter_tag: str = "",
data_product: str = "",
) -> int:
"""
Export metrics from OpenMetadata to YAML files.
For each metric:
1. Fetches all metrics from catalog API
2. Filters by required tag (if configured)
3. Transforms each to YAML-compatible dict
4. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
5. Writes index file: {docs_dir}/metrics/metrics.yml
6. Cleans up stale auto-generated files
1. Discovers metrics via data product (preferred) or fetches all + filters by tag
2. Transforms each to YAML-compatible dict
3. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
4. Writes index file: {docs_dir}/metrics/metrics.yml
5. Cleans up stale auto-generated files
Args:
client: Initialized OpenMetadata API client
docs_dir: Base docs directory (e.g., /data/docs)
catalog_url: Catalog URL for header comments
filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.FoundryAI")
data_product: If set, discover metrics via data product assets (preferred over filter_tag); if the search fails or returns no metrics, the fetch-all + filter_tag path is used instead
Returns:
Number of metrics exported
@ -147,21 +148,38 @@ def export_metrics(
metrics_dir = docs_dir / "metrics"
metrics_dir.mkdir(parents=True, exist_ok=True)
# Fetch all metrics with tags and owners
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
raw_metrics: List[Dict[str, Any]] = []
# Strategy 1: Discover metrics via data product (preferred)
if data_product:
try:
raw_metrics = client.search_by_data_product(
data_product_name=data_product,
entity_type="metric",
limit=200,
)
logger.info(
f"Data product '{data_product}': found {len(raw_metrics)} metrics"
)
except Exception as e:
logger.warning(f"Data product search failed, falling back to tag filter: {e}")
raw_metrics = []
# Strategy 2: Fallback to tag-based filter
if not raw_metrics:
logger.warning("No metrics returned from catalog - preserving existing files")
return 0
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
if not raw_metrics:
logger.warning("No metrics returned from catalog - preserving existing files")
return 0
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
# Filter by tag if configured
if filter_tag:
filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)]
logger.info(
f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched"
)
raw_metrics = filtered
if filter_tag:
filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)]
logger.info(
f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched"
)
raw_metrics = filtered
# Track which files we write (for cleanup)
written_files: set[Path] = set()
@ -412,12 +430,16 @@ def main() -> None:
logger.warning(f"Failed to initialize OpenMetadata client: {e}")
return
# Optional tag filter (only export metrics with this tag)
# Discovery config: data product (preferred) or tag filter (fallback)
filter_tag = om_config.get("filter_tag", "").strip()
data_product = om_config.get("data_product", "").strip()
try:
# Export metrics
metrics_count = export_metrics(client, docs_dir, catalog_url, filter_tag=filter_tag)
metrics_count = export_metrics(
client, docs_dir, catalog_url,
filter_tag=filter_tag, data_product=data_product,
)
# Export tables
try: