Add data product discovery, fix remove-analyst script

- client.py: add search_by_data_product() for OpenMetadata search API
- catalog_export.py: prefer data product discovery over tag filtering
  (finds all 16 metrics in FoundryAIDataModel vs 3 with tag filter)
- remove-analyst: fix GROUPS bash variable collision, improve messaging
This commit is contained in:
Petr 2026-03-18 12:52:41 +01:00
parent ab99f0af92
commit fb63a72a98
3 changed files with 107 additions and 30 deletions

View file

@ -8,6 +8,7 @@ Low-level HTTP wrapper for OpenMetadata REST API with these functions:
4. Proper error handling and logging 4. Proper error handling and logging
""" """
import json
import logging import logging
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any
import warnings import warnings
@ -138,6 +139,50 @@ class OpenMetadataClient:
return response.json() return response.json()
def search_by_data_product(
    self,
    data_product_name: str,
    entity_type: str = "",
    limit: int = 50,
    fields: str = "tags,owners",
) -> List[Dict[str, Any]]:
    """
    Return the entities that the search index associates with a data product.

    Issues a single GET to ``/api/v1/search/query`` with a wildcard query
    against the ``all`` index, narrowed by a JSON-encoded bool/must filter
    on ``dataProducts.name.keyword`` (and on ``entityType`` when given).

    Args:
        data_product_name: Name of the data product (e.g., "FoundryAIDataModel").
        entity_type: Restrict results to this entity type (e.g., "metric",
            "table"). An empty string applies no type restriction.
        limit: Value sent as the ``size`` request parameter.
        fields: Accepted for signature compatibility; it is currently NOT
            sent to the server and has no effect on the result.

    Returns:
        The ``_source`` document of every hit, in response order. Hits
        without a ``_source`` key contribute an empty dict.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an HTTP error.
    """
    # The data-product term is always present; the type term is optional.
    filters: List[Dict[str, Any]] = [
        {"term": {"dataProducts.name.keyword": data_product_name}}
    ]
    if entity_type:
        filters += [{"term": {"entityType": entity_type}}]

    # NOTE(review): confirm against the OpenMetadata API reference that the
    # filter parameter is spelled "queryFilter" and takes a bare bool query.
    response = self._client.get(
        "/api/v1/search/query",
        params={
            "q": "*",
            "index": "all",
            "size": limit,
            "queryFilter": json.dumps({"bool": {"must": filters}}),
        },
    )
    response.raise_for_status()

    # Response is read as Elasticsearch-style: {"hits": {"hits": [{"_source": ...}]}}
    payload = response.json()
    sources = []
    for hit in payload.get("hits", {}).get("hits", []):
        sources.append(hit.get("_source", {}))
    return sources
def close(self): def close(self):
"""Close HTTP client session.""" """Close HTTP client session."""
self._client.close() self._client.close()

View file

@ -47,12 +47,15 @@ if [[ "$USERNAME" == "$CURRENT_USER" ]]; then
exit 1 exit 1
fi fi
# Get user groups for info (safe extraction, no pipefail issues) # Get user info (avoid using GROUPS - it's a bash special variable for current user's GIDs)
GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || GROUPS="(unknown)" USER_GROUPS=$(id -nG "$USERNAME" 2>/dev/null) || USER_GROUPS="(unknown)"
HOME_DIR="/home/$USERNAME"
HOME_EXISTS=false
[[ -d "$HOME_DIR" ]] && HOME_EXISTS=true
echo "Removing user: $USERNAME" echo "Removing user: $USERNAME"
echo " Groups: $GROUPS" echo " Groups: $USER_GROUPS"
echo " Home: /home/$USERNAME" echo " Home: $HOME_DIR ($([ "$HOME_EXISTS" = true ] && echo "exists" || echo "already missing"))"
if [[ "$FORCE" != true ]]; then if [[ "$FORCE" != true ]]; then
read -p "Are you sure? [y/N] " -n 1 -r read -p "Are you sure? [y/N] " -n 1 -r
@ -65,16 +68,23 @@ fi
# Remove user and home directory # Remove user and home directory
echo " Deleting OS user..." echo " Deleting OS user..."
if userdel -r "$USERNAME" 2>/dev/null; then USERDEL_ERR=$(userdel -r "$USERNAME" 2>&1)
echo " User and home directory removed" USERDEL_EXIT=$?
if [[ $USERDEL_EXIT -eq 0 ]]; then
if [[ "$HOME_EXISTS" = true ]]; then
echo " User and home directory removed"
else
echo " User removed (home directory was already missing)"
fi
elif userdel "$USERNAME" 2>/dev/null; then elif userdel "$USERNAME" 2>/dev/null; then
echo " User removed (userdel -r failed, cleaning up home manually)" echo " User removed (userdel -r failed: $USERDEL_ERR)"
if [[ -d "/home/$USERNAME" ]]; then if [[ -d "$HOME_DIR" ]]; then
rm -rf "/home/$USERNAME" rm -rf "$HOME_DIR"
echo " Home directory /home/$USERNAME removed" echo " Home directory $HOME_DIR removed"
fi fi
else else
echo "Error: Failed to remove user '$USERNAME'" echo "Error: Failed to remove user '$USERNAME'"
echo " userdel error: $USERDEL_ERR"
echo " Check if processes are running as this user: ps -u $USERNAME" echo " Check if processes are running as this user: ps -u $USERNAME"
exit 1 exit 1
fi fi

View file

@ -123,23 +123,24 @@ def export_metrics(
docs_dir: Path, docs_dir: Path,
catalog_url: str, catalog_url: str,
filter_tag: str = "", filter_tag: str = "",
data_product: str = "",
) -> int: ) -> int:
""" """
Export metrics from OpenMetadata to YAML files. Export metrics from OpenMetadata to YAML files.
For each metric: For each metric:
1. Fetches all metrics from catalog API 1. Discovers metrics via data product (preferred) or fetches all + filters by tag
2. Filters by required tag (if configured) 2. Transforms each to YAML-compatible dict
3. Transforms each to YAML-compatible dict 3. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml
4. Writes individual YAML files: {docs_dir}/metrics/{category}/{name}.yml 4. Writes index file: {docs_dir}/metrics/metrics.yml
5. Writes index file: {docs_dir}/metrics/metrics.yml 5. Cleans up stale auto-generated files
6. Cleans up stale auto-generated files
Args: Args:
client: Initialized OpenMetadata API client client: Initialized OpenMetadata API client
docs_dir: Base docs directory (e.g., /data/docs) docs_dir: Base docs directory (e.g., /data/docs)
catalog_url: Catalog URL for header comments catalog_url: Catalog URL for header comments
filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.FoundryAI") filter_tag: If set, only export metrics that have this tag (e.g., "AIAgent.FoundryAI")
data_product: If set, discover metrics via data product assets (preferred over filter_tag)
Returns: Returns:
Number of metrics exported Number of metrics exported
@ -147,21 +148,38 @@ def export_metrics(
metrics_dir = docs_dir / "metrics" metrics_dir = docs_dir / "metrics"
metrics_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True)
# Fetch all metrics with tags and owners raw_metrics: List[Dict[str, Any]] = []
raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
# Strategy 1: Discover metrics via data product (preferred)
if data_product:
try:
raw_metrics = client.search_by_data_product(
data_product_name=data_product,
entity_type="metric",
limit=200,
)
logger.info(
f"Data product '{data_product}': found {len(raw_metrics)} metrics"
)
except Exception as e:
logger.warning(f"Data product search failed, falling back to tag filter: {e}")
raw_metrics = []
# Strategy 2: Fallback to tag-based filter
if not raw_metrics: if not raw_metrics:
logger.warning("No metrics returned from catalog - preserving existing files") raw_metrics = client.get_metrics(limit=200, fields="tags,owners")
return 0 if not raw_metrics:
logger.warning("No metrics returned from catalog - preserving existing files")
return 0
logger.info(f"Fetched {len(raw_metrics)} metrics from catalog") logger.info(f"Fetched {len(raw_metrics)} metrics from catalog")
# Filter by tag if configured if filter_tag:
if filter_tag: filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)]
filtered = [m for m in raw_metrics if has_tag(m.get("tags", []), filter_tag)] logger.info(
logger.info( f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched"
f"Tag filter '{filter_tag}': {len(filtered)}/{len(raw_metrics)} metrics matched" )
) raw_metrics = filtered
raw_metrics = filtered
# Track which files we write (for cleanup) # Track which files we write (for cleanup)
written_files: set[Path] = set() written_files: set[Path] = set()
@ -412,12 +430,16 @@ def main() -> None:
logger.warning(f"Failed to initialize OpenMetadata client: {e}") logger.warning(f"Failed to initialize OpenMetadata client: {e}")
return return
# Optional tag filter (only export metrics with this tag) # Discovery config: data product (preferred) or tag filter (fallback)
filter_tag = om_config.get("filter_tag", "").strip() filter_tag = om_config.get("filter_tag", "").strip()
data_product = om_config.get("data_product", "").strip()
try: try:
# Export metrics # Export metrics
metrics_count = export_metrics(client, docs_dir, catalog_url, filter_tag=filter_tag) metrics_count = export_metrics(
client, docs_dir, catalog_url,
filter_tag=filter_tag, data_product=data_product,
)
# Export tables # Export tables
try: try: