Add OpenMetadata REST API connector and enricher to merge table/column metadata
from OpenMetadata catalog at sync and query time.
Changes:
- connectors/openmetadata/client.py: HTTP client for OM API
- connectors/openmetadata/enricher.py: Data enrichment with TTL cache
- tests/test_openmetadata_*: Unit tests for client and enricher
- src/config.py: Add catalog_fqn field to TableConfig
- src/data_sync.py: Use enricher in _generate_schema_yaml (catalog > BQ API > data_description.md)
- webapp/app.py: Initialize enricher, enrich catalog data with tags/tier/owners/url
- config/instance.yaml.example: Document openmetadata section
Features:
- FQN auto-derivation: bigquery.{table.id}
- TTL cache (default 1h) to avoid repeated API calls
- Graceful degradation: disabled if token missing, silent on HTTP errors
- Column description priority: catalog > BQ API > (none)
- Table description priority: catalog > data_description.md
120 lines
3.3 KiB
Python
120 lines
3.3 KiB
Python
"""
OpenMetadata REST API Client

Low-level HTTP wrapper for OpenMetadata REST API with these functions:
1. Authentication using JWT bearer token
2. Get table metadata (description, columns, tags, owners)
3. Get metrics (for Phase 2)
4. Proper error handling and logging
"""
|
|
|
|
import logging
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
import httpx
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OpenMetadataClient:
    """
    HTTP client for OpenMetadata REST API.

    Provides methods for querying table metadata:
    - get_table(fqn) -> table metadata with columns, owners, tags
    - get_metrics() -> list of available business metrics

    The instance owns an ``httpx.Client``. Call ``close()`` or use the
    instance as a context manager to release it.
    """

    def __init__(
        self,
        base_url: str,
        token: str,
        timeout: int = 30,
    ):
        """
        Initialize OpenMetadata API client.

        Args:
            base_url: Base URL of OpenMetadata instance (e.g., "https://catalog.example.com").
                Trailing slashes are stripped before use.
            token: JWT bearer token for authentication
            timeout: HTTP request timeout in seconds
        """
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.timeout = timeout
        # One shared client so every request carries the same base URL,
        # auth header and timeout.
        self._client = httpx.Client(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            },
            timeout=timeout,
        )

    def get_table(self, fqn: str) -> Dict[str, Any]:
        """
        Fetch table metadata from OpenMetadata.

        Args:
            fqn: Fully qualified name (e.g., "bigquery.project.dataset.table").
                It is percent-encoded before being placed in the URL path.

        Returns:
            Parsed JSON body of the response. Given the requested fields it
            is expected to include:
            - id, name, fullyQualifiedName
            - description
            - columns: list of column dicts with name, dataType, description
            - tags: list of tag dicts
            - owners: list of owner dicts with name, email
            - extension: custom metadata (e.g., tier)

        Raises:
            httpx.HTTPStatusError: If request fails (non-2xx status)
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module's import section.
        from urllib.parse import quote

        # The FQN is caller-supplied and must stay a single path segment.
        # Unencoded, characters such as "/", "?", "#" or spaces would change
        # the request path or query instead of naming the table.
        encoded_fqn = quote(fqn, safe="")
        url = f"/api/v1/tables/name/{encoded_fqn}"
        params = {
            "fields": "columns,owners,tags,extension",
            "include": "all",
        }

        response = self._client.get(url, params=params)
        response.raise_for_status()

        return response.json()

    def get_metrics(self, limit: int = 100) -> List[Dict[str, Any]]:
        """
        Fetch list of available metrics from OpenMetadata (Phase 2).

        Args:
            limit: Maximum number of metrics to return

        Returns:
            The list stored under the "data" key of the response, or an
            empty list when that key is missing or null. Entries are
            expected to be metric dictionaries with:
            - id, name, fullyQualifiedName
            - description
            - expression: metric calculation SQL/formula
            - owners, tags

        Raises:
            httpx.HTTPStatusError: If request fails (non-2xx status)
        """
        params = {
            "limit": limit,
            "fields": "description,expression,owners,tags",
        }

        response = self._client.get("/api/v1/metrics", params=params)
        response.raise_for_status()

        data = response.json()
        # `or []` also covers an explicit `"data": null`, so callers always
        # receive a list as the annotation promises.
        return data.get("data") or []

    def close(self) -> None:
        """Close HTTP client session."""
        self._client.close()

    def __enter__(self) -> "OpenMetadataClient":
        """Context manager entry: return this client."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: close the client; exceptions are not suppressed."""
        self.close()
|