agnes-the-ai-analyst/tests/test_catalog_export.py
ZdenekSrotyr b502bd8bdd refactor: delete old sync pipeline — 9,500 lines removed
Phase 5 cleanup: remove all code replaced by extract.duckdb architecture.

Deleted modules:
- src/config.py (653) — replaced by DuckDB table_registry
- src/parquet_manager.py (755) — replaced by DuckDB COPY TO
- src/data_sync.py (734) — replaced by SyncOrchestrator
- src/remote_query.py (636) — replaced by DuckDB BigQuery ATTACH
- src/table_registry.py (464) — replaced by DuckDB repository
- connectors/keboola/adapter.py (820) — replaced by extractor.py
- connectors/bigquery/adapter.py (665) — replaced by extractor.py
- connectors/bigquery/client.py (644) — replaced by DuckDB BQ extension

Updated all imports in webapp, catalog_export, enricher, router,
sync_settings_service, generate_sample_data. Kept keboola/client.py
as fallback (removed src.config dependency).

704 tests passing.
2026-03-31 07:50:37 +02:00

469 lines
16 KiB
Python

"""
Tests for src/catalog_export.py
Covers YAML header generation, auto-generated file detection, metric/table
export to YAML files, stale file cleanup, sync state writing, and docs dir
resolution.
"""
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from unittest.mock import MagicMock, patch
import pytest
import yaml
from src.catalog_export import (
AUTO_GENERATED_MARKER,
_get_docs_dir,
_is_auto_generated,
_write_sync_state,
_yaml_header,
export_metrics,
export_tables,
)
# ---------------------------------------------------------------------------
# Helpers / fixtures
# ---------------------------------------------------------------------------
@dataclass
class FakeTableConfig:
    """Minimal object-style table config exposing ``name``, ``id`` and ``catalog_fqn``.

    Originally written as a stand-in for ``src.config.TableConfig``.

    NOTE(review): ``src.config`` appears to have been removed, and the tests
    visible in this module pass plain dicts (see the ``mock_tables`` fixture)
    rather than instances of this class -- confirm whether it is still needed.
    """

    # Short table name.
    name: str
    # Table identifier.
    id: str
    # Explicit catalog fully-qualified name; ``None`` when not configured.
    catalog_fqn: Optional[str] = None
def _make_raw_metric(
name: str = "Total Revenue",
fqn: str = "metrics.total_revenue",
category_tag: str = "MetricCategory.finance",
description: str = "Sum of order revenue",
expression: str = "SUM(grs_revenue_plan_local)",
) -> Dict[str, Any]:
"""Build a realistic raw metric dict as returned by the catalog API."""
return {
"id": f"id-{name}",
"name": name,
"fullyQualifiedName": fqn,
"displayName": name,
"description": description,
"metricExpression": {"expression": expression},
"metricType": "sum",
"unitOfMeasurement": "USD",
"granularity": "monthly",
"tags": [{"tagFQN": category_tag}],
"owners": [{"name": "Data Team"}],
}
def _make_raw_table(
name: str = "order_economics",
fqn: str = "bigquery.project.dataset.order_economics",
) -> Dict[str, Any]:
"""Build a realistic raw table dict as returned by the catalog API."""
return {
"id": "tbl-1",
"name": name,
"fullyQualifiedName": fqn,
"description": "Order-level economics table",
"columns": [
{"name": "order_id", "dataType": "STRING", "description": "PK"},
{"name": "grs_revenue", "dataType": "FLOAT", "description": "Revenue"},
],
"tags": [{"tagFQN": "Tier.Tier1"}],
"owners": [{"name": "Finance"}],
}
@pytest.fixture
def mock_client() -> MagicMock:
    """Return a MagicMock that behaves like OpenMetadataClient.

    ``get_metrics()`` is preset to return a single default raw metric and
    ``get_table()`` a single default raw table; individual tests override
    ``return_value`` / ``side_effect`` when they need other data.
    """
    client = MagicMock()
    client.get_metrics.return_value = [_make_raw_metric()]
    client.get_table.return_value = _make_raw_table()
    return client
@pytest.fixture
def mock_tables() -> List[Dict[str, Any]]:
    """Return a list of table dicts matching TableRegistryRepository.list_all() format.

    The single entry carries an explicit ``catalog_fqn`` so tests can check
    that it is the value passed to the client's ``get_table``.
    """
    return [
        {
            "id": "prj.dataset.order_economics",
            "name": "order_economics",
            "catalog_fqn": "bigquery.prj.dataset.order_economics",
        }
    ]
# Catalog URL argument passed to _yaml_header / export_metrics / export_tables
# throughout these tests.
CATALOG_URL = "https://catalog.example.com"
# ---------------------------------------------------------------------------
# 1. _yaml_header
# ---------------------------------------------------------------------------
class TestYamlHeader:
    """Checks on the comment header text produced by ``_yaml_header``."""

    def test_yaml_header_contains_marker(self):
        """_yaml_header() output starts with AUTO-GENERATED marker."""
        header = _yaml_header(CATALOG_URL)
        assert header.startswith(AUTO_GENERATED_MARKER)

    def test_yaml_header_contains_url(self):
        """Header includes catalog URL."""
        header = _yaml_header(CATALOG_URL)
        assert CATALOG_URL in header

    def test_yaml_header_contains_fqn(self):
        """Header includes FQN when provided."""
        fqn = "metrics.total_revenue"
        header = _yaml_header(CATALOG_URL, fqn=fqn)
        assert f"# FQN: {fqn}" in header

    def test_yaml_header_no_fqn_line_when_empty(self):
        """Header omits FQN line when fqn argument is empty."""
        header = _yaml_header(CATALOG_URL, fqn="")
        assert "# FQN:" not in header
# ---------------------------------------------------------------------------
# 2. _is_auto_generated
# ---------------------------------------------------------------------------
class TestIsAutoGenerated:
    """Checks that ``_is_auto_generated`` recognises the marker on disk."""

    def test_is_auto_generated_true(self, tmp_path: Path):
        """File starting with AUTO-GENERATED marker returns True."""
        f = tmp_path / "metric.yml"
        f.write_text(AUTO_GENERATED_MARKER + "\nsome: data\n")
        assert _is_auto_generated(f) is True

    def test_is_auto_generated_false(self, tmp_path: Path):
        """File without marker returns False."""
        f = tmp_path / "manual.yml"
        f.write_text("# Manually written metric\nname: custom\n")
        assert _is_auto_generated(f) is False

    def test_is_auto_generated_missing_file(self, tmp_path: Path):
        """Non-existent file returns False."""
        f = tmp_path / "does_not_exist.yml"
        assert _is_auto_generated(f) is False
# ---------------------------------------------------------------------------
# 3. export_metrics
# ---------------------------------------------------------------------------
class TestExportMetrics:
    """Tests for ``export_metrics``: file layout, index, cleanup and tag filtering."""

    @staticmethod
    def _load_yaml_body(path: Path) -> Any:
        """Parse *path* as YAML after dropping lines that start with ``#``.

        Exported files begin with a comment header; the tests only care about
        the YAML body below it.
        """
        body_lines = [
            line
            for line in path.read_text().splitlines()
            if not line.startswith("#")
        ]
        return yaml.safe_load("\n".join(body_lines))

    def test_export_metrics_writes_files(self, tmp_path: Path, mock_client):
        """Creates category dirs and .yml files for each metric."""
        docs = tmp_path / "docs"
        count = export_metrics(mock_client, docs, CATALOG_URL)
        assert count == 1

        # Category directory should exist
        category_dir = docs / "metrics" / "finance"
        assert category_dir.is_dir()

        # Metric file should exist
        metric_files = list(category_dir.glob("*.yml"))
        assert len(metric_files) == 1
        assert metric_files[0].name == "total_revenue.yml"

    def test_export_metrics_writes_index(self, tmp_path: Path, mock_client):
        """Creates metrics.yml index with correct structure."""
        docs = tmp_path / "docs"
        export_metrics(mock_client, docs, CATALOG_URL)

        index_path = docs / "metrics" / "metrics.yml"
        assert index_path.exists()

        # Skip header comments, parse YAML body
        parsed = self._load_yaml_body(index_path)
        assert "metrics" in parsed
        assert len(parsed["metrics"]) == 1

        entry = parsed["metrics"][0]
        assert entry["name"] == "total_revenue"
        assert entry["category"] == "finance"
        assert "file" in entry

    def test_export_metrics_yaml_parseable(self, tmp_path: Path, mock_client):
        """Output metric YAML is valid and parseable by yaml.safe_load."""
        docs = tmp_path / "docs"
        export_metrics(mock_client, docs, CATALOG_URL)

        metric_file = docs / "metrics" / "finance" / "total_revenue.yml"
        # Strip header comments before parsing
        parsed = self._load_yaml_body(metric_file)
        assert isinstance(parsed, list)
        assert len(parsed) == 1
        assert parsed[0]["name"] == "total_revenue"
        assert parsed[0]["expression"] == "SUM(grs_revenue_plan_local)"

    def test_export_metrics_preserves_manual_files(
        self, tmp_path: Path, mock_client
    ):
        """Files without AUTO-GENERATED marker are never deleted."""
        docs = tmp_path / "docs"
        metrics_dir = docs / "metrics" / "custom"
        metrics_dir.mkdir(parents=True)
        manual = metrics_dir / "hand_crafted.yml"
        manual.write_text("# My custom metric\nname: hand_crafted\n")

        export_metrics(mock_client, docs, CATALOG_URL)

        # Manual file must survive
        assert manual.exists()
        assert manual.read_text().startswith("# My custom metric")

    def test_export_metrics_cleans_stale(self, tmp_path: Path, mock_client):
        """Old auto-generated files removed when metric no longer in catalog."""
        docs = tmp_path / "docs"
        stale_dir = docs / "metrics" / "old_category"
        stale_dir.mkdir(parents=True)
        stale = stale_dir / "gone_metric.yml"
        stale.write_text(AUTO_GENERATED_MARKER + "\nname: gone\n")

        export_metrics(mock_client, docs, CATALOG_URL)

        # Stale auto-generated file should be removed
        assert not stale.exists()

    def test_export_metrics_zero_results_preserves(
        self, tmp_path: Path, mock_client
    ):
        """When API returns 0 metrics, existing files are untouched."""
        mock_client.get_metrics.return_value = []
        docs = tmp_path / "docs"
        metrics_dir = docs / "metrics" / "finance"
        metrics_dir.mkdir(parents=True)
        existing = metrics_dir / "existing.yml"
        existing.write_text(AUTO_GENERATED_MARKER + "\nname: existing\n")

        count = export_metrics(mock_client, docs, CATALOG_URL)

        assert count == 0
        # Existing file untouched (early return before cleanup)
        assert existing.exists()

    def test_export_metrics_multiple(self, tmp_path: Path, mock_client):
        """Multiple metrics across categories are all exported."""
        mock_client.get_metrics.return_value = [
            _make_raw_metric(
                name="Total Revenue",
                fqn="metrics.total_revenue",
                category_tag="MetricCategory.finance",
            ),
            _make_raw_metric(
                name="Active Users",
                fqn="metrics.active_users",
                category_tag="MetricCategory.product",
                expression="COUNT(DISTINCT user_id)",
            ),
        ]
        docs = tmp_path / "docs"

        count = export_metrics(mock_client, docs, CATALOG_URL)

        assert count == 2
        assert (docs / "metrics" / "finance" / "total_revenue.yml").exists()
        assert (docs / "metrics" / "product" / "active_users.yml").exists()

    def test_export_metrics_filter_tag_keeps_matching(self, tmp_path: Path, mock_client):
        """Only metrics with the filter_tag are exported."""
        tagged = _make_raw_metric(name="M1", fqn="M1", category_tag="MetricCategory.finance")
        tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"})
        untagged = _make_raw_metric(
            name="Live Deals", fqn="LiveDeals", category_tag="MetricCategory.supply"
        )
        mock_client.get_metrics.return_value = [tagged, untagged]
        docs = tmp_path / "docs"

        count = export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI")

        assert count == 1
        assert (docs / "metrics" / "finance" / "m1.yml").exists()
        assert not (docs / "metrics" / "supply").exists()

    def test_export_metrics_filter_tag_empty_exports_all(self, tmp_path: Path, mock_client):
        """Empty filter_tag means no filtering - all metrics exported."""
        mock_client.get_metrics.return_value = [
            _make_raw_metric(name="A", fqn="A"),
            _make_raw_metric(name="B", fqn="B"),
        ]
        docs = tmp_path / "docs"

        count = export_metrics(mock_client, docs, CATALOG_URL, filter_tag="")

        assert count == 2

    def test_export_metrics_filter_tag_cleans_stale_untagged(self, tmp_path: Path, mock_client):
        """Stale files from previously-exported untagged metrics get cleaned up."""
        tagged = _make_raw_metric(name="M1", fqn="M1", category_tag="MetricCategory.finance")
        tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"})
        mock_client.get_metrics.return_value = [tagged]
        docs = tmp_path / "docs"
        stale_dir = docs / "metrics" / "general"
        stale_dir.mkdir(parents=True)
        stale = stale_dir / "livedeals.yml"
        stale.write_text(AUTO_GENERATED_MARKER + "\nname: livedeals\n")

        export_metrics(mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI")

        assert not stale.exists()
# ---------------------------------------------------------------------------
# 4. export_tables
# ---------------------------------------------------------------------------
class TestExportTables:
    """Tests for ``export_tables``: file output, error tolerance and FQN selection."""

    def test_export_tables_writes_files(
        self, tmp_path: Path, mock_client, mock_tables
    ):
        """Creates table YAML with columns."""
        docs = tmp_path / "docs"
        count = export_tables(mock_client, mock_tables, docs, CATALOG_URL)
        assert count == 1

        table_file = docs / "tables" / "order_economics.yml"
        assert table_file.exists()

        # Parse YAML content (skip header comments)
        content = table_file.read_text()
        yaml_lines = [
            line for line in content.splitlines() if not line.startswith("#")
        ]
        parsed = yaml.safe_load("\n".join(yaml_lines))
        assert parsed["name"] == "order_economics"
        assert len(parsed["columns"]) == 2
        assert parsed["columns"][0]["name"] == "order_id"

    def test_export_tables_handles_api_error(
        self, tmp_path: Path, mock_client
    ):
        """Continues on per-table errors, exports remaining tables."""
        tables = [
            {"id": "prj.dataset.broken", "name": "broken_table", "catalog_fqn": "bigquery.prj.dataset.broken"},
            {"id": "prj.dataset.good", "name": "good_table", "catalog_fqn": "bigquery.prj.dataset.good"},
        ]

        def side_effect(fqn):
            # Fail only for the "broken" table; every other FQN resolves.
            if "broken" in fqn:
                raise RuntimeError("API unreachable")
            return _make_raw_table(name="good_table", fqn=fqn)

        mock_client.get_table.side_effect = side_effect
        docs = tmp_path / "docs"

        count = export_tables(mock_client, tables, docs, CATALOG_URL)

        # Only the good table should succeed
        assert count == 1
        assert not (docs / "tables" / "broken_table.yml").exists()
        assert (docs / "tables" / "good_table.yml").exists()

    def test_export_tables_uses_catalog_fqn(
        self, tmp_path: Path, mock_client, mock_tables
    ):
        """Uses explicit catalog_fqn when set on table config."""
        docs = tmp_path / "docs"
        export_tables(mock_client, mock_tables, docs, CATALOG_URL)
        mock_client.get_table.assert_called_once_with(
            "bigquery.prj.dataset.order_economics"
        )

    def test_export_tables_derives_fqn_from_id(
        self, tmp_path: Path, mock_client
    ):
        """When catalog_fqn is absent, derives FQN as 'bigquery.{id}'."""
        tables = [
            {"id": "project.dataset.my_table", "name": "my_table"},
        ]
        docs = tmp_path / "docs"
        export_tables(mock_client, tables, docs, CATALOG_URL)
        mock_client.get_table.assert_called_once_with(
            "bigquery.project.dataset.my_table"
        )
# ---------------------------------------------------------------------------
# 5. _write_sync_state
# ---------------------------------------------------------------------------
class TestWriteSyncState:
    """Tests for ``_write_sync_state``."""

    def test_write_sync_state(self, tmp_path: Path):
        """Writes .catalog_sync_state.json with counts and timestamp."""
        from datetime import datetime

        docs = tmp_path / "docs"
        docs.mkdir()

        _write_sync_state(docs, metrics_count=5, tables_count=2)

        state_path = docs / ".catalog_sync_state.json"
        assert state_path.exists()

        state = json.loads(state_path.read_text())
        assert state["metrics_count"] == 5
        assert state["tables_count"] == 2
        assert "last_export" in state

        # Timestamp should be ISO format
        datetime.fromisoformat(state["last_export"])  # raises on bad format
# ---------------------------------------------------------------------------
# 6. _get_docs_dir
# ---------------------------------------------------------------------------
class TestGetDocsDir:
    """Tests for ``_get_docs_dir`` with and without the DATA_DIR env var."""

    def test_get_docs_dir_from_env(self, monkeypatch):
        """DATA_DIR env var is used to derive docs directory."""
        monkeypatch.setenv("DATA_DIR", "/data/src_data")
        result = _get_docs_dir()
        assert result == Path("/data/docs")

    def test_get_docs_dir_default(self, monkeypatch):
        """Defaults to ./data/../docs when DATA_DIR is not set."""
        monkeypatch.delenv("DATA_DIR", raising=False)
        result = _get_docs_dir()
        # ./data -> parent is "." -> docs is "./docs"
        assert result == Path("docs")