Phase 5 cleanup: remove all code replaced by extract.duckdb architecture. Deleted modules: - src/config.py (653) — replaced by DuckDB table_registry - src/parquet_manager.py (755) — replaced by DuckDB COPY TO - src/data_sync.py (734) — replaced by SyncOrchestrator - src/remote_query.py (636) — replaced by DuckDB BigQuery ATTACH - src/table_registry.py (464) — replaced by DuckDB repository - connectors/keboola/adapter.py (820) — replaced by extractor.py - connectors/bigquery/adapter.py (665) — replaced by extractor.py - connectors/bigquery/client.py (644) — replaced by DuckDB BQ extension Updated all imports in webapp, catalog_export, enricher, router, sync_settings_service, generate_sample_data. Kept keboola/client.py as fallback (removed src.config dependency). 704 tests passing.
469 lines
16 KiB
Python
469 lines
16 KiB
Python
"""
|
|
Tests for src/catalog_export.py
|
|
|
|
Covers YAML header generation, auto-generated file detection, metric/table
|
|
export to YAML files, stale file cleanup, sync state writing, and docs dir
|
|
resolution.
|
|
"""
|
|
|
|
import json
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from unittest.mock import MagicMock, patch

import pytest
import yaml

from src.catalog_export import (
    AUTO_GENERATED_MARKER,
    _get_docs_dir,
    _is_auto_generated,
    _write_sync_state,
    _yaml_header,
    export_metrics,
    export_tables,
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers / fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class FakeTableConfig:
    """Minimal attribute-style table entry with the fields export_tables() uses.

    The previous docstring described this as a stand-in for
    ``src.config.TableConfig``; that wording is dropped here because the
    accompanying change notes list ``src/config.py`` as removed.

    NOTE(review): none of the tests visible in this file instantiate this
    class (they pass plain dicts to export_tables()) - confirm whether it is
    still needed.
    """

    # Short table name.
    name: str
    # Table identifier, e.g. "project.dataset.table" in the dict-based fixtures.
    id: str
    # Explicit catalog fully-qualified name; None when not provided.
    catalog_fqn: Optional[str] = None
|
|
|
|
|
|
def _make_raw_metric(
|
|
name: str = "Total Revenue",
|
|
fqn: str = "metrics.total_revenue",
|
|
category_tag: str = "MetricCategory.finance",
|
|
description: str = "Sum of order revenue",
|
|
expression: str = "SUM(grs_revenue_plan_local)",
|
|
) -> Dict[str, Any]:
|
|
"""Build a realistic raw metric dict as returned by the catalog API."""
|
|
return {
|
|
"id": f"id-{name}",
|
|
"name": name,
|
|
"fullyQualifiedName": fqn,
|
|
"displayName": name,
|
|
"description": description,
|
|
"metricExpression": {"expression": expression},
|
|
"metricType": "sum",
|
|
"unitOfMeasurement": "USD",
|
|
"granularity": "monthly",
|
|
"tags": [{"tagFQN": category_tag}],
|
|
"owners": [{"name": "Data Team"}],
|
|
}
|
|
|
|
|
|
def _make_raw_table(
|
|
name: str = "order_economics",
|
|
fqn: str = "bigquery.project.dataset.order_economics",
|
|
) -> Dict[str, Any]:
|
|
"""Build a realistic raw table dict as returned by the catalog API."""
|
|
return {
|
|
"id": "tbl-1",
|
|
"name": name,
|
|
"fullyQualifiedName": fqn,
|
|
"description": "Order-level economics table",
|
|
"columns": [
|
|
{"name": "order_id", "dataType": "STRING", "description": "PK"},
|
|
{"name": "grs_revenue", "dataType": "FLOAT", "description": "Revenue"},
|
|
],
|
|
"tags": [{"tagFQN": "Tier.Tier1"}],
|
|
"owners": [{"name": "Finance"}],
|
|
}
|
|
|
|
|
|
@pytest.fixture
def mock_client():
    """Provide a MagicMock that stands in for OpenMetadataClient.

    ``get_metrics()`` is preset to return a one-element list holding the
    default raw metric, and ``get_table()`` to return the default raw table.
    Individual tests override these return values as needed.
    """
    fake = MagicMock()
    fake.get_table.return_value = _make_raw_table()
    fake.get_metrics.return_value = [_make_raw_metric()]
    return fake
|
|
|
|
|
|
@pytest.fixture
def mock_tables():
    """Provide table dicts in the TableRegistryRepository.list_all() format.

    The list holds a single entry that carries an explicit ``catalog_fqn``.
    """
    order_economics = {
        "id": "prj.dataset.order_economics",
        "name": "order_economics",
        "catalog_fqn": "bigquery.prj.dataset.order_economics",
    }
    return [order_economics]
|
|
|
|
|
|
# Placeholder catalog base URL handed to the header/export helpers under test.
CATALOG_URL = "https://catalog.example.com"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. _yaml_header
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestYamlHeader:
    """Checks on the comment header text produced by _yaml_header()."""

    def test_yaml_header_contains_marker(self):
        """The header text begins with the AUTO-GENERATED marker."""
        assert _yaml_header(CATALOG_URL).startswith(AUTO_GENERATED_MARKER)

    def test_yaml_header_contains_url(self):
        """The catalog URL appears somewhere in the header."""
        rendered = _yaml_header(CATALOG_URL)
        assert CATALOG_URL in rendered

    def test_yaml_header_contains_fqn(self):
        """A '# FQN: ...' line is present when an FQN is supplied."""
        expected_fqn = "metrics.total_revenue"
        rendered = _yaml_header(CATALOG_URL, fqn=expected_fqn)
        assert f"# FQN: {expected_fqn}" in rendered

    def test_yaml_header_no_fqn_line_when_empty(self):
        """No '# FQN:' line is emitted for an empty fqn argument."""
        rendered = _yaml_header(CATALOG_URL, fqn="")
        assert "# FQN:" not in rendered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 2. _is_auto_generated
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestIsAutoGenerated:
    """Checks on _is_auto_generated() marker detection."""

    def test_is_auto_generated_true(self, tmp_path: Path):
        """A file whose content starts with the marker is reported as generated."""
        generated = tmp_path / "metric.yml"
        generated.write_text(AUTO_GENERATED_MARKER + "\nsome: data\n")
        assert _is_auto_generated(generated) is True

    def test_is_auto_generated_false(self, tmp_path: Path):
        """A hand-written file without the marker is reported as not generated."""
        hand_written = tmp_path / "manual.yml"
        hand_written.write_text("# Manually written metric\nname: custom\n")
        assert _is_auto_generated(hand_written) is False

    def test_is_auto_generated_missing_file(self, tmp_path: Path):
        """A path that does not exist yields False."""
        missing = tmp_path / "does_not_exist.yml"
        assert _is_auto_generated(missing) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. export_metrics
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestExportMetrics:
    """Tests for export_metrics(): file layout, index, cleanup and tag filtering."""

    @staticmethod
    def _load_yaml_body(path: Path):
        """Parse *path* as YAML after dropping every line that starts with '#'.

        Only lines with '#' in the first column are removed, which strips the
        generated comment header before handing the rest to yaml.safe_load.
        """
        body_lines = [
            line for line in path.read_text().splitlines() if not line.startswith("#")
        ]
        return yaml.safe_load("\n".join(body_lines))

    def test_export_metrics_writes_files(self, tmp_path: Path, mock_client):
        """Creates category dirs and .yml files for each metric."""
        docs = tmp_path / "docs"
        count = export_metrics(mock_client, docs, CATALOG_URL)

        assert count == 1
        # Category directory should exist
        category_dir = docs / "metrics" / "finance"
        assert category_dir.is_dir()

        # Metric file should exist
        metric_files = list(category_dir.glob("*.yml"))
        assert len(metric_files) == 1
        assert metric_files[0].name == "total_revenue.yml"

    def test_export_metrics_writes_index(self, tmp_path: Path, mock_client):
        """Creates metrics.yml index with correct structure."""
        docs = tmp_path / "docs"
        export_metrics(mock_client, docs, CATALOG_URL)

        index_path = docs / "metrics" / "metrics.yml"
        assert index_path.exists()

        parsed = self._load_yaml_body(index_path)

        assert "metrics" in parsed
        assert len(parsed["metrics"]) == 1
        entry = parsed["metrics"][0]
        assert entry["name"] == "total_revenue"
        assert entry["category"] == "finance"
        assert "file" in entry

    def test_export_metrics_yaml_parseable(self, tmp_path: Path, mock_client):
        """Output metric YAML is valid and parseable by yaml.safe_load."""
        docs = tmp_path / "docs"
        export_metrics(mock_client, docs, CATALOG_URL)

        metric_file = docs / "metrics" / "finance" / "total_revenue.yml"
        parsed = self._load_yaml_body(metric_file)

        assert isinstance(parsed, list)
        assert len(parsed) == 1
        assert parsed[0]["name"] == "total_revenue"
        assert parsed[0]["expression"] == "SUM(grs_revenue_plan_local)"

    def test_export_metrics_preserves_manual_files(
        self, tmp_path: Path, mock_client
    ):
        """Files without AUTO-GENERATED marker are never deleted."""
        docs = tmp_path / "docs"
        metrics_dir = docs / "metrics" / "custom"
        metrics_dir.mkdir(parents=True)

        manual = metrics_dir / "hand_crafted.yml"
        manual.write_text("# My custom metric\nname: hand_crafted\n")

        export_metrics(mock_client, docs, CATALOG_URL)

        # Manual file must survive
        assert manual.exists()
        assert manual.read_text().startswith("# My custom metric")

    def test_export_metrics_cleans_stale(self, tmp_path: Path, mock_client):
        """Old auto-generated files removed when metric no longer in catalog."""
        docs = tmp_path / "docs"
        stale_dir = docs / "metrics" / "old_category"
        stale_dir.mkdir(parents=True)

        stale = stale_dir / "gone_metric.yml"
        stale.write_text(AUTO_GENERATED_MARKER + "\nname: gone\n")

        export_metrics(mock_client, docs, CATALOG_URL)

        # Stale auto-generated file should be removed
        assert not stale.exists()

    def test_export_metrics_zero_results_preserves(
        self, tmp_path: Path, mock_client
    ):
        """When API returns 0 metrics, existing files are untouched."""
        mock_client.get_metrics.return_value = []

        docs = tmp_path / "docs"
        metrics_dir = docs / "metrics" / "finance"
        metrics_dir.mkdir(parents=True)

        existing = metrics_dir / "existing.yml"
        original_text = AUTO_GENERATED_MARKER + "\nname: existing\n"
        existing.write_text(original_text)

        count = export_metrics(mock_client, docs, CATALOG_URL)

        assert count == 0
        # Existing file untouched (early return before cleanup): it must
        # still be present AND carry exactly the content written above.
        assert existing.exists()
        assert existing.read_text() == original_text

    def test_export_metrics_multiple(self, tmp_path: Path, mock_client):
        """Multiple metrics across categories are all exported."""
        mock_client.get_metrics.return_value = [
            _make_raw_metric(
                name="Total Revenue",
                fqn="metrics.total_revenue",
                category_tag="MetricCategory.finance",
            ),
            _make_raw_metric(
                name="Active Users",
                fqn="metrics.active_users",
                category_tag="MetricCategory.product",
                expression="COUNT(DISTINCT user_id)",
            ),
        ]

        docs = tmp_path / "docs"
        count = export_metrics(mock_client, docs, CATALOG_URL)

        assert count == 2
        assert (docs / "metrics" / "finance" / "total_revenue.yml").exists()
        assert (docs / "metrics" / "product" / "active_users.yml").exists()

    def test_export_metrics_filter_tag_keeps_matching(
        self, tmp_path: Path, mock_client
    ):
        """Only metrics with the filter_tag are exported."""
        tagged = _make_raw_metric(
            name="M1", fqn="M1", category_tag="MetricCategory.finance"
        )
        tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"})

        untagged = _make_raw_metric(
            name="Live Deals", fqn="LiveDeals", category_tag="MetricCategory.supply"
        )

        mock_client.get_metrics.return_value = [tagged, untagged]

        docs = tmp_path / "docs"
        count = export_metrics(
            mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI"
        )

        assert count == 1
        assert (docs / "metrics" / "finance" / "m1.yml").exists()
        assert not (docs / "metrics" / "supply").exists()

    def test_export_metrics_filter_tag_empty_exports_all(
        self, tmp_path: Path, mock_client
    ):
        """Empty filter_tag means no filtering - all metrics exported."""
        mock_client.get_metrics.return_value = [
            _make_raw_metric(name="A", fqn="A"),
            _make_raw_metric(name="B", fqn="B"),
        ]

        docs = tmp_path / "docs"
        count = export_metrics(mock_client, docs, CATALOG_URL, filter_tag="")

        assert count == 2

    def test_export_metrics_filter_tag_cleans_stale_untagged(
        self, tmp_path: Path, mock_client
    ):
        """Stale files from previously-exported untagged metrics get cleaned up."""
        tagged = _make_raw_metric(
            name="M1", fqn="M1", category_tag="MetricCategory.finance"
        )
        tagged["tags"].append({"tagFQN": "AIAgent.FoundryAI", "name": "FoundryAI"})
        mock_client.get_metrics.return_value = [tagged]

        docs = tmp_path / "docs"
        stale_dir = docs / "metrics" / "general"
        stale_dir.mkdir(parents=True)
        stale = stale_dir / "livedeals.yml"
        stale.write_text(AUTO_GENERATED_MARKER + "\nname: livedeals\n")

        export_metrics(
            mock_client, docs, CATALOG_URL, filter_tag="AIAgent.FoundryAI"
        )

        assert not stale.exists()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 4. export_tables
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestExportTables:
    """Tests for export_tables(): file output, error tolerance and FQN choice."""

    def test_export_tables_writes_files(
        self, tmp_path: Path, mock_client, mock_tables
    ):
        """A table YAML file is written and lists the table's columns."""
        out_dir = tmp_path / "docs"
        exported = export_tables(mock_client, mock_tables, out_dir, CATALOG_URL)
        assert exported == 1

        table_file = out_dir / "tables" / "order_economics.yml"
        assert table_file.exists()

        # Drop the '#' header lines, then parse what is left as YAML.
        body = "\n".join(
            row
            for row in table_file.read_text().splitlines()
            if not row.startswith("#")
        )
        doc = yaml.safe_load(body)

        assert doc["name"] == "order_economics"
        columns = doc["columns"]
        assert len(columns) == 2
        assert columns[0]["name"] == "order_id"

    def test_export_tables_handles_api_error(
        self, tmp_path: Path, mock_client
    ):
        """A failing table is skipped while the remaining tables are exported."""
        broken_entry = {
            "id": "prj.dataset.broken",
            "name": "broken_table",
            "catalog_fqn": "bigquery.prj.dataset.broken",
        }
        good_entry = {
            "id": "prj.dataset.good",
            "name": "good_table",
            "catalog_fqn": "bigquery.prj.dataset.good",
        }

        def fail_for_broken(fqn):
            # Succeed for every FQN except those containing "broken".
            if "broken" not in fqn:
                return _make_raw_table(name="good_table", fqn=fqn)
            raise RuntimeError("API unreachable")

        mock_client.get_table.side_effect = fail_for_broken

        out_dir = tmp_path / "docs"
        exported = export_tables(
            mock_client, [broken_entry, good_entry], out_dir, CATALOG_URL
        )

        # Only the good table is counted and written.
        assert exported == 1
        tables_dir = out_dir / "tables"
        assert not (tables_dir / "broken_table.yml").exists()
        assert (tables_dir / "good_table.yml").exists()

    def test_export_tables_uses_catalog_fqn(
        self, tmp_path: Path, mock_client, mock_tables
    ):
        """The explicit catalog_fqn of a table entry is what get_table() receives."""
        export_tables(mock_client, mock_tables, tmp_path / "docs", CATALOG_URL)

        mock_client.get_table.assert_called_once_with(
            "bigquery.prj.dataset.order_economics"
        )

    def test_export_tables_derives_fqn_from_id(
        self, tmp_path: Path, mock_client
    ):
        """Without a catalog_fqn key the FQN is derived as 'bigquery.{id}'."""
        entry_without_fqn = {"id": "project.dataset.my_table", "name": "my_table"}

        export_tables(
            mock_client, [entry_without_fqn], tmp_path / "docs", CATALOG_URL
        )

        mock_client.get_table.assert_called_once_with(
            "bigquery.project.dataset.my_table"
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 5. _write_sync_state
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWriteSyncState:
    """Tests for _write_sync_state()."""

    def test_write_sync_state(self, tmp_path: Path):
        """Writes .catalog_sync_state.json with counts and timestamp."""
        docs = tmp_path / "docs"
        docs.mkdir()

        _write_sync_state(docs, metrics_count=5, tables_count=2)

        state_path = docs / ".catalog_sync_state.json"
        assert state_path.exists()

        state = json.loads(state_path.read_text())
        assert state["metrics_count"] == 5
        assert state["tables_count"] == 2
        assert "last_export" in state

        # Timestamp should be ISO format: fromisoformat() raises ValueError
        # on a string it cannot parse, which fails the test.
        datetime.fromisoformat(state["last_export"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 6. _get_docs_dir
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGetDocsDir:
    """Tests for _get_docs_dir() resolution relative to DATA_DIR."""

    def test_get_docs_dir_from_env(self, monkeypatch):
        """With DATA_DIR set, docs resolves to a sibling 'docs' directory."""
        monkeypatch.setenv("DATA_DIR", "/data/src_data")
        assert _get_docs_dir() == Path("/data/docs")

    def test_get_docs_dir_default(self, monkeypatch):
        """With DATA_DIR unset, the result equals the relative path 'docs'."""
        monkeypatch.delenv("DATA_DIR", raising=False)
        # ./data -> parent is "." -> docs is "./docs"
        expected = Path("docs")
        assert _get_docs_dir() == expected
|