Open-source AI data analyst platform extracted from internal repo. Includes data sync engine, Keboola adapter, Flask web portal, server deployment scripts, and configuration templates.
318 lines
11 KiB
Python
318 lines
11 KiB
Python
"""Tests for the data profiler batch functions and profile_table output format."""
|
|
|
|
import tempfile
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
import duckdb
|
|
import pytest
|
|
|
|
from src.profiler import (
|
|
TableInfo,
|
|
_batch_base_stats,
|
|
_batch_boolean_stats,
|
|
_batch_date_stats,
|
|
_batch_numeric_stats,
|
|
_batch_string_stats,
|
|
_round,
|
|
classify_type,
|
|
profile_table,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
|
|
@pytest.fixture
def con_with_data():
    """Yield a DuckDB connection holding the five-row test table ``tbl``.

    The table has the columns ``id``, ``name``, ``active``, ``created`` and
    ``amount``. Row 4 is NULL in every column except ``id``, so each
    nullable column has exactly four non-null values.

    The connection is closed on teardown, and also when creating the table
    fails, so a broken setup does not leak the connection.
    """
    con = duckdb.connect()
    try:
        con.execute("""
            CREATE TABLE tbl AS SELECT * FROM (VALUES
                (1, 'alice', TRUE, DATE '2023-01-15', 10.5),
                (2, 'bob', FALSE, DATE '2023-06-20', 0.0),
                (3, 'alice', TRUE, DATE '2024-01-10', -3.2),
                (4, NULL, NULL, NULL, NULL),
                (5, 'charlie', FALSE, DATE '2024-07-01', 100.0)
            ) AS t(id, name, active, created, amount)
        """)
        yield con
    finally:
        # Runs after the test finishes and also if CREATE TABLE raised.
        con.close()
|
|
|
|
|
|
@pytest.fixture
def parquet_dir(tmp_path):
    """Write a small parquet file under ``tmp_path`` and return its path.

    Despite the fixture name, the returned value is the path of the file
    ``test_table.parquet`` itself, not of a directory. The file contains
    the same five rows as the ``tbl`` table used by the batch-function
    tests.
    """
    parquet_path = tmp_path / "test_table.parquet"
    con = duckdb.connect()
    try:
        con.execute(f"""
            COPY (
                SELECT * FROM (VALUES
                    (1, 'alice', TRUE, DATE '2023-01-15', 10.5),
                    (2, 'bob', FALSE, DATE '2023-06-20', 0.0),
                    (3, 'alice', TRUE, DATE '2024-01-10', -3.2),
                    (4, NULL, NULL, NULL, NULL),
                    (5, 'charlie', FALSE, DATE '2024-07-01', 100.0)
                ) AS t(id, name, active, created, amount)
            ) TO '{parquet_path}' (FORMAT PARQUET)
        """)
    finally:
        # Close the writer connection even when COPY fails.
        con.close()
    return parquet_path
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Unit tests: batch functions
# ---------------------------------------------------------------------------
|
|
class TestBatchBaseStats:
    """Checks for ``_batch_base_stats`` against the ``tbl`` fixture table."""

    def test_counts(self, con_with_data):
        """Each column maps to a ``(non_null_count, unique_count)`` pair."""
        columns = ["id", "name", "active", "created", "amount"]
        stats = _batch_base_stats(con_with_data, "tbl", columns)

        expected = {
            # 5 non-null, 5 unique
            "id": (5, 5),
            # 4 non-null (one NULL), 3 unique (alice, bob, charlie)
            "name": (4, 3),
            # 4 non-null, 2 unique (TRUE, FALSE)
            "active": (4, 2),
            # 4 non-null, 4 unique dates
            "created": (4, 4),
            # 4 non-null, 4 unique
            "amount": (4, 4),
        }
        for column, counts in expected.items():
            assert stats[column] == counts

    def test_empty_columns(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_base_stats(con_with_data, "tbl", []) == {}
|
|
|
|
|
|
class TestBatchNumericStats:
    """Checks for ``_batch_numeric_stats`` against the ``tbl`` fixture table."""

    def test_aggregates(self, con_with_data):
        """Min/max for both columns, plus zero and negative counts for ``amount``."""
        stats = _batch_numeric_stats(con_with_data, "tbl", ["id", "amount"])
        for column in ("id", "amount"):
            assert column in stats

        # id: min=1, max=5
        id_stats = stats["id"]
        assert id_stats["min"] == 1
        assert id_stats["max"] == 5

        # amount: min=-3.2, max=100.0, one zero, one negative value.
        # DuckDB may return Decimal for DECIMAL columns; compare as float.
        amount_stats = stats["amount"]
        assert float(amount_stats["min"]) == pytest.approx(-3.2)
        assert float(amount_stats["max"]) == pytest.approx(100.0)
        assert amount_stats["zeros"] == 1
        assert amount_stats["negative"] == 1

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_numeric_stats(con_with_data, "tbl", []) == {}
|
|
|
|
|
|
class TestBatchStringStats:
    """Checks for ``_batch_string_stats`` against the ``tbl`` fixture table."""

    def test_lengths(self, con_with_data):
        """Length statistics of ``name`` cover the four non-null values."""
        stats = _batch_string_stats(con_with_data, "tbl", ["name"])
        assert "name" in stats

        name_stats = stats["name"]
        # alice=5, bob=3, charlie=7 -> min=3, max=7
        assert name_stats["min_length"] == 3
        assert name_stats["max_length"] == 7
        # (5+3+5+7)/4 = 5.0 -- 'alice' occurs twice
        assert name_stats["avg_length"] == pytest.approx(5.0)

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_string_stats(con_with_data, "tbl", []) == {}
|
|
|
|
|
|
class TestBatchDateStats:
    """Checks for ``_batch_date_stats`` against the ``tbl`` fixture table."""

    def test_range(self, con_with_data):
        """Earliest/latest come back as ISO date strings with a positive span."""
        column_types = {"created": "DATE"}
        stats = _batch_date_stats(con_with_data, "tbl", ["created"], column_types)
        assert "created" in stats

        created_stats = stats["created"]
        assert created_stats["earliest"] == "2023-01-15"
        assert created_stats["latest"] == "2024-07-01"

        # Only the sign of the span is checked, not its exact value.
        span = created_stats["span_days"]
        assert span is not None
        assert span > 0

    def test_empty(self, con_with_data):
        """No columns and no type map produce an empty mapping."""
        assert _batch_date_stats(con_with_data, "tbl", [], {}) == {}
|
|
|
|
|
|
class TestBatchBooleanStats:
    """Checks for ``_batch_boolean_stats`` against the ``tbl`` fixture table."""

    def test_counts(self, con_with_data):
        """``active`` holds 2 TRUE, 2 FALSE and 1 NULL value."""
        result = _batch_boolean_stats(con_with_data, "tbl", ["active"])
        assert "active" in result
        assert result["active"]["true_count"] == 2
        assert result["active"]["false_count"] == 2
        # 2 TRUE out of the 4 non-NULL values. The percentage is a computed
        # float, so compare with a tolerance like the other float assertions
        # in this module instead of exact equality.
        assert result["active"]["true_pct"] == pytest.approx(50.0)

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        result = _batch_boolean_stats(con_with_data, "tbl", [])
        assert result == {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Integration test: profile_table output format
# ---------------------------------------------------------------------------
|
|
class TestProfileTable:
    """Integration checks for the dictionary returned by ``profile_table``."""

    def test_output_structure(self, parquet_dir):
        """Verify profile_table produces the expected JSON structure."""
        table_info = TableInfo(
            table_id="test.test_table",
            name="test_table",
            description="Test table",
            primary_key="id",
            sync_strategy="full",
        )
        profile = profile_table(
            table=table_info,
            parquet_path=parquet_dir,
            all_tables=[table_info],
            sync_state={},
            metrics_map={},
        )

        def column(name):
            # First column profile whose "name" matches.
            return next(c for c in profile["columns"] if c["name"] == name)

        # Top-level values and keys
        assert profile["table_id"] == "test.test_table"
        assert profile["row_count"] == 5
        assert profile["column_count"] == 5
        assert profile["sampled"] is False
        for key in ("columns", "sample_rows", "alerts", "variable_types"):
            assert key in profile

        # Every source column is profiled
        profiled_names = [entry["name"] for entry in profile["columns"]]
        for expected_name in ("id", "name", "active", "created", "amount"):
            assert expected_name in profiled_names

        # Keys shared by every column profile
        common_keys = (
            "name",
            "type",
            "type_category",
            "completeness_pct",
            "null_count",
            "unique_count",
            "unique_pct",
            "sample_values",
            "is_primary_key",
            "alerts",
        )
        for entry in profile["columns"]:
            for key in common_keys:
                assert key in entry

        # Numeric column has numeric_stats
        amount_profile = column("amount")
        assert "numeric_stats" in amount_profile
        numeric_stats = amount_profile["numeric_stats"]
        for key in ("min", "max", "mean", "median", "histogram"):
            assert key in numeric_stats

        # String column has string_stats
        name_profile = column("name")
        assert "string_stats" in name_profile
        string_stats = name_profile["string_stats"]
        for key in ("min_length", "top_values"):
            assert key in string_stats

        # Date column has date_stats
        created_profile = column("created")
        assert "date_stats" in created_profile
        date_stats = created_profile["date_stats"]
        for key in ("earliest", "latest", "histogram"):
            assert key in date_stats

        # Boolean column has boolean_stats
        active_profile = column("active")
        assert "boolean_stats" in active_profile
        boolean_stats = active_profile["boolean_stats"]
        for key in ("true_count", "false_count"):
            assert key in boolean_stats

    def test_sample_rows(self, parquet_dir):
        """Verify sample_rows are populated."""
        table_info = TableInfo(
            table_id="test.test_table",
            name="test_table",
            description="Test",
            primary_key="id",
            sync_strategy="full",
        )
        profile = profile_table(table_info, parquet_dir, [table_info], {}, {})

        sample_rows = profile["sample_rows"]
        # The file has five rows and all of them are expected in the sample.
        assert len(sample_rows) == 5
        assert "id" in sample_rows[0]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Unit tests: helpers
# ---------------------------------------------------------------------------
|
|
class TestClassifyType:
    """Checks that ``classify_type`` maps SQL type names to type categories."""

    def test_numeric(self):
        """Integer and floating-point type names are classified as NUMERIC."""
        for sql_type in ("INTEGER", "BIGINT", "DOUBLE", "FLOAT"):
            assert classify_type(sql_type) == "NUMERIC"

    def test_string(self):
        """VARCHAR and TEXT are classified as STRING."""
        for sql_type in ("VARCHAR", "TEXT"):
            assert classify_type(sql_type) == "STRING"

    def test_boolean(self):
        """BOOLEAN and its BOOL alias are classified as BOOLEAN."""
        for sql_type in ("BOOLEAN", "BOOL"):
            assert classify_type(sql_type) == "BOOLEAN"

    def test_date(self):
        """DATE keeps its own category; timestamp variants map to TIMESTAMP."""
        expected_by_type = {
            "DATE": "DATE",
            "TIMESTAMP": "TIMESTAMP",
            "TIMESTAMP WITH TIME ZONE": "TIMESTAMP",
        }
        for sql_type, category in expected_by_type.items():
            assert classify_type(sql_type) == category
|
|
|
|
|
|
class TestProfileTableEmpty:
    """Checks ``profile_table`` on a parquet file that has columns but no rows."""

    def test_empty_table(self, tmp_path):
        """Verify profiler handles empty tables gracefully."""
        parquet_path = tmp_path / "empty.parquet"
        con = duckdb.connect()
        try:
            # "WHERE false" keeps the two-column schema but writes zero rows.
            con.execute(f"""
                COPY (
                    SELECT 1 AS id, 'x' AS name WHERE false
                ) TO '{parquet_path}' (FORMAT PARQUET)
            """)
        finally:
            # Close the writer connection even when COPY fails.
            con.close()

        table = TableInfo(
            table_id="test.empty",
            name="empty",
            description="Empty table",
            primary_key="id",
            sync_strategy="full",
        )
        profile = profile_table(table, parquet_path, [table], {}, {})
        assert profile["row_count"] == 0
        assert profile["column_count"] == 2
        # With no cells at all, the missing-cell percentage is expected to be 0.0.
        assert profile["missing_cells_pct"] == 0.0
|
|
|
|
|
|
class TestClassifyTypeParameterized:
    """Checks ``classify_type`` on type names that carry precision/scale arguments."""

    def test_decimal_with_precision(self):
        """DECIMAL(p,s) is classified as NUMERIC regardless of its arguments."""
        for sql_type in ("DECIMAL(18,3)", "DECIMAL(4,1)"):
            assert classify_type(sql_type) == "NUMERIC"

    def test_numeric_with_precision(self):
        """NUMERIC(p,s) is classified as NUMERIC."""
        category = classify_type("NUMERIC(10,2)")
        assert category == "NUMERIC"
|
|
|
|
|
|
class TestRound:
    """Checks for the ``_round`` helper."""

    def test_float(self):
        """3.14159 comes back as 3.14."""
        rounded = _round(3.14159)
        assert rounded == 3.14

    def test_none(self):
        """None passes through as None."""
        rounded = _round(None)
        assert rounded is None

    def test_int(self):
        """An integer input compares equal to itself after rounding."""
        rounded = _round(42)
        assert rounded == 42

    def test_nan(self):
        """NaN is converted to None."""
        not_a_number = float("nan")
        assert _round(not_a_number) is None

    def test_inf(self):
        """Positive infinity is converted to None."""
        infinity = float("inf")
        assert _round(infinity) is None
|