agnes-the-ai-analyst/tests/test_profiler.py
Petr c56905d34f Initial commit: OSS data distribution platform
Open-source AI data analyst platform extracted from internal repo.
Includes data sync engine, Keboola adapter, Flask web portal,
server deployment scripts, and configuration templates.
2026-03-08 23:31:28 +01:00

318 lines
11 KiB
Python

"""Tests for the data profiler batch functions and profile_table output format."""
import tempfile
from datetime import date
from pathlib import Path
import duckdb
import pytest
from src.profiler import (
TableInfo,
_batch_base_stats,
_batch_boolean_stats,
_batch_date_stats,
_batch_numeric_stats,
_batch_string_stats,
_round,
classify_type,
profile_table,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def con_with_data():
    """Yield a DuckDB connection that already contains the test table ``tbl``.

    ``tbl`` has five rows with columns (id, name, active, created, amount);
    the row with id 4 is NULL in every other column. The connection is
    closed after the requesting test has run.
    """
    create_table_sql = """
CREATE TABLE tbl AS SELECT * FROM (VALUES
(1, 'alice', TRUE, DATE '2023-01-15', 10.5),
(2, 'bob', FALSE, DATE '2023-06-20', 0.0),
(3, 'alice', TRUE, DATE '2024-01-10', -3.2),
(4, NULL, NULL, NULL, NULL),
(5, 'charlie', FALSE, DATE '2024-07-01', 100.0)
) AS t(id, name, active, created, amount)
"""
    connection = duckdb.connect()
    connection.execute(create_table_sql)
    yield connection
    # Teardown: runs once the test using this fixture is done.
    connection.close()
@pytest.fixture
def parquet_dir(tmp_path):
    """Write a small parquet file and return its path.

    Despite the fixture's name, the returned value is the path of the
    ``test_table.parquet`` file itself, created under pytest's ``tmp_path``.
    The file holds five rows with columns (id, name, active, created,
    amount); the row with id 4 is NULL in every other column.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path of the parquet file that was written.
    """
    parquet_path = tmp_path / "test_table.parquet"
    # Double any single quote so the path stays a valid SQL string literal.
    sql_path = str(parquet_path).replace("'", "''")
    con = duckdb.connect()
    try:
        con.execute(f"""
COPY (
SELECT * FROM (VALUES
(1, 'alice', TRUE, DATE '2023-01-15', 10.5),
(2, 'bob', FALSE, DATE '2023-06-20', 0.0),
(3, 'alice', TRUE, DATE '2024-01-10', -3.2),
(4, NULL, NULL, NULL, NULL),
(5, 'charlie', FALSE, DATE '2024-07-01', 100.0)
) AS t(id, name, active, created, amount)
) TO '{sql_path}' (FORMAT PARQUET)
""")
    finally:
        # Release the connection even when the COPY statement fails.
        con.close()
    return parquet_path
# ---------------------------------------------------------------------------
# Unit tests: batch functions
# ---------------------------------------------------------------------------
class TestBatchBaseStats:
    """Checks for ``_batch_base_stats`` against the ``tbl`` fixture table."""

    def test_counts(self, con_with_data):
        """Each column maps to a (non-null count, unique count) pair."""
        columns = ["id", "name", "active", "created", "amount"]
        stats = _batch_base_stats(con_with_data, "tbl", columns)
        expected = {
            "id": (5, 5),  # no NULLs, every id distinct
            "name": (4, 3),  # one NULL; alice, bob, charlie
            "active": (4, 2),  # one NULL; TRUE and FALSE
            "created": (4, 4),  # one NULL; four distinct dates
            "amount": (4, 4),  # one NULL; four distinct amounts
        }
        for column, pair in expected.items():
            assert stats[column] == pair

    def test_empty_columns(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_base_stats(con_with_data, "tbl", []) == {}
class TestBatchNumericStats:
    """Checks for ``_batch_numeric_stats`` against the ``tbl`` fixture table."""

    def test_aggregates(self, con_with_data):
        """Min/max are reported per column, plus zero and negative counts."""
        stats = _batch_numeric_stats(con_with_data, "tbl", ["id", "amount"])
        for column in ("id", "amount"):
            assert column in stats

        # id runs from 1 to 5.
        id_stats = stats["id"]
        assert id_stats["min"] == 1
        assert id_stats["max"] == 5

        # amount spans -3.2 .. 100.0 with one zero and one negative value.
        # DuckDB may hand back Decimal for DECIMAL columns, so compare as float.
        amount_stats = stats["amount"]
        assert float(amount_stats["min"]) == pytest.approx(-3.2)
        assert float(amount_stats["max"]) == pytest.approx(100.0)
        assert amount_stats["zeros"] == 1
        assert amount_stats["negative"] == 1

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_numeric_stats(con_with_data, "tbl", []) == {}
class TestBatchStringStats:
    """Checks for ``_batch_string_stats`` against the ``tbl`` fixture table."""

    def test_lengths(self, con_with_data):
        """Length statistics of ``name`` cover its four non-NULL values."""
        stats = _batch_string_stats(con_with_data, "tbl", ["name"])
        assert "name" in stats

        name_stats = stats["name"]
        # Shortest value is 'bob' (3), longest is 'charlie' (7).
        assert name_stats["min_length"] == 3
        assert name_stats["max_length"] == 7
        # alice, bob, alice, charlie -> (5 + 3 + 5 + 7) / 4 = 5.0
        assert name_stats["avg_length"] == pytest.approx(5.0)

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_string_stats(con_with_data, "tbl", []) == {}
class TestBatchDateStats:
    """Checks for ``_batch_date_stats`` against the ``tbl`` fixture table."""

    def test_range(self, con_with_data):
        """Earliest/latest come back as ISO strings with a positive span."""
        column_types = {"created": "DATE"}
        stats = _batch_date_stats(con_with_data, "tbl", ["created"], column_types)
        assert "created" in stats

        created_stats = stats["created"]
        assert created_stats["earliest"] == "2023-01-15"
        assert created_stats["latest"] == "2024-07-01"

        span_days = created_stats["span_days"]
        assert span_days is not None
        assert span_days > 0

    def test_empty(self, con_with_data):
        """No columns and no type map produce an empty mapping."""
        assert _batch_date_stats(con_with_data, "tbl", [], {}) == {}
class TestBatchBooleanStats:
    """Checks for ``_batch_boolean_stats`` against the ``tbl`` fixture table."""

    def test_counts(self, con_with_data):
        """``active`` has two TRUE and two FALSE values among its non-NULLs."""
        stats = _batch_boolean_stats(con_with_data, "tbl", ["active"])
        assert "active" in stats

        active_stats = stats["active"]
        assert active_stats["true_count"] == 2
        assert active_stats["false_count"] == 2
        assert active_stats["true_pct"] == 50.0

    def test_empty(self, con_with_data):
        """An empty column list produces an empty mapping."""
        assert _batch_boolean_stats(con_with_data, "tbl", []) == {}
# ---------------------------------------------------------------------------
# Integration test: profile_table output format
# ---------------------------------------------------------------------------
class TestProfileTable:
    """Integration checks on the dictionary returned by ``profile_table``."""

    def test_output_structure(self, parquet_dir):
        """Verify profile_table produces the expected JSON structure."""
        table_fields = {
            "table_id": "test.test_table",
            "name": "test_table",
            "description": "Test table",
            "primary_key": "id",
            "sync_strategy": "full",
        }
        table = TableInfo(**table_fields)
        profile = profile_table(
            table=table,
            parquet_path=parquet_dir,
            all_tables=[table],
            sync_state={},
            metrics_map={},
        )

        # Top-level values and keys.
        assert profile["table_id"] == "test.test_table"
        assert profile["row_count"] == 5
        assert profile["column_count"] == 5
        assert profile["sampled"] is False
        for top_level_key in ("columns", "sample_rows", "alerts", "variable_types"):
            assert top_level_key in profile

        # Every source column must have a profile entry.
        col_names = [entry["name"] for entry in profile["columns"]]
        for expected_name in ("id", "name", "active", "created", "amount"):
            assert expected_name in col_names

        # Keys that each column profile carries regardless of its type.
        common_keys = (
            "name",
            "type",
            "type_category",
            "completeness_pct",
            "null_count",
            "unique_count",
            "unique_pct",
            "sample_values",
            "is_primary_key",
            "alerts",
        )
        for entry in profile["columns"]:
            for key in common_keys:
                assert key in entry

        # Type-specific stats: (column, stats key, fields expected inside it).
        typed_expectations = (
            ("amount", "numeric_stats", ("min", "max", "mean", "median", "histogram")),
            ("name", "string_stats", ("min_length", "top_values")),
            ("created", "date_stats", ("earliest", "latest", "histogram")),
            ("active", "boolean_stats", ("true_count", "false_count")),
        )
        for column_name, stats_key, stat_fields in typed_expectations:
            entry = next(c for c in profile["columns"] if c["name"] == column_name)
            assert stats_key in entry
            type_stats = entry[stats_key]
            for field in stat_fields:
                assert field in type_stats

    def test_sample_rows(self, parquet_dir):
        """Verify sample_rows are populated."""
        table = TableInfo(
            sync_strategy="full",
            primary_key="id",
            description="Test",
            name="test_table",
            table_id="test.test_table",
        )
        profile = profile_table(table, parquet_dir, [table], {}, {})

        sample_rows = profile["sample_rows"]
        assert len(sample_rows) == 5
        assert "id" in sample_rows[0]
# ---------------------------------------------------------------------------
# Unit tests: helpers
# ---------------------------------------------------------------------------
class TestClassifyType:
    """Checks the category ``classify_type`` assigns to plain SQL type names."""

    def test_numeric(self):
        """Integer and floating-point type names are classified as NUMERIC."""
        for type_name in ("INTEGER", "BIGINT", "DOUBLE", "FLOAT"):
            assert classify_type(type_name) == "NUMERIC"

    def test_string(self):
        """VARCHAR and TEXT are classified as STRING."""
        for type_name in ("VARCHAR", "TEXT"):
            assert classify_type(type_name) == "STRING"

    def test_boolean(self):
        """BOOLEAN and its BOOL alias are classified as BOOLEAN."""
        for type_name in ("BOOLEAN", "BOOL"):
            assert classify_type(type_name) == "BOOLEAN"

    def test_date(self):
        """DATE stays DATE; both TIMESTAMP spellings map to TIMESTAMP."""
        expected_categories = (
            ("DATE", "DATE"),
            ("TIMESTAMP", "TIMESTAMP"),
            ("TIMESTAMP WITH TIME ZONE", "TIMESTAMP"),
        )
        for type_name, category in expected_categories:
            assert classify_type(type_name) == category
class TestProfileTableEmpty:
    """Checks ``profile_table`` on a parquet file that contains no rows."""

    def test_empty_table(self, tmp_path):
        """Verify profiler handles empty tables gracefully.

        Writes a two-column (id, name) parquet file with zero rows, profiles
        it, and checks the reported row count, column count and
        missing-cells percentage.
        """
        parquet_path = tmp_path / "empty.parquet"
        # Double any single quote so the path stays a valid SQL string literal.
        sql_path = str(parquet_path).replace("'", "''")
        con = duckdb.connect()
        try:
            # ``WHERE false`` keeps the schema but emits no rows.
            con.execute(f"""
COPY (
SELECT 1 AS id, 'x' AS name WHERE false
) TO '{sql_path}' (FORMAT PARQUET)
""")
        finally:
            # Release the connection even when the COPY statement fails.
            con.close()

        table = TableInfo(
            table_id="test.empty",
            name="empty",
            description="Empty table",
            primary_key="id",
            sync_strategy="full",
        )
        profile = profile_table(table, parquet_path, [table], {}, {})

        assert profile["row_count"] == 0
        assert profile["column_count"] == 2
        assert profile["missing_cells_pct"] == 0.0
class TestClassifyTypeParameterized:
    """Checks ``classify_type`` on type names carrying (precision, scale)."""

    def test_decimal_with_precision(self):
        """DECIMAL(p,s) spellings are classified as NUMERIC."""
        for type_name in ("DECIMAL(18,3)", "DECIMAL(4,1)"):
            assert classify_type(type_name) == "NUMERIC"

    def test_numeric_with_precision(self):
        """NUMERIC(p,s) is classified as NUMERIC."""
        category = classify_type("NUMERIC(10,2)")
        assert category == "NUMERIC"
class TestRound:
    """Checks the ``_round`` helper on ordinary and non-finite inputs."""

    def test_float(self):
        """3.14159 is expected to come back as 3.14."""
        rounded = _round(3.14159)
        assert rounded == 3.14

    def test_none(self):
        """None is passed through as None."""
        rounded = _round(None)
        assert rounded is None

    def test_int(self):
        """An integer input compares equal to itself after rounding."""
        rounded = _round(42)
        assert rounded == 42

    def test_nan(self):
        """NaN is mapped to None."""
        not_a_number = float("nan")
        assert _round(not_a_number) is None

    def test_inf(self):
        """Positive infinity is mapped to None."""
        infinity = float("inf")
        assert _round(infinity) is None