From c25278538c5c594bca99eb53ccd19e105240b20e Mon Sep 17 00:00:00 2001 From: Petr Date: Thu, 12 Mar 2026 14:43:23 +0100 Subject: [PATCH] Simplify profiler config: use single SAMPLE_SIZE parameter (KISS) Replace SAMPLE_THRESHOLD + SAMPLE_SIZE with single SAMPLE_SIZE: - If table > SAMPLE_SIZE: sample that many rows - Otherwise: use all rows Cleaner, easier to configure. --- src/profiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/profiler.py b/src/profiler.py index c4cf69a..00e658b 100644 --- a/src/profiler.py +++ b/src/profiler.py @@ -34,8 +34,7 @@ logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Profiler configuration # --------------------------------------------------------------------------- -SAMPLE_THRESHOLD = 500_000 # Sample tables larger than this -SAMPLE_SIZE = 500_000 +SAMPLE_SIZE = 500_000 # If table > this, sample this many rows; else use all MAX_CATEGORICAL_DISTINCT = 50 # Treat as categorical if unique <= this TOP_VALUES_LIMIT = 10 # Number of top values for categorical columns HISTOGRAM_BINS = 15 # Number of bins for numeric histograms @@ -689,7 +688,7 @@ def profile_table( # Materialize into temp table — reads parquet files once instead of per-query view_name = "tbl" - sampled = total_rows > SAMPLE_THRESHOLD + sampled = total_rows > SAMPLE_SIZE if sampled: con.execute( f"CREATE TEMP TABLE {view_name} AS SELECT * FROM {read_expr} USING SAMPLE {SAMPLE_SIZE} ROWS"