Simplify profiler config: use single SAMPLE_SIZE parameter (KISS)
Replace SAMPLE_THRESHOLD + SAMPLE_SIZE with a single SAMPLE_SIZE parameter:
- If the table has more than SAMPLE_SIZE rows: sample that many rows
- Otherwise: use all rows

Cleaner and easier to configure.
This commit is contained in:
parent
e2d3afade3
commit
c25278538c
1 changed file with 2 additions and 3 deletions
|
|
@@ -34,8 +34,7 @@ logger = logging.getLogger(__name__)
|
|||
# ---------------------------------------------------------------------------
|
||||
# Profiler configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
SAMPLE_THRESHOLD = 500_000 # Sample tables larger than this
|
||||
SAMPLE_SIZE = 500_000
|
||||
SAMPLE_SIZE = 500_000 # If table > this, sample this many rows; else use all
|
||||
MAX_CATEGORICAL_DISTINCT = 50 # Treat as categorical if unique <= this
|
||||
TOP_VALUES_LIMIT = 10 # Number of top values for categorical columns
|
||||
HISTOGRAM_BINS = 15 # Number of bins for numeric histograms
|
||||
|
|
@@ -689,7 +688,7 @@ def profile_table(
|
|||
|
||||
# Materialize into temp table — reads parquet files once instead of per-query
|
||||
view_name = "tbl"
|
||||
sampled = total_rows > SAMPLE_THRESHOLD
|
||||
sampled = total_rows > SAMPLE_SIZE
|
||||
if sampled:
|
||||
con.execute(
|
||||
f"CREATE TEMP TABLE {view_name} AS SELECT * FROM {read_expr} USING SAMPLE {SAMPLE_SIZE} ROWS"
|
||||
|
|
|
|||
Loading…
Reference in a new issue