fix: add union_by_name=true to read_parquet calls in profiler

Handles schema evolution across partitions when profiling tables
with multiple parquet files that may have different column sets.
This commit is contained in:
ZdenekSrotyr 2026-04-09 18:42:33 +02:00
parent 5e0e4ceb9e
commit 30987eef16

View file

@@ -716,9 +716,9 @@ def profile_table(
# Determine read expression
if parquet_path.is_dir():
-read_expr = f"read_parquet('{parquet_path}/*.parquet')"
+read_expr = f"read_parquet('{parquet_path}/*.parquet', union_by_name=true)"
else:
-read_expr = f"read_parquet('{parquet_path}')"
+read_expr = f"read_parquet('{parquet_path}', union_by_name=true)"
# Get row count to decide on sampling
total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]