fix: add union_by_name=true to read_parquet calls in profiler
Handles schema evolution across partitions when profiling tables made up of multiple Parquet files whose column sets may differ.
This commit is contained in:
parent
5e0e4ceb9e
commit
30987eef16
1 changed file with 2 additions and 2 deletions
|
|
@@ -716,9 +716,9 @@ def profile_table(
|
||||||
|
|
||||||
# Determine read expression
|
# Determine read expression
|
||||||
if parquet_path.is_dir():
|
if parquet_path.is_dir():
|
||||||
read_expr = f"read_parquet('{parquet_path}/*.parquet')"
|
read_expr = f"read_parquet('{parquet_path}/*.parquet', union_by_name=true)"
|
||||||
else:
|
else:
|
||||||
read_expr = f"read_parquet('{parquet_path}')"
|
read_expr = f"read_parquet('{parquet_path}', union_by_name=true)"
|
||||||
|
|
||||||
# Get row count to decide on sampling
|
# Get row count to decide on sampling
|
||||||
total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]
|
total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue