From 30987eef164860902bb1df6067bfaacaf0a6dc59 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Thu, 9 Apr 2026 18:42:33 +0200 Subject: [PATCH] fix: add union_by_name=true to read_parquet calls in profiler Handles schema evolution across partitions when profiling tables with multiple parquet files that may have different column sets. --- src/profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/profiler.py b/src/profiler.py index 0cda5f7..ced8778 100644 --- a/src/profiler.py +++ b/src/profiler.py @@ -716,9 +716,9 @@ def profile_table( # Determine read expression if parquet_path.is_dir(): - read_expr = f"read_parquet('{parquet_path}/*.parquet')" + read_expr = f"read_parquet('{parquet_path}/*.parquet', union_by_name=true)" else: - read_expr = f"read_parquet('{parquet_path}')" + read_expr = f"read_parquet('{parquet_path}', union_by_name=true)" # Get row count to decide on sampling total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]