fix: add union_by_name=true to read_parquet calls in profiler

Handles schema evolution across partitions when profiling tables backed by multiple Parquet files whose column sets may differ: with union_by_name=true, columns are matched by name rather than by position.
2026-04-09 18:42:33 +02:00 · 2026-04-09 18:42:33 +02:00 · 30987eef16
commit 30987eef16
parent 5e0e4ceb9e
1 changed file with 2 additions and 2 deletions
--- a/src/profiler.py
+++ b/src/profiler.py
@@ -716,9 +716,9 @@ def profile_table(
    # Determine read expression
    if parquet_path.is_dir():
-        read_expr = f"read_parquet('{parquet_path}/*.parquet')"
+        read_expr = f"read_parquet('{parquet_path}/*.parquet', union_by_name=true)"
    else:
-        read_expr = f"read_parquet('{parquet_path}')"
+        read_expr = f"read_parquet('{parquet_path}', union_by_name=true)"
    # Get row count to decide on sampling
    total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]