From 28543d98b112c4b9da988b8438ff50927482b9e7 Mon Sep 17 00:00:00 2001 From: Petr Date: Tue, 10 Mar 2026 22:12:46 +0100 Subject: [PATCH] Fix profiler file_size and catalog stats fallback - Profiler computes file_size_mb from actual parquet files when sync_state.json is absent (sample data / no-sync deployments) - Catalog header falls back to profiles.json for aggregate stats (tables count, total rows) when sync_state.json is missing --- src/profiler.py | 13 +++++++++++++ webapp/app.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/profiler.py b/src/profiler.py index c49f624..b26c2e2 100644 --- a/src/profiler.py +++ b/src/profiler.py @@ -1022,6 +1022,19 @@ def profile_table( last_sync = table_sync.get("last_sync") sync_strategy_state = table_sync.get("strategy", table.sync_strategy) + # Compute file size from parquet if not in sync state + if file_size_mb is None: + try: + if parquet_path.is_dir(): + total_bytes = sum(f.stat().st_size for f in parquet_path.glob("*.parquet")) + elif parquet_path.exists(): + total_bytes = parquet_path.stat().st_size + else: + total_bytes = 0 + file_size_mb = total_bytes / (1024 * 1024) + except OSError: + file_size_mb = None + # Date range from first date column date_range = None if first_date_col: diff --git a/webapp/app.py b/webapp/app.py index 6117774..5065d83 100644 --- a/webapp/app.py +++ b/webapp/app.py @@ -198,6 +198,42 @@ def _load_data_stats() -> dict: except Exception as e: logger.warning(f"Could not load data stats from sync_state.json: {e}") + # Fallback: derive stats from profiles.json (covers sample data / no-sync setups) + try: + profiles_path = _resolve_metadata_path("profiles.json") + if profiles_path.exists(): + with open(profiles_path) as f: + profiles = json.load(f) + tables_data = profiles.get("tables", {}) + if tables_data: + total_tables = len(tables_data) + total_rows = sum(t.get("row_count", 0) for t in tables_data.values()) + total_columns = sum(t.get("column_count", 0) for t in tables_data.values()) + total_size_mb = sum(t.get("file_size_mb", 0) or 0 for t in tables_data.values()) + if total_rows >= 1_000_000: + rows_display = f"{total_rows / 1_000_000:.0f}M+" + elif total_rows >= 1_000: + rows_display = f"{total_rows / 1_000:.0f}K+" + else: + rows_display = str(total_rows) + size_mb = round(total_size_mb) + size_display = f"{size_mb / 1000:.1f} GB" if size_mb >= 1000 else f"{size_mb} MB" + return { + "tables": total_tables, + "columns": total_columns, + "rows": total_rows, + "rows_display": rows_display, + "size_mb": size_mb, + "size_display": size_display, + "uncompressed_mb": 0, + "unstructured_gb": 0, + "unstructured_display": "", + "last_updated": None, + "highlights": {}, + } + except Exception as e: + logger.warning(f"Could not load data stats from profiles.json: {e}") + return dict(FALLBACK_DATA_STATS)