Fix profiler file_size and catalog stats fallback

- Profiler computes file_size_mb from the actual parquet files on disk
  when the sync state provides no size, e.g. when sync_state.json is
  absent (sample data / no-sync deployments)
- Catalog header falls back to profiles.json for aggregate stats
  (table count, total columns, total rows, total size) when
  sync_state.json is missing
This commit is contained in:
Petr 2026-03-10 22:12:46 +01:00
parent 1be0dc5300
commit 28543d98b1
2 changed files with 49 additions and 0 deletions

View file

@ -1022,6 +1022,19 @@ def profile_table(
last_sync = table_sync.get("last_sync")
sync_strategy_state = table_sync.get("strategy", table.sync_strategy)
# Compute file size from parquet if not in sync state
if file_size_mb is None:
try:
if parquet_path.is_dir():
total_bytes = sum(f.stat().st_size for f in parquet_path.glob("*.parquet"))
elif parquet_path.exists():
total_bytes = parquet_path.stat().st_size
else:
total_bytes = 0
file_size_mb = total_bytes / (1024 * 1024)
except OSError:
file_size_mb = None
# Date range from first date column
date_range = None
if first_date_col:

View file

@ -198,6 +198,42 @@ def _load_data_stats() -> dict:
except Exception as e:
logger.warning(f"Could not load data stats from sync_state.json: {e}")
# Fallback: derive stats from profiles.json (covers sample data / no-sync setups)
try:
profiles_path = _resolve_metadata_path("profiles.json")
if profiles_path.exists():
with open(profiles_path) as f:
profiles = json.load(f)
tables_data = profiles.get("tables", {})
if tables_data:
total_tables = len(tables_data)
total_rows = sum(t.get("row_count", 0) for t in tables_data.values())
total_columns = sum(t.get("column_count", 0) for t in tables_data.values())
total_size_mb = sum(t.get("file_size_mb", 0) or 0 for t in tables_data.values())
if total_rows >= 1_000_000:
rows_display = f"{total_rows / 1_000_000:.0f}M+"
elif total_rows >= 1_000:
rows_display = f"{total_rows / 1_000:.0f}K+"
else:
rows_display = str(total_rows)
size_mb = round(total_size_mb)
size_display = f"{size_mb / 1000:.1f} GB" if size_mb >= 1000 else f"{size_mb} MB"
return {
"tables": total_tables,
"columns": total_columns,
"rows": total_rows,
"rows_display": rows_display,
"size_mb": size_mb,
"size_display": size_display,
"uncompressed_mb": 0,
"unstructured_gb": 0,
"unstructured_display": "",
"last_updated": None,
"highlights": {},
}
except Exception as e:
logger.warning(f"Could not load data stats from profiles.json: {e}")
return dict(FALLBACK_DATA_STATS)