Fix profiler file_size and catalog stats fallback
- Profiler computes file_size_mb from the actual parquet files when sync_state.json is absent (sample data / no-sync deployments).
- Catalog header falls back to profiles.json for aggregate stats (table count, total rows) when sync_state.json is missing.
This commit is contained in:
parent
1be0dc5300
commit
28543d98b1
2 changed files with 49 additions and 0 deletions
|
|
@@ -1022,6 +1022,19 @@ def profile_table(
|
|||
last_sync = table_sync.get("last_sync")
|
||||
sync_strategy_state = table_sync.get("strategy", table.sync_strategy)
|
||||
|
||||
# Compute file size from parquet if not in sync state
|
||||
if file_size_mb is None:
|
||||
try:
|
||||
if parquet_path.is_dir():
|
||||
total_bytes = sum(f.stat().st_size for f in parquet_path.glob("*.parquet"))
|
||||
elif parquet_path.exists():
|
||||
total_bytes = parquet_path.stat().st_size
|
||||
else:
|
||||
total_bytes = 0
|
||||
file_size_mb = total_bytes / (1024 * 1024)
|
||||
except OSError:
|
||||
file_size_mb = None
|
||||
|
||||
# Date range from first date column
|
||||
date_range = None
|
||||
if first_date_col:
|
||||
|
|
|
|||
|
|
@@ -198,6 +198,42 @@ def _load_data_stats() -> dict:
|
|||
except Exception as e:
|
||||
logger.warning(f"Could not load data stats from sync_state.json: {e}")
|
||||
|
||||
# Fallback: derive stats from profiles.json (covers sample data / no-sync setups)
|
||||
try:
|
||||
profiles_path = _resolve_metadata_path("profiles.json")
|
||||
if profiles_path.exists():
|
||||
with open(profiles_path) as f:
|
||||
profiles = json.load(f)
|
||||
tables_data = profiles.get("tables", {})
|
||||
if tables_data:
|
||||
total_tables = len(tables_data)
|
||||
total_rows = sum(t.get("row_count", 0) for t in tables_data.values())
|
||||
total_columns = sum(t.get("column_count", 0) for t in tables_data.values())
|
||||
total_size_mb = sum(t.get("file_size_mb", 0) or 0 for t in tables_data.values())
|
||||
if total_rows >= 1_000_000:
|
||||
rows_display = f"{total_rows / 1_000_000:.0f}M+"
|
||||
elif total_rows >= 1_000:
|
||||
rows_display = f"{total_rows / 1_000:.0f}K+"
|
||||
else:
|
||||
rows_display = str(total_rows)
|
||||
size_mb = round(total_size_mb)
|
||||
size_display = f"{size_mb / 1000:.1f} GB" if size_mb >= 1000 else f"{size_mb} MB"
|
||||
return {
|
||||
"tables": total_tables,
|
||||
"columns": total_columns,
|
||||
"rows": total_rows,
|
||||
"rows_display": rows_display,
|
||||
"size_mb": size_mb,
|
||||
"size_display": size_display,
|
||||
"uncompressed_mb": 0,
|
||||
"unstructured_gb": 0,
|
||||
"unstructured_display": "",
|
||||
"last_updated": None,
|
||||
"highlights": {},
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load data stats from profiles.json: {e}")
|
||||
|
||||
return dict(FALLBACK_DATA_STATS)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue