Fix profiler file_size and catalog stats fallback
- Profiler computes file_size_mb from the actual parquet files when sync_state.json is absent (sample data / no-sync deployments).
- Catalog header falls back to profiles.json for aggregate stats (table count, total rows) when sync_state.json is missing.
This commit is contained in:
parent
1be0dc5300
commit
28543d98b1
2 changed files with 49 additions and 0 deletions
|
|
@ -1022,6 +1022,19 @@ def profile_table(
|
||||||
last_sync = table_sync.get("last_sync")
|
last_sync = table_sync.get("last_sync")
|
||||||
sync_strategy_state = table_sync.get("strategy", table.sync_strategy)
|
sync_strategy_state = table_sync.get("strategy", table.sync_strategy)
|
||||||
|
|
||||||
|
# Compute file size from parquet if not in sync state
|
||||||
|
if file_size_mb is None:
|
||||||
|
try:
|
||||||
|
if parquet_path.is_dir():
|
||||||
|
total_bytes = sum(f.stat().st_size for f in parquet_path.glob("*.parquet"))
|
||||||
|
elif parquet_path.exists():
|
||||||
|
total_bytes = parquet_path.stat().st_size
|
||||||
|
else:
|
||||||
|
total_bytes = 0
|
||||||
|
file_size_mb = total_bytes / (1024 * 1024)
|
||||||
|
except OSError:
|
||||||
|
file_size_mb = None
|
||||||
|
|
||||||
# Date range from first date column
|
# Date range from first date column
|
||||||
date_range = None
|
date_range = None
|
||||||
if first_date_col:
|
if first_date_col:
|
||||||
|
|
|
||||||
|
|
@ -198,6 +198,42 @@ def _load_data_stats() -> dict:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not load data stats from sync_state.json: {e}")
|
logger.warning(f"Could not load data stats from sync_state.json: {e}")
|
||||||
|
|
||||||
|
# Fallback: derive stats from profiles.json (covers sample data / no-sync setups)
|
||||||
|
try:
|
||||||
|
profiles_path = _resolve_metadata_path("profiles.json")
|
||||||
|
if profiles_path.exists():
|
||||||
|
with open(profiles_path) as f:
|
||||||
|
profiles = json.load(f)
|
||||||
|
tables_data = profiles.get("tables", {})
|
||||||
|
if tables_data:
|
||||||
|
total_tables = len(tables_data)
|
||||||
|
total_rows = sum(t.get("row_count", 0) for t in tables_data.values())
|
||||||
|
total_columns = sum(t.get("column_count", 0) for t in tables_data.values())
|
||||||
|
total_size_mb = sum(t.get("file_size_mb", 0) or 0 for t in tables_data.values())
|
||||||
|
if total_rows >= 1_000_000:
|
||||||
|
rows_display = f"{total_rows / 1_000_000:.0f}M+"
|
||||||
|
elif total_rows >= 1_000:
|
||||||
|
rows_display = f"{total_rows / 1_000:.0f}K+"
|
||||||
|
else:
|
||||||
|
rows_display = str(total_rows)
|
||||||
|
size_mb = round(total_size_mb)
|
||||||
|
size_display = f"{size_mb / 1000:.1f} GB" if size_mb >= 1000 else f"{size_mb} MB"
|
||||||
|
return {
|
||||||
|
"tables": total_tables,
|
||||||
|
"columns": total_columns,
|
||||||
|
"rows": total_rows,
|
||||||
|
"rows_display": rows_display,
|
||||||
|
"size_mb": size_mb,
|
||||||
|
"size_display": size_display,
|
||||||
|
"uncompressed_mb": 0,
|
||||||
|
"unstructured_gb": 0,
|
||||||
|
"unstructured_display": "",
|
||||||
|
"last_updated": None,
|
||||||
|
"highlights": {},
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not load data stats from profiles.json: {e}")
|
||||||
|
|
||||||
return dict(FALLBACK_DATA_STATS)
|
return dict(FALLBACK_DATA_STATS)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue