#!/usr/bin/env python3
"""
Standalone Data Profiler — DuckDB-based table profiling for Parquet/CSV files.
Zero external dependencies beyond DuckDB. Produces a comprehensive JSON profile
with column statistics, histograms, alerts, and sample data.
Usage:
# Profile a single Parquet file
python standalone_profiler.py data/orders.parquet
# Profile a directory of Parquet files (treated as one table)
python standalone_profiler.py data/partitioned_orders/
# Profile a CSV file
python standalone_profiler.py data/customers.csv
# Custom output path
python standalone_profiler.py data/orders.parquet -o profiles/orders_profile.json
# Specify primary key for duplicate detection
python standalone_profiler.py data/orders.parquet --primary-key order_id
# Composite primary key
python standalone_profiler.py data/orders.parquet --primary-key "order_id,line_id"
# Profile multiple files at once
python standalone_profiler.py data/orders.parquet data/customers.parquet data/products.csv
# Generate HTML report alongside JSON
python standalone_profiler.py data/orders.parquet --html
# Generate HTML from existing profile JSON
python standalone_profiler.py --from-json profile.json
Output:
JSON file with table-level and column-level statistics, alerts, histograms,
top values for categorical columns, and sample rows.
With --html: self-contained HTML file viewable in any browser.
Requirements:
pip install duckdb
"""
import argparse
import html as html_mod
import json
import logging
import math
import os
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import duckdb
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level prefixed to every message.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Shared logger for every function in this module.
logger = logging.getLogger("profiler")
# ---------------------------------------------------------------------------
# Profiler configuration
# ---------------------------------------------------------------------------
SAMPLE_THRESHOLD = 500_000 # Tables with more rows than this are profiled on a sample
SAMPLE_SIZE = 500_000 # Number of rows drawn (USING SAMPLE ... ROWS) when sampling
MAX_CATEGORICAL_DISTINCT = 50 # A string column is categorical if its distinct count <= this
TOP_VALUES_LIMIT = 10 # Number of top values reported for categorical columns
HISTOGRAM_BINS = 15 # Number of equal-width bins in numeric histograms
SAMPLE_ROWS_LIMIT = 5 # Number of sample rows included in the profile
SAMPLE_VALUES_LIMIT = 5 # Number of distinct sample values collected per column
# Alert thresholds (percentages are compared with a strict ">")
ALERT_HIGH_MISSING_PCT = 30.0 # missing % above this -> "high_missing"
ALERT_MISSING_PCT = 5.0 # missing % above this (but not above the high mark) -> "missing"
ALERT_IMBALANCE_PCT = 60.0 # share of the most frequent value above this -> "imbalance"
ALERT_ZEROS_PCT = 50.0 # share of zeros among non-null values above this -> "zeros"
ALERT_HIGH_CARDINALITY = 50 # non-categorical string column with more distinct values -> "high_cardinality"
# ---------------------------------------------------------------------------
# DuckDB type classification
# ---------------------------------------------------------------------------
def classify_type(duckdb_type: str) -> str:
    """Map a DuckDB type string to a simplified category.

    Args:
        duckdb_type: Type name as reported by DuckDB (e.g. ``"DECIMAL(18,2)"``,
            ``"TIMESTAMP WITH TIME ZONE"``, ``"INTEGER[]"``). Case and
            surrounding whitespace are ignored.

    Returns:
        One of ``"BOOLEAN"``, ``"DATE"``, ``"TIMESTAMP"``, ``"NUMERIC"`` or
        ``"STRING"``. Anything that is not recognised, including nested types
        (lists, arrays, STRUCT, MAP, UNION), falls back to ``"STRING"``.
    """
    numeric_types = {
        "FLOAT", "DOUBLE", "DECIMAL", "REAL", "FLOAT4", "FLOAT8", "NUMERIC",
        "HUGEINT", "UHUGEINT", "INTEGER", "INT", "BIGINT", "SMALLINT",
        "TINYINT", "INT8", "INT4", "INT2", "INT1", "INT16", "INT32", "INT64",
        "LONG", "SHORT", "SIGNED", "UBIGINT", "UINTEGER", "USMALLINT",
        "UTINYINT",
    }
    t = duckdb_type.strip().upper()

    # Lists/arrays ("TIMESTAMP[]", "DECIMAL(18,2)[3]") are not scalar values,
    # so they must not inherit the category of their element type.
    if t.endswith("]"):
        return "STRING"

    # Strip type parameters: "DECIMAL(18,2)" -> "DECIMAL", "STRUCT(a INT)" -> "STRUCT".
    base_type = t.split("(")[0].strip()

    # Containers may mention TIMESTAMP/DATE/etc. in their child types; judge
    # them by the container itself rather than by a substring match.
    if base_type in ("STRUCT", "MAP", "UNION", "LIST"):
        return "STRING"
    if base_type in ("BOOLEAN", "BOOL"):
        return "BOOLEAN"
    if base_type == "DATE":
        return "DATE"
    # Covers TIMESTAMP, TIMESTAMP_S/MS/NS, TIMESTAMPTZ, TIMESTAMP WITH TIME ZONE.
    if base_type.startswith("TIMESTAMP") or base_type == "DATETIME":
        return "TIMESTAMP"
    if base_type in numeric_types:
        return "NUMERIC"
    return "STRING"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _round(value: Any, digits: int = 2) -> Any:
"""Round a value if it is a float, otherwise return as-is."""
if value is None:
return None
if isinstance(value, float):
if math.isnan(value) or math.isinf(value):
return None
return round(value, digits)
return value
def _format_number(n: float) -> str:
"""Format large numbers with human-readable suffixes for histogram bin labels."""
if n is None:
return "?"
abs_n = abs(n)
if abs_n >= 1_000_000_000:
return f"{n / 1_000_000_000:.1f}B"
if abs_n >= 1_000_000:
return f"{n / 1_000_000:.1f}M"
if abs_n >= 1_000:
return f"{n / 1_000:.1f}K"
if isinstance(n, float) and n != int(n):
return f"{n:.2f}"
return str(int(n))
def write_json_atomic(path: Path, data: Any) -> None:
    """Serialize *data* as JSON to *path* without exposing a partial file.

    The payload is written to a temporary file in the destination directory
    (created if missing) and then moved over *path* with ``os.replace``.
    Values the JSON encoder cannot handle are stringified via ``default=str``.
    If anything fails, the temporary file is removed and the error re-raised.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so os.replace never crosses filesystems.
    handle, scratch_path = tempfile.mkstemp(dir=str(destination_dir), suffix=".tmp")
    try:
        with os.fdopen(handle, "w") as out:
            json.dump(data, out, indent=2, default=str)
        # mkstemp creates the file owner-only; make it world-readable.
        os.chmod(scratch_path, 0o644)
        os.replace(scratch_path, str(path))
        logger.info("Wrote %s", path)
    except Exception:
        try:
            os.unlink(scratch_path)
        except OSError:
            # Already moved or never created; nothing left to clean up.
            pass
        raise
# ---------------------------------------------------------------------------
# Batch statistics functions
# ---------------------------------------------------------------------------
def _batch_base_stats(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    columns: List[str],
) -> Dict[str, Tuple[int, int]]:
    """Get non-null and distinct counts for all columns in a single query.

    Args:
        con: Open DuckDB connection.
        view_name: Table or view to scan (interpolated into the SQL as-is).
        columns: Column names to count.

    Returns:
        ``{col_name: (non_null_count, unique_count)}``; empty when *columns*
        is empty.
    """
    if not columns:
        return {}
    select_parts: List[str] = []
    for col_name in columns:
        # Double embedded quotes so names such as `say "hi"` remain valid
        # quoted identifiers instead of breaking the statement.
        quoted = '"' + col_name.replace('"', '""') + '"'
        select_parts.append(f"COUNT({quoted})")
        select_parts.append(f"COUNT(DISTINCT {quoted})")
    sql = f"SELECT {', '.join(select_parts)} FROM {view_name}"
    row = con.execute(sql).fetchone()
    # Two result cells per column, in the order the columns were given.
    return {
        col_name: (row[2 * pos], row[2 * pos + 1])
        for pos, col_name in enumerate(columns)
    }
def _batch_numeric_stats(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    numeric_cols: List[str],
) -> Dict[str, Dict[str, Any]]:
    """Get aggregate statistics for all numeric columns in a single query.

    Args:
        con: Open DuckDB connection.
        view_name: Table or view to scan (interpolated into the SQL as-is).
        numeric_cols: Names of the numeric columns.

    Returns:
        ``{col_name: stats}`` where ``stats`` holds the raw (unrounded) values
        for ``min``, ``max``, ``mean``, ``median``, ``stddev``, ``p5``,
        ``p25``, ``p75``, ``p95``, ``zeros`` and ``negative``. Empty when
        *numeric_cols* is empty.
    """
    if not numeric_cols:
        return {}
    # Result keys, in the same order as the expressions generated below.
    stat_keys = (
        "min", "max", "mean", "median", "stddev",
        "p5", "p25", "p75", "p95", "zeros", "negative",
    )
    select_parts: List[str] = []
    for col_name in numeric_cols:
        # Double embedded quotes so unusual column names stay valid identifiers.
        quoted = '"' + col_name.replace('"', '""') + '"'
        select_parts.extend([
            f"MIN({quoted})",
            f"MAX({quoted})",
            f"AVG({quoted})",
            f"MEDIAN({quoted})",
            f"STDDEV({quoted})",
            f"PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY {quoted})",
            f"PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {quoted})",
            f"PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {quoted})",
            f"PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY {quoted})",
            f"SUM(CASE WHEN {quoted} = 0 THEN 1 ELSE 0 END)",
            f"SUM(CASE WHEN {quoted} < 0 THEN 1 ELSE 0 END)",
        ])
    sql = f"SELECT {', '.join(select_parts)} FROM {view_name}"
    row = con.execute(sql).fetchone()
    width = len(stat_keys)
    return {
        col_name: dict(zip(stat_keys, row[pos * width:(pos + 1) * width]))
        for pos, col_name in enumerate(numeric_cols)
    }
def _batch_string_stats(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    string_cols: List[str],
) -> Dict[str, Dict[str, Any]]:
    """Get string length statistics for all string columns in a single query.

    Args:
        con: Open DuckDB connection.
        view_name: Table or view to scan (interpolated into the SQL as-is).
        string_cols: Names of the columns classified as strings.

    Returns:
        ``{col_name: {"min_length", "max_length", "avg_length"}}`` with zeros
        substituted when a column has no non-null values. Empty when
        *string_cols* is empty.
    """
    if not string_cols:
        return {}
    select_parts: List[str] = []
    for col_name in string_cols:
        # Double embedded quotes so unusual column names stay valid identifiers.
        quoted = '"' + col_name.replace('"', '""') + '"'
        # The "string" bucket is the fallback for every unrecognised type
        # (UUID, TIME, STRUCT, ...). Measuring the VARCHAR rendering keeps one
        # such column from failing the statement for all of them.
        # NOTE(review): for LIST columns this reports text length rather than
        # element count.
        text_len = f"LENGTH(CAST({quoted} AS VARCHAR))"
        select_parts.extend([
            f"MIN({text_len})",
            f"MAX({text_len})",
            f"AVG({text_len})",
        ])
    sql = f"SELECT {', '.join(select_parts)} FROM {view_name}"
    row = con.execute(sql).fetchone()
    result: Dict[str, Dict[str, Any]] = {}
    for pos, col_name in enumerate(string_cols):
        shortest, longest, average = row[3 * pos:3 * pos + 3]
        result[col_name] = {
            "min_length": shortest if shortest is not None else 0,
            "max_length": longest if longest is not None else 0,
            "avg_length": _round(average) if average is not None else 0.0,
        }
    return result
def _batch_date_stats(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    date_cols: List[str],
    category_map: Dict[str, str],
) -> Dict[str, Dict[str, Any]]:
    """Get date range statistics for all date/timestamp columns in a single query.

    Args:
        con: Open DuckDB connection.
        view_name: Table or view to scan (interpolated into the SQL as-is).
        date_cols: Names of the DATE/TIMESTAMP columns.
        category_map: ``{col_name: category}``; columns whose category is
            ``"TIMESTAMP"`` are cast to DATE before aggregation.

    Returns:
        ``{col_name: {"earliest", "latest", "span_days"}}`` where the bounds
        are strings (or ``None``) and ``span_days`` is ``None`` when it cannot
        be computed. Empty when *date_cols* is empty.
    """
    if not date_cols:
        return {}
    select_parts: List[str] = []
    for col_name in date_cols:
        # Double embedded quotes so unusual column names stay valid identifiers.
        quoted = '"' + col_name.replace('"', '""') + '"'
        if category_map.get(col_name) == "TIMESTAMP":
            expr = f"CAST({quoted} AS DATE)"
        else:
            expr = quoted
        select_parts.append(f"MIN({expr})")
        select_parts.append(f"MAX({expr})")
    sql = f"SELECT {', '.join(select_parts)} FROM {view_name}"
    row = con.execute(sql).fetchone()
    result: Dict[str, Dict[str, Any]] = {}
    for pos, col_name in enumerate(date_cols):
        earliest, latest = row[2 * pos], row[2 * pos + 1]
        span_days = None
        if earliest is not None and latest is not None:
            try:
                delta = latest - earliest
                # Date subtraction normally yields a timedelta; fall back to
                # int() for drivers that return a plain number.
                span_days = delta.days if hasattr(delta, "days") else int(delta)
            except (TypeError, ValueError):
                span_days = None
        result[col_name] = {
            "earliest": str(earliest) if earliest is not None else None,
            "latest": str(latest) if latest is not None else None,
            "span_days": span_days,
        }
    return result
def _batch_boolean_stats(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    bool_cols: List[str],
) -> Dict[str, Dict[str, Any]]:
    """Get boolean true/false counts for all boolean columns in a single query.

    Args:
        con: Open DuckDB connection.
        view_name: Table or view to scan (interpolated into the SQL as-is).
        bool_cols: Names of the boolean columns.

    Returns:
        ``{col_name: {"true_count", "false_count", "true_pct"}}`` where
        ``true_pct`` is the share of TRUE among non-null values (0.0 when the
        column has none). Empty when *bool_cols* is empty.
    """
    if not bool_cols:
        return {}
    select_parts: List[str] = []
    for col_name in bool_cols:
        # Double embedded quotes so unusual column names stay valid identifiers.
        quoted = '"' + col_name.replace('"', '""') + '"'
        select_parts.append(f"SUM(CASE WHEN {quoted} = TRUE THEN 1 ELSE 0 END)")
        select_parts.append(f"SUM(CASE WHEN {quoted} = FALSE THEN 1 ELSE 0 END)")
    sql = f"SELECT {', '.join(select_parts)} FROM {view_name}"
    row = con.execute(sql).fetchone()
    result: Dict[str, Dict[str, Any]] = {}
    for pos, col_name in enumerate(bool_cols):
        raw_true, raw_false = row[2 * pos], row[2 * pos + 1]
        # SUM over an empty table is NULL; treat that as zero occurrences.
        true_count = int(raw_true) if raw_true is not None else 0
        false_count = int(raw_false) if raw_false is not None else 0
        total = true_count + false_count
        result[col_name] = {
            "true_count": true_count,
            "false_count": false_count,
            "true_pct": _round(100.0 * true_count / total) if total > 0 else 0.0,
        }
    return result
# ---------------------------------------------------------------------------
# Core: profile a single file/table
# ---------------------------------------------------------------------------
def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier with embedded quotes doubled."""
    return '"' + str(name).replace('"', '""') + '"'


def _numeric_histogram(
    con: duckdb.DuckDBPyConnection,
    view_name: str,
    col_name: str,
    lower: Any,
    upper: Any,
) -> Dict[str, Any]:
    """Return an equal-width histogram of *col_name* between *lower* and *upper*.

    *lower*/*upper* are the column's raw (unrounded) minimum and maximum, so
    every non-null value falls into one of the ``HISTOGRAM_BINS`` buckets.
    Returns ``{"bins": [], "counts": []}`` if the range is degenerate or the
    query fails.
    """
    empty: Dict[str, Any] = {"bins": [], "counts": []}
    try:
        lo_bound = float(lower)
        hi_bound = float(upper)
        bin_width = (hi_bound - lo_bound) / HISTOGRAM_BINS
        if not math.isfinite(bin_width) or bin_width <= 0:
            return empty
        safe_col = _quote_identifier(col_name)
        # FLOOR-based bucketing works in all DuckDB versions. GREATEST/LEAST
        # clamp float round-off at both edges into the first/last bucket.
        bucket_rows = con.execute(
            f"""
            SELECT
                LEAST(GREATEST(FLOOR((CAST({safe_col} AS DOUBLE) - {lo_bound}) / {bin_width}), 0), {HISTOGRAM_BINS - 1}) + 1 AS bucket,
                COUNT(*) AS cnt
            FROM {view_name}
            WHERE {safe_col} IS NOT NULL
            GROUP BY bucket
            ORDER BY bucket
            """
        ).fetchall()
        bucket_counts = {int(r[0]): int(r[1]) for r in bucket_rows if r[0] is not None}
        bin_labels: List[str] = []
        bin_counts: List[int] = []
        for i in range(1, HISTOGRAM_BINS + 1):
            lo = lo_bound + (i - 1) * bin_width
            hi = lo_bound + i * bin_width
            bin_labels.append(f"{_format_number(lo)}-{_format_number(hi)}")
            bin_counts.append(bucket_counts.get(i, 0))
        return {"bins": bin_labels, "counts": bin_counts}
    except Exception as exc:
        logger.debug("Histogram failed for column %s: %s", col_name, exc)
        return empty


def _table_alerts(columns: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Expand each column's alert codes into table-level alert records with messages."""
    alert_messages = {
        "constant": "{col} is constant (single value)",
        "unique": "{col} has all unique values",
        "high_missing": "{col} has {pct}% missing values",
        "missing": "{col} has {pct}% missing values",
        "imbalance": "{col} is highly imbalanced (top value {pct}%)",
        "zeros": "{col} has {pct}% zero values",
        "high_cardinality": "{col} has high cardinality ({n} distinct)",
    }
    table_alerts: List[Dict[str, str]] = []
    for col in columns:
        col_alert_name = col.get("name", "")
        missing_pct_val = _round(100.0 - col.get("completeness_pct", 100.0))
        for a in col.get("alerts", []):
            if a in ("high_missing", "missing"):
                msg = alert_messages[a].format(col=col_alert_name, pct=missing_pct_val)
            elif a == "imbalance":
                top_pct = 0.0
                tv = col.get("string_stats", {}).get("top_values", [])
                if tv:
                    top_pct = tv[0].get("pct", 0.0)
                msg = alert_messages[a].format(col=col_alert_name, pct=top_pct)
            elif a == "zeros":
                ns = col.get("numeric_stats", {})
                msg = alert_messages[a].format(col=col_alert_name, pct=ns.get("zeros_pct", 0.0))
            elif a == "high_cardinality":
                msg = alert_messages[a].format(col=col_alert_name, n=col.get("unique_count", 0))
            else:
                template = alert_messages.get(a)
                # Unknown codes get a literal message; it is not passed through
                # str.format, so braces in the column name cannot break it.
                msg = template.format(col=col_alert_name) if template else f"{col_alert_name}: {a}"
            table_alerts.append({"column": col_alert_name, "type": a, "message": msg})
    return table_alerts


def profile_table(
    source_path: Path,
    table_name: Optional[str] = None,
    primary_key: Optional[str] = None,
) -> Dict[str, Any]:
    """Profile a single Parquet file, Parquet directory, or CSV file.

    Tables with more than ``SAMPLE_THRESHOLD`` rows are profiled on a random
    sample of ``SAMPLE_SIZE`` rows; ``row_count`` always reports the full
    table, while every other statistic describes the rows actually profiled.

    Args:
        source_path: Path to .parquet file, directory of .parquet files, or .csv file.
        table_name: Display name for the table (defaults to filename stem).
        primary_key: Comma-separated primary key column(s) for duplicate detection.

    Returns:
        Dict with complete profile (table-level + column-level statistics).

    Raises:
        duckdb.Error: If the source cannot be read or the base statistics
            query fails. The connection is closed in every case.
    """
    source_path = Path(source_path)
    if table_name is None:
        table_name = source_path.stem
    # Drop blank entries so "a,,b" or a trailing comma does not add a bogus key column.
    pk_columns: List[str] = []
    if primary_key:
        pk_columns = [c.strip() for c in primary_key.split(",") if c.strip()]

    # Escape single quotes so paths like "o'brien.csv" stay valid SQL string literals.
    path_literal = str(source_path).replace("'", "''")
    if source_path.is_dir():
        read_expr = f"read_parquet('{path_literal}/*.parquet')"
    elif source_path.suffix.lower() == ".csv":
        read_expr = f"read_csv_auto('{path_literal}')"
    else:
        read_expr = f"read_parquet('{path_literal}')"

    con = duckdb.connect()
    try:
        # Get row count to decide on sampling
        total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]

        # Materialize into temp table (reads source files once instead of per-query)
        view_name = "tbl"
        sampled = total_rows > SAMPLE_THRESHOLD
        if sampled:
            con.execute(
                f"CREATE TEMP TABLE {view_name} AS SELECT * FROM {read_expr} USING SAMPLE {SAMPLE_SIZE} ROWS"
            )
            working_rows = con.execute(f"SELECT COUNT(*) FROM {view_name}").fetchone()[0]
        else:
            con.execute(f"CREATE TEMP TABLE {view_name} AS SELECT * FROM {read_expr}")
            working_rows = total_rows

        # Column metadata, classified by simplified type category
        col_info = con.execute(f"DESCRIBE {view_name}").fetchall()
        all_col_names: List[str] = []
        type_map: Dict[str, str] = {}
        category_map: Dict[str, str] = {}
        cols_by_category: Dict[str, List[str]] = {
            "NUMERIC": [], "STRING": [], "DATE": [], "TIMESTAMP": [], "BOOLEAN": [],
        }
        for col_row in col_info:
            col_name, col_type = col_row[0], col_row[1]
            category = classify_type(col_type)
            all_col_names.append(col_name)
            type_map[col_name] = col_type
            category_map[col_name] = category
            cols_by_category.setdefault(category, []).append(col_name)
        numeric_cols = cols_by_category["NUMERIC"]
        string_cols = cols_by_category["STRING"]
        # Preserve table order across DATE and TIMESTAMP columns.
        date_cols = [c for c in all_col_names if category_map[c] in ("DATE", "TIMESTAMP")]
        bool_cols = cols_by_category["BOOLEAN"]

        # ---- Batch queries (one scan per type category) ----
        base_stats = _batch_base_stats(con, view_name, all_col_names)
        # Type-specific batches are best-effort: a failure only drops that
        # category's detail section from the profile.
        numeric_batch: Dict[str, Dict[str, Any]] = {}
        try:
            numeric_batch = _batch_numeric_stats(con, view_name, numeric_cols)
        except Exception as exc:
            logger.warning("Batch numeric stats failed: %s", exc)
        string_batch: Dict[str, Dict[str, Any]] = {}
        try:
            string_batch = _batch_string_stats(con, view_name, string_cols)
        except Exception as exc:
            logger.warning("Batch string stats failed: %s", exc)
        date_batch: Dict[str, Dict[str, Any]] = {}
        try:
            date_batch = _batch_date_stats(con, view_name, date_cols, category_map)
        except Exception as exc:
            logger.warning("Batch date stats failed: %s", exc)
        boolean_batch: Dict[str, Dict[str, Any]] = {}
        try:
            boolean_batch = _batch_boolean_stats(con, view_name, bool_cols)
        except Exception as exc:
            logger.warning("Batch boolean stats failed: %s", exc)

        # ---- Build column profiles ----
        columns: List[Dict[str, Any]] = []
        variable_types: Dict[str, int] = {}
        total_null_count = 0
        total_cells = working_rows * len(col_info) if col_info else 0
        first_date_col: Optional[Dict[str, Any]] = None

        for col_name in all_col_names:
            col_type = type_map[col_name]
            category = category_map[col_name]
            safe_col = _quote_identifier(col_name)
            variable_types[category] = variable_types.get(category, 0) + 1

            non_null, unique_count = base_stats.get(col_name, (0, 0))
            null_count = working_rows - non_null
            completeness_pct = _round(100.0 * non_null / working_rows) if working_rows > 0 else 0.0
            unique_pct = _round(100.0 * unique_count / non_null) if non_null > 0 else 0.0
            missing_pct = _round(100.0 * null_count / working_rows) if working_rows > 0 else 0.0

            # Sample values (best-effort)
            sample_values: List[str] = []
            try:
                rows = con.execute(
                    f"""
                    SELECT DISTINCT CAST({safe_col} AS VARCHAR) AS v
                    FROM {view_name}
                    WHERE {safe_col} IS NOT NULL
                    LIMIT {SAMPLE_VALUES_LIMIT}
                    """
                ).fetchall()
                sample_values = [r[0] for r in rows if r[0] is not None]
            except Exception as exc:
                logger.debug("Sample values failed for %s: %s", col_name, exc)

            # Alerts
            alerts: List[str] = []
            if unique_count == 1 and null_count == 0:
                alerts.append("constant")
            # Compare counts, not the rounded percentage: 499,999 distinct out
            # of 500,000 rounds to 100.0% but is not unique.
            if non_null > 0 and null_count == 0 and unique_count == non_null:
                alerts.append("unique")
            if missing_pct > ALERT_HIGH_MISSING_PCT:
                alerts.append("high_missing")
            elif missing_pct > ALERT_MISSING_PCT:
                alerts.append("missing")

            col_profile: Dict[str, Any] = {
                "name": col_name,
                "type": col_type,
                "type_category": category,
                "completeness_pct": completeness_pct,
                "null_count": null_count,
                "unique_count": unique_count,
                "unique_pct": unique_pct,
                "sample_values": sample_values,
                "is_primary_key": col_name in pk_columns,
                "alerts": alerts,
            }

            # Type-specific stats
            try:
                if category == "NUMERIC" and col_name in numeric_batch:
                    raw = numeric_batch[col_name]
                    min_val = _round(raw["min"])
                    max_val = _round(raw["max"])
                    zeros = int(raw["zeros"]) if raw["zeros"] is not None else 0
                    negative = int(raw["negative"]) if raw["negative"] is not None else 0
                    zeros_pct = _round(100.0 * zeros / non_null) if non_null > 0 else 0.0
                    negative_pct = _round(100.0 * negative / non_null) if non_null > 0 else 0.0
                    if zeros_pct > ALERT_ZEROS_PCT and "zeros" not in alerts:
                        alerts.append("zeros")

                    histogram: Dict[str, Any] = {"bins": [], "counts": []}
                    # min_val/max_val are None for NULL or non-finite bounds.
                    # Bucket on the raw bounds: a rounded minimum can sit above
                    # real values and push them out of every bin.
                    if min_val is not None and max_val is not None and raw["min"] != raw["max"]:
                        histogram = _numeric_histogram(
                            con, view_name, col_name, raw["min"], raw["max"]
                        )

                    col_profile["numeric_stats"] = {
                        "min": min_val,
                        "max": max_val,
                        "mean": _round(raw["mean"]),
                        "median": _round(raw["median"]),
                        "stddev": _round(raw["stddev"]),
                        "p5": _round(raw["p5"]),
                        "p25": _round(raw["p25"]),
                        "p75": _round(raw["p75"]),
                        "p95": _round(raw["p95"]),
                        "zeros": zeros,
                        "zeros_pct": zeros_pct,
                        "negative": negative,
                        "negative_pct": negative_pct,
                        "histogram": histogram,
                    }

                elif category == "STRING" and col_name in string_batch:
                    sl = string_batch[col_name]
                    is_categorical = unique_count <= MAX_CATEGORICAL_DISTINCT
                    top_values: List[Dict[str, Any]] = []
                    if is_categorical and non_null > 0:
                        rows = con.execute(
                            f"""
                            SELECT {safe_col} AS val, COUNT(*) AS cnt
                            FROM {view_name}
                            WHERE {safe_col} IS NOT NULL
                            GROUP BY {safe_col}
                            ORDER BY cnt DESC
                            LIMIT {TOP_VALUES_LIMIT}
                            """
                        ).fetchall()
                        for row in rows:
                            pct = _round(100.0 * row[1] / non_null) if non_null > 0 else 0.0
                            top_values.append({"value": str(row[0]), "count": row[1], "pct": pct})
                        if top_values and top_values[0]["pct"] > ALERT_IMBALANCE_PCT:
                            if "imbalance" not in alerts:
                                alerts.append("imbalance")
                    else:
                        if unique_count > ALERT_HIGH_CARDINALITY and "high_cardinality" not in alerts:
                            alerts.append("high_cardinality")
                    col_profile["string_stats"] = {
                        "min_length": sl["min_length"],
                        "max_length": sl["max_length"],
                        "avg_length": sl["avg_length"],
                        "top_values": top_values,
                    }

                elif category in ("DATE", "TIMESTAMP") and col_name in date_batch:
                    dr = date_batch[col_name]
                    cast_expr = f"CAST({safe_col} AS DATE)" if category == "TIMESTAMP" else safe_col
                    # Date histogram (YEAR/QUARTER grouping)
                    histogram = {"bins": [], "counts": []}
                    try:
                        rows = con.execute(
                            f"""
                            SELECT
                                YEAR({cast_expr}) AS yr,
                                QUARTER({cast_expr}) AS qtr,
                                COUNT(*) AS cnt
                            FROM {view_name}
                            WHERE {safe_col} IS NOT NULL
                            GROUP BY yr, qtr
                            ORDER BY yr, qtr
                            """
                        ).fetchall()
                        histogram["bins"] = [f"{int(r[0])}-Q{int(r[1])}" for r in rows]
                        histogram["counts"] = [int(r[2]) for r in rows]
                    except Exception as exc:
                        logger.debug("Date histogram failed for %s: %s", col_name, exc)
                    col_profile["date_stats"] = {
                        "earliest": dr["earliest"],
                        "latest": dr["latest"],
                        "span_days": dr["span_days"],
                        "histogram": histogram,
                    }
                    # The first populated date column defines the table-level date range.
                    if first_date_col is None and dr["earliest"]:
                        first_date_col = col_profile["date_stats"]

                elif category == "BOOLEAN" and col_name in boolean_batch:
                    col_profile["boolean_stats"] = boolean_batch[col_name]
            except Exception as exc:
                logger.warning("Type-specific stats failed for %s: %s", col_name, exc)

            columns.append(col_profile)
            total_null_count += null_count

        # Table-level completeness
        avg_completeness = 0.0
        if columns:
            avg_completeness = _round(
                sum(c["completeness_pct"] for c in columns) / len(columns)
            )
        missing_cells_pct = _round(100.0 * total_null_count / total_cells) if total_cells > 0 else 0.0

        # Duplicate rows (by primary key)
        duplicate_rows = 0
        if pk_columns and working_rows > 0:
            try:
                pk_expr = ", ".join(_quote_identifier(c) for c in pk_columns)
                distinct_pk = con.execute(
                    f"SELECT COUNT(DISTINCT ({pk_expr})) FROM {view_name}"
                ).fetchone()[0]
                duplicate_rows = working_rows - distinct_pk
            except Exception as exc:
                # A mistyped key would otherwise silently report zero duplicates.
                logger.warning("Duplicate check failed for primary key %s: %s", primary_key, exc)

        # Sample rows
        sample_rows: List[Dict[str, Any]] = []
        try:
            sample_result = con.execute(f"SELECT * FROM {view_name} LIMIT {SAMPLE_ROWS_LIMIT}")
            sample_col_names = [desc[0] for desc in sample_result.description]
            for row in sample_result.fetchall():
                sample_rows.append(
                    {sample_col_names[i]: str(v) if v is not None else None for i, v in enumerate(row)}
                )
        except Exception as exc:
            logger.debug("Sample rows failed: %s", exc)

        # Aggregate column alerts to table level
        table_alerts = _table_alerts(columns)

        # File size
        file_size_mb = None
        try:
            if source_path.is_dir():
                total_bytes = sum(f.stat().st_size for f in source_path.glob("*.parquet"))
            elif source_path.exists():
                total_bytes = source_path.stat().st_size
            else:
                total_bytes = 0
            file_size_mb = _round(total_bytes / (1024 * 1024))
        except OSError:
            pass

        # Date range from first date column
        date_range = None
        if first_date_col:
            date_range = {
                "earliest": first_date_col.get("earliest"),
                "latest": first_date_col.get("latest"),
                "span_days": first_date_col.get("span_days"),
            }

        return {
            "table_name": table_name,
            "source_path": str(source_path),
            "row_count": total_rows,
            "column_count": len(col_info),
            "file_size_mb": file_size_mb,
            "primary_key": primary_key,
            "avg_completeness": avg_completeness,
            "missing_cells": total_null_count,
            "missing_cells_pct": missing_cells_pct,
            "duplicate_rows": duplicate_rows,
            "variable_types": variable_types,
            "date_range": date_range,
            "alerts": table_alerts,
            "sampled": sampled,
            "columns": columns,
            "sample_rows": sample_rows,
        }
    finally:
        # Release the in-memory database even when profiling fails part-way.
        con.close()
# ---------------------------------------------------------------------------
# HTML report generation
# ---------------------------------------------------------------------------
# Hex colour per simplified type category (the values classify_type returns).
# NOTE(review): presumably used for the type badges in the HTML report —
# the code that reads this mapping is not visible here.
_TYPE_COLORS = {
"NUMERIC": "#8b5cf6",
"STRING": "#3b82f6",
"DATE": "#f59e0b",
"TIMESTAMP": "#f59e0b",
"BOOLEAN": "#10b981",
}
# One-letter severity per alert code. The letters match the suffixes of the
# .alert-e / .alert-w / .alert-i classes in _CSS.
# NOTE(review): presumably e = error, w = warning, i = info — confirm.
_ALERT_SEVERITY = {
"high_missing": "e",
"missing": "w",
"constant": "i",
"unique": "i",
"imbalance": "w",
"zeros": "w",
"high_cardinality": "i",
}
# Stylesheet text for the HTML report, kept as a single string constant.
# NOTE(review): presumably inlined into the generated page so the report is
# self-contained — the embedding code is not visible here.
# Elements with .tsec, .col-det and .sdata are hidden until they also carry
# .active / .open; the print media rule forces all of them visible.
_CSS = """
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',system-ui,sans-serif;
background:#f8fafc;color:#0f172a;line-height:1.5;font-size:14px}
.wrap{max-width:1200px;margin:0 auto;padding:20px 24px 60px}
header{padding:20px 0 16px;border-bottom:1px solid #e2e8f0;margin-bottom:24px}
h1{font-size:22px;font-weight:700}
.meta{color:#64748b;font-size:12px;margin-top:2px}
.cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px;margin:16px 0}
.card{background:#fff;border-radius:8px;box-shadow:0 1px 3px rgba(0,0,0,.08);padding:14px 16px;text-align:center}
.card-v{font-size:26px;font-weight:700}.card-l{font-size:10px;color:#64748b;text-transform:uppercase;letter-spacing:.05em;margin-top:2px}
.tabs{display:flex;gap:4px;margin-bottom:20px;flex-wrap:wrap}
.tab{padding:7px 14px;border-radius:6px;cursor:pointer;font-size:13px;border:1px solid #e2e8f0;background:#fff;transition:all .15s}
.tab:hover{border-color:#93c5fd}.tab.active{background:#3b82f6;color:#fff;border-color:#3b82f6}
.tsec{display:none}.tsec.active{display:block}
.alerts{margin:12px 0}
.alert{padding:7px 12px;border-radius:6px;margin:3px 0;font-size:12px}
.alert-w{background:#fef3c7;color:#92400e}.alert-e{background:#fee2e2;color:#991b1b}.alert-i{background:#dbeafe;color:#1e40af}
.types{display:flex;gap:6px;margin:10px 0;flex-wrap:wrap}
.tbadge{padding:2px 10px;border-radius:12px;font-size:11px;font-weight:600;color:#fff}
.stitle{font-size:15px;font-weight:600;margin:20px 0 8px}
.col-list{background:#fff;border-radius:8px;box-shadow:0 1px 3px rgba(0,0,0,.08);overflow:hidden}
.col-hdr{display:grid;grid-template-columns:minmax(140px,1.5fr) 56px minmax(100px,1fr) 90px 50px;
align-items:center;padding:8px 14px;cursor:pointer;border-bottom:1px solid #f1f5f9;gap:8px;transition:background .1s}
.col-hdr:hover{background:#f8fafc}
.col-hdr-label{cursor:default;font-weight:600;font-size:11px;color:#64748b;border-bottom-width:2px}
.cn{font-weight:600;font-size:13px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.pk{color:#f59e0b;font-size:10px;font-weight:700;margin-left:3px}
.ct{font-size:10px;padding:2px 6px;border-radius:4px;text-align:center;font-weight:600;color:#fff;white-space:nowrap}
.cbar-bg{height:5px;background:#e2e8f0;border-radius:3px;overflow:hidden;flex:1}
.cbar{height:100%;border-radius:3px}
.compl{display:flex;align-items:center;gap:6px}
.cpct{font-size:11px;color:#64748b;min-width:32px;text-align:right}
.cuniq{font-size:11px;color:#64748b;text-align:right;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.calerts span{padding:1px 5px;border-radius:8px;background:#fee2e2;color:#991b1b;font-size:10px}
.col-det{display:none;padding:14px 16px;border-bottom:1px solid #e2e8f0;background:#fafbfc}
.col-det.open{display:block}
.dgrid{display:grid;grid-template-columns:1fr 1fr;gap:16px}
@media(max-width:768px){.dgrid{grid-template-columns:1fr}.col-hdr{grid-template-columns:1fr 50px 1fr 70px 40px;font-size:12px}}
.stbl{font-size:12px;width:100%;border-collapse:collapse}
.stbl td{padding:2px 0}.stbl td:first-child{color:#64748b;padding-right:10px;white-space:nowrap}
.stbl td:last-child{font-weight:500;text-align:right}
.histogram{display:flex;align-items:flex-end;gap:1px;height:72px;margin:10px 0}
.h-bar{flex:1;background:#3b82f6;border-radius:2px 2px 0 0;min-width:3px;transition:background .15s;cursor:default;min-height:1px}
.h-bar:hover{background:#2563eb}
.h-labels{display:flex;justify-content:space-between;font-size:9px;color:#94a3b8;margin-top:2px}
.tvr{display:grid;grid-template-columns:110px 1fr 42px 52px;align-items:center;gap:6px;padding:2px 0;font-size:12px}
.tvl{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.tvb-bg{height:7px;background:#e2e8f0;border-radius:4px;overflow:hidden}
.tvb{height:100%;background:#3b82f6;border-radius:4px}
.tvp{text-align:right;color:#64748b;font-size:11px}
.tvc{text-align:right;color:#94a3b8;font-size:10px}
.bbar{display:flex;height:18px;border-radius:4px;overflow:hidden;font-size:10px}
.bt{background:#22c55e;color:#fff;display:flex;align-items:center;justify-content:center}
.bf{background:#e2e8f0;color:#64748b;display:flex;align-items:center;justify-content:center}
.svs{display:flex;gap:4px;flex-wrap:wrap;margin-top:6px}
.sv{background:#f1f5f9;padding:1px 7px;border-radius:4px;font-size:11px;color:#475569}
.swrap{margin-top:20px}
.stog{cursor:pointer;color:#3b82f6;font-size:13px;font-weight:500;user-select:none}
.sdata{display:none;margin-top:8px;overflow-x:auto}
.sdata.open{display:block}
table.dt{border-collapse:collapse;font-size:11px;width:100%}
table.dt th{background:#f1f5f9;padding:5px 8px;text-align:left;font-weight:600;border:1px solid #e2e8f0;white-space:nowrap}
table.dt td{padding:5px 8px;border:1px solid #e2e8f0;max-width:180px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.foot{text-align:center;color:#94a3b8;font-size:11px;margin-top:40px;padding-top:16px;border-top:1px solid #e2e8f0}
@media print{.tabs,.stog{display:none}.tsec,.col-det,.sdata{display:block!important}body{background:#fff}.card{box-shadow:none;border:1px solid #e2e8f0}}
"""
# JavaScript text for the HTML report's interactivity:
#   switchTab(n)     - sets 'active' on the .tab whose data-t equals n and on
#                      the .tsec whose id is 't-' + n; clears it on the others.
#   toggleCol(el)    - toggles 'open' on the element following el.
#   toggleSample(el) - toggles 'open' on the element following el.
_JS = """
function switchTab(n){
document.querySelectorAll('.tab').forEach(function(t){t.classList.toggle('active',t.dataset.t===n)});
document.querySelectorAll('.tsec').forEach(function(s){s.classList.toggle('active',s.id==='t-'+n)});
}
function toggleCol(el){el.nextElementSibling.classList.toggle('open')}
function toggleSample(el){el.nextElementSibling.classList.toggle('open')}
"""
def _esc(s: Any) -> str:
return html_mod.escape(str(s)) if s is not None else ""
def _slug(name: str) -> str:
return name.replace(" ", "-").replace(".", "-").replace("/", "-")
def _fnum(n: Any) -> str:
if n is None:
return "-"
if isinstance(n, float):
if n == int(n) and abs(n) < 1e15:
return f"{int(n):,}"
return f"{n:,.2f}"
if isinstance(n, int):
return f"{n:,}"
return str(n)
def _compl_color(pct: float) -> str:
if pct >= 95:
return "#22c55e"
if pct >= 70:
return "#eab308"
return "#ef4444"
def _render_hist(bins: list, counts: list) -> str:
    """Render a histogram as a row of bars plus first/last bin labels.

    Args:
        bins: Bin labels, parallel to *counts*.
        counts: Number of values per bin.

    Returns:
        HTML fragment, or an empty string when either list is empty.
    """
    # NOTE(review): the markup in this function's string literals was missing
    # from the source; the tags and class names below were rebuilt from the
    # .histogram / .h-bar / .h-labels rules in _CSS — confirm against the
    # intended layout (the title tooltip in particular is an assumption).
    if not bins or not counts:
        return ""
    # Guard against an all-zero histogram so the scaling never divides by zero.
    max_c = max(counts) or 1
    bars = []
    for b, c in zip(bins, counts):
        pct = c / max_c * 100
        bars.append(
            f'<div class="h-bar" style="height:{pct:.1f}%" title="{_esc(b)}: {_esc(c)}"></div>'
        )
    return (
        f'<div class="histogram">{"".join(bars)}</div>'
        f'<div class="h-labels"><span>{_esc(bins[0])}</span><span>{_esc(bins[-1])}</span></div>'
    )
def _render_top_vals(top_values: list) -> str:
    """Render the top-values list of a categorical column as labelled bars.

    Args:
        top_values: Dicts with ``"value"``, ``"count"`` and ``"pct"`` keys.

    Returns:
        HTML fragment with one row per value, or an empty string when
        *top_values* is empty. Labels are truncated to 30 characters.
    """
    # NOTE(review): the markup in this function's string literals was missing
    # from the source; the tags and class names below were rebuilt from the
    # .tvr / .tvl / .tvb-bg / .tvb / .tvp / .tvc rules in _CSS — confirm
    # against the intended layout.
    if not top_values:
        return ""
    # Bars are scaled relative to the largest share; "or 1" avoids dividing by zero.
    max_pct = max((tv.get("pct", 0) for tv in top_values), default=1) or 1
    rows = []
    for tv in top_values:
        pct = tv.get("pct", 0)
        bar_w = pct / max_pct * 100
        label = _esc(str(tv["value"])[:30])
        count = _fnum(tv.get("count", 0))
        rows.append(
            f'<div class="tvr">'
            f'<span class="tvl">{label}</span>'
            f'<div class="tvb-bg"><div class="tvb" style="width:{bar_w:.1f}%"></div></div>'
            f'<span class="tvp">{pct}%</span>'
            f'<span class="tvc">({count})</span>'
            f'</div>'
        )
    return "".join(rows)
def _render_col_detail(col: dict) -> str:
    """Render the expandable detail panel for one column profile.

    Emits a section for whichever of ``numeric_stats``, ``string_stats``,
    ``date_stats`` and ``boolean_stats`` is present in *col*, followed by the
    column's sample values (each truncated to 50 characters).

    Args:
        col: Column profile dict as produced by ``profile_table``.

    Returns:
        HTML fragment; empty string when the profile has nothing to show.
    """
    # NOTE(review): the markup in this function's string literals was missing
    # from the source; the tags and class names below were rebuilt from the
    # .dgrid / .stbl / .stitle / .bbar / .bt / .bf / .svs / .sv rules in _CSS
    # — confirm against the intended layout.
    parts: List[str] = []

    def stat_row(label: str, value: str) -> str:
        # One two-cell row of a .stbl statistics table.
        return f'<tr><td>{label}</td><td>{value}</td></tr>'

    ns = col.get("numeric_stats")
    if ns:
        parts.append('<div class="dgrid"><div><table class="stbl">')
        for label, key in [
            ("Min", "min"), ("Max", "max"), ("Mean", "mean"),
            ("Median", "median"), ("Std Dev", "stddev"),
            ("P5", "p5"), ("P25", "p25"), ("P75", "p75"), ("P95", "p95"),
            ("Zeros", "zeros"), ("Zeros %", "zeros_pct"),
            ("Negative", "negative"), ("Negative %", "negative_pct"),
        ]:
            parts.append(stat_row(label, _fnum(ns.get(key))))
        parts.append('</table></div><div>')
        h = ns.get("histogram", {})
        parts.append(_render_hist(h.get("bins", []), h.get("counts", [])))
        parts.append('</div></div>')

    ss = col.get("string_stats")
    if ss:
        parts.append('<table class="stbl">')
        parts.append(stat_row("Min length", _fnum(ss.get("min_length"))))
        parts.append(stat_row("Max length", _fnum(ss.get("max_length"))))
        parts.append(stat_row("Avg length", _fnum(ss.get("avg_length"))))
        parts.append('</table>')
        tv = ss.get("top_values", [])
        if tv:
            parts.append('<div class="stitle">Top Values</div>')
            parts.append(_render_top_vals(tv))

    ds = col.get("date_stats")
    if ds:
        parts.append('<div class="dgrid"><div><table class="stbl">')
        parts.append(stat_row("Earliest", _esc(ds.get("earliest", "-"))))
        parts.append(stat_row("Latest", _esc(ds.get("latest", "-"))))
        parts.append(stat_row("Span", f'{_fnum(ds.get("span_days"))} days'))
        parts.append('</table></div><div>')
        h = ds.get("histogram", {})
        parts.append(_render_hist(h.get("bins", []), h.get("counts", [])))
        parts.append('</div></div>')

    bs = col.get("boolean_stats")
    if bs:
        tc, fc = bs.get("true_count", 0), bs.get("false_count", 0)
        tp = bs.get("true_pct", 0)
        # Derive the False share from the counts: keying it off a truthy
        # true_pct would report 0% False for a column that is entirely False.
        fp = round(100 - tp, 1) if (tc + fc) > 0 else 0
        parts.append(
            f'<div class="bbar">'
            f'<div class="bt" style="width:{tp}%">True {tp}% ({tc:,})</div>'
            f'<div class="bf" style="width:{fp}%">False {fp}% ({fc:,})</div>'
            f'</div>'
        )

    sv = col.get("sample_values", [])
    if sv:
        parts.append('<div class="meta">Sample values:</div>')
        parts.append('<div class="svs">')
        for v in sv:
            parts.append(f'<span class="sv">{_esc(str(v)[:50])}</span>')
        parts.append('</div>')
    return "".join(parts)
def generate_html_report(profile_data: Dict[str, Any], output_path: Path) -> None:
"""Generate a standalone HTML report from profile data.
Args:
profile_data: Full profile dict with "tables" key.
output_path: Path to write the HTML file.
"""
# Missing top-level keys fall back to an empty mapping / empty timestamp.
tables = profile_data.get("tables", {})
generated_at = profile_data.get("generated_at", "")
# Nothing to render: log a warning and return without writing output_path.
if not tables:
logger.warning("No tables in profile data")
return
# Report-wide totals; a table that lacks a key contributes 0.
total_tables = len(tables)
total_rows = sum(t.get("row_count", 0) for t in tables.values())
total_cols = sum(t.get("column_count", 0) for t in tables.values())
# Unweighted mean of the per-table avg_completeness values, one decimal place.
compl_vals = [t.get("avg_completeness", 0) for t in tables.values()]
avg_compl = round(sum(compl_vals) / len(compl_vals), 1) if compl_vals else 0
total_alerts = sum(len(t.get("alerts", [])) for t in tables.values())
table_names = list(tables.keys())
# Output fragments; they are joined with newlines when the file is written.
h: List[str] = []
h.append('')
h.append('')
h.append('Data Profile Report')
h.append(f'')
# Header
h.append('
')
# Summary cards
h.append('
')
for val, label in [
(_fnum(total_tables), "Tables"),
(_fnum(total_rows), "Total Rows"),
(_fnum(total_cols), "Total Columns"),
(f"{avg_compl}%", "Avg Completeness"),
(_fnum(total_alerts), "Alerts"),
]:
h.append(f'
')
h.append('
')
# Table tabs
# Tabs are only emitted when there is more than one table.
if total_tables > 1:
h.append('
')
for i, name in enumerate(table_names):
act = " active" if i == 0 else ""
sl = _slug(name)
h.append(f'
{_esc(name)}
')
h.append('
')
# Table sections
for i, (name, tbl) in enumerate(tables.items()):
# Only the first table gets the " active" marker (with a single table,
# i == 0 already holds, so the second condition never adds a case).
act = " active" if i == 0 or total_tables == 1 else ""
sl = _slug(name)
h.append(f'
')
h.append(f'{_esc(name)}
')
# Stat cards
h.append('')
# Per-table figures; note that `tc` here is the table's avg_completeness.
rc = tbl.get("row_count", 0)
cc = tbl.get("column_count", 0)
tc = tbl.get("avg_completeness", 0)
sz = tbl.get("file_size_mb")
dupes = tbl.get("duplicate_rows", 0)
sampled = tbl.get("sampled", False)
for val, label in [
(_fnum(rc), "Rows"),
(_fnum(cc), "Columns"),
(f"{tc}%", "Completeness"),
(f"{sz} MB" if sz is not None else "-", "File Size"),
]:
h.append(f'
')
# Optional cards: date range (needs a truthy "earliest"), duplicate rows
# (only when non-zero), and the sampled marker.
dr = tbl.get("date_range")
if dr and dr.get("earliest"):
h.append(
f'
'
f'{_esc(dr["earliest"])} — {_esc(dr["latest"])}
'
f'
Date Range ({_fnum(dr.get("span_days"))} days)
'
)
if dupes:
h.append(f'
{_fnum(dupes)}
Duplicate Rows
')
if sampled:
h.append(f'
')
h.append('
')
# Variable types
# Categories are listed in sorted order; an unknown category falls back to
# the colour "#6b7280".
# NOTE(review): `cat` is interpolated below without _esc, unlike the other
# text values in this function — confirm category names are always trusted.
vt = tbl.get("variable_types", {})
if vt:
h.append('')
for cat, cnt in sorted(vt.items()):
color = _TYPE_COLORS.get(cat, "#6b7280")
h.append(f'{cat} {cnt}')
h.append('
')
# Alerts
alerts = tbl.get("alerts", [])
if alerts:
h.append('')
for a in alerts:
# Alert types missing from _ALERT_SEVERITY get the severity code "i".
sev = _ALERT_SEVERITY.get(a.get("type", ""), "i")
h.append(f'
{_esc(a.get("message", ""))}
')
h.append('
')
# Column list
columns = tbl.get("columns", [])
if columns:
h.append('Columns
')
h.append('')
# Header row
h.append('
')
h.append('
Name
Type
')
h.append('
Completeness
')
h.append('
Unique
')
h.append('
')
for col in columns:
# Per-column fields with defaults for absent keys; the type category
# defaults to "STRING".
cname = col.get("name", "")
cat = col.get("type_category", "STRING")
ctype = col.get("type", "")
cpct = col.get("completeness_pct", 0)
uniq = col.get("unique_count", 0)
upct = col.get("unique_pct", 0)
ca = col.get("alerts", [])
is_pk = col.get("is_primary_key", False)
color = _TYPE_COLORS.get(cat, "#6b7280")
cc_col = _compl_color(cpct)
# The PK marker and the alert count are rendered only when applicable.
pk_html = '
PK' if is_pk else ""
alert_html = f'
{len(ca)}' if ca else ""
h.append('
')
h.append(f'
{_esc(cname)}{pk_html}
')
h.append(f'
{_esc(cat[:4])}
')
h.append(f'
')
h.append(f'
{_fnum(uniq)} ({upct}%)
')
h.append(f'
{alert_html}
')
h.append('
')
h.append(f'
{_render_col_detail(col)}
')
h.append('
')
# Sample data
sample_rows = tbl.get("sample_rows", [])
if sample_rows:
h.append('')
h.append(f'
▶ Sample Data ({len(sample_rows)} rows)
')
h.append('
')
# Column headers come from the first sample row; a later row that lacks a
# key renders an empty cell. Cell text is cut to 60 characters, then escaped.
headers = list(sample_rows[0].keys())
h.append('' + ''.join(f'| {_esc(hd)} | ' for hd in headers) + '
')
for row in sample_rows:
h.append('' + ''.join(
f'| {_esc(str(row.get(hd, ""))[:60])} | '
for hd in headers
) + '
')
h.append('
')
h.append('')
# Footer + JS
h.append('')
h.append(f'')
h.append('
')
# Create missing parent directories, then write the report as UTF-8 text.
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(h), encoding="utf-8")
logger.info("Wrote HTML report: %s", output_path)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Run the profiler command-line interface.

    Two modes are selected by the arguments:

    * ``--from-json PATH`` loads an existing profile JSON and writes an HTML
      report beside it (same path with a ``.html`` suffix). No profiling is
      done and ``-o/--output`` is not consulted.
    * Otherwise each positional path is profiled with ``profile_table``. All
      successful profiles are written as one JSON document to ``--output``
      and, with ``--html``, an HTML report is written beside that file.

    The process exits with status 1 when the ``--from-json`` file is missing,
    unreadable, or not a JSON object, and when no input could be profiled.
    A failure on an individual input is logged and counted but does not
    abort the remaining inputs.
    """
    parser = argparse.ArgumentParser(
        description="Profile Parquet/CSV files and output JSON statistics + optional HTML report.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s data/orders.parquet
%(prog)s data/orders.parquet --primary-key order_id --html
%(prog)s data/orders.parquet data/customers.csv -o profiles.json --html
%(prog)s --from-json profile.json
""",
    )
    parser.add_argument(
        "files",
        nargs="*",
        help="Parquet file(s), directory of Parquet files, or CSV file(s) to profile",
    )
    parser.add_argument(
        "-o", "--output",
        default="profile.json",
        help="Output JSON file path (default: profile.json)",
    )
    parser.add_argument(
        "--primary-key",
        default=None,
        help="Comma-separated primary key column(s) for duplicate detection",
    )
    parser.add_argument(
        "--html",
        action="store_true",
        help="Also generate a standalone HTML report",
    )
    parser.add_argument(
        "--from-json",
        metavar="PATH",
        default=None,
        help="Generate HTML report from existing profile JSON (no profiling)",
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress info logging",
    )
    args = parser.parse_args()

    if args.quiet:
        logging.getLogger("profiler").setLevel(logging.WARNING)

    # Mode 1: Generate HTML from existing JSON
    if args.from_json:
        json_path = Path(args.from_json)
        if not json_path.exists():
            logger.error("File not found: %s", json_path)
            sys.exit(1)
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            with open(json_path, encoding="utf-8") as f:
                profile_data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            logger.error("Could not read profile JSON %s: %s", json_path, exc)
            sys.exit(1)
        if not isinstance(profile_data, dict):
            # The report generator looks keys up on the top-level object.
            logger.error("Profile JSON %s is not a JSON object", json_path)
            sys.exit(1)
        html_path = json_path.with_suffix(".html")
        generate_html_report(profile_data, html_path)
        logger.info("Done: HTML report at %s", html_path)
        return

    # Mode 2: Profile files
    if not args.files:
        parser.error("Provide files to profile, or use --from-json")

    profiles: Dict[str, Any] = {}
    success = 0
    errors = 0
    for file_path_str in args.files:
        file_path = Path(file_path_str)
        if not file_path.exists():
            logger.error("File not found: %s", file_path)
            errors += 1
            continue
        try:
            logger.info("Profiling %s ...", file_path)
            profile = profile_table(
                source_path=file_path,
                primary_key=args.primary_key,
            )
            table_name = profile["table_name"]
            if table_name in profiles:
                # Profiles are keyed by table name, so the later input wins;
                # say so instead of dropping the earlier one silently.
                logger.warning(
                    "Table name %r (from %s) replaces an earlier profile with the same name",
                    table_name,
                    file_path,
                )
            profiles[table_name] = profile
            success += 1
            logger.info(
                " %s: %d rows, %d cols, %d alerts",
                table_name,
                profile["row_count"],
                profile["column_count"],
                len(profile["alerts"]),
            )
        except Exception as exc:
            # Deliberate per-input boundary: one bad file must not stop the
            # rest of the batch from being profiled.
            logger.error("Failed to profile %s: %s", file_path, exc)
            errors += 1

    if not profiles:
        logger.error("No tables profiled successfully")
        sys.exit(1)

    output = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "version": "1.0",
        "tables": profiles,
    }
    output_path = Path(args.output)
    write_json_atomic(output_path, output)

    # Generate HTML if requested
    if args.html:
        html_path = output_path.with_suffix(".html")
        generate_html_report(output, html_path)

    logger.info("Done: %d profiled, %d errors. Output: %s", success, errors, output_path)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()