diff --git a/scripts/standalone_profiler.py b/scripts/standalone_profiler.py
new file mode 100644
index 0000000..69fb241
--- /dev/null
+++ b/scripts/standalone_profiler.py
@@ -0,0 +1,1271 @@
+#!/usr/bin/env python3
+"""
+Standalone Data Profiler — DuckDB-based table profiling for Parquet/CSV files.
+
+Zero external dependencies beyond DuckDB. Produces a comprehensive JSON profile
+with column statistics, histograms, alerts, and sample data.
+
+Usage:
+ # Profile a single Parquet file
+ python standalone_profiler.py data/orders.parquet
+
+ # Profile a directory of Parquet files (treated as one table)
+ python standalone_profiler.py data/partitioned_orders/
+
+ # Profile a CSV file
+ python standalone_profiler.py data/customers.csv
+
+ # Custom output path
+ python standalone_profiler.py data/orders.parquet -o profiles/orders_profile.json
+
+ # Specify primary key for duplicate detection
+ python standalone_profiler.py data/orders.parquet --primary-key order_id
+
+ # Composite primary key
+ python standalone_profiler.py data/orders.parquet --primary-key "order_id,line_id"
+
+ # Profile multiple files at once
+ python standalone_profiler.py data/orders.parquet data/customers.parquet data/products.csv
+
+ # Generate HTML report alongside JSON
+ python standalone_profiler.py data/orders.parquet --html
+
+ # Generate HTML from existing profile JSON
+ python standalone_profiler.py --from-json profile.json
+
+Output:
+ JSON file with table-level and column-level statistics, alerts, histograms,
+ top values for categorical columns, and sample rows.
+ With --html: self-contained HTML file viewable in any browser.
+
+Requirements:
+ pip install duckdb
+"""
+
+import argparse
+import html as html_mod
+import json
+import logging
+import math
+import os
+import sys
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import duckdb
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("profiler")

# ---------------------------------------------------------------------------
# Profiler configuration
# ---------------------------------------------------------------------------
# Tables with more than SAMPLE_THRESHOLD rows are profiled on a sample of
# SAMPLE_SIZE rows rather than on the full data.
SAMPLE_THRESHOLD = 500_000
SAMPLE_SIZE = 500_000
# A string column is treated as categorical when its distinct count is <= this.
MAX_CATEGORICAL_DISTINCT = 50
# How many of the most frequent values to report for categorical columns.
TOP_VALUES_LIMIT = 10
# Number of equal-width bins in numeric histograms.
HISTOGRAM_BINS = 15
# Number of sample rows included in the table profile.
SAMPLE_ROWS_LIMIT = 5
# Number of distinct sample values reported per column.
SAMPLE_VALUES_LIMIT = 5

# Alert thresholds (percentages, except ALERT_HIGH_CARDINALITY which is a count)
ALERT_HIGH_MISSING_PCT = 30.0
ALERT_MISSING_PCT = 5.0
ALERT_IMBALANCE_PCT = 60.0
ALERT_ZEROS_PCT = 50.0
ALERT_HIGH_CARDINALITY = 50
+
+
+# ---------------------------------------------------------------------------
+# DuckDB type classification
+# ---------------------------------------------------------------------------
def classify_type(duckdb_type: str) -> str:
    """Map a DuckDB type string to a simplified category.

    Args:
        duckdb_type: Type name as reported by DuckDB, e.g. ``"DECIMAL(18,2)"``.
            Matching is case-insensitive.

    Returns:
        One of ``"BOOLEAN"``, ``"DATE"``, ``"TIMESTAMP"``, ``"NUMERIC"`` or
        ``"STRING"``. Anything not recognised, including nested types, is
        ``"STRING"``.
    """
    t = duckdb_type.strip().upper()
    base_type = t.split("(")[0].strip()

    # Nested/enumerated types can mention scalar type names inside their
    # definition (e.g. STRUCT(ts TIMESTAMP) or TIMESTAMP[]); they must not be
    # mistaken for the scalar type itself.
    if t.endswith("]") or base_type in ("STRUCT", "MAP", "UNION", "LIST", "ARRAY", "ENUM"):
        return "STRING"

    if t in ("BOOLEAN", "BOOL"):
        return "BOOLEAN"
    if t == "DATE":
        return "DATE"
    # Covers TIMESTAMP, TIMESTAMP WITH TIME ZONE, TIMESTAMP_S/_MS/_NS, TIMESTAMPTZ.
    if "TIMESTAMP" in t:
        return "TIMESTAMP"
    if base_type in (
        "FLOAT", "DOUBLE", "DECIMAL", "REAL", "FLOAT4", "FLOAT8",
        "NUMERIC", "HUGEINT", "UHUGEINT", "INTEGER", "INT", "BIGINT",
        "SMALLINT", "TINYINT", "INT8", "INT4", "INT2", "INT1",
        "LONG", "SHORT", "SIGNED", "UBIGINT", "UINTEGER", "USMALLINT",
        "UTINYINT",
    ):
        return "NUMERIC"
    return "STRING"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _round(value: Any, digits: int = 2) -> Any:
+ """Round a value if it is a float, otherwise return as-is."""
+ if value is None:
+ return None
+ if isinstance(value, float):
+ if math.isnan(value) or math.isinf(value):
+ return None
+ return round(value, digits)
+ return value
+
+
+def _format_number(n: float) -> str:
+ """Format large numbers with human-readable suffixes for histogram bin labels."""
+ if n is None:
+ return "?"
+ abs_n = abs(n)
+ if abs_n >= 1_000_000_000:
+ return f"{n / 1_000_000_000:.1f}B"
+ if abs_n >= 1_000_000:
+ return f"{n / 1_000_000:.1f}M"
+ if abs_n >= 1_000:
+ return f"{n / 1_000:.1f}K"
+ if isinstance(n, float) and n != int(n):
+ return f"{n:.2f}"
+ return str(int(n))
+
+
def write_json_atomic(path: Path, data: Any) -> None:
    """Write *data* as indented JSON to *path* via a temp file + ``os.replace``.

    The temp file is created in the destination directory so the final rename
    stays on one filesystem. Objects that ``json`` cannot serialise natively
    are converted with ``str``.

    Args:
        path: Destination file; missing parent directories are created.
        data: JSON-serialisable object.

    Raises:
        Any exception raised while serialising or writing is re-raised after
        the temp file has been removed.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(dir=str(path.parent), suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, default=str)
        # mkstemp creates the file as 0o600; make the result world-readable.
        os.chmod(tmp_path, 0o644)
        os.replace(tmp_path, str(path))
    except BaseException:
        # BaseException so an interrupt (Ctrl-C) does not leave a stray .tmp file.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    logger.info("Wrote %s", path)
+
+
+# ---------------------------------------------------------------------------
+# Batch statistics functions
+# ---------------------------------------------------------------------------
+def _batch_base_stats(
+ con: duckdb.DuckDBPyConnection,
+ view_name: str,
+ columns: List[str],
+) -> Dict[str, Tuple[int, int]]:
+ """Get non_null and unique counts for all columns in a single query.
+
+ Returns: {col_name: (non_null_count, unique_count)}
+ """
+ if not columns:
+ return {}
+
+ parts = []
+ for col_name in columns:
+ safe = f'"{col_name}"'
+ parts.append(f"COUNT({safe})")
+ parts.append(f"COUNT(DISTINCT {safe})")
+
+ sql = f"SELECT {', '.join(parts)} FROM {view_name}"
+ row = con.execute(sql).fetchone()
+
+ result: Dict[str, Tuple[int, int]] = {}
+ idx = 0
+ for col_name in columns:
+ result[col_name] = (row[idx], row[idx + 1])
+ idx += 2
+ return result
+
+
+def _batch_numeric_stats(
+ con: duckdb.DuckDBPyConnection,
+ view_name: str,
+ numeric_cols: List[str],
+) -> Dict[str, Dict[str, Any]]:
+ """Get aggregate statistics for all numeric columns in a single query."""
+ if not numeric_cols:
+ return {}
+
+ parts = []
+ for col_name in numeric_cols:
+ safe = f'"{col_name}"'
+ parts.extend([
+ f"MIN({safe})",
+ f"MAX({safe})",
+ f"AVG({safe})",
+ f"MEDIAN({safe})",
+ f"STDDEV({safe})",
+ f"PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY {safe})",
+ f"PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {safe})",
+ f"PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {safe})",
+ f"PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY {safe})",
+ f"SUM(CASE WHEN {safe} = 0 THEN 1 ELSE 0 END)",
+ f"SUM(CASE WHEN {safe} < 0 THEN 1 ELSE 0 END)",
+ ])
+
+ sql = f"SELECT {', '.join(parts)} FROM {view_name}"
+ row = con.execute(sql).fetchone()
+
+ result: Dict[str, Dict[str, Any]] = {}
+ idx = 0
+ for col_name in numeric_cols:
+ result[col_name] = {
+ "min": row[idx], "max": row[idx + 1], "mean": row[idx + 2],
+ "median": row[idx + 3], "stddev": row[idx + 4],
+ "p5": row[idx + 5], "p25": row[idx + 6],
+ "p75": row[idx + 7], "p95": row[idx + 8],
+ "zeros": row[idx + 9], "negative": row[idx + 10],
+ }
+ idx += 11
+ return result
+
+
+def _batch_string_stats(
+ con: duckdb.DuckDBPyConnection,
+ view_name: str,
+ string_cols: List[str],
+) -> Dict[str, Dict[str, Any]]:
+ """Get string length statistics for all string columns in a single query."""
+ if not string_cols:
+ return {}
+
+ parts = []
+ for col_name in string_cols:
+ safe = f'"{col_name}"'
+ parts.extend([
+ f"MIN(LENGTH({safe}))",
+ f"MAX(LENGTH({safe}))",
+ f"AVG(LENGTH({safe}))",
+ ])
+
+ sql = f"SELECT {', '.join(parts)} FROM {view_name}"
+ row = con.execute(sql).fetchone()
+
+ result: Dict[str, Dict[str, Any]] = {}
+ idx = 0
+ for col_name in string_cols:
+ result[col_name] = {
+ "min_length": row[idx] if row[idx] is not None else 0,
+ "max_length": row[idx + 1] if row[idx + 1] is not None else 0,
+ "avg_length": _round(row[idx + 2]) if row[idx + 2] is not None else 0.0,
+ }
+ idx += 3
+ return result
+
+
+def _batch_date_stats(
+ con: duckdb.DuckDBPyConnection,
+ view_name: str,
+ date_cols: List[str],
+ category_map: Dict[str, str],
+) -> Dict[str, Dict[str, Any]]:
+ """Get date range statistics for all date/timestamp columns in a single query."""
+ if not date_cols:
+ return {}
+
+ parts = []
+ for col_name in date_cols:
+ safe = f'"{col_name}"'
+ cast_expr = f"CAST({safe} AS DATE)" if category_map[col_name] == "TIMESTAMP" else safe
+ parts.extend([
+ f"MIN({cast_expr})",
+ f"MAX({cast_expr})",
+ ])
+
+ sql = f"SELECT {', '.join(parts)} FROM {view_name}"
+ row = con.execute(sql).fetchone()
+
+ result: Dict[str, Dict[str, Any]] = {}
+ idx = 0
+ for col_name in date_cols:
+ earliest = row[idx]
+ latest = row[idx + 1]
+ span_days = None
+ if earliest is not None and latest is not None:
+ try:
+ delta = latest - earliest
+ span_days = delta.days if hasattr(delta, "days") else int(delta)
+ except (TypeError, ValueError):
+ span_days = None
+ result[col_name] = {
+ "earliest": str(earliest) if earliest is not None else None,
+ "latest": str(latest) if latest is not None else None,
+ "span_days": span_days,
+ }
+ idx += 2
+ return result
+
+
+def _batch_boolean_stats(
+ con: duckdb.DuckDBPyConnection,
+ view_name: str,
+ bool_cols: List[str],
+) -> Dict[str, Dict[str, Any]]:
+ """Get boolean true/false counts for all boolean columns in a single query."""
+ if not bool_cols:
+ return {}
+
+ parts = []
+ for col_name in bool_cols:
+ safe = f'"{col_name}"'
+ parts.extend([
+ f"SUM(CASE WHEN {safe} = TRUE THEN 1 ELSE 0 END)",
+ f"SUM(CASE WHEN {safe} = FALSE THEN 1 ELSE 0 END)",
+ ])
+
+ sql = f"SELECT {', '.join(parts)} FROM {view_name}"
+ row = con.execute(sql).fetchone()
+
+ result: Dict[str, Dict[str, Any]] = {}
+ idx = 0
+ for col_name in bool_cols:
+ true_count = int(row[idx]) if row[idx] is not None else 0
+ false_count = int(row[idx + 1]) if row[idx + 1] is not None else 0
+ total = true_count + false_count
+ result[col_name] = {
+ "true_count": true_count,
+ "false_count": false_count,
+ "true_pct": _round(100.0 * true_count / total) if total > 0 else 0.0,
+ }
+ idx += 2
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Core: profile a single file/table
+# ---------------------------------------------------------------------------
def profile_table(
    source_path: Path,
    table_name: Optional[str] = None,
    primary_key: Optional[str] = None,
) -> Dict[str, Any]:
    """Profile a single Parquet file, Parquet directory, or CSV file.

    The source is materialised into a DuckDB temp table (sampled down to
    SAMPLE_SIZE rows when it has more than SAMPLE_THRESHOLD rows). When
    sampling is active, ``row_count`` is the full row count while column
    statistics, ``missing_cells`` and ``duplicate_rows`` are computed on the
    sample.

    Args:
        source_path: Path to .parquet file, directory of .parquet files, or .csv file.
        table_name: Display name for the table (defaults to filename stem).
        primary_key: Comma-separated primary key column(s) for duplicate detection.

    Returns:
        Dict with complete profile (table-level + column-level statistics).

    Raises:
        Errors from reading the source or from the base statistics query
        propagate to the caller; the connection is closed in either case.
    """
    source_path = Path(source_path)
    if table_name is None:
        table_name = source_path.stem

    pk_columns: List[str] = []
    if primary_key:
        # Skip empty fragments ("a,,b", trailing comma) instead of keeping "".
        pk_columns = [c.strip() for c in primary_key.split(",") if c.strip()]

    con = duckdb.connect()
    try:
        return _profile_with_connection(con, source_path, table_name, primary_key, pk_columns)
    finally:
        # Release the connection even when profiling fails part-way.
        con.close()


def _quote_ident(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier with embedded quotes doubled."""
    return '"' + name.replace('"', '""') + '"'


def _quote_literal(text: str) -> str:
    """Return *text* as a single-quoted SQL string literal with embedded quotes doubled."""
    return "'" + text.replace("'", "''") + "'"


def _run_batch(label: str, batch_fn: Any, *args: Any) -> Dict[str, Dict[str, Any]]:
    """Run one batch statistics function, returning {} and logging a warning on failure."""
    try:
        return batch_fn(*args)
    except Exception as exc:
        logger.warning("Batch %s stats failed: %s", label, exc)
        return {}


def _numeric_histogram(
    con: "duckdb.DuckDBPyConnection",
    view_name: str,
    col_name: str,
    raw_min: Any,
    raw_max: Any,
) -> Dict[str, Any]:
    """Build an equal-width histogram for one numeric column (empty if not possible)."""
    empty: Dict[str, Any] = {"bins": [], "counts": []}
    if raw_min is None or raw_max is None:
        return empty
    try:
        # Use the unrounded bounds: bucketing against rounded bounds pushes
        # values just below the rounded minimum out of the first bin.
        lo, hi = float(raw_min), float(raw_max)
        if not (math.isfinite(lo) and math.isfinite(hi)) or lo >= hi:
            return empty
        bin_width = (hi - lo) / HISTOGRAM_BINS
        if not math.isfinite(bin_width) or bin_width <= 0:
            return empty

        safe_col = _quote_ident(col_name)
        last_bin = HISTOGRAM_BINS - 1
        # FLOOR-based bucketing; the clamp keeps the maximum in the last bin and
        # guards against float error below the first bin.
        bucket_rows = con.execute(
            f"""
            SELECT
                GREATEST(LEAST(FLOOR((CAST({safe_col} AS DOUBLE) - {lo!r}) / {bin_width!r}), {last_bin}), 0) + 1 AS bucket,
                COUNT(*) AS cnt
            FROM {view_name}
            WHERE {safe_col} IS NOT NULL
            GROUP BY bucket
            ORDER BY bucket
            """
        ).fetchall()

        counts_by_bucket = {int(r[0]): int(r[1]) for r in bucket_rows if r[0] is not None}
        bin_labels: List[str] = []
        bin_counts: List[int] = []
        for i in range(1, HISTOGRAM_BINS + 1):
            bin_lo = lo + (i - 1) * bin_width
            bin_hi = lo + i * bin_width
            bin_labels.append(f"{_format_number(bin_lo)}-{_format_number(bin_hi)}")
            bin_counts.append(counts_by_bucket.get(i, 0))
        return {"bins": bin_labels, "counts": bin_counts}
    except Exception as exc:
        logger.debug("Histogram failed for column %s: %s", col_name, exc)
        return empty


def _build_table_alerts(columns: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Expand per-column alert codes into table-level alert records with messages."""
    alert_messages = {
        "constant": "{col} is constant (single value)",
        "unique": "{col} has all unique values",
        "high_missing": "{col} has {pct}% missing values",
        "missing": "{col} has {pct}% missing values",
        "imbalance": "{col} is highly imbalanced (top value {pct}%)",
        "zeros": "{col} has {pct}% zero values",
        "high_cardinality": "{col} has high cardinality ({n} distinct)",
    }
    table_alerts: List[Dict[str, str]] = []
    for col in columns:
        name = col.get("name", "")
        missing_pct_val = _round(100.0 - col.get("completeness_pct", 100.0))
        for a in col.get("alerts", []):
            if a in ("high_missing", "missing"):
                msg = alert_messages[a].format(col=name, pct=missing_pct_val)
            elif a == "imbalance":
                top_pct = 0.0
                tv = col.get("string_stats", {}).get("top_values", [])
                if tv:
                    top_pct = tv[0].get("pct", 0.0)
                msg = alert_messages[a].format(col=name, pct=top_pct)
            elif a == "zeros":
                ns = col.get("numeric_stats", {})
                msg = alert_messages[a].format(col=name, pct=ns.get("zeros_pct", 0.0))
            elif a == "high_cardinality":
                msg = alert_messages[a].format(col=name, n=col.get("unique_count", 0))
            else:
                template = alert_messages.get(a)
                # Unknown codes get a plain message; never run str.format on
                # text that embeds the raw column name.
                msg = template.format(col=name) if template else f"{name}: {a}"
            table_alerts.append({"column": name, "type": a, "message": msg})
    return table_alerts


def _profile_with_connection(
    con: "duckdb.DuckDBPyConnection",
    source_path: Path,
    table_name: str,
    primary_key: Optional[str],
    pk_columns: List[str],
) -> Dict[str, Any]:
    """Build the profile dict for *source_path* using the open connection *con*."""
    # Determine read expression based on file type; the path is escaped so a
    # quote character in a file name cannot break the statement.
    if source_path.is_dir():
        read_expr = "read_parquet(" + _quote_literal(f"{source_path}/*.parquet") + ")"
    elif source_path.suffix.lower() == ".csv":
        read_expr = "read_csv_auto(" + _quote_literal(str(source_path)) + ")"
    else:
        read_expr = "read_parquet(" + _quote_literal(str(source_path)) + ")"

    # Get row count to decide on sampling
    total_rows = con.execute(f"SELECT COUNT(*) FROM {read_expr}").fetchone()[0]

    # Materialize into temp table (reads source files once instead of per-query)
    view_name = "tbl"
    sampled = total_rows > SAMPLE_THRESHOLD
    if sampled:
        con.execute(
            f"CREATE TEMP TABLE {view_name} AS "
            f"SELECT * FROM {read_expr} USING SAMPLE {SAMPLE_SIZE} ROWS"
        )
        working_rows = con.execute(f"SELECT COUNT(*) FROM {view_name}").fetchone()[0]
    else:
        con.execute(f"CREATE TEMP TABLE {view_name} AS SELECT * FROM {read_expr}")
        working_rows = total_rows

    # Column metadata: DESCRIBE rows start with (name, type, ...)
    col_info = con.execute(f"DESCRIBE {view_name}").fetchall()

    # Classify columns by type
    all_col_names: List[str] = []
    type_map: Dict[str, str] = {}
    category_map: Dict[str, str] = {}
    numeric_cols: List[str] = []
    string_cols: List[str] = []
    date_cols: List[str] = []
    bool_cols: List[str] = []

    for col_row in col_info:
        col_name, col_type = col_row[0], col_row[1]
        all_col_names.append(col_name)
        type_map[col_name] = col_type
        category = classify_type(col_type)
        category_map[col_name] = category
        if category == "NUMERIC":
            numeric_cols.append(col_name)
        elif category == "STRING":
            string_cols.append(col_name)
        elif category in ("DATE", "TIMESTAMP"):
            date_cols.append(col_name)
        elif category == "BOOLEAN":
            bool_cols.append(col_name)

    # ---- Batch queries (one scan per type category) ----
    base_stats = _batch_base_stats(con, view_name, all_col_names)
    numeric_batch = _run_batch("numeric", _batch_numeric_stats, con, view_name, numeric_cols)
    string_batch = _run_batch("string", _batch_string_stats, con, view_name, string_cols)
    date_batch = _run_batch("date", _batch_date_stats, con, view_name, date_cols, category_map)
    boolean_batch = _run_batch("boolean", _batch_boolean_stats, con, view_name, bool_cols)

    # ---- Build column profiles ----
    columns: List[Dict[str, Any]] = []
    variable_types: Dict[str, int] = {}
    total_null_count = 0
    total_cells = working_rows * len(col_info) if col_info else 0
    first_date_col: Optional[Dict[str, Any]] = None

    for col_name in all_col_names:
        col_type = type_map[col_name]
        category = category_map[col_name]
        safe_col = _quote_ident(col_name)
        variable_types[category] = variable_types.get(category, 0) + 1

        non_null, unique_count = base_stats.get(col_name, (0, 0))
        null_count = working_rows - non_null

        completeness_pct = _round(100.0 * non_null / working_rows) if working_rows > 0 else 0.0
        unique_pct = _round(100.0 * unique_count / non_null) if non_null > 0 else 0.0
        missing_pct = _round(100.0 * null_count / working_rows) if working_rows > 0 else 0.0
        is_pk = col_name in pk_columns

        # Sample values (best effort)
        sample_values: List[str] = []
        try:
            rows = con.execute(
                f"""
                SELECT DISTINCT CAST({safe_col} AS VARCHAR) AS v
                FROM {view_name}
                WHERE {safe_col} IS NOT NULL
                LIMIT {SAMPLE_VALUES_LIMIT}
                """
            ).fetchall()
            sample_values = [r[0] for r in rows if r[0] is not None]
        except Exception as exc:
            logger.debug("Sample values failed for %s: %s", col_name, exc)

        # Alerts
        alerts: List[str] = []
        if unique_count == 1 and null_count == 0:
            alerts.append("constant")
        if unique_pct == 100.0 and null_count == 0 and non_null > 0:
            alerts.append("unique")
        if missing_pct > ALERT_HIGH_MISSING_PCT:
            alerts.append("high_missing")
        elif missing_pct > ALERT_MISSING_PCT:
            alerts.append("missing")

        col_profile: Dict[str, Any] = {
            "name": col_name,
            "type": col_type,
            "type_category": category,
            "completeness_pct": completeness_pct,
            "null_count": null_count,
            "unique_count": unique_count,
            "unique_pct": unique_pct,
            "sample_values": sample_values,
            "is_primary_key": is_pk,
            "alerts": alerts,
        }

        # Type-specific stats
        try:
            if category == "NUMERIC" and col_name in numeric_batch:
                raw = numeric_batch[col_name]
                zeros = int(raw["zeros"]) if raw["zeros"] is not None else 0
                negative = int(raw["negative"]) if raw["negative"] is not None else 0
                zeros_pct = _round(100.0 * zeros / non_null) if non_null > 0 else 0.0
                negative_pct = _round(100.0 * negative / non_null) if non_null > 0 else 0.0

                if zeros_pct > ALERT_ZEROS_PCT and "zeros" not in alerts:
                    alerts.append("zeros")

                col_profile["numeric_stats"] = {
                    "min": _round(raw["min"]),
                    "max": _round(raw["max"]),
                    "mean": _round(raw["mean"]),
                    "median": _round(raw["median"]),
                    "stddev": _round(raw["stddev"]),
                    "p5": _round(raw["p5"]),
                    "p25": _round(raw["p25"]),
                    "p75": _round(raw["p75"]),
                    "p95": _round(raw["p95"]),
                    "zeros": zeros,
                    "zeros_pct": zeros_pct,
                    "negative": negative,
                    "negative_pct": negative_pct,
                    "histogram": _numeric_histogram(
                        con, view_name, col_name, raw["min"], raw["max"]
                    ),
                }

            elif category == "STRING" and col_name in string_batch:
                sl = string_batch[col_name]
                is_categorical = unique_count <= MAX_CATEGORICAL_DISTINCT

                top_values: List[Dict[str, Any]] = []
                if is_categorical and non_null > 0:
                    rows = con.execute(
                        f"""
                        SELECT {safe_col} AS val, COUNT(*) AS cnt
                        FROM {view_name}
                        WHERE {safe_col} IS NOT NULL
                        GROUP BY {safe_col}
                        ORDER BY cnt DESC
                        LIMIT {TOP_VALUES_LIMIT}
                        """
                    ).fetchall()
                    for row in rows:
                        pct = _round(100.0 * row[1] / non_null) if non_null > 0 else 0.0
                        top_values.append({"value": str(row[0]), "count": row[1], "pct": pct})

                    if top_values and top_values[0]["pct"] > ALERT_IMBALANCE_PCT:
                        if "imbalance" not in alerts:
                            alerts.append("imbalance")
                else:
                    if unique_count > ALERT_HIGH_CARDINALITY and "high_cardinality" not in alerts:
                        alerts.append("high_cardinality")

                col_profile["string_stats"] = {
                    "min_length": sl["min_length"],
                    "max_length": sl["max_length"],
                    "avg_length": sl["avg_length"],
                    "top_values": top_values,
                }

            elif category in ("DATE", "TIMESTAMP") and col_name in date_batch:
                dr = date_batch[col_name]
                cast_expr = f"CAST({safe_col} AS DATE)" if category == "TIMESTAMP" else safe_col

                # Date histogram (YEAR/QUARTER grouping)
                histogram: Dict[str, Any] = {"bins": [], "counts": []}
                try:
                    rows = con.execute(
                        f"""
                        SELECT
                            YEAR({cast_expr}) AS yr,
                            QUARTER({cast_expr}) AS qtr,
                            COUNT(*) AS cnt
                        FROM {view_name}
                        WHERE {safe_col} IS NOT NULL
                        GROUP BY yr, qtr
                        ORDER BY yr, qtr
                        """
                    ).fetchall()
                    histogram["bins"] = [f"{int(r[0])}-Q{int(r[1])}" for r in rows]
                    histogram["counts"] = [int(r[2]) for r in rows]
                except Exception as exc:
                    logger.debug("Date histogram failed for %s: %s", col_name, exc)

                col_profile["date_stats"] = {
                    "earliest": dr["earliest"],
                    "latest": dr["latest"],
                    "span_days": dr["span_days"],
                    "histogram": histogram,
                }

                # Remember the first date column that actually has data; it
                # supplies the table-level date range.
                if first_date_col is None and dr["earliest"]:
                    first_date_col = col_profile["date_stats"]

            elif category == "BOOLEAN" and col_name in boolean_batch:
                col_profile["boolean_stats"] = boolean_batch[col_name]

        except Exception as exc:
            logger.warning("Type-specific stats failed for %s: %s", col_name, exc)

        columns.append(col_profile)
        total_null_count += null_count

    # Table-level completeness
    avg_completeness = 0.0
    if columns:
        avg_completeness = _round(
            sum(c["completeness_pct"] for c in columns) / len(columns)
        )
    missing_cells_pct = _round(100.0 * total_null_count / total_cells) if total_cells > 0 else 0.0

    # Duplicate rows (by primary key)
    duplicate_rows = 0
    if pk_columns and working_rows > 0:
        try:
            pk_expr = ", ".join(_quote_ident(c) for c in pk_columns)
            distinct_pk = con.execute(
                f"SELECT COUNT(DISTINCT ({pk_expr})) FROM {view_name}"
            ).fetchone()[0]
            duplicate_rows = working_rows - distinct_pk
        except Exception as exc:
            # The key was requested explicitly, so a failure (e.g. a
            # misspelled column) should be visible at the default log level.
            logger.warning("Duplicate check failed: %s", exc)

    # Sample rows
    sample_rows: List[Dict[str, Any]] = []
    try:
        sample_result = con.execute(f"SELECT * FROM {view_name} LIMIT {SAMPLE_ROWS_LIMIT}")
        sample_col_names = [desc[0] for desc in sample_result.description]
        for row in sample_result.fetchall():
            sample_rows.append(
                {sample_col_names[i]: str(v) if v is not None else None for i, v in enumerate(row)}
            )
    except Exception as exc:
        logger.debug("Sample rows failed: %s", exc)

    # Aggregate column alerts to table level
    table_alerts = _build_table_alerts(columns)

    # File size
    file_size_mb = None
    try:
        if source_path.is_dir():
            total_bytes = sum(f.stat().st_size for f in source_path.glob("*.parquet"))
        elif source_path.exists():
            total_bytes = source_path.stat().st_size
        else:
            total_bytes = 0
        file_size_mb = _round(total_bytes / (1024 * 1024))
    except OSError:
        pass

    # Date range from first date column
    date_range = None
    if first_date_col:
        date_range = {
            "earliest": first_date_col.get("earliest"),
            "latest": first_date_col.get("latest"),
            "span_days": first_date_col.get("span_days"),
        }

    return {
        "table_name": table_name,
        "source_path": str(source_path),
        "row_count": total_rows,
        "column_count": len(col_info),
        "file_size_mb": file_size_mb,
        "primary_key": primary_key,
        "avg_completeness": avg_completeness,
        "missing_cells": total_null_count,
        "missing_cells_pct": missing_cells_pct,
        "duplicate_rows": duplicate_rows,
        "variable_types": variable_types,
        "date_range": date_range,
        "alerts": table_alerts,
        "sampled": sampled,
        "columns": columns,
        "sample_rows": sample_rows,
    }
+
+
+# ---------------------------------------------------------------------------
+# HTML report generation
+# ---------------------------------------------------------------------------
+
+_TYPE_COLORS = {
+ "NUMERIC": "#8b5cf6",
+ "STRING": "#3b82f6",
+ "DATE": "#f59e0b",
+ "TIMESTAMP": "#f59e0b",
+ "BOOLEAN": "#10b981",
+}
+
+_ALERT_SEVERITY = {
+ "high_missing": "e",
+ "missing": "w",
+ "constant": "i",
+ "unique": "i",
+ "imbalance": "w",
+ "zeros": "w",
+ "high_cardinality": "i",
+}
+
+_CSS = """
+*{margin:0;padding:0;box-sizing:border-box}
+body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',system-ui,sans-serif;
+ background:#f8fafc;color:#0f172a;line-height:1.5;font-size:14px}
+.wrap{max-width:1200px;margin:0 auto;padding:20px 24px 60px}
+header{padding:20px 0 16px;border-bottom:1px solid #e2e8f0;margin-bottom:24px}
+h1{font-size:22px;font-weight:700}
+.meta{color:#64748b;font-size:12px;margin-top:2px}
+.cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px;margin:16px 0}
+.card{background:#fff;border-radius:8px;box-shadow:0 1px 3px rgba(0,0,0,.08);padding:14px 16px;text-align:center}
+.card-v{font-size:26px;font-weight:700}.card-l{font-size:10px;color:#64748b;text-transform:uppercase;letter-spacing:.05em;margin-top:2px}
+.tabs{display:flex;gap:4px;margin-bottom:20px;flex-wrap:wrap}
+.tab{padding:7px 14px;border-radius:6px;cursor:pointer;font-size:13px;border:1px solid #e2e8f0;background:#fff;transition:all .15s}
+.tab:hover{border-color:#93c5fd}.tab.active{background:#3b82f6;color:#fff;border-color:#3b82f6}
+.tsec{display:none}.tsec.active{display:block}
+.alerts{margin:12px 0}
+.alert{padding:7px 12px;border-radius:6px;margin:3px 0;font-size:12px}
+.alert-w{background:#fef3c7;color:#92400e}.alert-e{background:#fee2e2;color:#991b1b}.alert-i{background:#dbeafe;color:#1e40af}
+.types{display:flex;gap:6px;margin:10px 0;flex-wrap:wrap}
+.tbadge{padding:2px 10px;border-radius:12px;font-size:11px;font-weight:600;color:#fff}
+.stitle{font-size:15px;font-weight:600;margin:20px 0 8px}
+.col-list{background:#fff;border-radius:8px;box-shadow:0 1px 3px rgba(0,0,0,.08);overflow:hidden}
+.col-hdr{display:grid;grid-template-columns:minmax(140px,1.5fr) 56px minmax(100px,1fr) 90px 50px;
+ align-items:center;padding:8px 14px;cursor:pointer;border-bottom:1px solid #f1f5f9;gap:8px;transition:background .1s}
+.col-hdr:hover{background:#f8fafc}
+.col-hdr-label{cursor:default;font-weight:600;font-size:11px;color:#64748b;border-bottom-width:2px}
+.cn{font-weight:600;font-size:13px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
+.pk{color:#f59e0b;font-size:10px;font-weight:700;margin-left:3px}
+.ct{font-size:10px;padding:2px 6px;border-radius:4px;text-align:center;font-weight:600;color:#fff;white-space:nowrap}
+.cbar-bg{height:5px;background:#e2e8f0;border-radius:3px;overflow:hidden;flex:1}
+.cbar{height:100%;border-radius:3px}
+.compl{display:flex;align-items:center;gap:6px}
+.cpct{font-size:11px;color:#64748b;min-width:32px;text-align:right}
+.cuniq{font-size:11px;color:#64748b;text-align:right;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
+.calerts span{padding:1px 5px;border-radius:8px;background:#fee2e2;color:#991b1b;font-size:10px}
+.col-det{display:none;padding:14px 16px;border-bottom:1px solid #e2e8f0;background:#fafbfc}
+.col-det.open{display:block}
+.dgrid{display:grid;grid-template-columns:1fr 1fr;gap:16px}
+@media(max-width:768px){.dgrid{grid-template-columns:1fr}.col-hdr{grid-template-columns:1fr 50px 1fr 70px 40px;font-size:12px}}
+.stbl{font-size:12px;width:100%;border-collapse:collapse}
+.stbl td{padding:2px 0}.stbl td:first-child{color:#64748b;padding-right:10px;white-space:nowrap}
+.stbl td:last-child{font-weight:500;text-align:right}
+.histogram{display:flex;align-items:flex-end;gap:1px;height:72px;margin:10px 0}
+.h-bar{flex:1;background:#3b82f6;border-radius:2px 2px 0 0;min-width:3px;transition:background .15s;cursor:default;min-height:1px}
+.h-bar:hover{background:#2563eb}
+.h-labels{display:flex;justify-content:space-between;font-size:9px;color:#94a3b8;margin-top:2px}
+.tvr{display:grid;grid-template-columns:110px 1fr 42px 52px;align-items:center;gap:6px;padding:2px 0;font-size:12px}
+.tvl{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
+.tvb-bg{height:7px;background:#e2e8f0;border-radius:4px;overflow:hidden}
+.tvb{height:100%;background:#3b82f6;border-radius:4px}
+.tvp{text-align:right;color:#64748b;font-size:11px}
+.tvc{text-align:right;color:#94a3b8;font-size:10px}
+.bbar{display:flex;height:18px;border-radius:4px;overflow:hidden;font-size:10px}
+.bt{background:#22c55e;color:#fff;display:flex;align-items:center;justify-content:center}
+.bf{background:#e2e8f0;color:#64748b;display:flex;align-items:center;justify-content:center}
+.svs{display:flex;gap:4px;flex-wrap:wrap;margin-top:6px}
+.sv{background:#f1f5f9;padding:1px 7px;border-radius:4px;font-size:11px;color:#475569}
+.swrap{margin-top:20px}
+.stog{cursor:pointer;color:#3b82f6;font-size:13px;font-weight:500;user-select:none}
+.sdata{display:none;margin-top:8px;overflow-x:auto}
+.sdata.open{display:block}
+table.dt{border-collapse:collapse;font-size:11px;width:100%}
+table.dt th{background:#f1f5f9;padding:5px 8px;text-align:left;font-weight:600;border:1px solid #e2e8f0;white-space:nowrap}
+table.dt td{padding:5px 8px;border:1px solid #e2e8f0;max-width:180px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
+.foot{text-align:center;color:#94a3b8;font-size:11px;margin-top:40px;padding-top:16px;border-top:1px solid #e2e8f0}
+@media print{.tabs,.stog{display:none}.tsec,.col-det,.sdata{display:block!important}body{background:#fff}.card{box-shadow:none;border:1px solid #e2e8f0}}
+"""
+
+_JS = """
+function switchTab(n){
+ document.querySelectorAll('.tab').forEach(function(t){t.classList.toggle('active',t.dataset.t===n)});
+ document.querySelectorAll('.tsec').forEach(function(s){s.classList.toggle('active',s.id==='t-'+n)});
+}
+function toggleCol(el){el.nextElementSibling.classList.toggle('open')}
+function toggleSample(el){el.nextElementSibling.classList.toggle('open')}
+"""
+
+
+def _esc(s: Any) -> str:
+ return html_mod.escape(str(s)) if s is not None else ""
+
+
+def _slug(name: str) -> str:
+ return name.replace(" ", "-").replace(".", "-").replace("/", "-")
+
+
+def _fnum(n: Any) -> str:
+ if n is None:
+ return "-"
+ if isinstance(n, float):
+ if n == int(n) and abs(n) < 1e15:
+ return f"{int(n):,}"
+ return f"{n:,.2f}"
+ if isinstance(n, int):
+ return f"{n:,}"
+ return str(n)
+
+
+def _compl_color(pct: float) -> str:
+ if pct >= 95:
+ return "#22c55e"
+ if pct >= 70:
+ return "#eab308"
+ return "#ef4444"
+
+
+def _render_hist(bins: list, counts: list) -> str:
+ if not bins or not counts:
+ return ""
+ max_c = max(counts) or 1
+ bars = []
+ for b, c in zip(bins, counts):
+ pct = c / max_c * 100
+ bars.append(f'
')
+ return (
+ f'{"".join(bars)}
'
+ f'{_esc(bins[0])}{_esc(bins[-1])}
'
+ )
+
+
+def _render_top_vals(top_values: list) -> str:
+ if not top_values:
+ return ""
+ max_pct = max((tv.get("pct", 0) for tv in top_values), default=1) or 1
+ rows = []
+ for tv in top_values:
+ bar_w = tv.get("pct", 0) / max_pct * 100
+ rows.append(
+ f''
+ f'
{_esc(str(tv["value"])[:30])}'
+ f'
'
+ f'
{tv.get("pct", 0)}%'
+ f'
({_fnum(tv.get("count", 0))})'
+ f'
'
+ )
+ return "".join(rows)
+
+
+ def _render_col_detail(col: dict) -> str:
+ """Render the detail section for one column profile dict.
+
+ Sections are emitted only for the keys that are present and truthy in
+ `col`, in this order: "numeric_stats", "string_stats", "date_stats",
+ "boolean_stats", "sample_values". The pieces are concatenated without a
+ separator; an empty string is returned when none of the keys is set.
+
+ NOTE(review): the markup inside most string literals is missing in this
+ view (literals are split across lines); restore them from the original
+ file before editing this block.
+ """
+ parts: List[str] = []
+ # Numeric columns: one row per statistic, followed by the histogram.
+ ns = col.get("numeric_stats")
+ if ns:
+ parts.append('')
+ for label, key in [
+ ("Min", "min"), ("Max", "max"), ("Mean", "mean"),
+ ("Median", "median"), ("Std Dev", "stddev"),
+ ("P5", "p5"), ("P25", "p25"), ("P75", "p75"), ("P95", "p95"),
+ ("Zeros", "zeros"), ("Zeros %", "zeros_pct"),
+ ("Negative", "negative"), ("Negative %", "negative_pct"),
+ ]:
+ parts.append(f'| {label} | {_fnum(ns.get(key))} |
')
+ parts.append('
')
+ h = ns.get("histogram", {})
+ parts.append(_render_hist(h.get("bins", []), h.get("counts", [])))
+ parts.append('
')
+
+ # String columns: length statistics, then the top values when present.
+ ss = col.get("string_stats")
+ if ss:
+ parts.append('')
+ parts.append(f'| Min length | {_fnum(ss.get("min_length"))} |
')
+ parts.append(f'| Max length | {_fnum(ss.get("max_length"))} |
')
+ parts.append(f'| Avg length | {_fnum(ss.get("avg_length"))} |
')
+ parts.append('
')
+ tv = ss.get("top_values", [])
+ if tv:
+ parts.append('Top Values
')
+ parts.append(_render_top_vals(tv))
+
+ # Date columns: earliest/latest/span, followed by the histogram.
+ ds = col.get("date_stats")
+ if ds:
+ parts.append('')
+ parts.append(f'| Earliest | {_esc(ds.get("earliest", "-"))} |
')
+ parts.append(f'| Latest | {_esc(ds.get("latest", "-"))} |
')
+ parts.append(f'| Span | {_fnum(ds.get("span_days"))} days |
')
+ parts.append('
')
+ h = ds.get("histogram", {})
+ parts.append(_render_hist(h.get("bins", []), h.get("counts", [])))
+ parts.append('
')
+
+ # Boolean columns: true/false share and counts.
+ bs = col.get("boolean_stats")
+ if bs:
+ tc, fc = bs.get("true_count", 0), bs.get("false_count", 0)
+ tp = bs.get("true_pct", 0)
+ # The false share is derived as the complement of the true share.
+ # NOTE(review): when `tp` is 0 (or missing) this yields fp == 0 rather
+ # than 100 - tp, so an all-False column would show "False 0%" --
+ # confirm whether that is intended.
+ fp = round(100 - tp, 1) if tp else 0
+ parts.append(
+ f''
+ f'
True {tp}% ({tc:,})
'
+ f'
False {fp}% ({fc:,})
'
+ f'
'
+ )
+
+ # Sample values: each one stringified, cut to 50 characters and escaped.
+ sv = col.get("sample_values", [])
+ if sv:
+ parts.append('Sample values:
')
+ parts.append('')
+ for v in sv:
+ parts.append(f'{_esc(str(v)[:50])}')
+ parts.append('
')
+
+ return "".join(parts)
+
+
+ def generate_html_report(profile_data: Dict[str, Any], output_path: Path) -> None:
+ """Generate a standalone HTML report from profile data.
+
+ The report is assembled as a list of string fragments: a header, summary
+ cards aggregated over all tables, a tab strip (only when there is more
+ than one table), then one section per table with stat cards, variable
+ types, alerts, the column list (with per-column detail from
+ _render_col_detail) and sample rows, followed by the footer.
+
+ Args:
+ profile_data: Full profile dict with "tables" key (table name -> table
+ profile dict); "generated_at" is read as well and defaults to "".
+ output_path: Path to write the HTML file. Missing parent directories
+ are created; the file is written as UTF-8 with the fragments joined
+ by newlines.
+
+ Returns:
+ None. When "tables" is missing or empty, a warning is logged and
+ nothing is written.
+
+ NOTE(review): the markup inside most string literals is missing in this
+ view (literals are split across lines), so where `generated_at`, `act`,
+ `sl`, `sev`, `color`, `cc_col` and `ctype` are interpolated cannot be
+ confirmed here -- restore the literals from the original file before
+ editing this block.
+ """
+ tables = profile_data.get("tables", {})
+ generated_at = profile_data.get("generated_at", "")
+ if not tables:
+ logger.warning("No tables in profile data")
+ return
+
+ # Aggregates across all tables for the summary cards.
+ total_tables = len(tables)
+ total_rows = sum(t.get("row_count", 0) for t in tables.values())
+ total_cols = sum(t.get("column_count", 0) for t in tables.values())
+ compl_vals = [t.get("avg_completeness", 0) for t in tables.values()]
+ # Unweighted mean of the per-table completeness values.
+ avg_compl = round(sum(compl_vals) / len(compl_vals), 1) if compl_vals else 0
+ total_alerts = sum(len(t.get("alerts", [])) for t in tables.values())
+ table_names = list(tables.keys())
+
+ h: List[str] = []
+ h.append('')
+ h.append('')
+ h.append('Data Profile Report')
+ h.append(f'')
+
+ # Header
+ h.append('
')
+
+ # Summary cards
+ h.append('
')
+ for val, label in [
+ (_fnum(total_tables), "Tables"),
+ (_fnum(total_rows), "Total Rows"),
+ (_fnum(total_cols), "Total Columns"),
+ (f"{avg_compl}%", "Avg Completeness"),
+ (_fnum(total_alerts), "Alerts"),
+ ]:
+ h.append(f'
')
+ h.append('
')
+
+ # Table tabs
+ if total_tables > 1:
+ h.append('
')
+ for i, name in enumerate(table_names):
+ act = " active" if i == 0 else ""
+ sl = _slug(name)
+ h.append(f'
{_esc(name)}
')
+ h.append('
')
+
+ # Table sections
+ for i, (name, tbl) in enumerate(tables.items()):
+ act = " active" if i == 0 or total_tables == 1 else ""
+ sl = _slug(name)
+ h.append(f'
')
+ h.append(f'{_esc(name)}
')
+
+ # Stat cards
+ h.append('')
+ rc = tbl.get("row_count", 0)
+ cc = tbl.get("column_count", 0)
+ tc = tbl.get("avg_completeness", 0)
+ sz = tbl.get("file_size_mb")
+ dupes = tbl.get("duplicate_rows", 0)
+ sampled = tbl.get("sampled", False)
+ for val, label in [
+ (_fnum(rc), "Rows"),
+ (_fnum(cc), "Columns"),
+ (f"{tc}%", "Completeness"),
+ (f"{sz} MB" if sz is not None else "-", "File Size"),
+ ]:
+ h.append(f'
')
+ # The date-range card is emitted only when an "earliest" value exists.
+ dr = tbl.get("date_range")
+ if dr and dr.get("earliest"):
+ h.append(
+ f'
'
+ f'{_esc(dr["earliest"])} — {_esc(dr["latest"])}
'
+ f'
Date Range ({_fnum(dr.get("span_days"))} days)
'
+ )
+ if dupes:
+ h.append(f'
{_fnum(dupes)}
Duplicate Rows
')
+ if sampled:
+ h.append(f'
')
+ h.append('
')
+
+ # Variable types
+ vt = tbl.get("variable_types", {})
+ if vt:
+ h.append('')
+ for cat, cnt in sorted(vt.items()):
+ color = _TYPE_COLORS.get(cat, "#6b7280")
+ # NOTE(review): `cat` and `cnt` are interpolated without _esc() here,
+ # unlike most other values taken from the profile -- confirm this is
+ # safe for profiles loaded from an external JSON file.
+ h.append(f'{cat} {cnt}')
+ h.append('
')
+
+ # Alerts
+ alerts = tbl.get("alerts", [])
+ if alerts:
+ h.append('')
+ for a in alerts:
+ # Alert types missing from _ALERT_SEVERITY fall back to "i".
+ sev = _ALERT_SEVERITY.get(a.get("type", ""), "i")
+ h.append(f'
{_esc(a.get("message", ""))}
')
+ h.append('
')
+
+ # Column list
+ columns = tbl.get("columns", [])
+ if columns:
+ h.append('Columns
')
+ h.append('')
+ # Header row
+ h.append('
')
+ h.append('
Name
Type
')
+ h.append('
Completeness
')
+ h.append('
Unique
')
+ h.append('
')
+
+ for col in columns:
+ cname = col.get("name", "")
+ cat = col.get("type_category", "STRING")
+ ctype = col.get("type", "")
+ cpct = col.get("completeness_pct", 0)
+ uniq = col.get("unique_count", 0)
+ upct = col.get("unique_pct", 0)
+ ca = col.get("alerts", [])
+ is_pk = col.get("is_primary_key", False)
+ color = _TYPE_COLORS.get(cat, "#6b7280")
+ cc_col = _compl_color(cpct)
+ pk_html = '
PK' if is_pk else ""
+ alert_html = f'
{len(ca)}' if ca else ""
+
+ h.append('
')
+ h.append(f'
{_esc(cname)}{pk_html}
')
+ h.append(f'
{_esc(cat[:4])}
')
+ h.append(f'
')
+ h.append(f'
{_fnum(uniq)} ({upct}%)
')
+ h.append(f'
{alert_html}
')
+ h.append('
')
+ h.append(f'
{_render_col_detail(col)}
')
+
+ h.append('
')
+
+ # Sample data
+ sample_rows = tbl.get("sample_rows", [])
+ if sample_rows:
+ h.append('')
+ h.append(f'
▶ Sample Data ({len(sample_rows)} rows)
')
+ h.append('
')
+ # Column headers come from the first row's keys; later rows are read
+ # with .get(), so a key missing from a row renders as "".
+ headers = list(sample_rows[0].keys())
+ h.append('' + ''.join(f'| {_esc(hd)} | ' for hd in headers) + '
')
+ for row in sample_rows:
+ h.append('' + ''.join(
+ f'| {_esc(str(row.get(hd, ""))[:60])} | '
+ for hd in headers
+ ) + '
')
+ h.append('
')
+
+ h.append('')
+
+ # Footer + JS
+ h.append('')
+ h.append(f'')
+ h.append('
')
+
+ # Create missing parent directories, then write the whole report at once.
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text("\n".join(h), encoding="utf-8")
+ logger.info("Wrote HTML report: %s", output_path)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+ def main(argv: Optional[List[str]] = None) -> None:
+     """Command-line entry point.
+
+     Two modes:
+
+     * ``--from-json PATH``: load an existing profile JSON and write the HTML
+       report next to it (same path with an ``.html`` suffix); nothing is
+       profiled.
+     * otherwise: profile every path given in ``files``, write the combined
+       JSON to ``--output`` and, with ``--html``, the HTML report next to it.
+
+     Args:
+         argv: Argument list to parse. ``None`` (the default) lets argparse
+             read ``sys.argv[1:]``, which is the original behavior.
+
+     Exits with status 1 when the ``--from-json`` input is missing, unreadable
+     or not a JSON object, or when no input could be profiled; argparse exits
+     with status 2 on usage errors. Inputs that fail individually are logged
+     and counted, but do not change the exit status as long as at least one
+     input was profiled.
+     """
+     parser = argparse.ArgumentParser(
+         description="Profile Parquet/CSV files and output JSON statistics + optional HTML report.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   %(prog)s data/orders.parquet
+   %(prog)s data/orders.parquet --primary-key order_id --html
+   %(prog)s data/orders.parquet data/customers.csv -o profiles.json --html
+   %(prog)s --from-json profile.json
+         """,
+     )
+     parser.add_argument(
+         "files",
+         nargs="*",
+         help="Parquet file(s), directory of Parquet files, or CSV file(s) to profile",
+     )
+     parser.add_argument(
+         "-o", "--output",
+         default="profile.json",
+         help="Output JSON file path (default: profile.json)",
+     )
+     parser.add_argument(
+         "--primary-key",
+         default=None,
+         help="Comma-separated primary key column(s) for duplicate detection",
+     )
+     parser.add_argument(
+         "--html",
+         action="store_true",
+         help="Also generate a standalone HTML report",
+     )
+     parser.add_argument(
+         "--from-json",
+         metavar="PATH",
+         default=None,
+         help="Generate HTML report from existing profile JSON (no profiling)",
+     )
+     parser.add_argument(
+         "--quiet", "-q",
+         action="store_true",
+         help="Suppress info logging",
+     )
+     args = parser.parse_args(argv)
+
+     if args.quiet:
+         logging.getLogger("profiler").setLevel(logging.WARNING)
+
+     # Mode 1: Generate HTML from existing JSON
+     if args.from_json:
+         json_path = Path(args.from_json)
+         if not json_path.exists():
+             logger.error("File not found: %s", json_path)
+             sys.exit(1)
+         try:
+             # JSON interchange is UTF-8; do not depend on the platform's
+             # default text encoding.
+             with open(json_path, encoding="utf-8") as f:
+                 profile_data = json.load(f)
+         except (OSError, ValueError) as exc:
+             # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+             logger.error("Could not read profile JSON %s: %s", json_path, exc)
+             sys.exit(1)
+         if not isinstance(profile_data, dict):
+             # The report generator reads the profile with dict lookups.
+             logger.error("Profile JSON %s must contain an object at top level", json_path)
+             sys.exit(1)
+         html_path = json_path.with_suffix(".html")
+         generate_html_report(profile_data, html_path)
+         logger.info("Done: HTML report at %s", html_path)
+         return
+
+     # Mode 2: Profile files
+     if not args.files:
+         parser.error("Provide files to profile, or use --from-json")
+
+     profiles: Dict[str, Any] = {}
+     success = 0
+     errors = 0
+
+     for file_path_str in args.files:
+         file_path = Path(file_path_str)
+         if not file_path.exists():
+             logger.error("File not found: %s", file_path)
+             errors += 1
+             continue
+
+         try:
+             logger.info("Profiling %s ...", file_path)
+             profile = profile_table(
+                 source_path=file_path,
+                 primary_key=args.primary_key,
+             )
+             table_name = profile["table_name"]
+             if table_name in profiles:
+                 # Two inputs reporting the same table name would otherwise
+                 # overwrite each other while both being counted as
+                 # successes; keep both under a numbered name instead.
+                 suffix = 2
+                 while f"{table_name}_{suffix}" in profiles:
+                     suffix += 1
+                 unique_name = f"{table_name}_{suffix}"
+                 logger.warning(
+                     "Duplicate table name %s (from %s); storing it as %s",
+                     table_name,
+                     file_path,
+                     unique_name,
+                 )
+                 profile["table_name"] = unique_name
+                 table_name = unique_name
+             profiles[table_name] = profile
+             success += 1
+             logger.info(
+                 " %s: %d rows, %d cols, %d alerts",
+                 profile["table_name"],
+                 profile["row_count"],
+                 profile["column_count"],
+                 len(profile["alerts"]),
+             )
+         except Exception as exc:
+             # Top-level boundary: one bad input must not abort the others.
+             logger.error("Failed to profile %s: %s", file_path, exc)
+             errors += 1
+
+     if not profiles:
+         logger.error("No tables profiled successfully")
+         sys.exit(1)
+
+     output = {
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+         "version": "1.0",
+         "tables": profiles,
+     }
+
+     output_path = Path(args.output)
+     write_json_atomic(output_path, output)
+
+     # Generate HTML if requested
+     if args.html:
+         html_path = output_path.with_suffix(".html")
+         generate_html_report(output, html_path)
+
+     logger.info("Done: %d profiled, %d errors. Output: %s", success, errors, output_path)
+
+
+ # Run the CLI only when this file is executed as a script, not on import.
+ if __name__ == "__main__":
+ main()