Move profiler config to instance.yaml (KISS principle)

Instead of hardcoded Python constants, load profiler settings from config:
- instance.yaml: optional profiler section holding the tunable parameters
  (ALERT_ZEROS_PCT is the one exception and stays hardcoded)
- Defaults: fall back to sensible defaults if the config cannot be loaded
- Centralized: profiler tuning lives in one place, no code changes needed
This commit is contained in:
Petr 2026-03-12 14:45:14 +01:00
parent c25278538c
commit be58e63394
2 changed files with 60 additions and 13 deletions

View file

@ -163,3 +163,15 @@ datasets: {}
catalog: catalog:
categories: {} categories: {}
order: [] order: []
# --- Data profiler (optional) ---
# profiler:
#   sample_size: 500000            # If table > this, sample this many rows; otherwise use all
#   max_categorical_distinct: 50   # Treat as categorical if unique <= this
#   top_values_limit: 10           # Top values per categorical column
#   histogram_bins: 15             # Bins in histogram visualizations
#   sample_rows_limit: 5           # Sample rows to show in UI "Sample" tab
#   sample_values_limit: 5         # Sample distinct values per column
#   alert_high_missing_pct: 30.0   # Alert threshold for high missing %
#   alert_missing_pct: 5.0         # Alert threshold for missing %
#   alert_imbalance_pct: 60.0      # Alert threshold for imbalance %
#   alert_high_cardinality: 50     # Alert threshold for high cardinality columns

View file

@ -32,21 +32,56 @@ logging.basicConfig(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Profiler configuration # Profiler configuration (loaded from instance.yaml, with defaults)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _merge_profiler_settings(section, defaults):
    """Overlay the values found in *section* onto *defaults*.

    Args:
        section: The raw ``profiler`` mapping from the instance config, or
            ``None`` when the section is absent or present but empty.
        defaults: Mapping of every supported setting name to its default.

    Returns:
        dict: A new dict with exactly the keys of *defaults*. A key that is
        missing from *section*, or explicitly set to null there, keeps its
        default value. Keys in *section* that are not in *defaults* are
        ignored.

    Raises:
        AttributeError: If *section* is neither ``None`` nor an object
            with a ``get`` method.
    """
    if section is None:
        return dict(defaults)
    merged = {}
    for key, default in defaults.items():
        value = section.get(key)
        # An explicit null must not replace a usable default.
        merged[key] = default if value is None else value
    return merged


def _load_profiler_config():
    """Load profiler config from instance.yaml with sensible defaults.

    Reads the ``profiler`` section of the object returned by
    ``config.loader.load_instance_config()`` (presumably the parsed
    instance.yaml -- confirm against the loader) and fills in a default
    for every setting that is not configured.

    Returns:
        dict: Settings keyed by ``sample_size``, ``max_categorical_distinct``,
        ``top_values_limit``, ``histogram_bins``, ``sample_rows_limit``,
        ``sample_values_limit``, ``alert_high_missing_pct``,
        ``alert_missing_pct``, ``alert_imbalance_pct`` and
        ``alert_high_cardinality``.

    Any exception raised while importing or reading the config is logged as
    a warning and the defaults are returned, so loading never raises.
    """
    # Single source of truth for the defaults, shared by both code paths.
    defaults = {
        "sample_size": 500_000,
        "max_categorical_distinct": 50,
        "top_values_limit": 10,
        "histogram_bins": 15,
        "sample_rows_limit": 5,
        "sample_values_limit": 5,
        "alert_high_missing_pct": 30.0,
        "alert_missing_pct": 5.0,
        "alert_imbalance_pct": 60.0,
        "alert_high_cardinality": 50,
    }
    try:
        # Imported lazily so that a missing or broken config package only
        # triggers the fallback below instead of failing at module import.
        from config.loader import load_instance_config

        config = load_instance_config()
        # No ``{}`` default here: a ``profiler`` key that is present but empty
        # yields None either way, and the merge helper handles None itself.
        return _merge_profiler_settings(config.get("profiler"), defaults)
    except Exception as e:
        # Deliberate best-effort: fall back to defaults if config loading fails.
        logger.warning(f"Could not load profiler config: {e}. Using defaults.")
        return defaults
# Resolve the settings once, when this module is imported, and expose each
# value under its module-level constant name.
_cfg = _load_profiler_config()

# Sampling and per-column summary sizes.
(
    SAMPLE_SIZE,
    MAX_CATEGORICAL_DISTINCT,
    TOP_VALUES_LIMIT,
    HISTOGRAM_BINS,
    SAMPLE_ROWS_LIMIT,
    SAMPLE_VALUES_LIMIT,
) = (
    _cfg["sample_size"],
    _cfg["max_categorical_distinct"],
    _cfg["top_values_limit"],
    _cfg["histogram_bins"],
    _cfg["sample_rows_limit"],
    _cfg["sample_values_limit"],
)

# Alert thresholds.
(
    ALERT_HIGH_MISSING_PCT,
    ALERT_MISSING_PCT,
    ALERT_IMBALANCE_PCT,
    ALERT_HIGH_CARDINALITY,
) = (
    _cfg["alert_high_missing_pct"],
    _cfg["alert_missing_pct"],
    _cfg["alert_imbalance_pct"],
    _cfg["alert_high_cardinality"],
)
# The zeros threshold is intentionally not read from the config (rarely
# needed), so it stays a fixed value.
ALERT_ZEROS_PCT = 50.0
# Paths - configurable via environment or defaults for server # Paths - configurable via environment or defaults for server
DATA_DIR = Path(os.environ.get("PROFILER_DATA_DIR", "/data/src_data")) DATA_DIR = Path(os.environ.get("PROFILER_DATA_DIR", "/data/src_data"))