Closes M15 from issue #81 — SQL injection via attacker-controlled identifiers in connectors/keboola/extractor.py and connectors/bigquery/extractor.py. Lifted _validate_identifier from src/orchestrator.py into a new src/identifier_validation.py shared module (single source of truth for both layers). Two validator policies: - validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for table_name — matches the orchestrator's rebuild-time check, so dashed names fail fast at extraction rather than being silently dropped. - validate_quoted_identifier (relaxed, accepts dashes/dots) for bucket/dataset/source_table — Keboola in.c-foo and BigQuery my-dataset are legitimate, just need to be safe inside `"..."`. Both extractors skip-and-continue on unsafe rows (logged + counted in failure stats); _extract_via_extension re-validates as defense-in-depth. 71/71 extractor + orchestrator tests pass. Refs #81 Group D.
63 lines
2.8 KiB
Python
63 lines
2.8 KiB
Python
"""DuckDB identifier validation — shared across orchestrator and extractors.
|
|
|
|
Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the
|
|
orchestrator's `_meta.table_name` SQLi (M14, fixed previously by
|
|
`src/orchestrator.py:_validate_identifier`). Same trust problem at a
|
|
different layer: an attacker who controls the contents of `table_registry`
|
|
(admin or whoever can write to that table) can inject SQL via identifier
|
|
interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` /
|
|
`INSERT INTO _meta` statements.
|
|
|
|
Lifted from `src/orchestrator.py` so both layers use the same regex.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body,
|
|
# bounded length. Use for orchestrator-side aliases, extension names, view names —
|
|
# anything we generate or that comes from a tightly-controlled namespace.
|
|
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
|
|
|
|
# Relaxed identifier allowing the dotted/dashed forms that real upstreams use
|
|
# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything
|
|
# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`,
|
|
# no control chars, no NUL). 128-char cap matches common DB identifier limits.
|
|
_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$")
|
|
|
|
|
|
def is_safe_identifier(name: object) -> bool:
|
|
"""Return True if ``name`` is safe to interpolate into a strict
|
|
DuckDB identifier position (alias, extension, schema we generate)."""
|
|
return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name))
|
|
|
|
|
|
def is_safe_quoted_identifier(name: object) -> bool:
|
|
"""Return True if ``name`` is safe to interpolate **inside** double-quotes
|
|
in a DuckDB identifier position. Allows `.` and `-` for upstream
|
|
naming conventions (Keboola buckets like `in.c-events`, BigQuery
|
|
datasets) but refuses anything that could close the quote or
|
|
inject control characters."""
|
|
return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name))
|
|
|
|
|
|
def validate_identifier(name: str, context: str) -> bool:
|
|
"""Strict check — returns True if safe, False (with WARNING log) if not.
|
|
Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`."""
|
|
if not is_safe_identifier(name):
|
|
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
|
return False
|
|
return True
|
|
|
|
|
|
def validate_quoted_identifier(name: str, context: str) -> bool:
|
|
"""Relaxed check for upstream-typed identifiers (buckets, datasets).
|
|
Accepts dots and dashes; refuses quote/semicolon/control chars."""
|
|
if not is_safe_quoted_identifier(name):
|
|
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
|
return False
|
|
return True
|