fix(security): #81 Group D — extractor-side identifier validation (squashed) (#97)

Closes M15 from issue #81 — SQL injection via attacker-controlled
identifiers in connectors/keboola/extractor.py and
connectors/bigquery/extractor.py.

Lifted _validate_identifier from src/orchestrator.py into a new
src/identifier_validation.py shared module (single source of truth for
both layers). Two validator policies:

- validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for
  table_name — matches the orchestrator's rebuild-time check, so dashed
  names fail fast at extraction rather than being silently dropped.
- validate_quoted_identifier (relaxed, accepts dashes/dots) for
  bucket/dataset/source_table — Keboola in.c-foo and BigQuery
  my-dataset are legitimate, just need to be safe inside `"..."`.

Both extractors skip-and-continue on unsafe rows (logged + counted in
failure stats); _extract_via_extension re-validates as defense-in-depth.

71/71 extractor + orchestrator tests pass.
Refs #81 Group D.
This commit is contained in:
ZdenekSrotyr 2026-04-27 21:46:17 +02:00 committed by GitHub
parent 23be8ad46f
commit 569cd90d75
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 336 additions and 13 deletions

View file

@ -133,6 +133,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
`keboola, bigquery`) is visible at boot rather than at the next
failed attach. See
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
- **Security (MEDIUM)**: extractor-side identifier validation (issue
#81 Group D / M15). The Keboola and BigQuery extractors interpolate
`table_name`, `bucket` / `dataset`, and `source_table` from
`table_registry` directly into `CREATE OR REPLACE VIEW`,
`INSERT INTO _meta`, and `COPY ... TO` SQL. Anyone with write access
to `table_registry` (admin, registry-write API) could inject SQL via
these identifiers. New shared module `src/identifier_validation.py`
exposes a strict `validate_identifier` (for our own view names —
`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`, used for `table_name` so it matches
the orchestrator's rebuild-time check and dashed names fail fast at
extraction rather than being silently dropped at rebuild) and a
relaxed `validate_quoted_identifier` (for upstream-typed names like
Keboola `in.c-foo` / BigQuery `my-dataset`:
`[a-zA-Z0-9_][a-zA-Z0-9_.\-]*`, refusing any character that could
close a `"..."` identifier literal). The orchestrator's existing
`_validate_identifier` was lifted into the new module so both layers
share a single source of truth; both extractors skip-and-continue on
unsafe rows (logged + counted in failure stats; the rest of the
registry still processes).
### Removed

View file

@ -12,6 +12,8 @@ from typing import List, Dict, Any
import duckdb
from src.identifier_validation import validate_identifier, validate_quoted_identifier
logger = logging.getLogger(__name__)
@ -90,6 +92,25 @@ def init_extract(
dataset = tc.get("bucket", "") # BigQuery dataset
source_table = tc.get("source_table", table_name)
# #81 Group D — refuse rows with unsafe identifiers. Same
# rationale as the keboola extractor: registry is admin-controlled
# but anyone with write access can otherwise inject SQL via the
# CREATE VIEW interpolation below. Skip-and-continue.
# `table_name` is the DuckDB view name in the master
# analytics DB and the orchestrator uses the STRICT
# validator there — accept the same constraint upstream
# so a name with `-` or `.` fails fast in extraction
# rather than getting silently dropped at rebuild time.
# `dataset` and `source_table` are upstream-typed (BQ
# naming) so use the relaxed validator for those.
if not (validate_identifier(table_name, "BigQuery table_name") and
validate_quoted_identifier(dataset, "BigQuery dataset") and
validate_quoted_identifier(source_table, "BigQuery source_table")):
stats["errors"].append(
{"table": table_name, "error": "unsafe identifier"}
)
continue
try:
conn.execute(
f'CREATE OR REPLACE VIEW "{table_name}" AS '

View file

@ -8,6 +8,13 @@ from typing import List, Dict, Any
import duckdb
from src.identifier_validation import (
is_safe_identifier,
is_safe_quoted_identifier,
validate_identifier,
validate_quoted_identifier,
)
logger = logging.getLogger(__name__)
@ -95,10 +102,37 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
table_name = tc["name"]
query_mode = tc.get("query_mode", "local")
# #81 Group D — refuse rows whose identifiers don't pass the
# whitelist. The registry is admin-controlled but anyone with
# write access can otherwise inject SQL via the CREATE VIEW /
# COPY / SELECT interpolation below. Skip-and-continue rather
# than crashing the whole extraction; valid rows still process.
#
# `table_name` is the DuckDB view name in the master
# analytics DB. The orchestrator uses the STRICT validator
# (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views,
# so any name with `-` or `.` would pass extraction here
# but be silently dropped at orchestrator-rebuild time.
# Use the strict validator here too so the failure is
# caught early and visible in tables_failed.
if not validate_identifier(table_name, "Keboola table_name"):
stats["tables_failed"] += 1
stats["errors"].append(
{"table": table_name, "error": "unsafe identifier"}
)
continue
if query_mode == "remote":
# Create view pointing to kbc extension (requires re-ATTACH at query time)
bucket = tc.get("bucket", "")
source_table = tc.get("source_table", table_name)
if not (validate_quoted_identifier(bucket, "Keboola bucket") and
validate_quoted_identifier(source_table, "Keboola source_table")):
stats["tables_failed"] += 1
stats["errors"].append(
{"table": table_name, "error": "unsafe bucket/source_table"}
)
continue
if use_extension and bucket:
conn.execute(
f'CREATE OR REPLACE VIEW "{table_name}" AS '
@ -119,13 +153,19 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
else:
_extract_via_legacy(tc, pq_path, keboola_url, keboola_token)
# Get row count and file size
rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_path}')").fetchone()[0]
# Get row count and file size. pq_path is built from the
# validated table_name above, but escape the parquet path
# literal for defense-in-depth.
safe_pq_lit = pq_path.replace("'", "''")
rows = conn.execute(
f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')"
).fetchone()[0]
size = os.path.getsize(pq_path)
# Create view and register in _meta
conn.execute(
f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{pq_path}\')'
f'CREATE OR REPLACE VIEW "{table_name}" AS '
f'SELECT * FROM read_parquet(\'{safe_pq_lit}\')'
)
conn.execute(
"INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
@ -172,8 +212,17 @@ def _extract_via_extension(
"""Extract a table using the DuckDB Keboola extension."""
bucket = tc.get("bucket", "")
source_table = tc.get("source_table", tc["name"])
# #81 Group D — defense-in-depth. The caller already validates these;
# refuse here too in case a future caller forgets. Use the relaxed
# quoted-identifier check that accepts Keboola's `in.c-foo` form.
if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)):
raise ValueError(
f"unsafe bucket/source_table: {bucket!r}/{source_table!r}"
)
safe_pq_lit = pq_path.replace("'", "''")
conn.execute(
f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{pq_path}\' (FORMAT PARQUET)'
f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") '
f'TO \'{safe_pq_lit}\' (FORMAT PARQUET)'
)

View file

@ -0,0 +1,63 @@
"""DuckDB identifier validation — shared across orchestrator and extractors.
Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the
orchestrator's `_meta.table_name` SQLi (M14, fixed previously by
`src/orchestrator.py:_validate_identifier`). Same trust problem at a
different layer: an attacker who controls the contents of `table_registry`
(admin or whoever can write to that table) can inject SQL via identifier
interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` /
`INSERT INTO _meta` statements.
Lifted from `src/orchestrator.py` so both layers use the same regex.
"""
from __future__ import annotations
import logging
import re
logger = logging.getLogger(__name__)
# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body,
# bounded length. Use for orchestrator-side aliases, extension names, view names —
# anything we generate or that comes from a tightly-controlled namespace.
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
# Relaxed identifier allowing the dotted/dashed forms that real upstreams use
# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything
# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`,
# no control chars, no NUL). 128-char cap matches common DB identifier limits.
_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$")
def is_safe_identifier(name: object) -> bool:
"""Return True if ``name`` is safe to interpolate into a strict
DuckDB identifier position (alias, extension, schema we generate)."""
return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name))
def is_safe_quoted_identifier(name: object) -> bool:
"""Return True if ``name`` is safe to interpolate **inside** double-quotes
in a DuckDB identifier position. Allows `.` and `-` for upstream
naming conventions (Keboola buckets like `in.c-events`, BigQuery
datasets) but refuses anything that could close the quote or
inject control characters."""
return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name))
def validate_identifier(name: str, context: str) -> bool:
"""Strict check — returns True if safe, False (with WARNING log) if not.
Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`."""
if not is_safe_identifier(name):
logger.warning("Rejected unsafe %s identifier: %r", context, name)
return False
return True
def validate_quoted_identifier(name: str, context: str) -> bool:
"""Relaxed check for upstream-typed identifiers (buckets, datasets).
Accepts dots and dashes; refuses quote/semicolon/control chars."""
if not is_safe_quoted_identifier(name):
logger.warning("Rejected unsafe %s identifier: %r", context, name)
return False
return True

View file

@ -38,15 +38,14 @@ logger = logging.getLogger(__name__)
_rebuild_lock = threading.Lock()
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
def _validate_identifier(name: str, context: str) -> bool:
"""Validate a DuckDB identifier. Returns True if safe, False if not."""
if not _SAFE_IDENTIFIER.match(name):
logger.warning("Rejected unsafe %s identifier: %r", context, name)
return False
return True
# Identifier validation lives in src/identifier_validation.py so the
# orchestrator and the extractors share the same regex (#81 Group D).
# The local names are kept as aliases so existing call sites need no
# rename — they import from a single source of truth now.
from src.identifier_validation import ( # noqa: E402
_SAFE_IDENTIFIER, # noqa: F401 (re-exported for any historical caller)
validate_identifier as _validate_identifier,
)
def _atomic_swap_db(tmp_path: str, target_path: str) -> None:

View file

@ -0,0 +1,172 @@
"""Issue #81 Group D — extractor-layer identifier injection (M15).
Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`,
but at the extractor layer. Each test feeds a registry row with an
attacker-shaped `name` / `bucket` / `source_table` into the extractor and
asserts the row is skipped (no SQL run, error recorded) while valid
sibling rows in the same registry continue to process.
"""
from unittest.mock import MagicMock
import pytest
from src.identifier_validation import (
is_safe_identifier,
is_safe_quoted_identifier,
validate_identifier,
validate_quoted_identifier,
)
class TestIsSafeIdentifier:
"""Strict identifier — for our own view names, aliases."""
@pytest.mark.parametrize(
"name",
["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64],
)
def test_valid(self, name):
assert is_safe_identifier(name) is True
@pytest.mark.parametrize(
"name",
[
"",
"1starts_with_digit",
"has space",
"has-dash",
"has.dot",
"has;semi",
"has\"quote",
"has'quote",
"evil\"; DROP TABLE x; --",
"x" * 65, # too long
"café", # diacritic — DuckDB allows but our policy doesn't
None,
123,
["x"],
],
)
def test_invalid(self, name):
assert is_safe_identifier(name) is False
class TestIsSafeQuotedIdentifier:
"""Relaxed identifier for upstream-typed names (Keboola buckets,
BigQuery datasets). Allows `.` and `-`; refuses injection markers."""
@pytest.mark.parametrize(
"name",
[
"orders",
"in.c-events", # Keboola bucket
"out.c-marketing.roi", # nested Keboola bucket form
"my-bq-dataset",
"1starts_with_digit", # numeric start fine inside quotes
"ABC_123",
"a" * 128, # at the limit
],
)
def test_valid(self, name):
assert is_safe_quoted_identifier(name) is True
@pytest.mark.parametrize(
"name",
[
"", # empty
"has space",
"has\"quote", # would close the surrounding `"`
"has'apostrophe",
"has;semi",
"evil\"; DROP TABLE x; --",
"name\x00with-nul", # NUL
"name\nwith-newline", # control char
"x" * 129, # too long
".starts_with_dot",
"-starts_with_dash",
None,
123,
["x"],
],
)
def test_invalid(self, name):
assert is_safe_quoted_identifier(name) is False
class TestKeboolaExtractorRefusesUnsafeIdentifiers:
"""Verify keboola/extractor.py:run() skips registry rows with unsafe
identifiers but still processes the safe siblings."""
def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch):
"""Behavioural test: pass mixed registry rows, observe stats."""
from connectors.keboola import extractor as kbe
# Use the extractor's run() with use_extension=False so we don't
# need a real Keboola server. We mock _extract_via_legacy to be a
# no-op that creates a tiny parquet so the size lookup works.
import duckdb
def fake_legacy(tc, pq_path, url, token):
d = duckdb.connect()
d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)")
d.close()
monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy)
# Three rows: a good underscore-only name, one with a dash (now
# rejected at the extractor — orchestrator's strict validator
# would have silently dropped it at rebuild time), and one with
# a hostile injection.
rows = [
{"name": "good_table", "query_mode": "local"},
{"name": "events-2026", "query_mode": "local"},
{"name": "evil\"; DROP TABLE x; --", "query_mode": "local"},
]
out_dir = tmp_path / "extracts" / "keboola"
result = kbe.run(
str(out_dir), rows, keboola_url="", keboola_token="",
)
# Only good_table passes. events-2026 is rejected up front because
# the orchestrator would silently drop it at rebuild time anyway —
# better to fail fast and visibly. Operators with dashed names
# rename them to underscore form.
assert result["tables_extracted"] == 1
assert result["tables_failed"] == 2
assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"])
def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch):
from connectors.keboola import extractor as kbe
# An adversarial bucket containing `"` would close the surrounding
# quote and inject SQL. Real Keboola buckets like `in.c-events`
# are accepted by the relaxed validator.
rows = [
{
"name": "good",
"query_mode": "remote",
"bucket": 'evil"; DROP--',
"source_table": "t",
},
]
out_dir = tmp_path / "extracts" / "keboola"
result = kbe.run(
str(out_dir), rows, keboola_url="", keboola_token="",
)
assert result["tables_failed"] == 1
assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"])
class TestBigQueryExtractorRefusesUnsafeIdentifiers:
def test_unsafe_dataset_rejected(self):
"""BigQuery extractor uses validate_quoted_identifier for dataset."""
# `my.dataset` is now legitimate (BigQuery datasets can contain dots).
assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True
# An injection attempt via embedded quote — refused.
assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False
# Sanity: a normal dataset name passes.
assert validate_quoted_identifier("marketing", "BigQuery dataset") is True
# The view-name (table_name) still uses the strict validator.
assert validate_identifier("marketing_view", "BigQuery table_name") is True
assert validate_identifier("marketing-view", "BigQuery table_name") is False