Closes M15 from issue #81 — SQL injection via attacker-controlled identifiers in connectors/keboola/extractor.py and connectors/bigquery/extractor.py. Lifted _validate_identifier from src/orchestrator.py into a new src/identifier_validation.py shared module (single source of truth for both layers). Two validator policies: - validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for table_name — matches the orchestrator's rebuild-time check, so dashed names fail fast at extraction rather than being silently dropped. - validate_quoted_identifier (relaxed, accepts dashes/dots) for bucket/dataset/source_table — Keboola in.c-foo and BigQuery my-dataset are legitimate, just need to be safe inside `"..."`. Both extractors skip-and-continue on unsafe rows (logged + counted in failure stats); _extract_via_extension re-validates as defense-in-depth. 71/71 extractor + orchestrator tests pass. Refs #81 Group D.
This commit is contained in:
parent
23be8ad46f
commit
569cd90d75
6 changed files with 336 additions and 13 deletions
19
CHANGELOG.md
19
CHANGELOG.md
|
|
@ -133,6 +133,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
|||
`keboola, bigquery`) is visible at boot rather than at the next
|
||||
failed attach. See
|
||||
`docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
|
||||
- **Security (MEDIUM)**: extractor-side identifier validation (issue
|
||||
#81 Group D / M15). The Keboola and BigQuery extractors interpolate
|
||||
`table_name`, `bucket` / `dataset`, and `source_table` from
|
||||
`table_registry` directly into `CREATE OR REPLACE VIEW`,
|
||||
`INSERT INTO _meta`, and `COPY ... TO` SQL. Anyone with write access
|
||||
to `table_registry` (admin, registry-write API) could inject SQL via
|
||||
these identifiers. New shared module `src/identifier_validation.py`
|
||||
exposes a strict `validate_identifier` (for our own view names —
|
||||
`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`, used for `table_name` so it matches
|
||||
the orchestrator's rebuild-time check and dashed names fail fast at
|
||||
extraction rather than being silently dropped at rebuild) and a
|
||||
relaxed `validate_quoted_identifier` (for upstream-typed names like
|
||||
Keboola `in.c-foo` / BigQuery `my-dataset`:
|
||||
`[a-zA-Z0-9_][a-zA-Z0-9_.\-]*`, refusing any character that could
|
||||
close a `"..."` identifier literal). The orchestrator's existing
|
||||
`_validate_identifier` was lifted into the new module so both layers
|
||||
share a single source of truth; both extractors skip-and-continue on
|
||||
unsafe rows (logged + counted in failure stats; the rest of the
|
||||
registry still processes).
|
||||
|
||||
### Removed
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ from typing import List, Dict, Any
|
|||
|
||||
import duckdb
|
||||
|
||||
from src.identifier_validation import validate_identifier, validate_quoted_identifier
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -90,6 +92,25 @@ def init_extract(
|
|||
dataset = tc.get("bucket", "") # BigQuery dataset
|
||||
source_table = tc.get("source_table", table_name)
|
||||
|
||||
# #81 Group D — refuse rows with unsafe identifiers. Same
|
||||
# rationale as the keboola extractor: registry is admin-controlled
|
||||
# but anyone with write access can otherwise inject SQL via the
|
||||
# CREATE VIEW interpolation below. Skip-and-continue.
|
||||
# `table_name` is the DuckDB view name in the master
|
||||
# analytics DB and the orchestrator uses the STRICT
|
||||
# validator there — accept the same constraint upstream
|
||||
# so a name with `-` or `.` fails fast in extraction
|
||||
# rather than getting silently dropped at rebuild time.
|
||||
# `dataset` and `source_table` are upstream-typed (BQ
|
||||
# naming) so use the relaxed validator for those.
|
||||
if not (validate_identifier(table_name, "BigQuery table_name") and
|
||||
validate_quoted_identifier(dataset, "BigQuery dataset") and
|
||||
validate_quoted_identifier(source_table, "BigQuery source_table")):
|
||||
stats["errors"].append(
|
||||
{"table": table_name, "error": "unsafe identifier"}
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
conn.execute(
|
||||
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||
|
|
|
|||
|
|
@ -8,6 +8,13 @@ from typing import List, Dict, Any
|
|||
|
||||
import duckdb
|
||||
|
||||
from src.identifier_validation import (
|
||||
is_safe_identifier,
|
||||
is_safe_quoted_identifier,
|
||||
validate_identifier,
|
||||
validate_quoted_identifier,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -95,10 +102,37 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
|
|||
table_name = tc["name"]
|
||||
query_mode = tc.get("query_mode", "local")
|
||||
|
||||
# #81 Group D — refuse rows whose identifiers don't pass the
|
||||
# whitelist. The registry is admin-controlled but anyone with
|
||||
# write access can otherwise inject SQL via the CREATE VIEW /
|
||||
# COPY / SELECT interpolation below. Skip-and-continue rather
|
||||
# than crashing the whole extraction; valid rows still process.
|
||||
#
|
||||
# `table_name` is the DuckDB view name in the master
|
||||
# analytics DB. The orchestrator uses the STRICT validator
|
||||
# (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views,
|
||||
# so any name with `-` or `.` would pass extraction here
|
||||
# but be silently dropped at orchestrator-rebuild time.
|
||||
# Use the strict validator here too so the failure is
|
||||
# caught early and visible in tables_failed.
|
||||
if not validate_identifier(table_name, "Keboola table_name"):
|
||||
stats["tables_failed"] += 1
|
||||
stats["errors"].append(
|
||||
{"table": table_name, "error": "unsafe identifier"}
|
||||
)
|
||||
continue
|
||||
|
||||
if query_mode == "remote":
|
||||
# Create view pointing to kbc extension (requires re-ATTACH at query time)
|
||||
bucket = tc.get("bucket", "")
|
||||
source_table = tc.get("source_table", table_name)
|
||||
if not (validate_quoted_identifier(bucket, "Keboola bucket") and
|
||||
validate_quoted_identifier(source_table, "Keboola source_table")):
|
||||
stats["tables_failed"] += 1
|
||||
stats["errors"].append(
|
||||
{"table": table_name, "error": "unsafe bucket/source_table"}
|
||||
)
|
||||
continue
|
||||
if use_extension and bucket:
|
||||
conn.execute(
|
||||
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||
|
|
@ -119,13 +153,19 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
|
|||
else:
|
||||
_extract_via_legacy(tc, pq_path, keboola_url, keboola_token)
|
||||
|
||||
# Get row count and file size
|
||||
rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_path}')").fetchone()[0]
|
||||
# Get row count and file size. pq_path is built from the
|
||||
# validated table_name above, but escape the parquet path
|
||||
# literal for defense-in-depth.
|
||||
safe_pq_lit = pq_path.replace("'", "''")
|
||||
rows = conn.execute(
|
||||
f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')"
|
||||
).fetchone()[0]
|
||||
size = os.path.getsize(pq_path)
|
||||
|
||||
# Create view and register in _meta
|
||||
conn.execute(
|
||||
f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{pq_path}\')'
|
||||
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||
f'SELECT * FROM read_parquet(\'{safe_pq_lit}\')'
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
|
||||
|
|
@ -172,8 +212,17 @@ def _extract_via_extension(
|
|||
"""Extract a table using the DuckDB Keboola extension."""
|
||||
bucket = tc.get("bucket", "")
|
||||
source_table = tc.get("source_table", tc["name"])
|
||||
# #81 Group D — defense-in-depth. The caller already validates these;
|
||||
# refuse here too in case a future caller forgets. Use the relaxed
|
||||
# quoted-identifier check that accepts Keboola's `in.c-foo` form.
|
||||
if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)):
|
||||
raise ValueError(
|
||||
f"unsafe bucket/source_table: {bucket!r}/{source_table!r}"
|
||||
)
|
||||
safe_pq_lit = pq_path.replace("'", "''")
|
||||
conn.execute(
|
||||
f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{pq_path}\' (FORMAT PARQUET)'
|
||||
f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") '
|
||||
f'TO \'{safe_pq_lit}\' (FORMAT PARQUET)'
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
63
src/identifier_validation.py
Normal file
63
src/identifier_validation.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
"""DuckDB identifier validation — shared across orchestrator and extractors.
|
||||
|
||||
Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the
|
||||
orchestrator's `_meta.table_name` SQLi (M14, fixed previously by
|
||||
`src/orchestrator.py:_validate_identifier`). Same trust problem at a
|
||||
different layer: an attacker who controls the contents of `table_registry`
|
||||
(admin or whoever can write to that table) can inject SQL via identifier
|
||||
interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` /
|
||||
`INSERT INTO _meta` statements.
|
||||
|
||||
Lifted from `src/orchestrator.py` so both layers use the same regex.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body,
|
||||
# bounded length. Use for orchestrator-side aliases, extension names, view names —
|
||||
# anything we generate or that comes from a tightly-controlled namespace.
|
||||
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
|
||||
|
||||
# Relaxed identifier allowing the dotted/dashed forms that real upstreams use
|
||||
# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything
|
||||
# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`,
|
||||
# no control chars, no NUL). 128-char cap matches common DB identifier limits.
|
||||
_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$")
|
||||
|
||||
|
||||
def is_safe_identifier(name: object) -> bool:
|
||||
"""Return True if ``name`` is safe to interpolate into a strict
|
||||
DuckDB identifier position (alias, extension, schema we generate)."""
|
||||
return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name))
|
||||
|
||||
|
||||
def is_safe_quoted_identifier(name: object) -> bool:
|
||||
"""Return True if ``name`` is safe to interpolate **inside** double-quotes
|
||||
in a DuckDB identifier position. Allows `.` and `-` for upstream
|
||||
naming conventions (Keboola buckets like `in.c-events`, BigQuery
|
||||
datasets) but refuses anything that could close the quote or
|
||||
inject control characters."""
|
||||
return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name))
|
||||
|
||||
|
||||
def validate_identifier(name: str, context: str) -> bool:
|
||||
"""Strict check — returns True if safe, False (with WARNING log) if not.
|
||||
Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`."""
|
||||
if not is_safe_identifier(name):
|
||||
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def validate_quoted_identifier(name: str, context: str) -> bool:
|
||||
"""Relaxed check for upstream-typed identifiers (buckets, datasets).
|
||||
Accepts dots and dashes; refuses quote/semicolon/control chars."""
|
||||
if not is_safe_quoted_identifier(name):
|
||||
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
||||
return False
|
||||
return True
|
||||
|
|
@ -38,15 +38,14 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
_rebuild_lock = threading.Lock()
|
||||
|
||||
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
|
||||
|
||||
|
||||
def _validate_identifier(name: str, context: str) -> bool:
|
||||
"""Validate a DuckDB identifier. Returns True if safe, False if not."""
|
||||
if not _SAFE_IDENTIFIER.match(name):
|
||||
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
||||
return False
|
||||
return True
|
||||
# Identifier validation lives in src/identifier_validation.py so the
|
||||
# orchestrator and the extractors share the same regex (#81 Group D).
|
||||
# The local names are kept as aliases so existing call sites need no
|
||||
# rename — they import from a single source of truth now.
|
||||
from src.identifier_validation import ( # noqa: E402
|
||||
_SAFE_IDENTIFIER, # noqa: F401 (re-exported for any historical caller)
|
||||
validate_identifier as _validate_identifier,
|
||||
)
|
||||
|
||||
|
||||
def _atomic_swap_db(tmp_path: str, target_path: str) -> None:
|
||||
|
|
|
|||
172
tests/test_extractor_identifier_validation.py
Normal file
172
tests/test_extractor_identifier_validation.py
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
"""Issue #81 Group D — extractor-layer identifier injection (M15).
|
||||
|
||||
Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`,
|
||||
but at the extractor layer. Each test feeds a registry row with an
|
||||
attacker-shaped `name` / `bucket` / `source_table` into the extractor and
|
||||
asserts the row is skipped (no SQL run, error recorded) while valid
|
||||
sibling rows in the same registry continue to process.
|
||||
"""
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.identifier_validation import (
|
||||
is_safe_identifier,
|
||||
is_safe_quoted_identifier,
|
||||
validate_identifier,
|
||||
validate_quoted_identifier,
|
||||
)
|
||||
|
||||
|
||||
class TestIsSafeIdentifier:
|
||||
"""Strict identifier — for our own view names, aliases."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name",
|
||||
["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64],
|
||||
)
|
||||
def test_valid(self, name):
|
||||
assert is_safe_identifier(name) is True
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name",
|
||||
[
|
||||
"",
|
||||
"1starts_with_digit",
|
||||
"has space",
|
||||
"has-dash",
|
||||
"has.dot",
|
||||
"has;semi",
|
||||
"has\"quote",
|
||||
"has'quote",
|
||||
"evil\"; DROP TABLE x; --",
|
||||
"x" * 65, # too long
|
||||
"café", # diacritic — DuckDB allows but our policy doesn't
|
||||
None,
|
||||
123,
|
||||
["x"],
|
||||
],
|
||||
)
|
||||
def test_invalid(self, name):
|
||||
assert is_safe_identifier(name) is False
|
||||
|
||||
|
||||
class TestIsSafeQuotedIdentifier:
|
||||
"""Relaxed identifier for upstream-typed names (Keboola buckets,
|
||||
BigQuery datasets). Allows `.` and `-`; refuses injection markers."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name",
|
||||
[
|
||||
"orders",
|
||||
"in.c-events", # Keboola bucket
|
||||
"out.c-marketing.roi", # nested Keboola bucket form
|
||||
"my-bq-dataset",
|
||||
"1starts_with_digit", # numeric start fine inside quotes
|
||||
"ABC_123",
|
||||
"a" * 128, # at the limit
|
||||
],
|
||||
)
|
||||
def test_valid(self, name):
|
||||
assert is_safe_quoted_identifier(name) is True
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name",
|
||||
[
|
||||
"", # empty
|
||||
"has space",
|
||||
"has\"quote", # would close the surrounding `"`
|
||||
"has'apostrophe",
|
||||
"has;semi",
|
||||
"evil\"; DROP TABLE x; --",
|
||||
"name\x00with-nul", # NUL
|
||||
"name\nwith-newline", # control char
|
||||
"x" * 129, # too long
|
||||
".starts_with_dot",
|
||||
"-starts_with_dash",
|
||||
None,
|
||||
123,
|
||||
["x"],
|
||||
],
|
||||
)
|
||||
def test_invalid(self, name):
|
||||
assert is_safe_quoted_identifier(name) is False
|
||||
|
||||
|
||||
class TestKeboolaExtractorRefusesUnsafeIdentifiers:
|
||||
"""Verify keboola/extractor.py:run() skips registry rows with unsafe
|
||||
identifiers but still processes the safe siblings."""
|
||||
|
||||
def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch):
|
||||
"""Behavioural test: pass mixed registry rows, observe stats."""
|
||||
from connectors.keboola import extractor as kbe
|
||||
|
||||
# Use the extractor's run() with use_extension=False so we don't
|
||||
# need a real Keboola server. We mock _extract_via_legacy to be a
|
||||
# no-op that creates a tiny parquet so the size lookup works.
|
||||
import duckdb
|
||||
|
||||
def fake_legacy(tc, pq_path, url, token):
|
||||
d = duckdb.connect()
|
||||
d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)")
|
||||
d.close()
|
||||
|
||||
monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy)
|
||||
|
||||
# Three rows: a good underscore-only name, one with a dash (now
|
||||
# rejected at the extractor — orchestrator's strict validator
|
||||
# would have silently dropped it at rebuild time), and one with
|
||||
# a hostile injection.
|
||||
rows = [
|
||||
{"name": "good_table", "query_mode": "local"},
|
||||
{"name": "events-2026", "query_mode": "local"},
|
||||
{"name": "evil\"; DROP TABLE x; --", "query_mode": "local"},
|
||||
]
|
||||
out_dir = tmp_path / "extracts" / "keboola"
|
||||
result = kbe.run(
|
||||
str(out_dir), rows, keboola_url="", keboola_token="",
|
||||
)
|
||||
|
||||
# Only good_table passes. events-2026 is rejected up front because
|
||||
# the orchestrator would silently drop it at rebuild time anyway —
|
||||
# better to fail fast and visibly. Operators with dashed names
|
||||
# rename them to underscore form.
|
||||
assert result["tables_extracted"] == 1
|
||||
assert result["tables_failed"] == 2
|
||||
assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"])
|
||||
|
||||
def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch):
|
||||
from connectors.keboola import extractor as kbe
|
||||
|
||||
# An adversarial bucket containing `"` would close the surrounding
|
||||
# quote and inject SQL. Real Keboola buckets like `in.c-events`
|
||||
# are accepted by the relaxed validator.
|
||||
rows = [
|
||||
{
|
||||
"name": "good",
|
||||
"query_mode": "remote",
|
||||
"bucket": 'evil"; DROP--',
|
||||
"source_table": "t",
|
||||
},
|
||||
]
|
||||
out_dir = tmp_path / "extracts" / "keboola"
|
||||
result = kbe.run(
|
||||
str(out_dir), rows, keboola_url="", keboola_token="",
|
||||
)
|
||||
assert result["tables_failed"] == 1
|
||||
assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"])
|
||||
|
||||
|
||||
class TestBigQueryExtractorRefusesUnsafeIdentifiers:
|
||||
def test_unsafe_dataset_rejected(self):
|
||||
"""BigQuery extractor uses validate_quoted_identifier for dataset."""
|
||||
# `my.dataset` is now legitimate (BigQuery datasets can contain dots).
|
||||
assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True
|
||||
# An injection attempt via embedded quote — refused.
|
||||
assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False
|
||||
# Sanity: a normal dataset name passes.
|
||||
assert validate_quoted_identifier("marketing", "BigQuery dataset") is True
|
||||
# The view-name (table_name) still uses the strict validator.
|
||||
assert validate_identifier("marketing_view", "BigQuery table_name") is True
|
||||
assert validate_identifier("marketing-view", "BigQuery table_name") is False
|
||||
Loading…
Reference in a new issue