Closes M15 from issue #81 — SQL injection via attacker-controlled identifiers in connectors/keboola/extractor.py and connectors/bigquery/extractor.py. Lifted _validate_identifier from src/orchestrator.py into a new src/identifier_validation.py shared module (single source of truth for both layers). Two validator policies: - validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for table_name — matches the orchestrator's rebuild-time check, so dashed names fail fast at extraction rather than being silently dropped. - validate_quoted_identifier (relaxed, accepts dashes/dots) for bucket/dataset/source_table — Keboola in.c-foo and BigQuery my-dataset are legitimate, just need to be safe inside `"..."`. Both extractors skip-and-continue on unsafe rows (logged + counted in failure stats); _extract_via_extension re-validates as defense-in-depth. 71/71 extractor + orchestrator tests pass. Refs #81 Group D.
172 lines
6.3 KiB
Python
172 lines
6.3 KiB
Python
"""Issue #81 Group D — extractor-layer identifier injection (M15).
|
|
|
|
Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`,
|
|
but at the extractor layer. Each test feeds a registry row with an
|
|
attacker-shaped `name` / `bucket` / `source_table` into the extractor and
|
|
asserts the row is skipped (no SQL run, error recorded) while valid
|
|
sibling rows in the same registry continue to process.
|
|
"""
|
|
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
from src.identifier_validation import (
|
|
is_safe_identifier,
|
|
is_safe_quoted_identifier,
|
|
validate_identifier,
|
|
validate_quoted_identifier,
|
|
)
|
|
|
|
|
|
class TestIsSafeIdentifier:
|
|
"""Strict identifier — for our own view names, aliases."""
|
|
|
|
@pytest.mark.parametrize(
|
|
"name",
|
|
["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64],
|
|
)
|
|
def test_valid(self, name):
|
|
assert is_safe_identifier(name) is True
|
|
|
|
@pytest.mark.parametrize(
|
|
"name",
|
|
[
|
|
"",
|
|
"1starts_with_digit",
|
|
"has space",
|
|
"has-dash",
|
|
"has.dot",
|
|
"has;semi",
|
|
"has\"quote",
|
|
"has'quote",
|
|
"evil\"; DROP TABLE x; --",
|
|
"x" * 65, # too long
|
|
"café", # diacritic — DuckDB allows but our policy doesn't
|
|
None,
|
|
123,
|
|
["x"],
|
|
],
|
|
)
|
|
def test_invalid(self, name):
|
|
assert is_safe_identifier(name) is False
|
|
|
|
|
|
class TestIsSafeQuotedIdentifier:
|
|
"""Relaxed identifier for upstream-typed names (Keboola buckets,
|
|
BigQuery datasets). Allows `.` and `-`; refuses injection markers."""
|
|
|
|
@pytest.mark.parametrize(
|
|
"name",
|
|
[
|
|
"orders",
|
|
"in.c-events", # Keboola bucket
|
|
"out.c-marketing.roi", # nested Keboola bucket form
|
|
"my-bq-dataset",
|
|
"1starts_with_digit", # numeric start fine inside quotes
|
|
"ABC_123",
|
|
"a" * 128, # at the limit
|
|
],
|
|
)
|
|
def test_valid(self, name):
|
|
assert is_safe_quoted_identifier(name) is True
|
|
|
|
@pytest.mark.parametrize(
|
|
"name",
|
|
[
|
|
"", # empty
|
|
"has space",
|
|
"has\"quote", # would close the surrounding `"`
|
|
"has'apostrophe",
|
|
"has;semi",
|
|
"evil\"; DROP TABLE x; --",
|
|
"name\x00with-nul", # NUL
|
|
"name\nwith-newline", # control char
|
|
"x" * 129, # too long
|
|
".starts_with_dot",
|
|
"-starts_with_dash",
|
|
None,
|
|
123,
|
|
["x"],
|
|
],
|
|
)
|
|
def test_invalid(self, name):
|
|
assert is_safe_quoted_identifier(name) is False
|
|
|
|
|
|
class TestKeboolaExtractorRefusesUnsafeIdentifiers:
|
|
"""Verify keboola/extractor.py:run() skips registry rows with unsafe
|
|
identifiers but still processes the safe siblings."""
|
|
|
|
def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch):
|
|
"""Behavioural test: pass mixed registry rows, observe stats."""
|
|
from connectors.keboola import extractor as kbe
|
|
|
|
# Use the extractor's run() with use_extension=False so we don't
|
|
# need a real Keboola server. We mock _extract_via_legacy to be a
|
|
# no-op that creates a tiny parquet so the size lookup works.
|
|
import duckdb
|
|
|
|
def fake_legacy(tc, pq_path, url, token):
|
|
d = duckdb.connect()
|
|
d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)")
|
|
d.close()
|
|
|
|
monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy)
|
|
|
|
# Three rows: a good underscore-only name, one with a dash (now
|
|
# rejected at the extractor — orchestrator's strict validator
|
|
# would have silently dropped it at rebuild time), and one with
|
|
# a hostile injection.
|
|
rows = [
|
|
{"name": "good_table", "query_mode": "local"},
|
|
{"name": "events-2026", "query_mode": "local"},
|
|
{"name": "evil\"; DROP TABLE x; --", "query_mode": "local"},
|
|
]
|
|
out_dir = tmp_path / "extracts" / "keboola"
|
|
result = kbe.run(
|
|
str(out_dir), rows, keboola_url="", keboola_token="",
|
|
)
|
|
|
|
# Only good_table passes. events-2026 is rejected up front because
|
|
# the orchestrator would silently drop it at rebuild time anyway —
|
|
# better to fail fast and visibly. Operators with dashed names
|
|
# rename them to underscore form.
|
|
assert result["tables_extracted"] == 1
|
|
assert result["tables_failed"] == 2
|
|
assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"])
|
|
|
|
def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch):
|
|
from connectors.keboola import extractor as kbe
|
|
|
|
# An adversarial bucket containing `"` would close the surrounding
|
|
# quote and inject SQL. Real Keboola buckets like `in.c-events`
|
|
# are accepted by the relaxed validator.
|
|
rows = [
|
|
{
|
|
"name": "good",
|
|
"query_mode": "remote",
|
|
"bucket": 'evil"; DROP--',
|
|
"source_table": "t",
|
|
},
|
|
]
|
|
out_dir = tmp_path / "extracts" / "keboola"
|
|
result = kbe.run(
|
|
str(out_dir), rows, keboola_url="", keboola_token="",
|
|
)
|
|
assert result["tables_failed"] == 1
|
|
assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"])
|
|
|
|
|
|
class TestBigQueryExtractorRefusesUnsafeIdentifiers:
|
|
def test_unsafe_dataset_rejected(self):
|
|
"""BigQuery extractor uses validate_quoted_identifier for dataset."""
|
|
# `my.dataset` is now legitimate (BigQuery datasets can contain dots).
|
|
assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True
|
|
# An injection attempt via embedded quote — refused.
|
|
assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False
|
|
# Sanity: a normal dataset name passes.
|
|
assert validate_quoted_identifier("marketing", "BigQuery dataset") is True
|
|
# The view-name (table_name) still uses the strict validator.
|
|
assert validate_identifier("marketing_view", "BigQuery table_name") is True
|
|
assert validate_identifier("marketing-view", "BigQuery table_name") is False
|