From 569cd90d753c9ac6bd67c83736b5609b4e66c1c7 Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com> Date: Mon, 27 Apr 2026 21:46:17 +0200 Subject: [PATCH] =?UTF-8?q?fix(security):=20#81=20Group=20D=20=E2=80=94=20?= =?UTF-8?q?extractor-side=20identifier=20validation=20(squashed)=20(#97)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes M15 from issue #81 — SQL injection via attacker-controlled identifiers in connectors/keboola/extractor.py and connectors/bigquery/extractor.py. Lifted _validate_identifier from src/orchestrator.py into a new src/identifier_validation.py shared module (single source of truth for both layers). Two validator policies: - validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for table_name — matches the orchestrator's rebuild-time check, so dashed names fail fast at extraction rather than being silently dropped. - validate_quoted_identifier (relaxed, accepts dashes/dots) for bucket/dataset/source_table — Keboola in.c-foo and BigQuery my-dataset are legitimate, just need to be safe inside `"..."`. Both extractors skip-and-continue on unsafe rows (logged + counted in failure stats); _extract_via_extension re-validates as defense-in-depth. 71/71 extractor + orchestrator tests pass. Refs #81 Group D. --- CHANGELOG.md | 19 ++ connectors/bigquery/extractor.py | 21 +++ connectors/keboola/extractor.py | 57 +++++- src/identifier_validation.py | 63 +++++++ src/orchestrator.py | 17 +- tests/test_extractor_identifier_validation.py | 172 ++++++++++++++++++ 6 files changed, 336 insertions(+), 13 deletions(-) create mode 100644 src/identifier_validation.py create mode 100644 tests/test_extractor_identifier_validation.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 611d016..f1560e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,6 +133,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C `keboola, bigquery`) is visible at boot rather than at the next failed attach. See `docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`. +- **Security (MEDIUM)**: extractor-side identifier validation (issue + #81 Group D / M15). The Keboola and BigQuery extractors interpolate + `table_name`, `bucket` / `dataset`, and `source_table` from + `table_registry` directly into `CREATE OR REPLACE VIEW`, + `INSERT INTO _meta`, and `COPY ... TO` SQL. Anyone with write access + to `table_registry` (admin, registry-write API) could inject SQL via + these identifiers. New shared module `src/identifier_validation.py` + exposes a strict `validate_identifier` (for our own view names — + `^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`, used for `table_name` so it matches + the orchestrator's rebuild-time check and dashed names fail fast at + extraction rather than being silently dropped at rebuild) and a + relaxed `validate_quoted_identifier` (for upstream-typed names like + Keboola `in.c-foo` / BigQuery `my-dataset`: + `[a-zA-Z0-9_][a-zA-Z0-9_.\-]*`, refusing any character that could + close a `"..."` identifier literal). The orchestrator's existing + `_validate_identifier` was lifted into the new module so both layers + share a single source of truth; both extractors skip-and-continue on + unsafe rows (logged + counted in failure stats; the rest of the + registry still processes). ### Removed diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py index 7917052..919bd37 100644 --- a/connectors/bigquery/extractor.py +++ b/connectors/bigquery/extractor.py @@ -12,6 +12,8 @@ from typing import List, Dict, Any import duckdb +from src.identifier_validation import validate_identifier, validate_quoted_identifier + logger = logging.getLogger(__name__) @@ -90,6 +92,25 @@ def init_extract( dataset = tc.get("bucket", "") # BigQuery dataset source_table = tc.get("source_table", table_name) + # #81 Group D — refuse rows with unsafe identifiers. Same + # rationale as the keboola extractor: registry is admin-controlled + # but anyone with write access can otherwise inject SQL via the + # CREATE VIEW interpolation below. Skip-and-continue. + # `table_name` is the DuckDB view name in the master + # analytics DB and the orchestrator uses the STRICT + # validator there — accept the same constraint upstream + # so a name with `-` or `.` fails fast in extraction + # rather than getting silently dropped at rebuild time. + # `dataset` and `source_table` are upstream-typed (BQ + # naming) so use the relaxed validator for those. + if not (validate_identifier(table_name, "BigQuery table_name") and + validate_quoted_identifier(dataset, "BigQuery dataset") and + validate_quoted_identifier(source_table, "BigQuery source_table")): + stats["errors"].append( + {"table": table_name, "error": "unsafe identifier"} + ) + continue + try: conn.execute( f'CREATE OR REPLACE VIEW "{table_name}" AS ' diff --git a/connectors/keboola/extractor.py b/connectors/keboola/extractor.py index fbcf611..5ddcc00 100644 --- a/connectors/keboola/extractor.py +++ b/connectors/keboola/extractor.py @@ -8,6 +8,13 @@ from typing import List, Dict, Any import duckdb +from src.identifier_validation import ( + is_safe_identifier, + is_safe_quoted_identifier, + validate_identifier, + validate_quoted_identifier, +) + logger = logging.getLogger(__name__) @@ -95,10 +102,37 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str, table_name = tc["name"] query_mode = tc.get("query_mode", "local") + # #81 Group D — refuse rows whose identifiers don't pass the + # whitelist. The registry is admin-controlled but anyone with + # write access can otherwise inject SQL via the CREATE VIEW / + # COPY / SELECT interpolation below. Skip-and-continue rather + # than crashing the whole extraction; valid rows still process. + # + # `table_name` is the DuckDB view name in the master + # analytics DB. The orchestrator uses the STRICT validator + # (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views, + # so any name with `-` or `.` would pass extraction here + # but be silently dropped at orchestrator-rebuild time. + # Use the strict validator here too so the failure is + # caught early and visible in tables_failed. + if not validate_identifier(table_name, "Keboola table_name"): + stats["tables_failed"] += 1 + stats["errors"].append( + {"table": table_name, "error": "unsafe identifier"} + ) + continue + if query_mode == "remote": # Create view pointing to kbc extension (requires re-ATTACH at query time) bucket = tc.get("bucket", "") source_table = tc.get("source_table", table_name) + if not (validate_quoted_identifier(bucket, "Keboola bucket") and + validate_quoted_identifier(source_table, "Keboola source_table")): + stats["tables_failed"] += 1 + stats["errors"].append( + {"table": table_name, "error": "unsafe bucket/source_table"} + ) + continue if use_extension and bucket: conn.execute( f'CREATE OR REPLACE VIEW "{table_name}" AS ' @@ -119,13 +153,19 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str, else: _extract_via_legacy(tc, pq_path, keboola_url, keboola_token) - # Get row count and file size - rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_path}')").fetchone()[0] + # Get row count and file size. pq_path is built from the + # validated table_name above, but escape the parquet path + # literal for defense-in-depth. + safe_pq_lit = pq_path.replace("'", "''") + rows = conn.execute( + f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')" + ).fetchone()[0] size = os.path.getsize(pq_path) # Create view and register in _meta conn.execute( - f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{pq_path}\')' + f'CREATE OR REPLACE VIEW "{table_name}" AS ' + f'SELECT * FROM read_parquet(\'{safe_pq_lit}\')' ) conn.execute( "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')", @@ -172,8 +212,17 @@ def _extract_via_extension( """Extract a table using the DuckDB Keboola extension.""" bucket = tc.get("bucket", "") source_table = tc.get("source_table", tc["name"]) + # #81 Group D — defense-in-depth. The caller already validates these; + # refuse here too in case a future caller forgets. Use the relaxed + # quoted-identifier check that accepts Keboola's `in.c-foo` form. + if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)): + raise ValueError( + f"unsafe bucket/source_table: {bucket!r}/{source_table!r}" + ) + safe_pq_lit = pq_path.replace("'", "''") conn.execute( - f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{pq_path}\' (FORMAT PARQUET)' + f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") ' + f'TO \'{safe_pq_lit}\' (FORMAT PARQUET)' ) diff --git a/src/identifier_validation.py b/src/identifier_validation.py new file mode 100644 index 0000000..53c59f9 --- /dev/null +++ b/src/identifier_validation.py @@ -0,0 +1,63 @@ +"""DuckDB identifier validation — shared across orchestrator and extractors. + +Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the +orchestrator's `_meta.table_name` SQLi (M14, fixed previously by +`src/orchestrator.py:_validate_identifier`). Same trust problem at a +different layer: an attacker who controls the contents of `table_registry` +(admin or whoever can write to that table) can inject SQL via identifier +interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` / +`INSERT INTO _meta` statements. + +Lifted from `src/orchestrator.py` so both layers use the same regex. +""" + +from __future__ import annotations + +import logging +import re + +logger = logging.getLogger(__name__) + +# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body, +# bounded length. Use for orchestrator-side aliases, extension names, view names — +# anything we generate or that comes from a tightly-controlled namespace. +_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") + +# Relaxed identifier allowing the dotted/dashed forms that real upstreams use +# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything +# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`, +# no control chars, no NUL). 128-char cap matches common DB identifier limits. +_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$") + + +def is_safe_identifier(name: object) -> bool: + """Return True if ``name`` is safe to interpolate into a strict + DuckDB identifier position (alias, extension, schema we generate).""" + return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name)) + + +def is_safe_quoted_identifier(name: object) -> bool: + """Return True if ``name`` is safe to interpolate **inside** double-quotes + in a DuckDB identifier position. Allows `.` and `-` for upstream + naming conventions (Keboola buckets like `in.c-events`, BigQuery + datasets) but refuses anything that could close the quote or + inject control characters.""" + return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name)) + + +def validate_identifier(name: str, context: str) -> bool: + """Strict check — returns True if safe, False (with WARNING log) if not. + Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`.""" + if not is_safe_identifier(name): + logger.warning("Rejected unsafe %s identifier: %r", context, name) + return False + return True + + +def validate_quoted_identifier(name: str, context: str) -> bool: + """Relaxed check for upstream-typed identifiers (buckets, datasets). + Accepts dots and dashes; refuses quote/semicolon/control chars.""" + if not is_safe_quoted_identifier(name): + logger.warning("Rejected unsafe %s identifier: %r", context, name) + return False + return True diff --git a/src/orchestrator.py b/src/orchestrator.py index 833145a..0183e2f 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -38,15 +38,14 @@ logger = logging.getLogger(__name__) _rebuild_lock = threading.Lock() -_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") - - -def _validate_identifier(name: str, context: str) -> bool: - """Validate a DuckDB identifier. Returns True if safe, False if not.""" - if not _SAFE_IDENTIFIER.match(name): - logger.warning("Rejected unsafe %s identifier: %r", context, name) - return False - return True +# Identifier validation lives in src/identifier_validation.py so the +# orchestrator and the extractors share the same regex (#81 Group D). +# The local names are kept as aliases so existing call sites need no +# rename — they import from a single source of truth now. +from src.identifier_validation import ( # noqa: E402 + _SAFE_IDENTIFIER, # noqa: F401 (re-exported for any historical caller) + validate_identifier as _validate_identifier, +) def _atomic_swap_db(tmp_path: str, target_path: str) -> None: diff --git a/tests/test_extractor_identifier_validation.py b/tests/test_extractor_identifier_validation.py new file mode 100644 index 0000000..8ae02cb --- /dev/null +++ b/tests/test_extractor_identifier_validation.py @@ -0,0 +1,172 @@ +"""Issue #81 Group D — extractor-layer identifier injection (M15). + +Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`, +but at the extractor layer. Each test feeds a registry row with an +attacker-shaped `name` / `bucket` / `source_table` into the extractor and +asserts the row is skipped (no SQL run, error recorded) while valid +sibling rows in the same registry continue to process. +""" + +from unittest.mock import MagicMock + +import pytest + +from src.identifier_validation import ( + is_safe_identifier, + is_safe_quoted_identifier, + validate_identifier, + validate_quoted_identifier, +) + + +class TestIsSafeIdentifier: + """Strict identifier — for our own view names, aliases.""" + + @pytest.mark.parametrize( + "name", + ["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64], + ) + def test_valid(self, name): + assert is_safe_identifier(name) is True + + @pytest.mark.parametrize( + "name", + [ + "", + "1starts_with_digit", + "has space", + "has-dash", + "has.dot", + "has;semi", + "has\"quote", + "has'quote", + "evil\"; DROP TABLE x; --", + "x" * 65, # too long + "café", # diacritic — DuckDB allows but our policy doesn't + None, + 123, + ["x"], + ], + ) + def test_invalid(self, name): + assert is_safe_identifier(name) is False + + +class TestIsSafeQuotedIdentifier: + """Relaxed identifier for upstream-typed names (Keboola buckets, + BigQuery datasets). Allows `.` and `-`; refuses injection markers.""" + + @pytest.mark.parametrize( + "name", + [ + "orders", + "in.c-events", # Keboola bucket + "out.c-marketing.roi", # nested Keboola bucket form + "my-bq-dataset", + "1starts_with_digit", # numeric start fine inside quotes + "ABC_123", + "a" * 128, # at the limit + ], + ) + def test_valid(self, name): + assert is_safe_quoted_identifier(name) is True + + @pytest.mark.parametrize( + "name", + [ + "", # empty + "has space", + "has\"quote", # would close the surrounding `"` + "has'apostrophe", + "has;semi", + "evil\"; DROP TABLE x; --", + "name\x00with-nul", # NUL + "name\nwith-newline", # control char + "x" * 129, # too long + ".starts_with_dot", + "-starts_with_dash", + None, + 123, + ["x"], + ], + ) + def test_invalid(self, name): + assert is_safe_quoted_identifier(name) is False + + +class TestKeboolaExtractorRefusesUnsafeIdentifiers: + """Verify keboola/extractor.py:run() skips registry rows with unsafe + identifiers but still processes the safe siblings.""" + + def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch): + """Behavioural test: pass mixed registry rows, observe stats.""" + from connectors.keboola import extractor as kbe + + # Use the extractor's run() with use_extension=False so we don't + # need a real Keboola server. We mock _extract_via_legacy to be a + # no-op that creates a tiny parquet so the size lookup works. + import duckdb + + def fake_legacy(tc, pq_path, url, token): + d = duckdb.connect() + d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)") + d.close() + + monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy) + + # Three rows: a good underscore-only name, one with a dash (now + # rejected at the extractor — orchestrator's strict validator + # would have silently dropped it at rebuild time), and one with + # a hostile injection. + rows = [ + {"name": "good_table", "query_mode": "local"}, + {"name": "events-2026", "query_mode": "local"}, + {"name": "evil\"; DROP TABLE x; --", "query_mode": "local"}, + ] + out_dir = tmp_path / "extracts" / "keboola" + result = kbe.run( + str(out_dir), rows, keboola_url="", keboola_token="", + ) + + # Only good_table passes. events-2026 is rejected up front because + # the orchestrator would silently drop it at rebuild time anyway — + # better to fail fast and visibly. Operators with dashed names + # rename them to underscore form. + assert result["tables_extracted"] == 1 + assert result["tables_failed"] == 2 + assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"]) + + def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch): + from connectors.keboola import extractor as kbe + + # An adversarial bucket containing `"` would close the surrounding + # quote and inject SQL. Real Keboola buckets like `in.c-events` + # are accepted by the relaxed validator. + rows = [ + { + "name": "good", + "query_mode": "remote", + "bucket": 'evil"; DROP--', + "source_table": "t", + }, + ] + out_dir = tmp_path / "extracts" / "keboola" + result = kbe.run( + str(out_dir), rows, keboola_url="", keboola_token="", + ) + assert result["tables_failed"] == 1 + assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"]) + + +class TestBigQueryExtractorRefusesUnsafeIdentifiers: + def test_unsafe_dataset_rejected(self): + """BigQuery extractor uses validate_quoted_identifier for dataset.""" + # `my.dataset` is now legitimate (BigQuery datasets can contain dots). + assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True + # An injection attempt via embedded quote — refused. + assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False + # Sanity: a normal dataset name passes. + assert validate_quoted_identifier("marketing", "BigQuery dataset") is True + # The view-name (table_name) still uses the strict validator. + assert validate_identifier("marketing_view", "BigQuery table_name") is True + assert validate_identifier("marketing-view", "BigQuery table_name") is False