fix(security): #81 Group D — extractor-side identifier validation (squashed) (#97)

Closes M15 from issue #81 — SQL injection via attacker-controlled identifiers in connectors/keboola/extractor.py and connectors/bigquery/extractor.py. Lifted _validate_identifier from src/orchestrator.py into a new src/identifier_validation.py shared module (single source of truth for both layers). Two validator policies: - validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for table_name — matches the orchestrator's rebuild-time check, so dashed names fail fast at extraction rather than being silently dropped. - validate_quoted_identifier (relaxed, accepts dashes/dots) for bucket/dataset/source_table — Keboola in.c-foo and BigQuery my-dataset are legitimate, just need to be safe inside `"..."`. Both extractors skip-and-continue on unsafe rows (logged + counted in failure stats); _extract_via_extension re-validates as defense-in-depth. 71/71 extractor + orchestrator tests pass. Refs #81 Group D.
2026-04-27 21:46:17 +02:00 · 2026-04-27 21:46:17 +02:00 · 569cd90d75
commit 569cd90d75
parent 23be8ad46f
6 changed files with 336 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -133,6 +133,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
  `keboola, bigquery`) is visible at boot rather than at the next
  failed attach. See
  `docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
+- **Security (MEDIUM)**: extractor-side identifier validation (issue
+  #81 Group D / M15). The Keboola and BigQuery extractors interpolate
+  `table_name`, `bucket` / `dataset`, and `source_table` from
+  `table_registry` directly into `CREATE OR REPLACE VIEW`,
+  `INSERT INTO _meta`, and `COPY ... TO` SQL. Anyone with write access
+  to `table_registry` (admin, registry-write API) could inject SQL via
+  these identifiers. New shared module `src/identifier_validation.py`
+  exposes a strict `validate_identifier` (for our own view names —
+  `^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`, used for `table_name` so it matches
+  the orchestrator's rebuild-time check and dashed names fail fast at
+  extraction rather than being silently dropped at rebuild) and a
+  relaxed `validate_quoted_identifier` (for upstream-typed names like
+  Keboola `in.c-foo` / BigQuery `my-dataset`:
+  `[a-zA-Z0-9_][a-zA-Z0-9_.\-]*`, refusing any character that could
+  close a `"..."` identifier literal). The orchestrator's existing
+  `_validate_identifier` was lifted into the new module so both layers
+  share a single source of truth; both extractors skip-and-continue on
+  unsafe rows (logged + counted in failure stats; the rest of the
+  registry still processes).

 ### Removed

--- a/connectors/bigquery/extractor.py
+++ b/connectors/bigquery/extractor.py
@ -12,6 +12,8 @@ from typing import List, Dict, Any

 import duckdb

+from src.identifier_validation import validate_identifier, validate_quoted_identifier
+
 logger = logging.getLogger(__name__)


@ -90,6 +92,25 @@ def init_extract(
            dataset = tc.get("bucket", "")  # BigQuery dataset
            source_table = tc.get("source_table", table_name)

+            # #81 Group D — refuse rows with unsafe identifiers. Same
+            # rationale as the keboola extractor: registry is admin-controlled
+            # but anyone with write access can otherwise inject SQL via the
+            # CREATE VIEW interpolation below. Skip-and-continue.
+            # `table_name` is the DuckDB view name in the master
+            # analytics DB and the orchestrator uses the STRICT
+            # validator there — accept the same constraint upstream
+            # so a name with `-` or `.` fails fast in extraction
+            # rather than getting silently dropped at rebuild time.
+            # `dataset` and `source_table` are upstream-typed (BQ
+            # naming) so use the relaxed validator for those.
+            if not (validate_identifier(table_name, "BigQuery table_name") and
+                    validate_quoted_identifier(dataset, "BigQuery dataset") and
+                    validate_quoted_identifier(source_table, "BigQuery source_table")):
+                stats["errors"].append(
+                    {"table": table_name, "error": "unsafe identifier"}
+                )
+                continue
+
            try:
                conn.execute(
                    f'CREATE OR REPLACE VIEW "{table_name}" AS '
--- a/connectors/keboola/extractor.py
+++ b/connectors/keboola/extractor.py
@ -8,6 +8,13 @@ from typing import List, Dict, Any

 import duckdb

+from src.identifier_validation import (
+    is_safe_identifier,
+    is_safe_quoted_identifier,
+    validate_identifier,
+    validate_quoted_identifier,
+)
+
 logger = logging.getLogger(__name__)


@ -95,10 +102,37 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
            table_name = tc["name"]
            query_mode = tc.get("query_mode", "local")

+            # #81 Group D — refuse rows whose identifiers don't pass the
+            # whitelist. The registry is admin-controlled but anyone with
+            # write access can otherwise inject SQL via the CREATE VIEW /
+            # COPY / SELECT interpolation below. Skip-and-continue rather
+            # than crashing the whole extraction; valid rows still process.
+            #
+            # `table_name` is the DuckDB view name in the master
+            # analytics DB. The orchestrator uses the STRICT validator
+            # (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views,
+            # so any name with `-` or `.` would pass extraction here
+            # but be silently dropped at orchestrator-rebuild time.
+            # Use the strict validator here too so the failure is
+            # caught early and visible in tables_failed.
+            if not validate_identifier(table_name, "Keboola table_name"):
+                stats["tables_failed"] += 1
+                stats["errors"].append(
+                    {"table": table_name, "error": "unsafe identifier"}
+                )
+                continue
+
            if query_mode == "remote":
                # Create view pointing to kbc extension (requires re-ATTACH at query time)
                bucket = tc.get("bucket", "")
                source_table = tc.get("source_table", table_name)
+                if not (validate_quoted_identifier(bucket, "Keboola bucket") and
+                        validate_quoted_identifier(source_table, "Keboola source_table")):
+                    stats["tables_failed"] += 1
+                    stats["errors"].append(
+                        {"table": table_name, "error": "unsafe bucket/source_table"}
+                    )
+                    continue
                if use_extension and bucket:
                    conn.execute(
                        f'CREATE OR REPLACE VIEW "{table_name}" AS '
@ -119,13 +153,19 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
                else:
                    _extract_via_legacy(tc, pq_path, keboola_url, keboola_token)

-                # Get row count and file size
-                rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_path}')").fetchone()[0]
+                # Get row count and file size. pq_path is built from the
+                # validated table_name above, but escape the parquet path
+                # literal for defense-in-depth.
+                safe_pq_lit = pq_path.replace("'", "''")
+                rows = conn.execute(
+                    f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')"
+                ).fetchone()[0]
                size = os.path.getsize(pq_path)

                # Create view and register in _meta
                conn.execute(
-                    f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{pq_path}\')'
+                    f'CREATE OR REPLACE VIEW "{table_name}" AS '
+                    f'SELECT * FROM read_parquet(\'{safe_pq_lit}\')'
                )
                conn.execute(
                    "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
@ -172,8 +212,17 @@ def _extract_via_extension(
    """Extract a table using the DuckDB Keboola extension."""
    bucket = tc.get("bucket", "")
    source_table = tc.get("source_table", tc["name"])
+    # #81 Group D — defense-in-depth. The caller already validates these;
+    # refuse here too in case a future caller forgets. Use the relaxed
+    # quoted-identifier check that accepts Keboola's `in.c-foo` form.
+    if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)):
+        raise ValueError(
+            f"unsafe bucket/source_table: {bucket!r}/{source_table!r}"
+        )
+    safe_pq_lit = pq_path.replace("'", "''")
    conn.execute(
-        f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{pq_path}\' (FORMAT PARQUET)'
+        f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") '
+        f'TO \'{safe_pq_lit}\' (FORMAT PARQUET)'
    )


--- a/src/identifier_validation.py
+++ b/src/identifier_validation.py
@ -0,0 +1,63 @@
+"""DuckDB identifier validation — shared across orchestrator and extractors.
+
+Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the
+orchestrator's `_meta.table_name` SQLi (M14, fixed previously by
+`src/orchestrator.py:_validate_identifier`). Same trust problem at a
+different layer: an attacker who controls the contents of `table_registry`
+(admin or whoever can write to that table) can inject SQL via identifier
+interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` /
+`INSERT INTO _meta` statements.
+
+Lifted from `src/orchestrator.py` so both layers use the same regex.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body,
+# bounded length. Use for orchestrator-side aliases, extension names, view names —
+# anything we generate or that comes from a tightly-controlled namespace.
+_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
+
+# Relaxed identifier allowing the dotted/dashed forms that real upstreams use
+# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything
+# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`,
+# no control chars, no NUL). 128-char cap matches common DB identifier limits.
+_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$")
+
+
+def is_safe_identifier(name: object) -> bool:
+    """Return True if ``name`` is safe to interpolate into a strict
+    DuckDB identifier position (alias, extension, schema we generate)."""
+    return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name))
+
+
+def is_safe_quoted_identifier(name: object) -> bool:
+    """Return True if ``name`` is safe to interpolate **inside** double-quotes
+    in a DuckDB identifier position. Allows `.` and `-` for upstream
+    naming conventions (Keboola buckets like `in.c-events`, BigQuery
+    datasets) but refuses anything that could close the quote or
+    inject control characters."""
+    return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name))
+
+
+def validate_identifier(name: str, context: str) -> bool:
+    """Strict check — returns True if safe, False (with WARNING log) if not.
+    Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`."""
+    if not is_safe_identifier(name):
+        logger.warning("Rejected unsafe %s identifier: %r", context, name)
+        return False
+    return True
+
+
+def validate_quoted_identifier(name: str, context: str) -> bool:
+    """Relaxed check for upstream-typed identifiers (buckets, datasets).
+    Accepts dots and dashes; refuses quote/semicolon/control chars."""
+    if not is_safe_quoted_identifier(name):
+        logger.warning("Rejected unsafe %s identifier: %r", context, name)
+        return False
+    return True
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@ -38,15 +38,14 @@ logger = logging.getLogger(__name__)

 _rebuild_lock = threading.Lock()

-_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
-
-
-def _validate_identifier(name: str, context: str) -> bool:
-    """Validate a DuckDB identifier. Returns True if safe, False if not."""
-    if not _SAFE_IDENTIFIER.match(name):
-        logger.warning("Rejected unsafe %s identifier: %r", context, name)
-        return False
-    return True
+# Identifier validation lives in src/identifier_validation.py so the
+# orchestrator and the extractors share the same regex (#81 Group D).
+# The local names are kept as aliases so existing call sites need no
+# rename — they import from a single source of truth now.
+from src.identifier_validation import (  # noqa: E402
+    _SAFE_IDENTIFIER,  # noqa: F401  (re-exported for any historical caller)
+    validate_identifier as _validate_identifier,
+)


 def _atomic_swap_db(tmp_path: str, target_path: str) -> None:
--- a/tests/test_extractor_identifier_validation.py
+++ b/tests/test_extractor_identifier_validation.py
@ -0,0 +1,172 @@
+"""Issue #81 Group D — extractor-layer identifier injection (M15).
+
+Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`,
+but at the extractor layer. Each test feeds a registry row with an
+attacker-shaped `name` / `bucket` / `source_table` into the extractor and
+asserts the row is skipped (no SQL run, error recorded) while valid
+sibling rows in the same registry continue to process.
+"""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from src.identifier_validation import (
+    is_safe_identifier,
+    is_safe_quoted_identifier,
+    validate_identifier,
+    validate_quoted_identifier,
+)
+
+
+class TestIsSafeIdentifier:
+    """Strict identifier — for our own view names, aliases."""
+
+    @pytest.mark.parametrize(
+        "name",
+        ["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64],
+    )
+    def test_valid(self, name):
+        assert is_safe_identifier(name) is True
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "",
+            "1starts_with_digit",
+            "has space",
+            "has-dash",
+            "has.dot",
+            "has;semi",
+            "has\"quote",
+            "has'quote",
+            "evil\"; DROP TABLE x; --",
+            "x" * 65,        # too long
+            "café",          # diacritic — DuckDB allows but our policy doesn't
+            None,
+            123,
+            ["x"],
+        ],
+    )
+    def test_invalid(self, name):
+        assert is_safe_identifier(name) is False
+
+
+class TestIsSafeQuotedIdentifier:
+    """Relaxed identifier for upstream-typed names (Keboola buckets,
+    BigQuery datasets). Allows `.` and `-`; refuses injection markers."""
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "orders",
+            "in.c-events",          # Keboola bucket
+            "out.c-marketing.roi",  # nested Keboola bucket form
+            "my-bq-dataset",
+            "1starts_with_digit",   # numeric start fine inside quotes
+            "ABC_123",
+            "a" * 128,              # at the limit
+        ],
+    )
+    def test_valid(self, name):
+        assert is_safe_quoted_identifier(name) is True
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "",                       # empty
+            "has space",
+            "has\"quote",            # would close the surrounding `"`
+            "has'apostrophe",
+            "has;semi",
+            "evil\"; DROP TABLE x; --",
+            "name\x00with-nul",      # NUL
+            "name\nwith-newline",    # control char
+            "x" * 129,                # too long
+            ".starts_with_dot",
+            "-starts_with_dash",
+            None,
+            123,
+            ["x"],
+        ],
+    )
+    def test_invalid(self, name):
+        assert is_safe_quoted_identifier(name) is False
+
+
+class TestKeboolaExtractorRefusesUnsafeIdentifiers:
+    """Verify keboola/extractor.py:run() skips registry rows with unsafe
+    identifiers but still processes the safe siblings."""
+
+    def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch):
+        """Behavioural test: pass mixed registry rows, observe stats."""
+        from connectors.keboola import extractor as kbe
+
+        # Use the extractor's run() with use_extension=False so we don't
+        # need a real Keboola server. We mock _extract_via_legacy to be a
+        # no-op that creates a tiny parquet so the size lookup works.
+        import duckdb
+
+        def fake_legacy(tc, pq_path, url, token):
+            d = duckdb.connect()
+            d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)")
+            d.close()
+
+        monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy)
+
+        # Three rows: a good underscore-only name, one with a dash (now
+        # rejected at the extractor — orchestrator's strict validator
+        # would have silently dropped it at rebuild time), and one with
+        # a hostile injection.
+        rows = [
+            {"name": "good_table", "query_mode": "local"},
+            {"name": "events-2026", "query_mode": "local"},
+            {"name": "evil\"; DROP TABLE x; --", "query_mode": "local"},
+        ]
+        out_dir = tmp_path / "extracts" / "keboola"
+        result = kbe.run(
+            str(out_dir), rows, keboola_url="", keboola_token="",
+        )
+
+        # Only good_table passes. events-2026 is rejected up front because
+        # the orchestrator would silently drop it at rebuild time anyway —
+        # better to fail fast and visibly. Operators with dashed names
+        # rename them to underscore form.
+        assert result["tables_extracted"] == 1
+        assert result["tables_failed"] == 2
+        assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"])
+
+    def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch):
+        from connectors.keboola import extractor as kbe
+
+        # An adversarial bucket containing `"` would close the surrounding
+        # quote and inject SQL. Real Keboola buckets like `in.c-events`
+        # are accepted by the relaxed validator.
+        rows = [
+            {
+                "name": "good",
+                "query_mode": "remote",
+                "bucket": 'evil"; DROP--',
+                "source_table": "t",
+            },
+        ]
+        out_dir = tmp_path / "extracts" / "keboola"
+        result = kbe.run(
+            str(out_dir), rows, keboola_url="", keboola_token="",
+        )
+        assert result["tables_failed"] == 1
+        assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"])
+
+
+class TestBigQueryExtractorRefusesUnsafeIdentifiers:
+    def test_unsafe_dataset_rejected(self):
+        """BigQuery extractor uses validate_quoted_identifier for dataset."""
+        # `my.dataset` is now legitimate (BigQuery datasets can contain dots).
+        assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True
+        # An injection attempt via embedded quote — refused.
+        assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False
+        # Sanity: a normal dataset name passes.
+        assert validate_quoted_identifier("marketing", "BigQuery dataset") is True
+        # The view-name (table_name) still uses the strict validator.
+        assert validate_identifier("marketing_view", "BigQuery table_name") is True
+        assert validate_identifier("marketing-view", "BigQuery table_name") is False