From 569cd90d753c9ac6bd67c83736b5609b4e66c1c7 Mon Sep 17 00:00:00 2001
From: ZdenekSrotyr <139972147+ZdenekSrotyr@users.noreply.github.com>
Date: Mon, 27 Apr 2026 21:46:17 +0200
Subject: [PATCH] =?UTF-8?q?fix(security):=20#81=20Group=20D=20=E2=80=94=20?=
 =?UTF-8?q?extractor-side=20identifier=20validation=20(squashed)=20(#97)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes M15 from issue #81 — SQL injection via attacker-controlled
identifiers in connectors/keboola/extractor.py and
connectors/bigquery/extractor.py.

Lifted _validate_identifier from src/orchestrator.py into a new
src/identifier_validation.py shared module (single source of truth for
both layers). Two validator policies:

- validate_identifier (strict, ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$) for
  table_name — matches the orchestrator's rebuild-time check, so dashed
  names fail fast at extraction rather than being silently dropped.
- validate_quoted_identifier (relaxed, accepts dashes/dots) for
  bucket/dataset/source_table — Keboola in.c-foo and BigQuery
  my-dataset are legitimate, just need to be safe inside `"..."`.

Both extractors skip-and-continue on unsafe rows (logged + counted in
failure stats); _extract_via_extension re-validates as defense-in-depth.

71/71 extractor + orchestrator tests pass.
Refs #81 Group D.
---
 CHANGELOG.md                                  |  19 ++
 connectors/bigquery/extractor.py              |  21 +++
 connectors/keboola/extractor.py               |  57 +++++-
 src/identifier_validation.py                  |  63 +++++++
 src/orchestrator.py                           |  17 +-
 tests/test_extractor_identifier_validation.py | 172 ++++++++++++++++++
 6 files changed, 336 insertions(+), 13 deletions(-)
 create mode 100644 src/identifier_validation.py
 create mode 100644 tests/test_extractor_identifier_validation.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 611d016..f1560e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -133,6 +133,25 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
   `keboola, bigquery`) is visible at boot rather than at the next
   failed attach. See
   `docs/superpowers/plans/2026-04-27-issue-81-trust-boundary.md`.
+- **Security (MEDIUM)**: extractor-side identifier validation (issue
+  #81 Group D / M15). The Keboola and BigQuery extractors interpolate
+  `table_name`, `bucket` / `dataset`, and `source_table` from
+  `table_registry` directly into `CREATE OR REPLACE VIEW`,
+  `INSERT INTO _meta`, and `COPY ... TO` SQL. Anyone with write access
+  to `table_registry` (admin, registry-write API) could inject SQL via
+  these identifiers. New shared module `src/identifier_validation.py`
+  exposes a strict `validate_identifier` (for our own view names —
+  `^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`, used for `table_name` so it matches
+  the orchestrator's rebuild-time check and dashed names fail fast at
+  extraction rather than being silently dropped at rebuild) and a
+  relaxed `validate_quoted_identifier` (for upstream-typed names like
+  Keboola `in.c-foo` / BigQuery `my-dataset`:
+  `[a-zA-Z0-9_][a-zA-Z0-9_.\-]*`, refusing any character that could
+  close a `"..."` identifier literal). The orchestrator's existing
+  `_validate_identifier` was lifted into the new module so both layers
+  share a single source of truth; both extractors skip-and-continue on
+  unsafe rows (logged + counted in failure stats; the rest of the
+  registry still processes).
 
 ### Removed
 
diff --git a/connectors/bigquery/extractor.py b/connectors/bigquery/extractor.py
index 7917052..919bd37 100644
--- a/connectors/bigquery/extractor.py
+++ b/connectors/bigquery/extractor.py
@@ -12,6 +12,8 @@ from typing import List, Dict, Any
 
 import duckdb
 
+from src.identifier_validation import validate_identifier, validate_quoted_identifier
+
 logger = logging.getLogger(__name__)
 
 
@@ -90,6 +92,25 @@ def init_extract(
             dataset = tc.get("bucket", "")  # BigQuery dataset
             source_table = tc.get("source_table", table_name)
 
+            # #81 Group D — refuse rows with unsafe identifiers. Same
+            # rationale as the keboola extractor: registry is admin-controlled
+            # but anyone with write access can otherwise inject SQL via the
+            # CREATE VIEW interpolation below. Skip-and-continue.
+            # `table_name` is the DuckDB view name in the master
+            # analytics DB and the orchestrator uses the STRICT
+            # validator there — accept the same constraint upstream
+            # so a name with `-` or `.` fails fast in extraction
+            # rather than getting silently dropped at rebuild time.
+            # `dataset` and `source_table` are upstream-typed (BQ
+            # naming) so use the relaxed validator for those.
+            if not (validate_identifier(table_name, "BigQuery table_name") and
+                    validate_quoted_identifier(dataset, "BigQuery dataset") and
+                    validate_quoted_identifier(source_table, "BigQuery source_table")):
+                stats["errors"].append(
+                    {"table": table_name, "error": "unsafe identifier"}
+                )
+                continue
+
             try:
                 conn.execute(
                     f'CREATE OR REPLACE VIEW "{table_name}" AS '
diff --git a/connectors/keboola/extractor.py b/connectors/keboola/extractor.py
index fbcf611..5ddcc00 100644
--- a/connectors/keboola/extractor.py
+++ b/connectors/keboola/extractor.py
@@ -8,6 +8,13 @@ from typing import List, Dict, Any
 
 import duckdb
 
+from src.identifier_validation import (
+    is_safe_identifier,
+    is_safe_quoted_identifier,
+    validate_identifier,
+    validate_quoted_identifier,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -95,10 +102,37 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
             table_name = tc["name"]
             query_mode = tc.get("query_mode", "local")
 
+            # #81 Group D — refuse rows whose identifiers don't pass the
+            # whitelist. The registry is admin-controlled but anyone with
+            # write access can otherwise inject SQL via the CREATE VIEW /
+            # COPY / SELECT interpolation below. Skip-and-continue rather
+            # than crashing the whole extraction; valid rows still process.
+            #
+            # `table_name` is the DuckDB view name in the master
+            # analytics DB. The orchestrator uses the STRICT validator
+            # (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views,
+            # so any name with `-` or `.` would pass extraction here
+            # but be silently dropped at orchestrator-rebuild time.
+            # Use the strict validator here too so the failure is
+            # caught early and visible in tables_failed.
+            if not validate_identifier(table_name, "Keboola table_name"):
+                stats["tables_failed"] += 1
+                stats["errors"].append(
+                    {"table": table_name, "error": "unsafe identifier"}
+                )
+                continue
+
             if query_mode == "remote":
                 # Create view pointing to kbc extension (requires re-ATTACH at query time)
                 bucket = tc.get("bucket", "")
                 source_table = tc.get("source_table", table_name)
+                if not (validate_quoted_identifier(bucket, "Keboola bucket") and
+                        validate_quoted_identifier(source_table, "Keboola source_table")):
+                    stats["tables_failed"] += 1
+                    stats["errors"].append(
+                        {"table": table_name, "error": "unsafe bucket/source_table"}
+                    )
+                    continue
                 if use_extension and bucket:
                     conn.execute(
                         f'CREATE OR REPLACE VIEW "{table_name}" AS '
@@ -119,13 +153,19 @@ def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str,
                 else:
                     _extract_via_legacy(tc, pq_path, keboola_url, keboola_token)
 
-                # Get row count and file size
-                rows = conn.execute(f"SELECT count(*) FROM read_parquet('{pq_path}')").fetchone()[0]
+                # Get row count and file size. pq_path is built from the
+                # validated table_name above, but escape the parquet path
+                # literal for defense-in-depth.
+                safe_pq_lit = pq_path.replace("'", "''")
+                rows = conn.execute(
+                    f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')"
+                ).fetchone()[0]
                 size = os.path.getsize(pq_path)
 
                 # Create view and register in _meta
                 conn.execute(
-                    f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{pq_path}\')'
+                    f'CREATE OR REPLACE VIEW "{table_name}" AS '
+                    f'SELECT * FROM read_parquet(\'{safe_pq_lit}\')'
                 )
                 conn.execute(
                     "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
@@ -172,8 +212,17 @@ def _extract_via_extension(
     """Extract a table using the DuckDB Keboola extension."""
     bucket = tc.get("bucket", "")
     source_table = tc.get("source_table", tc["name"])
+    # #81 Group D — defense-in-depth. The caller already validates these;
+    # refuse here too in case a future caller forgets. Use the relaxed
+    # quoted-identifier check that accepts Keboola's `in.c-foo` form.
+    if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)):
+        raise ValueError(
+            f"unsafe bucket/source_table: {bucket!r}/{source_table!r}"
+        )
+    safe_pq_lit = pq_path.replace("'", "''")
     conn.execute(
-        f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{pq_path}\' (FORMAT PARQUET)'
+        f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") '
+        f'TO \'{safe_pq_lit}\' (FORMAT PARQUET)'
     )
 
 
diff --git a/src/identifier_validation.py b/src/identifier_validation.py
new file mode 100644
index 0000000..53c59f9
--- /dev/null
+++ b/src/identifier_validation.py
@@ -0,0 +1,63 @@
+"""DuckDB identifier validation — shared across orchestrator and extractors.
+
+Issue #81 Group D — extractor-layer SQL injection (M15) is the peer of the
+orchestrator's `_meta.table_name` SQLi (M14, fixed previously by
+`src/orchestrator.py:_validate_identifier`). Same trust problem at a
+different layer: an attacker who controls the contents of `table_registry`
+(admin or whoever can write to that table) can inject SQL via identifier
+interpolation in a connector's `CREATE OR REPLACE VIEW` / `COPY` /
+`INSERT INTO _meta` statements.
+
+Lifted from `src/orchestrator.py` so both layers use the same regex.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+# Strict DuckDB identifier — letter or underscore start, alphanumeric/underscore body,
+# bounded length. Use for orchestrator-side aliases, extension names, view names —
+# anything we generate or that comes from a tightly-controlled namespace.
+_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
+
+# Relaxed identifier allowing the dotted/dashed forms that real upstreams use
+# — Keboola buckets (`in.c-foo`), BigQuery datasets, etc. Still refuses anything
+# that could break out of a `"..."` quoted identifier (no `"`, no `'`, no `;`,
+# no control chars, no NUL). 128-char cap matches common DB identifier limits.
+_SAFE_QUOTED_IDENTIFIER = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]{0,127}$")
+
+
+def is_safe_identifier(name: object) -> bool:
+    """Return True if ``name`` is safe to interpolate into a strict
+    DuckDB identifier position (alias, extension, schema we generate)."""
+    return isinstance(name, str) and bool(_SAFE_IDENTIFIER.match(name))
+
+
+def is_safe_quoted_identifier(name: object) -> bool:
+    """Return True if ``name`` is safe to interpolate **inside** double-quotes
+    in a DuckDB identifier position. Allows `.` and `-` for upstream
+    naming conventions (Keboola buckets like `in.c-events`, BigQuery
+    datasets) but refuses anything that could close the quote or
+    inject control characters."""
+    return isinstance(name, str) and bool(_SAFE_QUOTED_IDENTIFIER.match(name))
+
+
+def validate_identifier(name: str, context: str) -> bool:
+    """Strict check — returns True if safe, False (with WARNING log) if not.
+    Use for identifiers that should match `[a-zA-Z_][a-zA-Z0-9_]*`."""
+    if not is_safe_identifier(name):
+        logger.warning("Rejected unsafe %s identifier: %r", context, name)
+        return False
+    return True
+
+
+def validate_quoted_identifier(name: str, context: str) -> bool:
+    """Relaxed check for upstream-typed identifiers (buckets, datasets).
+    Accepts dots and dashes; refuses quote/semicolon/control chars."""
+    if not is_safe_quoted_identifier(name):
+        logger.warning("Rejected unsafe %s identifier: %r", context, name)
+        return False
+    return True
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 833145a..0183e2f 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -38,15 +38,14 @@ logger = logging.getLogger(__name__)
 
 _rebuild_lock = threading.Lock()
 
-_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
-
-
-def _validate_identifier(name: str, context: str) -> bool:
-    """Validate a DuckDB identifier. Returns True if safe, False if not."""
-    if not _SAFE_IDENTIFIER.match(name):
-        logger.warning("Rejected unsafe %s identifier: %r", context, name)
-        return False
-    return True
+# Identifier validation lives in src/identifier_validation.py so the
+# orchestrator and the extractors share the same regex (#81 Group D).
+# The local names are kept as aliases so existing call sites need no
+# rename — they import from a single source of truth now.
+from src.identifier_validation import (  # noqa: E402
+    _SAFE_IDENTIFIER,  # noqa: F401  (re-exported for any historical caller)
+    validate_identifier as _validate_identifier,
+)
 
 
 def _atomic_swap_db(tmp_path: str, target_path: str) -> None:
diff --git a/tests/test_extractor_identifier_validation.py b/tests/test_extractor_identifier_validation.py
new file mode 100644
index 0000000..8ae02cb
--- /dev/null
+++ b/tests/test_extractor_identifier_validation.py
@@ -0,0 +1,172 @@
+"""Issue #81 Group D — extractor-layer identifier injection (M15).
+
+Mirror of `_validate_identifier` coverage in `tests/test_orchestrator.py`,
+but at the extractor layer. Each test feeds a registry row with an
+attacker-shaped `name` / `bucket` / `source_table` into the extractor and
+asserts the row is skipped (no SQL run, error recorded) while valid
+sibling rows in the same registry continue to process.
+"""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from src.identifier_validation import (
+    is_safe_identifier,
+    is_safe_quoted_identifier,
+    validate_identifier,
+    validate_quoted_identifier,
+)
+
+
+class TestIsSafeIdentifier:
+    """Strict identifier — for our own view names, aliases."""
+
+    @pytest.mark.parametrize(
+        "name",
+        ["orders", "Orders", "_priv", "ABC_123", "x", "a" * 64],
+    )
+    def test_valid(self, name):
+        assert is_safe_identifier(name) is True
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "",
+            "1starts_with_digit",
+            "has space",
+            "has-dash",
+            "has.dot",
+            "has;semi",
+            "has\"quote",
+            "has'quote",
+            "evil\"; DROP TABLE x; --",
+            "x" * 65,        # too long
+            "café",          # diacritic — DuckDB allows but our policy doesn't
+            None,
+            123,
+            ["x"],
+        ],
+    )
+    def test_invalid(self, name):
+        assert is_safe_identifier(name) is False
+
+
+class TestIsSafeQuotedIdentifier:
+    """Relaxed identifier for upstream-typed names (Keboola buckets,
+    BigQuery datasets). Allows `.` and `-`; refuses injection markers."""
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "orders",
+            "in.c-events",          # Keboola bucket
+            "out.c-marketing.roi",  # nested Keboola bucket form
+            "my-bq-dataset",
+            "1starts_with_digit",   # numeric start fine inside quotes
+            "ABC_123",
+            "a" * 128,              # at the limit
+        ],
+    )
+    def test_valid(self, name):
+        assert is_safe_quoted_identifier(name) is True
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "",                       # empty
+            "has space",
+            "has\"quote",            # would close the surrounding `"`
+            "has'apostrophe",
+            "has;semi",
+            "evil\"; DROP TABLE x; --",
+            "name\x00with-nul",      # NUL
+            "name\nwith-newline",    # control char
+            "x" * 129,                # too long
+            ".starts_with_dot",
+            "-starts_with_dash",
+            None,
+            123,
+            ["x"],
+        ],
+    )
+    def test_invalid(self, name):
+        assert is_safe_quoted_identifier(name) is False
+
+
+class TestKeboolaExtractorRefusesUnsafeIdentifiers:
+    """Verify keboola/extractor.py:run() skips registry rows with unsafe
+    identifiers but still processes the safe siblings."""
+
+    def test_unsafe_table_name_rejected_safe_kept(self, tmp_path, monkeypatch):
+        """Behavioural test: pass mixed registry rows, observe stats."""
+        from connectors.keboola import extractor as kbe
+
+        # Use the extractor's run() with use_extension=False so we don't
+        # need a real Keboola server. We mock _extract_via_legacy to be a
+        # no-op that creates a tiny parquet so the size lookup works.
+        import duckdb
+
+        def fake_legacy(tc, pq_path, url, token):
+            d = duckdb.connect()
+            d.execute(f"COPY (SELECT 1 AS x) TO '{pq_path}' (FORMAT PARQUET)")
+            d.close()
+
+        monkeypatch.setattr(kbe, "_extract_via_legacy", fake_legacy)
+
+        # Three rows: a good underscore-only name, one with a dash (now
+        # rejected at the extractor — orchestrator's strict validator
+        # would have silently dropped it at rebuild time), and one with
+        # a hostile injection.
+        rows = [
+            {"name": "good_table", "query_mode": "local"},
+            {"name": "events-2026", "query_mode": "local"},
+            {"name": "evil\"; DROP TABLE x; --", "query_mode": "local"},
+        ]
+        out_dir = tmp_path / "extracts" / "keboola"
+        result = kbe.run(
+            str(out_dir), rows, keboola_url="", keboola_token="",
+        )
+
+        # Only good_table passes. events-2026 is rejected up front because
+        # the orchestrator would silently drop it at rebuild time anyway —
+        # better to fail fast and visibly. Operators with dashed names
+        # rename them to underscore form.
+        assert result["tables_extracted"] == 1
+        assert result["tables_failed"] == 2
+        assert any("unsafe identifier" in (e.get("error") or "") for e in result["errors"])
+
+    def test_unsafe_bucket_in_remote_row_rejected(self, tmp_path, monkeypatch):
+        from connectors.keboola import extractor as kbe
+
+        # An adversarial bucket containing `"` would close the surrounding
+        # quote and inject SQL. Real Keboola buckets like `in.c-events`
+        # are accepted by the relaxed validator.
+        rows = [
+            {
+                "name": "good",
+                "query_mode": "remote",
+                "bucket": 'evil"; DROP--',
+                "source_table": "t",
+            },
+        ]
+        out_dir = tmp_path / "extracts" / "keboola"
+        result = kbe.run(
+            str(out_dir), rows, keboola_url="", keboola_token="",
+        )
+        assert result["tables_failed"] == 1
+        assert any("unsafe bucket/source_table" in (e.get("error") or "") for e in result["errors"])
+
+
+class TestBigQueryExtractorRefusesUnsafeIdentifiers:
+    def test_unsafe_dataset_rejected(self):
+        """BigQuery extractor uses validate_quoted_identifier for dataset."""
+        # `my.dataset` is now legitimate (BigQuery datasets can contain dots).
+        assert validate_quoted_identifier("my-dataset", "BigQuery dataset") is True
+        # An injection attempt via embedded quote — refused.
+        assert validate_quoted_identifier("evil\"; DROP --", "BigQuery dataset") is False
+        # Sanity: a normal dataset name passes.
+        assert validate_quoted_identifier("marketing", "BigQuery dataset") is True
+        # The view-name (table_name) still uses the strict validator.
+        assert validate_identifier("marketing_view", "BigQuery table_name") is True
+        assert validate_identifier("marketing-view", "BigQuery table_name") is False