fix: reject unsafe SQL identifiers in orchestrator

Adds _validate_identifier() with ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$ regex and applies it to source_name (directory names), table_name (_meta rows), and alias/extension (_remote_attach rows) before any SQL interpolation. Adds two tests covering SQL-injection directory names and malicious _meta entries.
2026-04-09 06:51:07 +02:00 · 2026-04-09 06:51:07 +02:00 · 0d3ab5060c
commit 0d3ab5060c
parent cb9c566d07
2 changed files with 102 additions and 0 deletions
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@ -20,6 +20,7 @@ so that remote views resolve correctly.
 import hashlib
 import logging
 import os
+import re
 import threading
 from pathlib import Path
 from typing import Dict, List
@ -30,6 +31,16 @@ logger = logging.getLogger(__name__)

 _rebuild_lock = threading.Lock()

+_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
+
+
+def _validate_identifier(name: str, context: str) -> bool:
+    """Validate a DuckDB identifier. Returns True if safe, False if not."""
+    if not _SAFE_IDENTIFIER.match(name):
+        logger.warning("Rejected unsafe %s identifier: %r", context, name)
+        return False
+    return True
+

 def _get_extracts_dir() -> Path:
    data_dir = Path(os.environ.get("DATA_DIR", "./data"))
@ -97,6 +108,9 @@ class SyncOrchestrator:
                    logger.debug("Skipping %s — no extract.duckdb", ext_dir.name)
                    continue

+                if not _validate_identifier(ext_dir.name, "source_name"):
+                    continue
+
                tables = self._attach_and_create_views(
                    conn, ext_dir.name, str(db_file)
                )
@ -147,6 +161,8 @@ class SyncOrchestrator:
            ).fetchall()

            for table_name, rows, size_bytes, query_mode in meta_rows:
+                if not _validate_identifier(table_name, "table_name"):
+                    continue
                conn.execute(
                    f"CREATE OR REPLACE VIEW \"{table_name}\" AS "
                    f"SELECT * FROM {source_name}.\"{table_name}\""
@ -180,6 +196,11 @@ class SyncOrchestrator:
        ).fetchall()

        for alias, extension, url, token_env in rows:
+            if not _validate_identifier(alias, "remote_attach alias"):
+                continue
+            if not _validate_identifier(extension, "remote_attach extension"):
+                continue
+
            token = os.environ.get(token_env, "") if token_env else ""
            if token_env and not token:
                logger.warning(
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@ -291,3 +291,84 @@ class TestSyncOrchestrator:
        result1 = orch.rebuild()
        result2 = orch.rebuild()
        assert result1 == result2
+
+    def test_rejects_malicious_source_name(self, setup_env):
+        """Directory names with SQL injection chars must be skipped entirely."""
+        from src.orchestrator import SyncOrchestrator
+
+        # Create a directory whose name contains SQL injection characters
+        malicious_name = "evil; DROP TABLE users--"
+        malicious_dir = setup_env["extracts_dir"] / malicious_name
+        malicious_dir.mkdir()
+        (malicious_dir / "data").mkdir()
+
+        # Create a valid extract.duckdb inside the malicious directory
+        db_path = malicious_dir / "extract.duckdb"
+        conn = duckdb.connect(str(db_path))
+        conn.execute(
+            """CREATE TABLE _meta (
+            table_name VARCHAR, description VARCHAR, rows BIGINT,
+            size_bytes BIGINT, extracted_at TIMESTAMP,
+            query_mode VARCHAR DEFAULT 'local'
+        )"""
+        )
+        conn.execute('CREATE TABLE "orders" (id VARCHAR)')
+        conn.execute(
+            "INSERT INTO _meta VALUES ('orders', '', 0, 0, current_timestamp, 'local')"
+        )
+        conn.close()
+
+        # Also create a safe source to confirm non-malicious sources still work
+        _create_mock_extract(
+            setup_env["extracts_dir"],
+            "keboola",
+            [{"name": "orders", "data": [{"id": "1"}]}],
+        )
+
+        orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
+        result = orch.rebuild()
+
+        # The malicious directory must not appear in results
+        assert malicious_name not in result
+        # The safe source must still be processed
+        assert "keboola" in result
+
+    def test_rejects_malicious_table_name(self, setup_env):
+        """Tables with SQL injection names in _meta must be skipped; safe tables still work."""
+        from src.orchestrator import SyncOrchestrator
+
+        source_dir = setup_env["extracts_dir"] / "keboola"
+        source_dir.mkdir()
+        (source_dir / "data").mkdir()
+
+        db_path = source_dir / "extract.duckdb"
+        conn = duckdb.connect(str(db_path))
+        conn.execute(
+            """CREATE TABLE _meta (
+            table_name VARCHAR, description VARCHAR, rows BIGINT,
+            size_bytes BIGINT, extracted_at TIMESTAMP,
+            query_mode VARCHAR DEFAULT 'local'
+        )"""
+        )
+
+        # Safe table
+        conn.execute('CREATE TABLE "orders" (id VARCHAR)')
+        conn.execute("INSERT INTO orders VALUES ('1')")
+        conn.execute(
+            "INSERT INTO _meta VALUES ('orders', '', 1, 0, current_timestamp, 'local')"
+        )
+
+        # Malicious table_name in _meta (no actual table needed — validation rejects before access)
+        conn.execute(
+            "INSERT INTO _meta VALUES ('evil; DROP TABLE users--', '', 0, 0, current_timestamp, 'local')"
+        )
+        conn.close()
+
+        orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
+        result = orch.rebuild()
+
+        assert "keboola" in result
+        # Safe table present
+        assert "orders" in result["keboola"]
+        # Malicious table_name must not appear
+        assert "evil; DROP TABLE users--" not in result["keboola"]