fix: reject unsafe SQL identifiers in orchestrator

Adds _validate_identifier() with ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$ regex and
applies it to source_name (directory names), table_name (_meta rows), and
alias/extension (_remote_attach rows) before any SQL interpolation.
Adds two tests covering SQL-injection directory names and malicious _meta entries.
This commit is contained in:
ZdenekSrotyr 2026-04-09 06:51:07 +02:00
parent cb9c566d07
commit 0d3ab5060c
2 changed files with 102 additions and 0 deletions

View file

@ -20,6 +20,7 @@ so that remote views resolve correctly.
import hashlib
import logging
import os
import re
import threading
from pathlib import Path
from typing import Dict, List
@ -30,6 +31,16 @@ logger = logging.getLogger(__name__)
_rebuild_lock = threading.Lock()
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
def _validate_identifier(name: str, context: str) -> bool:
"""Validate a DuckDB identifier. Returns True if safe, False if not."""
if not _SAFE_IDENTIFIER.match(name):
logger.warning("Rejected unsafe %s identifier: %r", context, name)
return False
return True
def _get_extracts_dir() -> Path:
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
@ -97,6 +108,9 @@ class SyncOrchestrator:
logger.debug("Skipping %s — no extract.duckdb", ext_dir.name)
continue
if not _validate_identifier(ext_dir.name, "source_name"):
continue
tables = self._attach_and_create_views(
conn, ext_dir.name, str(db_file)
)
@ -147,6 +161,8 @@ class SyncOrchestrator:
).fetchall()
for table_name, rows, size_bytes, query_mode in meta_rows:
if not _validate_identifier(table_name, "table_name"):
continue
conn.execute(
f"CREATE OR REPLACE VIEW \"{table_name}\" AS "
f"SELECT * FROM {source_name}.\"{table_name}\""
@ -180,6 +196,11 @@ class SyncOrchestrator:
).fetchall()
for alias, extension, url, token_env in rows:
if not _validate_identifier(alias, "remote_attach alias"):
continue
if not _validate_identifier(extension, "remote_attach extension"):
continue
token = os.environ.get(token_env, "") if token_env else ""
if token_env and not token:
logger.warning(

View file

@ -291,3 +291,84 @@ class TestSyncOrchestrator:
result1 = orch.rebuild()
result2 = orch.rebuild()
assert result1 == result2
def test_rejects_malicious_source_name(self, setup_env):
"""Directory names with SQL injection chars must be skipped entirely."""
from src.orchestrator import SyncOrchestrator
# Create a directory whose name contains SQL injection characters
malicious_name = "evil; DROP TABLE users--"
malicious_dir = setup_env["extracts_dir"] / malicious_name
malicious_dir.mkdir()
(malicious_dir / "data").mkdir()
# Create a valid extract.duckdb inside the malicious directory
db_path = malicious_dir / "extract.duckdb"
conn = duckdb.connect(str(db_path))
conn.execute(
"""CREATE TABLE _meta (
table_name VARCHAR, description VARCHAR, rows BIGINT,
size_bytes BIGINT, extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'local'
)"""
)
conn.execute('CREATE TABLE "orders" (id VARCHAR)')
conn.execute(
"INSERT INTO _meta VALUES ('orders', '', 0, 0, current_timestamp, 'local')"
)
conn.close()
# Also create a safe source to confirm non-malicious sources still work
_create_mock_extract(
setup_env["extracts_dir"],
"keboola",
[{"name": "orders", "data": [{"id": "1"}]}],
)
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
result = orch.rebuild()
# The malicious directory must not appear in results
assert malicious_name not in result
# The safe source must still be processed
assert "keboola" in result
def test_rejects_malicious_table_name(self, setup_env):
"""Tables with SQL injection names in _meta must be skipped; safe tables still work."""
from src.orchestrator import SyncOrchestrator
source_dir = setup_env["extracts_dir"] / "keboola"
source_dir.mkdir()
(source_dir / "data").mkdir()
db_path = source_dir / "extract.duckdb"
conn = duckdb.connect(str(db_path))
conn.execute(
"""CREATE TABLE _meta (
table_name VARCHAR, description VARCHAR, rows BIGINT,
size_bytes BIGINT, extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'local'
)"""
)
# Safe table
conn.execute('CREATE TABLE "orders" (id VARCHAR)')
conn.execute("INSERT INTO orders VALUES ('1')")
conn.execute(
"INSERT INTO _meta VALUES ('orders', '', 1, 0, current_timestamp, 'local')"
)
# Malicious table_name in _meta (no actual table needed — validation rejects before access)
conn.execute(
"INSERT INTO _meta VALUES ('evil; DROP TABLE users--', '', 0, 0, current_timestamp, 'local')"
)
conn.close()
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
result = orch.rebuild()
assert "keboola" in result
# Safe table present
assert "orders" in result["keboola"]
# Malicious table_name must not appear
assert "evil; DROP TABLE users--" not in result["keboola"]