fix: reject unsafe SQL identifiers in orchestrator
Adds _validate_identifier() with ^[a-zA-Z_][a-zA-Z0-9_]{0,63}$ regex and
applies it to source_name (directory names), table_name (_meta rows), and
alias/extension (_remote_attach rows) before any SQL interpolation.
Adds two tests covering SQL-injection directory names and malicious _meta entries.
This commit is contained in:
parent
cb9c566d07
commit
0d3ab5060c
2 changed files with 102 additions and 0 deletions
|
|
@ -20,6 +20,7 @@ so that remote views resolve correctly.
|
|||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
|
@ -30,6 +31,16 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
_rebuild_lock = threading.Lock()
|
||||
|
||||
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
|
||||
|
||||
|
||||
def _validate_identifier(name: str, context: str) -> bool:
|
||||
"""Validate a DuckDB identifier. Returns True if safe, False if not."""
|
||||
if not _SAFE_IDENTIFIER.match(name):
|
||||
logger.warning("Rejected unsafe %s identifier: %r", context, name)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _get_extracts_dir() -> Path:
|
||||
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
|
||||
|
|
@ -97,6 +108,9 @@ class SyncOrchestrator:
|
|||
logger.debug("Skipping %s — no extract.duckdb", ext_dir.name)
|
||||
continue
|
||||
|
||||
if not _validate_identifier(ext_dir.name, "source_name"):
|
||||
continue
|
||||
|
||||
tables = self._attach_and_create_views(
|
||||
conn, ext_dir.name, str(db_file)
|
||||
)
|
||||
|
|
@ -147,6 +161,8 @@ class SyncOrchestrator:
|
|||
).fetchall()
|
||||
|
||||
for table_name, rows, size_bytes, query_mode in meta_rows:
|
||||
if not _validate_identifier(table_name, "table_name"):
|
||||
continue
|
||||
conn.execute(
|
||||
f"CREATE OR REPLACE VIEW \"{table_name}\" AS "
|
||||
f"SELECT * FROM {source_name}.\"{table_name}\""
|
||||
|
|
@ -180,6 +196,11 @@ class SyncOrchestrator:
|
|||
).fetchall()
|
||||
|
||||
for alias, extension, url, token_env in rows:
|
||||
if not _validate_identifier(alias, "remote_attach alias"):
|
||||
continue
|
||||
if not _validate_identifier(extension, "remote_attach extension"):
|
||||
continue
|
||||
|
||||
token = os.environ.get(token_env, "") if token_env else ""
|
||||
if token_env and not token:
|
||||
logger.warning(
|
||||
|
|
|
|||
|
|
@ -291,3 +291,84 @@ class TestSyncOrchestrator:
|
|||
result1 = orch.rebuild()
|
||||
result2 = orch.rebuild()
|
||||
assert result1 == result2
|
||||
|
||||
def test_rejects_malicious_source_name(self, setup_env):
|
||||
"""Directory names with SQL injection chars must be skipped entirely."""
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
# Create a directory whose name contains SQL injection characters
|
||||
malicious_name = "evil; DROP TABLE users--"
|
||||
malicious_dir = setup_env["extracts_dir"] / malicious_name
|
||||
malicious_dir.mkdir()
|
||||
(malicious_dir / "data").mkdir()
|
||||
|
||||
# Create a valid extract.duckdb inside the malicious directory
|
||||
db_path = malicious_dir / "extract.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
conn.execute(
|
||||
"""CREATE TABLE _meta (
|
||||
table_name VARCHAR, description VARCHAR, rows BIGINT,
|
||||
size_bytes BIGINT, extracted_at TIMESTAMP,
|
||||
query_mode VARCHAR DEFAULT 'local'
|
||||
)"""
|
||||
)
|
||||
conn.execute('CREATE TABLE "orders" (id VARCHAR)')
|
||||
conn.execute(
|
||||
"INSERT INTO _meta VALUES ('orders', '', 0, 0, current_timestamp, 'local')"
|
||||
)
|
||||
conn.close()
|
||||
|
||||
# Also create a safe source to confirm non-malicious sources still work
|
||||
_create_mock_extract(
|
||||
setup_env["extracts_dir"],
|
||||
"keboola",
|
||||
[{"name": "orders", "data": [{"id": "1"}]}],
|
||||
)
|
||||
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
result = orch.rebuild()
|
||||
|
||||
# The malicious directory must not appear in results
|
||||
assert malicious_name not in result
|
||||
# The safe source must still be processed
|
||||
assert "keboola" in result
|
||||
|
||||
def test_rejects_malicious_table_name(self, setup_env):
|
||||
"""Tables with SQL injection names in _meta must be skipped; safe tables still work."""
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
source_dir = setup_env["extracts_dir"] / "keboola"
|
||||
source_dir.mkdir()
|
||||
(source_dir / "data").mkdir()
|
||||
|
||||
db_path = source_dir / "extract.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
conn.execute(
|
||||
"""CREATE TABLE _meta (
|
||||
table_name VARCHAR, description VARCHAR, rows BIGINT,
|
||||
size_bytes BIGINT, extracted_at TIMESTAMP,
|
||||
query_mode VARCHAR DEFAULT 'local'
|
||||
)"""
|
||||
)
|
||||
|
||||
# Safe table
|
||||
conn.execute('CREATE TABLE "orders" (id VARCHAR)')
|
||||
conn.execute("INSERT INTO orders VALUES ('1')")
|
||||
conn.execute(
|
||||
"INSERT INTO _meta VALUES ('orders', '', 1, 0, current_timestamp, 'local')"
|
||||
)
|
||||
|
||||
# Malicious table_name in _meta (no actual table needed — validation rejects before access)
|
||||
conn.execute(
|
||||
"INSERT INTO _meta VALUES ('evil; DROP TABLE users--', '', 0, 0, current_timestamp, 'local')"
|
||||
)
|
||||
conn.close()
|
||||
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
result = orch.rebuild()
|
||||
|
||||
assert "keboola" in result
|
||||
# Safe table present
|
||||
assert "orders" in result["keboola"]
|
||||
# Malicious table_name must not appear
|
||||
assert "evil; DROP TABLE users--" not in result["keboola"]
|
||||
|
|
|
|||
Loading…
Reference in a new issue