diff --git a/src/orchestrator.py b/src/orchestrator.py index b4e97ed..e8e7e78 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -20,6 +20,7 @@ so that remote views resolve correctly. import hashlib import logging import os +import re import threading from pathlib import Path from typing import Dict, List @@ -30,6 +31,16 @@ logger = logging.getLogger(__name__) _rebuild_lock = threading.Lock() +_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") + + +def _validate_identifier(name: str, context: str) -> bool: + """Validate a DuckDB identifier. Returns True if safe, False if not.""" + if not _SAFE_IDENTIFIER.match(name): + logger.warning("Rejected unsafe %s identifier: %r", context, name) + return False + return True + def _get_extracts_dir() -> Path: data_dir = Path(os.environ.get("DATA_DIR", "./data")) @@ -97,6 +108,9 @@ class SyncOrchestrator: logger.debug("Skipping %s — no extract.duckdb", ext_dir.name) continue + if not _validate_identifier(ext_dir.name, "source_name"): + continue + tables = self._attach_and_create_views( conn, ext_dir.name, str(db_file) ) @@ -147,6 +161,8 @@ class SyncOrchestrator: ).fetchall() for table_name, rows, size_bytes, query_mode in meta_rows: + if not _validate_identifier(table_name, "table_name"): + continue conn.execute( f"CREATE OR REPLACE VIEW \"{table_name}\" AS " f"SELECT * FROM {source_name}.\"{table_name}\"" @@ -180,6 +196,11 @@ class SyncOrchestrator: ).fetchall() for alias, extension, url, token_env in rows: + if not _validate_identifier(alias, "remote_attach alias"): + continue + if not _validate_identifier(extension, "remote_attach extension"): + continue + token = os.environ.get(token_env, "") if token_env else "" if token_env and not token: logger.warning( diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index cefdb9a..8a903eb 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -291,3 +291,84 @@ class TestSyncOrchestrator: result1 = orch.rebuild() result2 = orch.rebuild() assert result1 == result2 + + def test_rejects_malicious_source_name(self, setup_env): + """Directory names with SQL injection chars must be skipped entirely.""" + from src.orchestrator import SyncOrchestrator + + # Create a directory whose name contains SQL injection characters + malicious_name = "evil; DROP TABLE users--" + malicious_dir = setup_env["extracts_dir"] / malicious_name + malicious_dir.mkdir() + (malicious_dir / "data").mkdir() + + # Create a valid extract.duckdb inside the malicious directory + db_path = malicious_dir / "extract.duckdb" + conn = duckdb.connect(str(db_path)) + conn.execute( + """CREATE TABLE _meta ( + table_name VARCHAR, description VARCHAR, rows BIGINT, + size_bytes BIGINT, extracted_at TIMESTAMP, + query_mode VARCHAR DEFAULT 'local' + )""" + ) + conn.execute('CREATE TABLE "orders" (id VARCHAR)') + conn.execute( + "INSERT INTO _meta VALUES ('orders', '', 0, 0, current_timestamp, 'local')" + ) + conn.close() + + # Also create a safe source to confirm non-malicious sources still work + _create_mock_extract( + setup_env["extracts_dir"], + "keboola", + [{"name": "orders", "data": [{"id": "1"}]}], + ) + + orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"]) + result = orch.rebuild() + + # The malicious directory must not appear in results + assert malicious_name not in result + # The safe source must still be processed + assert "keboola" in result + + def test_rejects_malicious_table_name(self, setup_env): + """Tables with SQL injection names in _meta must be skipped; safe tables still work.""" + from src.orchestrator import SyncOrchestrator + + source_dir = setup_env["extracts_dir"] / "keboola" + source_dir.mkdir() + (source_dir / "data").mkdir() + + db_path = source_dir / "extract.duckdb" + conn = duckdb.connect(str(db_path)) + conn.execute( + """CREATE TABLE _meta ( + table_name VARCHAR, description VARCHAR, rows BIGINT, + size_bytes BIGINT, extracted_at TIMESTAMP, + query_mode VARCHAR DEFAULT 'local' + )""" + ) + + # Safe table + conn.execute('CREATE TABLE "orders" (id VARCHAR)') + conn.execute("INSERT INTO orders VALUES ('1')") + conn.execute( + "INSERT INTO _meta VALUES ('orders', '', 1, 0, current_timestamp, 'local')" + ) + + # Malicious table_name in _meta (no actual table needed — validation rejects before access) + conn.execute( + "INSERT INTO _meta VALUES ('evil; DROP TABLE users--', '', 0, 0, current_timestamp, 'local')" + ) + conn.close() + + orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"]) + result = orch.rebuild() + + assert "keboola" in result + # Safe table present + assert "orders" in result["keboola"] + # Malicious table_name must not appear + assert "evil; DROP TABLE users--" not in result["keboola"]