"""DuckDB connection management and schema versioning. Provides get_system_db() for the system state database and get_analytics_db() for the analytics database with parquet views. """ import logging import os import re import shutil from pathlib import Path import duckdb logger = logging.getLogger(__name__) _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") SCHEMA_VERSION = 3 _SYSTEM_SCHEMA = """ CREATE TABLE IF NOT EXISTS schema_version ( version INTEGER NOT NULL, applied_at TIMESTAMP DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS users ( id VARCHAR PRIMARY KEY, email VARCHAR UNIQUE NOT NULL, name VARCHAR, role VARCHAR DEFAULT 'analyst', password_hash VARCHAR, setup_token VARCHAR, setup_token_created TIMESTAMP, reset_token VARCHAR, reset_token_created TIMESTAMP, created_at TIMESTAMP DEFAULT current_timestamp, updated_at TIMESTAMP ); CREATE TABLE IF NOT EXISTS sync_state ( table_id VARCHAR PRIMARY KEY, last_sync TIMESTAMP, rows BIGINT, file_size_bytes BIGINT, uncompressed_size_bytes BIGINT, columns INTEGER, hash VARCHAR, status VARCHAR DEFAULT 'ok', error TEXT ); CREATE TABLE IF NOT EXISTS sync_history ( id VARCHAR PRIMARY KEY, table_id VARCHAR NOT NULL, synced_at TIMESTAMP NOT NULL, rows BIGINT, duration_ms INTEGER, status VARCHAR, error TEXT ); CREATE TABLE IF NOT EXISTS user_sync_settings ( user_id VARCHAR NOT NULL, dataset VARCHAR NOT NULL, enabled BOOLEAN DEFAULT false, table_mode VARCHAR DEFAULT 'all', tables JSON, updated_at TIMESTAMP, PRIMARY KEY (user_id, dataset) ); CREATE TABLE IF NOT EXISTS knowledge_items ( id VARCHAR PRIMARY KEY, title VARCHAR NOT NULL, content TEXT, category VARCHAR, tags JSON, status VARCHAR DEFAULT 'pending', contributors JSON, source_user VARCHAR, audience VARCHAR, created_at TIMESTAMP DEFAULT current_timestamp, updated_at TIMESTAMP ); CREATE TABLE IF NOT EXISTS knowledge_votes ( item_id VARCHAR NOT NULL, user_id VARCHAR NOT NULL, vote INTEGER, voted_at TIMESTAMP DEFAULT current_timestamp, PRIMARY KEY (item_id, user_id) ); CREATE TABLE IF NOT EXISTS audit_log ( id VARCHAR PRIMARY KEY, timestamp TIMESTAMP NOT NULL DEFAULT current_timestamp, user_id VARCHAR, action VARCHAR NOT NULL, resource VARCHAR, params JSON, result VARCHAR, duration_ms INTEGER ); CREATE TABLE IF NOT EXISTS telegram_links ( user_id VARCHAR PRIMARY KEY, chat_id BIGINT NOT NULL, linked_at TIMESTAMP DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS pending_codes ( code VARCHAR PRIMARY KEY, chat_id BIGINT NOT NULL, created_at TIMESTAMP DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS script_registry ( id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL, owner VARCHAR, schedule VARCHAR, source TEXT NOT NULL, deployed_at TIMESTAMP DEFAULT current_timestamp, last_run TIMESTAMP, last_status VARCHAR ); CREATE TABLE IF NOT EXISTS table_registry ( id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL, source_type VARCHAR, bucket VARCHAR, source_table VARCHAR, sync_strategy VARCHAR DEFAULT 'full_refresh', query_mode VARCHAR DEFAULT 'local', sync_schedule VARCHAR, profile_after_sync BOOLEAN DEFAULT true, primary_key VARCHAR, folder VARCHAR, description TEXT, registered_by VARCHAR, is_public BOOLEAN DEFAULT true, registered_at TIMESTAMP DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS table_profiles ( table_id VARCHAR PRIMARY KEY, profile JSON NOT NULL, profiled_at TIMESTAMP DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS dataset_permissions ( user_id VARCHAR NOT NULL, dataset VARCHAR NOT NULL, access VARCHAR DEFAULT 'read', PRIMARY KEY (user_id, dataset) ); CREATE TABLE IF NOT EXISTS access_requests ( id VARCHAR PRIMARY KEY, user_id VARCHAR NOT NULL, user_email VARCHAR NOT NULL, table_id VARCHAR NOT NULL, reason TEXT, status VARCHAR DEFAULT 'pending', reviewed_by VARCHAR, reviewed_at TIMESTAMP, created_at TIMESTAMP DEFAULT current_timestamp ); """ import threading _system_db_lock = threading.Lock() _system_db_conn: duckdb.DuckDBPyConnection | None = None _system_db_path: str | None = None def _get_data_dir() -> Path: return Path(os.environ.get("DATA_DIR", "./data")) def get_system_db() -> duckdb.DuckDBPyConnection: """Get a connection to the system state database. Uses a single shared connection per DATA_DIR to avoid DuckDB lock conflicts between the main app and background tasks. Returns a cursor so callers can safely close() it without closing the underlying connection. """ global _system_db_conn, _system_db_path db_path = str(_get_data_dir() / "state" / "system.duckdb") with _system_db_lock: if _system_db_conn is None or _system_db_path != db_path: # Close old connection if DATA_DIR changed (e.g., in tests) if _system_db_conn is not None: try: _system_db_conn.close() except Exception: pass Path(db_path).parent.mkdir(parents=True, exist_ok=True) _system_db_conn = duckdb.connect(db_path) _system_db_path = db_path _ensure_schema(_system_db_conn) return _system_db_conn.cursor() def get_analytics_db() -> duckdb.DuckDBPyConnection: """Get a connection to the analytics database (parquet views).""" db_path = _get_data_dir() / "analytics" / "server.duckdb" db_path.parent.mkdir(parents=True, exist_ok=True) return duckdb.connect(str(db_path)) def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection: """Read-only connection to analytics DB. Blocks writes and external access. ATTACHes extract.duckdb files so views that reference them work. """ db_path = _get_data_dir() / "analytics" / "server.duckdb" if not db_path.exists(): db_path.parent.mkdir(parents=True, exist_ok=True) conn = duckdb.connect(str(db_path), read_only=False) try: conn.execute("SET enable_external_access = false") except Exception: pass return conn conn = duckdb.connect(str(db_path), read_only=True) # ATTACH extract.duckdb files FIRST so views referencing them work extracts_dir = _get_data_dir() / "extracts" if extracts_dir.exists(): for ext_dir in sorted(extracts_dir.iterdir()): db_file = ext_dir / "extract.duckdb" if db_file.exists() and ext_dir.is_dir(): if not _SAFE_IDENTIFIER.match(ext_dir.name): continue try: conn.execute(f"ATTACH '{db_file}' AS {ext_dir.name} (READ_ONLY)") except Exception: pass # Note: external_access stays enabled because views use read_parquet() on local files. # File-path-based attacks are blocked by the SQL blocklist in app/api/query.py. return conn _V1_TO_V2_MIGRATIONS = [ "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_type VARCHAR", "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS bucket VARCHAR", "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_table VARCHAR", "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS query_mode VARCHAR DEFAULT 'local'", "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS sync_schedule VARCHAR", "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS profile_after_sync BOOLEAN DEFAULT true", ] _V2_TO_V3_MIGRATIONS = [ "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS is_public BOOLEAN DEFAULT true", ] def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: """Create tables if they don't exist. Apply migrations if schema version changed.""" current = get_schema_version(conn) if current < SCHEMA_VERSION: # Snapshot before migration for rollback support if current > 0: try: db_path = Path(os.environ.get("DATA_DIR", "./data")) / "state" / "system.duckdb" if db_path.exists(): snapshot = db_path.parent / "system.duckdb.pre-migrate" shutil.copy2(str(db_path), str(snapshot)) logger.info("Pre-migration snapshot saved: %s", snapshot) except Exception as e: logger.warning("Could not create pre-migration snapshot: %s", e) conn.execute(_SYSTEM_SCHEMA) if current == 0: conn.execute( "INSERT INTO schema_version (version) VALUES (?)", [SCHEMA_VERSION], ) else: if current < 2: for sql in _V1_TO_V2_MIGRATIONS: conn.execute(sql) if current < 3: for sql in _V2_TO_V3_MIGRATIONS: conn.execute(sql) conn.execute( "UPDATE schema_version SET version = ?, applied_at = current_timestamp", [SCHEMA_VERSION], ) def get_schema_version(conn: duckdb.DuckDBPyConnection) -> int: """Get current schema version. Returns 0 if no schema exists.""" try: result = conn.execute("SELECT MAX(version) FROM schema_version").fetchone() return result[0] if result and result[0] else 0 except duckdb.CatalogException: return 0 def close_system_db() -> None: """Close the shared system DB connection. Called on app shutdown.""" global _system_db_conn, _system_db_path if _system_db_conn: try: _system_db_conn.close() except Exception: pass _system_db_conn = None _system_db_path = None