CalVer CI (release.yml) with stable/dev channels, health endpoint with version/channel/schema_version, JWT secret auto-generation with file persistence, smoke test script + Docker-in-CI, pre-migration snapshot, /api/admin/configure for headless setup, /api/admin/discover-and-register, /setup wizard, OpenAPI snapshot test, custom connector mount support, CHANGELOG, migration safety tests, startup banner. 663 tests pass (6 new migration safety + 3 OpenAPI snapshot + 1 updated JWT test).
314 lines
9.7 KiB
Python
314 lines
9.7 KiB
Python
"""DuckDB connection management and schema versioning.
|
|
|
|
Provides get_system_db() for the system state database
|
|
and get_analytics_db() for the analytics database with parquet views.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
import duckdb
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Allow-list pattern for names that get interpolated into SQL as identifiers:
# an ASCII letter or underscore, followed by up to 63 letters, digits, or
# underscores.
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")

# Schema version this code expects. _ensure_schema() brings older system
# databases up to this version.
SCHEMA_VERSION = 3
|
|
|
|
_SYSTEM_SCHEMA = """
|
|
CREATE TABLE IF NOT EXISTS schema_version (
|
|
version INTEGER NOT NULL,
|
|
applied_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS users (
|
|
id VARCHAR PRIMARY KEY,
|
|
email VARCHAR UNIQUE NOT NULL,
|
|
name VARCHAR,
|
|
role VARCHAR DEFAULT 'analyst',
|
|
password_hash VARCHAR,
|
|
setup_token VARCHAR,
|
|
setup_token_created TIMESTAMP,
|
|
reset_token VARCHAR,
|
|
reset_token_created TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT current_timestamp,
|
|
updated_at TIMESTAMP
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS sync_state (
|
|
table_id VARCHAR PRIMARY KEY,
|
|
last_sync TIMESTAMP,
|
|
rows BIGINT,
|
|
file_size_bytes BIGINT,
|
|
uncompressed_size_bytes BIGINT,
|
|
columns INTEGER,
|
|
hash VARCHAR,
|
|
status VARCHAR DEFAULT 'ok',
|
|
error TEXT
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS sync_history (
|
|
id VARCHAR PRIMARY KEY,
|
|
table_id VARCHAR NOT NULL,
|
|
synced_at TIMESTAMP NOT NULL,
|
|
rows BIGINT,
|
|
duration_ms INTEGER,
|
|
status VARCHAR,
|
|
error TEXT
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS user_sync_settings (
|
|
user_id VARCHAR NOT NULL,
|
|
dataset VARCHAR NOT NULL,
|
|
enabled BOOLEAN DEFAULT false,
|
|
table_mode VARCHAR DEFAULT 'all',
|
|
tables JSON,
|
|
updated_at TIMESTAMP,
|
|
PRIMARY KEY (user_id, dataset)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS knowledge_items (
|
|
id VARCHAR PRIMARY KEY,
|
|
title VARCHAR NOT NULL,
|
|
content TEXT,
|
|
category VARCHAR,
|
|
tags JSON,
|
|
status VARCHAR DEFAULT 'pending',
|
|
contributors JSON,
|
|
source_user VARCHAR,
|
|
audience VARCHAR,
|
|
created_at TIMESTAMP DEFAULT current_timestamp,
|
|
updated_at TIMESTAMP
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS knowledge_votes (
|
|
item_id VARCHAR NOT NULL,
|
|
user_id VARCHAR NOT NULL,
|
|
vote INTEGER,
|
|
voted_at TIMESTAMP DEFAULT current_timestamp,
|
|
PRIMARY KEY (item_id, user_id)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS audit_log (
|
|
id VARCHAR PRIMARY KEY,
|
|
timestamp TIMESTAMP NOT NULL DEFAULT current_timestamp,
|
|
user_id VARCHAR,
|
|
action VARCHAR NOT NULL,
|
|
resource VARCHAR,
|
|
params JSON,
|
|
result VARCHAR,
|
|
duration_ms INTEGER
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS telegram_links (
|
|
user_id VARCHAR PRIMARY KEY,
|
|
chat_id BIGINT NOT NULL,
|
|
linked_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS pending_codes (
|
|
code VARCHAR PRIMARY KEY,
|
|
chat_id BIGINT NOT NULL,
|
|
created_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS script_registry (
|
|
id VARCHAR PRIMARY KEY,
|
|
name VARCHAR NOT NULL,
|
|
owner VARCHAR,
|
|
schedule VARCHAR,
|
|
source TEXT NOT NULL,
|
|
deployed_at TIMESTAMP DEFAULT current_timestamp,
|
|
last_run TIMESTAMP,
|
|
last_status VARCHAR
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS table_registry (
|
|
id VARCHAR PRIMARY KEY,
|
|
name VARCHAR NOT NULL,
|
|
source_type VARCHAR,
|
|
bucket VARCHAR,
|
|
source_table VARCHAR,
|
|
sync_strategy VARCHAR DEFAULT 'full_refresh',
|
|
query_mode VARCHAR DEFAULT 'local',
|
|
sync_schedule VARCHAR,
|
|
profile_after_sync BOOLEAN DEFAULT true,
|
|
primary_key VARCHAR,
|
|
folder VARCHAR,
|
|
description TEXT,
|
|
registered_by VARCHAR,
|
|
is_public BOOLEAN DEFAULT true,
|
|
registered_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS table_profiles (
|
|
table_id VARCHAR PRIMARY KEY,
|
|
profile JSON NOT NULL,
|
|
profiled_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS dataset_permissions (
|
|
user_id VARCHAR NOT NULL,
|
|
dataset VARCHAR NOT NULL,
|
|
access VARCHAR DEFAULT 'read',
|
|
PRIMARY KEY (user_id, dataset)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS access_requests (
|
|
id VARCHAR PRIMARY KEY,
|
|
user_id VARCHAR NOT NULL,
|
|
user_email VARCHAR NOT NULL,
|
|
table_id VARCHAR NOT NULL,
|
|
reason TEXT,
|
|
status VARCHAR DEFAULT 'pending',
|
|
reviewed_by VARCHAR,
|
|
reviewed_at TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT current_timestamp
|
|
);
|
|
"""
|
|
|
|
|
|
import threading

# Serializes creation and replacement of the shared system DB connection
# inside get_system_db().
_system_db_lock = threading.Lock()
# Shared connection to system.duckdb; None until get_system_db() opens it and
# again after close_system_db().
_system_db_conn: duckdb.DuckDBPyConnection | None = None
# Path the shared connection was opened on; compared against the current
# DATA_DIR-derived path to detect that the data directory changed.
_system_db_path: str | None = None
|
|
|
|
|
|
def _get_data_dir() -> Path:
    """Return the data root directory.

    The location is read from the ``DATA_DIR`` environment variable on every
    call; when the variable is not set, the relative path ``./data`` is used.
    """
    configured = os.environ.get("DATA_DIR", "./data")
    return Path(configured)
|
|
|
|
|
|
def get_system_db() -> duckdb.DuckDBPyConnection:
    """Get a connection to the system state database.

    Uses a single shared connection per DATA_DIR to avoid DuckDB lock
    conflicts between the main app and background tasks. Returns a cursor
    so callers can safely close() it without closing the underlying connection.

    The shared connection is (re)opened when none exists yet or when the
    DATA_DIR-derived path differs from the one it was opened on. A newly
    opened connection is published to the module globals only after
    ``_ensure_schema`` has succeeded, so a failed schema setup is retried on
    the next call instead of leaving a half-initialized connection cached.

    Returns:
        A cursor on the shared system database connection.

    Raises:
        Exception: whatever ``duckdb.connect`` or ``_ensure_schema`` raises
            while opening or preparing the database.
    """
    global _system_db_conn, _system_db_path
    db_path = str(_get_data_dir() / "state" / "system.duckdb")

    with _system_db_lock:
        if _system_db_conn is None or _system_db_path != db_path:
            # Close old connection if DATA_DIR changed (e.g., in tests)
            if _system_db_conn is not None:
                try:
                    _system_db_conn.close()
                except Exception as exc:
                    logger.debug("Ignoring error while closing system DB: %s", exc)
                # Forget the closed connection right away: if the reconnect
                # below fails, a later call must not hand out cursors on it.
                _system_db_conn = None
                _system_db_path = None

            Path(db_path).parent.mkdir(parents=True, exist_ok=True)
            new_conn = duckdb.connect(db_path)
            try:
                _ensure_schema(new_conn)
            except Exception:
                # Do not cache a connection whose schema setup failed.
                try:
                    new_conn.close()
                except Exception as exc:
                    logger.debug("Ignoring error while closing system DB: %s", exc)
                raise
            _system_db_conn = new_conn
            _system_db_path = db_path
        return _system_db_conn.cursor()
|
|
|
|
|
|
def get_analytics_db() -> duckdb.DuckDBPyConnection:
    """Open a new connection to the analytics database (parquet views).

    The database file is ``<DATA_DIR>/analytics/server.duckdb``; its parent
    directory is created if it does not exist yet. Each call opens its own
    connection, which the caller is responsible for closing.
    """
    analytics_dir = _get_data_dir() / "analytics"
    analytics_dir.mkdir(parents=True, exist_ok=True)
    return duckdb.connect(str(analytics_dir / "server.duckdb"))
|
|
|
|
|
|
def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
    """Open the analytics DB for querying, with extract databases attached.

    Two cases:

    * The database file does not exist yet: it is created through a regular
      (writable) connection, ``enable_external_access`` is switched off on a
      best-effort basis, and that connection is returned.
    * The database file exists: it is opened with ``read_only=True`` and every
      ``<DATA_DIR>/extracts/<name>/extract.duckdb`` whose directory name is a
      safe SQL identifier is ATTACHed read-only under ``<name>``, so views
      that reference the extracts resolve. Extracts that cannot be attached
      are logged and skipped.

    On the read-only connection external access is left enabled (see the note
    at the end of the function body).

    Returns:
        A new DuckDB connection; the caller is responsible for closing it.
    """
    db_path = _get_data_dir() / "analytics" / "server.duckdb"
    if not db_path.exists():
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = duckdb.connect(str(db_path), read_only=False)
        try:
            conn.execute("SET enable_external_access = false")
        except Exception as exc:
            # Best effort: still return a usable connection, but leave a trace.
            logger.warning("Could not disable external access on %s: %s", db_path, exc)
        return conn

    conn = duckdb.connect(str(db_path), read_only=True)
    # ATTACH extract.duckdb files FIRST so views referencing them work
    extracts_dir = _get_data_dir() / "extracts"
    # is_dir() rather than exists(): iterdir() raises on a plain file.
    if extracts_dir.is_dir():
        for ext_dir in sorted(extracts_dir.iterdir()):
            db_file = ext_dir / "extract.duckdb"
            if not (ext_dir.is_dir() and db_file.exists()):
                continue
            # The directory name becomes a SQL identifier, so it must match
            # the allow-list in full (fullmatch also rejects a trailing
            # newline, which `$` alone would let through).
            if not _SAFE_IDENTIFIER.fullmatch(ext_dir.name):
                logger.warning("Skipping extract with unsafe name: %r", ext_dir.name)
                continue
            # The path goes into a SQL string literal; double any single quote
            # so a quote in DATA_DIR cannot terminate the literal early.
            quoted_path = str(db_file).replace("'", "''")
            try:
                conn.execute(f"ATTACH '{quoted_path}' AS {ext_dir.name} (READ_ONLY)")
            except Exception as exc:
                # One broken extract must not prevent querying the others.
                logger.warning("Could not attach extract %s: %s", db_file, exc)
    # Note: external_access stays enabled because views use read_parquet() on local files.
    # File-path-based attacks are blocked by the SQL blocklist in app/api/query.py.
    return conn
|
|
|
|
|
|
# Statements that upgrade a version-1 database to version 2: columns added to
# table_registry. Each uses ADD COLUMN IF NOT EXISTS, so re-running a
# statement on a table that already has the column is a no-op.
_V1_TO_V2_MIGRATIONS = [
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_type VARCHAR",
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS bucket VARCHAR",
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_table VARCHAR",
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS query_mode VARCHAR DEFAULT 'local'",
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS sync_schedule VARCHAR",
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS profile_after_sync BOOLEAN DEFAULT true",
]

# Statements that upgrade a version-2 database to version 3: adds the
# is_public flag to table_registry.
_V2_TO_V3_MIGRATIONS = [
    "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS is_public BOOLEAN DEFAULT true",
]
|
|
|
|
|
|
def _snapshot_before_migration() -> None:
    """Copy system.duckdb to system.duckdb.pre-migrate for rollback support.

    Best effort: any failure is logged as a warning and does not stop the
    migration that follows.
    """
    try:
        db_path = _get_data_dir() / "state" / "system.duckdb"
        if db_path.exists():
            snapshot = db_path.parent / "system.duckdb.pre-migrate"
            shutil.copy2(str(db_path), str(snapshot))
            logger.info("Pre-migration snapshot saved: %s", snapshot)
    except Exception as e:
        logger.warning("Could not create pre-migration snapshot: %s", e)


def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
    """Create tables if they don't exist. Apply migrations if schema version changed.

    Behavior by stored schema version:

    * equal to ``SCHEMA_VERSION``: nothing to do.
    * greater than ``SCHEMA_VERSION``: nothing is changed; a warning is logged
      because the database was written by newer code.
    * ``0`` (no version recorded): the full schema is created and
      ``SCHEMA_VERSION`` is recorded.
    * anything in between: a snapshot of the database file is taken, missing
      tables are created, the pending migration lists are applied in order,
      and the stored version is updated.

    Args:
        conn: Open connection to the system database.
    """
    current = get_schema_version(conn)
    if current >= SCHEMA_VERSION:
        if current > SCHEMA_VERSION:
            logger.warning(
                "System DB schema version %s is newer than supported version %s",
                current,
                SCHEMA_VERSION,
            )
        return

    # Snapshot before migration for rollback support. A fresh database
    # (version 0) has nothing worth preserving.
    if current > 0:
        _snapshot_before_migration()

    # CREATE TABLE IF NOT EXISTS only: adds tables introduced since the
    # database was created, leaves existing tables untouched.
    conn.execute(_SYSTEM_SCHEMA)

    if current == 0:
        conn.execute(
            "INSERT INTO schema_version (version) VALUES (?)",
            [SCHEMA_VERSION],
        )
        return

    # Apply each pending step in order so a v1 database passes through v2.
    if current < 2:
        for sql in _V1_TO_V2_MIGRATIONS:
            conn.execute(sql)
    if current < 3:
        for sql in _V2_TO_V3_MIGRATIONS:
            conn.execute(sql)
    conn.execute(
        "UPDATE schema_version SET version = ?, applied_at = current_timestamp",
        [SCHEMA_VERSION],
    )
|
|
|
|
|
|
def get_schema_version(conn: duckdb.DuckDBPyConnection) -> int:
    """Return the highest version recorded in ``schema_version``.

    Yields 0 when the ``schema_version`` table is missing (DuckDB raises
    ``CatalogException``), when it holds no rows, or when the stored maximum
    is itself falsy.
    """
    try:
        row = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()
    except duckdb.CatalogException:
        return 0
    if not row or not row[0]:
        return 0
    return row[0]
|
|
|
|
|
|
def close_system_db() -> None:
    """Close the shared system DB connection. Called on app shutdown.

    Takes the same lock as ``get_system_db`` so the shared connection is not
    closed and cleared while another thread is in the middle of opening or
    replacing it. Errors raised by ``close()`` are logged and otherwise
    ignored; the module globals are reset in every case, so the next
    ``get_system_db`` call opens a fresh connection.
    """
    global _system_db_conn, _system_db_path
    with _system_db_lock:
        if _system_db_conn is not None:
            try:
                _system_db_conn.close()
            except Exception as exc:
                logger.debug("Ignoring error while closing system DB: %s", exc)
        _system_db_conn = None
        _system_db_path = None
|