agnes-the-ai-analyst/src/db.py
ZdenekSrotyr 6c53082295 feat: multi-instance deployment — all 14 must-have items from spec
CalVer CI (release.yml) with stable/dev channels, health endpoint
with version/channel/schema_version, JWT secret auto-generation with
file persistence, smoke test script + Docker-in-CI, pre-migration
snapshot, /api/admin/configure for headless setup, /api/admin/
discover-and-register, /setup wizard, OpenAPI snapshot test, custom
connector mount support, CHANGELOG, migration safety tests, startup
banner.

663 tests pass (6 new migration safety + 3 OpenAPI snapshot + 1
updated JWT test).
2026-04-10 11:57:42 +02:00

314 lines
9.7 KiB
Python

"""DuckDB connection management and schema versioning.
Provides get_system_db() for the system state database
and get_analytics_db() for the analytics database with parquet views.
"""
import logging
import os
import re
import shutil
from pathlib import Path
import duckdb
logger = logging.getLogger(__name__)
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
SCHEMA_VERSION = 3
_SYSTEM_SCHEMA = """
CREATE TABLE IF NOT EXISTS schema_version (
version INTEGER NOT NULL,
applied_at TIMESTAMP DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS users (
id VARCHAR PRIMARY KEY,
email VARCHAR UNIQUE NOT NULL,
name VARCHAR,
role VARCHAR DEFAULT 'analyst',
password_hash VARCHAR,
setup_token VARCHAR,
setup_token_created TIMESTAMP,
reset_token VARCHAR,
reset_token_created TIMESTAMP,
created_at TIMESTAMP DEFAULT current_timestamp,
updated_at TIMESTAMP
);
CREATE TABLE IF NOT EXISTS sync_state (
table_id VARCHAR PRIMARY KEY,
last_sync TIMESTAMP,
rows BIGINT,
file_size_bytes BIGINT,
uncompressed_size_bytes BIGINT,
columns INTEGER,
hash VARCHAR,
status VARCHAR DEFAULT 'ok',
error TEXT
);
CREATE TABLE IF NOT EXISTS sync_history (
id VARCHAR PRIMARY KEY,
table_id VARCHAR NOT NULL,
synced_at TIMESTAMP NOT NULL,
rows BIGINT,
duration_ms INTEGER,
status VARCHAR,
error TEXT
);
CREATE TABLE IF NOT EXISTS user_sync_settings (
user_id VARCHAR NOT NULL,
dataset VARCHAR NOT NULL,
enabled BOOLEAN DEFAULT false,
table_mode VARCHAR DEFAULT 'all',
tables JSON,
updated_at TIMESTAMP,
PRIMARY KEY (user_id, dataset)
);
CREATE TABLE IF NOT EXISTS knowledge_items (
id VARCHAR PRIMARY KEY,
title VARCHAR NOT NULL,
content TEXT,
category VARCHAR,
tags JSON,
status VARCHAR DEFAULT 'pending',
contributors JSON,
source_user VARCHAR,
audience VARCHAR,
created_at TIMESTAMP DEFAULT current_timestamp,
updated_at TIMESTAMP
);
CREATE TABLE IF NOT EXISTS knowledge_votes (
item_id VARCHAR NOT NULL,
user_id VARCHAR NOT NULL,
vote INTEGER,
voted_at TIMESTAMP DEFAULT current_timestamp,
PRIMARY KEY (item_id, user_id)
);
CREATE TABLE IF NOT EXISTS audit_log (
id VARCHAR PRIMARY KEY,
timestamp TIMESTAMP NOT NULL DEFAULT current_timestamp,
user_id VARCHAR,
action VARCHAR NOT NULL,
resource VARCHAR,
params JSON,
result VARCHAR,
duration_ms INTEGER
);
CREATE TABLE IF NOT EXISTS telegram_links (
user_id VARCHAR PRIMARY KEY,
chat_id BIGINT NOT NULL,
linked_at TIMESTAMP DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS pending_codes (
code VARCHAR PRIMARY KEY,
chat_id BIGINT NOT NULL,
created_at TIMESTAMP DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS script_registry (
id VARCHAR PRIMARY KEY,
name VARCHAR NOT NULL,
owner VARCHAR,
schedule VARCHAR,
source TEXT NOT NULL,
deployed_at TIMESTAMP DEFAULT current_timestamp,
last_run TIMESTAMP,
last_status VARCHAR
);
CREATE TABLE IF NOT EXISTS table_registry (
id VARCHAR PRIMARY KEY,
name VARCHAR NOT NULL,
source_type VARCHAR,
bucket VARCHAR,
source_table VARCHAR,
sync_strategy VARCHAR DEFAULT 'full_refresh',
query_mode VARCHAR DEFAULT 'local',
sync_schedule VARCHAR,
profile_after_sync BOOLEAN DEFAULT true,
primary_key VARCHAR,
folder VARCHAR,
description TEXT,
registered_by VARCHAR,
is_public BOOLEAN DEFAULT true,
registered_at TIMESTAMP DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS table_profiles (
table_id VARCHAR PRIMARY KEY,
profile JSON NOT NULL,
profiled_at TIMESTAMP DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS dataset_permissions (
user_id VARCHAR NOT NULL,
dataset VARCHAR NOT NULL,
access VARCHAR DEFAULT 'read',
PRIMARY KEY (user_id, dataset)
);
CREATE TABLE IF NOT EXISTS access_requests (
id VARCHAR PRIMARY KEY,
user_id VARCHAR NOT NULL,
user_email VARCHAR NOT NULL,
table_id VARCHAR NOT NULL,
reason TEXT,
status VARCHAR DEFAULT 'pending',
reviewed_by VARCHAR,
reviewed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT current_timestamp
);
"""
import threading
_system_db_lock = threading.Lock()
_system_db_conn: duckdb.DuckDBPyConnection | None = None
_system_db_path: str | None = None
def _get_data_dir() -> Path:
return Path(os.environ.get("DATA_DIR", "./data"))
def get_system_db() -> duckdb.DuckDBPyConnection:
"""Get a connection to the system state database.
Uses a single shared connection per DATA_DIR to avoid DuckDB lock
conflicts between the main app and background tasks. Returns a cursor
so callers can safely close() it without closing the underlying connection.
"""
global _system_db_conn, _system_db_path
db_path = str(_get_data_dir() / "state" / "system.duckdb")
with _system_db_lock:
if _system_db_conn is None or _system_db_path != db_path:
# Close old connection if DATA_DIR changed (e.g., in tests)
if _system_db_conn is not None:
try:
_system_db_conn.close()
except Exception:
pass
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
_system_db_conn = duckdb.connect(db_path)
_system_db_path = db_path
_ensure_schema(_system_db_conn)
return _system_db_conn.cursor()
def get_analytics_db() -> duckdb.DuckDBPyConnection:
"""Get a connection to the analytics database (parquet views)."""
db_path = _get_data_dir() / "analytics" / "server.duckdb"
db_path.parent.mkdir(parents=True, exist_ok=True)
return duckdb.connect(str(db_path))
def get_analytics_db_readonly() -> duckdb.DuckDBPyConnection:
"""Read-only connection to analytics DB. Blocks writes and external access.
ATTACHes extract.duckdb files so views that reference them work.
"""
db_path = _get_data_dir() / "analytics" / "server.duckdb"
if not db_path.exists():
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = duckdb.connect(str(db_path), read_only=False)
try:
conn.execute("SET enable_external_access = false")
except Exception:
pass
return conn
conn = duckdb.connect(str(db_path), read_only=True)
# ATTACH extract.duckdb files FIRST so views referencing them work
extracts_dir = _get_data_dir() / "extracts"
if extracts_dir.exists():
for ext_dir in sorted(extracts_dir.iterdir()):
db_file = ext_dir / "extract.duckdb"
if db_file.exists() and ext_dir.is_dir():
if not _SAFE_IDENTIFIER.match(ext_dir.name):
continue
try:
conn.execute(f"ATTACH '{db_file}' AS {ext_dir.name} (READ_ONLY)")
except Exception:
pass
# Note: external_access stays enabled because views use read_parquet() on local files.
# File-path-based attacks are blocked by the SQL blocklist in app/api/query.py.
return conn
_V1_TO_V2_MIGRATIONS = [
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_type VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS bucket VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_table VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS query_mode VARCHAR DEFAULT 'local'",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS sync_schedule VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS profile_after_sync BOOLEAN DEFAULT true",
]
_V2_TO_V3_MIGRATIONS = [
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS is_public BOOLEAN DEFAULT true",
]
def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
"""Create tables if they don't exist. Apply migrations if schema version changed."""
current = get_schema_version(conn)
if current < SCHEMA_VERSION:
# Snapshot before migration for rollback support
if current > 0:
try:
db_path = Path(os.environ.get("DATA_DIR", "./data")) / "state" / "system.duckdb"
if db_path.exists():
snapshot = db_path.parent / "system.duckdb.pre-migrate"
shutil.copy2(str(db_path), str(snapshot))
logger.info("Pre-migration snapshot saved: %s", snapshot)
except Exception as e:
logger.warning("Could not create pre-migration snapshot: %s", e)
conn.execute(_SYSTEM_SCHEMA)
if current == 0:
conn.execute(
"INSERT INTO schema_version (version) VALUES (?)",
[SCHEMA_VERSION],
)
else:
if current < 2:
for sql in _V1_TO_V2_MIGRATIONS:
conn.execute(sql)
if current < 3:
for sql in _V2_TO_V3_MIGRATIONS:
conn.execute(sql)
conn.execute(
"UPDATE schema_version SET version = ?, applied_at = current_timestamp",
[SCHEMA_VERSION],
)
def get_schema_version(conn: duckdb.DuckDBPyConnection) -> int:
"""Get current schema version. Returns 0 if no schema exists."""
try:
result = conn.execute("SELECT MAX(version) FROM schema_version").fetchone()
return result[0] if result and result[0] else 0
except duckdb.CatalogException:
return 0
def close_system_db() -> None:
"""Close the shared system DB connection. Called on app shutdown."""
global _system_db_conn, _system_db_path
if _system_db_conn:
try:
_system_db_conn.close()
except Exception:
pass
_system_db_conn = None
_system_db_path = None