feat: add DuckDB state layer with schema management
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
eb7e5bdf8f
commit
f76411c603
2 changed files with 295 additions and 0 deletions
190
src/db.py
Normal file
190
src/db.py
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
"""DuckDB connection management and schema initialization.
|
||||
|
||||
Provides connections to the system state database and analytics database,
|
||||
with automatic directory creation and schema bootstrapping.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import duckdb
|
||||
|
||||
SCHEMA_VERSION = 1
|
||||
|
||||
_SCHEMA_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS schema_version (
|
||||
version INTEGER NOT NULL,
|
||||
applied_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS audit_log (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
timestamp TIMESTAMP DEFAULT current_timestamp,
|
||||
actor VARCHAR,
|
||||
action VARCHAR NOT NULL,
|
||||
entity_type VARCHAR,
|
||||
entity_id VARCHAR,
|
||||
details JSON
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS dataset_permissions (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
user_email VARCHAR NOT NULL,
|
||||
dataset VARCHAR NOT NULL,
|
||||
permission VARCHAR NOT NULL DEFAULT 'read',
|
||||
granted_by VARCHAR,
|
||||
granted_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS knowledge_items (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
title VARCHAR NOT NULL,
|
||||
content VARCHAR,
|
||||
category VARCHAR,
|
||||
author VARCHAR,
|
||||
status VARCHAR DEFAULT 'active',
|
||||
metadata JSON,
|
||||
created_at TIMESTAMP DEFAULT current_timestamp,
|
||||
updated_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS knowledge_votes (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
item_id VARCHAR NOT NULL,
|
||||
user_email VARCHAR NOT NULL,
|
||||
vote INTEGER NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS pending_codes (
|
||||
code VARCHAR PRIMARY KEY,
|
||||
user_email VARCHAR NOT NULL,
|
||||
purpose VARCHAR,
|
||||
created_at TIMESTAMP DEFAULT current_timestamp,
|
||||
expires_at TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS script_registry (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
name VARCHAR NOT NULL,
|
||||
path VARCHAR NOT NULL,
|
||||
description VARCHAR,
|
||||
author VARCHAR,
|
||||
metadata JSON,
|
||||
created_at TIMESTAMP DEFAULT current_timestamp,
|
||||
updated_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS sync_history (
|
||||
id VARCHAR PRIMARY KEY,
|
||||
table_name VARCHAR NOT NULL,
|
||||
status VARCHAR NOT NULL,
|
||||
rows_synced INTEGER,
|
||||
started_at TIMESTAMP DEFAULT current_timestamp,
|
||||
finished_at TIMESTAMP,
|
||||
error VARCHAR,
|
||||
metadata JSON
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS sync_state (
|
||||
table_name VARCHAR PRIMARY KEY,
|
||||
last_sync TIMESTAMP,
|
||||
status VARCHAR DEFAULT 'pending',
|
||||
row_count INTEGER,
|
||||
file_hash VARCHAR,
|
||||
metadata JSON
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS table_profiles (
|
||||
table_name VARCHAR PRIMARY KEY,
|
||||
profile JSON,
|
||||
profiled_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS table_registry (
|
||||
table_name VARCHAR PRIMARY KEY,
|
||||
bucket VARCHAR,
|
||||
source VARCHAR,
|
||||
sync_strategy VARCHAR DEFAULT 'full',
|
||||
primary_key VARCHAR,
|
||||
description VARCHAR,
|
||||
metadata JSON,
|
||||
registered_at TIMESTAMP DEFAULT current_timestamp,
|
||||
updated_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS telegram_links (
|
||||
chat_id VARCHAR PRIMARY KEY,
|
||||
user_email VARCHAR NOT NULL,
|
||||
linked_at TIMESTAMP DEFAULT current_timestamp,
|
||||
active BOOLEAN DEFAULT true
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS user_sync_settings (
|
||||
user_email VARCHAR PRIMARY KEY,
|
||||
settings JSON,
|
||||
updated_at TIMESTAMP DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS users (
|
||||
email VARCHAR PRIMARY KEY,
|
||||
name VARCHAR,
|
||||
picture VARCHAR,
|
||||
role VARCHAR DEFAULT 'analyst',
|
||||
is_active BOOLEAN DEFAULT true,
|
||||
metadata JSON,
|
||||
created_at TIMESTAMP DEFAULT current_timestamp,
|
||||
last_login TIMESTAMP
|
||||
);
|
||||
"""
|
||||
|
||||
|
||||
def _get_data_dir() -> Path:
|
||||
"""Return the DATA_DIR path, defaulting to ./data."""
|
||||
return Path(os.environ.get("DATA_DIR", "data"))
|
||||
|
||||
|
||||
def get_system_db() -> duckdb.DuckDBPyConnection:
|
||||
"""Open (or create) the system state database and ensure schema exists.
|
||||
|
||||
Returns a DuckDB connection to {DATA_DIR}/state/system.duckdb.
|
||||
Creates directories and all schema tables on first call.
|
||||
"""
|
||||
db_dir = _get_data_dir() / "state"
|
||||
db_dir.mkdir(parents=True, exist_ok=True)
|
||||
db_path = db_dir / "system.duckdb"
|
||||
|
||||
conn = duckdb.connect(str(db_path))
|
||||
conn.execute(_SCHEMA_SQL)
|
||||
|
||||
# Seed schema_version if empty
|
||||
row = conn.execute("SELECT COUNT(*) FROM schema_version").fetchone()
|
||||
if row[0] == 0:
|
||||
conn.execute(
|
||||
"INSERT INTO schema_version (version) VALUES (?)", [SCHEMA_VERSION]
|
||||
)
|
||||
|
||||
return conn
|
||||
|
||||
|
||||
def get_analytics_db() -> duckdb.DuckDBPyConnection:
|
||||
"""Open (or create) the analytics database.
|
||||
|
||||
Returns a DuckDB connection to {DATA_DIR}/analytics/server.duckdb.
|
||||
Creates directories if needed.
|
||||
"""
|
||||
db_dir = _get_data_dir() / "analytics"
|
||||
db_dir.mkdir(parents=True, exist_ok=True)
|
||||
db_path = db_dir / "server.duckdb"
|
||||
|
||||
return duckdb.connect(str(db_path))
|
||||
|
||||
|
||||
def get_schema_version(conn: duckdb.DuckDBPyConnection) -> int:
|
||||
"""Return the current schema version, or 0 if no schema_version table."""
|
||||
try:
|
||||
row = conn.execute(
|
||||
"SELECT version FROM schema_version ORDER BY applied_at DESC LIMIT 1"
|
||||
).fetchone()
|
||||
return row[0] if row else 0
|
||||
except duckdb.CatalogException:
|
||||
return 0
|
||||
105
tests/test_db.py
Normal file
105
tests/test_db.py
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
"""Tests for src.db — DuckDB connection management and schema."""
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
|
||||
def _setup_data_dir(tmp_path):
|
||||
"""Set DATA_DIR env var to a temporary directory."""
|
||||
os.environ["DATA_DIR"] = str(tmp_path)
|
||||
|
||||
|
||||
class TestGetSystemDb:
|
||||
"""Tests for get_system_db()."""
|
||||
|
||||
def test_get_system_db_creates_tables(self, tmp_path):
|
||||
_setup_data_dir(tmp_path)
|
||||
from src.db import get_system_db
|
||||
|
||||
conn = get_system_db()
|
||||
try:
|
||||
tables = [
|
||||
row[0]
|
||||
for row in conn.execute(
|
||||
"SELECT table_name FROM information_schema.tables "
|
||||
"WHERE table_schema = 'main' ORDER BY table_name"
|
||||
).fetchall()
|
||||
]
|
||||
expected = sorted([
|
||||
"schema_version",
|
||||
"users",
|
||||
"sync_state",
|
||||
"sync_history",
|
||||
"user_sync_settings",
|
||||
"knowledge_items",
|
||||
"knowledge_votes",
|
||||
"audit_log",
|
||||
"telegram_links",
|
||||
"pending_codes",
|
||||
"script_registry",
|
||||
"table_registry",
|
||||
"table_profiles",
|
||||
"dataset_permissions",
|
||||
])
|
||||
assert tables == expected
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def test_get_system_db_idempotent(self, tmp_path):
|
||||
_setup_data_dir(tmp_path)
|
||||
from src.db import get_system_db
|
||||
|
||||
conn = get_system_db()
|
||||
conn.execute(
|
||||
"INSERT INTO users (email, name) VALUES ('test@example.com', 'Test')"
|
||||
)
|
||||
conn.close()
|
||||
|
||||
conn2 = get_system_db()
|
||||
try:
|
||||
rows = conn2.execute("SELECT email FROM users").fetchall()
|
||||
assert len(rows) == 1
|
||||
assert rows[0][0] == "test@example.com"
|
||||
finally:
|
||||
conn2.close()
|
||||
|
||||
|
||||
class TestGetSchemaVersion:
|
||||
"""Tests for get_schema_version()."""
|
||||
|
||||
def test_get_schema_version(self, tmp_path):
|
||||
_setup_data_dir(tmp_path)
|
||||
from src.db import get_schema_version, get_system_db
|
||||
|
||||
conn = get_system_db()
|
||||
try:
|
||||
assert get_schema_version(conn) == 1
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def test_get_schema_version_no_table(self, tmp_path):
|
||||
_setup_data_dir(tmp_path)
|
||||
from src.db import get_schema_version
|
||||
|
||||
db_path = tmp_path / "empty.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
try:
|
||||
assert get_schema_version(conn) == 0
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
class TestGetAnalyticsDb:
|
||||
"""Tests for get_analytics_db()."""
|
||||
|
||||
def test_get_analytics_db(self, tmp_path):
|
||||
_setup_data_dir(tmp_path)
|
||||
from src.db import get_analytics_db
|
||||
|
||||
conn = get_analytics_db()
|
||||
try:
|
||||
assert (tmp_path / "analytics" / "server.duckdb").exists()
|
||||
finally:
|
||||
conn.close()
|
||||
Loading…
Reference in a new issue