diff --git a/src/db.py b/src/db.py index cf59420..e5630f7 100644 --- a/src/db.py +++ b/src/db.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) _SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") -SCHEMA_VERSION = 3 +SCHEMA_VERSION = 4 _SYSTEM_SCHEMA = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -168,6 +168,42 @@ CREATE TABLE IF NOT EXISTS access_requests ( reviewed_at TIMESTAMP, created_at TIMESTAMP DEFAULT current_timestamp ); + +CREATE TABLE IF NOT EXISTS metric_definitions ( + id VARCHAR PRIMARY KEY, + name VARCHAR NOT NULL, + display_name VARCHAR NOT NULL, + category VARCHAR NOT NULL, + description TEXT, + type VARCHAR DEFAULT 'sum', + unit VARCHAR, + grain VARCHAR DEFAULT 'monthly', + table_name VARCHAR, + tables VARCHAR[], + expression VARCHAR, + time_column VARCHAR, + dimensions VARCHAR[], + filters VARCHAR[], + synonyms VARCHAR[], + notes VARCHAR[], + sql TEXT NOT NULL, + sql_variants JSON, + validation JSON, + source VARCHAR DEFAULT 'manual', + created_at TIMESTAMP DEFAULT current_timestamp, + updated_at TIMESTAMP DEFAULT current_timestamp +); + +CREATE TABLE IF NOT EXISTS column_metadata ( + table_id VARCHAR NOT NULL, + column_name VARCHAR NOT NULL, + basetype VARCHAR, + description VARCHAR, + confidence VARCHAR DEFAULT 'manual', + source VARCHAR DEFAULT 'manual', + updated_at TIMESTAMP DEFAULT current_timestamp, + PRIMARY KEY (table_id, column_name) +); """ @@ -259,6 +295,47 @@ _V2_TO_V3_MIGRATIONS = [ "ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS is_public BOOLEAN DEFAULT true", ] +_V3_TO_V4_MIGRATIONS = [ + """ + CREATE TABLE IF NOT EXISTS metric_definitions ( + id VARCHAR PRIMARY KEY, + name VARCHAR NOT NULL, + display_name VARCHAR NOT NULL, + category VARCHAR NOT NULL, + description TEXT, + type VARCHAR DEFAULT 'sum', + unit VARCHAR, + grain VARCHAR DEFAULT 'monthly', + table_name VARCHAR, + tables VARCHAR[], + expression VARCHAR, + time_column VARCHAR, + dimensions VARCHAR[], + filters VARCHAR[], + synonyms VARCHAR[], + notes VARCHAR[], + sql TEXT NOT NULL, + sql_variants JSON, + validation JSON, + source VARCHAR DEFAULT 'manual', + created_at TIMESTAMP DEFAULT current_timestamp, + updated_at TIMESTAMP DEFAULT current_timestamp + ) + """, + """ + CREATE TABLE IF NOT EXISTS column_metadata ( + table_id VARCHAR NOT NULL, + column_name VARCHAR NOT NULL, + basetype VARCHAR, + description VARCHAR, + confidence VARCHAR DEFAULT 'manual', + source VARCHAR DEFAULT 'manual', + updated_at TIMESTAMP DEFAULT current_timestamp, + PRIMARY KEY (table_id, column_name) + ) + """, +] + def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: """Create tables if they don't exist. Apply migrations if schema version changed.""" @@ -296,6 +373,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: if current < 3: for sql in _V2_TO_V3_MIGRATIONS: conn.execute(sql) + if current < 4: + for sql in _V3_TO_V4_MIGRATIONS: + conn.execute(sql) conn.execute( "UPDATE schema_version SET version = ?, applied_at = current_timestamp", [SCHEMA_VERSION], diff --git a/tests/test_db.py b/tests/test_db.py index 3a4ae0c..d61a4b9 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -28,7 +28,7 @@ class TestGetSystemDb: "user_sync_settings", "knowledge_items", "knowledge_votes", "audit_log", "telegram_links", "pending_codes", "script_registry", "table_registry", "table_profiles", - "dataset_permissions", + "dataset_permissions", "metric_definitions", "column_metadata", } assert expected.issubset(tables), f"Missing: {expected - tables}" finally: @@ -55,11 +55,11 @@ class TestGetSystemDb: class TestGetSchemaVersion: def test_returns_version(self, tmp_path, monkeypatch): _setup_data_dir(tmp_path, monkeypatch) - from src.db import get_schema_version, get_system_db + from src.db import get_schema_version, get_system_db, SCHEMA_VERSION conn = get_system_db() try: - assert get_schema_version(conn) == 3 + assert get_schema_version(conn) == SCHEMA_VERSION finally: conn.close() @@ -113,7 +113,8 @@ class TestV1ToV2Migration: from src.db import get_system_db, get_schema_version conn2 = get_system_db() try: - assert get_schema_version(conn2) == 3 + from src.db import SCHEMA_VERSION + assert get_schema_version(conn2) == SCHEMA_VERSION # Verify old data preserved row = conn2.execute("SELECT name, folder FROM table_registry WHERE id='t1'").fetchone() assert row[0] == "Test" @@ -198,10 +199,10 @@ class TestMigrationSafety: conn.close() def test_v2_to_v3_migration(self, tmp_path, monkeypatch): - """v2 DB migrated to v3: schema_version=3 and is_public column added.""" + """v2 DB migrated to current schema: is_public column added.""" monkeypatch.setenv("DATA_DIR", str(tmp_path)) import duckdb as _duckdb - from src.db import _ensure_schema, get_schema_version + from src.db import _ensure_schema, get_schema_version, SCHEMA_VERSION db_path = tmp_path / "state" / "system.duckdb" self._create_v2_db(db_path) @@ -209,7 +210,7 @@ class TestMigrationSafety: conn = _duckdb.connect(str(db_path)) try: _ensure_schema(conn) - assert get_schema_version(conn) == 3 + assert get_schema_version(conn) == SCHEMA_VERSION cols = { r[0] for r in conn.execute( @@ -285,7 +286,8 @@ class TestMigrationSafety: _ensure_schema(conn) - assert get_schema_version(conn) == 3 + from src.db import SCHEMA_VERSION + assert get_schema_version(conn) == SCHEMA_VERSION row = conn.execute( "SELECT name, description FROM table_registry WHERE id='row1'" ).fetchone() @@ -343,6 +345,123 @@ class TestMigrationSafety: conn.close() +class TestSchemaV4: + """Tests for v4 schema additions: metric_definitions and column_metadata tables.""" + + def test_metric_definitions_table_exists(self, tmp_path, monkeypatch): + """metric_definitions and column_metadata tables exist after get_system_db().""" + _setup_data_dir(tmp_path, monkeypatch) + from src.db import get_system_db + + conn = get_system_db() + try: + tables = { + row[0] + for row in conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + } + assert "metric_definitions" in tables, "metric_definitions table missing" + assert "column_metadata" in tables, "column_metadata table missing" + finally: + conn.close() + + def test_metric_definitions_columns(self, tmp_path, monkeypatch): + """metric_definitions table has all expected columns.""" + _setup_data_dir(tmp_path, monkeypatch) + from src.db import get_system_db + + conn = get_system_db() + try: + cols = { + row[0] + for row in conn.execute( + "SELECT column_name FROM information_schema.columns WHERE table_name = 'metric_definitions'" + ).fetchall() + } + expected = { + "id", "name", "display_name", "category", "description", + "type", "unit", "grain", "table_name", "tables", + "expression", "time_column", "dimensions", "filters", + "synonyms", "notes", "sql", "sql_variants", "validation", + "source", "created_at", "updated_at", + } + assert expected.issubset(cols), f"Missing columns: {expected - cols}" + finally: + conn.close() + + def test_column_metadata_table_exists(self, tmp_path, monkeypatch): + """column_metadata table has all expected columns.""" + _setup_data_dir(tmp_path, monkeypatch) + from src.db import get_system_db + + conn = get_system_db() + try: + cols = { + row[0] + for row in conn.execute( + "SELECT column_name FROM information_schema.columns WHERE table_name = 'column_metadata'" + ).fetchall() + } + expected = { + "table_id", "column_name", "basetype", "description", + "confidence", "source", "updated_at", + } + assert expected.issubset(cols), f"Missing columns: {expected - cols}" + finally: + conn.close() + + def test_v3_to_v4_migration(self, tmp_path, monkeypatch): + """Simulate a v3 database, call get_system_db(), verify it migrates to v4.""" + monkeypatch.setenv("DATA_DIR", str(tmp_path)) + import duckdb as _duckdb + from src.db import get_system_db, get_schema_version + + # Build a minimal v3 database manually + db_path = tmp_path / "state" / "system.duckdb" + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = _duckdb.connect(str(db_path)) + try: + conn.execute( + "CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);" + "INSERT INTO schema_version (version) VALUES (3);" + ) + # Create the tables that exist in v3 (minimal stubs) + conn.execute("CREATE TABLE IF NOT EXISTS table_registry (id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL, is_public BOOLEAN DEFAULT true)") + for ddl in [ + "CREATE TABLE IF NOT EXISTS users (id VARCHAR PRIMARY KEY, email VARCHAR)", + "CREATE TABLE IF NOT EXISTS sync_state (table_id VARCHAR PRIMARY KEY)", + "CREATE TABLE IF NOT EXISTS sync_history (id VARCHAR PRIMARY KEY, table_id VARCHAR)", + "CREATE TABLE IF NOT EXISTS user_sync_settings (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))", + "CREATE TABLE IF NOT EXISTS knowledge_items (id VARCHAR PRIMARY KEY, title VARCHAR)", + "CREATE TABLE IF NOT EXISTS knowledge_votes (item_id VARCHAR, user_id VARCHAR, PRIMARY KEY(item_id, user_id))", + "CREATE TABLE IF NOT EXISTS audit_log (id VARCHAR PRIMARY KEY, action VARCHAR)", + "CREATE TABLE IF NOT EXISTS telegram_links (user_id VARCHAR PRIMARY KEY, chat_id BIGINT)", + "CREATE TABLE IF NOT EXISTS pending_codes (code VARCHAR PRIMARY KEY, chat_id BIGINT)", + "CREATE TABLE IF NOT EXISTS script_registry (id VARCHAR PRIMARY KEY, name VARCHAR, source TEXT)", + "CREATE TABLE IF NOT EXISTS table_profiles (table_id VARCHAR PRIMARY KEY, profile JSON)", + "CREATE TABLE IF NOT EXISTS dataset_permissions (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))", + "CREATE TABLE IF NOT EXISTS access_requests (id VARCHAR PRIMARY KEY, user_id VARCHAR, user_email VARCHAR, table_id VARCHAR)", + ]: + conn.execute(ddl) + finally: + conn.close() + + conn2 = get_system_db() + try: + assert get_schema_version(conn2) == 4, f"Expected version 4, got {get_schema_version(conn2)}" + tables = { + row[0] + for row in conn2.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + } + assert "metric_definitions" in tables, "metric_definitions table missing after migration" + assert "column_metadata" in tables, "column_metadata table missing after migration" + finally: + conn2.close() + + class TestGetAnalyticsDbReadonly: def test_analytics_readonly_rejects_malicious_dir_name(self, tmp_path, monkeypatch): """Directories with SQL-injection chars in their name are skipped."""