diff --git a/app/auth/dependencies.py b/app/auth/dependencies.py index 10b3bf6..1642d0a 100644 --- a/app/auth/dependencies.py +++ b/app/auth/dependencies.py @@ -1,6 +1,5 @@ """FastAPI auth dependencies — current user, role checking.""" -from enum import Enum from typing import Optional import duckdb @@ -8,24 +7,10 @@ from fastapi import Depends, HTTPException, Header, Request, status from app.auth.jwt import verify_token from src.db import get_system_db +from src.rbac import Role, ROLE_HIERARCHY from src.repositories.users import UserRepository -class Role(str, Enum): - VIEWER = "viewer" - ANALYST = "analyst" - ADMIN = "admin" - KM_ADMIN = "km_admin" - - -ROLE_HIERARCHY = { - Role.VIEWER: 0, - Role.ANALYST: 1, - Role.KM_ADMIN: 2, - Role.ADMIN: 3, -} - - def _get_db(): conn = get_system_db() try: diff --git a/cli/skills/connectors.md b/cli/skills/connectors.md index 3b451a2..4b0eb9f 100644 --- a/cli/skills/connectors.md +++ b/cli/skills/connectors.md @@ -1,30 +1,53 @@ # Connectors — How to add a new data source ## Existing Connectors -- **Keboola** (`connectors/keboola/`) — Keboola Storage API -- **BigQuery** (`connectors/bigquery/`) — Google BigQuery -- **Jira** (`connectors/jira/`) — Jira webhook + API +- **Keboola** (`connectors/keboola/extractor.py`) — DuckDB Keboola extension, batch pull +- **BigQuery** (`connectors/bigquery/extractor.py`) — DuckDB BQ extension, remote-only +- **Jira** (`connectors/jira/`) — Webhook + incremental parquet transform + +## extract.duckdb Contract + +Every connector produces the same output: +``` +/data/extracts/{source_name}/ +├── extract.duckdb ← _meta table + views +└── data/ ← parquet files (local sources only) +``` + +The `_meta` table must have columns: +- `table_name VARCHAR` — view name +- `description VARCHAR` +- `rows BIGINT` +- `size_bytes BIGINT` +- `extracted_at TIMESTAMP` +- `query_mode VARCHAR` — 'local' (data here) or 'remote' (query on demand) ## Adding a New Connector -1. 
Create `connectors//adapter.py` implementing the `DataSource` ABC: +1. Create `connectors/{source_name}/extractor.py`: ```python - from src.data_sync import DataSource + import duckdb + from pathlib import Path - class MyDataSource(DataSource): - def sync_table(self, table_config, sync_state): ... - def discover_tables(self): ... - def get_column_metadata(self, table_id): ... - def get_source_name(self): ... + def run(output_dir: str, table_configs: list[dict], **kwargs): + output = Path(output_dir) + data_dir = output / "data" + data_dir.mkdir(parents=True, exist_ok=True) + + conn = duckdb.connect(str(output / "extract.duckdb")) + # Create _meta table + # For each table: COPY TO parquet, create view, insert _meta row + conn.close() ``` -2. The factory in `src/data_sync.py:create_data_source()` auto-discovers connectors. - Set `DATA_SOURCE=` in instance.yaml or .env. +2. Register tables in DuckDB `table_registry` via admin API or migration script. + Set `source_type` to your connector name. 3. Add required env vars to `.env` and `config/.env.template`. -4. Add tests to `tests/test__adapter.py`. +4. The SyncOrchestrator (`src/orchestrator.py`) will auto-discover your extract.duckdb. ## Configuration -Each connector reads credentials from environment variables. -Table definitions are in `docs/data_description.md` (YAML blocks). +- Instance-level config: `config/instance.yaml` (connection details) +- Table definitions: DuckDB `table_registry` table +- Credentials: environment variables diff --git a/src/rbac.py b/src/rbac.py new file mode 100644 index 0000000..a8d4819 --- /dev/null +++ b/src/rbac.py @@ -0,0 +1,97 @@ +"""Role-based access control — centralized permission checks using DuckDB. + +Replaces Linux group-based auth (sudo/data-ops → admin, dataread → analyst). +Used by both FastAPI (app/auth/dependencies.py) and Flask webapp (webapp/auth.py). 
+""" + +from enum import Enum +from typing import Optional + +from src.db import get_system_db +from src.repositories.users import UserRepository + + +class Role(str, Enum): + VIEWER = "viewer" + ANALYST = "analyst" + KM_ADMIN = "km_admin" + ADMIN = "admin" + + +ROLE_HIERARCHY = { + Role.VIEWER: 0, + Role.ANALYST: 1, + Role.KM_ADMIN: 2, + Role.ADMIN: 3, +} + + +def get_user_role(email: str) -> Role: + """Get role for a user by email. Returns VIEWER if not found.""" + conn = get_system_db() + try: + repo = UserRepository(conn) + user = repo.get_by_email(email) + if user: + try: + return Role(user.get("role", "viewer")) + except ValueError: + return Role.VIEWER + return Role.VIEWER + finally: + conn.close() + + +def has_role(email: str, minimum_role: Role) -> bool: + """Check if user has at least the given role level.""" + user_role = get_user_role(email) + return ROLE_HIERARCHY.get(user_role, 0) >= ROLE_HIERARCHY.get(minimum_role, 0) + + +def is_admin(email: str) -> bool: + """Check if user is an admin.""" + return has_role(email, Role.ADMIN) + + +def is_km_admin(email: str) -> bool: + """Check if user is a KM admin or higher.""" + return has_role(email, Role.KM_ADMIN) + + +def is_analyst(email: str) -> bool: + """Check if user is an analyst or higher.""" + return has_role(email, Role.ANALYST) + + +def has_dataset_access(email: str, dataset: str) -> bool: + """Check if user has access to a specific dataset. + + Admins have access to all datasets. + Other users need explicit permission in dataset_permissions table. + """ + if is_admin(email): + return True + + conn = get_system_db() + try: + user = UserRepository(conn).get_by_email(email) + if not user: + return False + from src.repositories.sync_settings import DatasetPermissionRepository + return DatasetPermissionRepository(conn).has_access(user["id"], dataset) + finally: + conn.close() + + +def set_user_role(email: str, role: Role) -> bool: + """Set role for a user. 
Returns True if successful.""" + conn = get_system_db() + try: + repo = UserRepository(conn) + user = repo.get_by_email(email) + if not user: + return False + repo.update(user["id"], role=role.value) + return True + finally: + conn.close() diff --git a/tests/test_rbac.py b/tests/test_rbac.py new file mode 100644 index 0000000..45fa6ec --- /dev/null +++ b/tests/test_rbac.py @@ -0,0 +1,84 @@ +"""Tests for src/rbac.py — role-based access control.""" + +import os +import pytest + + +@pytest.fixture +def setup_db(tmp_path): + os.environ["DATA_DIR"] = str(tmp_path) + from src.db import get_system_db + from src.repositories.users import UserRepository + + conn = get_system_db() + repo = UserRepository(conn) + repo.create(id="admin1", email="admin@test.com", name="Admin", role="admin") + repo.create(id="analyst1", email="analyst@test.com", name="Analyst", role="analyst") + repo.create(id="km1", email="km@test.com", name="KM Admin", role="km_admin") + repo.create(id="viewer1", email="viewer@test.com", name="Viewer", role="viewer") + conn.close() + yield + + +class TestGetUserRole: + def test_admin(self, setup_db): + from src.rbac import get_user_role, Role + assert get_user_role("admin@test.com") == Role.ADMIN + + def test_analyst(self, setup_db): + from src.rbac import get_user_role, Role + assert get_user_role("analyst@test.com") == Role.ANALYST + + def test_unknown_user(self, setup_db): + from src.rbac import get_user_role, Role + assert get_user_role("nobody@test.com") == Role.VIEWER + + +class TestHasRole: + def test_admin_has_all_roles(self, setup_db): + from src.rbac import has_role, Role + assert has_role("admin@test.com", Role.VIEWER) + assert has_role("admin@test.com", Role.ANALYST) + assert has_role("admin@test.com", Role.KM_ADMIN) + assert has_role("admin@test.com", Role.ADMIN) + + def test_analyst_cant_admin(self, setup_db): + from src.rbac import has_role, Role + assert has_role("analyst@test.com", Role.ANALYST) + assert not has_role("analyst@test.com", 
Role.ADMIN) + + def test_viewer_is_minimal(self, setup_db): + from src.rbac import has_role, Role + assert has_role("viewer@test.com", Role.VIEWER) + assert not has_role("viewer@test.com", Role.ANALYST) + + +class TestConvenienceFunctions: + def test_is_admin(self, setup_db): + from src.rbac import is_admin + assert is_admin("admin@test.com") + assert not is_admin("analyst@test.com") + + def test_is_km_admin(self, setup_db): + from src.rbac import is_km_admin + assert is_km_admin("km@test.com") + assert is_km_admin("admin@test.com") # admin >= km_admin + assert not is_km_admin("analyst@test.com") + + def test_is_analyst(self, setup_db): + from src.rbac import is_analyst + assert is_analyst("analyst@test.com") + assert is_analyst("admin@test.com") + assert not is_analyst("viewer@test.com") + + +class TestSetUserRole: + def test_set_role(self, setup_db): + from src.rbac import set_user_role, get_user_role, Role + assert get_user_role("viewer@test.com") == Role.VIEWER + assert set_user_role("viewer@test.com", Role.ADMIN) + assert get_user_role("viewer@test.com") == Role.ADMIN + + def test_set_role_nonexistent(self, setup_db): + from src.rbac import set_user_role, Role + assert not set_user_role("nobody@test.com", Role.ADMIN) diff --git a/webapp/auth.py b/webapp/auth.py index 12eceeb..e3e50c1 100644 --- a/webapp/auth.py +++ b/webapp/auth.py @@ -37,7 +37,7 @@ def login_required(f): def admin_required(f): """Decorator to require admin privileges for a route. - Recomputes admin status server-side on every request. + Checks role in DuckDB users table via src/rbac.py. Returns 403 JSON for API routes, redirect for HTML routes. 
""" @@ -48,13 +48,10 @@ def admin_required(f): return jsonify({"error": "Authentication required"}), 401 return redirect(url_for("auth.login")) - from .user_service import check_user_exists, get_webapp_username + from src.rbac import is_admin email = session.get("user", {}).get("email", "") - username = get_webapp_username(email) - user_info = check_user_exists(username) - - if not user_info.is_admin: + if not is_admin(email): if request.path.startswith("/api/"): return jsonify({"error": "Admin access required"}), 403 flash("Admin access required.", "error") @@ -68,7 +65,7 @@ def admin_required(f): def km_admin_required(f): """Decorator to require Corporate Memory admin privileges for a route. - Checks km_admin flag via corporate_memory_service.is_km_admin(). + Checks role in DuckDB users table via src/rbac.py. Returns 403 JSON for API routes, redirect for HTML routes. """ @@ -79,7 +76,7 @@ def km_admin_required(f): return jsonify({"error": "Authentication required"}), 401 return redirect(url_for("auth.login")) - from .corporate_memory_service import is_km_admin + from src.rbac import is_km_admin email = session.get("user", {}).get("email", "") if not is_km_admin(email):