feat: implement data access control — table-level permissions

Schema v3: add is_public column to table_registry (default true).

src/rbac.py: can_access_table() checks admin bypass, public flag,
explicit permissions, wildcard bucket permissions.

API enforcement:
- manifest: filters tables by user access
- download: 403 if no access
- catalog: filters table list
- query: validates referenced tables against allowed list

New admin permissions API (/api/admin/permissions) for grant/revoke.

28 access control tests + 733 total tests passing.
This commit is contained in:
ZdenekSrotyr 2026-03-31 12:33:31 +02:00
parent 78f003f5b5
commit 1074d5ec49
14 changed files with 964 additions and 21 deletions

View file

@ -11,6 +11,7 @@ import yaml
from app.auth.dependencies import get_current_user, _get_db
from src.repositories.profiles import ProfileRepository
from src.rbac import can_access_table
router = APIRouter(prefix="/api/catalog", tags=["catalog"])
@ -52,6 +53,11 @@ async def list_catalog_tables(
from src.repositories.table_registry import TableRegistryRepository
repo = TableRegistryRepository(conn)
all_tables = repo.list_all()
# Filter by user's accessible tables (admin sees all)
if user.get("role") != "admin":
all_tables = [t for t in all_tables if can_access_table(user, t["id"], conn)]
tables = [
{
"id": t["id"],

View file

@ -5,8 +5,10 @@ from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import FileResponse
import duckdb
from app.auth.dependencies import get_current_user
from app.auth.dependencies import get_current_user, _get_db
from src.rbac import can_access_table
router = APIRouter(prefix="/api/data", tags=["data"])
@ -20,8 +22,13 @@ async def download_table(
table_id: str,
request: Request,
user: dict = Depends(get_current_user),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""Stream a parquet file for download. Supports ETag for caching."""
# Check access FIRST
if not can_access_table(user, table_id, conn):
raise HTTPException(status_code=403, detail="Access denied to this table")
data_dir = _get_data_dir()
# Search in extracts directory (v2 extract.duckdb architecture)

69
app/api/permissions.py Normal file
View file

@ -0,0 +1,69 @@
"""Admin permissions API — grant/revoke dataset access."""
import logging
from typing import Optional, List
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
import duckdb
from app.auth.dependencies import require_role, get_current_user, Role, _get_db
from src.repositories.sync_settings import DatasetPermissionRepository
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/admin/permissions", tags=["permissions"])
class PermissionRequest(BaseModel):
user_id: str
dataset: str # table_id, bucket wildcard, or dataset group
access: str = "read" # "read" or "none"
@router.post("", status_code=201)
async def grant_permission(
request: PermissionRequest,
user: dict = Depends(require_role(Role.ADMIN)),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""Grant a user access to a dataset/table."""
repo = DatasetPermissionRepository(conn)
repo.grant(request.user_id, request.dataset, request.access)
return {"user_id": request.user_id, "dataset": request.dataset, "access": request.access}
@router.delete("")
async def revoke_permission(
request: PermissionRequest,
user: dict = Depends(require_role(Role.ADMIN)),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""Revoke a user's access to a dataset/table."""
repo = DatasetPermissionRepository(conn)
repo.revoke(request.user_id, request.dataset)
return {"revoked": True}
@router.get("/{user_id}")
async def get_user_permissions(
user_id: str,
user: dict = Depends(require_role(Role.ADMIN)),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""List all permissions for a user."""
repo = DatasetPermissionRepository(conn)
permissions = repo.get_user_permissions(user_id)
return {"user_id": user_id, "permissions": permissions}
@router.get("")
async def list_all_permissions(
user: dict = Depends(require_role(Role.ADMIN)),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""List all dataset permissions."""
results = conn.execute("SELECT * FROM dataset_permissions ORDER BY user_id, dataset").fetchall()
if not results:
return {"permissions": []}
columns = [desc[0] for desc in conn.description]
return {"permissions": [dict(zip(columns, row)) for row in results]}

View file

@ -5,9 +5,11 @@ from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
import duckdb
from app.auth.dependencies import get_current_user
from app.auth.dependencies import get_current_user, _get_db
from src.db import get_analytics_db
from src.rbac import get_accessible_tables
router = APIRouter(prefix="/api/query", tags=["query"])
@ -28,6 +30,7 @@ class QueryResponse(BaseModel):
async def execute_query(
request: QueryRequest,
user: dict = Depends(get_current_user),
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
"""Execute SQL against the server analytics DuckDB."""
sql_lower = request.sql.strip().lower()
@ -49,11 +52,26 @@ async def execute_query(
if not sql_lower.startswith("select ") and not sql_lower.startswith("with "):
raise HTTPException(status_code=400, detail="Query must start with SELECT or WITH")
conn = get_analytics_db()
# Get allowed tables for this user
allowed = get_accessible_tables(user, conn)
analytics = get_analytics_db()
try:
if allowed is not None: # None = admin, sees all
# Get all views in analytics DB
all_views = {row[0] for row in analytics.execute(
"SELECT table_name FROM information_schema.tables WHERE table_type='VIEW'"
).fetchall()}
# Check if query references any forbidden tables
forbidden = all_views - set(allowed)
for table in forbidden:
if table.lower() in sql_lower:
raise HTTPException(status_code=403, detail=f"Access denied to table '{table}'")
# Open in read-only mode for extra safety
result = conn.execute(request.sql).fetchmany(request.limit + 1)
columns = [desc[0] for desc in conn.description] if conn.description else []
result = analytics.execute(request.sql).fetchmany(request.limit + 1)
columns = [desc[0] for desc in analytics.description] if analytics.description else []
truncated = len(result) > request.limit
rows = result[:request.limit]
# Convert to serializable types
@ -69,7 +87,9 @@ async def execute_query(
row_count=len(serializable_rows),
truncated=truncated,
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=400, detail=f"Query error: {str(e)}")
finally:
conn.close()
analytics.close()

View file

@ -15,6 +15,7 @@ import duckdb
from app.auth.dependencies import get_current_user, require_role, Role, _get_db
from src.repositories.sync_state import SyncStateRepository
from src.repositories.sync_settings import SyncSettingsRepository, DatasetPermissionRepository
from src.rbac import can_access_table
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/sync", tags=["sync"])
@ -97,20 +98,16 @@ async def sync_manifest(
):
"""Return hash-based manifest of all synced data, filtered per user."""
repo = SyncStateRepository(conn)
perm_repo = DatasetPermissionRepository(conn)
all_states = repo.get_all_states()
# Filter by user's accessible datasets (admin sees all)
user_role = user.get("role", "viewer")
accessible = None
if user_role != "admin":
accessible = set(perm_repo.get_accessible_datasets(user["id"]))
# Filter by user's accessible tables (admin sees all)
if user.get("role") != "admin":
all_states = [s for s in all_states if can_access_table(user, s["table_id"], conn)]
data_dir = _get_data_dir()
tables = {}
for state in all_states:
table_id = state["table_id"]
# If user has limited access, filter tables (simplified: by table_id prefix)
tables[table_id] = {
"hash": state.get("hash", ""),
"updated": state.get("last_sync").isoformat() if state.get("last_sync") else None,

View file

@ -23,6 +23,7 @@ from app.api.settings import router as settings_router
from app.api.catalog import router as catalog_router
from app.api.telegram import router as telegram_router
from app.api.admin import router as admin_router
from app.api.permissions import router as permissions_router
from app.web.router import router as web_router
logger = logging.getLogger(__name__)
@ -101,6 +102,7 @@ def create_app() -> FastAPI:
app.include_router(catalog_router)
app.include_router(telegram_router)
app.include_router(admin_router)
app.include_router(permissions_router)
# Web UI router (must be last — has catch-all routes)
app.include_router(web_router)

View file

@ -0,0 +1,202 @@
# Data Access Control — Spec
**Date:** 2026-03-31
**Status:** Draft
## 1. Problem
V novém systému (API místo rsync) nemáme ekvivalent rsync filtru. Každý přihlášený uživatel vidí a stáhne všechny tabulky. V produkci to řeší filesystem permissions + per-user rsync filter.
## 2. Současný model (produkce, rsync)
```
Server: /data/src_data/parquet/
├── crm/orders.parquet ← dataread group
├── crm/customers.parquet ← dataread group
├── private/salaries.parquet ← data-private group only
└── jira/issues/2026-03.parquet ← dataread group
Analytik (sync_data.sh):
1. Webapp generuje ~/.sync_rsync_filter (include/exclude per tabulka)
2. rsync --filter="merge ~/.sync_rsync_filter" stáhne jen povolené
3. AI agent pracuje s lokálními soubory → vidí jen to co se stáhlo
```
Tři vrstvy:
- **Linux skupiny** (dataread, data-private) → hrubé řízení
- **Datasety** (opt-in v instance.yaml) → celé skupiny tabulek
- **Per-table subscription** (explicit mode) → jednotlivé tabulky
## 3. Nový model (API)
Princip zůstává: **uživatel vidí jen to, k čemu má explicitní přístup**.
### 3.1 Datový model
Stávající tabulka `dataset_permissions` v DuckDB:
```sql
CREATE TABLE dataset_permissions (
user_id VARCHAR NOT NULL,
dataset VARCHAR NOT NULL, -- table_id nebo dataset group name
access VARCHAR DEFAULT 'read', -- 'read', 'none'
PRIMARY KEY (user_id, dataset)
);
```
`dataset` může být:
- **Table ID** (`circle`, `chart_of_accounts`) — přístup k jedné tabulce
- **Wildcard/group** (`in.c-finance.*`) — přístup ke všem tabulkám v bucketu
- **Dataset name** (`jira`, `finance`) — pojmenovaná skupina z instance.yaml
### 3.2 Pravidla přístupu
```
Admin → vidí vše (bypass permissions)
Ostatní → vidí jen tabulky kde:
1. Existuje explicitní permission (dataset_permissions.access = 'read')
2. NEBO tabulka patří do povoleného datasetu/bucketu
3. NEBO je tabulka public (nový flag v table_registry)
```
### 3.3 Nový sloupec v table_registry
```sql
ALTER TABLE table_registry ADD COLUMN is_public BOOLEAN DEFAULT true;
```
- `is_public = true` → každý přihlášený uživatel vidí (default, zpětně kompatibilní)
- `is_public = false` → vyžaduje explicitní permission
## 4. Kde se kontroluje
### 4.1 Manifest (`GET /api/sync/manifest`)
```python
# Současný kód (NEFUNGUJE):
accessible = set(perm_repo.get_accessible_datasets(user["id"]))
# ... ale nikdy nefiltruje
# Nový kód:
all_states = repo.get_all_states()
if user["role"] != "admin":
all_states = [s for s in all_states if _user_can_access(user, s["table_id"])]
```
### 4.2 Download (`GET /api/data/{table}/download`)
```python
# Současný kód (ŽÁDNÁ KONTROLA):
return FileResponse(path=file_path)
# Nový kód:
if not _user_can_access(user, table_id):
raise HTTPException(403, "Access denied")
return FileResponse(path=file_path)
```
### 4.3 Query (`POST /api/query`)
```python
# Současný kód: otevře analytics.duckdb s VŠEMI views
# Nový kód: vytvořit per-user filtered connection
# Varianta A: CREATE TEMP VIEW pro povolené tabulky
# Varianta B: Dynamicky generovat allowed list, validovat SQL against it
```
Query je nejtěžší — uživatel může napsat `SELECT * FROM salaries` a pokud view existuje v analytics.duckdb, data se vrátí. Řešení:
**Varianta A — Filtered views (doporučeno):**
Per-request vytvoření in-memory DuckDB, ATTACH analytics.duckdb, vytvořit views jen pro povolené tabulky. Overhead ~10ms.
**Varianta B — SQL validation:**
Parsovat SQL, extrahovat referenced tables, ověřit proti allowed list. Křehké (sub-queries, CTEs, aliasy).
### 4.4 Catalog (`GET /api/catalog/tables`)
```python
# Filtrovat jako manifest — uživatel vidí metadata jen povolených tabulek
if user["role"] != "admin":
tables = [t for t in tables if _user_can_access(user, t["id"])]
```
## 5. Shared helper
```python
# src/rbac.py — rozšíření
def can_access_table(user: dict, table_id: str) -> bool:
"""Check if user can access a specific table."""
# Admin bypass
if user.get("role") == "admin":
return True
# Check if table is public
table = TableRegistryRepository(conn).get(table_id)
if table and table.get("is_public", True):
return True
# Check explicit permission
user_id = user["id"]
if DatasetPermissionRepository(conn).has_access(user_id, table_id):
return True
# Check wildcard/bucket permission (e.g., "in.c-finance.*")
bucket = table.get("bucket", "") if table else ""
if bucket and DatasetPermissionRepository(conn).has_access(user_id, f"{bucket}.*"):
return True
return False
```
## 6. Admin API pro permissions
```
POST /api/admin/permissions — grant access
DELETE /api/admin/permissions — revoke access
GET /api/admin/permissions/{user_id} — list user's permissions
GET /api/admin/permissions — list all (admin only)
POST body: {"user_id": "...", "dataset": "circle", "access": "read"}
```
## 7. Migrace
### Pro existující instance:
1. Všechny stávající tabulky: `is_public = true` (zachová současné chování)
2. Admin nastaví `is_public = false` pro citlivé tabulky
3. Přidá explicitní permissions pro uživatele
### Pro nové instance:
- Default `is_public = true` → otevřený model (jako teď)
- Admin může přepnout na uzavřený: `is_public = false` per tabulka
## 8. CLI (`da sync`)
```
da sync
→ GET /api/sync/manifest (vrátí jen povolené tabulky)
→ pro každou tabulku: GET /api/data/{table}/download
→ rebuild lokální DuckDB jen z povolených parquetů
→ AI agent vidí jen to co se stáhlo
```
Identický princip jako rsync filter — ale filtr je server-side v API, ne v souboru.
## 9. Co se NEMĚNÍ
- Role hierarchy (viewer < analyst < km_admin < admin)
- Admin vidí vše
- JWT auth flow
- Orchestrator + extractory (server-side, vidí vše)
- Sync trigger (admin-only, stahuje vše na server)
## 10. Implementační pořadí
1. `is_public` sloupec v table_registry (schema v3)
2. `can_access_table()` helper v src/rbac.py
3. Filtrování v manifest + download + catalog
4. Admin permissions API
5. Query endpoint — filtered views
6. Testy

View file

@ -9,7 +9,7 @@ from pathlib import Path
import duckdb
SCHEMA_VERSION = 2
SCHEMA_VERSION = 3
_SYSTEM_SCHEMA = """
CREATE TABLE IF NOT EXISTS schema_version (
@ -133,6 +133,7 @@ CREATE TABLE IF NOT EXISTS table_registry (
folder VARCHAR,
description TEXT,
registered_by VARCHAR,
is_public BOOLEAN DEFAULT true,
registered_at TIMESTAMP DEFAULT current_timestamp
);
@ -180,6 +181,10 @@ _V1_TO_V2_MIGRATIONS = [
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS profile_after_sync BOOLEAN DEFAULT true",
]
_V2_TO_V3_MIGRATIONS = [
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS is_public BOOLEAN DEFAULT true",
]
def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
"""Create tables if they don't exist. Apply migrations if schema version changed."""
@ -195,6 +200,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
if current < 2:
for sql in _V1_TO_V2_MIGRATIONS:
conn.execute(sql)
if current < 3:
for sql in _V2_TO_V3_MIGRATIONS:
conn.execute(sql)
conn.execute(
"UPDATE schema_version SET version = ?, applied_at = current_timestamp",
[SCHEMA_VERSION],

View file

@ -83,6 +83,76 @@ def has_dataset_access(email: str, dataset: str) -> bool:
conn.close()
def can_access_table(user: dict, table_id: str, conn=None) -> bool:
"""Check if user can access a specific table.
Rules:
1. Admin -> always True
2. Table is_public=True -> always True
3. Explicit permission in dataset_permissions -> True
4. Wildcard bucket permission (e.g., 'in.c-finance.*') -> True
5. Otherwise -> False
"""
if user.get("role") == "admin":
return True
should_close = False
if conn is None:
conn = get_system_db()
should_close = True
try:
from src.repositories.table_registry import TableRegistryRepository
from src.repositories.sync_settings import DatasetPermissionRepository
# Check if table is public
table = TableRegistryRepository(conn).get(table_id)
if table and table.get("is_public", True):
return True
user_id = user.get("id", "")
perm_repo = DatasetPermissionRepository(conn)
# Check explicit permission
if perm_repo.has_access(user_id, table_id):
return True
# Check wildcard bucket permission
bucket = table.get("bucket", "") if table else ""
if bucket and perm_repo.has_access(user_id, f"{bucket}.*"):
return True
return False
finally:
if should_close:
conn.close()
def get_accessible_tables(user: dict, conn=None) -> list[str]:
"""Get list of table IDs the user can access. Used for filtering."""
if user.get("role") == "admin":
return None # None means "all" — admin bypass
should_close = False
if conn is None:
conn = get_system_db()
should_close = True
try:
from src.repositories.table_registry import TableRegistryRepository
repo = TableRegistryRepository(conn)
all_tables = repo.list_all()
accessible = []
for t in all_tables:
if can_access_table(user, t["id"], conn):
accessible.append(t["id"])
return accessible
finally:
if should_close:
conn.close()
def set_user_role(email: str, role: Role) -> bool:
"""Set role for a user. Returns True if successful."""
conn = get_system_db()

View file

@ -17,23 +17,25 @@ class TableRegistryRepository:
source_type: Optional[str] = None, bucket: Optional[str] = None,
source_table: Optional[str] = None, query_mode: str = "local",
sync_schedule: Optional[str] = None, profile_after_sync: bool = True,
is_public: bool = True,
) -> None:
now = datetime.now(timezone.utc)
self.conn.execute(
"""INSERT INTO table_registry (id, name, folder, sync_strategy,
primary_key, description, registered_by, registered_at,
source_type, bucket, source_table, query_mode,
sync_schedule, profile_after_sync)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
sync_schedule, profile_after_sync, is_public)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO UPDATE SET
name = excluded.name, folder = excluded.folder,
sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key,
description = excluded.description, registered_at = excluded.registered_at,
source_type = excluded.source_type, bucket = excluded.bucket,
source_table = excluded.source_table, query_mode = excluded.query_mode,
sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync""",
sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync,
is_public = excluded.is_public""",
[id, name, folder, sync_strategy, primary_key, description, registered_by, now,
source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync],
source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync, is_public],
)
def unregister(self, table_id: str) -> None:

View file

@ -0,0 +1,554 @@
"""E2E access control tests — verify data isolation between users."""
import os
import duckdb
import pytest
from tests.conftest import create_mock_extract
def _auth(token):
return {"Authorization": f"Bearer {token}"}
class TestPublicTablesAccessible:
"""Default: is_public=True tables are accessible to everyone."""
def test_analyst_sees_public_tables_in_manifest(self, seeded_app):
"""Analyst can see public tables in manifest."""
c = seeded_app["client"]
env = seeded_app["env"]
# Create extract with data
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Register table as public (default)
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
# Analyst should see it
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 200
def test_analyst_can_download_public_table(self, seeded_app):
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
c = seeded_app["client"]
# Register table so access control recognizes it as public
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
resp = c.get("/api/data/orders/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 200
def test_admin_can_download_public_table(self, seeded_app):
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
c = seeded_app["client"]
resp = c.get("/api/data/orders/download", headers=_auth(seeded_app["admin_token"]))
assert resp.status_code == 200
def test_public_table_visible_in_catalog(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
resp = c.get("/api/catalog/tables", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 200
names = {t["name"] for t in resp.json()["tables"]}
assert "orders" in names
class TestPrivateTablesRestricted:
"""Tables with is_public=False require explicit permission."""
def test_analyst_cannot_see_private_table_in_manifest(self, seeded_app):
"""Private table hidden from manifest for unauthorized user."""
c = seeded_app["client"]
env = seeded_app["env"]
# Create extract
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Register as public first (default), then make private
c.post("/api/admin/register-table", json={
"name": "salaries", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
# Make private via direct DB update
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'salaries'")
conn.close()
# Analyst should NOT see it in manifest
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["analyst_token"]))
assert "salaries" not in resp.json().get("tables", {})
# Admin SHOULD see it
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["admin_token"]))
assert resp.status_code == 200
# Admin sees all — salaries should not be filtered out
def test_analyst_blocked_from_downloading_private_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Make private
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
def test_admin_can_download_private_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["admin_token"]))
assert resp.status_code == 200
def test_mixed_public_private_manifest(self, seeded_app):
"""Manifest shows public tables but hides private ones for analyst."""
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Register both
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
c.post("/api/admin/register-table", json={
"name": "salaries", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
# Make salaries private
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'salaries'")
conn.close()
# Analyst sees orders but not salaries
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["analyst_token"]))
tables = resp.json().get("tables", {})
assert "orders" in tables
assert "salaries" not in tables
class TestExplicitPermissions:
"""Granting explicit access to private tables."""
def test_grant_then_access(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Make private
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
# Analyst blocked
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
# Admin grants access
resp = c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "salaries", "access": "read",
}, headers=_auth(seeded_app["admin_token"]))
assert resp.status_code == 201
# Now analyst CAN download
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 200
def test_revoke_removes_access(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
# Grant
c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "salaries", "access": "read",
}, headers=_auth(seeded_app["admin_token"]))
# Verify access
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 200
# Revoke
c.request("DELETE", "/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "salaries",
}, headers=_auth(seeded_app["admin_token"]))
# Now blocked again
resp = c.get("/api/data/salaries/download", headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
def test_grant_makes_private_table_visible_in_manifest(self, seeded_app):
"""After granting access, analyst sees private table in manifest."""
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
c.post("/api/admin/register-table", json={
"name": "salaries", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'salaries'")
conn.close()
# Not visible before grant
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["analyst_token"]))
assert "salaries" not in resp.json().get("tables", {})
# Grant access
c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "salaries", "access": "read",
}, headers=_auth(seeded_app["admin_token"]))
# Now visible
resp = c.get("/api/sync/manifest", headers=_auth(seeded_app["analyst_token"]))
assert "salaries" in resp.json().get("tables", {})
class TestCatalogFiltering:
"""Catalog only shows accessible tables."""
def test_private_table_hidden_from_catalog(self, seeded_app):
c = seeded_app["client"]
# Register public + private
c.post("/api/admin/register-table", json={"name": "public_table"}, headers=_auth(seeded_app["admin_token"]))
c.post("/api/admin/register-table", json={"name": "private_table"}, headers=_auth(seeded_app["admin_token"]))
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'private_table'")
conn.close()
resp = c.get("/api/catalog/tables", headers=_auth(seeded_app["analyst_token"]))
names = {t["name"] for t in resp.json()["tables"]}
assert "public_table" in names
assert "private_table" not in names
def test_admin_sees_all_in_catalog(self, seeded_app):
c = seeded_app["client"]
c.post("/api/admin/register-table", json={"name": "public_table"}, headers=_auth(seeded_app["admin_token"]))
c.post("/api/admin/register-table", json={"name": "private_table"}, headers=_auth(seeded_app["admin_token"]))
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'private_table'")
conn.close()
resp = c.get("/api/catalog/tables", headers=_auth(seeded_app["admin_token"]))
names = {t["name"] for t in resp.json()["tables"]}
assert "public_table" in names
assert "private_table" in names
def test_granted_private_table_shown_in_catalog(self, seeded_app):
"""After granting access, private table appears in catalog for that user."""
c = seeded_app["client"]
c.post("/api/admin/register-table", json={"name": "secret_data"}, headers=_auth(seeded_app["admin_token"]))
from src.db import get_system_db
conn = get_system_db()
conn.execute("UPDATE table_registry SET is_public = false WHERE name = 'secret_data'")
conn.close()
# Not visible before grant
resp = c.get("/api/catalog/tables", headers=_auth(seeded_app["analyst_token"]))
names = {t["name"] for t in resp.json()["tables"]}
assert "secret_data" not in names
# Grant access
c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "secret_data", "access": "read",
}, headers=_auth(seeded_app["admin_token"]))
# Now visible
resp = c.get("/api/catalog/tables", headers=_auth(seeded_app["analyst_token"]))
names = {t["name"] for t in resp.json()["tables"]}
assert "secret_data" in names
class TestPermissionsAPI:
"""Admin permissions CRUD."""
def test_grant_and_list(self, seeded_app):
c = seeded_app["client"]
h = _auth(seeded_app["admin_token"])
resp = c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "secret_data", "access": "read",
}, headers=h)
assert resp.status_code == 201
resp = c.get("/api/admin/permissions/analyst1", headers=h)
assert resp.status_code == 200
datasets = {p["dataset"] for p in resp.json()["permissions"]}
assert "secret_data" in datasets
def test_analyst_cannot_manage_permissions(self, seeded_app):
c = seeded_app["client"]
resp = c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "anything",
}, headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
def test_grant_multiple_datasets(self, seeded_app):
c = seeded_app["client"]
h = _auth(seeded_app["admin_token"])
for ds in ["sales", "hr", "finance"]:
resp = c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": ds, "access": "read",
}, headers=h)
assert resp.status_code == 201
resp = c.get("/api/admin/permissions/analyst1", headers=h)
datasets = {p["dataset"] for p in resp.json()["permissions"]}
assert datasets == {"sales", "hr", "finance"}
def test_revoke_via_delete(self, seeded_app):
c = seeded_app["client"]
h = _auth(seeded_app["admin_token"])
c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "secret_data", "access": "read",
}, headers=h)
resp = c.request("DELETE", "/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "secret_data",
}, headers=h)
assert resp.status_code == 200
resp = c.get("/api/admin/permissions/analyst1", headers=h)
datasets = {p["dataset"] for p in resp.json()["permissions"]}
assert "secret_data" not in datasets
def test_analyst_cannot_revoke_permissions(self, seeded_app):
c = seeded_app["client"]
resp = c.request("DELETE", "/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "anything",
}, headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
class TestQueryFiltering:
"""Query endpoint respects access control."""
def test_analyst_blocked_from_querying_private_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
# Create extract with private data
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
resp = c.post("/api/query", json={"sql": "SELECT * FROM salaries"},
headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code == 403
def test_admin_can_query_private_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
resp = c.post("/api/query", json={"sql": "SELECT * FROM salaries"},
headers=_auth(seeded_app["admin_token"]))
# Admin should not be blocked by access control
assert resp.status_code != 403
def test_analyst_can_query_public_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1", "total": "99.99"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
# Register table so access control recognizes it as public
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola",
}, headers=_auth(seeded_app["admin_token"]))
resp = c.post("/api/query", json={"sql": "SELECT * FROM orders"},
headers=_auth(seeded_app["analyst_token"]))
# Public table should not be blocked
assert resp.status_code != 403
def test_granted_analyst_can_query_private_table(self, seeded_app):
c = seeded_app["client"]
env = seeded_app["env"]
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "salaries", "data": [{"id": "1", "amount": "100000"}]},
])
from src.orchestrator import SyncOrchestrator
SyncOrchestrator().rebuild()
from src.db import get_system_db
conn = get_system_db()
conn.execute(
"INSERT INTO table_registry (id, name, is_public) VALUES ('salaries','salaries',false) "
"ON CONFLICT(id) DO UPDATE SET is_public=false"
)
conn.close()
# Grant access
c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "salaries", "access": "read",
}, headers=_auth(seeded_app["admin_token"]))
resp = c.post("/api/query", json={"sql": "SELECT * FROM salaries"},
headers=_auth(seeded_app["analyst_token"]))
assert resp.status_code != 403
class TestUnauthenticatedAccess:
"""Endpoints require authentication."""
def test_manifest_requires_auth(self, seeded_app):
c = seeded_app["client"]
resp = c.get("/api/sync/manifest")
assert resp.status_code in (401, 403)
def test_download_requires_auth(self, seeded_app):
c = seeded_app["client"]
resp = c.get("/api/data/orders/download")
assert resp.status_code in (401, 403)
def test_catalog_requires_auth(self, seeded_app):
c = seeded_app["client"]
resp = c.get("/api/catalog/tables")
assert resp.status_code in (401, 403)
def test_query_requires_auth(self, seeded_app):
c = seeded_app["client"]
resp = c.post("/api/query", json={"sql": "SELECT 1"})
assert resp.status_code in (401, 403)
def test_permissions_api_requires_auth(self, seeded_app):
c = seeded_app["client"]
resp = c.post("/api/admin/permissions", json={
"user_id": "analyst1", "dataset": "anything",
})
assert resp.status_code in (401, 403)

View file

@ -59,7 +59,7 @@ class TestGetSchemaVersion:
conn = get_system_db()
try:
assert get_schema_version(conn) == 2
assert get_schema_version(conn) == 3
finally:
conn.close()
@ -113,7 +113,7 @@ class TestV1ToV2Migration:
from src.db import get_system_db, get_schema_version
conn2 = get_system_db()
try:
assert get_schema_version(conn2) == 2
assert get_schema_version(conn2) == 3
# Verify old data preserved
row = conn2.execute("SELECT name, folder FROM table_registry WHERE id='t1'").fetchone()
assert row[0] == "Test"

View file

@ -136,6 +136,12 @@ class TestRBACEnforcement:
create_mock_extract(env["extracts_dir"], "keboola", [
{"name": "orders", "data": [{"id": "1"}]},
])
# Register the table so RBAC can check is_public (defaults to True)
admin_t = seeded_app["admin_token"]
c.post("/api/admin/register-table", json={
"name": "orders", "source_type": "keboola", "bucket": "in.c-crm",
"source_table": "orders", "query_mode": "local",
}, headers=_auth(admin_t))
t = seeded_app["analyst_token"]
resp = c.get("/api/data/orders/download", headers=_auth(t))
assert resp.status_code == 200

View file

@ -263,7 +263,7 @@ class TestSchemaMigration:
from src.db import get_system_db, get_schema_version
conn2 = get_system_db()
assert get_schema_version(conn2) == 2
assert get_schema_version(conn2) == 3
# Old data preserved
old = conn2.execute("SELECT name, folder FROM table_registry WHERE id='old_table'").fetchone()