* fix(api): harden API surface before Swagger — 9 findings from issue #336 ADV-001: POST /api/sync/table-subscriptions now checks can_access() per table entry, matching the gate already on POST /api/sync/settings. ADV-002: GET /webhooks/jira/health gated behind require_admin; jira_domain removed from response to prevent anonymous info disclosure. ADV-003: GET /api/version no longer exposes commit_sha or schema_version. ADV-005: /docs, /redoc, /openapi.json now require a valid session via custom FastAPI routes (docs_url=None, redoc_url=None, openapi_url=None). ADV-006: /cli/ and /webhooks/ added to _API_PATH_PREFIXES so future auth-gated routes there return JSON 401 not an HTML redirect. ADV-007: GET /api/catalog/tables wired to CatalogTablesResponse model. ADV-008: TableSubscriptionUpdate.tables capped at max_length=500. ADV-009: GET /api/users and GET /auth/admin/tokens accept limit/offset (default 1000, max 10000); repositories updated accordingly. Tests: 11 new regression tests in TestApiHardening336; test_jira_webhooks fixture updated with seeded admin user; OpenAPI snapshot regenerated. * fix(test): update test_journey_jira health check to use admin auth after ADV-002 gate * fix(security): close /auth/bootstrap auth-bypass + BREAKING markers on ADV-002/003/005 Reviewer-flagged regression introduced by ADV-009's pagination on UserRepository.list_all(): the silent default LIMIT 1000 broke the bootstrap check at app/auth/router.py and the startup no-password warning at app/main.py — both call list_all() with no args and depend on exhaustive enumeration. On an instance with >1000 users where no password-holder lands in the email-sorted first page, [u for u in list_all() if u.get('password_hash')] becomes empty → bootstrap re-opens → an unauthenticated caller can claim admin via /auth/bootstrap. Real auth-bypass on a security-sensitive boot path. Fix: - src/repositories/users.py: list_all() restored to no-arg, returns EVERY row (no LIMIT). 
Comment explicitly warns against re-adding pagination here. API-surface pagination moved to a new list_paginated(limit, offset) method with its own docstring. - app/api/users.py: GET /api/users now calls list_paginated(). Existing query-param validation (limit <= 10000) preserved. Regression guards in tests/test_security.py::TestApiHardening336: - test_users_list_all_returns_every_row_no_silent_limit asserts list_all() takes no params other than self (via inspect.signature) so a future cleanup can't accidentally re-add limit/offset. - test_users_list_paginated_is_separate_method asserts the paginated variant is a distinct method, not an overload. CHANGELOG: added **BREAKING** markers per CLAUDE.md release discipline to three pre-existing ADV bullets that are observable breaking changes for external consumers: - ADV-002 (webhook health going from anonymous to admin-only) - ADV-003 (/api/version dropping commit_sha + schema_version) - ADV-005 (/docs, /redoc, /openapi.json going from anonymous to session-required) * release: 0.54.25 — API hardening before Swagger (ADV-001..009) + bootstrap-bypass regression fix --------- Co-authored-by: ZdenekSrotyr <zdenek.srotyr@keboola.com>
122 lines
4.3 KiB
Python
122 lines
4.3 KiB
Python
"""Catalog endpoints — table profiles, metrics."""
|
|
|
|
import json
|
|
from typing import List, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel
|
|
import duckdb
|
|
|
|
from app.auth.dependencies import get_current_user, _get_db
|
|
from app.utils import get_data_dir as _get_data_dir
|
|
from src.repositories.profiles import ProfileRepository
|
|
from src.rbac import can_access_table
|
|
|
|
router = APIRouter(prefix="/api/catalog", tags=["catalog"])
|
|
|
|
|
|
class CatalogTableItem(BaseModel):
    # A single table entry inside the catalog listing response.
    # NOTE: documented with comments instead of a docstring so the generated
    # API schema for this model stays exactly as it was.

    # Identity of the table; both values are mandatory.
    id: str
    name: str

    # Descriptive metadata; each of these may be missing and defaults to None.
    description: Optional[str] = None
    source_type: Optional[str] = None
    sync_strategy: Optional[str] = None

    # Falls back to "local" when the producer does not supply a value.
    query_mode: str = "local"
|
|
|
|
|
|
class CatalogTablesResponse(BaseModel):
    # Envelope for the catalog table listing.
    # NOTE: documented with comments instead of a docstring so the generated
    # API schema for this model stays exactly as it was.

    # The table entries included in this response.
    tables: List[CatalogTableItem]

    # Number of entries; the producer of this payload is responsible for
    # keeping it in step with ``tables`` (the model itself does not check).
    count: int
|
|
|
|
|
|
@router.get("/profile/{table_name}")
async def get_table_profile(
    table_name: str,
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Get profiler data for a specific table."""
    # Flow:
    #   1. Reject with 403 when ``can_access_table`` denies the caller.
    #   2. Return the profile stored in ProfileRepository when there is one.
    #   3. Otherwise fall back to <data_dir>/src_data/metadata/profiles.json.
    #   4. Answer 404 when neither source has an entry for ``table_name``.
    # The docstring above is kept verbatim because FastAPI publishes it as the
    # operation description.
    import logging

    # Check table-level access
    if not can_access_table(user, table_name, conn):
        raise HTTPException(status_code=403, detail=f"Access denied to table '{table_name}'")

    profile = ProfileRepository(conn).get(table_name)
    if profile:
        return profile

    # Fallback: try loading from profiles.json on disk
    profiles_path = _get_data_dir() / "src_data" / "metadata" / "profiles.json"
    if profiles_path.exists():
        all_profiles = None
        try:
            # Only the read + parse can fail here, so only they are guarded.
            all_profiles = json.loads(profiles_path.read_text(encoding="utf-8"))
        except Exception:
            # The fallback is best-effort: an unreadable or corrupt file must
            # still end in a 404, but the failure is logged instead of being
            # silently discarded.
            logging.getLogger(__name__).warning(
                "Could not load fallback profiles from %s", profiles_path, exc_info=True
            )

        # The file is either {"tables": {name: profile}} or a bare
        # {name: profile} mapping; any other shape is treated as "no entry".
        if isinstance(all_profiles, dict):
            tables = all_profiles.get("tables", all_profiles)
            if isinstance(tables, dict) and table_name in tables:
                return tables[table_name]

    raise HTTPException(status_code=404, detail=f"Profile not found for '{table_name}'")
|
|
|
|
|
|
@router.get("/tables", response_model=CatalogTablesResponse)
async def list_catalog_tables(
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """List all available tables from table_registry."""
    # Returns {"tables": [...], "count": N} restricted to the registry rows the
    # caller may access. The docstring above is kept verbatim because FastAPI
    # publishes it as the operation description.
    from src.repositories.table_registry import TableRegistryRepository

    repo = TableRegistryRepository(conn)

    # Filter by user's accessible tables. ``can_access_table`` has its own
    # admin shortcut (Admin group → True), so no need to pre-branch here.
    visible = [row for row in repo.list_all() if can_access_table(user, row["id"], conn)]

    def _query_mode(row):
        # ``row.get("query_mode", "local")`` only covers a missing key. A row
        # that carries the key with a None value would fail validation against
        # the response model (``query_mode: str``), so None is coalesced too.
        # NOTE(review): whether registry rows can hold None here is not visible
        # from this file — confirm against the table_registry schema.
        mode = row.get("query_mode")
        return "local" if mode is None else mode

    tables = [
        {
            "id": row["id"],
            "name": row["name"],
            "description": row.get("description"),
            "source_type": row.get("source_type"),
            "sync_strategy": row.get("sync_strategy"),
            "query_mode": _query_mode(row),
        }
        for row in visible
    ]
    return {"tables": tables, "count": len(tables)}
|
|
|
|
|
|
@router.get("/metrics/{metric_path:path}", deprecated=True)
async def get_metric(
    metric_path: str,
    user: dict = Depends(get_current_user),
):
    """Deprecated: use GET /api/metrics/{metric_id} instead."""
    # The docstring above is kept verbatim because FastAPI publishes it as the
    # operation description.
    from fastapi.responses import RedirectResponse

    # ``user`` is not read below; it is declared only so the
    # ``get_current_user`` dependency runs for this route.
    # Every ".yml" occurrence is removed from the legacy path to build the id
    # used by the replacement endpoint.
    stripped_path = metric_path.replace(".yml", "")

    # Permanent redirect (301) to the replacement route.
    return RedirectResponse(status_code=301, url=f"/api/metrics/{stripped_path}")
|
|
|
|
|
|
@router.post("/profile/{table_name}/refresh")
async def refresh_profile(
    table_name: str,
    user: dict = Depends(get_current_user),
    conn: duckdb.DuckDBPyConnection = Depends(_get_db),
):
    """Re-generate profile for a table on demand."""
    # Flow:
    #   1. Reject with 403 when ``can_access_table`` denies the caller.
    #   2. Locate ``data/<table_name>.parquet`` anywhere under
    #      <data_dir>/extracts; 404 when no file matches.
    #   3. Profile the first match, persist it through ProfileRepository and
    #      report the number of profiled columns.
    #   4. Any failure in step 3 becomes a 500 carrying the error text.
    # The docstring above is kept verbatim because FastAPI publishes it as the
    # operation description.

    # Check table-level access
    if not can_access_table(user, table_name, conn):
        raise HTTPException(status_code=403, detail=f"Access denied to table '{table_name}'")

    import glob

    from src.profiler import profile_table, TableInfo

    extracts_dir = _get_data_dir() / "extracts"

    # ``table_name`` comes straight from the URL. Escaping it keeps glob
    # metacharacters (*, ?, [) literal, so a crafted name cannot match — and
    # then profile — the parquet file of a different table.
    pattern = f"data/{glob.escape(table_name)}.parquet"
    candidates = list(extracts_dir.rglob(pattern))
    if not candidates:
        raise HTTPException(status_code=404, detail=f"No parquet for '{table_name}'")

    try:
        table_info = TableInfo(name=table_name, table_id=table_name)
        # When several extracts contain the table, the first match is used.
        profile = profile_table(table_info, candidates[0], [], {}, {})
        ProfileRepository(conn).save(table_name, profile)
        return {"status": "ok", "table": table_name, "columns": len(profile.get("columns", {}))}
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=f"Profile failed: {e}") from e
|