diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..23c55c4 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,511 @@ +# Architecture — Detailed Reference + +Comprehensive architectural overview of the OSS AI Data Analyst platform. +For a concise summary, see [../ARCHITECTURE.md](../ARCHITECTURE.md). + +## Top-Level Module Map + +``` +oss-ai-data-analyst/ +├── src/ Core engine (config, sync, parquet, profiling) +├── connectors/ Pluggable data connectors (keboola, jira) +├── auth/ Pluggable auth providers (google, password, desktop) +├── services/ Standalone background services +├── webapp/ Flask web portal (dashboard, catalog, API) +├── server/ Server deployment (setup, deploy, nginx, systemd) +├── scripts/ Analyst-side utility scripts (sync, DuckDB, dev server) +├── config/ Instance configuration (loader, templates) +├── examples/ Example notification scripts +├── tests/ Test suite +├── dev_docs/ Internal development documentation +└── docs/ User-facing documentation +``` + +## Block Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ EXTERNAL DATA SOURCES │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Keboola │ │ Jira │ │ CSV │ │ BigQuery │ │ +│ │ Storage │ │ Cloud │ │ (plan) │ │ (plan) │ │ +│ └────┬─────┘ └────┬─────┘ └──────────┘ └──────────┘ │ +└────────┼──────────────┼────────────────────────────────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ CONNECTORS (connectors/) auto-discovery via importlib │ +│ │ +│ ┌──────────────────────────┐ ┌─────────────────────────────────────┐ │ +│ │ connectors/keboola/ │ │ connectors/jira/ │ │ +│ │ │ │ │ │ +│ │ adapter.py │ │ webhook.py Flask blueprint │ │ +│ │ KeboolaDataSource (ABC) │ │ service.py Jira REST API client │ │ +│ │ full/incr/partitioned │ │ transform.py JSON -> 6 Parquet tbl│ │ +│ │ │ │ incremental_transform.py realtime │ │ +│ │ client.py 
│ │ file_lock.py POSIX advisory locks │ │ +│ │ Keboola Storage API │ │ │ │ +│ │ type mapping + cache │ │ scripts/ backfill, SLA poll, │ │ +│ │ │ │ consistency check │ │ +│ │ tests/ │ │ systemd/ jira-sla-poll, │ │ +│ └──────────────────────────┘ │ jira-consistency │ │ +│ │ tests/ │ │ +│ Registry: src/data_sync.py └─────────────────────────────────────┘ │ +│ create_data_source(type) -> │ +│ importlib("connectors.{type}.adapter") │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ Parquet files +┌─────────────────────────────────────────────────────────────────────────────┐ +│ CORE ENGINE (src/) │ +│ │ +│ ┌─────────────────────┐ ┌──────────────────┐ ┌──────────────────────┐ │ +│ │ data_sync.py │ │ config.py │ │ profiler.py │ │ +│ │ DataSource ABC │ │ data_description │ │ Parquet -> stats │ │ +│ │ SyncState (JSON) │ │ .md parser │ │ alerts, sampling │ │ +│ │ DataSyncManager │ │ TableConfig │ │ -> profiles.json │ │ +│ │ create_data_source()│ │ WhereFilter │ └──────────────────────┘ │ +│ └─────────────────────┘ │ ForeignKey │ │ +│ │ get_config() │ ┌──────────────────────┐ │ +│ └──────────────────┘ │ parquet_manager.py │ │ +│ │ CSV->Parquet, merge │ │ +│ │ upsert, schema │ │ +│ └──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ /data/src_data/parquet/ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AUTH PROVIDERS (auth/) auto-discovery via scan │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌──────────────────────┐ │ +│ │ auth/google/ │ │ auth/password/ │ │ auth/desktop/ │ │ +│ │ │ │ │ │ │ │ +│ │ Google OAuth │ │ Email+password │ │ JWT for desktop app │ │ +│ │ SSO (Authlib) │ │ Argon2 hash │ │ visible=False │ │ +│ │ domain restrict │ │ SendGrid email │ │ (API-only, not login) │ │ +│ │ order=10 │ │ order=20 │ │ order=100 │ │ +│ └────────────────┘ └────────────────┘ └──────────────────────┘ │ +│ │ +│ ABC: AuthProvider (get_name, 
get_blueprint, get_login_button, is_avail.) │ +│ Discovery: discover_providers() -> scans auth/*/provider.py │ +│ Contract: all providers set session["user"] = {email, name, picture} │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ Blueprints registered in Flask app + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ WEB PORTAL (webapp/) │ +│ │ +│ ┌───────────────────┐ ┌──────────────────────────────────────────────┐ │ +│ │ app.py (Flask) │ │ Pages │ │ +│ │ - discover auth │ │ /dashboard - account, stats, setup │ │ +│ │ providers │ │ /catalog - data catalog + profiles │ │ +│ │ - register │ │ /corporate-memory - knowledge + voting │ │ +│ │ blueprints │ │ /activity-center - intelligence overview │ │ +│ │ - inject_config() │ └──────────────────────────────────────────────┘ │ +│ │ - routes │ │ +│ └───────────────────┘ ┌──────────────────────────────────────────────┐ │ +│ │ API Endpoints │ │ +│ ┌───────────────────┐ │ /webhooks/jira (HMAC, -> jira connector)│ │ +│ │ webapp services │ │ /api/telegram/* (link/unlink/status) │ │ +│ │ user_service │ │ /api/desktop/* (JWT, scripts, run) │ │ +│ │ account_service │ │ /api/sync-settings (GET/POST) │ │ +│ │ sync_settings_svc │ │ /api/corporate-memory/* (CRUD, votes) │ │ +│ │ telegram_service │ │ /api/catalog/profile/ │ │ +│ │ email_service │ │ /health (service health) │ │ +│ │ health_service │ └──────────────────────────────────────────────┘ │ +│ │ corporate_memory │ │ +│ └───────────────────┘ Config chain: instance.yaml -> loader -> Config -> │ +│ inject_config() -> {{ config.X }} in Jinja │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ BACKGROUND SERVICES (services/) each = __main__.py + systemd │ +│ │ +│ ┌────────────────────────┐ ┌─────────────────────────────────────────┐ │ +│ │ services/telegram_bot/ │ │ 
services/ws_gateway/ │ │ +│ │ │ │ │ │ +│ │ bot.py polling + │ │ gateway.py WebSocket TCP:8765 │ │ +│ │ HTTP socket │ │ + HTTP dispatch socket │ │ +│ │ runner.py script exec │ │ auth.py JWT validation │ │ +│ │ sender.py msg dispatch │ │ config.py gateway config │ │ +│ │ dispatch.py -> WS gw │ │ │ │ +│ │ storage.py JSON state │ │ Heartbeat: ping/pong, 3 miss = drop │ │ +│ │ status.py /status cmd │ │ Per-user connection limit (5) │ │ +│ │ │ │ │ │ +│ │ Always running (systemd)│ │ Always running (systemd) │ │ +│ └────────────────────────┘ └─────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────┐ ┌─────────────────────────────────────────┐ │ +│ │ services/ │ │ services/ │ │ +│ │ corporate_memory/ │ │ session_collector/ │ │ +│ │ │ │ │ │ +│ │ collector.py │ │ collector.py │ │ +│ │ Scans CLAUDE.local.md │ │ Copies .jsonl from user homes │ │ +│ │ -> Claude Haiku -> JSON│ │ to /data/user_sessions/ │ │ +│ │ MD5 change detection │ │ Idempotent, atomic writes │ │ +│ │ prompts.py │ │ │ │ +│ │ LLM prompts for │ │ Timer: every 6 hours │ │ +│ │ knowledge extraction │ │ │ │ +│ │ │ │ │ │ +│ │ Timer: every 30 min │ │ │ │ +│ └────────────────────────┘ └─────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ Unix sockets + /data/ filesystem + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SERVER INFRASTRUCTURE (server/) │ +│ │ +│ ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────────┐ │ +│ │ Deployment │ │ User Management │ │ Web Server │ │ +│ │ setup.sh │ │ bin/add-analyst │ │ webapp-nginx.conf │ │ +│ │ deploy.sh (CI/CD) │ │ bin/list-analysts │ │ webapp.service │ │ +│ │ webapp-setup.sh │ │ bin/notify-runner │ │ SSL (Let's Encrypt) │ │ +│ │ sudoers rules │ │ bin/notify-scripts │ │ Gunicorn + Unix sock │ │ +│ └──────────────────┘ └────────────────────┘ └───────────────────────┘ │ +│ │ +│ Groups: dataread (analysts) | data-private 
(privileged) | data-ops (admin) │ +│ │ +│ /data/ │ +│ ├── src_data/parquet/ shared data (readonly for analysts) │ +│ ├── src_data/metadata/ sync_state.json, profiles.json │ +│ ├── src_data/raw/jira/ webhook JSON, attachments │ +│ ├── docs/ , scripts/ documentation, helper scripts │ +│ ├── notifications/ telegram_users, desktop_users, codes │ +│ ├── corporate-memory/ knowledge.json, votes.json │ +│ └── user_sessions/ centralized Claude Code transcripts │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ rsync (SSH) - scripts/sync_data.sh (bi-directional) + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ANALYST WORKSTATION (local) │ +│ │ +│ server/ (read-only, rsynced from broker) │ +│ ├── parquet/, docs/, scripts/, metadata/ │ +│ │ +│ user/ (writable workspace, backed up to server) │ +│ ├── duckdb/analytics.duckdb SQL views over parquet │ +│ ├── notifications/*.py custom notification scripts │ +│ ├── sessions/ Claude Code transcripts │ +│ └── artifacts/ analysis outputs │ +│ │ +│ .claude/rules/ corporate memory knowledge rules │ +│ │ +│ Claude Code <- local analysis over DuckDB + Parquet │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Auto-Discovery Patterns + +The platform uses three symmetrical auto-discovery mechanisms. Adding a new +connector, auth method, or service requires no changes to existing code. + +### 1. Connector Discovery (`src/data_sync.py`) + +``` +config/instance.yaml -> data_source.type: "keboola" + -> importlib.import_module("connectors.keboola.adapter") + -> KeboolaDataSource (implements DataSource ABC) +``` + +- Factory: `create_data_source(type)` in `src/data_sync.py` +- Connectors live in `connectors/{name}/adapter.py` +- Must export a `DataSource` subclass or a `create_data_source()` factory function +- Keboola is hard-coded for ImportError handling; all others use dynamic import + +### 2. 
Auth Provider Discovery (`auth/__init__.py`) + +``` +startup -> scan auth/*/provider.py + -> import `provider` instance + -> filter by is_available() (checks env vars) + -> register blueprint + login button in Flask +``` + +- ABC: `AuthProvider` with methods `get_name()`, `get_blueprint()`, `get_login_button()`, `is_available()`, `init_app()` +- Session contract: all providers set `session["user"] = {email, name, picture}` +- Login page renders buttons dynamically, sorted by `order` field + +### 3. Service Pattern (`services/*/__main__.py`) + +``` +python -m services.{name} # entry point +services/{name}/systemd/ # unit files +deploy.sh auto-discovers # systemd/* in each service dir +``` + +- Each service is self-contained: code, systemd units, and config in one directory +- `deploy.sh` scans `services/*/systemd/*.service` and `connectors/*/systemd/*.service` +- Long-running services (telegram_bot, ws_gateway) use async dual-server model +- Periodic services (corporate_memory, session_collector) are systemd timer oneshots + +## Data Flows + +### Pull Sync (Keboola) + +``` +Keboola Storage API + -> connectors/keboola/client.py (export CSV with filters) + -> src/parquet_manager.py (convert to typed Parquet) + -> /data/src_data/parquet/ (stored on broker) + -> rsync to analyst (scripts/sync_data.sh) + -> DuckDB views (scripts/setup_views.sh) +``` + +Sync strategies: `full_refresh`, `incremental`, `partitioned`, `chunked_initial_load`. 
+ +### Push Sync (Jira) + +``` +Jira Cloud webhook (issue created/updated/deleted) + -> connectors/jira/webhook.py (HMAC-SHA256 verification) + -> connectors/jira/service.py (fetch full issue + attachments) + -> /data/src_data/raw/jira/issues/ (atomic JSON write) + -> connectors/jira/incremental_transform.py (update monthly Parquet) + -> /data/src_data/parquet/jira/ (6 tables: issues, comments, + attachments, changelog, + issuelinks, remote_links) +``` + +Background jobs supplement the webhook pipeline: +- `jira-sla-poll` (every 5 min): refreshes SLA fields for open tickets +- `jira-consistency` (every 6h): detects and backfills missing issues + +### Notification Pipeline + +``` +~/user/notifications/*.py analyst's custom scripts + -> server/bin/notify-runner (cron, executes with timeout) + -> cooldown check (~/.notifications/state/) + ├-> services/telegram_bot/ (Unix socket /run/notify-bot/bot.sock) + │ -> Telegram chat message (text or photo) + └-> services/ws_gateway/ (Unix socket /run/ws-gateway/ws.sock) + -> WebSocket push to desktop app +``` + +Script output format: +```json +{ + "notify": true, + "title": "Revenue dropped 25%", + "message": "Details...", + "cooldown": "6h", + "image_path": "/tmp/chart.png" +} +``` + +### Knowledge Loop (Corporate Memory) + +``` +Analyst writes CLAUDE.local.md (insights, patterns, tips) + -> scripts/sync_data.sh (uploads to server) + -> services/corporate_memory/ (timer, every 30 min) + -> MD5 change detection + -> Claude Haiku extracts knowledge items + -> /data/corporate-memory/knowledge.json + -> webapp /corporate-memory (voting UI: upvote/downvote) + -> scripts/sync_data.sh (downloads to analyst) + -> .claude/rules/ (rules for Claude Code) + -> Claude Code uses rules in next session +``` + +## Module Reference + +### Core Engine (`src/`) + +| File | Lines | Responsibility | +|------|-------|----------------| +| `data_sync.py` | ~1400 | `DataSource` ABC, `SyncState`, `DataSyncManager`, connector factory | +| `config.py` | 
~600 | Parse `data_description.md` YAML blocks, `TableConfig`, `WhereFilter`, `ForeignKey` | +| `parquet_manager.py` | ~750 | CSV-to-Parquet conversion, merge, upsert, schema enforcement | +| `profiler.py` | ~1200 | Data profiling: stats, alerts, type classification -> `profiles.json` | + +### Connectors (`connectors/`) + +| Module | Files | Sync Model | Description | +|--------|-------|------------|-------------| +| `keboola/` | adapter.py, client.py, tests/ | Pull (DataSource ABC) | Keboola Storage API, type mapping, metadata caching (24h TTL) | +| `jira/` | webhook.py, service.py, transform.py, incremental_transform.py, file_lock.py, scripts/, systemd/, tests/ | Push (webhook) | Real-time webhook pipeline, SLA polling, consistency monitoring, 6 output Parquet tables | + +### Auth Providers (`auth/`) + +| Provider | Available when | Login UI | Order | Description | +|----------|---------------|----------|-------|-------------| +| `google/` | `GOOGLE_CLIENT_ID` set | Yes | 10 | Google OAuth SSO with domain restriction | +| `password/` | `SENDGRID_API_KEY` set | Yes | 20 | Email + password for external users (Argon2, rate limiting) | +| `desktop/` | `DESKTOP_JWT_SECRET` set | No (API-only) | 100 | JWT tokens for native desktop app | + +### Background Services (`services/`) + +| Service | Type | Schedule | Description | +|---------|------|----------|-------------| +| `telegram_bot/` | Long-running | Always on | Telegram polling + HTTP dispatch socket, script execution, /status /test commands | +| `ws_gateway/` | Long-running | Always on | WebSocket TCP:8765 + HTTP dispatch socket, JWT auth, heartbeat | +| `corporate_memory/` | Timer oneshot | Every 30 min | AI knowledge extraction from CLAUDE.local.md via Claude Haiku | +| `session_collector/` | Timer oneshot | Every 6 hours | Copy session .jsonl from user homes to central storage | + +### Web Portal (`webapp/`) + +| File | Responsibility | +|------|----------------| +| `app.py` | Flask factory, blueprint 
registration, route definitions, context processors | +| `config.py` | Load `instance.yaml`, expose `Config` to templates | +| `auth.py` | Core auth infrastructure: `login_required`, `validate_email_domain`, `/login`, `/logout` | +| `user_service.py` | Username derivation, SSH key validation, system account creation | +| `account_service.py` | Dashboard account widget data, cron info, sync status | +| `sync_settings_service.py` | Per-user dataset sync preferences | +| `telegram_service.py` | Telegram account linking/unlinking | +| `desktop_auth.py` | JWT generation/validation, desktop app link state | +| `password_auth.py` | Password auth implementation (Argon2, rate limiting, token workflow) | +| `email_service.py` | SendGrid integration for setup/reset emails | +| `corporate_memory_service.py` | Knowledge CRUD, voting, user rules regeneration | +| `health_service.py` | System health checks (services, timers, disk, load, webhooks) | +| `notification_images.py` | Serve chart PNGs generated by notification runner | +| `utils/metric_parser.py` | Parse business metric YAML definitions for catalog UI | + +### Server Infrastructure (`server/`) + +| File | Responsibility | +|------|----------------| +| `setup.sh` | Initial server bootstrap (groups, users, directories, venv) | +| `deploy.sh` | CI/CD deployment (git pull, deps, scripts, services, ACLs) | +| `webapp-setup.sh` | Nginx + SSL + Gunicorn setup | +| `webapp-nginx.conf` | Nginx reverse proxy config (HTTPS, WebSocket upgrade) | +| `webapp.service` | Systemd unit for Gunicorn | +| `sudoers-deploy` | Sudo rules for deploy user (least-privilege) | +| `sudoers-webapp` | Sudo rules for www-data | +| `bin/add-analyst` | Create analyst user with workspace structure | +| `bin/list-analysts` | List registered analysts | +| `bin/notify-runner` | Execute user notification scripts, dispatch to bot + gateway | +| `bin/notify-scripts` | List/run notification scripts for a user | + +### Analyst Scripts (`scripts/`) + +| File | 
Responsibility | +|------|----------------| +| `sync_data.sh` | Bi-directional rsync: download data, upload workspace, refresh DuckDB | +| `setup_views.sh` | Create/replace DuckDB views over all Parquet files | +| `duckdb_manager.py` | DuckDB setup utility | +| `dev_run.py` | Development server with auth bypass | +| `collect_session.py` | Session transcript collector (used by service) | +| `generate_user_sync_configs.py` | Generate per-user sync config files | + +## Analyst Workspace Layout + +Created by `server/bin/add-analyst` for each registered user: + +``` +/home/{username}/ +├── server/ read-only symlinks to shared data +│ ├── parquet/ -> /data/src_data/parquet +│ ├── docs/ -> /data/docs +│ ├── scripts/ -> /data/scripts +│ ├── metadata/ -> /data/src_data/metadata +│ └── jira_attachments/ -> /data/src_data/raw/jira/attachments +├── user/ writable workspace (backed up to server) +│ ├── duckdb/ local DuckDB database +│ ├── notifications/ custom notification scripts (*.py) +│ ├── artifacts/ analysis outputs +│ ├── scripts/ user helper scripts +│ ├── parquet/ user Parquet files +│ └── sessions/ Claude Code session transcripts +├── .notifications/ notification runner state +│ ├── state/ cooldown tracking (JSON per script) +│ └── logs/ runner logs +└── .claude/ + └── rules/ corporate memory knowledge rules (auto-synced) +``` + +## Security Model + +### System Groups + +| Group | Access | +|-------|--------| +| `data-ops` | Full admin access to all server resources | +| `dataread` | Read access to public Parquet data | +| `data-private` | Read access to sensitive/restricted data | + +### Authentication Layers + +| Layer | Mechanism | Scope | +|-------|-----------|-------| +| Web portal | Google OAuth / email+password | Browser sessions | +| Desktop app | JWT Bearer tokens | API endpoints (`/api/desktop/*`) | +| Jira webhook | HMAC-SHA256 signature | Webhook endpoint | +| SSH access | Key-based auth only | Data sync (rsync) | +| Inter-service | Unix socket permissions 
| Bot, gateway, webapp | + +### Permission Boundaries + +- Analysts cannot access other users' home directories +- Webapp (www-data) uses sudoers-whitelisted commands for user operations +- Deploy user has explicit sudo rules for service management +- Staging directory (`/tmp/data_analyst_staging`) uses setgid for group ownership +- All JSON state files written atomically: `tempfile.mkstemp()` + `os.fchmod()` + `os.replace()` + +## Configuration Chain + +``` +config/instance.yaml (instance-specific, not committed) + | loaded by config/loader.py + | ${ENV_VAR} references resolved from .env / environment + v +webapp/config.py (Flask Config class) + | _load_instance_config() at module level + | _get(config, *keys) for safe nested access + v +inject_config() context processor (exposes Config to templates) + v +{{ config.INSTANCE_NAME }} in Jinja2 (all templates have access) +``` + +Validation: `config/loader.py` checks required fields at startup (`instance.name`, +`auth.allowed_domain`, `server.host`, `server.hostname`, `auth.webapp_secret_key`). +Missing required fields cause immediate startup failure with a clear error message. 
+ +## Server Filesystem Layout + +``` +/opt/data-analyst/ +├── repo/ git repository (deployed via CI/CD) +├── .venv/ Python virtual environment +├── logs/ application logs +└── .env secrets (mode 0640) + +/data/ +├── src_data/ +│ ├── parquet/ shared Parquet files (readonly for analysts) +│ ├── metadata/ sync_state.json, profiles.json, table_metadata.json +│ └── raw/jira/ webhook JSON files, attachments +├── docs/ documentation and schema +├── scripts/ helper scripts synced to analysts +├── notifications/ telegram_users.json, desktop_users.json, pending_codes.json +├── corporate-memory/ knowledge.json, votes.json, user_hashes.json +└── user_sessions/ centralized Claude Code session transcripts + +/run/ +├── notify-bot/bot.sock Telegram bot HTTP socket +├── ws-gateway/ws.sock WebSocket gateway HTTP socket +└── webapp/webapp.sock Gunicorn WSGI socket +``` + +## CI/CD + +### Deploy Guard (`.github/workflows/deploy-guard.yml`) + +Runs on every pull request: +1. `pytest tests/test_deploy_guard.py` - validates deploy.sh/sudoers/systemd consistency +2. `pytest tests/test_sync_data.py -m "not live"` - validates sync script reliability +3. `visudo -cf server/sudoers-*` - validates sudoers syntax in Docker + +### Deployment (`.github/workflows/deploy.yml.example`) + +Runs on push to main (or manual trigger): +1. SSH into server +2. Execute `server/deploy.sh` (git pull, deps, scripts, services, ACLs) diff --git a/docs/testing/vm_test_plan.md b/docs/testing/vm_test_plan.md new file mode 100644 index 0000000..7abe38f --- /dev/null +++ b/docs/testing/vm_test_plan.md @@ -0,0 +1,428 @@ +# VM Test Plan - Self-Service Data Onboarding + +End-to-end test of the full platform on a clean VM with a new GitHub repository. 
+## Prerequisites + +- Clean Ubuntu 22.04+ VM (or Debian 12) with root access +- GitHub account with ability to create repositories +- Domain name pointing to the VM (or use IP + skip SSL) +- Keboola project with Storage API token (for discovery/sync testing) +- Google OAuth credentials (for login testing) + +--- + +## Step 0: Create GitHub Repository & Push + +**On your local machine:** + +```bash +# Change to your local checkout of this repository +cd /path/to/oss-ai-data-analyst + +# Create repo on GitHub (pick org/name) +gh repo create YOUR_ORG/ai-data-analyst --private --source=. --push + +# Verify +gh repo view YOUR_ORG/ai-data-analyst +``` + +**Expected:** Repo created, code pushed, visible on GitHub. + +--- + +## Step 1: VM Initial Setup + +**On the VM as root:** + +```bash +# Clone the repo +REPO_URL="git@github.com:YOUR_ORG/ai-data-analyst.git" +APP_DIR="/opt/data-analyst" +mkdir -p $APP_DIR +ssh-keygen -t ed25519 -f /root/.ssh/deploy_key -N "" +# Add deploy key to GitHub repo (Settings -> Deploy keys) + +sudo -u deploy git clone $REPO_URL $APP_DIR/repo + +# Run setup +cd $APP_DIR/repo +REPO_URL=$REPO_URL bash server/setup.sh +``` + +### Checklist + +| # | Check | Command | +|---|-------|---------| +| 1.1 | Groups created | `getent group data-ops dataread data-private` | +| 1.2 | Deploy user exists | `id deploy` | +| 1.3 | Directory structure | `ls -la /opt/data-analyst/` | +| 1.4 | Python venv works | `/opt/data-analyst/.venv/bin/python -c "import flask; print('OK')"` | +| 1.5 | Management scripts | `which add-analyst list-analysts` | + +--- + +## Step 2: Webapp Setup + +```bash +export SERVER_HOSTNAME="data.yourdomain.com" # or skip SSL with IP +bash server/webapp-setup.sh +``` + +Then edit `/opt/data-analyst/.env`: + +```bash +# Required +WEBAPP_SECRET_KEY="$(python3 -c 'import secrets; print(secrets.token_hex(32))')" +GOOGLE_CLIENT_ID="your-google-client-id" +GOOGLE_CLIENT_SECRET="your-google-client-secret" +SERVER_HOST="YOUR_VM_IP" +SERVER_HOSTNAME="data.yourdomain.com" + +# For Keboola 
discovery/sync +KEBOOLA_STORAGE_TOKEN="your-token" +KEBOOLA_STACK_URL="https://connection.keboola.com" +KEBOOLA_PROJECT_ID="your-project-id" +DATA_SOURCE="keboola" +DATA_DIR="/data/src_data" +``` + +### Checklist + +| # | Check | Command | +|---|-------|---------| +| 2.1 | Nginx running | `systemctl status nginx` | +| 2.2 | Webapp running | `systemctl status webapp` | +| 2.3 | SSL cert (if domain) | `curl -I https://data.yourdomain.com/health` | +| 2.4 | Health endpoint | `curl http://localhost:5000/health` (or via nginx) | +| 2.5 | Login page loads | Browser: `https://data.yourdomain.com/login` | + +--- + +## Step 3: Instance Configuration + +```bash +cd /opt/data-analyst/repo +cp config/instance.yaml.example config/instance.yaml +``` + +Edit `config/instance.yaml` with: +- `instance.name` / `instance.subtitle` +- `server.hostname` / `server.host` +- `auth.allowed_domain` (your Google domain) +- `data_source.type: "keboola"` + keboola settings +- `catalog.categories` (at least one, e.g., `crm: {label: "CRM", icon: "crm"}`) + +### Checklist + +| # | Check | Command | +|---|-------|---------| +| 3.1 | Config loads | `cd /opt/data-analyst/repo && .venv/bin/python -c "from config.loader import load_instance_config; print(load_instance_config())"` | +| 3.2 | Webapp picks it up | Restart webapp, check login page shows instance name | + +--- + +## Step 4: Create Admin Account & Login + +1. Login via Google OAuth in browser +2. Register account with SSH key +3. 
Verify the user is admin: + +```bash +id YOUR_USERNAME # should be in data-ops or sudo group +# If not admin, manually add: +usermod -aG data-ops YOUR_USERNAME +``` + +### Checklist + +| # | Check | Command | +|---|-------|---------| +| 4.1 | Google OAuth works | Login via browser | +| 4.2 | Account created | `list-analysts` shows your username | +| 4.3 | Dashboard loads | Browser: /dashboard shows data stats | +| 4.4 | Admin access | Browser: /admin/tables loads (no 403) | + +--- + +## Step 5: Test Discovery API (Phase 1) + +In browser, go to `/admin/tables` and click "Discover tables from source". + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 5.1 | Discovery button works | Loading spinner, then tables appear | +| 5.2 | Tables grouped by bucket | Buckets shown as collapsible sections | +| 5.3 | Table details shown | Name, columns, row count, size for each table | +| 5.4 | "Available" badge | All tables show "Available" (none registered yet) | +| 5.5 | API direct test | `curl -b cookies.txt https://HOST/api/admin/discover-tables \| jq .total` | + +--- + +## Step 6: Test Table Registry (Phase 2) + +### 6a: Register tables via Admin UI + +1. Click "Register" on a table in discovery results +2. Fill in: sync_strategy=full_refresh, confirm primary key +3. Click "Register Table" +4. 
Repeat for 2-3 more tables (try incremental too) + +### 6b: Verify registry + +```bash +# On server +cat /data/src_data/metadata/table_registry.json | python3 -m json.tool | head -30 + +# Check generated data_description.md +head -10 /opt/data-analyst/repo/docs/data_description.md +# Should show the AUTO-GENERATED header comment with a checksum + +# Check audit log +cat /data/src_data/metadata/registry_audit.log +``` + +### 6c: Test via API + +```bash +# List registry +curl -b cookies.txt https://HOST/api/admin/registry | jq '.tables | length' + +# Update a table +curl -b cookies.txt -X PUT https://HOST/api/admin/registry/in.c-crm.company \ + -H "Content-Type: application/json" \ + -d '{"description": "Updated via API", "version": CURRENT_VERSION}' + +# Delete a table +curl -b cookies.txt -X DELETE https://HOST/api/admin/registry/in.c-crm.company \ + -H "Content-Type: application/json" \ + -d '{"version": CURRENT_VERSION}' +``` + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 6.1 | Register table | Success, table appears in registry panel | +| 6.2 | Badge changes | Registered tables show green "Registered" badge | +| 6.3 | data_description.md | Generated with AUTO-GENERATED header + checksum | +| 6.4 | Audit log written | Actions logged with timestamps and emails | +| 6.5 | Optimistic locking | Stale version POST returns 409 | +| 6.6 | Edit table | PUT changes description/strategy | +| 6.7 | Delete table | Table removed, badge reverts to "Available" | + +--- + +## Step 7: Test Data Sync + Auto-Profiling (Phase 3) + +```bash +cd /opt/data-analyst/repo +source .venv/bin/activate + +# Run sync for registered tables +python -m src.data_sync +``` + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 7.1 | Sync completes | Tables downloaded, Parquet created | +| 7.2 | Schema.yml generated | `cat docs/schema.yml \| head` | +| 7.3 | Auto-profiling ran | Log shows "Auto-profiling: N profiled" | +| 7.4 | profiles.json exists | `ls -la /data/src_data/metadata/profiles.json` | +| 
7.5 | Catalog shows profiles | Browser: /catalog -> click table -> profile data loads | + +--- + +## Step 8: Test Per-Table Subscriptions (Phase 4) + +### 8a: Via API + +```bash +# Get current subscriptions +curl -b cookies.txt https://HOST/api/table-subscriptions | jq . + +# Switch to explicit mode, subscribe to specific tables +curl -b cookies.txt -X POST https://HOST/api/table-subscriptions \ + -H "Content-Type: application/json" \ + -d '{ + "table_mode": "explicit", + "tables": {"company": true, "contact": true, "events": false} + }' +``` + +### 8b: Via Catalog UI + +1. Go to /catalog +2. Tables should show subscription status (all subscribed in "all" mode) +3. After switching to "explicit" mode via API, unsubscribed tables should be visually different + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 8.1 | Default is "all" mode | GET returns `table_mode: "all"` | +| 8.2 | Switch to explicit | POST succeeds, settings saved | +| 8.3 | Config YAML updated | `cat /home/USERNAME/.sync_settings.yaml` shows `table_mode: explicit` | +| 8.4 | Catalog reflects subs | Subscribed vs unsubscribed tables visually distinct | + +--- + +## Step 9: Test Smart Sync (Phase 5) + +### 9a: Check rsync filter generation + +```bash +# After setting explicit subscriptions: +cat /home/USERNAME/.sync_rsync_filter +# Should show include/exclude rules +``` + +### 9b: Test from analyst machine + +```bash +# On analyst machine (or simulate): +bash server/scripts/sync_data.sh --dry-run +# Should show filter-based sync when explicit mode is active +``` + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 9.1 | Filter file exists | `.sync_rsync_filter` created in user home | +| 9.2 | Correct include/exclude | Subscribed tables included, others excluded | +| 9.3 | Dry-run uses filter | `--filter="merge ..."` in rsync output | +| 9.4 | Fallback works | Without filter file, syncs everything (backwards compat) | + +--- + +## Step 10: Migration Test 
(One-Time Bootstrap) + +If you already have a `docs/data_description.md` with tables defined: + +```bash +python3 -c " +from src.table_registry import TableRegistry +from pathlib import Path + +registry = TableRegistry.import_from_data_description( + Path('docs/data_description.md'), + Path('/data/src_data/metadata/table_registry.json'), + registered_by='migration@test.com' +) +print(f'Migrated {len(registry.list_tables())} tables') +print(f'Version: {registry.version}') +" +``` + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 10.1 | Migration succeeds | All tables imported | +| 10.2 | Registry JSON valid | `cat table_registry.json \| python3 -m json.tool` | +| 10.3 | migrated_from marker | `"migrated_from": "docs/data_description.md"` in metadata | +| 10.4 | Admin UI shows tables | /admin/tables lists all migrated tables | + +--- + +## Step 11: Regression Tests + +```bash +cd /opt/data-analyst/repo +source .venv/bin/activate +python -m pytest tests/ -v +``` + +### Checklist + +| # | Check | Expected | +|---|-------|----------| +| 11.1 | All tests pass | 132+ tests, 0 failures | +| 11.2 | No import errors | All modules load cleanly | + +--- + +## Quick Smoke Test Script + +Run this after full setup to verify the critical path: + +```bash +#!/bin/bash +# smoke_test.sh - Quick verification of self-service onboarding +set -e + +APP_DIR="/opt/data-analyst/repo" +cd "$APP_DIR" +source .venv/bin/activate + +echo "=== Smoke Test ===" + +# 1. Tests +echo "[1/5] Running tests..." +python -m pytest tests/ -q --tb=short +echo " PASS" + +# 2. Registry module +echo "[2/5] Testing Table Registry..." 
+python -c " +from src.table_registry import TableRegistry +from pathlib import Path +import tempfile +r = TableRegistry(Path(tempfile.mktemp(suffix='.json'))) +r.register_table({'id': 'test.t', 'name': 't', 'primary_key': 'id', 'sync_strategy': 'full_refresh'}, 'test') +assert r.is_registered('test.t') +r.unregister_table('test.t') +assert not r.is_registered('test.t') +print(' PASS') +" + +# 3. Discovery (needs Keboola credentials) +echo "[3/5] Testing Discovery API..." +python -c " +try: + from src.data_sync import create_data_source + ds = create_data_source() + tables = ds.discover_tables() + print(f' PASS - Discovered {len(tables)} tables') +except Exception as e: + print(f' SKIP - {e}') +" + +# 4. Profiler API +echo "[4/5] Testing Profiler API..." +python -c " +from src.profiler import profile_changed_tables +result = profile_changed_tables([]) +assert result == {'success': 0, 'errors': 0, 'skipped': 0} +print(' PASS') +" + +# 5. Webapp imports +echo "[5/5] Testing Webapp imports..." +python -c " +from webapp.auth import admin_required, login_required +from webapp.sync_settings_service import get_table_subscriptions, generate_rsync_filter +from src.table_registry import TableRegistry, ConflictError +print(' PASS') +" + +echo "" +echo "=== All smoke tests passed ===" +``` + +--- + +## Troubleshooting + +| Problem | Fix | +|---------|-----| +| `/admin/tables` returns 403 | User not in `data-ops` group. Run `usermod -aG data-ops USERNAME` | +| Discovery returns empty | Check `KEBOOLA_STORAGE_TOKEN` in `.env`, verify `DATA_SOURCE=keboola` | +| Profiles not generated | Check `/data/src_data/parquet/` has parquet files, check DuckDB installed | +| Rsync filter not created | Check `sudo` permissions for `www-data` in sudoers-webapp | +| `data_description.md` not updating | Check write permissions on `docs/` directory | +| Webapp won't start | Check `journalctl -u webapp -n 50` for errors |