# Open-source AI data analyst platform extracted from an internal repository.
# Includes a data sync engine, Keboola adapter, Flask web portal, server
# deployment scripts, and configuration templates.
# (420 lines, 16 KiB, YAML)
# Configuration format version, quoted so it stays a string rather than a float.
version: '1.0'

# Project identity and the directory the setup steps operate in.
project_name: 'internal_ai_data_analyst'
project_dir: '.'

# Server connection details. Values written as {name} are placeholders;
# NOTE(review): the component that substitutes them is not visible here.
server:
  host: '{server_host}'
  # Same name the setup steps use as their SSH host alias.
  hostname: 'data-analyst'
  webapp_url: '{webapp_url}'
setup:
|
|
steps:
|
|
# Looks for a CLAUDE.md file in the current directory.
- name: 'detect_existing_project'
  description: 'Check if project already exists'
  check: '[ -f ./CLAUDE.md ]'
  # Names the step that inspects the file's contents (defined right after this one).
  on_success: 'verify_project_identity'
  message: |
    📁 Existing CLAUDE.md detected in current directory

    Verifying this is an AI Data Analyst project...
# Confirms that an existing CLAUDE.md belongs to this project type by
# searching it for the literal text "AI Data Analyst".
- name: 'verify_project_identity'
  description: 'Verify this is the correct project type'
  check: "grep -qF 'AI Data Analyst' ./CLAUDE.md"
  # NOTE(review): no step named existing_project_confirmed is visible in this
  # file; confirm the consumer treats it as a terminal state.
  on_success: 'existing_project_confirmed'
  on_failure: |
    ❌ Wrong project type detected

    The CLAUDE.md file exists but doesn't match AI Data Analyst.

    Options:
    - Choose a different directory for setup
    - Remove existing CLAUDE.md if this was a mistake
  message: |
    ✅ AI Data Analyst project confirmed

    This directory is already set up. You can:
    - Sync latest data: bash server/scripts/sync_data.sh
    - View project context: cat CLAUDE.md

    To recreate CLAUDE.md: rm -f ./CLAUDE.md and re-run bootstrap
# Checks whether the current directory has any entries (dotfiles included)
# and shows what the setup will create when it does.
- name: 'check_directory_empty'
  description: 'Warn if directory is not empty'
  # True only when `ls -A` prints nothing.
  check: '[ -z "$(ls -A . 2>/dev/null)" ]'
  on_failure: |
    ⚠️ Current directory is not empty

    Found existing files here. This setup will create the following:
    - .claude/ (project metadata)
    - server/ (read-only data from server: parquet files ~690 MB, docs, scripts)
    - user/ (your workspace: DuckDB database, notifications, artifacts)
    - .venv/ (Python virtual environment)

    Make sure you're in the correct directory before continuing.
    If this is the right place, the setup will proceed without affecting existing files.
  # Presumably downgrades a failed check to a warning; verify against the consumer.
  warn_only: true
  message: |
    📂 Starting setup in current directory...
# Creates a passphrase-less ed25519 key pair at ~/.ssh/data_analyst_server.
- name: 'generate_ssh_key'
  description: 'Generate SSH key for server authentication'
  # NOTE(review): only the .pub file is tested. If the private key exists
  # without it, ssh-keygen would ask before overwriting; confirm acceptable.
  check: '[ -f ~/.ssh/data_analyst_server.pub ]'
  action: |
    ssh-keygen -t ed25519 -N '' -C "{username}@data-analyst" -f ~/.ssh/data_analyst_server
  on_success: 'show_public_key'
  message: |
    🔑 SSH key generated successfully
# Prints the public key and walks the user through registering it in the
# web portal before the setup continues.
- name: 'show_public_key'
  description: 'Display SSH public key to user'
  action: 'cat ~/.ssh/data_analyst_server.pub'
  message: |
    📋 Your SSH public key has been generated!

    Next steps:
    1. Copy the public key shown above
    2. Go to: {webapp_url}
    3. Sign in:
       - Internal users: Click "Sign in with Google"
       - External users: Click "Sign in with Email"
    4. Paste the key into the form and click "Create Account"
    5. Wait a few seconds for account creation
    6. Come back here to continue

    Note: Your username will be derived from your email:
    - john.doe@example.com -> john.doe
    - partner@company.com -> partner_company_com
  # Presumably pauses until the user confirms; verify against the consumer.
  wait_for_user: true
# Appends a "Host data-analyst" entry to ~/.ssh/config unless one is
# already present, then restricts the file to owner read/write.
- name: 'add_ssh_config'
  description: 'Add SSH config entry'
  requires:
    - 'show_public_key'
  check: "grep -q 'Host data-analyst' ~/.ssh/config 2>/dev/null"
  action: |
    mkdir -p ~/.ssh
    # Quoted delimiter: the shell performs no expansion inside the entry
    cat >> ~/.ssh/config << 'EOF'

    Host data-analyst
        HostName {server_host}
        User {username}
        IdentityFile ~/.ssh/data_analyst_server
        StrictHostKeyChecking accept-new
    EOF
    chmod 600 ~/.ssh/config
  message: |
    ⚙️ SSH configuration added for data-analyst server
# Runs a non-interactive `echo` over SSH with a 5-second connect timeout
# to confirm the registered key is accepted.
- name: 'test_ssh_connection'
  description: 'Test SSH connection to server'
  requires:
    - 'add_ssh_config'
  action: "ssh -o BatchMode=yes -o ConnectTimeout=5 data-analyst echo 'ok' 2>/dev/null"
  message: |
    🔌 Testing connection to data server... (this may take a few seconds)
    ✅ Connection successful! You're authenticated and ready to sync data.
  on_failure: |
    ❌ SSH connection failed!

    Please verify:
    1. You completed registration at {webapp_url}
    2. Your account shows as created on the dashboard
    3. You copied the correct username

    Common issues:
    - Account creation can take a few seconds
    - Make sure you pasted the complete SSH public key
    - Check that username matches exactly (case-sensitive)
  retry: true
  max_retries: 3
# Creates the server/, user/ and .venv/ directory trees; `mkdir -p` leaves
# existing directories untouched.
- name: 'create_folders'
  description: 'Create local project structure'
  action: |
    mkdir -p \
      ./server/docs ./server/scripts ./server/examples \
      ./server/parquet ./server/metadata
    mkdir -p \
      ./user/duckdb ./user/notifications ./user/artifacts \
      ./user/scripts ./user/parquet ./user/sessions
    mkdir -p ./.venv
  message: |
    📁 Project structure created (server/, user/, .venv/)
# Tests whether rsync is on PATH; when it is not, prints per-platform
# install instructions and notes on using scp instead.
- name: 'check_rsync'
  description: 'Verify rsync is available (preferred) or prepare scp fallback'
  check: 'command -v rsync >/dev/null 2>&1'
  # Presumably makes a missing rsync a warning rather than a hard stop; verify.
  warn_only: true
  on_failure: |
    ⚠️ rsync is not installed on your system.

    RECOMMENDED: Install rsync for better performance and reliability.

    Installation instructions:

    macOS (Homebrew):
      brew install rsync

    macOS (without Homebrew):
      Install Homebrew first: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
      Then: brew install rsync

    Linux (Debian/Ubuntu):
      sudo apt-get update && sudo apt-get install -y rsync

    Linux (RHEL/CentOS):
      sudo yum install -y rsync

    Windows:
      Option 1 (Recommended): Use WSL (Windows Subsystem for Linux)
        1. Open PowerShell as Administrator
        2. Run: wsl --install
        3. Restart your computer
        4. After restart, run this setup again in WSL terminal

      Option 2: Git for Windows
        1. Install Git for Windows from: https://git-scm.com/download/win
        2. Use Git Bash terminal

    ---
    SCP FALLBACK (if rsync installation is not possible):

    If you cannot install rsync, scp can be used as alternative.
    IMPORTANT: scp with wildcard (*) does NOT copy dotfiles (files starting with .)

    When using scp, you MUST explicitly copy dotfiles separately:

    Example for metadata directory:
      scp -r data-analyst:server/metadata/* ./server/metadata/
      scp data-analyst:server/metadata/.* ./server/metadata/

    Or copy the entire directory (includes dotfiles):
      scp -r data-analyst:server/metadata ./server/
# Copies ~/.sync_settings.yaml from the server into a per-user file under
# /tmp; when the copy fails, writes a default file with every optional
# dataset set to false.
- name: 'download_sync_settings'
  description: "Download user's dataset sync preferences from server"
  action: |
    SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"
    if ! scp -q data-analyst:~/.sync_settings.yaml "$SYNC_CONFIG" 2>/dev/null; then
      # No custom settings yet - create defaults (all optional datasets disabled)
      cat > "$SYNC_CONFIG" << 'DEFAULTS'
    datasets:
      jira: false
      jira_attachments: false
      kbc_telemetry_expert: false
    DEFAULTS
      echo "No custom settings found, using defaults"
    else
      echo "Settings loaded from server"
    fi
  requires:
    - 'test_ssh_connection'
  message: |
    📥 Downloading dataset preferences from portal...
    ✅ Sync settings loaded (manage at {webapp_url})
# Mirrors server/scripts/ from the server into ./server/scripts/ with rsync
# (archive mode, verbose, compressed transfer).
- name: 'download_scripts'
  description: 'Download setup scripts from server'
  action: |
    rsync --archive --verbose --compress data-analyst:server/scripts/ ./server/scripts/
  requires:
    - 'test_ssh_connection'
    - 'create_folders'
    - 'check_rsync'
  message: |
    📥 Downloading helper scripts from server...
    ✅ Scripts downloaded successfully
# Syncs docs, examples and metadata from the server. Documentation for
# datasets marked `false` in the per-user sync settings file is excluded.
- name: 'download_docs'
  description: 'Download documentation from server'
  action: |
    SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"

    # Build exclude list for disabled datasets (generic based on settings).
    # An array keeps each pattern as exactly one rsync argument: a flat string
    # expanded unquoted is re-split and glob-expanded by bash, and is not
    # split at all by zsh.
    DOC_EXCLUDES=()
    if [[ -f "$SYNC_CONFIG" ]]; then
      for name in $(grep -E '^\s+\w+:\s*false' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' '); do
        DOC_EXCLUDES+=("--exclude=datasets/${name}*" "--exclude=${name}_*")
      done
    fi

    rsync -avz "${DOC_EXCLUDES[@]}" data-analyst:server/docs/ ./server/docs/
    rsync -avz data-analyst:server/examples/ ./server/examples/
    rsync -avz data-analyst:server/metadata/ ./server/metadata/
  requires: ["test_ssh_connection", "create_folders", "check_rsync", "download_sync_settings"]
  message: |
    📥 Downloading documentation, examples, and metadata from server...
    ✅ Documentation downloaded successfully
# Syncs parquet data from the server: first the core data with every
# optional dataset directory excluded, then each optional dataset that is
# set to `true` in the per-user sync settings file.
- name: 'download_data'
  description: 'Download data from server'
  action: |
    SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"

    # Exclude ALL optional datasets from core sync (generic based on settings).
    # Arrays keep each value as one argument / one loop item in both bash and
    # zsh; unquoted flat strings are split differently by the two shells.
    PARQUET_EXCLUDES=()
    ENABLED_DATASETS=()
    if [[ -f "$SYNC_CONFIG" ]]; then
      for name in $(grep -E '^\s+\w+:\s*(true|false)' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' '); do
        PARQUET_EXCLUDES+=("--exclude=${name}/")
      done
      for name in $(grep -E '^\s+\w+:\s*true' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' '); do
        ENABLED_DATASETS+=("$name")
      done
    fi

    # Sync core data (excludes all optional datasets)
    rsync -avz --progress "${PARQUET_EXCLUDES[@]}" data-analyst:server/parquet/ ./server/parquet/

    # Sync each enabled optional dataset individually. A failed optional sync
    # stays non-fatal, but is reported instead of being silently discarded.
    for name in "${ENABLED_DATASETS[@]}"; do
      echo ""
      echo "Syncing optional dataset: $name"
      rsync -avz --progress "data-analyst:server/parquet/${name}/" "./server/parquet/${name}/" 2>/dev/null \
        || echo "Warning: could not sync optional dataset: $name" >&2
    done
  requires: ["test_ssh_connection", "create_folders", "check_rsync", "download_sync_settings"]
  message: |
    📥 Downloading data files from server...

    This is the largest download and may take 5-10 minutes depending on your connection and current data volume.
    Only datasets enabled in your portal settings will be downloaded.

    ✅ Data downloaded successfully! All enabled tables are now available locally.
# Creates ./.venv and installs the Python packages. Skipped when a venv
# interpreter already exists in either the POSIX or the Windows layout.
- name: 'setup_venv'
  description: 'Create Python virtual environment and install dependencies'
  check: "test -f ./.venv/bin/python || test -f ./.venv/Scripts/python.exe"
  action: |
    # Use python3 if available, otherwise python (Windows compatibility)
    if command -v python3 >/dev/null 2>&1; then
      PYTHON_CMD=python3
    else
      PYTHON_CMD=python
    fi

    # Create venv
    $PYTHON_CMD -m venv ./.venv

    # Activate and install dependencies
    if [ -f ./.venv/bin/activate ]; then
      source ./.venv/bin/activate
    else
      source ./.venv/Scripts/activate
    fi

    # Run pip as a module: on Windows the pip executable cannot replace
    # itself during an upgrade, while `python -m pip` works on every platform.
    python -m pip install --upgrade pip --quiet
    # Version floors mirror the top-level `dependencies` list of this file.
    python -m pip install \
      "pandas>=2.0.0" \
      "pyarrow>=12.0.0" \
      "duckdb>=0.9.0" \
      "pyyaml>=6.0" \
      "python-dotenv>=1.0.0" \
      --quiet
  requires: ["create_folders"]
  message: |
    🐍 Setting up Python environment...

    Creating virtual environment and installing dependencies:
    - pandas (data manipulation)
    - pyarrow (Parquet file support)
    - duckdb (analytical database)
    - pyyaml & python-dotenv (configuration)

    This may take 1-2 minutes to download and install packages.

    ✅ Python environment ready! All dependencies installed.
# Replicates the local package set on the server: freezes the local venv,
# creates ~/.venv remotely, uploads the frozen list, installs it there, and
# removes both copies of the list.
- name: 'setup_server_venv'
  description: 'Create Python virtual environment on server for notifications'
  action: |
    # Activate whichever venv layout exists (POSIX first, then Windows)
    if [ -f ./.venv/bin/activate ]; then
      source ./.venv/bin/activate
    else
      source ./.venv/Scripts/activate
    fi

    # Snapshot the locally installed packages
    REQ_SNAPSHOT=$(mktemp)
    pip freeze > "$REQ_SNAPSHOT"

    # Build the remote venv and install the same packages into it
    ssh data-analyst "python3 -m venv ~/.venv"
    scp "$REQ_SNAPSHOT" data-analyst:~/.analyst_requirements.txt
    ssh data-analyst "~/.venv/bin/pip install --upgrade pip --quiet \
      && ~/.venv/bin/pip install -r ~/.analyst_requirements.txt --quiet \
      && rm -f ~/.analyst_requirements.txt"
    rm -f "$REQ_SNAPSHOT"
  requires:
    - 'setup_venv'
    - 'test_ssh_connection'
  message: |
    Setting up Python environment on server (for notifications)...
    Server Python environment ready!
# Runs the downloaded setup_views.sh helper script with bash.
- name: 'initialize_duckdb'
  description: 'Initialize DuckDB views on Parquet files'
  action: |
    bash server/scripts/setup_views.sh
  requires:
    - 'download_scripts'
    - 'download_data'
    - 'setup_venv'
  message: |
    🦆 Initializing DuckDB analytical database...

    Creating views for all tables spanning:
    - Company and project data
    - Employee information
    - Sales and financial metrics
    - Product telemetry and usage data

    This may take 30-60 seconds to create all views.

    ✅ DuckDB database initialized! All tables ready for queries.
# Writes the Claude Code context files: CLAUDE.md rendered from the
# downloaded template, a starter CLAUDE.local.md (only when absent), and
# .claude/settings.json (only when the source file was downloaded).
- name: 'setup_claude_project_context'
  description: 'Create Claude Code project context files'
  action: |
    # Generate CLAUDE.md from template with variable substitution.
    # Render into a temporary file first: redirecting straight into CLAUDE.md
    # would create or truncate it even when the template is missing or sed
    # fails, and the detect_existing_project step keys off that file existing.
    TEMPLATE=./server/docs/setup/claude_md_template.txt
    if [[ -f "$TEMPLATE" ]] && sed -e "s/{username}/$USER/g" "$TEMPLATE" > ./CLAUDE.md.tmp; then
      mv ./CLAUDE.md.tmp ./CLAUDE.md
      chmod 644 ./CLAUDE.md
    else
      rm -f ./CLAUDE.md.tmp
      echo "Could not generate CLAUDE.md from $TEMPLATE" >&2
    fi
    # Create CLAUDE.local.md for user's personal customizations (not synced)
    if [[ ! -f "./CLAUDE.local.md" ]]; then
      cat > ./CLAUDE.local.md << 'LOCALEOF'
    # CLAUDE.local.md

    Your personal instructions for Claude Code in this project.
    This file is NOT overwritten by data sync - it is yours to customize.

    ## Your Custom Instructions

    Add your preferences, shortcuts, or project-specific notes below:

    LOCALEOF
      chmod 644 ./CLAUDE.local.md
    fi
    # Create .claude directory and copy settings.json (project permissions)
    mkdir -p ./.claude
    if [[ -f "./server/docs/setup/claude_settings.json" ]]; then
      cp ./server/docs/setup/claude_settings.json ./.claude/settings.json
    fi
  requires: ["download_docs"]
  message: |
    📝 Creating project context for Claude Code...
    ✅ CLAUDE.md created (auto-updated on sync)
    ✅ CLAUDE.local.md created (your personal customizations, never overwritten)
    ✅ .claude/settings.json synced (project permissions)
# Final step: confirms the virtual environment can import the installed
# packages, then prints the completion summary.
- name: 'check_setup'
  description: 'Verify setup completed successfully'
  action: |
    # Use the virtualenv interpreter (POSIX layout first, then Windows) so the
    # check exercises the packages installed by setup_venv, not the system Python.
    if [ -f ./.venv/bin/python ]; then
      VENV_PYTHON=./.venv/bin/python
    else
      VENV_PYTHON=./.venv/Scripts/python.exe
    fi

    # The action's exit status reflects the import check: a missing interpreter
    # or package makes this command fail and the success line is not printed.
    "$VENV_PYTHON" -c "import pandas, pyarrow, duckdb, yaml, dotenv" \
      && echo "Setup verification complete"
  requires: ["initialize_duckdb"]
  message: |
    🎉 Setup complete! Your AI Data Analyst environment is ready.

    ✅ What's been set up:
    - Loads of interesting data (companies, projects, employees, sales, telemetry)
    - DuckDB analytical database with all views configured
    - Python environment with pandas, pyarrow, duckdb
    - Helper scripts for data sync and freshness checks
    - Complete documentation and examples

    📊 You can now:
    - Start asking questions about your data
    - Explore server/docs/data_description.md for table schemas
    - See docs/GETTING_STARTED.md for query examples

    🔄 Maintenance commands:
    - Sync latest data: bash server/scripts/sync_data.sh
# Python packages required by the project, with minimum versions
# (installed in the setup_venv step).
dependencies:
  - 'pandas>=2.0.0'
  - 'pyarrow>=12.0.0'
  - 'duckdb>=0.9.0'
  - 'pyyaml>=6.0'
  - 'python-dotenv>=1.0.0'