# Bootstrap definition for the internal AI Data Analyst workspace.
# Tokens written as {name} look like placeholders that the consuming tool
# fills in -- NOTE(review): confirm where the substitution happens.
version: '1.0'
project_name: 'internal_ai_data_analyst'
project_dir: '.'

# Connection details for the data server and its registration web app.
server:
  host: '{server_host}'
  hostname: 'data-analyst'
  webapp_url: '{webapp_url}'

setup:
  steps:
    # Looks for a CLAUDE.md in the working directory; when present, control
    # is handed to verify_project_identity.
    - name: 'detect_existing_project'
      description: 'Check if project already exists'
      check: 'test -f ./CLAUDE.md'
      on_success: 'verify_project_identity'
      message: |
        📁 Existing CLAUDE.md detected in current directory
        Verifying this is an AI Data Analyst project...

    # Confirms the existing CLAUDE.md mentions "AI Data Analyst".
    # NOTE(review): on_success names "existing_project_confirmed", which is not
    # a step defined in this file -- presumably a state known to the runner.
    - name: 'verify_project_identity'
      description: 'Verify this is the correct project type'
      check: "grep -q 'AI Data Analyst' ./CLAUDE.md"
      on_success: 'existing_project_confirmed'
      on_failure: |
        ❌ Wrong project type detected

        The CLAUDE.md file exists but doesn't match AI Data Analyst.

        Options:
        - Choose a different directory for setup
        - Remove existing CLAUDE.md if this was a mistake
      message: |
        ✅ AI Data Analyst project confirmed

        This directory is already set up. You can:
        - Sync latest data: bash server/scripts/sync_data.sh
        - View project context: cat CLAUDE.md

        To recreate CLAUDE.md: rm -f ./CLAUDE.md and re-run bootstrap

    # Non-blocking guard (warn_only): the check passes only when `ls -A`
    # lists nothing in the current directory.
    - name: 'check_directory_empty'
      description: 'Warn if directory is not empty'
      check: '[ $(ls -A . 2>/dev/null | wc -l) -eq 0 ]'
      on_failure: |
        ⚠️ Current directory is not empty

        Found existing files here. This setup will create the following:
        - .claude/ (project metadata)
        - server/ (read-only data from server: parquet files ~690 MB, docs, scripts)
        - user/ (your workspace: DuckDB database, notifications, artifacts)
        - .venv/ (Python virtual environment)

        Make sure you're in the correct directory before continuing.
        If this is the right place, the setup will proceed without affecting existing files.
      warn_only: true
      message: |
        📂 Starting setup in current directory...
# ---- setup.steps (continued): SSH access, downloads, local Python environment ----

    # Creates a dedicated ed25519 key pair with an empty passphrase; skipped
    # when the public key file already exists.
    - name: 'generate_ssh_key'
      description: 'Generate SSH key for server authentication'
      check: 'test -f ~/.ssh/data_analyst_server.pub'
      action: |
        ssh-keygen -t ed25519 -f ~/.ssh/data_analyst_server -C "{username}@data-analyst" -N ''
      on_success: 'show_public_key'
      message: |
        🔑 SSH key generated successfully

    # Prints the public key and pauses (wait_for_user) so the user can
    # register it in the web app.
    - name: 'show_public_key'
      description: 'Display SSH public key to user'
      action: 'cat ~/.ssh/data_analyst_server.pub'
      message: |
        📋 Your SSH public key has been generated!

        Next steps:
        1. Copy the public key shown above
        2. Go to: {webapp_url}
        3. Sign in:
           - Internal users: Click "Sign in with Google"
           - External users: Click "Sign in with Email"
        4. Paste the key into the form and click "Create Account"
        5. Wait a few seconds for account creation
        6. Come back here to continue

        Note: Your username will be derived from your email:
        - john.doe@example.com -> john.doe
        - partner@company.com -> partner_company_com
      wait_for_user: true

    # Appends a `Host data-analyst` entry to ~/.ssh/config unless one is
    # already present.
    - name: 'add_ssh_config'
      description: 'Add SSH config entry'
      requires:
        - 'show_public_key'
      check: "grep -q 'Host data-analyst' ~/.ssh/config 2>/dev/null"
      # The heredoc starts with an empty line so the new entry cannot be glued
      # onto a last line of an existing config that lacks a trailing newline.
      action: |
        mkdir -p ~/.ssh
        cat >> ~/.ssh/config << 'EOF'

        Host data-analyst
            HostName {server_host}
            User {username}
            IdentityFile ~/.ssh/data_analyst_server
            StrictHostKeyChecking accept-new
        EOF
        chmod 600 ~/.ssh/config
      message: |
        ⚙️ SSH configuration added for data-analyst server

    # Non-interactive (BatchMode) login test with a 5 second connect timeout;
    # retried up to max_retries times.
    - name: 'test_ssh_connection'
      description: 'Test SSH connection to server'
      requires:
        - 'add_ssh_config'
      action: "ssh -o ConnectTimeout=5 -o BatchMode=yes data-analyst echo 'ok' 2>/dev/null"
      message: |
        🔌 Testing connection to data server... (this may take a few seconds)

        ✅ Connection successful! You're authenticated and ready to sync data.
      on_failure: |
        ❌ SSH connection failed!

        Please verify:
        1. You completed registration at {webapp_url}
        2. Your account shows as created on the dashboard
        3. You copied the correct username

        Common issues:
        - Account creation can take a few seconds
        - Make sure you pasted the complete SSH public key
        - Check that username matches exactly (case-sensitive)
      retry: true
      max_retries: 3

    # Creates the local server/, user/ and .venv/ directory trees.
    - name: 'create_folders'
      description: 'Create local project structure'
      action: |
        mkdir -p ./server/docs ./server/scripts ./server/examples ./server/parquet ./server/metadata
        mkdir -p ./user/duckdb ./user/notifications ./user/artifacts ./user/scripts ./user/parquet ./user/sessions
        mkdir -p ./.venv
      message: |
        📁 Project structure created (server/, user/, .venv/)

    # Non-blocking (warn_only) check for rsync.
    # NOTE(review): the download_* steps below call rsync unconditionally, so
    # the scp fallback described here is manual guidance only.
    - name: 'check_rsync'
      description: 'Verify rsync is available (preferred) or prepare scp fallback'
      check: 'command -v rsync >/dev/null 2>&1'
      warn_only: true
      on_failure: |
        ⚠️ rsync is not installed on your system.

        RECOMMENDED: Install rsync for better performance and reliability.

        Installation instructions:

        macOS (Homebrew):
          brew install rsync

        macOS (without Homebrew):
          Install Homebrew first: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
          Then: brew install rsync

        Linux (Debian/Ubuntu):
          sudo apt-get update && sudo apt-get install -y rsync

        Linux (RHEL/CentOS):
          sudo yum install -y rsync

        Windows:
          Option 1 (Recommended): Use WSL (Windows Subsystem for Linux)
            1. Open PowerShell as Administrator
            2. Run: wsl --install
            3. Restart your computer
            4. After restart, run this setup again in WSL terminal

          Option 2: Git for Windows
            1. Install Git for Windows from: https://git-scm.com/download/win
            2. Use Git Bash terminal

        ---

        SCP FALLBACK (if rsync installation is not possible):

        If you cannot install rsync, scp can be used as alternative.

        IMPORTANT: scp with wildcard (*) does NOT copy dotfiles (files starting with .)
        When using scp, you MUST explicitly copy dotfiles separately:

        Example for metadata directory:
          scp -r data-analyst:server/metadata/* ./server/metadata/
          scp data-analyst:server/metadata/.* ./server/metadata/

        Or copy the entire directory (includes dotfiles):
          scp -r data-analyst:server/metadata ./server/

    # Fetches ~/.sync_settings.yaml from the server into a per-user file under
    # /tmp; when the copy fails, writes defaults with every listed dataset off.
    # The heredoc body and its terminator sit at the script's left margin on
    # purpose: the terminator must start the line, and the dataset keys keep
    # their leading spaces because later steps grep for indented keys.
    - name: 'download_sync_settings'
      description: "Download user's dataset sync preferences from server"
      action: |
        SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"
        if scp -q data-analyst:~/.sync_settings.yaml "$SYNC_CONFIG" 2>/dev/null; then
          echo "Settings loaded from server"
        else
          # No custom settings yet - create defaults (all optional datasets disabled)
          cat > "$SYNC_CONFIG" << 'DEFAULTS'
        datasets:
          jira: false
          jira_attachments: false
          kbc_telemetry_expert: false
        DEFAULTS
          echo "No custom settings found, using defaults"
        fi
      requires:
        - 'test_ssh_connection'
      message: |
        📥 Downloading dataset preferences from portal...

        ✅ Sync settings loaded (manage at {webapp_url})

    # Mirrors server/scripts/ from the server into ./server/scripts/.
    - name: 'download_scripts'
      description: 'Download setup scripts from server'
      action: |
        rsync -avz data-analyst:server/scripts/ ./server/scripts/
      requires:
        - 'test_ssh_connection'
        - 'create_folders'
        - 'check_rsync'
      message: |
        📥 Downloading helper scripts from server...

        ✅ Scripts downloaded successfully

    # Mirrors docs, examples and metadata. Docs belonging to datasets marked
    # `false` in the sync settings are excluded from the docs transfer.
    - name: 'download_docs'
      description: 'Download documentation from server'
      action: |
        SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"

        # Build exclude list for disabled datasets (generic based on settings)
        DOC_EXCLUDES=""
        if [[ -f "$SYNC_CONFIG" ]]; then
          for name in $(grep -E '^\s+\w+:\s*false' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' '); do
            DOC_EXCLUDES="$DOC_EXCLUDES --exclude=datasets/${name}* --exclude=${name}_*"
          done
        fi

        # $DOC_EXCLUDES is left unquoted on purpose: it expands to several options
        rsync -avz $DOC_EXCLUDES data-analyst:server/docs/ ./server/docs/
        rsync -avz data-analyst:server/examples/ ./server/examples/
        rsync -avz data-analyst:server/metadata/ ./server/metadata/
      requires:
        - 'test_ssh_connection'
        - 'create_folders'
        - 'check_rsync'
        - 'download_sync_settings'
      message: |
        📥 Downloading documentation, examples, and metadata from server...

        ✅ Documentation downloaded successfully

    # Two-phase parquet sync: first everything except the optional datasets
    # named in the sync settings, then each dataset that is set to `true`.
    - name: 'download_data'
      description: 'Download data from server'
      action: |
        SYNC_CONFIG="/tmp/.sync_settings_$(id -u).yaml"

        # Exclude ALL optional datasets from core sync (generic based on settings)
        PARQUET_EXCLUDES=""
        ENABLED_DATASETS=""
        if [[ -f "$SYNC_CONFIG" ]]; then
          for name in $(grep -E '^\s+\w+:\s*(true|false)' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' '); do
            PARQUET_EXCLUDES="$PARQUET_EXCLUDES --exclude=${name}/"
          done
          ENABLED_DATASETS=$(grep -E '^\s+\w+:\s*true' "$SYNC_CONFIG" | sed 's/:.*//' | tr -d ' ')
        fi

        # Sync core data (excludes all optional datasets)
        rsync -avz --progress $PARQUET_EXCLUDES data-analyst:server/parquet/ ./server/parquet/

        # Sync each enabled optional dataset individually.
        # Still best-effort (a failure does not abort the step), but the user is
        # told which dataset was skipped instead of the failure being hidden.
        for name in $ENABLED_DATASETS; do
          echo ""
          echo "Syncing optional dataset: $name"
          rsync -avz --progress "data-analyst:server/parquet/${name}/" "./server/parquet/${name}/" 2>/dev/null \
            || echo "WARNING: could not sync optional dataset '$name' (skipped)" >&2
        done
      requires:
        - 'test_ssh_connection'
        - 'create_folders'
        - 'check_rsync'
        - 'download_sync_settings'
      message: |
        📥 Downloading data files from server...

        This is the largest download and may take 5-10 minutes depending on your connection and current data volume.
        Only datasets enabled in your portal settings will be downloaded.

        ✅ Data downloaded successfully! All enabled tables are now available locally.

    # Creates ./.venv and installs the Python packages; skipped when a venv
    # interpreter already exists (Unix bin/ or Windows Scripts/ layout).
    - name: 'setup_venv'
      description: 'Create Python virtual environment and install dependencies'
      check: 'test -f ./.venv/bin/python || test -f ./.venv/Scripts/python.exe'
      action: |
        # Use python3 if available, otherwise python (Windows compatibility)
        if command -v python3 >/dev/null 2>&1; then
          PYTHON_CMD=python3
        else
          PYTHON_CMD=python
        fi

        # Create venv
        $PYTHON_CMD -m venv ./.venv

        # Activate and install dependencies
        if [ -f ./.venv/bin/activate ]; then
          source ./.venv/bin/activate
        else
          source ./.venv/Scripts/activate
        fi

        # Run pip through the venv interpreter: on Windows the pip launcher
        # cannot replace itself, so `pip install --upgrade pip` fails there.
        python -m pip install --upgrade pip --quiet

        # Minimum versions mirror the top-level `dependencies` list of this file
        python -m pip install \
          "pandas>=2.0.0" \
          "pyarrow>=12.0.0" \
          "duckdb>=0.9.0" \
          "pyyaml>=6.0" \
          "python-dotenv>=1.0.0" \
          --quiet
      requires:
        - 'create_folders'
      message: |
        🐍 Setting up Python environment...

        Creating virtual environment and installing dependencies:
        - pandas (data manipulation)
        - pyarrow (Parquet file support)
        - duckdb (analytical database)
        - pyyaml & python-dotenv (configuration)

        This may take 1-2 minutes to download and install packages.

        ✅ Python environment ready! All dependencies installed.
# ---- setup.steps (continued): server venv, DuckDB, Claude context, final check ----

    # Freezes the local venv's packages and installs the same set into
    # ~/.venv on the server.
    - name: 'setup_server_venv'
      description: 'Create Python virtual environment on server for notifications'
      action: |
        # Freeze local requirements
        if [ -f ./.venv/bin/activate ]; then
          source ./.venv/bin/activate
        else
          source ./.venv/Scripts/activate
        fi
        LOCAL_REQ=$(mktemp)
        pip freeze > "$LOCAL_REQ"

        # Create venv on server and install same packages
        ssh data-analyst "python3 -m venv ~/.venv"
        scp "$LOCAL_REQ" data-analyst:~/.analyst_requirements.txt
        ssh data-analyst "~/.venv/bin/pip install --upgrade pip --quiet && ~/.venv/bin/pip install -r ~/.analyst_requirements.txt --quiet && rm -f ~/.analyst_requirements.txt"
        rm -f "$LOCAL_REQ"
      requires:
        - 'setup_venv'
        - 'test_ssh_connection'
      message: |
        Setting up Python environment on server (for notifications)...
        Server Python environment ready!

    # Runs the downloaded setup_views.sh helper.
    - name: 'initialize_duckdb'
      description: 'Initialize DuckDB views on Parquet files'
      action: |
        bash server/scripts/setup_views.sh
      requires:
        - 'download_scripts'
        - 'download_data'
        - 'setup_venv'
      message: |
        🦆 Initializing DuckDB analytical database...

        Creating views for all tables spanning:
        - Company and project data
        - Employee information
        - Sales and financial metrics
        - Product telemetry and usage data

        This may take 30-60 seconds to create all views.

        ✅ DuckDB database initialized! All tables ready for queries.

    # Renders CLAUDE.md from the downloaded template, seeds CLAUDE.local.md if
    # it does not exist yet, and copies the project settings.json into .claude/.
    - name: 'setup_claude_project_context'
      description: 'Create Claude Code project context files'
      action: |
        # Generate CLAUDE.md from template with variable substitution.
        # The output is rendered to a temp file and moved into place only on
        # success: redirecting straight into ./CLAUDE.md would leave an empty
        # file when the template is missing, and verify_project_identity would
        # then report "Wrong project type" on the next run.
        # $USER can be empty under Git Bash, so fall back to $USERNAME.
        LOCAL_USER="${USER:-${USERNAME:-}}"
        CLAUDE_MD_OK=0
        if sed -e "s/{username}/$LOCAL_USER/g" \
            ./server/docs/setup/claude_md_template.txt > ./CLAUDE.md.tmp; then
          mv ./CLAUDE.md.tmp ./CLAUDE.md
          chmod 644 ./CLAUDE.md
          CLAUDE_MD_OK=1
        else
          rm -f ./CLAUDE.md.tmp
          echo "ERROR: could not generate CLAUDE.md from ./server/docs/setup/claude_md_template.txt" >&2
        fi

        # Create CLAUDE.local.md for user's personal customizations (not synced)
        if [[ ! -f "./CLAUDE.local.md" ]]; then
          cat > ./CLAUDE.local.md << 'LOCALEOF'
        # CLAUDE.local.md

        Your personal instructions for Claude Code in this project.
        This file is NOT overwritten by data sync - it is yours to customize.

        ## Your Custom Instructions

        Add your preferences, shortcuts, or project-specific notes below:

        LOCALEOF
          chmod 644 ./CLAUDE.local.md
        fi

        # Create .claude directory and copy settings.json (project permissions)
        mkdir -p ./.claude
        if [[ -f "./server/docs/setup/claude_settings.json" ]]; then
          cp ./server/docs/setup/claude_settings.json ./.claude/settings.json
        fi

        # Make the step's exit status reflect whether CLAUDE.md was produced
        [ "$CLAUDE_MD_OK" -eq 1 ]
      requires:
        - 'download_docs'
      message: |
        📝 Creating project context for Claude Code...

        ✅ CLAUDE.md created (auto-updated on sync)
        ✅ CLAUDE.local.md created (your personal customizations, never overwritten)
        ✅ .claude/settings.json synced (project permissions)

    # Final verification: imports the installed packages with the venv
    # interpreter and prints the confirmation line only when that succeeds.
    - name: 'check_setup'
      description: 'Verify setup completed successfully'
      action: |
        # Pick the venv interpreter (bin/ on Unix, Scripts/ on Windows)
        if [ -f ./.venv/bin/python ]; then
          VENV_PYTHON=./.venv/bin/python
        else
          VENV_PYTHON=./.venv/Scripts/python.exe
        fi

        # pyyaml is imported as `yaml`, python-dotenv as `dotenv`
        "$VENV_PYTHON" -c "import pandas, pyarrow, duckdb, yaml, dotenv" \
          && echo "Setup verification complete"
      requires:
        - 'initialize_duckdb'
      message: |
        🎉 Setup complete! Your AI Data Analyst environment is ready.

        ✅ What's been set up:
        - Loads of interesting data (companies, projects, employees, sales, telemetry)
        - DuckDB analytical database with all views configured
        - Python environment with pandas, pyarrow, duckdb
        - Helper scripts for data sync and freshness checks
        - Complete documentation and examples

        📊 You can now:
        - Start asking questions about your data
        - Explore server/docs/data_description.md for table schemas
        - See docs/GETTING_STARTED.md for query examples

        🔄 Maintenance commands:
        - Sync latest data: bash server/scripts/sync_data.sh

# Python dependencies (installed in setup_venv step)
dependencies:
  - 'pandas>=2.0.0'
  - 'pyarrow>=12.0.0'
  - 'duckdb>=0.9.0'
  - 'pyyaml>=6.0'
  - 'python-dotenv>=1.0.0'