#!/bin/bash
# Sync data from server and upload user files
#
# Usage:
#   bash server/scripts/sync_data.sh            # Full sync (pull server/ + push user/)
#   bash server/scripts/sync_data.sh --dry-run  # Show what would be synced (no changes)
#   bash server/scripts/sync_data.sh --push     # Only upload user/ to server

set -e

# Parse arguments (unrecognised arguments are ignored)
DRY_RUN=""
PUSH_ONLY=false
for arg in "$@"; do
    case "$arg" in
        --dry-run) DRY_RUN="--dry-run" ;;
        --push) PUSH_ONLY=true ;;
    esac
done

# --- SSH alias (read from .sync_connection, default: data-analyst) ---
SSH_HOST="data-analyst"
if [[ -f "./.sync_connection" ]]; then
    # -f2- keeps everything after the first '=' so a value containing '=' is not truncated
    _alias=$(grep '^ssh_alias=' ./.sync_connection 2>/dev/null | cut -d= -f2-)
    if [[ -n "$_alias" ]]; then
        SSH_HOST="$_alias"
    fi
fi

# --- Rsync reliability settings (Issue #197) ---
RSYNC_SSH_OPTS='ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -o ConnectTimeout=30'
RSYNC_TIMEOUT=300
RSYNC_MAX_RETRIES=3
RSYNC_RETRY_DELAY=5

#######################################
# Run rsync with keep-alive SSH options, an I/O timeout and retries.
# Globals:   RSYNC_SSH_OPTS, RSYNC_TIMEOUT, RSYNC_MAX_RETRIES, RSYNC_RETRY_DELAY (read)
# Arguments: all arguments are passed through to rsync unchanged
# Outputs:   progress/warning messages on stdout
# Returns:   0 on success or on a partial transfer (rsync exit 23/24),
#            1 when every attempt failed
#######################################
rsync_reliable() {
    local attempt=1
    local delay=$RSYNC_RETRY_DELAY
    local exit_code

    while [[ $attempt -le $RSYNC_MAX_RETRIES ]]; do
        exit_code=0
        rsync -e "$RSYNC_SSH_OPTS" --timeout="$RSYNC_TIMEOUT" \
            --partial-dir=.rsync-partial "$@" || exit_code=$?

        if [[ $exit_code -eq 0 ]]; then
            return 0
        fi

        # Exit codes 23/24 = partial transfer (permission denied, vanished files) — not retryable
        if [[ $exit_code -eq 23 || $exit_code -eq 24 ]]; then
            echo " Warning: rsync partial transfer (exit $exit_code), continuing..."
            return 0
        fi

        if [[ $attempt -lt $RSYNC_MAX_RETRIES ]]; then
            echo " Rsync failed (exit $exit_code), retrying in ${delay}s (attempt $attempt/$RSYNC_MAX_RETRIES)..."
            sleep "$delay"
            delay=$((delay * 2))   # exponential back-off
        fi
        attempt=$((attempt + 1))
    done

    echo " ERROR: Rsync failed after $RSYNC_MAX_RETRIES attempts"
    return 1
}

#######################################
# Print a checksum of a file, or "none" when it cannot be computed.
# md5sum is not installed everywhere (e.g. stock macOS), so fall back to the
# POSIX cksum utility; the value is only compared against another value
# produced by this same function.
# Arguments: $1 - path of the file
# Outputs:   checksum (first field of the tool output) or "none" on stdout
#######################################
_script_checksum() {
    local sum=""
    if command -v md5sum >/dev/null 2>&1; then
        sum=$(md5sum "$1" 2>/dev/null) || sum=""
    else
        sum=$(cksum "$1" 2>/dev/null) || sum=""
    fi
    sum=${sum%% *}
    printf '%s\n' "${sum:-none}"
}

# --- Self-update check (safety net) ---
# If Claude forgot to run rsync first, this catches script updates
if [[ -z "$DRY_RUN" ]]; then
    SCRIPT_PATH="./server/scripts/sync_data.sh"
    if [[ -f "$SCRIPT_PATH" ]]; then
        OLD_CHECKSUM=$(_script_checksum "$SCRIPT_PATH")
        echo "🔄 Checking for script updates..."
        # Best effort: try rsync first, then scp; a failure of both is ignored on purpose
        rsync -e "$RSYNC_SSH_OPTS" --timeout=30 -avz --quiet "$SSH_HOST:server/scripts/" ./server/scripts/ 2>/dev/null || \
            scp -rq "$SSH_HOST:server/scripts/"* ./server/scripts/ 2>/dev/null || true
        NEW_CHECKSUM=$(_script_checksum "$SCRIPT_PATH")
        if [[ "$OLD_CHECKSUM" != "$NEW_CHECKSUM" && "$OLD_CHECKSUM" != "none" ]]; then
            echo " sync_data.sh updated, restarting with new version..."
            echo ""
            exec bash "$SCRIPT_PATH" "$@"
        fi
        echo " ✅ Scripts up to date"
    fi
fi

# Check if rsync is available, fall back to scp if not
USE_RSYNC=true
if ! command -v rsync >/dev/null 2>&1; then
    USE_RSYNC=false
    echo "⚠️ rsync not found, using scp as fallback (slower, no incremental sync)"
    echo ""
fi

#######################################
# Sync a directory from the server (handles both rsync and scp).
# Usage: sync_from_server <remote_path> <local_path> [--with-dotfiles]
# Globals:   USE_RSYNC, DRY_RUN, SSH_HOST (read)
# Arguments: $1 - remote directory (relative to the remote login directory)
#            $2 - local directory
#            $3 - optional "--with-dotfiles"; only consulted by the scp fallback
# Returns:   status of rsync_reliable in rsync mode; scp failures are ignored
#######################################
sync_from_server() {
    local remote_path="$1"
    local local_path="$2"
    local with_dotfiles="$3"

    if [[ "$USE_RSYNC" == true ]]; then
        # $DRY_RUN is deliberately unquoted: it is either empty (no word) or "--dry-run"
        rsync_reliable -avz --delete $DRY_RUN "$SSH_HOST:${remote_path}/" "${local_path}/"
        return
    fi

    if [[ -n "$DRY_RUN" ]]; then
        echo " [dry-run] Would copy: $SSH_HOST:${remote_path}/* -> ${local_path}/"
        if [[ "$with_dotfiles" == "--with-dotfiles" ]]; then
            echo " [dry-run] Would copy dotfiles: $SSH_HOST:${remote_path}/.* -> ${local_path}/"
        fi
        return
    fi

    mkdir -p "${local_path}"
    # Copy regular files (best effort)
    scp -r "$SSH_HOST:${remote_path}/"* "${local_path}/" 2>/dev/null || true
    # IMPORTANT: scp with * does NOT copy dotfiles, must copy explicitly
    if [[ "$with_dotfiles" == "--with-dotfiles" ]]; then
        scp "$SSH_HOST:${remote_path}/".* "${local_path}/" 2>/dev/null || true
    fi
}

#######################################
# Sync a local directory to the server.
# Globals:   USE_RSYNC, DRY_RUN, SSH_HOST (read)
# Arguments: $1 - local directory
#            $2 - remote directory (relative to the remote login directory)
# Returns:   status of rsync_reliable in rsync mode; scp failures are ignored
# NOTE(review): rsync mode passes --delete, so files that exist only on the
# server side of this directory are removed. The full-sync call site describes
# this push as a backup made without --delete — confirm which one is intended.
#######################################
sync_to_server() {
    local local_path="$1"
    local remote_path="$2"

    if [[ "$USE_RSYNC" == true ]]; then
        # $DRY_RUN is deliberately unquoted: it is either empty (no word) or "--dry-run"
        rsync_reliable -avz --delete $DRY_RUN "${local_path}/" "$SSH_HOST:${remote_path}/"
    elif [[ -n "$DRY_RUN" ]]; then
        echo " [dry-run] Would upload: ${local_path}/* -> $SSH_HOST:${remote_path}/"
    else
        # Regular entries first, then dotfiles (not matched by *); both best effort
        scp -r "${local_path}/"* "$SSH_HOST:${remote_path}/" 2>/dev/null || true
        scp "${local_path}/".* "$SSH_HOST:${remote_path}/" 2>/dev/null || true
    fi
}

# --- Push-only mode: upload user/ (and CLAUDE.local.md) and stop ---
if [[ "$PUSH_ONLY" == true ]]; then
    if [[ -n "$DRY_RUN" ]]; then
        echo "🔍 DRY RUN MODE - showing what would be pushed..."
    else
        echo "📤 Uploading user files to server..."
    fi
    echo ""

    sync_to_server ./user user

    # Backup CLAUDE.local.md to server home (disaster recovery)
    if [[ -f "./CLAUDE.local.md" ]]; then
        if [[ -n "$DRY_RUN" ]]; then
            echo " [dry-run] Would backup: ./CLAUDE.local.md -> $SSH_HOST:~/CLAUDE.local.md"
        else
            scp -q ./CLAUDE.local.md "$SSH_HOST:~/CLAUDE.local.md" && \
                echo "📝 CLAUDE.local.md backed up to server"
        fi
    fi

    if [[ -n "$DRY_RUN" ]]; then
        echo ""
        echo "🔍 Dry run complete - no changes made"
        echo "💡 To perform actual push, run: bash server/scripts/sync_data.sh --push"
    else
        echo ""
        echo "✅ User files uploaded to server!"
    fi
    exit 0
fi

if [[ -n "$DRY_RUN" ]]; then
    echo "🔍 DRY RUN MODE - showing what would be synced..."
    echo ""
else
    echo "🔄 Syncing data from server..."
    echo ""
fi

# --- Migration: detect old directory structure and migrate ---
if [[ -z "$DRY_RUN" ]] && [[ -d "./data/parquet" ]] && [[ ! -d "./server/parquet" ]]; then
    echo "🔄 Migrating from old directory structure to server/ + user/ layout..."
    echo ""

    # Create new structure
    mkdir -p ./server/docs ./server/scripts ./server/examples ./server/parquet ./server/metadata
    mkdir -p ./user/duckdb ./user/notifications ./user/artifacts ./user/scripts ./user/parquet ./user/sessions

    # Copy data into the new layout (old directories are left in place)
    if [[ -d "./data/parquet" ]]; then
        cp -r ./data/parquet/* ./server/parquet/ 2>/dev/null || true
    fi
    if [[ -d "./data/metadata" ]]; then
        cp -r ./data/metadata/* ./server/metadata/ 2>/dev/null || true
    fi
    if [[ -d "./data/duckdb" ]]; then
        cp -r ./data/duckdb/* ./user/duckdb/ 2>/dev/null || true
    fi
    # Symlinked docs/scripts are skipped
    if [[ -d "./docs" ]] && [[ ! -L "./docs" ]]; then
        cp -r ./docs/* ./server/docs/ 2>/dev/null || true
    fi
    if [[ -d "./scripts" ]] && [[ ! -L "./scripts" ]]; then
        cp -r ./scripts/* ./server/scripts/ 2>/dev/null || true
    fi

    echo "✅ Migration complete. Old directories preserved (remove manually when ready)."
    echo " To clean up: rm -rf ./data ./docs ./scripts"
    echo ""
    echo "📌 From now on, use this command to sync:"
    echo " bash server/scripts/sync_data.sh"
    echo ""
fi

# --- Download sync settings first (needed for excludes) ---
# Config is managed via the web portal (Data Settings page)
# Stored on server in ~/.sync_settings.yaml
# The per-user paths are fixed (not mktemp) because a later --dry-run reuses
# the settings file left behind by a previous real run.
SYNC_CONFIG_LOCAL="/tmp/.sync_settings_$(id -u).yaml"
SYNC_FILTER_LOCAL="/tmp/.sync_rsync_filter_$(id -u)"

if [[ -z "$DRY_RUN" ]]; then
    if scp -q "$SSH_HOST:~/.sync_settings.yaml" "$SYNC_CONFIG_LOCAL" 2>/dev/null; then
        echo "📥 Sync settings loaded from portal"
    else
        # No custom settings on server — create empty config (no optional datasets)
        printf 'datasets: {}\n' > "$SYNC_CONFIG_LOCAL"
    fi

    # Download rsync filter for per-table sync
    if scp -q "$SSH_HOST:~/.sync_rsync_filter" "$SYNC_FILTER_LOCAL" 2>/dev/null; then
        echo " ✅ Filter file loaded"
    else
        # No filter file = no per-table filtering
        rm -f "$SYNC_FILTER_LOCAL"
    fi
    echo ""
else
    # For dry-run, still need settings to show what would happen
    if [[ ! -f "$SYNC_CONFIG_LOCAL" ]]; then
        printf 'datasets: {}\n' > "$SYNC_CONFIG_LOCAL"
    fi
    # Download rsync filter for dry-run too
    scp -q "$SSH_HOST:~/.sync_rsync_filter" "$SYNC_FILTER_LOCAL" 2>/dev/null || rm -f "$SYNC_FILTER_LOCAL"
fi

# --- Sync server/ content (read-only from server, --delete removes obsolete files) ---
echo "📋 Syncing documentation and scripts..."
#######################################
# Print the names of optional datasets listed in the sync settings file whose
# value matches the given alternation, one name per line.
# Globals:   SYNC_CONFIG_LOCAL (read)
# Arguments: $1 - extended-regex alternation for the value, e.g. 'true',
#                 'false' or 'true|false'
# Outputs:   dataset names on stdout (nothing if the file is missing/unreadable)
#######################################
_list_datasets() {
    grep -E "^\s+\w+:\s*($1)" "$SYNC_CONFIG_LOCAL" 2>/dev/null | sed 's/:.*//' | tr -d ' '
}

#######################################
# Read "key=value" from ./.sync_connection.
# Arguments: $1 - key, $2 - fallback used when the file or key is missing/empty
# Outputs:   the value (everything after the first '=') or the fallback
#######################################
_conn_value() {
    local key="$1" fallback="$2" value=""
    if [[ -f "./.sync_connection" ]]; then
        value=$(grep "^${key}=" ./.sync_connection 2>/dev/null | head -n 1 | cut -d= -f2-)
    fi
    printf '%s\n' "${value:-$fallback}"
}

#######################################
# Escape a string for use as the replacement part of a sed "s|...|...|" command
# (backslash, '&' and the '|' delimiter would otherwise be interpreted by sed).
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
#######################################
_sed_escape() {
    printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Build exclude list for docs based on disabled optional datasets.
# An array keeps every --exclude pattern a single, un-globbed word.
# NOTE(review): patterns are prefix matches, so a disabled dataset "foo" also
# excludes docs of a dataset named "foobar" — confirm names never share prefixes.
DOCS_EXCLUDES=()
if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
    while IFS= read -r name; do
        [[ -n "$name" ]] || continue
        DOCS_EXCLUDES+=("--exclude=datasets/${name}*" "--exclude=${name}_*")
    done < <(_list_datasets 'false')
fi

# Sync docs with excludes for disabled datasets
# ($DRY_RUN is deliberately unquoted: it is either empty or "--dry-run")
if [[ "$USE_RSYNC" == true ]]; then
    rsync_reliable -avz --delete "${DOCS_EXCLUDES[@]}" $DRY_RUN "$SSH_HOST:server/docs/" "./server/docs/"
else
    sync_from_server server/docs ./server/docs
fi
sync_from_server server/scripts ./server/scripts
sync_from_server server/examples ./server/examples
sync_from_server server/metadata ./server/metadata

if [[ -z "$DRY_RUN" ]]; then
    # Regenerate CLAUDE.md from updated template (preserves user's CLAUDE.local.md)
    if [[ -f "./server/docs/setup/claude_md_template.txt" ]]; then
        # Extract username from existing CLAUDE.md, fall back to $USER.
        # sed is used instead of "grep -oP" because PCRE support is missing in BSD/macOS grep.
        ANALYST_USER="$USER"
        if [[ -f "./CLAUDE.md" ]]; then
            EXISTING_USER=$(sed -n 's/.*\*\*Analyst\*\* | \([^[:space:]]\{1,\}\).*/\1/p' ./CLAUDE.md 2>/dev/null | head -n 1) || true
            if [[ -n "$EXISTING_USER" ]]; then
                ANALYST_USER="$EXISTING_USER"
            fi
        fi

        # Read connection details from .sync_connection (written by bootstrap)
        # Use TMPL_ prefix to avoid overwriting $SSH_HOST (the SSH alias used for rsync)
        TMPL_SSH_ALIAS=$(_conn_value ssh_alias "$SSH_HOST")
        TMPL_SERVER_HOST=$(_conn_value server_host "unknown")
        TMPL_WEBAPP_URL=$(_conn_value webapp_url "")

        # Fallback: extract host from SSH config.
        # A stateful scan is required: an awk range "/^Host alias$/,/^Host /" would
        # begin and end on the same "Host" record and never reach the HostName line.
        if [[ "$TMPL_SERVER_HOST" == "unknown" ]] && [[ -f "$HOME/.ssh/config" ]]; then
            TMPL_SERVER_HOST=$(awk -v alias="$TMPL_SSH_ALIAS" '
                tolower($1) == "host" {
                    in_block = 0
                    for (i = 2; i <= NF; i++) if ($i == alias) in_block = 1
                    next
                }
                in_block && tolower($1) == "hostname" { print $2; exit }
            ' "$HOME/.ssh/config" 2>/dev/null | head -n 1)
            TMPL_SERVER_HOST="${TMPL_SERVER_HOST:-unknown}"
        fi
        TMPL_WEBAPP_URL="${TMPL_WEBAPP_URL:-https://${TMPL_SERVER_HOST}}"

        # Render to a temporary file first so a failing sed cannot leave a truncated CLAUDE.md
        sed -e "s|{username}|$(_sed_escape "$ANALYST_USER")|g" \
            -e "s|{ssh_alias}|$(_sed_escape "$TMPL_SSH_ALIAS")|g" \
            -e "s|{server_host}|$(_sed_escape "$TMPL_SERVER_HOST")|g" \
            -e "s|{webapp_url}|$(_sed_escape "$TMPL_WEBAPP_URL")|g" \
            ./server/docs/setup/claude_md_template.txt > ./CLAUDE.md.tmp
        mv ./CLAUDE.md.tmp ./CLAUDE.md
        echo "📝 CLAUDE.md updated from latest template"
    fi

    # Update .claude/settings.json (project permissions)
    if [[ -f "./server/docs/setup/claude_settings.json" ]]; then
        mkdir -p ./.claude
        cp ./server/docs/setup/claude_settings.json ./.claude/settings.json
        echo "🔒 .claude/settings.json updated"
    fi
    echo ""
fi

# Sync core parquet data (excludes optional datasets not enabled in settings)
echo "📦 Syncing core parquet files..."

# Build parquet exclude list from optional datasets in sync settings.
# Every optional dataset (enabled or not) is excluded here; enabled ones are
# synced individually further below.
PARQUET_EXCLUDES=()
if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
    while IFS= read -r name; do
        [[ -n "$name" ]] || continue
        PARQUET_EXCLUDES+=("--exclude=${name}/")
    done < <(_list_datasets 'true|false')
fi

if [[ "$USE_RSYNC" == true ]]; then
    if [[ -f "$SYNC_FILTER_LOCAL" ]] && grep -q "table_mode: explicit" "$SYNC_FILTER_LOCAL" 2>/dev/null; then
        echo " Using per-table filter (explicit mode)"
        rsync_reliable -av --delete --progress --filter="merge $SYNC_FILTER_LOCAL" $DRY_RUN "$SSH_HOST:server/parquet/" ./server/parquet/
    else
        rsync_reliable -av --delete --progress "${PARQUET_EXCLUDES[@]}" $DRY_RUN "$SSH_HOST:server/parquet/" ./server/parquet/
    fi
else
    sync_from_server server/parquet ./server/parquet
fi

# Create user/ directories if missing
if [[ -z "$DRY_RUN" ]]; then
    mkdir -p ./user/duckdb ./user/notifications ./user/artifacts ./user/scripts ./user/parquet ./user/sessions
fi

# --- Sync optional datasets (generic, driven by sync_settings.yaml) ---
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
    # Sync enabled datasets.
    # Names match \w+ so word-splitting the list is safe; a for-loop (rather than
    # "while read") keeps stdin untouched for the ssh-based commands in the body.
    ENABLED_DATASETS=$(_list_datasets 'true')
    for name in $ENABLED_DATASETS; do
        echo ""
        # Run dataset-specific sync script if it exists
        if [[ -f "${SCRIPT_DIR}/sync_${name}.sh" ]]; then
            bash "${SCRIPT_DIR}/sync_${name}.sh" $DRY_RUN
        else
            # Generic: sync parquet + docs for this dataset
            echo "📊 Syncing optional dataset: $name"
            # Only attempt rsync when it is installed; without it the scp fallback
            # above has already copied server/parquet and server/docs recursively,
            # and calling rsync_reliable would just burn retries and sleeps.
            if [[ "$USE_RSYNC" == true ]]; then
                rsync_reliable -av --delete --progress $DRY_RUN "$SSH_HOST:server/parquet/${name}/" "./server/parquet/${name}/"
                mkdir -p ./server/docs/datasets
                # Dataset docs are optional: errors are silenced and ignored
                rsync_reliable -avz $DRY_RUN "$SSH_HOST:server/docs/datasets/${name}*" ./server/docs/datasets/ 2>/dev/null || true
            fi
        fi
    done

    # Cleanup disabled datasets (remove stale local data so Claude doesn't see it)
    DISABLED_DATASETS=$(_list_datasets 'false')
    for name in $DISABLED_DATASETS; do
        CLEANUP_NEEDED=false
        if [[ -d "./server/parquet/${name}" ]]; then
            CLEANUP_NEEDED=true
        fi
        # compgen -G succeeds only when the glob matches at least one path
        if compgen -G "./server/docs/datasets/${name}*" >/dev/null 2>&1; then
            CLEANUP_NEEDED=true
        fi

        if [[ "$CLEANUP_NEEDED" == true ]]; then
            echo ""
            if [[ -n "$DRY_RUN" ]]; then
                echo "🧹 [dry-run] Would clean up disabled dataset: $name"
            else
                echo "🧹 Cleaning up disabled dataset: $name"
                # ${name:?} aborts instead of widening the rm target if name were ever empty
                rm -rf -- "./server/parquet/${name:?}" 2>/dev/null
                rm -rf -- ./server/docs/datasets/"${name:?}"* 2>/dev/null
            fi
        fi
    done
fi

# --- Backup: collect missed session transcripts ---
# The SessionEnd hook copies transcripts to user/sessions/ automatically.
# This backup catches any missed transcripts (e.g. terminal killed via SIGKILL).
# Collect Claude Code session transcripts that were not copied to user/sessions/ yet
if [[ -z "$DRY_RUN" ]]; then
    # Encode project path the same way Claude Code does:
    # Replaces ALL non-alphanumeric characters with hyphens
    #   /Users/john/my_project -> -Users-john-my-project  (macOS/Linux)
    #   /c/Users/john/project  -> -c-Users-john-project   (Windows/Git Bash)
    # Leading hyphens are stripped here and a single one is re-added below.
    ENCODED_PATH=$(pwd | sed 's|[^a-zA-Z0-9]|-|g; s|^-*||')
    TRANSCRIPT_DIR="$HOME/.claude/projects/-${ENCODED_PATH}"

    if [[ -d "$TRANSCRIPT_DIR" ]]; then
        # Make sure the target exists so the copies below cannot fail silently
        mkdir -p ./user/sessions
        COLLECTED=0
        for jsonl in "$TRANSCRIPT_DIR"/*.jsonl; do
            # Skips the literal pattern when the glob matched nothing
            [[ -f "$jsonl" ]] || continue
            SESSION_ID=$(basename "$jsonl" .jsonl)

            # Skip if already collected (check by session_id, ignoring date prefix)
            if compgen -G "./user/sessions/*_${SESSION_ID}.jsonl" >/dev/null 2>&1; then
                continue
            fi

            # Use the file's actual modification date, not current date.
            # "date -r FILE" is not supported by every date implementation; fall
            # back to today's date instead of letting set -e abort the whole sync.
            FILE_DATE=$(date -r "$jsonl" '+%Y-%m-%d' 2>/dev/null) || FILE_DATE=$(date '+%Y-%m-%d')
            TARGET="./user/sessions/${FILE_DATE}_${SESSION_ID}.jsonl"
            if cp "$jsonl" "$TARGET" 2>/dev/null; then
                COLLECTED=$((COLLECTED + 1))
            fi
        done

        if [[ $COLLECTED -gt 0 ]]; then
            echo "📋 Collected $COLLECTED missed session transcript(s) to user/sessions/"
        fi
    fi
fi

# --- Push user/ to server (backup + runtime for notifications) ---
# NOTE(review): this header used to say "no --delete to preserve backups", but
# sync_to_server as defined in this file passes --delete to rsync, so files that
# exist only on the server side of user/ are removed. Confirm the intended behaviour.
echo ""
echo "📤 Uploading user files to server..."
sync_to_server ./user user

# Backup CLAUDE.local.md to server home (disaster recovery)
if [[ -f "./CLAUDE.local.md" ]]; then
    if [[ -n "$DRY_RUN" ]]; then
        echo " [dry-run] Would backup: ./CLAUDE.local.md -> $SSH_HOST:~/CLAUDE.local.md"
    else
        # A failed backup only suppresses the confirmation message
        scp -q ./CLAUDE.local.md "$SSH_HOST:~/CLAUDE.local.md" && \
            echo "📝 CLAUDE.local.md backed up to server"
    fi
fi

# --- Sync corporate memory rules ---
# Rules are generated server-side based on user votes
echo ""
echo "📚 Syncing corporate memory rules..."
# Download corporate memory rules into .claude/rules/ (skipped on dry-run)
if [[ -z "$DRY_RUN" ]]; then
    mkdir -p .claude/rules
    if scp -rq "$SSH_HOST:~/.claude_rules/"* .claude/rules/ 2>/dev/null; then
        # Count with a glob instead of "ls | wc -l" (wc pads its output on BSD/macOS)
        RULES_COUNT=0
        for rule_file in .claude/rules/km_*.md; do
            if [[ -e "$rule_file" ]]; then
                RULES_COUNT=$((RULES_COUNT + 1))
            fi
        done
        echo " ✅ $RULES_COUNT knowledge rules synced to .claude/rules/"
    else
        echo " ℹ️ No corporate memory rules yet (upvote items in the portal)"
    fi
else
    echo " [dry-run] Would sync corporate memory rules to .claude/rules/"
fi

# Note: Python environment on server is set up during bootstrap (not synced).
# Server venv is created via: ssh {alias} 'python3 -m venv ~/.venv && pip install ...'
# This avoids cross-platform pip freeze issues (Windows/macOS -> Linux).

# Only update DuckDB and check freshness if NOT dry-run
if [[ -z "$DRY_RUN" ]]; then
    # Determine script location
    SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

    # Check if DuckDB is corrupted (can't open), delete if so
    DUCKDB_FILE="user/duckdb/analytics.duckdb"
    if [[ -f "$DUCKDB_FILE" ]]; then
        echo ""
        echo "🔍 Validating DuckDB..."

        # Find an interpreter that can actually import duckdb. Without this
        # check a missing "python" binary or module would be indistinguishable
        # from a corrupted database and the user's file would be deleted.
        DUCKDB_PYTHON=""
        for candidate in python python3; do
            if command -v "$candidate" >/dev/null 2>&1 \
                && "$candidate" -c "import duckdb" >/dev/null 2>&1; then
                DUCKDB_PYTHON="$candidate"
                break
            fi
        done

        if [[ -z "$DUCKDB_PYTHON" ]]; then
            echo " ⚠️ Cannot validate DuckDB (no Python with the duckdb module found), keeping existing file"
        elif ! "$DUCKDB_PYTHON" -c "import sys, duckdb; duckdb.connect(sys.argv[1]).execute('SELECT 1')" "$DUCKDB_FILE" 2>/dev/null; then
            # The path is passed as an argument rather than interpolated into Python source
            echo "⚠️ DuckDB corrupted, will recreate from parquet files..."
            rm -f "$DUCKDB_FILE"
        else
            echo " ✅ DuckDB OK"
        fi
    fi

    # Reinitialize DuckDB views (creates new DB if deleted, or updates views if exists)
    echo ""
    echo "🔄 Updating DuckDB views..."
    bash "${SCRIPT_DIR}/setup_views.sh"

    # Update sync timestamp on server (used by webapp Account card); best effort
    ssh "$SSH_HOST" "touch ~/server/" 2>/dev/null || true

    echo ""
    echo "✅ Data sync complete!"
    echo ""
else
    echo ""
    echo "🔍 Dry run complete - no changes made"
    echo ""
    echo "💡 To perform actual sync, run: bash server/scripts/sync_data.sh"
fi