agnes-the-ai-analyst/scripts/sync_data.sh
Petr e17dd85504 Remove hardcoded Jira/Keboola references from sync_data.sh
- Silent fallback when no sync settings exist (no 'Jira disabled' message)
- Generic dataset exclude/include loop driven by sync_settings.yaml
- Generic cleanup loop for disabled datasets
- Replaces 100+ lines of hardcoded Jira/kbc_telemetry_expert blocks
2026-03-15 01:02:37 +01:00

474 lines
18 KiB
Bash
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Sync data from server and upload user files
#
# Usage:
# bash server/scripts/sync_data.sh # Full sync (pull server/ + push user/)
# bash server/scripts/sync_data.sh --dry-run # Show what would be synced (no changes)
# bash server/scripts/sync_data.sh --push # Only upload user/ to server
# Abort as soon as any unguarded command fails.
set -e

# Parse arguments. Recognised flags:
#   --dry-run  sets DRY_RUN to "--dry-run" (forwarded verbatim to rsync)
#   --push     sets PUSH_ONLY=true (upload user/ only)
# Any other argument is ignored. The positional parameters are left intact
# because the self-update step re-executes the script with "$@".
PUSH_ONLY=false
DRY_RUN=""
for arg; do
  if [[ "$arg" == "--dry-run" ]]; then
    DRY_RUN="--dry-run"
  elif [[ "$arg" == "--push" ]]; then
    PUSH_ONLY=true
  fi
done
# --- SSH alias ---
# SSH_HOST is the alias used for every ssh/scp/rsync call. It defaults to
# "data-analyst" and is overridden by a non-empty "ssh_alias=<value>" line in
# ./.sync_connection when that file exists.
SSH_HOST="data-analyst"
if [[ -f "./.sync_connection" ]]; then
  _alias=$(grep '^ssh_alias=' ./.sync_connection 2>/dev/null | cut -d= -f2)
  SSH_HOST="${_alias:-$SSH_HOST}"
fi
# --- Rsync reliability settings (Issue #197) ---
# Remote-shell command handed to rsync via -e: SSH keep-alive probes every 60s
# (ServerAliveCountMax=3) and a 30s connection timeout.
RSYNC_SSH_OPTS='ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -o ConnectTimeout=30'
# Value passed to rsync's --timeout option by rsync_reliable.
RSYNC_TIMEOUT=300
# Maximum number of rsync attempts made by rsync_reliable.
RSYNC_MAX_RETRIES=3
# Seconds to wait before the first retry; rsync_reliable doubles it after each failed attempt.
RSYNC_RETRY_DELAY=5
# Run rsync with the shared SSH options, an I/O timeout, a partial-transfer
# directory and retry-with-backoff.
# Globals (read): RSYNC_SSH_OPTS, RSYNC_TIMEOUT, RSYNC_MAX_RETRIES, RSYNC_RETRY_DELAY
# Arguments:      forwarded verbatim to rsync (options, source, destination)
# Outputs:        progress/warning/error messages on stdout
# Returns:        0 on success or on a partial transfer (rsync exit 23/24),
#                 1 when rsync cannot run at all or every attempt failed
rsync_reliable() {
  local attempt=1
  local delay=$RSYNC_RETRY_DELAY
  local exit_code
  while [[ $attempt -le $RSYNC_MAX_RETRIES ]]; do
    # Capture the status with "||" so a failure never trips "set -e" here.
    exit_code=0
    rsync -e "$RSYNC_SSH_OPTS" --timeout="$RSYNC_TIMEOUT" \
      --partial-dir=.rsync-partial "$@" || exit_code=$?
    case "$exit_code" in
      0)
        return 0
        ;;
      23 | 24)
        # Exit codes 23/24 = partial transfer (permission denied, vanished files) — not retryable
        echo " Warning: rsync partial transfer (exit $exit_code), continuing..."
        return 0
        ;;
      1 | 126 | 127)
        # 1 = syntax/usage error, 126/127 = command cannot be run / not found.
        # None of these can succeed on a retry, so fail immediately instead
        # of sleeping through the whole backoff schedule.
        echo " ERROR: Rsync cannot run (exit $exit_code), not retrying"
        return 1
        ;;
    esac
    if [[ $attempt -lt $RSYNC_MAX_RETRIES ]]; then
      echo " Rsync failed (exit $exit_code), retrying in ${delay}s (attempt $attempt/$RSYNC_MAX_RETRIES)..."
      sleep "$delay"
      delay=$((delay * 2))
    fi
    attempt=$((attempt + 1))
  done
  echo " ERROR: Rsync failed after $RSYNC_MAX_RETRIES attempts"
  return 1
}
# --- Self-update check (safety net) ---
# If Claude forgot to run rsync first, this catches script updates

# Print a content fingerprint of file $1, or the literal "none" when it cannot
# be computed (file missing/unreadable, or no checksum tool produced output).
# md5sum is tried first, then BSD "md5 -q", then POSIX cksum, so the check
# also works on systems that ship without md5sum.
script_checksum() {
  local file="$1"
  local sum=""
  if command -v md5sum >/dev/null 2>&1; then
    sum=$(md5sum "$file" 2>/dev/null | cut -d' ' -f1) || sum=""
  elif command -v md5 >/dev/null 2>&1; then
    sum=$(md5 -q "$file" 2>/dev/null) || sum=""
  else
    sum=$(cksum "$file" 2>/dev/null | cut -d' ' -f1,2) || sum=""
  fi
  printf '%s\n' "${sum:-none}"
}

if [[ -z "$DRY_RUN" ]]; then
  SCRIPT_PATH="./server/scripts/sync_data.sh"
  if [[ -f "$SCRIPT_PATH" ]]; then
    OLD_CHECKSUM=$(script_checksum "$SCRIPT_PATH")
    echo "🔄 Checking for script updates..."
    # Best effort: try rsync first, fall back to scp, and never abort the
    # sync just because the script refresh failed.
    rsync -e "$RSYNC_SSH_OPTS" --timeout=30 -avz --quiet "$SSH_HOST:server/scripts/" ./server/scripts/ 2>/dev/null || \
      scp -rq "$SSH_HOST:server/scripts/*" ./server/scripts/ 2>/dev/null || true
    NEW_CHECKSUM=$(script_checksum "$SCRIPT_PATH")
    # Restart only when both fingerprints are known and differ.
    if [[ "$OLD_CHECKSUM" != "none" && "$NEW_CHECKSUM" != "none" \
      && "$OLD_CHECKSUM" != "$NEW_CHECKSUM" ]]; then
      echo " sync_data.sh updated, restarting with new version..."
      echo ""
      exec bash "$SCRIPT_PATH" "$@"
    fi
    echo " ✅ Scripts up to date"
  fi
fi
# Decide the transfer tool once: USE_RSYNC=true when rsync is on PATH,
# otherwise USE_RSYNC=false and the scp code paths are used.
if command -v rsync >/dev/null 2>&1; then
  USE_RSYNC=true
else
  USE_RSYNC=false
  echo "⚠️ rsync not found, using scp as fallback (slower, no incremental sync)"
  echo ""
fi
# Helper function: mirror a server directory into a local one.
# Usage: sync_from_server <remote_path> <local_path> [--with-dotfiles]
# Globals (read): USE_RSYNC, DRY_RUN, SSH_HOST
# With rsync the copy uses --delete (local files missing on the server are
# removed). Without rsync it falls back to best-effort scp; the optional
# --with-dotfiles flag only matters there, because the "*" glob skips dotfiles.
sync_from_server() {
  local src="$1"
  local dest="$2"
  local dotfiles_flag="$3"

  # Preferred path: incremental mirror through the retrying rsync wrapper.
  if [[ "$USE_RSYNC" == true ]]; then
    rsync_reliable -avz --delete $DRY_RUN "$SSH_HOST:${src}/" "${dest}/"
    return
  fi

  # scp fallback, dry-run: only describe what would be copied.
  if [[ -n "$DRY_RUN" ]]; then
    echo " [dry-run] Would copy: $SSH_HOST:${src}/* -> ${dest}/"
    if [[ "$dotfiles_flag" == "--with-dotfiles" ]]; then
      echo " [dry-run] Would copy dotfiles: $SSH_HOST:${src}/.* -> ${dest}/"
    fi
    return
  fi

  # scp fallback, real run: errors are deliberately ignored (best effort).
  mkdir -p "${dest}"
  scp -r "$SSH_HOST:${src}/"* "${dest}/" 2>/dev/null || true
  # IMPORTANT: scp with * does NOT copy dotfiles, must copy explicitly
  if [[ "$dotfiles_flag" == "--with-dotfiles" ]]; then
    scp "$SSH_HOST:${src}/".* "${dest}/" 2>/dev/null || true
  fi
}
# Helper function: upload a local directory to the server.
# Usage: sync_to_server <local_path> <remote_path>
# Globals (read): USE_RSYNC, DRY_RUN, SSH_HOST
# The upload is additive: rsync is invoked WITHOUT --delete. The server copy
# of user/ doubles as a backup (the final push step is documented as
# "no --delete to preserve backups"), and with --delete a push from a fresh or
# partially restored checkout would erase that backup.
# Without rsync the upload falls back to best-effort scp (errors ignored).
sync_to_server() {
  local local_path="$1"
  local remote_path="$2"
  if [[ "$USE_RSYNC" == true ]]; then
    # ${DRY_RUN:+...} expands to nothing when DRY_RUN is empty, so rsync never
    # receives an empty argument.
    rsync_reliable -avz ${DRY_RUN:+"$DRY_RUN"} "${local_path}/" "$SSH_HOST:${remote_path}/"
  elif [[ -n "$DRY_RUN" ]]; then
    echo " [dry-run] Would upload: ${local_path}/* -> $SSH_HOST:${remote_path}/"
  else
    # "*" does not match dotfiles, so they are uploaded by a second scp call.
    scp -r "${local_path}/"* "$SSH_HOST:${remote_path}/" 2>/dev/null || true
    scp "${local_path}/".* "$SSH_HOST:${remote_path}/" 2>/dev/null || true
  fi
}
# --push mode: upload user/ (plus a CLAUDE.local.md backup) and exit without
# pulling anything from the server.
if [[ "$PUSH_ONLY" == true ]]; then
  if [[ -z "$DRY_RUN" ]]; then
    echo "📤 Uploading user files to server..."
  else
    echo "🔍 DRY RUN MODE - showing what would be pushed..."
  fi
  echo ""
  sync_to_server ./user user

  # Backup CLAUDE.local.md to server home (disaster recovery)
  if [[ -f "./CLAUDE.local.md" ]]; then
    if [[ -z "$DRY_RUN" ]]; then
      # A failed copy is tolerated; the confirmation is printed only on success.
      if scp -q ./CLAUDE.local.md "$SSH_HOST:~/CLAUDE.local.md"; then
        echo "📝 CLAUDE.local.md backed up to server"
      fi
    else
      echo " [dry-run] Would backup: ./CLAUDE.local.md -> $SSH_HOST:~/CLAUDE.local.md"
    fi
  fi

  echo ""
  if [[ -z "$DRY_RUN" ]]; then
    echo "✅ User files uploaded to server!"
  else
    echo "🔍 Dry run complete - no changes made"
    echo "💡 To perform actual push, run: bash server/scripts/sync_data.sh --push"
  fi
  exit 0
fi
# Announce the mode of the full sync, followed by a blank line.
if [[ -z "$DRY_RUN" ]]; then
  echo "🔄 Syncing data from server..."
else
  echo "🔍 DRY RUN MODE - showing what would be synced..."
fi
echo ""
# --- Migration: detect old directory structure and migrate ---
# Runs only on a real sync when the legacy ./data/parquet exists and the new
# ./server/parquet does not. Content is COPIED into the server/ + user/ layout;
# the old directories are left in place for the user to remove.
if [[ -z "$DRY_RUN" && -d "./data/parquet" && ! -d "./server/parquet" ]]; then
  echo "🔄 Migrating from old directory structure to server/ + user/ layout..."
  echo ""

  # Create new structure
  mkdir -p ./server/docs ./server/scripts ./server/examples ./server/parquet ./server/metadata
  mkdir -p ./user/duckdb ./user/notifications ./user/artifacts ./user/scripts ./user/parquet ./user/sessions

  # Copy data directories (legacy source : new destination); copy errors such
  # as an empty source directory are ignored.
  for _pair in data/parquet:server/parquet data/metadata:server/metadata data/duckdb:user/duckdb; do
    _src="./${_pair%%:*}"
    _dst="./${_pair##*:}"
    if [[ -d "$_src" ]]; then
      cp -r "$_src"/* "$_dst"/ 2>/dev/null || true
    fi
  done

  # Copy legacy top-level docs/ and scripts/ unless they are symlinks.
  for _legacy in docs scripts; do
    if [[ -d "./${_legacy}" && ! -L "./${_legacy}" ]]; then
      cp -r "./${_legacy}"/* "./server/${_legacy}/" 2>/dev/null || true
    fi
  done

  echo "✅ Migration complete. Old directories preserved (remove manually when ready)."
  echo " To clean up: rm -rf ./data ./docs ./scripts"
  echo ""
  echo "📌 From now on, use this command to sync:"
  echo " bash server/scripts/sync_data.sh"
  echo ""
fi
# --- Download sync settings first (needed for excludes) ---
# Config is managed via the web portal (Data Settings page)
# Stored on server in ~/.sync_settings.yaml
# Both local copies live in /tmp and are keyed by the numeric user id.
SYNC_CONFIG_LOCAL="/tmp/.sync_settings_$(id -u).yaml"
SYNC_FILTER_LOCAL="/tmp/.sync_rsync_filter_$(id -u)"

if [[ -n "$DRY_RUN" ]]; then
  # Dry run: reuse a previously downloaded settings file when present,
  # otherwise fall back to an empty dataset list.
  if [[ ! -f "$SYNC_CONFIG_LOCAL" ]]; then
    printf '%s\n' 'datasets: {}' > "$SYNC_CONFIG_LOCAL"
  fi
  # The rsync filter is still fetched; a failed fetch removes any stale copy.
  if ! scp -q $SSH_HOST:~/.sync_rsync_filter "$SYNC_FILTER_LOCAL" 2>/dev/null; then
    rm -f "$SYNC_FILTER_LOCAL"
  fi
else
  if scp -q $SSH_HOST:~/.sync_settings.yaml "$SYNC_CONFIG_LOCAL" 2>/dev/null; then
    echo "📥 Sync settings loaded from portal"
  else
    # No custom settings on server — create empty config (no optional datasets)
    printf '%s\n' 'datasets: {}' > "$SYNC_CONFIG_LOCAL"
  fi

  # Download rsync filter for per-table sync
  if scp -q $SSH_HOST:~/.sync_rsync_filter "$SYNC_FILTER_LOCAL" 2>/dev/null; then
    echo " ✅ Filter file loaded"
  else
    # No filter file = no per-table filtering
    rm -f "$SYNC_FILTER_LOCAL"
  fi
  echo ""
fi
# --- Sync server/ content (read-only from server, --delete removes obsolete files) ---
echo "📋 Syncing documentation and scripts..."

# Build the rsync exclude options for docs: every indented "<name>: false"
# entry in the settings file contributes two patterns, datasets/<name>* and
# <name>_*.
DOCS_EXCLUDES=""
if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
  _disabled_names=$(grep -E '^\s+\w+:\s*false' "$SYNC_CONFIG_LOCAL" 2>/dev/null | sed 's/:.*//' | tr -d ' ')
  for name in $_disabled_names; do
    DOCS_EXCLUDES+=" --exclude=datasets/${name}* --exclude=${name}_*"
  done
fi

# Docs honour the excludes only on the rsync path; the scp fallback copies
# the whole directory.
if [[ "$USE_RSYNC" != true ]]; then
  sync_from_server server/docs ./server/docs
else
  rsync_reliable -avz --delete $DOCS_EXCLUDES $DRY_RUN "$SSH_HOST:server/docs/" "./server/docs/"
fi

# The remaining server directories are mirrored without excludes.
for _dir in scripts examples metadata; do
  sync_from_server "server/${_dir}" "./server/${_dir}"
done
# Print the value stored under key $1 in a key=value file ($3, default
# ./.sync_connection). Prints the fallback $2 when the file, the key or the
# value is missing. Everything after the first "=" is the value, so values
# that themselves contain "=" (e.g. URLs with a query string) stay intact.
sync_connection_value() {
  local key="$1"
  local fallback="$2"
  local file="${3:-./.sync_connection}"
  local value=""
  if [[ -f "$file" ]]; then
    value=$(grep "^${key}=" "$file" 2>/dev/null | head -1 | cut -d= -f2-) || value=""
  fi
  printf '%s\n' "${value:-$fallback}"
}

# Print the HostName configured for SSH alias $1 in the ssh config file $2
# (default ~/.ssh/config); prints nothing when there is no match. A block
# starts at a "Host" line listing the alias and ends at the next Host/Match.
ssh_config_hostname() {
  local host_alias="$1"
  local config="${2:-$HOME/.ssh/config}"
  [[ -f "$config" ]] || return 0
  awk -v want="$host_alias" '
    tolower($1) == "host" {
      in_block = 0
      for (i = 2; i <= NF; i++) if ($i == want) in_block = 1
      next
    }
    tolower($1) == "match" { in_block = 0; next }
    in_block && tolower($1) == "hostname" { print $2; exit }
  ' "$config" 2>/dev/null
}

# Escape $1 for use as the replacement part of a sed "s|...|...|" command
# (backslash, "&" and the "|" delimiter are the only special characters).
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's#[\\&|]#\\&#g'
}

if [[ -z "$DRY_RUN" ]]; then
  # Regenerate CLAUDE.md from updated template (preserves user's CLAUDE.local.md)
  if [[ -f "./server/docs/setup/claude_md_template.txt" ]]; then
    # Extract username from existing CLAUDE.md, fall back to $USER.
    # Plain sed is used instead of "grep -oP" because BSD/macOS grep has no -P.
    ANALYST_USER="$USER"
    if [[ -f "./CLAUDE.md" ]]; then
      EXISTING_USER=$(sed -n 's/.*\*\*Analyst\*\* | \([^[:space:]][^[:space:]]*\).*/\1/p' ./CLAUDE.md 2>/dev/null | head -1) || EXISTING_USER=""
      if [[ -n "$EXISTING_USER" ]]; then
        ANALYST_USER="$EXISTING_USER"
      fi
    fi

    # Read connection details from .sync_connection (written by bootstrap)
    # Use TMPL_ prefix to avoid overwriting $SSH_HOST (the SSH alias used for rsync)
    TMPL_SSH_ALIAS=$(sync_connection_value ssh_alias "$SSH_HOST")
    TMPL_SERVER_HOST=$(sync_connection_value server_host "unknown")
    TMPL_WEBAPP_URL=$(sync_connection_value webapp_url "")

    # Fallback: extract host from SSH config
    if [[ "$TMPL_SERVER_HOST" == "unknown" ]] && [[ -f "$HOME/.ssh/config" ]]; then
      TMPL_SERVER_HOST=$(ssh_config_hostname "$TMPL_SSH_ALIAS" "$HOME/.ssh/config") || TMPL_SERVER_HOST=""
      TMPL_SERVER_HOST="${TMPL_SERVER_HOST:-unknown}"
    fi
    TMPL_WEBAPP_URL="${TMPL_WEBAPP_URL:-https://${TMPL_SERVER_HOST}}"

    # Escape the values so "&", "\" or "|" inside them cannot corrupt the
    # substitutions, then render to a temp file so a failed sed never leaves
    # a truncated CLAUDE.md behind.
    _esc_user=$(sed_escape_replacement "$ANALYST_USER")
    _esc_alias=$(sed_escape_replacement "$TMPL_SSH_ALIAS")
    _esc_host=$(sed_escape_replacement "$TMPL_SERVER_HOST")
    _esc_url=$(sed_escape_replacement "$TMPL_WEBAPP_URL")
    sed -e "s|{username}|${_esc_user}|g" \
      -e "s|{ssh_alias}|${_esc_alias}|g" \
      -e "s|{server_host}|${_esc_host}|g" \
      -e "s|{webapp_url}|${_esc_url}|g" \
      ./server/docs/setup/claude_md_template.txt > ./CLAUDE.md.tmp
    mv ./CLAUDE.md.tmp ./CLAUDE.md
    echo "📝 CLAUDE.md updated from latest template"
  fi

  # Update .claude/settings.json (project permissions)
  if [[ -f "./server/docs/setup/claude_settings.json" ]]; then
    mkdir -p ./.claude
    cp ./server/docs/setup/claude_settings.json ./.claude/settings.json
    echo "🔒 .claude/settings.json updated"
  fi
  echo ""
fi
# Sync core parquet data (excludes optional datasets not enabled in settings)
echo "📦 Syncing core parquet files..."

# Every optional dataset listed in the settings file (enabled or disabled)
# is excluded from the core transfer as "<name>/".
PARQUET_EXCLUDES=""
if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
  _optional_names=$(grep -E '^\s+\w+:\s*(true|false)' "$SYNC_CONFIG_LOCAL" 2>/dev/null | sed 's/:.*//' | tr -d ' ')
  for name in $_optional_names; do
    PARQUET_EXCLUDES+=" --exclude=${name}/"
  done
fi

if [[ "$USE_RSYNC" != true ]]; then
  # scp fallback: plain copy of the whole directory, no excludes or filters.
  sync_from_server server/parquet ./server/parquet
elif [[ -f "$SYNC_FILTER_LOCAL" ]] && grep -q "table_mode: explicit" "$SYNC_FILTER_LOCAL" 2>/dev/null; then
  # A filter file containing "table_mode: explicit" replaces the exclude list
  # with an rsync merge filter.
  echo " Using per-table filter (explicit mode)"
  rsync_reliable -av --delete --progress --filter="merge $SYNC_FILTER_LOCAL" $DRY_RUN $SSH_HOST:server/parquet/ ./server/parquet/
else
  rsync_reliable -av --delete --progress $PARQUET_EXCLUDES $DRY_RUN $SSH_HOST:server/parquet/ ./server/parquet/
fi
# Make sure every user/ subdirectory exists (skipped in dry-run mode).
if [[ -z "$DRY_RUN" ]]; then
  for _sub in duckdb notifications artifacts scripts parquet sessions; do
    mkdir -p "./user/${_sub}"
  done
fi
# --- Sync optional datasets (generic, driven by sync_settings.yaml) ---

# Print, one per line, the names of the indented "<name>: <flag>" entries in
# settings file $2 whose flag starts with $1 ("true" or "false"). Prints
# nothing when the file is missing. POSIX character classes are used so the
# pattern does not depend on GNU grep's \s and \w extensions.
datasets_with_flag() {
  local flag="$1"
  local settings="$2"
  grep -E "^[[:space:]]+[[:alnum:]_]+:[[:space:]]*${flag}" "$settings" 2>/dev/null \
    | sed 's/:.*//' \
    | tr -d ' \t'
}

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
if [[ -f "$SYNC_CONFIG_LOCAL" ]]; then
  # Sync enabled datasets
  ENABLED_DATASETS=$(datasets_with_flag true "$SYNC_CONFIG_LOCAL")
  for name in $ENABLED_DATASETS; do
    echo ""
    # Run dataset-specific sync script if it exists
    if [[ -f "${SCRIPT_DIR}/sync_${name}.sh" ]]; then
      bash "${SCRIPT_DIR}/sync_${name}.sh" ${DRY_RUN:+"$DRY_RUN"}
    else
      # Generic: sync parquet + docs for this dataset
      echo "📊 Syncing optional dataset: $name"
      mkdir -p ./server/docs/datasets
      if [[ "$USE_RSYNC" == true ]]; then
        rsync_reliable -av --delete --progress ${DRY_RUN:+"$DRY_RUN"} \
          "$SSH_HOST:server/parquet/${name}/" "./server/parquet/${name}/"
        # Dataset docs are optional on the server, so a failure is tolerated.
        rsync_reliable -avz ${DRY_RUN:+"$DRY_RUN"} \
          "$SSH_HOST:server/docs/datasets/${name}*" ./server/docs/datasets/ 2>/dev/null || true
      fi
      # Without rsync nothing more is transferred here: the scp fallback used
      # for server/parquet and server/docs copies those trees without
      # excludes, and calling rsync_reliable would only burn retries on a
      # missing binary.
    fi
  done

  # Cleanup disabled datasets (remove stale local data so Claude doesn't see it)
  DISABLED_DATASETS=$(datasets_with_flag false "$SYNC_CONFIG_LOCAL")
  for name in $DISABLED_DATASETS; do
    CLEANUP_NEEDED=false
    if [[ -d "./server/parquet/${name}" ]] \
      || compgen -G "./server/docs/datasets/${name}*" >/dev/null; then
      CLEANUP_NEEDED=true
    fi
    if [[ "$CLEANUP_NEEDED" == true ]]; then
      echo ""
      if [[ -n "$DRY_RUN" ]]; then
        echo "🧹 [dry-run] Would clean up disabled dataset: $name"
      else
        echo "🧹 Cleaning up disabled dataset: $name"
        # ${name:?} aborts instead of expanding to a bare directory glob
        # should the name ever be empty.
        rm -rf -- "./server/parquet/${name:?}" 2>/dev/null
        rm -rf -- ./server/docs/datasets/"${name:?}"* 2>/dev/null
      fi
    fi
  done
fi
# --- Backup: collect missed session transcripts ---
# The SessionEnd hook copies transcripts to user/sessions/ automatically.
# This backup catches any missed transcripts (e.g. terminal killed via SIGKILL).
if [[ -z "$DRY_RUN" ]]; then
  # Encode project path the same way Claude Code does:
  # every non-alphanumeric character becomes a hyphen, leading hyphens are
  # stripped and a single one is put back when the directory name is built
  #   /Users/john/my_project -> -Users-john-my-project (macOS/Linux)
  #   /c/Users/john/project  -> -c-Users-john-project  (Windows/Git Bash)
  ENCODED_PATH=$(pwd | sed -e 's|[^a-zA-Z0-9]|-|g' -e 's|^-*||')
  TRANSCRIPT_DIR="$HOME/.claude/projects/-${ENCODED_PATH}"

  if [[ -d "$TRANSCRIPT_DIR" ]]; then
    COLLECTED=0
    for jsonl in "$TRANSCRIPT_DIR"/*.jsonl; do
      # An unmatched glob stays literal; skip anything that is not a file.
      if [[ ! -f "$jsonl" ]]; then
        continue
      fi
      SESSION_ID=$(basename "$jsonl" .jsonl)
      # Copy only sessions not collected yet (matched by session id,
      # whatever date prefix the existing copy carries).
      if ! ls ./user/sessions/*_"${SESSION_ID}".jsonl 1>/dev/null 2>&1; then
        # Use the file's actual modification date, not current date
        FILE_DATE=$(date -r "$jsonl" '+%Y-%m-%d')
        TARGET="./user/sessions/${FILE_DATE}_${SESSION_ID}.jsonl"
        if cp "$jsonl" "$TARGET" 2>/dev/null; then
          COLLECTED=$((COLLECTED + 1))
        fi
      fi
    done
    if [[ $COLLECTED -gt 0 ]]; then
      echo "📋 Collected $COLLECTED missed session transcript(s) to user/sessions/"
    fi
  fi
fi
# --- Push user/ to server (backup + runtime for notifications) ---
# NOTE(review): this push is meant to preserve server-side backups, i.e. to be
# additive — confirm that sync_to_server does not pass --delete to rsync.
echo ""
echo "📤 Uploading user files to server..."
sync_to_server ./user user

# Backup CLAUDE.local.md to server home (disaster recovery)
if [[ -f "./CLAUDE.local.md" ]]; then
  if [[ -z "$DRY_RUN" ]]; then
    # A failed copy is tolerated; the confirmation is printed only on success.
    if scp -q ./CLAUDE.local.md "$SSH_HOST:~/CLAUDE.local.md"; then
      echo "📝 CLAUDE.local.md backed up to server"
    fi
  else
    echo " [dry-run] Would backup: ./CLAUDE.local.md -> $SSH_HOST:~/CLAUDE.local.md"
  fi
fi
# --- Sync corporate memory rules ---
# Rules are generated server-side based on user votes; they are copied from
# ~/.claude_rules/ on the server into .claude/rules/.
echo ""
echo "📚 Syncing corporate memory rules..."
if [[ -n "$DRY_RUN" ]]; then
  echo " [dry-run] Would sync corporate memory rules to .claude/rules/"
else
  mkdir -p .claude/rules
  if ! scp -rq "$SSH_HOST:~/.claude_rules/"* .claude/rules/ 2>/dev/null; then
    # A failed copy is reported as "no rules yet" rather than as an error.
    echo " No corporate memory rules yet (upvote items in the portal)"
  else
    # Report how many km_*.md rule files are present locally.
    RULES_COUNT=$(ls -1 .claude/rules/km_*.md 2>/dev/null | wc -l)
    echo "$RULES_COUNT knowledge rules synced to .claude/rules/"
  fi
fi
# Note: Python environment on server is set up during bootstrap (not synced).
# Server venv is created via: ssh {alias} 'python3 -m venv ~/.venv && pip install ...'
# This avoids cross-platform pip freeze issues (Windows/macOS -> Linux).

# Print the first interpreter among "python" and "python3" that exists and can
# import the duckdb module. Returns 1 (printing nothing) when none qualifies.
find_duckdb_python() {
  local candidate
  for candidate in python python3; do
    if command -v "$candidate" >/dev/null 2>&1 \
      && "$candidate" -c 'import duckdb' >/dev/null 2>&1; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

# Only update DuckDB and check freshness if NOT dry-run
if [[ -z "$DRY_RUN" ]]; then
  # Determine script location
  SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

  # Check if DuckDB is corrupted (can't open), delete if so
  DUCKDB_FILE="user/duckdb/analytics.duckdb"
  if [[ -f "$DUCKDB_FILE" ]]; then
    echo ""
    echo "🔍 Validating DuckDB..."
    DUCKDB_PYTHON=$(find_duckdb_python) || DUCKDB_PYTHON=""
    if [[ -z "$DUCKDB_PYTHON" ]]; then
      # A missing interpreter or module says nothing about the database, so
      # the file is kept instead of being deleted as "corrupted".
      echo "⚠️ Cannot validate DuckDB (no Python with the duckdb module found), keeping existing file"
    elif ! "$DUCKDB_PYTHON" -c 'import sys, duckdb; duckdb.connect(sys.argv[1]).execute("SELECT 1")' "$DUCKDB_FILE" 2>/dev/null; then
      # The path is passed via argv, so quotes in it cannot break the Python code.
      echo "⚠️ DuckDB corrupted, will recreate from parquet files..."
      rm -f "$DUCKDB_FILE"
    else
      echo " ✅ DuckDB OK"
    fi
  fi

  # Reinitialize DuckDB views (creates new DB if deleted, or updates views if exists)
  echo ""
  echo "🔄 Updating DuckDB views..."
  bash "${SCRIPT_DIR}/setup_views.sh"

  # Update sync timestamp on server (used by webapp Account card)
  ssh "$SSH_HOST" "touch ~/server/" 2>/dev/null || true

  echo ""
  echo "✅ Data sync complete!"
  echo ""
else
  echo ""
  echo "🔍 Dry run complete - no changes made"
  echo ""
  echo "💡 To perform actual sync, run: bash server/scripts/sync_data.sh"
fi