From 2237334b058ffbc391a56195b3fe963bd3b2f125 Mon Sep 17 00:00:00 2001 From: Petr Date: Sat, 14 Mar 2026 23:57:58 +0100 Subject: [PATCH] Make CLAUDE.md template generic and instance-aware - Remove all Keboola-specific content (metric categories, MRR/ARR refs, corporate memory, hardcoded server IP) - Add {ssh_alias}, {server_host}, {webapp_url} placeholders - Bootstrap saves .sync_connection file with instance details - sync_data.sh reads .sync_connection to substitute all placeholders - Text instructions in dashboard include .sync_connection step --- docs/setup/bootstrap.yaml | 9 +- docs/setup/claude_md_template.txt | 147 ++++++++++-------------------- scripts/sync_data.sh | 20 ++++ webapp/templates/dashboard.html | 16 +++- 4 files changed, 89 insertions(+), 103 deletions(-) diff --git a/docs/setup/bootstrap.yaml b/docs/setup/bootstrap.yaml index a5e843e..62cfb82 100644 --- a/docs/setup/bootstrap.yaml +++ b/docs/setup/bootstrap.yaml @@ -131,10 +131,17 @@ setup: max_retries: 3 - name: "create_folders" - description: "Create local project structure" + description: "Create local project structure and save connection details" action: | mkdir -p ./server/docs ./server/scripts ./server/examples ./server/parquet ./server/metadata mkdir -p ./user/duckdb ./user/notifications ./user/artifacts ./user/scripts ./user/parquet ./user/sessions + + # Save connection details for sync_data.sh to use when generating CLAUDE.md + cat > ./.sync_connection << 'CONN' + ssh_alias={ssh_alias} + server_host={server_host} + webapp_url={webapp_url} + CONN message: | Project structure created (server/, user/). 
diff --git a/docs/setup/claude_md_template.txt b/docs/setup/claude_md_template.txt index aedf6c9..fcc5a3c 100644 --- a/docs/setup/claude_md_template.txt +++ b/docs/setup/claude_md_template.txt @@ -8,13 +8,13 @@ Project context file for **AI Data Analyst** - local analytics environment with |----------|-------| | **Project Type** | AI Data Analyst | | **Database** | DuckDB at `user/duckdb/analytics.duckdb` | -| **Data Source** | data-analyst server (34.88.8.46) | +| **Data Source** | {ssh_alias} server ({server_host}) | | **Data Format** | Parquet files in `server/parquet/` | | **Analyst** | {username} | --- -## ⚠️ CRITICAL: Always Start Here +## CRITICAL: Always Start Here ### 1. Sync Data When Starting @@ -30,35 +30,7 @@ bash server/scripts/sync_data.sh This updates data, scripts, documentation, and CLAUDE.md. -### 2. Read Metrics Definitions - -**Before calculating ANY business metric (MRR, ARR, usage, limits, etc.), you MUST:** - -1. **Start with the metrics index** - read `server/docs/metrics/metrics.yml` first - - This index file lists all available metrics organized by category - - Find the metric you need and note its file path - -2. **Then read the specific metric file** from its category folder: - ```bash - # Example: Read the metrics index first - cat server/docs/metrics/metrics.yml - - # Then read the specific metric definition you need - cat server/docs/metrics/sales_revenue/mrr.yml - cat server/docs/metrics/product_usage/usage_value.yml - cat server/docs/metrics/finance/infra_cost.yml - cat server/docs/metrics/weekly_leadership_kpis/revenue_upsells_ytd.yml - ``` - -**Categories:** -- `finance/` - Financial metrics (infra costs, retention) -- `product_usage/` - Platform usage, limits, telemetry -- `sales_revenue/` - MRR, ARR, new customers, expansions -- `weekly_leadership_kpis/` - Weekly KPIs for leadership reporting - -Do not calculate metrics from memory. 
The formulas contain critical details (e.g., conditional aggregation for different metric types, proper value vs company_value usage). Getting this wrong produces plausible but incorrect numbers. - -### 3. Read Schema Documentation Before Writing SQL +### 2. Read Schema Documentation Before Writing SQL **MANDATORY: Before writing ANY SQL query, you MUST read the relevant documentation files:** @@ -73,20 +45,6 @@ cat server/docs/schema.yml - **NEVER guess column names** - schema.yml contains: all column names, types, descriptions, primary keys -#### For on-demand datasets (if enabled): - -```bash -# Check for additional dataset schemas (e.g., kbc_telemetry_expert) -ls server/docs/datasets/ -# Read the dataset doc for table relationships and ER diagrams -cat server/docs/datasets/.md -# Read the dataset schema for column details -cat server/docs/datasets//schema.yml -``` - -- On-demand datasets have their own schema.yml and documentation files -- Only available if enabled in Data Settings at {webapp_url} - #### For table relationships (joins, foreign keys): ```bash @@ -94,9 +52,30 @@ cat server/docs/datasets//schema.yml cat server/docs/data_description.md ``` -- Contains ER diagrams, primary/foreign keys, sync strategies +- Contains primary/foreign keys, sync strategies, and table descriptions - Essential for writing correct JOIN queries -- On-demand dataset docs reference core tables with `(core)` markers + +#### For additional dataset schemas (if available): + +```bash +# Check for additional dataset schemas +ls server/docs/datasets/ 2>/dev/null +``` + +### 3. Read Metrics Definitions (if available) + +**Before calculating ANY business metric, check for metric definitions:** + +```bash +# Check if metrics index exists +cat server/docs/metrics/metrics.yml 2>/dev/null + +# Or list available metric files +ls server/docs/metrics/ 2>/dev/null +``` + +If metric definitions exist, always read the specific metric file before calculating. 
+Do not calculate metrics from memory - the formulas contain critical details. --- @@ -106,17 +85,16 @@ cat server/docs/data_description.md project_root/ ├── server/ # READ-ONLY - synced from server │ ├── docs/ # Documentation -│ │ ├── metrics/ # Metric definitions (modular structure) -│ │ ├── datasets/ # On-demand dataset docs and schemas -│ │ ├── data_description.md # Table relationships and ER diagrams -│ │ └── schema.yml # Table schemas and column definitions +│ │ ├── data_description.md # Table relationships and descriptions +│ │ ├── schema.yml # Table schemas and column definitions +│ │ ├── metrics/ # Metric definitions (if available) +│ │ └── datasets/ # Additional dataset docs (if available) │ ├── scripts/ # Helper scripts (sync_data.sh, setup_views.sh) -│ ├── examples/ # Example notification scripts +│ ├── examples/ # Example scripts (if available) │ └── parquet/ # Synced parquet data files │ ├── user/ # YOUR WORKSPACE - never overwritten │ ├── duckdb/ # DuckDB database (analytics.duckdb) -│ ├── notifications/ # Your notification scripts │ ├── artifacts/ # Analysis outputs, charts, exports │ └── scripts/ # Your custom scripts │ @@ -159,19 +137,21 @@ for table in tables: con.close() ``` -### Query examples +### Query data -Browse `server/docs/metrics/metrics.yml` for all available metrics, then read specific metric files: -- **Finance**: `finance/` - Infrastructure costs with allocation guides -- **Product Usage**: `product_usage/` - Usage metrics with conditional aggregation, contract limits, usage vs limits -- **Sales & Revenue**: `sales_revenue/` - MRR, ARR, new customer acquisition, expansions -- **Weekly Leadership KPIs**: `weekly_leadership_kpis/` - All weekly metrics for leadership reporting +```bash +# Read schema first, then query +cat server/docs/schema.yml +``` -All metric examples include multiple SQL variants: -- `sql`: Total aggregate across all companies -- `sql_by_company`: Grouped by company -- `sql_single_company`: Filter for specific 
company -- `sql_by_project`: Project-level analysis (where applicable) +```python +import duckdb +con = duckdb.connect('user/duckdb/analytics.duckdb') +# Write your query based on schema.yml column definitions +result = con.execute("SELECT * FROM your_table LIMIT 10").fetchdf() +print(result) +con.close() +``` --- @@ -193,42 +173,13 @@ You're ready to analyze! --- -## Corporate Memory - -Your `CLAUDE.local.md` file serves a dual purpose: -1. **Personal notes** - never overwritten by server sync, your workspace for discoveries -2. **Knowledge sharing** - backed up to the server and processed into shared team knowledge - -### How It Works - -- Every `sync_data.sh` run backs up your `CLAUDE.local.md` to the server -- Every 30 minutes, the server extracts valuable knowledge from all team members' files -- Extracted knowledge is deduplicated and merged into a shared Corporate Memory database -- Browse and vote on knowledge at {webapp_url}/corporate-memory -- Items you upvote are synced to your `.claude/rules/` during the next data sync - -### What to Write in CLAUDE.local.md - -When you discover something valuable during your work, add it to `CLAUDE.local.md`: - -- **Technical discoveries**: Novel solutions, workarounds, or techniques -- **Best practices**: Patterns that improved code quality or productivity -- **Tool tips**: Useful DuckDB queries, commands, or configurations -- **Debugging wisdom**: How specific errors were diagnosed and resolved -- **Domain knowledge**: Business logic insights or data relationships - -The more specific and actionable your notes are, the more valuable they become for the whole team. 
- ---- ## Important Reminders -- ⚠️ **Always read `server/docs/schema.yml` before writing SQL queries** -- ⚠️ **Always check `server/docs/datasets/` for additional schema files from on-demand datasets** -- ⚠️ **Always read `server/docs/metrics/metrics.yml` to find the right metric, then read its definition file before calculating business metrics** -- ⚠️ **Always read `server/docs/data_description.md` for table relationships and joins** -- ✅ Use DuckDB views, not direct parquet file reads -- ❌ Never modify files in `server/` - they're read-only +- Always read `server/docs/schema.yml` before writing SQL queries +- Always read `server/docs/data_description.md` for table relationships and joins +- Check `server/docs/metrics/` for metric definitions before calculating business metrics +- Use DuckDB views, not direct parquet file reads +- Never modify files in `server/` - they're read-only --- diff --git a/scripts/sync_data.sh b/scripts/sync_data.sh index 131e745..b95ae41 100755 --- a/scripts/sync_data.sh +++ b/scripts/sync_data.sh @@ -268,7 +268,27 @@ if [[ -z "$DRY_RUN" ]]; then ANALYST_USER="$EXISTING_USER" fi fi + + # Read connection details from .sync_connection (written by bootstrap); a missing file, missing key or empty value keeps the default, and everything after the first "=" is kept as the value + SSH_ALIAS="data-analyst" + SSH_HOST="unknown" + WEBAPP_URL="" + if [[ -f "./.sync_connection" ]]; then + SSH_ALIAS=$(sed -n '/^ssh_alias=/{s///p;q;}' ./.sync_connection 2>/dev/null || true); SSH_ALIAS="${SSH_ALIAS:-data-analyst}" + SSH_HOST=$(sed -n '/^server_host=/{s///p;q;}' ./.sync_connection 2>/dev/null || true); SSH_HOST="${SSH_HOST:-unknown}" + WEBAPP_URL=$(sed -n '/^webapp_url=/{s///p;q;}' ./.sync_connection 2>/dev/null || true) + fi + # Fallback: take HostName from the "Host $SSH_ALIAS" block of the SSH config (a flag is used instead of an awk range, because a range that starts and ends on /^Host / closes on its own first line) + if [[ "$SSH_HOST" == "unknown" ]] && [[ -f "$HOME/.ssh/config" ]]; then + SSH_HOST=$(awk -v want="$SSH_ALIAS" 'tolower($1) == "host" { hit = 0; for (i = 2; i <= NF; i++) if ($i == want) hit = 1; next } hit && tolower($1) == "hostname" { print $2; exit }' "$HOME/.ssh/config" 2>/dev/null) + SSH_HOST="${SSH_HOST:-unknown}" + fi + WEBAPP_URL="${WEBAPP_URL:-https://${SSH_HOST}}" + sed -e "s/{username}/$ANALYST_USER/g" \ + -e 
"s/{ssh_alias}/$SSH_ALIAS/g" \ + -e "s|{server_host}|$SSH_HOST|g" \ + -e "s|{webapp_url}|$WEBAPP_URL|g" \ ./server/docs/setup/claude_md_template.txt > ./CLAUDE.md echo "📝 CLAUDE.md updated from latest template" fi diff --git a/webapp/templates/dashboard.html b/webapp/templates/dashboard.html index 226a395..e16fe1b 100644 --- a/webapp/templates/dashboard.html +++ b/webapp/templates/dashboard.html @@ -2447,9 +2447,13 @@ + ' IdentityFile ' + sshKey + '\n' + ' StrictHostKeyChecking accept-new\n' + ' Then test: ssh ' + sshAlias + ' echo ok\n\n' - + '2. Create project folders (use explicit mkdir, not brace expansion):\n' + + '2. Create project folders and save connection details:\n' + ' mkdir -p server/docs server/scripts server/parquet server/metadata server/examples\n' - + ' mkdir -p user/duckdb user/notifications user/artifacts user/scripts user/parquet user/sessions\n\n' + + ' mkdir -p user/duckdb user/notifications user/artifacts user/scripts user/parquet user/sessions\n' + + ' Save this to .sync_connection (used by sync script to generate CLAUDE.md):\n' + + ' ssh_alias=' + sshAlias + '\n' + + ' server_host=' + serverHost + '\n' + + ' webapp_url=' + webappUrl + '\n\n' + '3. Download from server via rsync (use --no-perms --no-group to avoid macOS permission errors).\n' + ' Skip directories that don\'t exist on the server (rsync exit code 23 = missing source).\n' + ' rsync -avz --no-perms --no-group ' + sshAlias + ':server/scripts/ ./server/scripts/\n' @@ -2464,8 +2468,12 @@ + ' pip install pandas pyarrow duckdb pyyaml python-dotenv\n\n' + '5. Initialize DuckDB (only if server/scripts/setup_views.sh exists):\n' + ' bash server/scripts/setup_views.sh\n\n' - + '6. Create CLAUDE.md (only if server/docs/setup/claude_md_template.txt exists):\n' - + ' Copy the template, replace {username} with ' + username + '\n'; + + '6. 
Create CLAUDE.md (if server/docs/setup/claude_md_template.txt exists):\n' + + ' Copy the template, replace placeholders:\n' + + ' {username} -> ' + username + '\n' + + ' {ssh_alias} -> ' + sshAlias + '\n' + + ' {server_host} -> ' + serverHost + '\n' + + ' {webapp_url} -> ' + webappUrl + '\n'; var button = btn || document.getElementById('bootstrapCopyBtn'); var origText = button.textContent;