From 84d14da611c936a324a2c25bc4781b46ef01a526 Mon Sep 17 00:00:00 2001 From: Petr Date: Sat, 21 Mar 2026 18:41:43 +0100 Subject: [PATCH] Fix remote query UX: file-based stdin, ssh permissions, deprecation Session testing revealed 3 issues with remote queries: 1. CLAUDE.md template recommended `cat <= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) GROUP BY 1,2" @@ -257,13 +268,17 @@ cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' "sql": "SELECT * FROM agg_data ORDER BY date_col, dim_col", "format": "table" } -QUERY +``` + +Then run: +```bash +ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' < user/scripts/rq_query.json ``` ### Example 2: JOIN local + remote -```bash -cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +Write to `user/scripts/rq_query.json`: +```json { "register_bq": { "remote_data": "SELECT date_col, dim_col, SUM(metric) as total FROM `project.dataset.table` WHERE date_col >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) GROUP BY 1,2" @@ -271,14 +286,17 @@ cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' "sql": "SELECT l.*, r.total FROM local_table l JOIN remote_data r ON l.date_col = r.date_col AND l.dim_col = r.dim_col ORDER BY 1,2", "format": "table" } -QUERY +``` + +Then run: +```bash +ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' < user/scripts/rq_query.json ``` ### Example 3: Download result as Parquet for local analysis -```bash -# 1. Run query, save as Parquet on server -cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +Write to `user/scripts/rq_query.json`: +```json { "register_bq": { "remote_data": "SELECT ... FROM `project.dataset.table` WHERE ... GROUP BY ..." @@ -287,7 +305,12 @@ cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' "format": "parquet", "output": "/tmp/remote_query/analysis.parquet" } -QUERY +``` + +Then run: +```bash +# 1. Run query on server +ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' < user/scripts/rq_query.json # 2. Download to local machine scp {ssh_alias}:/tmp/remote_query/analysis.parquet ./user/parquet/ @@ -318,10 +341,8 @@ If that exceeds 100K rows, add more aggregation or tighter date filters. 4. **Limits**: 500K rows max per BQ sub-query, 100K rows max in final result 5. If the query might take > 60 seconds, use nohup pattern: ```bash - # Write query to temp file, then run via nohup - cat <<'QUERY' | ssh {ssh_alias} 'cat > /tmp/rq_spec.json && nohup bash ~/server/scripts/remote_query.sh --stdin < /tmp/rq_spec.json > /tmp/rq.log 2>&1 &' - {"register_bq": {"data": "SELECT ..."}, "sql": "SELECT ...", "format": "parquet", "output": "/tmp/remote_query/result.parquet"} - QUERY + # Write query spec to user/scripts/rq_query.json first, then: + ssh {ssh_alias} 'cat > /tmp/rq_spec.json && nohup bash ~/server/scripts/remote_query.sh --stdin < /tmp/rq_spec.json > /tmp/rq.log 2>&1 &' < user/scripts/rq_query.json ssh {ssh_alias} 'tail -5 /tmp/rq.log' # check progress scp {ssh_alias}:/tmp/remote_query/result.parquet ./user/parquet/ ``` diff --git a/docs/setup/claude_settings.json b/docs/setup/claude_settings.json index 6e5bae8..2770760 100644 --- a/docs/setup/claude_settings.json +++ b/docs/setup/claude_settings.json @@ -41,6 +41,8 @@ "Bash(stat:*)", "Bash(bash server/scripts/*)", "Bash(python server/scripts/*)", + "Bash(ssh:*)", + "Bash(scp:*)", "WebFetch(domain:github.com)", "WebSearch" ], diff --git a/src/remote_query.py b/src/remote_query.py index a1ad202..7132d2b 100644 --- a/src/remote_query.py +++ b/src/remote_query.py @@ -329,7 +329,7 @@ def _format_output( # Re-execute without limit wrapper for clean Arrow export arrow_result = conn.execute( f"SELECT * FROM ({sql}) AS _rq LIMIT {max_rows}" - ).fetch_arrow_table() + ).arrow().read_all() if not output_path: output_path = str(Path(_load_remote_query_config()["output_dir"]) / "result.parquet")