diff --git a/docs/setup/claude_md_template.txt b/docs/setup/claude_md_template.txt index e084db6..c185299 100644 --- a/docs/setup/claude_md_template.txt +++ b/docs/setup/claude_md_template.txt @@ -219,51 +219,75 @@ You write two SQL statements: 2. **DuckDB SQL** (`--sql "SQL"`) -- runs in DuckDB after all views (local + BQ) are ready. Can JOIN local tables with registered BQ results. -### Command format +### Command format (JSON via stdin -- ALWAYS use this) + +**IMPORTANT:** Always use the `--stdin` JSON mode to avoid shell escaping issues with +backticks, quotes, and parentheses in SQL. Write a heredoc with the JSON query spec: ```bash -ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh \ - --register-bq "ALIAS=BQ_SQL_QUERY" \ - --sql "DUCKDB_SQL_QUERY" \ - --format table' +cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +{ + "register_bq": { + "ALIAS": "SELECT ... FROM `project.dataset.table` WHERE ... GROUP BY ..." + }, + "sql": "SELECT ... FROM ALIAS JOIN local_table ...", + "format": "table" +} +QUERY ``` -The wrapper script (`remote_query.sh`) handles environment setup automatically -(PYTHONPATH, CONFIG_DIR, .env loading). All arguments are passed to `python -m src.remote_query`. +The `<<'QUERY'` heredoc passes SQL **literally** -- no shell escaping is needed for backticks, +single quotes, or parentheses. Inside a JSON string, double quotes and backslashes must still be escaped (`\"`, `\\`). 
-Arguments: -- `--register-bq "alias=SQL"` -- Register a BQ query result as DuckDB view (repeatable for multiple remote tables) -- `--sql "SQL"` -- The final DuckDB query (can reference local views + registered BQ aliases) -- `--format table|csv|json|parquet` -- Output format (default: table) -- `--output /path/file` -- Output file for parquet/csv/json -- `--max-rows N` -- Override max result rows +**JSON fields:** +- `"sql"` (required) -- DuckDB SQL query (can reference local views + registered BQ aliases) +- `"register_bq"` (optional) -- Object mapping alias names to BigQuery SQL queries +- `"format"` (optional) -- `"table"`, `"csv"`, `"json"`, or `"parquet"` (default: `"table"`) +- `"output"` (optional) -- File path for parquet/csv/json output +- `"max_rows"` (optional) -- Override max result rows ### Example 1: Remote-only query (aggregated data) ```bash -ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh \ - --register-bq "agg_data=SELECT date_col, dim_col, SUM(metric) as total FROM \`project.dataset.table\` WHERE date_col >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) GROUP BY 1,2" \ - --sql "SELECT * FROM agg_data ORDER BY date_col, dim_col" \ - --format table' +cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +{ + "register_bq": { + "agg_data": "SELECT date_col, dim_col, SUM(metric) as total FROM `project.dataset.table` WHERE date_col >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) GROUP BY 1,2" + }, + "sql": "SELECT * FROM agg_data ORDER BY date_col, dim_col", + "format": "table" +} +QUERY ``` ### Example 2: JOIN local + remote ```bash -ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh \ - --register-bq "remote_data=SELECT date_col, dim_col, SUM(metric) as total FROM \`project.dataset.table\` WHERE date_col >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) GROUP BY 1,2" \ - --sql "SELECT l.*, r.total FROM local_table l JOIN remote_data r ON l.date_col = r.date_col AND l.dim_col = r.dim_col ORDER BY 1,2" \ - --format table' +cat 
<<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +{ + "register_bq": { + "remote_data": "SELECT date_col, dim_col, SUM(metric) as total FROM `project.dataset.table` WHERE date_col >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) GROUP BY 1,2" + }, + "sql": "SELECT l.*, r.total FROM local_table l JOIN remote_data r ON l.date_col = r.date_col AND l.dim_col = r.dim_col ORDER BY 1,2", + "format": "table" +} +QUERY ``` ### Example 3: Download result as Parquet for local analysis ```bash # 1. Run query, save as Parquet on server -ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh \ - --register-bq "remote_data=SELECT ... GROUP BY ..." \ - --sql "SELECT ... JOIN ..." \ - --format parquet --output /tmp/remote_query/analysis.parquet' +cat <<'QUERY' | ssh {ssh_alias} 'bash ~/server/scripts/remote_query.sh --stdin' +{ + "register_bq": { + "remote_data": "SELECT ... FROM `project.dataset.table` WHERE ... GROUP BY ..." + }, + "sql": "SELECT ... FROM local_table JOIN remote_data ...", + "format": "parquet", + "output": "/tmp/remote_query/analysis.parquet" +} +QUERY # 2. Download to local machine scp {ssh_alias}:/tmp/remote_query/analysis.parquet ./user/parquet/ @@ -294,7 +318,10 @@ If that exceeds 100K rows, add more aggregation or tighter date filters. 4. **Limits**: 500K rows max per BQ sub-query, 100K rows max in final result 5. If the query might take > 60 seconds, use nohup pattern: ```bash - ssh {ssh_alias} 'nohup bash ~/server/scripts/remote_query.sh --register-bq "..." --sql "..." 
def _parse_stdin_query() -> dict:
    """Read and validate a JSON query specification from stdin.

    Expected format:
        {
            "sql": "SELECT ... FROM ...",
            "register_bq": {"alias": "BQ SQL", ...},
            "format": "table",
            "output": "/path/to/file",
            "max_rows": 100000,
            "max_bq_rows": 500000
        }

    Only ``"sql"`` is required. When present, ``"register_bq"`` must be a
    JSON object so that callers can iterate over its ``alias -> SQL`` items.

    Returns:
        Dict with the parsed query spec.

    Raises:
        RemoteQueryError: If stdin is empty, is not valid JSON, the top-level
            value is not a JSON object, the ``"sql"`` field is missing, or
            ``"register_bq"`` is not a JSON object.
    """
    raw = sys.stdin.read().strip()
    if not raw:
        raise RemoteQueryError("Empty stdin. Provide JSON query spec.")

    try:
        spec = json.loads(raw)
    except json.JSONDecodeError as e:
        # Chain the original error so the parse position stays in the traceback.
        raise RemoteQueryError(f"Invalid JSON on stdin: {e}") from e

    # json.loads also accepts lists, strings, numbers and null. Without this
    # check a JSON string would turn the "sql" test below into a substring
    # match, and a list would fail later with an unhandled TypeError.
    if not isinstance(spec, dict):
        raise RemoteQueryError(
            f"JSON query spec must be an object, got {type(spec).__name__}."
        )

    if "sql" not in spec:
        raise RemoteQueryError("JSON must contain 'sql' field.")

    # A non-object value (e.g. null or a list) would otherwise crash the
    # caller's .items() iteration with an AttributeError.
    if not isinstance(spec.get("register_bq", {}), dict):
        raise RemoteQueryError(
            "'register_bq' must be an object mapping alias names to BigQuery SQL."
        )

    return spec