agnes-the-ai-analyst/src/sql_safe.py
ZdenekSrotyr 2e1dfb7553
feat(v2): claude-driven fetch primitives + 0.14.0 (#102)
Replaces the BigQuery wrap-view pattern with a discovery + scoped-fetch toolkit driven by the analyst's Claude session. Adds /api/v2/{catalog,schema,sample,scan,scan/estimate}, da catalog/schema/describe/fetch/snapshot/disk-info CLI commands, sqlglot-backed WHERE validator, process-local quota tracker, agent rails skill (cli/skills/agnes-data-querying.md). BREAKING: BQ wrap views off by default — set data_source.bigquery.legacy_wrap_views=true for one cycle. Backward-compat field_validator on primary_key. Catalog cache now matches documented 300s TTL with RBAC fresh per request. Cuts release v0.14.0.
2026-04-29 01:07:19 +02:00

54 lines
2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Shared identifier-validation helpers for SQL identifier safety.
Used wherever code constructs SQL by string interpolation against caller-controlled
identifiers (table/dataset names from registry, alias from _remote_attach, etc.).
The DuckDB BQ extension treats identifiers literally — escaping at the call site
is unsafe; whitelist via regex instead.
"""
import logging
import re
logger = logging.getLogger(__name__)

# SQL identifier whitelist: an ASCII letter or underscore, followed by up to 63
# ASCII letters / digits / underscores (64 chars total).
# Anchored with \Z rather than $: in Python, $ also matches just before a
# trailing newline, so a $-anchored pattern used with .match() would accept
# "name\n" as a safe identifier.
_SAFE_IDENTIFIER = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}\Z")

# GCP project IDs: 6-30 chars, lowercase letters / digits / hyphens, must start
# with letter, cannot end with hyphen.
# See https://cloud.google.com/resource-manager/docs/creating-managing-projects
# \Z (not $) for the same trailing-newline reason as above.
_SAFE_PROJECT_ID = re.compile(r"^[a-z][a-z0-9-]{4,28}[a-z0-9]\Z")
def is_safe_identifier(name: str) -> bool:
    """Return True if `name` is a safe SQL identifier.

    Safe means a ``str`` consisting only of ASCII letters, digits and
    underscores, not starting with a digit, and at most 64 characters long
    (the grammar encoded by ``_SAFE_IDENTIFIER``). Non-string values are
    rejected instead of raising.
    """
    if not isinstance(name, str):
        return False
    # fullmatch(), not match(): a `$` anchor also matches just before a
    # trailing newline, so match() would let "name<LF>" through.
    return _SAFE_IDENTIFIER.fullmatch(name) is not None
def validate_identifier(name: str, context: str) -> bool:
    """Check a SQL identifier, logging a warning when it is rejected.

    `context` is interpolated into the warning message to say which kind of
    identifier was refused. Returns True when `name` passes
    ``is_safe_identifier``, False otherwise.
    """
    accepted = is_safe_identifier(name)
    if not accepted:
        logger.warning("Rejected unsafe %s identifier: %r", context, name)
    return accepted
def is_safe_project_id(project_id: str) -> bool:
    """Return True if `project_id` matches the GCP project ID grammar.

    GCP rules: 6-30 chars, ``[a-z][a-z0-9-]+[a-z0-9]``. Used to gate
    project_id values from ``instance.yaml`` before they get f-stringed
    into BQ-extension SQL (ATTACH, ``bigquery_query()``, etc.).
    Non-string values are rejected instead of raising.
    """
    if not isinstance(project_id, str):
        return False
    # fullmatch(), not match(): a `$` anchor also matches just before a
    # trailing newline, so match() would let "project-id<LF>" through.
    return _SAFE_PROJECT_ID.fullmatch(project_id) is not None
def validate_project_id(project_id: str) -> bool:
    """Check a GCP project ID, logging a warning when it is rejected.

    Returns True when `project_id` passes ``is_safe_project_id``, False
    otherwise.
    """
    accepted = is_safe_project_id(project_id)
    if not accepted:
        logger.warning("Rejected unsafe project_id: %r", project_id)
    return accepted