"""BigQuery metadata provider — populates `TableMetadata` for a remote BQ-backed registry row. Two queries (different INFORMATION_SCHEMA scopes — TABLE_STORAGE is region-scoped, COLUMNS is dataset-scoped, can't be combined): 1. INFORMATION_SCHEMA.TABLE_STORAGE — total_rows + active+long_term bytes. Region-portable per Google's docs; only valid via `.region-.INFORMATION_SCHEMA.TABLE_STORAGE` (verified live 2026-05-07; dataset-scoped TABLE_STORAGE doesn't exist). 2. INFORMATION_SCHEMA.COLUMNS — partition_by + clustered_by. Reuses the consolidated `fetch_bq_columns_full` helper that v2_schema also calls; one shared shape, one round-trip. Region resolution chain: `instance.yaml.data_source.bigquery.location` → `bq.client().get_dataset(...)` → fall back to legacy `__TABLES__` (dataset-scoped, no region required). VIEW handling: TABLE_STORAGE returns no rows for entries whose `table_type='VIEW'`; the legacy `__TABLES__` fallback also doesn't list views. The provider returns `TableMetadata(rows=None, size_bytes=None, partition_by=, clustered_by=)` — analyst Claude reads `null` size and applies the existing CLAUDE.md guidance. `size_bytes` reports `active_logical_bytes + long_term_logical_bytes` (a full BQ scan reads both — reporting only active undercounts aged partitioned tables). """ from __future__ import annotations import logging from app.api._metadata_models import MetadataRequest, TableMetadata from app.instance_config import get_value from connectors.bigquery.access import ( BqAccessError, fetch_bq_columns_full, get_bq_access, ) logger = logging.getLogger(__name__) def fetch(req: MetadataRequest) -> TableMetadata | None: try: bq = get_bq_access() except BqAccessError: return None if not bq.projects.data: return None rows_size = _fetch_rows_and_size(bq, req) columns = fetch_bq_columns_full(bq, req.bucket, req.source_table) part_clust = _derive_partition_cluster(columns) if columns else None entity_type = _fetch_entity_type(bq, req) known_columns = [c["name"] for c in columns] if columns else None if ( rows_size is None and part_clust is None and entity_type is None and not known_columns ): return None # For VIEW / MATERIALIZED VIEW the __TABLES__ fallback returns # ``(0, 0)`` for ``row_count`` and ``size_bytes`` — accurate for the # storage layer (views have no own storage) but misleading for # analysts. Surface ``None`` so catalog consumers see explicit # "unknown" rather than a confidently-wrong zero. if entity_type in ("VIEW", "MATERIALIZED VIEW"): rows_value = None size_value = None else: rows_value = (rows_size or {}).get("rows") size_value = (rows_size or {}).get("size_bytes") return TableMetadata( rows=rows_value, size_bytes=size_value, partition_by=(part_clust or {}).get("partition_by"), clustered_by=(part_clust or {}).get("clustered_by"), entity_type=entity_type, known_columns=known_columns, ) def _fetch_entity_type(bq, req: MetadataRequest) -> str | None: """Look up ``INFORMATION_SCHEMA.TABLES.table_type`` for the table. Single dataset-scoped query, no region required. Returns one of the documented BQ values (``BASE TABLE``, ``VIEW``, ``MATERIALIZED VIEW``, ``EXTERNAL``, ``SNAPSHOT``, ``CLONE``) or ``None`` if the lookup fails / the row isn't found. ``req.bucket`` and ``req.source_table`` are pre-validated by `app/api/v2_catalog._build_metadata_request`, so direct interpolation into the backtick-quoted path is safe. """ try: bq_sql = ( f"SELECT table_type " f"FROM `{bq.projects.data}.{req.bucket}.INFORMATION_SCHEMA.TABLES` " f"WHERE table_name = ?" ) with bq.duckdb_session() as conn: row = conn.execute( "SELECT * FROM bigquery_query(?, ?, ?)", [bq.projects.billing, bq_sql, req.source_table], ).fetchone() except Exception as e: logger.warning( "BQ INFORMATION_SCHEMA.TABLES lookup failed for %s.%s.%s: %s", bq.projects.data, req.bucket, req.source_table, e, ) return None if row is None or row[0] is None: return None return str(row[0]) def _derive_partition_cluster(columns: list[dict]) -> dict | None: """Mirror v2_schema._fetch_bq_table_options derivations from the shared columns-full result.""" if not columns: return None partition_by = next( (c["name"] for c in columns if c["is_partitioning_column"]), None, ) clustered = sorted( (c for c in columns if c["clustering_ordinal_position"] is not None), key=lambda c: c["clustering_ordinal_position"], ) clustered_by = [c["name"] for c in clustered] return {"partition_by": partition_by, "clustered_by": clustered_by} def _fetch_rows_and_size(bq, req: MetadataRequest) -> dict | None: """Resolve rows + size_bytes via TABLE_STORAGE → __TABLES__ fallthrough. See module docstring + spec Open Question §1 for view-path nuance. """ location = _resolve_bq_location(bq, req) if location: result = _fetch_via_table_storage(bq, req, location) if result is not None: return result # TABLE_STORAGE returned None despite having a location: could # be a typo in `data_source.bigquery.location`, a multi-region # dataset operator misclassified, the table is a VIEW, or a # transient permission gap. Try __TABLES__ before giving up. return _fetch_via_legacy_tables(bq, req) def _resolve_bq_location(bq, req: MetadataRequest) -> str | None: """instance.yaml.location → REST get_dataset → None. The REST fallback is best-effort: it requires the SA to have ``bigquery.datasets.get`` on the data project. Most cross-project setups grant ``bigquery.tables.get`` (data viewer) but NOT dataset- level metadata, so this 404s silently for the exact deployments that most need region detection. Configuring ``data_source.bigquery.location`` skips the REST round-trip entirely and makes the path deterministic — strongly recommended for any non-trivial setup. Issue #343. """ cfg_location = (get_value("data_source", "bigquery", "location") or "").strip() if cfg_location: return cfg_location try: ds = bq.client().get_dataset( f"{bq.projects.data}.{req.bucket}" ) return ds.location except Exception as e: logger.warning( "BQ dataset.get fell back for %s.%s: %s. To skip this REST " "round-trip on every metadata refresh (and silence cases " "where the SA lacks bigquery.datasets.get), set " "data_source.bigquery.location in /admin/server-config to the " "dataset's region (e.g. 'us-central1' or 'EU').", bq.projects.data, req.bucket, e, ) return None def _fetch_via_table_storage(bq, req: MetadataRequest, location: str) -> dict | None: """Region-scoped INFORMATION_SCHEMA.TABLE_STORAGE — preferred path. `validate_quoted_identifier` accepts `us-central1`, `europe-west1`, `EU`, `us` etc. (regex `^[a-zA-Z0-9_][a-zA-Z0-9_.\\-]{0,127}$`). Refuses anything that could break out of the backtick-quoted path. Returns None on no-row (table is a VIEW, or different region than configured) — caller decides whether to fall through. `size_bytes` is `active + long_term` logical bytes (a full BQ scan reads both; reporting only active undercounts aged partitioned tables). """ from src.identifier_validation import validate_quoted_identifier if not validate_quoted_identifier(location, "BQ region"): return None # `req.bucket` / `req.source_table` are pre-validated by the # dispatcher; `location` is validated locally above because it # originates from instance.yaml, not from the registry row. try: bq_sql = ( f"SELECT total_rows, " f"IFNULL(active_logical_bytes, 0) + IFNULL(long_term_logical_bytes, 0) " f"FROM `{bq.projects.data}.region-{location}.INFORMATION_SCHEMA.TABLE_STORAGE` " f"WHERE table_schema = ? AND table_name = ?" ) with bq.duckdb_session() as conn: row = conn.execute( "SELECT * FROM bigquery_query(?, ?, ?, ?)", [bq.projects.billing, bq_sql, req.bucket, req.source_table], ).fetchone() except Exception as e: logger.warning( "BQ TABLE_STORAGE fetch failed for %s.%s.%s: %s", bq.projects.data, req.bucket, req.source_table, e, ) return None if row is None: return None # VIEW or wrong region rows_, size_bytes = row return { "rows": int(rows_) if rows_ is not None else None, "size_bytes": int(size_bytes) if size_bytes is not None else None, } def _fetch_via_legacy_tables(bq, req: MetadataRequest) -> dict | None: """Last-resort dataset-scoped __TABLES__ — works without region.""" # `req.bucket` and `req.source_table` are pre-validated by # `app/api/v2_catalog._build_metadata_request` via # `validate_quoted_identifier` before MetadataRequest construction; # safe to interpolate into the backtick-quoted path here. try: bq_sql = ( f"SELECT row_count, size_bytes " f"FROM `{bq.projects.data}.{req.bucket}.__TABLES__` " f"WHERE table_id = ?" ) with bq.duckdb_session() as conn: row = conn.execute( "SELECT * FROM bigquery_query(?, ?, ?)", [bq.projects.billing, bq_sql, req.source_table], ).fetchone() except Exception as e: logger.warning( "BQ __TABLES__ fetch failed for %s.%s.%s: %s", bq.projects.data, req.bucket, req.source_table, e, ) return None if row is None: return None rows_, size_bytes = row return { "rows": int(rows_) if rows_ is not None else None, "size_bytes": int(size_bytes) if size_bytes is not None else None, }