Since 0.47.0 GET /api/v2/catalog enriched each remote BigQuery row by fetching INFORMATION_SCHEMA.TABLE_STORAGE + COLUMNS through the DuckDB BigQuery extension *inside the request*. On cold caches that fanned out to O(N) sequential BQ jobs-API roundtrips — easily 90 s+ on partitioned / view-backed tables — and reliably blew the CLI's 30 s httpx ReadTimeout. Reproduced with py-spy: three AnyIO worker threads stuck inside connectors/bigquery/metadata._fetch_via_legacy_tables. Refactor: enrichment is read exclusively from a new persistent bq_metadata_cache DuckDB table (schema v40), populated by a scheduler- driven refresh job at SCHEDULER_BQ_METADATA_REFRESH_INTERVAL (default 4 h). Cold catalog response on a fresh container is now tens of milliseconds with metadata_freshness=never_fetched for unwarmed rows. New surface: - POST /api/admin/run-bq-metadata-refresh (scheduler-driven, full) - POST /api/v2/metadata-cache/refresh?table=<id> (admin, single) - GET /api/v2/metadata-cache/status (auth, non-admin) - metadata_freshness field per catalog row Removed (internal API): v2_catalog._size_hint_for_row, _resolve_remote_metadata, _metadata_provider_for, _build_metadata_request, _materialized_size_hint, in-memory _metadata_cache. Response shape unchanged for external consumers. 991 tests passing; 2 pre-existing failures (test_db v3→v4 ladder, test_cli_binary_rename) unrelated to this change.
101 lines
4 KiB
Python
101 lines
4 KiB
Python
"""Unit tests for the env-driven JOBS builder in services.scheduler."""
|
|
|
|
import pytest
|
|
|
|
|
|
def test_build_jobs_uses_documented_defaults(monkeypatch):
|
|
"""No env overrides → default cadences."""
|
|
for v in (
|
|
"SCHEDULER_DATA_REFRESH_INTERVAL",
|
|
"SCHEDULER_HEALTH_CHECK_INTERVAL",
|
|
"SCHEDULER_TICK_SECONDS",
|
|
"SCHEDULER_SCRIPT_RUN_INTERVAL",
|
|
):
|
|
monkeypatch.delenv(v, raising=False)
|
|
from services.scheduler.__main__ import build_jobs, resolved_tick_seconds
|
|
jobs = {name: schedule for name, schedule, *_ in build_jobs()}
|
|
assert jobs["data-refresh"] == "every 15m"
|
|
assert jobs["health-check"] == "every 5m"
|
|
assert jobs["script-runner"] == "every 1m"
|
|
assert jobs["marketplaces"] == "daily 03:00"
|
|
assert jobs["bq-metadata-refresh"] == "every 4h"
|
|
assert resolved_tick_seconds() == 30
|
|
|
|
|
|
def test_build_jobs_honors_bq_metadata_env_override(monkeypatch):
|
|
monkeypatch.setenv("SCHEDULER_BQ_METADATA_REFRESH_INTERVAL", "7200") # 2h
|
|
from services.scheduler.__main__ import build_jobs
|
|
jobs = {name: schedule for name, schedule, *_ in build_jobs()}
|
|
assert jobs["bq-metadata-refresh"] == "every 2h"
|
|
|
|
|
|
def test_build_jobs_honors_env_overrides(monkeypatch):
|
|
monkeypatch.setenv("SCHEDULER_DATA_REFRESH_INTERVAL", "1800") # 30m
|
|
monkeypatch.setenv("SCHEDULER_HEALTH_CHECK_INTERVAL", "60") # 1m
|
|
monkeypatch.setenv("SCHEDULER_SCRIPT_RUN_INTERVAL", "120") # 2m
|
|
monkeypatch.setenv("SCHEDULER_TICK_SECONDS", "10")
|
|
from services.scheduler.__main__ import build_jobs, resolved_tick_seconds
|
|
jobs = {name: schedule for name, schedule, *_ in build_jobs()}
|
|
assert jobs["data-refresh"] == "every 30m"
|
|
assert jobs["health-check"] == "every 1m"
|
|
assert jobs["script-runner"] == "every 2m"
|
|
assert resolved_tick_seconds() == 10
|
|
|
|
|
|
@pytest.mark.parametrize("var", [
|
|
"SCHEDULER_DATA_REFRESH_INTERVAL",
|
|
"SCHEDULER_HEALTH_CHECK_INTERVAL",
|
|
"SCHEDULER_TICK_SECONDS",
|
|
"SCHEDULER_SCRIPT_RUN_INTERVAL",
|
|
])
|
|
@pytest.mark.parametrize("bad", ["0", "-5", "abc", ""])
|
|
def test_build_jobs_rejects_invalid_env(monkeypatch, var, bad):
|
|
monkeypatch.setenv(var, bad)
|
|
from services.scheduler.__main__ import build_jobs
|
|
with pytest.raises(ValueError):
|
|
build_jobs()
|
|
|
|
|
|
def test_build_jobs_rejects_tick_larger_than_smallest_interval(monkeypatch):
|
|
"""Tick must be <= the smallest job interval, otherwise jobs would
|
|
consistently miss their cadence by up to one tick."""
|
|
monkeypatch.setenv("SCHEDULER_HEALTH_CHECK_INTERVAL", "60")
|
|
monkeypatch.setenv("SCHEDULER_TICK_SECONDS", "120")
|
|
from services.scheduler.__main__ import build_jobs
|
|
with pytest.raises(ValueError, match="tick"):
|
|
build_jobs()
|
|
|
|
|
|
def test_build_jobs_includes_run_due_endpoint():
|
|
"""The script-runner job must POST to /api/scripts/run-due."""
|
|
from services.scheduler.__main__ import build_jobs
|
|
target = next(j for j in build_jobs() if j[0] == "script-runner")
|
|
name, schedule, endpoint, method, _timeout = target
|
|
assert endpoint == "/api/scripts/run-due"
|
|
assert method == "POST"
|
|
|
|
|
|
@pytest.mark.parametrize("seconds,expected", [
|
|
# Exact multiples of 60 → unchanged.
|
|
(60, "every 1m"),
|
|
(120, "every 2m"),
|
|
(900, "every 15m"),
|
|
# Exact multiples of 3600 → hour form.
|
|
(3600, "every 1h"),
|
|
(7200, "every 2h"),
|
|
# Non-multiples of 60 must round UP (ceiling), so the job never fires
|
|
# MORE often than the operator configured. Devin BUG_0001 on 1af2081.
|
|
(90, "every 2m"), # 90s asked → 120s scheduled, NOT 60s
|
|
(150, "every 3m"),
|
|
(61, "every 2m"),
|
|
(3601, "every 61m"),
|
|
# Sub-minute clamps to 1m (schedule grammar minute-grained).
|
|
(30, "every 1m"),
|
|
(1, "every 1m"),
|
|
])
|
|
def test_seconds_to_schedule_rounds_up_not_down(seconds, expected):
|
|
from services.scheduler.__main__ import _seconds_to_schedule
|
|
assert _seconds_to_schedule(seconds) == expected, (
|
|
f"_seconds_to_schedule({seconds}) must round UP — flooring would "
|
|
f"make jobs fire more often than the operator configured."
|
|
)
|