diff --git a/.server-changes/queue-metrics-dashboard.md b/.server-changes/queue-metrics-dashboard.md
new file mode 100644
index 00000000000..37baffc7aaa
--- /dev/null
+++ b/.server-changes/queue-metrics-dashboard.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Queue metrics and health on the Queues page: per-queue depth, throughput, concurrency, throttling, and scheduling-delay charts, plus a per-queue detail view. Off by default; enabled per organization.
diff --git a/apps/webapp/app/components/primitives/UsageSparkline.tsx b/apps/webapp/app/components/primitives/UsageSparkline.tsx
index 2ffc1936a1d..7c4bbd5d262 100644
--- a/apps/webapp/app/components/primitives/UsageSparkline.tsx
+++ b/apps/webapp/app/components/primitives/UsageSparkline.tsx
@@ -27,6 +27,8 @@ export type UsageSparklineProps = {
color?: string;
/** Unit shown in the tooltip (e.g. calls, tokens). */
unitLabel?: UnitLabel;
+ /** Trailing scalar shown after the chart. Defaults to the sum of buckets (override for gauges, e.g. peak). */
+ total?: number;
/** Format the trailing total. Defaults to `toLocaleString`. */
formatTotal?: (total: number) => string;
/** Class for the trailing total label. */
@@ -44,14 +46,16 @@ export function UsageSparkline({
bucketIntervalMs,
color = "#3B82F6",
unitLabel = { singular: "call", plural: "calls" },
+ total: totalOverride,
formatTotal,
totalClassName = "text-blue-400",
}: UsageSparklineProps) {
- if (!data || data.every((v) => v === 0)) {
+ const hasTotalOverride = totalOverride !== undefined;
+ if (!data || data.length === 0 || (data.every((v) => v === 0) && !hasTotalOverride)) {
return – ;
}
- const total = data.reduce((a, b) => a + b, 0);
+ const total = totalOverride ?? data.reduce((a, b) => a + b, 0);
const max = Math.max(...data);
// Map each bucket to a dated point so the tooltip can show the window it
diff --git a/apps/webapp/app/components/primitives/charts/Chart.tsx b/apps/webapp/app/components/primitives/charts/Chart.tsx
index 57a2692e677..8894c2da34d 100644
--- a/apps/webapp/app/components/primitives/charts/Chart.tsx
+++ b/apps/webapp/app/components/primitives/charts/Chart.tsx
@@ -216,7 +216,7 @@ const ChartTooltipContent = React.forwardRef<
)}
diff --git a/apps/webapp/app/components/primitives/charts/ChartLine.tsx b/apps/webapp/app/components/primitives/charts/ChartLine.tsx
index 1edd5a2357e..5d5fb95ecce 100644
--- a/apps/webapp/app/components/primitives/charts/ChartLine.tsx
+++ b/apps/webapp/app/components/primitives/charts/ChartLine.tsx
@@ -4,6 +4,7 @@ import {
CartesianGrid,
Line,
LineChart,
+ ReferenceLine,
XAxis,
YAxis,
type XAxisProps,
@@ -48,12 +49,38 @@ export type ChartLineRendererProps = {
tooltipLabelFormatter?: (label: string, payload: any[]) => string;
/** Optional formatter for numeric tooltip values (e.g. bytes, duration) */
tooltipValueFormatter?: (value: number) => string;
+ /** Draw a dot at each data point. Defaults to true; turn off for dense/compact charts. */
+ showDots?: boolean;
+ /** Horizontal reference lines (e.g. limits); the y-domain extends to include them. */
+ referenceLines?: Array<{ y: number; label?: string; color?: string }>;
/** Width injected by ResponsiveContainer */
width?: number;
/** Height injected by ResponsiveContainer */
height?: number;
};
+/** Reference-line label: right-aligned just below the line (recharts injects viewBox). */
+function ReferenceLineLabel({
+ viewBox,
+ value,
+}: {
+ viewBox?: { x: number; y: number; width: number };
+ value: string;
+}) {
+ if (!viewBox) return null;
+ return (
+
+ {value}
+
+ );
+}
+
/**
* Line chart renderer for the compound component system.
* Must be used within a Chart.Root.
@@ -73,6 +100,8 @@ export function ChartLineRenderer({
stacked = false,
tooltipLabelFormatter,
tooltipValueFormatter,
+ showDots = true,
+ referenceLines,
width,
height,
}: ChartLineRendererProps) {
@@ -176,6 +205,17 @@ export function ChartLineRenderer({
labelFormatter={tooltipLabelFormatter}
/>
{/* Note: Legend is now rendered by ChartRoot outside the chart container */}
+ {referenceLines?.map((line) => (
+
: undefined}
+ />
+ ))}
{visibleSeries.map((key) => (
{/* Note: Legend is now rendered by ChartRoot outside the chart container */}
+ {referenceLines?.map((line) => (
+
: undefined}
+ />
+ ))}
{visibleSeries.map((key) => (
diff --git a/apps/webapp/app/components/query/QueryEditor.tsx b/apps/webapp/app/components/query/QueryEditor.tsx
index 8520d2a7a0b..eb9fd08ffb1 100644
--- a/apps/webapp/app/components/query/QueryEditor.tsx
+++ b/apps/webapp/app/components/query/QueryEditor.tsx
@@ -72,7 +72,7 @@ import type { action as titleAction } from "~/routes/resources.orgs.$organizatio
import type { QueryScope } from "~/services/queryService.server";
import { downloadFile, rowsToCSV, rowsToJSON } from "~/utils/dataExport";
import { organizationBillingPath } from "~/utils/pathBuilder";
-import { querySchemas } from "~/v3/querySchemas";
+import { visibleQuerySchemas } from "~/v3/querySchemas";
/** Convert a Date or ISO string to ISO string format */
function toISOString(value: Date | string): string {
@@ -245,7 +245,7 @@ const QueryEditorForm = forwardRef<
{
initMollifierDrainerWorker();
initMollifierStaleSweepWorker();
initBillingLimitWorker();
+initQueueMetricsEmitter();
+initQueueMetricsConsumer();
bootstrap().catch((error) => {
logError(error);
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
index f5e0f0a671c..26e178d18a6 100644
--- a/apps/webapp/app/env.server.ts
+++ b/apps/webapp/app/env.server.ts
@@ -883,6 +883,31 @@ const EnvironmentSchema = z
RUN_ENGINE_REUSE_SNAPSHOT_COUNT: z.coerce.number().int().default(0),
RUN_ENGINE_MAXIMUM_ENV_COUNT: z.coerce.number().int().optional(),
RUN_ENGINE_RUN_QUEUE_SHARD_COUNT: z.coerce.number().int().default(4),
+ // Queue metrics ingestion (Redis Stream -> ClickHouse). The runtime on/off is the
+ // `queue_metrics:enabled` Redis key; these gate emitter construction + consumer boot.
+ QUEUE_METRICS_EMIT_ENABLED: z.string().default("0"),
+ QUEUE_METRICS_CONSUMER_ENABLED: z.string().default("0"),
+ QUEUE_METRICS_STREAM_SHARD_COUNT: z.coerce.number().int().default(4),
+ QUEUE_METRICS_CONSUMER_BATCH_SIZE: z.coerce.number().int().default(1000),
+ // Counter stream (exact counts, loss-intolerant). Unset host => the run-queue Redis;
+ // set it to a dedicated instance so counter backlog never competes with the run queue.
+ QUEUE_METRICS_REDIS_HOST: z.string().optional(),
+ QUEUE_METRICS_REDIS_PORT: z.coerce.number().optional(),
+ QUEUE_METRICS_REDIS_USERNAME: z.string().optional(),
+ QUEUE_METRICS_REDIS_PASSWORD: z.string().optional(),
+ QUEUE_METRICS_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"),
+ // Default depends on where the stream lives: see metricsDefinition() in
+ // queueMetrics.server.ts (2M on the shared run-queue Redis, 8M on a dedicated one).
+ QUEUE_METRICS_COUNTER_STREAM_MAXLEN: z.coerce.number().int().optional(),
+ // TTL (seconds) on the per-(queue,op) cumulative odometer key, refreshed on every write.
+ // Idle-past-TTL queues purge and self-heal (restart from 1) on return; default 7 days.
+ QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS: z.coerce.number().int().default(604_800),
+ // Per-env distinct queue_name cap (0 = unlimited); overflow maps to "__overflow__".
+ QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV: z.coerce.number().int().default(1000),
+ QUEUE_METRICS_MAX_CONCURRENCY_KEYS_PER_QUEUE: z.coerce.number().int().default(10_000),
+ // Fraction (0..1) of ops that emit a gauge; counters are never sampled. Dial below 1
+ // only if EngineCPU is too high in slow-path-heavy regions (hurts low-traffic queues).
+ QUEUE_METRICS_GAUGE_SAMPLE_RATE: z.coerce.number().min(0).max(1).default(1),
RUN_ENGINE_WORKER_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().default(60_000),
RUN_ENGINE_RETRY_WARM_START_THRESHOLD_MS: z.coerce.number().int().default(30_000),
RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS: z.coerce.number().int().default(200),
diff --git a/apps/webapp/app/hooks/useMetricResourceQuery.ts b/apps/webapp/app/hooks/useMetricResourceQuery.ts
new file mode 100644
index 00000000000..8cb8faec507
--- /dev/null
+++ b/apps/webapp/app/hooks/useMetricResourceQuery.ts
@@ -0,0 +1,109 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { useInterval } from "./useInterval";
+
+export type MetricResourceRow = Record;
+
+type MetricResourceResponse =
+ | { success: true; data: { rows: MetricResourceRow[] } }
+ | { success: false; error: string };
+
+export type MetricResourceTimeRange = {
+ period: string | null;
+ from: string | null;
+ to: string | null;
+};
+
+export type MetricResourceQueryOptions = {
+  organizationId: string;
+  projectId: string;
+  environmentId: string;
+  timeRange: MetricResourceTimeRange;
+  defaultPeriod: string; // sent as `period` only when timeRange has no period, from, or to
+  queues?: string[]; // optional queue filter; left out of the request when empty
+  fillGaps?: boolean; // coerced to a boolean before sending (undefined becomes false)
+  refreshIntervalMs?: number; // passed to useInterval as `interval`; defaults to 60_000
+};
+
+/**
+ * Client-side fetch of a TRQL query from the metric resource route (`POST /resources/metric`),
+ * like the dashboard widgets: the hook owns its loading/error state, reloads on an interval
+ * and on focus, and aborts the in-flight request when its inputs change or on unmount.
+ *
+ * @param query - TRQL query text, sent as-is in the request body.
+ * @param opts - Tenant identifiers, time range, and the optional queue filter, gap fill and
+ *   refresh interval (60s when omitted).
+ * @returns `rows` (an empty array until the first successful load), `isLoading`,
+ *   `showLoading` (loading with no rows loaded yet), and `failed` for the latest request.
+ */
+export function useMetricResourceQuery(query: string, opts: MetricResourceQueryOptions) {
+  const [rows, setRows] = useState<MetricResourceRow[] | null>(null);
+  const [isLoading, setIsLoading] = useState(true);
+  const [failed, setFailed] = useState(false);
+  const abortRef = useRef<AbortController | null>(null);
+
+  const {
+    organizationId,
+    projectId,
+    environmentId,
+    defaultPeriod,
+    fillGaps,
+    refreshIntervalMs = 60_000,
+  } = opts;
+  const { period, from, to } = opts.timeRange;
+  // Serialize the filter so `load` only changes identity when the queue list itself changes.
+  // JSON (rather than a comma-joined string) round-trips names that contain commas intact.
+  const queuesKey =
+    opts.queues && opts.queues.length > 0 ? JSON.stringify(opts.queues) : undefined;
+
+  const load = useCallback(() => {
+    // Only the latest request may update state: cancel whatever is still in flight.
+    abortRef.current?.abort();
+    const controller = new AbortController();
+    abortRef.current = controller;
+    setIsLoading(true);
+    fetch("/resources/metric", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        query,
+        scope: "environment",
+        // Fall back to the default period only when no explicit range was given.
+        period: period ?? (from || to ? null : defaultPeriod),
+        from,
+        to,
+        fillGaps: !!fillGaps,
+        organizationId,
+        projectId,
+        environmentId,
+        ...(queuesKey !== undefined ? { queues: JSON.parse(queuesKey) as string[] } : {}),
+      }),
+      signal: controller.signal,
+    })
+      .then((res) => res.json() as Promise<MetricResourceResponse>)
+      .then((data) => {
+        // A superseded request must not overwrite the state of the newer one.
+        if (controller.signal.aborted) return;
+        if (data.success) {
+          setRows(data.data.rows);
+          setFailed(false);
+        } else {
+          // Keep the previously loaded rows; only flag the failure.
+          setFailed(true);
+        }
+        setIsLoading(false);
+      })
+      .catch((error) => {
+        // Aborts are expected (input change / unmount) and are not failures.
+        if (error instanceof DOMException && error.name === "AbortError") return;
+        if (!controller.signal.aborted) {
+          setFailed(true);
+          setIsLoading(false);
+        }
+      });
+  }, [
+    query,
+    period,
+    from,
+    to,
+    defaultPeriod,
+    fillGaps,
+    organizationId,
+    projectId,
+    environmentId,
+    queuesKey,
+  ]);
+
+  // Load on mount and whenever an input changes; abort the pending request on cleanup.
+  useEffect(() => {
+    load();
+    return () => abortRef.current?.abort();
+  }, [load]);
+
+  useInterval({ interval: refreshIntervalMs, onLoad: false, onFocus: true, callback: load });
+
+  return { rows: rows ?? [], isLoading, showLoading: isLoading && !rows, failed };
+}
diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts
index 971fc9a3033..d831568248d 100644
--- a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts
+++ b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts
@@ -550,7 +550,186 @@ const llmDashboard: BuiltInDashboard = {
},
};
-const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard];
+const queuesDashboard: BuiltInDashboard = {
+ key: "queues",
+ title: "Queues",
+ filters: ["queues"],
+ layout: {
+ version: "1",
+ layout: [
+ { i: "env-used", x: 0, y: 0, w: 3, h: 4 },
+ { i: "env-limit", x: 3, y: 0, w: 3, h: 4 },
+ { i: "env-avail", x: 6, y: 0, w: 3, h: 4 },
+ { i: "env-sat", x: 9, y: 0, w: 3, h: 4 },
+ { i: "sat-time", x: 0, y: 4, w: 6, h: 9 },
+ { i: "used-limit", x: 6, y: 4, w: 6, h: 9 },
+ { i: "t-pressure", x: 0, y: 13, w: 12, h: 2, minH: 2, maxH: 2 },
+ { i: "pressure", x: 0, y: 15, w: 12, h: 11 },
+ { i: "t-trends", x: 0, y: 26, w: 12, h: 2, minH: 2, maxH: 2 },
+ { i: "running-q", x: 0, y: 28, w: 6, h: 9 },
+ { i: "queued-q", x: 6, y: 28, w: 6, h: 9 },
+ { i: "throttled-q", x: 0, y: 37, w: 6, h: 9 },
+ { i: "throughput", x: 6, y: 37, w: 6, h: 9 },
+ { i: "wait-pct", x: 0, y: 46, w: 12, h: 9 },
+ ],
+ widgets: {
+ "env-used": {
+ title: "Concurrency in use",
+ query: `SELECT argMax(max_env_running, bucket_start) AS in_use\nFROM env_metrics`,
+ display: { type: "bignumber", column: "in_use", aggregation: "max", abbreviate: false },
+ },
+ "env-limit": {
+ title: "Environment limit",
+ query: `SELECT argMax(max_env_limit, bucket_start) AS env_limit\nFROM env_metrics`,
+ display: { type: "bignumber", column: "env_limit", aggregation: "max", abbreviate: false },
+ },
+ "env-avail": {
+ title: "Available slots",
+ query: `SELECT argMax(max_env_limit, bucket_start) - argMax(max_env_running, bucket_start) AS available\nFROM env_metrics`,
+ display: { type: "bignumber", column: "available", aggregation: "max", abbreviate: false },
+ },
+ "env-sat": {
+ title: "Env saturation",
+ query: `SELECT round(argMax(max_env_running, bucket_start) * 100.0 / nullIf(argMax(max_env_limit, bucket_start), 0), 1) AS saturation\nFROM env_metrics`,
+ display: {
+ type: "bignumber",
+ column: "saturation",
+ aggregation: "max",
+ abbreviate: false,
+ suffix: "%",
+ },
+ },
+ "sat-time": {
+ title: "Environment saturation over time",
+ query: `SELECT timeBucket() AS t,\n round(max(max_env_running) * 100.0 / nullIf(max(max_env_limit), 0), 1) AS saturation\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["saturation"],
+ groupByColumn: null,
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "max",
+ },
+ },
+ "used-limit": {
+ title: "Concurrency used vs limit",
+ query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+      // Ungrouped gauges (used, limit): carry the last known value of each across idle buckets instead of dropping to 0.
+ fillGaps: true,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["used", "limit"],
+ groupByColumn: null,
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "max",
+ },
+ },
+ "t-pressure": { title: "Queue pressure", query: "", display: { type: "title" } },
+ pressure: {
+ title: "Queue pressure",
+ query: `SELECT queue,\n argMax(max_running, bucket_start) AS running,\n argMax(max_queued, bucket_start) AS queued,\n argMax(max_limit, bucket_start) AS limit,\n running + queued AS demand,\n max(max_queued) AS peak_queued,\n sum(throttled_count) AS throttled,\n multiIf(running >= limit AND queued > 0, 'queue-limited', queued > 0, 'backlogged', 'healthy') AS status\nFROM queue_metrics\nGROUP BY queue\nORDER BY peak_queued DESC`,
+ display: {
+ type: "table",
+ prettyFormatting: true,
+ sorting: [{ id: "peak_queued", desc: true }],
+ },
+ },
+ "t-trends": { title: "Per-queue trends", query: "", display: { type: "title" } },
+ "running-q": {
+ title: "Running by queue",
+ query: `SELECT timeBucket() AS t, queue, max(max_running) AS running\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`,
+ // Grouped gauge: carry each queue's running across idle buckets (per-group LOCF).
+ fillGaps: true,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["running"],
+ groupByColumn: "queue",
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "max",
+ },
+ },
+ "queued-q": {
+ title: "Queue depth (backlog) by queue",
+ query: `SELECT timeBucket() AS t, queue, max(max_queued) AS queued\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`,
+ // Grouped gauge: carry each queue's backlog across idle buckets (per-group LOCF).
+ fillGaps: true,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["queued"],
+ groupByColumn: "queue",
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "max",
+ },
+ },
+ "throttled-q": {
+ title: "Throttled buckets by queue",
+ query: `SELECT timeBucket() AS t, queue, sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`,
+ // Grouped counter: per-group zero-fill so idle buckets read 0, not a gap.
+ fillGaps: true,
+ display: {
+ type: "chart",
+ chartType: "bar",
+ xAxisColumn: "t",
+ yAxisColumns: ["throttled"],
+ groupByColumn: "queue",
+ stacked: true,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "sum",
+ },
+ },
+ throughput: {
+ title: "Enqueued vs started",
+ // Counter states merge per queue, then sum outside: a single merge across queues
+ // mixes unrelated odometers and returns wrong totals.
+ query: `SELECT t, sum(enq) AS enqueued, sum(st) AS started\nFROM (\n SELECT timeBucket() AS t, queue,\n deltaSumTimestampMerge(enqueue_delta) AS enq,\n deltaSumTimestampMerge(started_delta) AS st\n FROM queue_metrics\n GROUP BY t, queue\n)\nGROUP BY t\nORDER BY t`,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["enqueued", "started"],
+ groupByColumn: null,
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "sum",
+ },
+ },
+ "wait-pct": {
+ title: "Scheduling delay p50/p95/p99 (ms)",
+ query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS p99\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+ display: {
+ type: "chart",
+ chartType: "line",
+ xAxisColumn: "t",
+ yAxisColumns: ["p50", "p95", "p99"],
+ groupByColumn: null,
+ stacked: false,
+ sortByColumn: null,
+ sortDirection: "asc",
+ aggregation: "max",
+ },
+ },
+ },
+ },
+};
+
+const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard, queuesDashboard];
export function builtInDashboardList(): BuiltInDashboard[] {
return builtInDashboards;
diff --git a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts
index df43864b53a..0b84e971b2f 100644
--- a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts
@@ -37,6 +37,9 @@ export const Widget = z.object({
title: z.string(),
query: z.string().default(""),
display: QueryWidgetConfig,
+ // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters).
+ // Top-level rather than in `display` because display config is client-only and never reaches the query POST.
+ fillGaps: z.boolean().optional(),
});
export type Widget = z.infer;
diff --git a/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts
new file mode 100644
index 00000000000..c7a8166b6a3
--- /dev/null
+++ b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts
@@ -0,0 +1,94 @@
+import { TaskQueueType, type Prisma } from "@trigger.dev/database";
+import { type AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { engine } from "~/v3/runEngine.server";
+import { BasePresenter } from "./basePresenter.server";
+
+const MAX_ALLOCATION_QUEUES = 500;
+
+export type QueueAllocationItem = {
+  id: string; // the queue's friendlyId
+  name: string; // queue name with a leading "task/" removed
+  type: "task" | "custom"; // "task" for VIRTUAL queues, "custom" otherwise
+  running: number; // current concurrency reported by the engine (0 when absent)
+  queued: number; // queue length reported by the engine (0 when absent)
+  paused: boolean;
+  /** Explicit per-queue limit; null means the queue floats up to the env limit. */
+  limit: number | null;
+  overridden: boolean; // true when concurrencyLimitOverriddenAt is set
+};
+
+export type QueueAllocation = {
+  queues: QueueAllocationItem[];
+  totalQueues: number; // count of all matching queues, which may exceed queues.length
+  truncated: boolean; // true when totalQueues is greater than the number of queues returned
+  /** Sum of explicit limits, each clamped to the env limit. */
+  allocated: number;
+  unlimitedCount: number; // returned queues whose concurrencyLimit is null
+};
+
+/**
+ * Every V2 queue in the environment (capped at MAX_ALLOCATION_QUEUES, ordered by
+ * `orderableName`) with live running/queued counts, for the allocation view.
+ */
+export class QueueAllocationPresenter extends BasePresenter {
+  /**
+   * @param environment - Environment whose queues are listed; its
+   *   `maximumConcurrencyLimit` clamps each queue's contribution to `allocated`.
+   * @returns The capped queue list plus totals. `allocated` and `unlimitedCount` are
+   *   computed from the returned queues only, so they are partial when `truncated` is true.
+   */
+  public async call({
+    environment,
+  }: {
+    environment: AuthenticatedEnvironment;
+  }): Promise<QueueAllocation> {
+    const where: Prisma.TaskQueueWhereInput = {
+      runtimeEnvironmentId: environment.id,
+      version: "V2",
+    };
+
+    // The full count and the capped page are independent, so fetch them together.
+    const [totalQueues, queues] = await Promise.all([
+      this._replica.taskQueue.count({ where }),
+      this._replica.taskQueue.findMany({
+        where,
+        select: {
+          friendlyId: true,
+          name: true,
+          type: true,
+          paused: true,
+          concurrencyLimit: true,
+          concurrencyLimitOverriddenAt: true,
+        },
+        orderBy: { orderableName: "asc" },
+        take: MAX_ALLOCATION_QUEUES,
+      }),
+    ]);
+
+    // Live counts are looked up by the stored queue name.
+    const names = queues.map((q) => q.name);
+    const [queuedByQueue, runningByQueue] = await Promise.all([
+      engine.lengthOfQueues(environment, names),
+      engine.currentConcurrencyOfQueues(environment, names),
+    ]);
+
+    const envLimit = environment.maximumConcurrencyLimit;
+    let allocated = 0;
+    let unlimitedCount = 0;
+
+    const items: QueueAllocationItem[] = [];
+    for (const queue of queues) {
+      if (queue.concurrencyLimit === null) {
+        unlimitedCount++;
+      } else {
+        // A queue can never use more than the environment allows.
+        allocated += Math.min(queue.concurrencyLimit, envLimit);
+      }
+      items.push({
+        id: queue.friendlyId,
+        name: queue.name.replace(/^task\//, ""),
+        type: queue.type === TaskQueueType.VIRTUAL ? "task" : "custom",
+        running: runningByQueue[queue.name] ?? 0,
+        queued: queuedByQueue[queue.name] ?? 0,
+        paused: queue.paused,
+        limit: queue.concurrencyLimit,
+        overridden: queue.concurrencyLimitOverriddenAt !== null,
+      });
+    }
+
+    return {
+      queues: items,
+      totalQueues,
+      truncated: totalQueues > queues.length,
+      allocated,
+      unlimitedCount,
+    };
+  }
+}
diff --git a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts
index 024a1342b0a..751d4b0a602 100644
--- a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts
@@ -3,6 +3,8 @@ import type { Prisma } from "@trigger.dev/database";
import { TaskQueueType } from "@trigger.dev/database";
import { type PrismaClientOrTransaction } from "~/db.server";
import { type AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server";
+import { logger } from "~/services/logger.server";
import { determineEngineVersion } from "~/v3/engineVersion.server";
import { engine } from "~/v3/runEngine.server";
import { BasePresenter } from "./basePresenter.server";
@@ -13,6 +15,12 @@ type QueueListEngine = Pick = {
task: TaskQueueType.VIRTUAL,
custom: TaskQueueType.NAMED,
@@ -30,6 +38,38 @@ const queueListSelect = {
paused: true,
} satisfies Prisma.TaskQueueSelect;
+type QueueListRow = Prisma.TaskQueueGetPayload<{ select: typeof queueListSelect }>;
+
+type QueueListItem = ReturnType;
+
+type QueueListPagination =
+ | { mode: "filtered"; currentPage: number; hasMore: boolean }
+ | { mode: "unfiltered"; currentPage: number; totalPages: number; count: number };
+
+// The `?: undefined` markers keep every key reachable across the union, so consumers
+// can destructure before narrowing on `success`.
+export type QueueListResult =
+ | {
+ success: false;
+ code: string;
+ totalQueues: number;
+ hasFilters: boolean;
+ queues?: undefined;
+ pagination?: undefined;
+ }
+ | {
+ success: true;
+ queues: QueueListItem[];
+ pagination: QueueListPagination;
+ totalQueues?: number;
+ hasFilters: boolean;
+ code?: undefined;
+ };
+
+/** Formats a Date as its UTC ISO string cut to seconds, with the "T" separator replaced by a space. */
+function formatClickhouseDateTime(date: Date): string {
+  const iso = date.toISOString();
+  const toSeconds = iso.slice(0, 19);
+  return toSeconds.replace("T", " ");
+}
+
function buildQueueListWhere(
environmentId: string,
query: string | undefined,
@@ -70,13 +110,15 @@ export class QueueListPresenter extends BasePresenter {
query,
page,
type,
+ sort = "name",
}: {
environment: AuthenticatedEnvironment;
query?: string;
page: number;
perPage?: number;
type?: "task" | "custom";
- }) {
+ sort?: QueueListSort;
+ }): Promise {
const hasFilters = Boolean(query?.trim()) || type !== undefined;
const engineVersion = await determineEngineVersion({ environment });
@@ -110,6 +152,18 @@ export class QueueListPresenter extends BasePresenter {
};
}
+ if (sort !== "name") {
+ // Ranking is additive: any failure or unsupported input falls back to name order.
+ try {
+ const ranked = await this.getRankedQueues(environment, query, page, type, sort);
+ if (ranked) {
+ return ranked;
+ }
+ } catch (error) {
+ logger.warn("Queue ranking unavailable, falling back to name order", { error });
+ }
+ }
+
if (hasFilters) {
const { queues, hasMore } = await this.getFilteredQueues(environment, query, page, type);
@@ -143,6 +197,132 @@ export class QueueListPresenter extends BasePresenter {
};
}
+  /**
+   * Builds one page of the queue list ordered by recent activity. ClickHouse ranks the
+   * queues that have recent metrics and returns the requested page of names; queues with
+   * no recent metrics follow in name order.
+   *
+   * @param environment - Environment whose queues are listed.
+   * @param query - Optional name search; trimmed and passed to ClickHouse as `nameContains`.
+   * @param page - 1-based page number.
+   * @param type - Queue type filter; ranking is skipped (null is returned) when it is set.
+   * @param sort - Ranking mode; "queued" sets `byQueuedOnly` on the ranking query.
+   * @returns The page result, or null when ranking does not apply (a type filter is set, or
+   *   more than MAX_RANKED_QUEUES queues are ranked) so the caller can fall back.
+   * @throws Any error returned by the ClickHouse ranking queries.
+   */
+  private async getRankedQueues(
+    environment: AuthenticatedEnvironment,
+    query: string | undefined,
+    page: number,
+    type: "task" | "custom" | undefined,
+    sort: Exclude<QueueListSort, "name">
+  ) {
+    if (type !== undefined) {
+      return null;
+    }
+
+    const clickhouse = await clickhouseFactory.getClickhouseForOrganization(
+      environment.organizationId,
+      "query"
+    );
+
+    // The window start is aligned to the minute so repeated page loads produce identical
+    // query text and can share ClickHouse query-cache entries.
+    const windowStartMs =
+      Math.floor((Date.now() - QUEUE_RANKING_WINDOW_MINUTES * 60 * 1000) / 60_000) * 60_000;
+    const rankingArgs = {
+      organizationId: environment.organizationId,
+      projectId: environment.projectId,
+      environmentId: environment.id,
+      startTime: formatClickhouseDateTime(new Date(windowStartMs)),
+      nameContains: query?.trim() ?? "",
+    };
+
+    const offset = (page - 1) * this.perPage;
+
+    // One scan returns the page and the total ranked count (window function).
+    const [pageError, pageRows] = await clickhouse.queueMetrics.ranking({
+      ...rankingArgs,
+      byQueuedOnly: sort === "queued" ? 1 : 0,
+      limit: this.perPage,
+      offset,
+    });
+    if (pageError) {
+      throw pageError;
+    }
+
+    let ranked = pageRows?.[0]?.ranked_total ?? 0;
+    if (ranked === 0 && offset > 0) {
+      // Empty page past the ranked head: fetch the count alone for the tail slot math.
+      const [countError, countRows] = await clickhouse.queueMetrics.rankingCount(rankingArgs);
+      if (countError) {
+        throw countError;
+      }
+      ranked = countRows?.[0]?.ranked ?? 0;
+    }
+    // Too many ranked queues to exclude from the tail query: let the caller fall back.
+    if (ranked > MAX_RANKED_QUEUES) {
+      return null;
+    }
+
+    const where = buildQueueListWhere(environment.id, query, type);
+    const totalQueues = await this._replica.taskQueue.count({ where });
+
+    // Head of the page: the ranked names, resolved to rows in ranking order.
+    let rankedPageQueues: QueueListRow[] = [];
+    if ((pageRows?.length ?? 0) > 0) {
+      const rankedNames = (pageRows ?? []).map((row) => row.queue_name);
+      rankedPageQueues = await this.findQueuesByNames(where, rankedNames);
+    }
+
+    // Tail of the page: name-ordered queues that have no recent metrics. Slot math uses the
+    // ClickHouse counts so pages never overlap, even if some ranked names no longer exist.
+    const rankedSlots = Math.min(Math.max(ranked - offset, 0), this.perPage);
+    const tailNeeded = this.perPage - rankedSlots;
+    let tailQueues: QueueListRow[] = [];
+    if (tailNeeded > 0) {
+      let excludedNames: string[] = [];
+      if (ranked > 0) {
+        // Every ranked name is excluded from the tail, not just the names on this page.
+        const [allError, allRows] = await clickhouse.queueMetrics.rankingNames({
+          ...rankingArgs,
+          limit: MAX_RANKED_QUEUES,
+        });
+        if (allError) {
+          throw allError;
+        }
+        excludedNames = (allRows ?? []).map((row) => row.queue_name);
+      }
+      // AND keeps the search's name filter intact alongside the exclusion (a spread
+      // would overwrite one name condition with the other).
+      tailQueues = await this._replica.taskQueue.findMany({
+        where: { AND: [where, { name: { notIn: excludedNames } }] },
+        select: queueListSelect,
+        orderBy: {
+          orderableName: "asc",
+        },
+        // Tail rows already consumed by earlier pages.
+        skip: Math.max(0, offset - ranked),
+        take: tailNeeded,
+      });
+    }
+
+    return {
+      success: true as const,
+      queues: await this.enrichQueues(environment, [...rankedPageQueues, ...tailQueues]),
+      pagination: {
+        mode: "unfiltered" as const,
+        currentPage: page,
+        totalPages: Math.max(1, Math.ceil(totalQueues / this.perPage)),
+        count: totalQueues,
+      },
+      totalQueues,
+      hasFilters: Boolean(query?.trim()) || type !== undefined,
+    };
+  }
+
+  /**
+   * Loads the queues whose name is in `names` and that also match `where`.
+   *
+   * @param where - Base filter combined (AND) with the name list.
+   * @param names - Queue names in the order the rows should come back.
+   * @returns Rows in the order of `names`; names with no matching row are skipped.
+   */
+  private async findQueuesByNames(
+    where: Prisma.TaskQueueWhereInput,
+    names: string[]
+  ): Promise<QueueListRow[]> {
+    if (names.length === 0) {
+      return [];
+    }
+    const queues = await this._replica.taskQueue.findMany({
+      where: { AND: [where, { name: { in: names } }] },
+      select: queueListSelect,
+    });
+    // The query result is re-ordered to follow `names`.
+    const byName = new Map(queues.map((queue) => [queue.name, queue]));
+    return names.flatMap((name) => byName.get(name) ?? []);
+  }
+
private async getFilteredQueues(
environment: AuthenticatedEnvironment,
query: string | undefined,
diff --git a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts
new file mode 100644
index 00000000000..a36c402dda7
--- /dev/null
+++ b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts
@@ -0,0 +1,139 @@
+import { type AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server";
+import { logger } from "~/services/logger.server";
+
+export type QueueListMetric = {
+  p50WaitMs: number | null; // null when the summary value is not a finite number
+  p95WaitMs: number | null; // null when the summary value is not a finite number
+  peakQueued: number;
+  /** Equal-width buckets, oldest first, carry-forward filled across idle gaps. */
+  depthSparkline: number[];
+};
+
+/** Sparkline grid description plus the per-queue metrics. */
+export type QueueListMetrics = {
+  /** Epoch milliseconds of the first sparkline bucket, aligned to the bucket grid. */
+  bucketStartMs: number;
+  /** Width of each sparkline bucket in milliseconds. */
+  bucketIntervalMs: number;
+  /** Metrics keyed by queue name; queues without a summary row have no entry. */
+  byQueue: Map<string, QueueListMetric>;
+};
+
+const SPARKLINE_POINTS = 48;
+
+/** Formats a Date as its UTC ISO string cut to seconds, with the "T" separator replaced by a space. */
+function formatClickhouseDateTime(date: Date): string {
+  const iso = date.toISOString();
+  const toSeconds = iso.slice(0, 19);
+  return toSeconds.replace("T", " ");
+}
+
+/** Passes finite numbers through; NaN and ±Infinity become null. */
+function finiteOrNull(value: number): number | null {
+  if (Number.isFinite(value)) {
+    return value;
+  }
+  return null;
+}
+
+export class QueueMetricsPresenter {
+  /**
+   * Per-queue metrics over a time range for a fixed set of queues (the visible list page),
+   * scoped to one ClickHouse query window so cost is independent of total queue count.
+   * Degrades to an empty map if ClickHouse is unavailable so the live list still renders.
+   *
+   * @param environment - Environment (and its organization/project) the metrics belong to.
+   * @param queueNames - Queues to load; an empty list returns the empty result immediately.
+   * @param from - Start of the requested range; snapped down to the bucket grid.
+   * @param to - End of the requested range; snapped up to the bucket grid.
+   * @returns The sparkline grid (start + interval) and metrics keyed by queue name. Only
+   *   queues present in the summary rows get an entry. Never throws: query failures are
+   *   logged and yield the empty result.
+   */
+  public async getQueueListMetrics({
+    environment,
+    queueNames,
+    from,
+    to,
+  }: {
+    environment: AuthenticatedEnvironment;
+    queueNames: string[];
+    from: Date;
+    to: Date;
+  }): Promise<QueueListMetrics> {
+    // Aim for SPARKLINE_POINTS buckets, but never narrower than one minute.
+    const rangeSeconds = Math.max(60, Math.round((to.getTime() - from.getTime()) / 1000));
+    const bucketSeconds = Math.max(60, Math.round(rangeSeconds / SPARKLINE_POINTS));
+    const bucketIntervalMs = bucketSeconds * 1000;
+    const gridStartSeconds =
+      Math.floor(Math.floor(from.getTime() / 1000) / bucketSeconds) * bucketSeconds;
+    const bucketStartMs = gridStartSeconds * 1000;
+    // End bound snaps up to the bucket grid so repeated loads within a bucket produce
+    // identical params and share ClickHouse query-cache entries.
+    const endMs = Math.ceil(to.getTime() / bucketIntervalMs) * bucketIntervalMs;
+    // Size the sparkline from the snapped window, not the raw range: when `from` is not on
+    // the grid the snapped window holds one more bucket than ceil(range / bucket), and the
+    // newest bucket would otherwise fall outside the sparkline and be discarded.
+    const numBuckets = Math.max(1, Math.round((endMs - bucketStartMs) / bucketIntervalMs));
+
+    const empty: QueueListMetrics = {
+      bucketStartMs,
+      bucketIntervalMs,
+      byQueue: new Map(),
+    };
+
+    if (queueNames.length === 0) {
+      return empty;
+    }
+
+    try {
+      const clickhouse = await clickhouseFactory.getClickhouseForOrganization(
+        environment.organizationId,
+        "query"
+      );
+
+      const ids = {
+        organizationId: environment.organizationId,
+        projectId: environment.projectId,
+        environmentId: environment.id,
+        queueNames,
+        startTime: formatClickhouseDateTime(new Date(bucketStartMs)),
+        endTime: formatClickhouseDateTime(new Date(endMs)),
+      };
+
+      const [summaryResult, sparklineResult] = await Promise.all([
+        clickhouse.queueMetrics.listSummary(ids),
+        clickhouse.queueMetrics.depthSparklines({ ...ids, bucketSeconds }),
+      ]);
+
+      const [summaryError, summaryRows] = summaryResult;
+      const [sparklineError, sparklineRows] = sparklineResult;
+
+      if (summaryError || sparklineError) {
+        logger.warn("QueueMetricsPresenter: clickhouse query failed", {
+          summaryError: summaryError?.message,
+          sparklineError: sparklineError?.message,
+        });
+        return empty;
+      }
+
+      // Bucket -> depth per queue, mapped onto the aligned grid and forward-filled.
+      const depthsByQueue = new Map<string, Map<number, number>>();
+      for (const row of sparklineRows ?? []) {
+        // Bucket timestamps arrive as "YYYY-MM-DD HH:MM:SS" and are parsed as UTC.
+        const bucketMs = Date.parse(row.bucket.replace(" ", "T") + "Z");
+        if (Number.isNaN(bucketMs)) continue;
+        const index = Math.round((bucketMs - bucketStartMs) / bucketIntervalMs);
+        if (index < 0 || index >= numBuckets) continue;
+        let byIndex = depthsByQueue.get(row.queue_name);
+        if (!byIndex) {
+          byIndex = new Map<number, number>();
+          depthsByQueue.set(row.queue_name, byIndex);
+        }
+        byIndex.set(index, row.depth);
+      }
+
+      const byQueue = new Map<string, QueueListMetric>();
+      for (const row of summaryRows ?? []) {
+        const byIndex = depthsByQueue.get(row.queue_name);
+        const sparkline: number[] = new Array(numBuckets);
+        // Carry the last observed depth forward; buckets before the first value read 0.
+        let last = 0;
+        for (let i = 0; i < numBuckets; i++) {
+          const value = byIndex?.get(i);
+          if (value !== undefined) last = value;
+          sparkline[i] = last;
+        }
+        byQueue.set(row.queue_name, {
+          p50WaitMs: finiteOrNull(row.p50_wait_ms),
+          p95WaitMs: finiteOrNull(row.p95_wait_ms),
+          peakQueued: row.peak_queued,
+          depthSparkline: sparkline,
+        });
+      }
+
+      return { bucketStartMs, bucketIntervalMs, byQueue };
+    } catch (error) {
+      logger.warn("QueueMetricsPresenter: failed to load queue metrics", {
+        error: error instanceof Error ? error.message : String(error),
+      });
+      return empty;
+    }
+  }
+}
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx
index 5fa237cee6e..d529fdf0d22 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx
@@ -38,6 +38,7 @@ import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstan
import { requireUser } from "~/services/session.server";
import { cn } from "~/utils/cn";
import { EnvironmentParamSchema } from "~/utils/pathBuilder";
+import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server";
import { QueryScopeSchema } from "~/v3/querySchemas";
import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route";
import { MetricWidget } from "../resources.metric";
@@ -50,6 +51,15 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
const user = await requireUser(request);
const { projectParam, organizationSlug, envParam, dashboardKey } = ParamSchema.parse(params);
+ // The built-in "queues" dashboard is part of the metrics UI (unlinked, but reachable by
+ // URL), so gate it per-org like the rest of the Queue Metrics view.
+ if (
+ dashboardKey === "queues" &&
+ !(await canAccessQueueMetricsUi({ userId: user.id, organizationSlug }))
+ ) {
+ throw new Response(undefined, { status: 404, statusText: "Not found" });
+ }
+
const project = await findProjectBySlug(organizationSlug, projectParam, user.id);
if (!project) {
throw new Response(undefined, {
@@ -376,6 +386,7 @@ export function MetricDashboard({
promptSlugs={prompts.length > 0 ? prompts : undefined}
operations={operations.length > 0 ? operations : undefined}
providers={providers.length > 0 ? providers : undefined}
+ fillGaps={widget.fillGaps}
config={widget.display}
organizationId={organization.id}
projectId={project.id}
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx
index 05b4f4d9b62..3188b5409a6 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx
@@ -3,7 +3,7 @@ import { Header3 } from "~/components/primitives/Headers";
import { Paragraph } from "~/components/primitives/Paragraph";
import SegmentedControl from "~/components/primitives/SegmentedControl";
import type { QueryScope } from "~/services/queryService.server";
-import { querySchemas } from "~/v3/querySchemas";
+import { visibleQuerySchemas } from "~/v3/querySchemas";
import { TryableCodeBlock } from "./TRQLGuideContent";
// Example queries for the Examples tab
@@ -211,14 +211,14 @@ LIMIT 20`,
},
];
-const tableOptions = querySchemas.map((s) => ({ label: s.name, value: s.name }));
+const tableOptions = visibleQuerySchemas.map((s) => ({ label: s.name, value: s.name }));
export function ExamplesContent({
onTryExample,
}: {
onTryExample: (query: string, scope: QueryScope) => void;
}) {
- const [selectedTable, setSelectedTable] = useState(querySchemas[0].name);
+ const [selectedTable, setSelectedTable] = useState(visibleQuerySchemas[0].name);
const filtered = exampleQueries.filter((e) => e.table === selectedTable);
return (
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx
index 285a1f68731..9fc6ec32923 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx
@@ -4,7 +4,7 @@ import { Badge } from "~/components/primitives/Badge";
import { CopyableText } from "~/components/primitives/CopyableText";
import { Paragraph } from "~/components/primitives/Paragraph";
import SegmentedControl from "~/components/primitives/SegmentedControl";
-import { querySchemas } from "~/v3/querySchemas";
+import { visibleQuerySchemas } from "~/v3/querySchemas";
function ColumnHelpItem({ col }: { col: ColumnSchema }) {
return (
@@ -43,11 +43,11 @@ function ColumnHelpItem({ col }: { col: ColumnSchema }) {
);
}
-const tableOptions = querySchemas.map((s) => ({ label: s.name, value: s.name }));
+const tableOptions = visibleQuerySchemas.map((s) => ({ label: s.name, value: s.name }));
export function TableSchemaContent() {
- const [selectedTable, setSelectedTable] = useState(querySchemas[0].name);
- const table = querySchemas.find((s) => s.name === selectedTable) ?? querySchemas[0];
+ const [selectedTable, setSelectedTable] = useState(visibleQuerySchemas[0].name);
+ const table = visibleQuerySchemas.find((s) => s.name === selectedTable) ?? visibleQuerySchemas[0];
return (
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx
new file mode 100644
index 00000000000..8b13afbcb87
--- /dev/null
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx
@@ -0,0 +1,490 @@
+import { Form, useNavigation } from "@remix-run/react";
+import { type ReactNode, useEffect, useMemo, useState } from "react";
+import { BigNumber } from "~/components/metrics/BigNumber";
+import { Badge } from "~/components/primitives/Badge";
+import { Button } from "~/components/primitives/Buttons";
+import { Callout } from "~/components/primitives/Callout";
+import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "~/components/primitives/Dialog";
+import { Input } from "~/components/primitives/Input";
+import { Paragraph } from "~/components/primitives/Paragraph";
+import {
+ Table,
+ TableBody,
+ TableCell,
+ TableHeader,
+ TableHeaderCell,
+ TableRow,
+} from "~/components/primitives/Table";
+import { SimpleTooltip } from "~/components/primitives/Tooltip";
+import { getSeriesColor } from "~/components/code/chartColors";
+import { QueueName } from "~/components/runs/v3/QueueName";
+import { type Environment } from "~/presenters/v3/EnvironmentQueuePresenter.server";
+import {
+ type QueueAllocation,
+ type QueueAllocationItem,
+} from "~/presenters/v3/QueueAllocationPresenter.server";
+import { cn } from "~/utils/cn";
+
+type Drafts = Record
;
+
+/**
+ * Allocation view for the Queues page: shows how the environment concurrency limit is
+ * split between queues and lets the user stage per-queue limit changes ("drafts"),
+ * review them in a dialog, and submit them.
+ *
+ * Drafts live only in local state, keyed by queue id. `allocation.queues` carries the
+ * saved limits the drafts are compared against; `environment` supplies the concurrency
+ * limit and burst factor.
+ */
+export function AllocationView({
+ allocation,
+ environment,
+}: {
+ allocation: QueueAllocation;
+ environment: Environment;
+}) {
+ // Staged (unsaved) limits keyed by queue id; a missing key means "no pending change".
+ const [drafts, setDrafts] = useState({});
+ const [reviewOpen, setReviewOpen] = useState(false);
+ const navigation = useNavigation();
+ // True for any in-flight navigation state ("submitting" or "loading"), not only this form.
+ const isSubmitting = navigation.state !== "idle";
+
+ const envLimit = environment.concurrencyLimit;
+ // Burst ceiling: the environment limit scaled by the burst factor, rounded to a whole number.
+ const burstLimit = Math.round(envLimit * environment.burstFactor);
+
+ // Close the review dialog whenever the navigation state becomes "loading" or "idle".
+ useEffect(() => {
+ if (navigation.state === "loading" || navigation.state === "idle") {
+ setReviewOpen(false);
+ }
+ }, [navigation.state]);
+
+ // After an apply revalidates the loader, drop drafts that now match the saved limits.
+ // NOTE(review): this always returns a fresh object, so state is updated on every
+ // `allocation` change even when no draft was pruned.
+ useEffect(() => {
+ setDrafts((prev) => {
+ const next = { ...prev };
+ for (const queue of allocation.queues) {
+ if (next[queue.id] !== undefined && next[queue.id] === queue.limit) {
+ delete next[queue.id];
+ }
+ }
+ return next;
+ });
+ }, [allocation]);
+
+ // Effective limit for a queue: the staged draft when present, otherwise the saved limit.
+ // `??` only falls through on null/undefined, so a staged 0 is kept.
+ const draftLimit = (queue: QueueAllocationItem): number | null => drafts[queue.id] ?? queue.limit;
+
+ // Sum of effective limits, each capped at the environment limit. Queues whose effective
+ // limit is null contribute nothing to the total.
+ const draftAllocated = allocation.queues.reduce((sum, queue) => {
+ const limit = draftLimit(queue);
+ return limit === null ? sum : sum + Math.min(limit, envLimit);
+ }, 0);
+
+ // Queues whose staged limit differs from the saved one.
+ // NOTE(review): this array is rebuilt on every render, so the useMemo for `changesPayload`
+ // below (which lists it as a dependency) recomputes on every render.
+ const changes = allocation.queues.filter(
+ (queue) => drafts[queue.id] !== undefined && drafts[queue.id] !== queue.limit
+ );
+
+ const unlimitedCount = allocation.queues.filter((queue) => draftLimit(queue) === null).length;
+ const allocationPct = envLimit > 0 ? Math.round((draftAllocated / envLimit) * 100) : 0;
+ const overAllocated = draftAllocated > envLimit;
+
+ // Stage a limit typed by the user:
+ // - blank input clears the draft (the queue falls back to its saved limit);
+ // - input that does not parse to a non-negative integer is ignored;
+ // - a value equal to the saved limit clears the draft instead of storing a no-op.
+ // NOTE(review): parseInt accepts a numeric prefix ("12abc" -> 12) and truncates decimals.
+ const setDraft = (queue: QueueAllocationItem, value: string) => {
+ setDrafts((prev) => {
+ const next = { ...prev };
+ if (value.trim() === "") {
+ delete next[queue.id];
+ return next;
+ }
+ const parsed = parseInt(value, 10);
+ if (!Number.isFinite(parsed) || parsed < 0) return prev;
+ if (parsed === queue.limit) {
+ delete next[queue.id];
+ } else {
+ next[queue.id] = parsed;
+ }
+ return next;
+ });
+ };
+
+ // JSON array of { friendlyId, limit } for the changed queues; queue.id is sent as friendlyId.
+ const changesPayload = useMemo(
+ () =>
+ JSON.stringify(changes.map((queue) => ({ friendlyId: queue.id, limit: drafts[queue.id] }))),
+ [changes, drafts]
+ );
+
+ // Series colour per queue, assigned by position in the loader's queue order.
+ const colorByQueue = useMemo(() => {
+ const map = new Map();
+ allocation.queues.forEach((queue, i) => map.set(queue.id, getSeriesColor(i)));
+ return map;
+ }, [allocation.queues]);
+ // Falls back to a neutral grey for ids that are not in the map.
+ const colorFor = (id: string) => colorByQueue.get(id) ?? "#878C99";
+
+ // Busiest first: the queues you'd rebalance are the ones under load. Colors stay
+ // keyed to the loader order so they don't shift as counts change.
+ const tableQueues = useMemo(
+ () => [...allocation.queues].sort((a, b) => b.running + b.queued - (a.running + a.queued)),
+ [allocation.queues]
+ );
+
+ return (
+
+
+ 1 ? `bursts up to ${burstLimit}` : undefined}
+ suffixClassName="text-text-dimmed"
+ />
+
+ 0
+ ? `${unlimitedCount} without a limit (can use up to ${envLimit})`
+ : "all have limits"
+ }
+ suffixClassName="text-text-dimmed"
+ />
+
+
+
+
+ {overAllocated && (
+
+ The queue limits add up to more than the environment limit, so queues will compete for
+ concurrency when the environment saturates. Reduce limits to guarantee each queue its
+ allocation.
+
+ )}
+
+ {allocation.truncated && (
+
+ Showing the first {allocation.queues.length} of {allocation.totalQueues} queues.
+ Allocation totals only include the queues shown.
+
+ )}
+
+
+
setDrafts({})}
+ disabled={changes.length === 0 || isSubmitting}
+ >
+ Reset changes
+
+
+
+
+
+ Review {changes.length} change{changes.length === 1 ? "" : "s"}…
+
+
+
+ Apply queue limits
+
+
+
+
+ Queue
+ Current
+ New
+
+
+
+ {changes.map((queue) => (
+
+
+
+
+ {queue.limit ?? "–"}
+ {drafts[queue.id]}
+
+ ))}
+
+
+
+
+ Limits apply immediately and are set as overrides, so they survive deploys until
+ removed.
+
+
+
+
+
+
+
+
+ );
+}
+
+const MAX_BAR_SEGMENTS = 24;
+
+/**
+ * Capacity bar for the allocation view.
+ *
+ * Only queues with a positive numeric effective limit get a segment. They are ordered by
+ * limit, largest first; the first MAX_BAR_SEGMENTS are drawn individually and the
+ * remainder is folded into one aggregate (`restTotal` / `restRunning`).
+ *
+ * Returns null when the axis scale would be 0.
+ */
+function AllocationBar({
+ queues,
+ draftLimit,
+ envLimit,
+ burstLimit,
+ draftAllocated,
+ colorFor,
+}: {
+ queues: QueueAllocationItem[];
+ draftLimit: (queue: QueueAllocationItem) => number | null;
+ envLimit: number;
+ burstLimit: number;
+ draftAllocated: number;
+ colorFor: (id: string) => string;
+}) {
+ // Pair each queue with its effective limit, drop null / zero limits, sort descending by limit.
+ const limited = queues
+ .map((queue) => ({ queue, limit: draftLimit(queue) }))
+ .filter(
+ (entry): entry is { queue: QueueAllocationItem; limit: number } =>
+ typeof entry.limit === "number" && entry.limit > 0
+ )
+ .sort((a, b) => b.limit - a.limit);
+
+ const top = limited.slice(0, MAX_BAR_SEGMENTS);
+ const rest = limited.slice(MAX_BAR_SEGMENTS);
+ // Combined limit of the queues that did not get their own segment.
+ const restTotal = rest.reduce((sum, entry) => sum + entry.limit, 0);
+ // Combined running count of those queues, each capped at its own limit.
+ const restRunning = rest.reduce(
+ (sum, entry) => sum + Math.min(entry.queue.running, entry.limit),
+ 0
+ );
+
+ const hasBurst = burstLimit > envLimit;
+ // The axis runs to the burst ceiling: allocations are guaranteed up to the env
+ // limit, and everything between the limit and burst is shared overflow headroom.
+ const scale = Math.max(draftAllocated, envLimit, burstLimit);
+ if (scale === 0) return null;
+
+ // Part of the environment limit not claimed by any queue limit (never negative).
+ const free = Math.max(0, envLimit - draftAllocated);
+ const limitMarkerPct = (envLimit / scale) * 100;
+ // `scale` is at least `burstLimit`, so the Math.min below always yields `burstLimit`.
+ // The result is negative when burstLimit < envLimit; the burst zone is only rendered
+ // under `hasBurst`.
+ const burstZoneWidthPct = ((Math.min(burstLimit, scale) - envLimit) / scale) * 100;
+
+ return (
+
+
+
+ {hasBurst && (
+
+ }
+ content={`Shared burst headroom: beyond the environment limit, queues can burst up to ${burstLimit} combined`}
+ disableHoverableContent
+ />
+ )}
+
+ {top.map((entry) => (
+
+ }
+ />
+ ))}
+ {restTotal > 0 && (
+
+ )}
+
+
+
+
+
+
+ {draftAllocated} allocated
+ {free > 0 ? ` · ${free} unallocated` : ""}
+
+ {hasBurst ? (
+ <>
+
+ Environment limit {envLimit}
+
+ Burst {burstLimit}
+ >
+ ) : (
+ Environment limit {envLimit}
+ )}
+
+
+ );
+}
+
+/**
+ * Tooltip body for one queue's segment: running vs. limit, queued count, and the
+ * queue's share of the environment limit, plus a "Paused" marker when `queue.paused`.
+ */
+function QueueSegmentTooltip({
+ queue,
+ limit,
+ envLimit,
+ color,
+}: {
+ queue: QueueAllocationItem;
+ limit: number;
+ envLimit: number;
+ color: string;
+}) {
+ // Both percentages are rounded to whole numbers and fall back to 0 when the divisor is not positive.
+ const utilizationPct = limit > 0 ? Math.round((queue.running / limit) * 100) : 0;
+ const sharePct = envLimit > 0 ? Math.round((limit / envLimit) * 100) : 0;
+ return (
+
+
+
+
+ {queue.paused && (
+
+ Paused
+
+ )}
+
+
+ Running
+
+ {queue.running} of {limit} ({utilizationPct}%)
+
+ Queued
+ {queue.queued}
+ Allocation
+
+ {sharePct}% of the environment limit
+
+
+
+ );
+}
+
+/**
+ * One queue's slice of the capacity bar: dim fill = allocation, solid fill = current usage.
+ *
+ * The usage fill is only rendered when `usagePct` is greater than 0, and `tooltip` is the
+ * content shown for the segment.
+ * NOTE(review): `widthPct` / `usagePct` are presumably percentages of the bar and of the
+ * segment respectively — the markup that consumes them is not visible here; confirm.
+ */
+function BarSegment({
+ color,
+ widthPct,
+ usagePct,
+ tooltip,
+}: {
+ color: string;
+ widthPct: number;
+ usagePct: number;
+ tooltip: ReactNode;
+}) {
+ return (
+
+ {usagePct > 0 && (
+
+ )}
+
+ }
+ content={tooltip}
+ disableHoverableContent
+ />
+ );
+}
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx
index 877b1235a97..24fe2212953 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx
@@ -7,11 +7,11 @@ import {
RectangleStackIcon,
} from "@heroicons/react/20/solid";
import { DialogClose } from "@radix-ui/react-dialog";
-import { Form, useNavigation, type MetaFunction } from "@remix-run/react";
+import { Form, Link, useNavigation, type MetaFunction } from "@remix-run/react";
import { type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/server-runtime";
import type { QueueItem } from "@trigger.dev/core/v3/schemas";
import type { RuntimeEnvironmentType } from "@trigger.dev/database";
-import { useEffect, useState } from "react";
+import { type ReactNode, useEffect, useMemo, useState } from "react";
import { typedjson, useTypedLoaderData } from "remix-typedjson";
import { z } from "zod";
import { ConcurrencyIcon } from "~/assets/icons/ConcurrencyIcon";
@@ -21,7 +21,6 @@ import { AdminDebugTooltip } from "~/components/admin/debugTooltip";
import { QueuesHasNoTasks } from "~/components/BlankStatePanels";
import { environmentFullTitle } from "~/components/environments/EnvironmentLabel";
import { PageBody, PageContainer } from "~/components/layout/AppLayout";
-import { BigNumber } from "~/components/metrics/BigNumber";
import { Badge } from "~/components/primitives/Badge";
import { Button, LinkButton, type ButtonVariant } from "~/components/primitives/Buttons";
import { Callout } from "~/components/primitives/Callout";
@@ -55,6 +54,7 @@ import {
import { QueueName } from "~/components/runs/v3/QueueName";
import { env } from "~/env.server";
import { useAutoRevalidate } from "~/hooks/useAutoRevalidate";
+import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider";
import { useEnvironment } from "~/hooks/useEnvironment";
import { useOrganization } from "~/hooks/useOrganizations";
import { useProject } from "~/hooks/useProject";
@@ -64,6 +64,24 @@ import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server";
import { getUserById } from "~/models/user.server";
import { EnvironmentQueuePresenter } from "~/presenters/v3/EnvironmentQueuePresenter.server";
import { QueueListPresenter } from "~/presenters/v3/QueueListPresenter.server";
+import {
+ QueueMetricsPresenter,
+ type QueueListMetric,
+} from "~/presenters/v3/QueueMetricsPresenter.server";
+import * as Ariakit from "@ariakit/react";
+import { AppliedFilter } from "~/components/primitives/AppliedFilter";
+import { SelectItem, SelectPopover, SelectProvider } from "~/components/primitives/Select";
+import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters";
+import { useSearchParams } from "~/hooks/useSearchParam";
+import { parseFiniteInt } from "~/utils/searchParams";
+import { UsageSparkline } from "~/components/primitives/UsageSparkline";
+import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis";
+import { Chart, type ChartConfig } from "~/components/primitives/charts/ChartCompound";
+import {
+ useMetricResourceQuery,
+ type MetricResourceTimeRange,
+} from "~/hooks/useMetricResourceQuery";
+import { logger } from "~/services/logger.server";
import { requireUserId } from "~/services/session.server";
import { cn } from "~/utils/cn";
import { ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT } from "~/utils/environmentPauseSource";
@@ -72,18 +90,36 @@ import {
docsPath,
EnvironmentParamSchema,
v3BillingPath,
+ v3QueuePath,
v3RunsPath,
} from "~/utils/pathBuilder";
import { concurrencySystem } from "~/v3/services/concurrencySystemInstance.server";
import { PauseEnvironmentService } from "~/v3/services/pauseEnvironment.server";
import { PauseQueueService } from "~/v3/services/pauseQueue.server";
import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route";
+import { BigNumber } from "~/components/metrics/BigNumber";
+import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server";
+import { QueueAllocationPresenter } from "~/presenters/v3/QueueAllocationPresenter.server";
+import { TabButton, TabContainer } from "~/components/primitives/Tabs";
+import { AllocationView } from "./AllocationView";
+// Query-string parameters accepted by the Queues page loader.
const SearchParamsSchema = z.object({
 query: z.string().optional(),
 page: z.coerce.number().min(1).default(1),
+ period: z.string().optional(),
+ from: z.string().optional(),
+ to: z.string().optional(),
+ view: z.string().optional(),
+ // The loader validates with `.parse`, which throws on bad input. An unrecognised `sort`
+ // (hand-edited or stale URL) should fall back to the default ordering rather than fail
+ // the whole page, so invalid values are coerced to undefined.
+ sort: z.enum(["busiest", "queued", "name"]).optional().catch(undefined),
});
+const AllocationChangesSchema = z
+ .array(z.object({ friendlyId: z.string(), limit: z.number().int().min(0) }))
+ .min(1)
+ .max(200);
+
+const QUEUE_METRICS_DEFAULT_PERIOD = "1d";
+
export const meta: MetaFunction = () => {
return [
{
@@ -97,7 +133,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params);
const url = new URL(request.url);
- const { page, query } = SearchParamsSchema.parse(Object.fromEntries(url.searchParams));
+ const { page, query, period, from, to, view, sort } = SearchParamsSchema.parse(
+ Object.fromEntries(url.searchParams)
+ );
const project = await findProjectBySlug(organizationSlug, projectParam, userId);
if (!project) {
@@ -115,22 +153,82 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
});
}
+ // Per-org gate for the metrics UI. When off, this org gets the classic Queues page and
+ // no metrics query fires.
+ const queueMetricsUiEnabled = await canAccessQueueMetricsUi({ userId, organizationSlug });
+
try {
const queueListPresenter = new QueueListPresenter();
const queues = await queueListPresenter.call({
environment,
query,
page,
+ // Relevance ordering rides the metrics pipeline, so it is part of the gated UI.
+ sort: queueMetricsUiEnabled ? (sort ?? "busiest") : "name",
});
const environmentQueuePresenter = new EnvironmentQueuePresenter();
const autoReloadPollIntervalMs = env.QUEUES_AUTORELOAD_POLL_INTERVAL_MS;
+ // Per-queue list metrics (Delay p95 + backlog sparkline columns) are SSR'd with the table.
+ // The environment header tiles are fetched client-side per card (see QueueEnvMetricTile) so a
+ // slow ClickHouse query never blocks the queues list from rendering.
+ let metrics: {
+ bucketStartMs: number;
+ bucketIntervalMs: number;
+ byQueue: Record;
+ } | null = null;
+
+ const allocationView = queueMetricsUiEnabled && view === "allocation";
+
+ if (queueMetricsUiEnabled && queues.success && !allocationView) {
+ // Metrics are additive observability; a ClickHouse hiccup must not take down queue
+ // management. Fail open to metrics: null instead of bubbling to the page-level 400.
+ try {
+ const presenter = new QueueMetricsPresenter();
+ const queueNames = queues.queues.map((q) =>
+ q.type === "task" ? `task/${q.name}` : q.name
+ );
+ const timeRange = timeFilterFromTo({
+ period,
+ from: parseFiniteInt(from),
+ to: parseFiniteInt(to),
+ defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD,
+ });
+ const queueMetrics =
+ queueNames.length > 0
+ ? await presenter.getQueueListMetrics({
+ environment,
+ queueNames,
+ from: timeRange.from,
+ to: timeRange.to,
+ })
+ : null;
+ if (queueMetrics) {
+ metrics = {
+ bucketStartMs: queueMetrics.bucketStartMs,
+ bucketIntervalMs: queueMetrics.bucketIntervalMs,
+ byQueue: Object.fromEntries(queueMetrics.byQueue),
+ };
+ }
+ } catch (error) {
+ logger.warn("Queue list metrics unavailable, rendering without them", { error });
+ }
+ }
+
+ const allocation =
+ allocationView && queues.success
+ ? await new QueueAllocationPresenter().call({ environment })
+ : null;
+
return typedjson({
...queues,
environment: await environmentQueuePresenter.call(environment),
autoReloadPollIntervalMs,
+ metrics,
+ allocation,
+ queueMetricsUiEnabled,
});
} catch (error) {
console.error(error);
@@ -293,12 +391,61 @@ export const action = async ({ request, params }: ActionFunctionArgs) => {
return redirectWithSuccessMessage(redirectPath, request, "Queue concurrency limit reset");
}
+ case "allocation-apply": {
+ if (!(await canAccessQueueMetricsUi({ userId, organizationSlug }))) {
+ return redirectWithErrorMessage(redirectPath, request, "Not available");
+ }
+
+ let changes;
+ try {
+ changes = AllocationChangesSchema.parse(JSON.parse(String(formData.get("changes"))));
+ } catch {
+ return redirectWithErrorMessage(redirectPath, request, "Invalid changes");
+ }
+
+ const user = await getUserById(userId);
+ if (!user) {
+ return redirectWithErrorMessage(redirectPath, request, "User not found");
+ }
+
+ let failed = 0;
+ for (const change of changes) {
+ const result = await concurrencySystem.queues.overrideQueueConcurrencyLimit(
+ environment,
+ change.friendlyId,
+ change.limit,
+ user
+ );
+ if (!result.isOk()) failed++;
+ }
+
+ if (failed > 0) {
+ return redirectWithErrorMessage(
+ redirectPath,
+ request,
+ `Failed to update ${failed} of ${changes.length} queue limits`
+ );
+ }
+
+ return redirectWithSuccessMessage(
+ redirectPath,
+ request,
+ `Updated ${changes.length} queue limit${changes.length === 1 ? "" : "s"}`
+ );
+ }
default:
return redirectWithErrorMessage(redirectPath, request, "Something went wrong");
}
};
export default function Page() {
+ // Per-org flag decides which whole page renders. Off => the classic Queues page,
+ // byte-for-byte the pre-metrics UI. Each branch is its own component (own hooks).
+ const { queueMetricsUiEnabled } = useTypedLoaderData();
+ return queueMetricsUiEnabled ? : ;
+}
+
+function QueuesWithMetricsView() {
const {
environment,
queues,
@@ -308,24 +455,28 @@ export default function Page() {
totalQueues,
hasFilters,
autoReloadPollIntervalMs,
+ metrics,
+ allocation,
} = useTypedLoaderData();
+ const metricsByQueue = metrics?.byQueue ?? {};
+
const organization = useOrganization();
const project = useProject();
const env = useEnvironment();
const plan = useCurrentPlan();
+ const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number;
- useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true });
-
- const limitStatus =
- environment.running === environment.concurrencyLimit * environment.burstFactor
- ? "limit"
- : environment.running > environment.concurrencyLimit
- ? "burst"
- : "within";
+ // The header tiles fetch client-side with the same period/from/to the TimeFilter writes.
+ const { value, replace } = useSearchParams();
+ const timeRange = {
+ period: value("period") ?? null,
+ from: value("from") ?? null,
+ to: value("to") ?? null,
+ };
+ const view = value("view") === "allocation" ? ("allocation" as const) : ("queues" as const);
- const limitClassName =
- limitStatus === "burst" ? "text-warning" : limitStatus === "limit" ? "text-error" : undefined;
+ useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true });
return (
@@ -333,6 +484,30 @@ export default function Page() {
+ {plan ? (
+ plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? (
+
+ Increase limit
+
+ ) : (
+
+ Increase limit
+
+ )
+ ) : null}
+ {environment.runsEnabled && env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? (
+
+ ) : null}
-
-
-
paused : undefined}
- animate
- accessory={
-
- {environment.runsEnabled &&
- env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? (
-
- ) : null}
-
-
- }
- valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"}
- compactThreshold={1000000}
- />
-
- Including {environment.running - environment.concurrencyLimit} burst runs{" "}
-
-
- ) : limitStatus === "limit" ? (
- "At concurrency limit"
- ) : undefined
- }
- accessory={
-
- }
- compactThreshold={1000000}
- />
- 1 ? (
-
- Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "}
-
-
- ) : undefined
- }
- accessory={
- plan ? (
- plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? (
-
- Increase limit
-
- ) : (
-
- Increase limit
-
- )
- ) : null
- }
- />
+
+
+ {QUEUE_HEADER_TILES.map((tile) => (
+ 1
+ ? [
+ {
+ y: Math.round(environment.burstFactor * 100),
+ label: `Burst ${Math.round(
+ environment.concurrencyLimit * environment.burstFactor
+ )}`,
+ },
+ ]
+ : []),
+ ]
+ : undefined
+ }
+ />
+ ))}
{success ? (
+
+ replace({ view: undefined })}
+ >
+ Queues
+
+ replace({ view: "allocation", page: undefined })}
+ >
+ Allocation
+
+
+ ) : (
+
+ )}
+
+ {success && view === "allocation" ? (
+ allocation ? (
+
+ ) : (
+
+
+
+ )
+ ) : success ? (
-
+
+
+
+
+
Limited by
+ Health
+
+ Delay p95
+
+ Backlog
Pause/resume
@@ -518,11 +669,19 @@ export default function Page() {
const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${
queue.name
}`;
+ const queueMetric = metricsByQueue[queueFilterableName];
return (
-
+
+
+
{queue.concurrency?.overriddenAt ? (
+
+
+
+
+ {queueMetric && queueMetric.p95WaitMs !== null ? (
+ = 60_000
+ ? "text-warning"
+ : "text-text-bright"
+ )}
+ >
+ {formatWaitMs(queueMetric.p95WaitMs)}
+
+ ) : (
+ –
+ )}
+
+
+ v.toLocaleString()}
+ />
+
-
+
{hasFilters
@@ -1059,6 +1253,709 @@ export function QueueFilters() {
return ;
}
+const QUEUE_SORT_OPTIONS = [
+ { value: "busiest", label: "Busiest" },
+ { value: "queued", label: "Backlog" },
+ { value: "name", label: "Name" },
+] as const;
+
+type QueueSortValue = (typeof QUEUE_SORT_OPTIONS)[number]["value"];
+
+/**
+ * Sort selector for the queue list, backed by the `sort` search param.
+ *
+ * Choosing "busiest" removes the param from the URL (it is the default); any other choice
+ * writes it. Every change also clears `page`.
+ */
+function QueueSortFilter() {
+ const { value, replace } = useSearchParams();
+ // NOTE(review): the search param is cast, not validated. An unknown value leaves `sort`
+ // holding the raw string; only the label below falls back to "Busiest".
+ const sort: QueueSortValue = (value("sort") as QueueSortValue) ?? "busiest";
+ const label = QUEUE_SORT_OPTIONS.find((option) => option.value === sort)?.label ?? "Busiest";
+
+ return (
+
+ replace({ sort: next === "busiest" ? undefined : (next as string), page: undefined })
+ }
+ >
+ }>
+
+
+
+ {QUEUE_SORT_OPTIONS.map((option) => (
+
+ {option.label}
+
+ ))}
+
+
+ );
+}
+
+type MetricTileRow = Record;
+
+/** Static definition of one environment header tile on the Queues page. */
+type QueueHeaderTile = {
+ /** Tile key; also used as the series key in the tile's chart points and chart config. */
+ id: string;
+ /** Series label in the tile's chart config. */
+ label: string;
+ /** Series colour in the tile's chart config. */
+ color: string;
+ /** Query text handed to useMetricResourceQuery for this tile. */
+ query: string;
+ /** Formats a single bucket's value in the chart tooltip. */
+ formatValue?: (value: number) => string;
+ /**
+ * Turns the query rows into the per-bucket series (`sparkline`) and the headline scalar
+ * (`total`), with optional formatting and styling for that scalar.
+ */
+ derive: (rows: MetricTileRow[]) => {
+ sparkline: number[];
+ total: number;
+ formatTotal?: (total: number) => string;
+ totalClassName?: string;
+ };
+};
+
+/**
+ * Coerces a raw metric cell to a finite number.
+ *
+ * Strings are converted with `Number(...)`; null, non-numeric strings, NaN and
+ * +/-Infinity all collapse to 0.
+ */
+function tileNumber(value: number | string | null): number {
+  const parsed = Number(value);
+  if (!Number.isFinite(parsed)) {
+    return 0;
+  }
+  return parsed;
+}
+
+/**
+ * Parses a bucket timestamp cell into epoch milliseconds, reading it as UTC.
+ *
+ * The first space is replaced with "T" and a trailing "Z" is appended when missing before
+ * the text is handed to `Date.parse`. Unparseable input yields NaN.
+ */
+function tileTimeToMs(value: number | string | null): number {
+  let iso = String(value).replace(" ", "T");
+  if (!iso.endsWith("Z")) {
+    iso = `${iso}Z`;
+  }
+  return Date.parse(iso);
+}
+
+// The environment header tiles. Each tile runs its own TRQL query from the browser
+// (resources.metric) with fillGaps, mirroring the metrics dashboard widgets: the gauges
+// (saturation inputs, backlog) carry, counters/p95 zero-fill.
+const QUEUE_HEADER_TILES: QueueHeaderTile[] = [
+  {
+    id: "saturation",
+    label: "Env saturation",
+    color: "#6366F1",
+    query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+    formatValue: (percent) => `${percent}%`,
+    // Per bucket: running as a whole-number percentage of the limit (0 when there is no
+    // positive limit). Headline: the highest bucket.
+    derive: (rows) => {
+      const percentages = rows.map((row) => {
+        const envLimit = tileNumber(row.env_limit);
+        if (envLimit <= 0) return 0;
+        return Math.round((tileNumber(row.used) / envLimit) * 100);
+      });
+      let highest = 0;
+      for (const pct of percentages) highest = Math.max(highest, pct);
+      return {
+        sparkline: percentages,
+        total: highest,
+        formatTotal: (pct) => `${pct}% peak`,
+      };
+    },
+  },
+  {
+    id: "backlog",
+    label: "Backlog",
+    color: "#A78BFA",
+    query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+    // Per bucket: queued count. Headline: the highest bucket.
+    derive: (rows) => {
+      const queuedPerBucket = rows.map((row) => tileNumber(row.queued));
+      let highest = 0;
+      for (const queued of queuedPerBucket) highest = Math.max(highest, queued);
+      return {
+        sparkline: queuedPerBucket,
+        total: highest,
+        formatTotal: (queued) => `${queued.toLocaleString()} peak`,
+      };
+    },
+  },
+  {
+    id: "p95",
+    label: "Scheduling delay p95",
+    color: "#F59E0B",
+    query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+    formatValue: formatWaitMs,
+    // Per bucket: p95 wait in ms. Headline: the worst bucket, flagged at one minute or more.
+    derive: (rows) => {
+      const delays = rows.map((row) => tileNumber(row.p95));
+      let slowest = 0;
+      for (const delay of delays) slowest = Math.max(slowest, delay);
+      const flagged = slowest >= 60_000;
+      return {
+        sparkline: delays,
+        total: slowest,
+        formatTotal: (delay) => (delay > 0 ? formatWaitMs(delay) : "–"),
+        totalClassName: flagged ? "text-warning" : undefined,
+      };
+    },
+  },
+  {
+    id: "throttled",
+    label: "Throttled",
+    color: "#F59E0B",
+    query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM env_metrics\nGROUP BY t\nORDER BY t`,
+    // Per bucket: throttled count. Headline: the sum over all buckets, flagged when non-zero.
+    derive: (rows) => {
+      const counts = rows.map((row) => tileNumber(row.throttled));
+      let sum = 0;
+      for (const count of counts) sum += count;
+      return {
+        sparkline: counts,
+        total: sum,
+        totalClassName: sum > 0 ? "text-warning" : undefined,
+      };
+    },
+  },
+];
+
+type TileTimeRange = MetricResourceTimeRange;
+
+/**
+ * One environment header tile. Runs `tile.query` through useMetricResourceQuery for the
+ * current organization / project / environment and the given `timeRange` (gap-filled,
+ * defaulting to QUEUE_METRICS_DEFAULT_PERIOD), derives the series and headline value with
+ * `tile.derive`, and renders one of four states: loading, "Unable to load metrics", the
+ * chart, or "No activity".
+ *
+ * `referenceLines` are optional horizontal markers ({ y, label }) for the chart.
+ */
+function QueueEnvMetricTile({
+ tile,
+ timeRange,
+ referenceLines,
+}: {
+ tile: QueueHeaderTile;
+ timeRange: TileTimeRange;
+ referenceLines?: Array<{ y: number; label?: string }>;
+}) {
+ const organization = useOrganization();
+ const project = useProject();
+ const environment = useEnvironment();
+
+ const { rows, isLoading, showLoading, failed } = useMetricResourceQuery(tile.query, {
+ organizationId: organization.id,
+ projectId: project.id,
+ environmentId: environment.id,
+ timeRange,
+ defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD,
+ fillGaps: true,
+ });
+
+ const { sparkline, total, formatTotal, totalClassName } = tile.derive(rows);
+
+ // Same point shape the full-size charts use so the shared axis/tooltip helpers apply.
+ // Each row's `t` is paired with the derived value at the same index; rows whose
+ // timestamp fails to parse are dropped.
+ // NOTE(review): `data` is a new array on every render, so the useMemo keyed on [data]
+ // below recomputes every render.
+ const data = rows
+ .map((r, i) => ({ bucket: tileTimeToMs(r.t), [tile.id]: sparkline[i] ?? 0 }))
+ .filter((p) => Number.isFinite(p.bucket));
+
+ const chartConfig = useMemo(
+ () => ({ [tile.id]: { label: tile.label, color: tile.color } }),
+ [tile.id, tile.label, tile.color]
+ );
+
+ const { tooltipLabelFormatter } = useMemo(() => buildActivityTimeAxis(data), [data]);
+ // A series of only zeros counts as "no activity" rather than an empty chart.
+ const hasData = data.length > 0 && sparkline.some((v) => v > 0);
+
+ return (
+
+ ) : failed ? undefined : formatTotal ? (
+ formatTotal(total)
+ ) : (
+ total.toLocaleString()
+ )
+ }
+ valueClassName={totalClassName}
+ >
+
+ {showLoading ? (
+
+ ) : failed ? (
+
+ Unable to load metrics
+
+ ) : hasData ? (
+
+
+
+
+
+ ) : (
+ No activity
+ )}
+
+ );
+}
+
+/**
+ * Presentational frame for a header tile: a `label`, an optional headline `value`, and
+ * the tile body passed as `children`. The value element is rendered only when `value` is
+ * not undefined.
+ */
+function HeaderTile({
+ label,
+ value,
+ valueClassName,
+ children,
+}: {
+ label: ReactNode;
+ value?: ReactNode;
+ valueClassName?: string;
+ children: ReactNode;
+}) {
+ return (
+
+
+ {label}
+ {value !== undefined ? (
+
+ {value}
+
+ ) : null}
+
+ {children}
+
+ );
+}
+
+/**
+ * Status badge for a queue row. The first matching rule wins:
+ *   1. paused                          -> "Paused"
+ *   2. running >= limit and queued > 0 -> "At capacity"
+ *   3. queued > 0                      -> "Backlogged"
+ *   4. running > 0                     -> "Active"
+ *   5. otherwise                       -> "Idle"
+ */
+function QueueHealthBadge({
+ paused,
+ running,
+ queued,
+ limit,
+}: {
+ paused: boolean;
+ running: number;
+ queued: number;
+ limit: number;
+}) {
+ if (paused) {
+ return (
+
+ Paused
+
+ );
+ }
+ // At the limit with work still waiting.
+ if (running >= limit && queued > 0) {
+ return (
+
+ At capacity
+
+ );
+ }
+ // Work is waiting although the queue is below its limit.
+ if (queued > 0) {
+ return (
+
+ Backlogged
+
+ );
+ }
+ if (running > 0) {
+ return (
+
+ Active
+
+ );
+ }
+ return (
+
+ Idle
+
+ );
+}
+
+/**
+ * Format a millisecond duration using the largest unit that fits: whole milliseconds
+ * below one second, otherwise seconds, minutes or hours with one decimal place.
+ */
+function formatWaitMs(ms: number): string {
+  if (ms < 1000) {
+    return `${Math.round(ms)}ms`;
+  }
+
+  // [exclusive upper bound in ms, divisor, suffix], smallest unit first.
+  const scaledUnits: Array<[number, number, string]> = [
+    [60_000, 1000, "s"],
+    [3_600_000, 60_000, "m"],
+  ];
+  for (const [below, divisor, suffix] of scaledUnits) {
+    if (ms < below) {
+      return `${(ms / divisor).toFixed(1)}${suffix}`;
+    }
+  }
+
+  // Everything else (one hour and up) is reported in hours.
+  return `${(ms / 3_600_000).toFixed(1)}h`;
+}
+
+// Classic Queues page, restored verbatim from before the Queue Metrics feature. Rendered
+// when queueMetricsUiEnabled is off so a gated org sees exactly the pre-metrics UI.
+// Reads everything from the route loader and auto-revalidates on the loader-provided
+// interval and on window focus.
+function ClassicQueuesView() {
+ const {
+ environment,
+ queues,
+ success,
+ pagination,
+ code,
+ totalQueues,
+ hasFilters,
+ autoReloadPollIntervalMs,
+ } = useTypedLoaderData();
+
+ const organization = useOrganization();
+ const project = useProject();
+ const env = useEnvironment();
+ const plan = useCurrentPlan();
+
+ useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true });
+
+ // "limit" only when running equals concurrencyLimit * burstFactor exactly; otherwise
+ // "burst" when running exceeds the base concurrency limit; otherwise "within".
+ // NOTE(review): because the first test is strict equality, a running count above the
+ // burst ceiling would classify as "burst" — confirm that cannot happen upstream.
+ const limitStatus =
+ environment.running === environment.concurrencyLimit * environment.burstFactor
+ ? "limit"
+ : environment.running > environment.concurrencyLimit
+ ? "burst"
+ : "within";
+
+ const limitClassName =
+ limitStatus === "burst" ? "text-warning" : limitStatus === "limit" ? "text-error" : undefined;
+
+ return (
+
+
+
+
+
+
+ Queues docs
+
+
+
+
+
+
+
paused : undefined}
+ animate
+ accessory={
+
+ {environment.runsEnabled &&
+ env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? (
+
+ ) : null}
+
+
+ }
+ valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"}
+ compactThreshold={1000000}
+ />
+
+ Including {environment.running - environment.concurrencyLimit} burst runs{" "}
+
+
+ ) : limitStatus === "limit" ? (
+ "At concurrency limit"
+ ) : undefined
+ }
+ accessory={
+
+ }
+ compactThreshold={1000000}
+ />
+ 1 ? (
+
+ Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "}
+
+
+ ) : undefined
+ }
+ accessory={
+ plan ? (
+ plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? (
+
+ Increase limit
+
+ ) : (
+
+ Increase limit
+
+ )
+ ) : null
+ }
+ />
+
+
+ {success ? (
+
+
+
+
+
+ Name
+ Queued
+ Running
+ Limit
+
+
+
Environment
+
+ This queue is limited by your environment's concurrency limit of{" "}
+ {environment.concurrencyLimit}.
+
+
+
+
User
+
+ This queue is limited by a concurrency limit set in your code.
+
+
+
+
Override
+
+ This queue's concurrency limit has been manually overridden from the
+ dashboard or API.
+
+
+
+ }
+ >
+ Limited by
+
+
+ Pause/resume
+
+
+
+
+ {queues.length > 0 ? (
+ queues.map((queue) => {
+ // A queue without its own limit falls back to the environment's limit; the
+ // queue-size check is skipped when the environment has no queueSizeLimit.
+ const limit = queue.concurrencyLimit ?? environment.concurrencyLimit;
+ const isAtConcurrencyLimit = queue.running >= limit;
+ const isAtQueueLimit =
+ environment.queueSizeLimit !== null &&
+ queue.queued >= environment.queueSizeLimit;
+ const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${
+ queue.name
+ }`;
+ return (
+
+
+
+
+ {queue.concurrency?.overriddenAt ? (
+
+ Concurrency limit overridden
+
+ }
+ content="This queue's concurrency limit has been manually overridden from the dashboard or API."
+ className="max-w-xs"
+ disableHoverableContent
+ />
+ ) : null}
+ {queue.paused ? (
+
+ Paused
+
+ ) : null}
+ {isAtQueueLimit ? (
+
+ At queue limit
+
+ ) : null}
+ {isAtConcurrencyLimit ? (
+
+ At concurrency limit
+
+ ) : null}
+
+
+
+ {queue.queued}
+
+ 0 && "text-text-bright",
+ isAtConcurrencyLimit && "text-warning"
+ )}
+ >
+ {queue.running}
+
+
+ {limit}
+
+
+ {queue.concurrency?.overriddenAt ? (
+ Override
+ ) : queue.concurrencyLimit ? (
+ "User"
+ ) : (
+ "Environment"
+ )}
+
+
+ }
+ hiddenButtons={
+ !queue.paused &&
+ }
+ popoverContent={
+ <>
+ {queue.paused ? (
+
+ ) : (
+
+ )}
+
+
+
+
+
+ >
+ }
+ />
+
+ );
+ })
+ ) : (
+
+
+
+
+ {hasFilters
+ ? "No queues found matching your filters"
+ : "No queues found"}
+
+
+
+
+ )}
+
+
+
+ ) : (
+
+ {totalQueues === 0 ? (
+
+
+
+ ) : code === "engine-version" ? (
+
+ ) : (
+
Something went wrong
+ )}
+
+ )}
+
+
+
+ );
+}
+
function BurstFactorTooltip({
environment,
}: {
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx
new file mode 100644
index 00000000000..e6a21c6514f
--- /dev/null
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx
@@ -0,0 +1,783 @@
+import { type MetaFunction } from "@remix-run/react";
+import { type LoaderFunctionArgs } from "@remix-run/server-runtime";
+import { useMemo } from "react";
+import { typedjson, useTypedLoaderData } from "remix-typedjson";
+import { z } from "zod";
+import { PageBody, PageContainer } from "~/components/layout/AppLayout";
+import { NavBar, PageTitle } from "~/components/primitives/PageHeader";
+import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis";
+import {
+ Chart,
+ type ChartConfig,
+ type ChartState,
+} from "~/components/primitives/charts/ChartCompound";
+import { ChartCard } from "~/components/primitives/charts/ChartCard";
+import {
+ useMetricResourceQuery,
+ type MetricResourceTimeRange,
+} from "~/hooks/useMetricResourceQuery";
+import { findProjectBySlug } from "~/models/project.server";
+import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server";
+import { QueueRetrievePresenter } from "~/presenters/v3/QueueRetrievePresenter.server";
+import {
+ Table,
+ TableBody,
+ TableCell,
+ TableHeader,
+ TableHeaderCell,
+ TableRow,
+} from "~/components/primitives/Table";
+import { TabButton, TabContainer } from "~/components/primitives/Tabs";
+import { engine } from "~/v3/runEngine.server";
+import { TimeFilter } from "~/components/runs/v3/SharedFilters";
+import { useSearchParams } from "~/hooks/useSearchParam";
+import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route";
+import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server";
+import { requireUserId } from "~/services/session.server";
+import { cn } from "~/utils/cn";
+import { EnvironmentParamSchema } from "~/utils/pathBuilder";
+
+// Static document title for the queue detail page.
+export const meta: MetaFunction = () => [{ title: `Queue metrics | Trigger.dev` }];
+
+// Environment route params extended with the `queueParam` path segment, which the loader
+// passes to QueueRetrievePresenter as `queueInput`.
+const ParamsSchema = EnvironmentParamSchema.extend({ queueParam: z.string() });
+
+/**
+ * Loader for the per-queue metrics page.
+ *
+ * Throws a 404 response when the organization may not use the queue metrics UI, or when
+ * the project, environment or queue cannot be resolved for the signed-in user.
+ *
+ * Charts and ClickHouse-derived stats are fetched client-side per card (see
+ * QueueDetailChartCard / useQueueMetric) so the drill-down renders instantly; this loader
+ * only returns the live "now" counts plus the identifiers those client fetches need.
+ */
+export const loader = async ({ request, params }: LoaderFunctionArgs) => {
+  const notFound = (statusText: string) => new Response(undefined, { status: 404, statusText });
+
+  const userId = await requireUserId(request);
+  const routeParams = ParamsSchema.parse(params);
+  const { organizationSlug } = routeParams;
+
+  // The whole page belongs to the metrics UI, so it is gated per organization. The list
+  // already hides the only link to it; this check is defense in depth.
+  const allowed = await canAccessQueueMetricsUi({ userId, organizationSlug });
+  if (!allowed) {
+    throw notFound("Not found");
+  }
+
+  // Parent path: the current pathname with its final segment removed.
+  const backPath = new URL(request.url).pathname.replace(/\/[^/]+$/, "");
+
+  const project = await findProjectBySlug(organizationSlug, routeParams.projectParam, userId);
+  if (!project) {
+    throw notFound("Project not found");
+  }
+
+  const environment = await findEnvironmentBySlug(project.id, routeParams.envParam, userId);
+  if (!environment) {
+    throw notFound("Environment not found");
+  }
+
+  const retrieved = await new QueueRetrievePresenter().call({
+    environment,
+    queueInput: routeParams.queueParam,
+  });
+  if (!retrieved.success) {
+    throw notFound("Queue not found");
+  }
+
+  const { queue } = retrieved;
+  // Task queues are addressed with a "task/" prefix; other queues use their bare name.
+  const fullName = queue.type === "task" ? `task/${queue.name}` : queue.name;
+
+  const ckBreakdown = await engine.concurrencyKeyBreakdown(environment, fullName, {
+    limit: CK_LIVE_LIMIT,
+  });
+
+  const ids = {
+    organizationId: environment.organizationId,
+    projectId: environment.projectId,
+    environmentId: environment.id,
+  };
+
+  return typedjson({
+    queue,
+    fullName,
+    ckBreakdown,
+    loadedAt: Date.now(),
+    backPath,
+    ids,
+  });
+};
+
+// Named hex colors for the chart series on this page. p95, throttled and ckWait share
+// the same value.
+const COLORS = {
+ running: "#6366F1",
+ limit: "#4D525B",
+ queued: "#A78BFA",
+ p50: "#22D3EE",
+ p95: "#F59E0B",
+ p99: "#EF4444",
+ throttled: "#F59E0B",
+ ckKeys: "#34D399",
+ ckWait: "#F59E0B",
+};
+
+// Passed as `limit` to engine.concurrencyKeyBreakdown in the loader.
+const CK_LIVE_LIMIT = 50;
+
+// Identifiers every client-side metric query on this page is scoped to.
+type Ids = { organizationId: string; projectId: string; environmentId: string };
+
+type TimeRangeParams = MetricResourceTimeRange;
+
+// Passed as `defaultPeriod` to every metric query issued through useQueueMetric.
+const QUEUE_METRICS_DEFAULT_PERIOD = "1d";
+
+// Queue detail page. Takes the live queue state from the loader, reads the time range
+// (`period` / `from` / `to`) and active view from the URL search params, and switches
+// between the overview and the concurrency-keys view.
+export default function Page() {
+ const { queue, fullName, ckBreakdown, loadedAt, backPath, ids } =
+ useTypedLoaderData();
+ const plan = useCurrentPlan();
+ const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number;
+
+ const { value, replace } = useSearchParams();
+ // Missing search params are normalised to null.
+ const timeRange: TimeRangeParams = {
+ period: value("period") ?? null,
+ from: value("from") ?? null,
+ to: value("to") ?? null,
+ };
+
+ // The Concurrency keys tab exists only for queues with key activity: live keys in the
+ // ckIndex, or nonzero CK history in the selected range (one cached scalar query decides).
+ const { rows: gateRows, showLoading: gateLoading } = useQueueMetric(
+ `SELECT max(max_ck_backlogged) AS peak_keys, max(max_ck_wait_ms) AS peak_wait\nFROM queue_metrics`,
+ { ids, timeRange, queueName: fullName }
+ );
+ const gateRow = gateRows[0];
+ const hasHistory = gateRow
+ ? toNumber(gateRow.peak_keys) > 0 || toNumber(gateRow.peak_wait) > 0
+ : false;
+ // Live keys show the tab immediately; history alone shows it only once the gate query
+ // is no longer in its loading state.
+ const showKeysTab = ckBreakdown.keys.length > 0 || (!gateLoading && hasHistory);
+ // `view=keys` in the URL is ignored while the tab is not available.
+ const view = value("view") === "keys" && showKeysTab ? "keys" : "overview";
+
+ return (
+
+
+
+
+
+
+
+
+
+
+
+ {showKeysTab && (
+
+ replace({ view: undefined, key: undefined })}
+ >
+ Overview
+
+ replace({ view: "keys" })}
+ >
+ Concurrency keys
+
+
+ )}
+
+ {view === "keys" ? (
+
+ ) : (
+
+ )}
+
+
+
+ );
+}
+
+// Overview tab body: receives the query scope (ids, time range, queue name) for the
+// charts it renders inside a fragment.
+function OverviewCharts({
+ ids,
+ timeRange,
+ queueName,
+}: {
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+}) {
+ return (
+ <>
+
+
+
+
+ >
+ );
+}
+
+// Shape of the live concurrency-key breakdown the loader obtains from
+// engine.concurrencyKeyBreakdown and passes to the keys view.
+type CkBreakdown = {
+ totalBackloggedKeys: number;
+ keys: Array<{
+ concurrencyKey: string;
+ queued: number;
+ running: number;
+ // Subtracted from the loader's Date.now() to get the oldest wait — presumably epoch
+ // milliseconds; verify against the engine.
+ oldestEnqueuedAt: number;
+ }>;
+};
+
+// Concurrency keys tab body: receives the live breakdown, the loader timestamp and the
+// query scope for the per-key content it renders inside a fragment.
+function ConcurrencyKeysView({
+ breakdown,
+ loadedAt,
+ ids,
+ timeRange,
+ queueName,
+}: {
+ breakdown: CkBreakdown;
+ loadedAt: number;
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+}) {
+ return (
+ <>
+
+
+
+
+
+ >
+ );
+}
+
+// Escapes a value for embedding inside a single-quoted TRQL string literal by doubling
+// every single quote (standard SQL doubling). The caller adds the surrounding quotes.
+// NOTE(review): backslashes pass through unchanged — confirm the TRQL parser does not
+// treat `\` as an escape character inside string literals.
+function trqlString(value: string): string {
+  return value.split("'").join("''");
+}
+
+// Palette for per-key series. GroupedKeySeries assigns colors by key position and wraps
+// around with a modulo when there are more keys than entries.
+const KEY_SERIES_COLORS = [
+ "#34D399",
+ "#6366F1",
+ "#F59E0B",
+ "#22D3EE",
+ "#A78BFA",
+ "#EF4444",
+ "#F472B6",
+ "#84CC16",
+];
+
+// Props shared by GroupedKeyChartCard (ranking step) and GroupedKeySeries (chart step).
+// Both expressions are interpolated directly into the query text.
+type GroupedKeyChartProps = {
+ title: string;
+ /** Aggregate expression ranking keys over the whole range (top 8 charted). */
+ rankExpr: string;
+ /** Aggregate expression charted per (bucket, key). */
+ seriesExpr: string;
+ fillGaps?: boolean;
+ valueFormat?: (value: number) => string;
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+};
+
+// Two-step top-N: rank keys over the range, then chart those keys as grouped series
+// (the per-key table is activity-bound, so ranking is a cheap scan).
+// Renders nothing while the ranking query is loading, when it failed, or when no key has
+// a positive peak.
+function GroupedKeyChartCard(props: GroupedKeyChartProps) {
+ const { rows, showLoading, failed } = useQueueMetric(
+ `SELECT concurrency_key, ${props.rankExpr} AS peak\nFROM queue_metrics_by_key\nGROUP BY concurrency_key\nORDER BY peak DESC\nLIMIT 8`,
+ { ids: props.ids, timeRange: props.timeRange, queueName: props.queueName }
+ );
+ // Keep only keys with a positive peak, preserving the query's ranking order.
+ const keys = useMemo(
+ () => rows.filter((r) => toNumber(r.peak) > 0).map((r) => String(r.concurrency_key)),
+ [rows]
+ );
+
+ if (showLoading || failed || keys.length === 0) return null;
+ return ;
+}
+
+// Second step of the top-N chart: queries `seriesExpr` per (time bucket, concurrency key)
+// for the given keys and renders one series per key.
+function GroupedKeySeries({
+ keys,
+ title,
+ seriesExpr,
+ fillGaps,
+ valueFormat,
+ ids,
+ timeRange,
+ queueName,
+}: GroupedKeyChartProps & { keys: string[] }) {
+ // Keys are embedded as quoted literals, escaped through trqlString.
+ const inList = keys.map((k) => `'${trqlString(k)}'`).join(", ");
+ const { rows, showLoading, failed } = useQueueMetric(
+ `SELECT timeBucket() AS t, concurrency_key, ${seriesExpr} AS v\nFROM queue_metrics_by_key\nWHERE concurrency_key IN (${inList})\nGROUP BY t, concurrency_key\nORDER BY t`,
+ { ids, timeRange, queueName, fillGaps }
+ );
+
+ // Pivot the (bucket, key, value) rows into one point per bucket with a property per
+ // key. Rows with an unparseable timestamp are skipped; points are sorted by bucket.
+ const data = useMemo(() => {
+ const buckets = new Map>();
+ for (const r of rows) {
+ const bucket = clickhouseTimeToMs(r.t);
+ if (!Number.isFinite(bucket)) continue;
+ let point = buckets.get(bucket);
+ if (!point) {
+ point = { bucket } as { bucket: number } & Record;
+ buckets.set(bucket, point);
+ }
+ point[String(r.concurrency_key)] = toNumber(r.v);
+ }
+ return [...buckets.values()].sort((a, b) => a.bucket - b.bucket);
+ }, [rows]);
+
+ // One config entry per key, labelled with the key itself; colors wrap around the palette.
+ const chartConfig = useMemo(() => {
+ const cfg: ChartConfig = {};
+ keys.forEach((k, i) => {
+ cfg[k] = { label: k, color: KEY_SERIES_COLORS[i % KEY_SERIES_COLORS.length]! };
+ });
+ return cfg;
+ }, [keys]);
+
+ const { tickFormatter, tooltipLabelFormatter } = useMemo(
+ () => buildActivityTimeAxis(data),
+ [data]
+ );
+ // Loading takes precedence over failure; undefined means the normal chart state.
+ const state: ChartState = showLoading ? "loading" : failed ? "invalid" : undefined;
+
+ return (
+
+
+
+ valueFormat(v) } : undefined}
+ tooltipLabelFormatter={tooltipLabelFormatter}
+ tooltipValueFormatter={valueFormat}
+ />
+
+
+
+ );
+}
+
+// Per-key stats over the selected time range, filled by KeyStatsTable from the
+// `started`, `peak_backlog` and `mean_wait` columns of its history query.
+type KeyRangeStats = { started: number; peakBacklog: number; meanWaitMs: number };
+
+// Live breakdown (queued/running now, oldest wait) merged with per-key range stats from
+// the history tier; keys with history but no live backlog still appear. Clicking a key
+// pins the drill-down charts via the `key` search param.
+// Renders nothing when there are neither live nor historical keys.
+function KeyStatsTable({
+ breakdown,
+ loadedAt,
+ ids,
+ timeRange,
+ queueName,
+}: {
+ breakdown: CkBreakdown;
+ loadedAt: number;
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+}) {
+ const { value, replace, del } = useSearchParams();
+ const selectedKey = value("key");
+
+ const { rows, showLoading } = useQueueMetric(
+ `SELECT concurrency_key,\n deltaSumTimestampMerge(started_delta) AS started,\n max(max_queued) AS peak_backlog,\n if(sum(wait_ms_count) > 0, round(sum(wait_ms_sum) / sum(wait_ms_count)), 0) AS mean_wait\nFROM queue_metrics_by_key\nGROUP BY concurrency_key\nORDER BY peak_backlog DESC\nLIMIT 50`,
+ { ids, timeRange, queueName }
+ );
+
+ const merged = useMemo(() => {
+ // History stats indexed by key for lookup from the live rows.
+ const range = new Map();
+ for (const r of rows) {
+ range.set(String(r.concurrency_key), {
+ started: toNumber(r.started),
+ peakBacklog: toNumber(r.peak_backlog),
+ meanWaitMs: toNumber(r.mean_wait),
+ });
+ }
+ const liveKeys = new Set(breakdown.keys.map((k) => k.concurrencyKey));
+ // Oldest wait is measured against the loader timestamp and clamped at zero.
+ const live = breakdown.keys.map((k) => ({
+ key: k.concurrencyKey,
+ queued: k.queued,
+ running: k.running,
+ oldestWaitMs: Math.max(0, loadedAt - k.oldestEnqueuedAt),
+ range: range.get(k.concurrencyKey),
+ }));
+ // Keys present only in history get zero live counts and no oldest wait.
+ const historyOnly = [...range.entries()]
+ .filter(([key]) => !liveKeys.has(key))
+ .map(([key, stats]) => ({
+ key,
+ queued: 0,
+ running: 0,
+ oldestWaitMs: null as number | null,
+ range: stats,
+ }));
+ // Live rows come first, so history-only rows are the ones dropped by the 50-row cap.
+ return [...live, ...historyOnly].slice(0, 50);
+ }, [rows, breakdown, loadedAt]);
+
+ if (merged.length === 0) return null;
+
+ return (
+ <>
+
+
+
Concurrency keys
+
+ {breakdown.totalBackloggedKeys > 0
+ ? `${breakdown.totalBackloggedKeys.toLocaleString()} ${
+ breakdown.totalBackloggedKeys === 1 ? "key" : "keys"
+ } with queued runs now`
+ : "No keys with queued runs right now"}
+
+
+
+
+
+ Key
+ Queued now
+ Running now
+ Oldest wait
+ Started
+ Peak backlog
+ Mean delay
+
+
+
+ {merged.map((row) => (
+ (selectedKey === row.key ? del("key") : replace({ key: row.key }))}
+ >
+ {row.key}
+ {row.queued.toLocaleString()}
+ {row.running.toLocaleString()}
+
+ {row.oldestWaitMs === null ? "–" : formatWaitMs(row.oldestWaitMs)}
+
+
+ {row.range ? row.range.started.toLocaleString() : showLoading ? "…" : "–"}
+
+
+ {row.range ? row.range.peakBacklog.toLocaleString() : showLoading ? "…" : "–"}
+
+
+ {row.range && row.range.meanWaitMs > 0 ? formatWaitMs(row.range.meanWaitMs) : "–"}
+
+
+ ))}
+
+
+
+ {selectedKey && (
+
+ )}
+ >
+ );
+}
+
+// Drill-down content for a single concurrency key. Builds a `concurrency_key = '...'`
+// predicate (escaped through trqlString) that the per-key queries embed in their WHERE
+// clause.
+function KeyDrilldown({
+ keyName,
+ ids,
+ timeRange,
+ queueName,
+}: {
+ keyName: string;
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+}) {
+ const pin = `concurrency_key = '${trqlString(keyName)}'`;
+ return (
+ <>
+
+
+ 0, round(sum(wait_ms_sum) / sum(wait_ms_count)), 0) AS wait\nFROM queue_metrics_by_key\nWHERE ${pin}\nGROUP BY t\nORDER BY t`}
+ ids={ids}
+ timeRange={timeRange}
+ queueName={queueName}
+ valueFormat={formatWaitMs}
+ series={[{ key: "wait", label: "Mean delay", color: COLORS.p95 }]}
+ />
+ >
+ );
+}
+
+/**
+ * Thin wrapper around useMetricResourceQuery for this page: scopes the query to the
+ * given identifiers and to a single queue, and applies the page's default period.
+ */
+function useQueueMetric(
+  query: string,
+  opts: { ids: Ids; timeRange: TimeRangeParams; queueName: string; fillGaps?: boolean }
+) {
+  const { ids, timeRange, queueName, fillGaps } = opts;
+  const queryOptions = {
+    ...ids,
+    timeRange,
+    defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD,
+    // The hook takes a list of queues; this page always targets exactly one.
+    queues: [queueName],
+    fillGaps,
+  };
+  return useMetricResourceQuery(query, queryOptions);
+}
+
+/**
+ * Coerce a query cell to a finite number. Anything that does not convert to a finite
+ * number (null, undefined, non-numeric strings, NaN, ±Infinity) becomes 0.
+ */
+function toNumber(value: number | string | null | undefined): number {
+  // Number() leaves actual numbers unchanged, so a single conversion covers every input.
+  const parsed = Number(value);
+  if (!Number.isFinite(parsed)) {
+    return 0;
+  }
+  return parsed;
+}
+
+/**
+ * Convert a timestamp cell into epoch milliseconds. The value is stringified, its first
+ * space is replaced with "T", and a trailing "Z" is appended when missing so the string
+ * is parsed as UTC. Returns NaN when the result is not parseable by Date.parse.
+ */
+function clickhouseTimeToMs(value: unknown): number {
+  const isoLike = String(value).replace(" ", "T");
+  if (isoLike.endsWith("Z")) {
+    return Date.parse(isoLike);
+  }
+  return Date.parse(`${isoLike}Z`);
+}
+
+// One chart series: `key` is the query column read from each row, `label` and `color`
+// go into the chart config.
+type SeriesConfig = { key: string; label: string; color: string };
+
+// Chart card that owns its own data fetch: runs `query` through useQueueMetric and plots
+// the columns named in `series` against the query's `t` column.
+function QueueDetailChartCard({
+ title,
+ query,
+ series,
+ ids,
+ timeRange,
+ queueName,
+ valueFormat,
+ fillGaps,
+}: {
+ title: string;
+ query: string;
+ series: SeriesConfig[];
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+ valueFormat?: (value: number) => string;
+ fillGaps?: boolean;
+}) {
+ const { rows, showLoading, failed } = useQueueMetric(query, {
+ ids,
+ timeRange,
+ queueName,
+ fillGaps,
+ });
+
+ // One point per row: `bucket` from the `t` column plus a numeric property per series
+ // key. Rows whose timestamp does not parse are dropped.
+ const data = useMemo(() => {
+ return rows
+ .map((r) => {
+ const point: { bucket: number } & Record = {
+ bucket: clickhouseTimeToMs(r.t),
+ };
+ for (const s of series) point[s.key] = toNumber(r[s.key]);
+ return point;
+ })
+ .filter((p) => Number.isFinite(p.bucket));
+ }, [rows, series]);
+
+ // NOTE(review): both memos depend on the identity of `series`; a caller passing an
+ // inline array literal gets a new identity each render — confirm callers memoize it.
+ const chartConfig = useMemo(() => {
+ const cfg: ChartConfig = {};
+ for (const s of series) cfg[s.key] = { label: s.label, color: s.color };
+ return cfg;
+ }, [series]);
+
+ const { tickFormatter, tooltipLabelFormatter } = useMemo(
+ () => buildActivityTimeAxis(data),
+ [data]
+ );
+
+ // Loading takes precedence over failure; undefined means the normal chart state.
+ const state: ChartState = showLoading ? "loading" : failed ? "invalid" : undefined;
+
+ return (
+
+
+ s.key)}
+ state={state}
+ fillContainer
+ >
+ valueFormat(v) } : undefined}
+ tooltipLabelFormatter={tooltipLabelFormatter}
+ tooltipValueFormatter={valueFormat}
+ />
+
+
+
+ );
+}
+
+// Summary stats row for the queue: live running/queued counts come in via `queue`, the
+// range-derived figures come from a single scalar query.
+function QueueStats({
+ queue,
+ ids,
+ timeRange,
+ queueName,
+}: {
+ queue: { running: number; queued: number };
+ ids: Ids;
+ timeRange: TimeRangeParams;
+ queueName: string;
+}) {
+ // One scalar query feeds the CH-derived stats; the "now" counts come from the loader (live).
+ // `[3]` picks the 0.95 entry of the (0.5, 0.9, 0.95, 0.99) quantile list, assuming
+ // 1-based array indexing in the query language — TODO confirm.
+ const { rows, showLoading } = useQueueMetric(
+ `SELECT max(max_limit) AS lim, max(max_queued) AS peak_queued, deltaSumTimestampMerge(started_delta) AS started,\n round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS worst_p95\nFROM queue_metrics`,
+ { ids, timeRange, queueName }
+ );
+ const row = rows[0];
+ // Falls back to 0 when the query returned no row; 0 is displayed as "–" below.
+ const worstP95 = row ? toNumber(row.worst_p95) : 0;
+
+ return (
+
+
+
+
+
+
+ 0 ? formatWaitMs(worstP95) : "–"}
+ loading={showLoading}
+ className={worstP95 >= 60_000 ? "text-warning" : undefined}
+ />
+
+ );
+}
+
+// Single labelled stat. When `loading` is truthy the loading branch is rendered instead
+// of the value.
+function Stat({
+ label,
+ value,
+ className,
+ loading,
+}: {
+ label: string;
+ value: string;
+ className?: string;
+ loading?: boolean;
+}) {
+ return (
+
+
{label}
+ {loading ? (
+
+ ) : (
+
{value}
+ )}
+
+ );
+}
+
+/**
+ * Human-readable wait duration: rounded milliseconds under one second, then seconds,
+ * minutes or hours with a single decimal.
+ */
+function formatWaitMs(ms: number): string {
+  const SECOND = 1000;
+  const MINUTE = 60_000;
+  const HOUR = 3_600_000;
+  const oneDecimal = (unitMs: number, suffix: string): string =>
+    `${(ms / unitMs).toFixed(1)}${suffix}`;
+
+  if (ms < SECOND) return `${Math.round(ms)}ms`;
+  if (ms < MINUTE) return oneDecimal(SECOND, "s");
+  if (ms < HOUR) return oneDecimal(MINUTE, "m");
+  return oneDecimal(HOUR, "h");
+}
diff --git a/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts
new file mode 100644
index 00000000000..69e4e8c1fac
--- /dev/null
+++ b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts
@@ -0,0 +1,45 @@
+import { type ActionFunctionArgs, type LoaderFunctionArgs, json } from "@remix-run/server-runtime";
+import { z } from "zod";
+import { requireAdminApiRequest } from "~/services/personalAccessToken.server";
+import {
+ probeQueueMetricsStreams,
+ readQueueMetricsControls,
+ writeQueueMetricsControls,
+} from "~/v3/queueMetrics.server";
+
+/**
+ * Admin API: returns the current queue-metrics controls together with a probe of the
+ * metrics streams. Requires an admin API request; both reads are started in parallel.
+ */
+export async function loader({ request }: LoaderFunctionArgs) {
+  await requireAdminApiRequest(request);
+
+  const results = await Promise.all([readQueueMetricsControls(), probeQueueMetricsStreams()]);
+  const controls = results[0];
+  const streams = results[1];
+
+  return json({ controls, streams });
+}
+
+// Payload accepted by the action. Both fields are optional, so an empty object passes
+// validation; sampleRate must lie in [0, 1].
+const BodySchema = z.object({
+ enabled: z.boolean().optional(),
+ sampleRate: z.number().min(0).max(1).optional(),
+});
+
+/**
+ * Admin API: updates the queue-metrics controls.
+ *
+ * Requires an admin API request. Only POST is accepted (405 otherwise). Responds 400 for
+ * a body that is not valid JSON or does not match BodySchema; on success writes the
+ * controls and returns them as read back afterwards.
+ */
+export async function action({ request }: ActionFunctionArgs) {
+  await requireAdminApiRequest(request);
+
+  const isPost = request.method === "POST";
+  if (!isPost) {
+    return json({ error: "Method not allowed" }, { status: 405 });
+  }
+
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return json({ error: "Invalid JSON body" }, { status: 400 });
+  }
+
+  const validation = BodySchema.safeParse(payload);
+  if (validation.success) {
+    await writeQueueMetricsControls(validation.data);
+    // Read back after the write so the response reflects the stored state.
+    return json({ ok: true, controls: await readQueueMetricsControls() });
+  }
+
+  return json({ error: "Invalid payload", details: validation.error.issues }, { status: 400 });
+}
diff --git a/apps/webapp/app/routes/admin.queue-metrics.tsx b/apps/webapp/app/routes/admin.queue-metrics.tsx
new file mode 100644
index 00000000000..6deaedce66e
--- /dev/null
+++ b/apps/webapp/app/routes/admin.queue-metrics.tsx
@@ -0,0 +1,190 @@
+import { useFetcher, useRevalidator } from "@remix-run/react";
+import { json } from "@remix-run/server-runtime";
+import { useEffect, useState } from "react";
+import { typedjson, useTypedLoaderData } from "remix-typedjson";
+import { z } from "zod";
+import { Button } from "~/components/primitives/Buttons";
+import { Callout } from "~/components/primitives/Callout";
+import { Header1, Header2 } from "~/components/primitives/Headers";
+import { Input } from "~/components/primitives/Input";
+import { Paragraph } from "~/components/primitives/Paragraph";
+import {
+ Table,
+ TableBody,
+ TableCell,
+ TableHeader,
+ TableHeaderCell,
+ TableRow,
+} from "~/components/primitives/Table";
+import { dashboardAction, dashboardLoader } from "~/services/routeBuilders/dashboardBuilder";
+import {
+ probeQueueMetricsStreams,
+ readQueueMetricsControls,
+ writeQueueMetricsControls,
+} from "~/v3/queueMetrics.server";
+
+// Super-admin only: loads the current queue-metrics controls and a probe of the metrics
+// streams, both started in parallel.
+export const loader = dashboardLoader(
+  { authorization: { requireSuper: true } },
+  async function loadQueueMetricsAdmin() {
+    const results = await Promise.all([readQueueMetricsControls(), probeQueueMetricsStreams()]);
+    const controls = results[0];
+    const streams = results[1];
+    return typedjson({ controls, streams });
+  }
+);
+
+// Payload accepted by the action. Both fields are optional; sampleRate must lie in
+// [0, 1], the same bound handleSave checks client-side before submitting.
+const BodySchema = z.object({
+ enabled: z.boolean().optional(),
+ sampleRate: z.number().min(0).max(1).optional(),
+});
+
+// Super-admin only: validates the JSON body against BodySchema and writes the controls.
+// Responds 400 with an `error` message for unparseable JSON or an invalid payload, and
+// `{ success: true }` once the write has completed.
+export const action = dashboardAction(
+  { authorization: { requireSuper: true } },
+  async ({ request }) => {
+    let payload: unknown;
+    try {
+      payload = await request.json();
+    } catch {
+      return json({ error: "Invalid JSON body" }, { status: 400 });
+    }
+
+    const validation = BodySchema.safeParse(payload);
+    if (validation.success) {
+      await writeQueueMetricsControls(validation.data);
+      return json({ success: true });
+    }
+
+    return json({ error: "Invalid payload" }, { status: 400 });
+  }
+);
+
+// Admin page for the queue-metrics ingest pipeline: an editable controls form (emission
+// toggle and gauge sample rate) and a per-shard stream health table.
+export default function AdminQueueMetricsRoute() {
+ const { controls, streams } = useTypedLoaderData();
+ const saveFetcher = useFetcher<{ success?: boolean; error?: string }>();
+ const revalidator = useRevalidator();
+
+ // Form state starts from the loader's controls; the sample rate is kept as a string
+ // while editing and converted in handleSave.
+ const [enabled, setEnabled] = useState(controls.enabled);
+ const [sampleRate, setSampleRate] = useState(String(controls.sampleRate));
+ const [error, setError] = useState(null);
+
+ // Re-sync the form whenever the loader data changes (e.g. after a revalidation).
+ useEffect(() => {
+ setEnabled(controls.enabled);
+ setSampleRate(String(controls.sampleRate));
+ }, [controls.enabled, controls.sampleRate]);
+
+ // After a save: on success clear the error and reload the loader data; on failure
+ // show the error returned by the action.
+ useEffect(() => {
+ if (saveFetcher.data?.success) {
+ setError(null);
+ revalidator.revalidate();
+ } else if (saveFetcher.data?.error) {
+ setError(saveFetcher.data.error);
+ }
+ }, [saveFetcher.data]);
+
+ const isSaving = saveFetcher.state === "submitting";
+
+ // Validates the sample rate client-side, then posts both controls as a JSON body.
+ const handleSave = () => {
+ const rate = Number(sampleRate);
+ if (!Number.isFinite(rate) || rate < 0 || rate > 1) {
+ setError("Sample rate must be a number between 0 and 1");
+ return;
+ }
+ saveFetcher.submit(JSON.stringify({ enabled, sampleRate: rate }), {
+ method: "POST",
+ encType: "application/json",
+ });
+ };
+
+ // A null lag counts as 0 in the total; shards with null lag are counted separately so
+ // they can be called out.
+ const totalLag = streams.reduce((sum, s) => sum + (s.lag ?? 0), 0);
+ const lagUnknownCount = streams.filter((s) => s.lag === null).length;
+
+ return (
+
+
+
Queue metrics ingest
+
+ Live controls for the queue-metrics ingest pipeline on the run-queue Redis. Changes take
+ effect within ~10s across all instances (no redeploy). Watch EngineCPU on the run-queue
+ Redis when enabling or raising the sample rate.
+
+
+
+
Controls
+
+ setEnabled(e.target.checked)}
+ />
+ Emission enabled (queue_metrics:enabled)
+
+
+
+ Gauge sample rate 0–1 (queue_metrics:gauge_sample_rate); default{" "}
+ {controls.sampleRateDefault}
+
+ setSampleRate(e.target.value)}
+ className="w-32"
+ />
+
+ {error &&
{error} }
+
+
+ {isSaving ? "Saving..." : "Save controls"}
+
+
+
+
+
+
+ Stream health{totalLag > 0 ? ` (lag ${totalLag})` : ""}
+ revalidator.revalidate()}
+ disabled={revalidator.state === "loading"}
+ >
+ Refresh
+
+
+
+ Depth = entries buffered in the shard stream; Lag = entries not yet delivered to the
+ consumer group (rising = consumer falling behind; "unknown" = entries were trimmed past
+ the group, i.e. data was lost); Pending = unacked entries. Gauges and counters share one
+ stream family on the metrics Redis.
+
+ {lagUnknownCount > 0 && (
+
+ Lag is unknown on {lagUnknownCount} shard{lagUnknownCount === 1 ? "" : "s"}: entries
+ were trimmed past the consumer group's read position, so stream data was lost. Check
+ consumer health.
+
+ )}
+
+
+
+ Stream
+ Shard
+ Depth
+ Lag
+ Pending
+
+
+
+ {streams.map((s) => (
+
+ {s.stream}
+ {s.shard}
+ {s.depth}
+ {s.lag ?? "unknown"}
+ {s.pending}
+
+ ))}
+
+
+
+
+
+ );
+}
diff --git a/apps/webapp/app/routes/admin.tsx b/apps/webapp/app/routes/admin.tsx
index a95b016ca5b..7d24fe312fa 100644
--- a/apps/webapp/app/routes/admin.tsx
+++ b/apps/webapp/app/routes/admin.tsx
@@ -38,6 +38,10 @@ export default function Page() {
label: "Global Feature Flags",
to: "/admin/feature-flags",
},
+ {
+ label: "Queue Metrics",
+ to: "/admin/queue-metrics",
+ },
{
label: "Notifications",
to: "/admin/notifications",
diff --git a/apps/webapp/app/routes/api.v1.query.schema.ts b/apps/webapp/app/routes/api.v1.query.schema.ts
index 3e95d16818d..976fa72b267 100644
--- a/apps/webapp/app/routes/api.v1.query.schema.ts
+++ b/apps/webapp/app/routes/api.v1.query.schema.ts
@@ -1,7 +1,7 @@
import { json } from "@remix-run/server-runtime";
import type { ColumnSchema, TableSchema } from "@internal/tsql";
import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server";
-import { querySchemas } from "~/v3/querySchemas";
+import { visibleQuerySchemas } from "~/v3/querySchemas";
function serializeColumn(col: ColumnSchema) {
const result: Record = {
@@ -51,7 +51,7 @@ export const loader = createLoaderApiRoute(
},
},
async () => {
- const tables = querySchemas.map(serializeTable);
+ const tables = visibleQuerySchemas.map(serializeTable);
return json({ tables });
}
);
diff --git a/apps/webapp/app/routes/resources.metric.tsx b/apps/webapp/app/routes/resources.metric.tsx
index d456ba1ce1b..5bf0ed693ad 100644
--- a/apps/webapp/app/routes/resources.metric.tsx
+++ b/apps/webapp/app/routes/resources.metric.tsx
@@ -50,6 +50,8 @@ const MetricWidgetQuery = z.object({
operations: z.array(z.string()).optional(),
providers: z.array(z.string()).optional(),
tags: z.array(z.string()).optional(),
+ // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters).
+ fillGaps: z.boolean().optional(),
});
export const action = async ({ request }: ActionFunctionArgs) => {
@@ -85,6 +87,7 @@ export const action = async ({ request }: ActionFunctionArgs) => {
operations,
providers,
tags: _tags,
+ fillGaps,
} = submission.data;
// Check they should be able to access it
@@ -122,6 +125,7 @@ export const action = async ({ request }: ActionFunctionArgs) => {
promptVersions,
operations,
providers,
+ fillGaps,
// Set higher concurrency if many widgets are on screen at once
customOrgConcurrencyLimit: env.METRIC_WIDGET_DEFAULT_ORG_CONCURRENCY_LIMIT,
});
diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx
index c1626b966d2..4a9ab462dcf 100644
--- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx
+++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx
@@ -8,7 +8,7 @@ import type { AITimeFilter } from "~/routes/_app.orgs.$organizationSlug.projects
import { requireUserId } from "~/services/session.server";
import { EnvironmentParamSchema } from "~/utils/pathBuilder";
import { AIQueryService } from "~/v3/services/aiQueryService.server";
-import { querySchemas } from "~/v3/querySchemas";
+import { visibleQuerySchemas } from "~/v3/querySchemas";
const RequestSchema = z.object({
prompt: z.string().min(1, "Prompt is required"),
@@ -85,7 +85,7 @@ export async function action({ request, params }: ActionFunctionArgs) {
const { prompt, mode, currentQuery } = submission.data;
const service = new AIQueryService(
- querySchemas,
+ visibleQuerySchemas,
openai(env.AI_RUN_FILTER_MODEL ?? "gpt-4o-mini")
);
diff --git a/apps/webapp/app/services/queryService.server.ts b/apps/webapp/app/services/queryService.server.ts
index 57b877ed876..70af40ec89f 100644
--- a/apps/webapp/app/services/queryService.server.ts
+++ b/apps/webapp/app/services/queryService.server.ts
@@ -7,7 +7,12 @@ import {
type TSQLQueryResult,
} from "@internal/clickhouse";
import type { CustomerQuerySource } from "@trigger.dev/database";
-import type { TableSchema, WhereClauseCondition } from "@internal/tsql";
+import {
+ calculateTimeBucketInterval,
+ type TableSchema,
+ type TimeBucketInterval,
+ type WhereClauseCondition,
+} from "@internal/tsql";
import { z } from "zod";
import { prisma } from "~/db.server";
import { env } from "~/env.server";
@@ -110,6 +115,41 @@ export type ExecuteQueryResult =
}
| { success: false; error: Error };
+// Seconds per time-bucket unit, used by resolveRollup to turn a bucket interval into
+// seconds. WEEK is 7 days and MONTH is a fixed 30 days.
+const INTERVAL_UNIT_SECONDS: Record = {
+ SECOND: 1,
+ MINUTE: 60,
+ HOUR: 3_600,
+ DAY: 86_400,
+ WEEK: 604_800,
+ MONTH: 2_592_000,
+};
+
+/**
+ * Round a date down to the nearest multiple of `alignSeconds` (measured from the epoch).
+ * Returns a new Date; the input is not modified.
+ */
+function floorToSeconds(date: Date, alignSeconds: number): Date {
+  const stepMs = alignSeconds * 1000;
+  const wholeSteps = Math.floor(date.getTime() / stepMs);
+  return new Date(wholeSteps * stepMs);
+}
+
+/**
+ * Pick the physical table to query for a schema that declares rollups.
+ *
+ * The bucket interval for the time range is computed from the schema's thresholds; the
+ * rollup with the largest `minIntervalSeconds` that does not exceed that interval wins.
+ * The rollup has identical logical columns, so only `clickhouseName` (and therefore the
+ * rows read) changes. Returns the schema untouched when it has no rollups or none
+ * qualifies.
+ */
+function resolveRollup(schema: TableSchema, timeRange: { from: Date; to: Date }): TableSchema {
+  const candidates = schema.rollups;
+  if (!candidates || candidates.length === 0) {
+    return schema;
+  }
+
+  const { value, unit } = calculateTimeBucketInterval(
+    timeRange.from,
+    timeRange.to,
+    schema.timeBucketThresholds
+  );
+  const bucketSeconds = value * INTERVAL_UNIT_SECONDS[unit];
+
+  // Coarsest eligible rollup; on equal granularity the earlier entry is kept.
+  let chosen: (typeof candidates)[number] | undefined;
+  for (const rollup of candidates) {
+    if (rollup.minIntervalSeconds > bucketSeconds) continue;
+    if (chosen === undefined || rollup.minIntervalSeconds > chosen.minIntervalSeconds) {
+      chosen = rollup;
+    }
+  }
+
+  if (chosen === undefined) {
+    return schema;
+  }
+  return { ...schema, clickhouseName: chosen.clickhouseName };
+}
+
export async function getDefaultPeriod(organizationId: string): Promise {
const idealDefaultPeriodDays = 7;
const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30);
@@ -183,6 +223,14 @@ export async function executeQuery(
defaultPeriod,
});
+ // Align the time bounds so repeated auto-refresh queries produce identical query
+ // params and can share ClickHouse query-cache entries (params are part of the key).
+ const alignSeconds = matchedSchema?.queryCache?.alignSeconds;
+ if (alignSeconds) {
+ if (timeFilter.from) timeFilter.from = floorToSeconds(timeFilter.from, alignSeconds);
+ if (timeFilter.to) timeFilter.to = floorToSeconds(timeFilter.to, alignSeconds);
+ }
+
// Calculate the effective "from" date the user is requesting (for period clipping check)
// This is null only when the user specifies just a "to" date (rare case)
let requestedFromDate: Date | null = null;
@@ -192,6 +240,9 @@ export async function executeQuery(
// Period specified (or default) - calculate from now
const periodMs = parse(timeFilter.period ?? defaultPeriod) ?? 7 * 24 * 60 * 60 * 1000;
requestedFromDate = new Date(Date.now() - periodMs);
+ if (alignSeconds) {
+ requestedFromDate = floorToSeconds(requestedFromDate, alignSeconds);
+ }
}
// Build the fallback WHERE condition based on what the user specified
@@ -207,7 +258,10 @@ export async function executeQuery(
}
const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30);
- const maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000);
+ let maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000);
+ if (alignSeconds) {
+ maxQueryPeriodDate = floorToSeconds(maxQueryPeriodDate, alignSeconds);
+ }
// Check if the requested time period exceeds the plan limit
const periodClipped = requestedFromDate !== null && requestedFromDate < maxQueryPeriodDate;
@@ -255,6 +309,10 @@ export async function executeQuery(
to: to ?? undefined,
defaultPeriod,
});
+ if (alignSeconds) {
+ timeRange.from = floorToSeconds(timeRange.from, alignSeconds);
+ timeRange.to = floorToSeconds(timeRange.to, alignSeconds);
+ }
try {
// Build field mappings for project_ref → project_id and environment_id → slug translation
@@ -277,10 +335,19 @@ export async function executeQuery(
organizationId,
"query"
);
+ // Serve coarse-bucket queries from the table's rollup when one qualifies.
+ const effectiveSchemas = matchedSchema?.rollups
+ ? querySchemas.map((s) => (s === matchedSchema ? resolveRollup(s, timeRange) : s))
+ : querySchemas;
+
+ const queryCacheSettings: ClickHouseSettings = matchedSchema?.queryCache
+ ? { use_query_cache: 1, query_cache_ttl: matchedSchema.queryCache.ttlSeconds }
+ : {};
+
const result = await executeTSQL(queryClickhouse.reader, {
...baseOptions,
schema: z.record(z.any()),
- tableSchema: querySchemas,
+ tableSchema: effectiveSchemas,
transformValues: true,
enforcedWhereClause,
fieldMappings,
@@ -290,6 +357,7 @@ export async function executeQuery(
timeRange,
clickhouseSettings: {
...getDefaultClickhouseSettings(),
+ ...queryCacheSettings,
...baseOptions.clickhouseSettings, // Allow caller overrides if needed
},
querySettings: {
diff --git a/apps/webapp/app/utils/pathBuilder.ts b/apps/webapp/app/utils/pathBuilder.ts
index 187bc50b549..edd65f8bde4 100644
--- a/apps/webapp/app/utils/pathBuilder.ts
+++ b/apps/webapp/app/utils/pathBuilder.ts
@@ -522,6 +522,15 @@ export function v3QueuesPath(
return `${v3EnvironmentPath(organization, project, environment)}/queues`;
}
+export function v3QueuePath(
+  organization: OrgForPath,
+  project: ProjectForPath,
+  environment: EnvironmentForPath,
+  queue: { friendlyId: string }
+) {
+  return [v3QueuesPath(organization, project, environment), queue.friendlyId].join("/");
+}
+
export function v3WaitpointTokensPath(
organization: OrgForPath,
project: ProjectForPath,
diff --git a/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts
new file mode 100644
index 00000000000..0e3c142b272
--- /dev/null
+++ b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts
@@ -0,0 +1,26 @@
+import { prisma } from "~/db.server";
+import { FEATURE_FLAG } from "~/v3/featureFlags";
+import { makeFlag } from "~/v3/featureFlags.server";
+
+// Per-org gate for the Queue Metrics dashboard UI. Org override wins over the global
+// FeatureFlag table value, which wins over the off-by-default. Ingestion/emission is a
+// separate global flag; this only decides whether an org sees the metrics view.
+export async function canAccessQueueMetricsUi(options: {
+  userId: string;
+  organizationSlug: string;
+}): Promise<boolean> {
+  const org = await prisma.organization.findFirst({
+    where: {
+      slug: options.organizationSlug,
+      members: { some: { userId: options.userId } }, // only orgs this user is a member of
+    },
+    select: { featureFlags: true },
+  });
+
+  const flag = makeFlag();
+  return flag({
+    key: FEATURE_FLAG.queueMetricsUiEnabled,
+    defaultValue: false,
+    overrides: (org?.featureFlags as Record<string, unknown>) ?? {}, // no org/flags: no overrides
+  });
+}
diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts
index 637830aef06..fa17e504a37 100644
--- a/apps/webapp/app/v3/featureFlags.ts
+++ b/apps/webapp/app/v3/featureFlags.ts
@@ -19,6 +19,7 @@ export const FEATURE_FLAG = {
computeMigrationRequireTemplate: "computeMigrationRequireTemplate",
devBranchesEnabled: "devBranchesEnabled",
runOpsMintKind: "runOpsMintKind",
+ queueMetricsUiEnabled: "queueMetricsUiEnabled",
} as const;
export const FeatureFlagCatalog = {
@@ -54,6 +55,9 @@ export const FeatureFlagCatalog = {
// Per-org run-ops-id mint cutover. Defaults to "cuid"; only honored when
// RUN_OPS_MINT_ENABLED is on AND isSplitEnabled() is true.
[FEATURE_FLAG.runOpsMintKind]: z.enum(["cuid", "runOpsId"]),
+ // Per-org access to the Queue Metrics dashboard UI (view only; emission is global and
+ // separate). Off unless enabled for the org.
+ [FEATURE_FLAG.queueMetricsUiEnabled]: z.coerce.boolean(),
};
export type FeatureFlagKey = keyof typeof FeatureFlagCatalog;
diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts
index 4784ad75629..540ae670091 100644
--- a/apps/webapp/app/v3/querySchemas.ts
+++ b/apps/webapp/app/v3/querySchemas.ts
@@ -614,8 +614,333 @@ export const metricsSchema: TableSchema = {
};
/**
- * All available schemas for the query editor
+ * Schema definition for the queue_metrics table (trigger_dev.queue_metrics_v1).
+ * Pre-aggregated into 10-second buckets. Re-aggregate counters with sum(), *_delta states with
+ * deltaSumTimestampMerge(), gauges with max(), and wait_quantiles with quantilesMerge() — never FINAL.
*/
+export const queueMetricsSchema: TableSchema = {
+ name: "queue_metrics",
+ clickhouseName: "trigger_dev.queue_metrics_v1",
+ description: "Per-queue depth, concurrency, throttling, and scheduling-delay metrics",
+ timeConstraint: "bucket_start",
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ columns: {
+ environment: {
+ name: "environment",
+ clickhouseName: "environment_id",
+ ...column("String", { description: "The environment slug", example: "prod" }),
+ fieldMapping: "environment",
+ customRenderType: "environment",
+ },
+ project: {
+ name: "project",
+ clickhouseName: "project_id",
+ ...column("String", {
+ description: "The project reference, they always start with `proj_`.",
+ example: "proj_howcnaxbfxdmwmxazktx",
+ }),
+ fieldMapping: "project",
+ customRenderType: "project",
+ },
+ queue: {
+ name: "queue",
+ clickhouseName: "queue_name",
+ ...column("LowCardinality(String)", {
+ description: "The queue name",
+ example: "my-queue",
+ coreColumn: true,
+ }),
+ },
+ bucket_start: {
+ name: "bucket_start",
+ ...column("DateTime", {
+ description: "The start of the 10-second aggregation bucket",
+ example: "2024-01-15 09:30:00",
+ coreColumn: true,
+ }),
+ },
+ // Cumulative-counter delta states. Read with deltaSumTimestampMerge(<column>) (loss-tolerant,
+ // reset-safe), never sum(); opaque like wait_quantiles. Merging across queues is
+ // invalid (mixes unrelated odometers): totals must GROUP BY queue, then sum outside.
+ enqueue_delta: {
+ name: "enqueue_delta",
+ mergeGroupKey: "queue",
+ ...column("String", {
+ description:
+ "Runs enqueued (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta) grouped by queue. For totals across queues, sum the per-queue results in an outer query, never merge across queues. Per-bucket values can undercount by one inter-reading delta at bucket boundaries (the bridge lives in the prior bucket's state); totals over the whole range are exact.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ started_delta: {
+ name: "started_delta",
+ mergeGroupKey: "queue",
+ ...column("String", {
+ description:
+ "Runs dequeued/started (throughput). Read with deltaSumTimestampMerge(started_delta) grouped by queue. For totals across queues, sum the per-queue results in an outer query, never merge across queues. Per-bucket values can undercount by one inter-reading delta at bucket boundaries (the bridge lives in the prior bucket's state); totals over the whole range are exact.",
+ coreColumn: true,
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ ack_delta: {
+ name: "ack_delta",
+ mergeGroupKey: "queue",
+ ...column("String", {
+ description:
+ "Runs acked (completed). Read with deltaSumTimestampMerge(ack_delta) grouped by queue; sum per-queue results for totals.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ nack_delta: {
+ name: "nack_delta",
+ mergeGroupKey: "queue",
+ ...column("String", {
+ description:
+ "Runs nacked. Read with deltaSumTimestampMerge(nack_delta) grouped by queue; sum per-queue results for totals.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ dlq_delta: {
+ name: "dlq_delta",
+ mergeGroupKey: "queue",
+ ...column("String", {
+ description:
+ "Runs dead-lettered. Read with deltaSumTimestampMerge(dlq_delta) grouped by queue; sum per-queue results for totals.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ throttled_count: {
+ name: "throttled_count",
+ ...column("UInt64", {
+ description: "Gauge emissions where running>=limit and queued>0. Aggregate with sum().",
+ coreColumn: true,
+ }),
+ },
+ max_queued: {
+ name: "max_queued",
+ ...column("UInt32", {
+ description: "Peak queue depth in the bucket. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_running: {
+ name: "max_running",
+ ...column("UInt32", {
+ description: "Peak running (concurrency) in the bucket. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_limit: {
+ name: "max_limit",
+ ...column("UInt32", {
+ description: "The queue concurrency limit. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_env_queued: {
+ name: "max_env_queued",
+ ...column("UInt32", {
+ description: "Peak environment-wide queued in the bucket. Aggregate with max().",
+ fillMode: "carry",
+ }),
+ },
+ max_env_running: {
+ name: "max_env_running",
+ ...column("UInt32", {
+ description: "Peak environment-wide running in the bucket. Aggregate with max().",
+ fillMode: "carry",
+ }),
+ },
+ max_env_limit: {
+ name: "max_env_limit",
+ ...column("UInt32", {
+ description: "The environment concurrency limit. Aggregate with max().",
+ fillMode: "carry",
+ }),
+ },
+ max_ck_backlogged: {
+ name: "max_ck_backlogged",
+ ...column("UInt32", {
+ description:
+ "Peak number of distinct concurrency keys with queued runs in the bucket. Aggregate with max(). Zero for queues that do not use concurrency keys.",
+ fillMode: "carry",
+ }),
+ },
+ max_ck_wait_ms: {
+ name: "max_ck_wait_ms",
+ ...column("UInt32", {
+ description:
+ "Worst head-of-line wait (ms) across concurrency keys in the bucket: how long the most-starved key's oldest queued run has been waiting. Aggregate with max(). Zero for queues that do not use concurrency keys.",
+ fillMode: "carry",
+ }),
+ },
+ wait_ms_sum: {
+ name: "wait_ms_sum",
+ ...column("UInt64", {
+ description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.",
+ }),
+ },
+ wait_ms_count: {
+ name: "wait_ms_count",
+ ...column("UInt64", {
+ description: "Count of scheduling-delay samples. Aggregate with sum().",
+ }),
+ },
+ wait_quantiles: {
+ name: "wait_quantiles",
+ ...column("String", {
+ description:
+ "Scheduling-delay (dequeue minus eligible-at) quantile state. Read with quantilesMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ },
+ timeBucketThresholds: [
+ { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } },
+ { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } },
+ { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } },
+ { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } },
+ { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } },
+ { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } },
+ { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } },
+ { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } },
+ ] satisfies BucketThreshold[],
+ // Ranges whose bucket interval is >= 5 minutes read the 5m rollup instead (same
+ // logical columns, ~30x fewer rows).
+ rollups: [{ minIntervalSeconds: 300, clickhouseName: "trigger_dev.queue_metrics_5m_v1" }],
+ queryCache: { ttlSeconds: 30, alignSeconds: 30 },
+};
+
+/**
+ * Schema definition for the env_metrics table (trigger_dev.env_metrics_v1).
+ * Environment-level rollup of queue_metrics with the queue dimension dropped, so
+ * header tiles and saturation charts cost the same regardless of how many queues
+ * the environment has. Keeps the full 10-second granularity: row count is
+ * queue-independent, so even 30-day ranges stay small.
+ */
+export const envMetricsSchema: TableSchema = {
+ name: "env_metrics",
+ clickhouseName: "trigger_dev.env_metrics_v1",
+ description:
+ "Environment-level concurrency, saturation, throttling, and scheduling-delay metrics (10-second buckets)",
+ timeConstraint: "bucket_start",
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ columns: {
+ environment: {
+ name: "environment",
+ clickhouseName: "environment_id",
+ ...column("String", { description: "The environment slug", example: "prod" }),
+ fieldMapping: "environment",
+ customRenderType: "environment",
+ },
+ project: {
+ name: "project",
+ clickhouseName: "project_id",
+ ...column("String", {
+ description: "The project reference, they always start with `proj_`.",
+ example: "proj_howcnaxbfxdmwmxazktx",
+ }),
+ fieldMapping: "project",
+ customRenderType: "project",
+ },
+ bucket_start: {
+ name: "bucket_start",
+ ...column("DateTime", {
+ description: "The start of the 10-second aggregation bucket",
+ example: "2024-01-15 09:30:00",
+ coreColumn: true,
+ }),
+ },
+ max_env_queued: {
+ name: "max_env_queued",
+ ...column("UInt32", {
+ description: "Peak environment-wide queued in the bucket. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_env_running: {
+ name: "max_env_running",
+ ...column("UInt32", {
+ description: "Peak environment-wide running in the bucket. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_env_limit: {
+ name: "max_env_limit",
+ ...column("UInt32", {
+ description: "The environment concurrency limit. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ throttled_count: {
+ name: "throttled_count",
+ ...column("UInt64", {
+ description:
+ "Gauge emissions where a queue was at its limit with work queued. Aggregate with sum().",
+ coreColumn: true,
+ }),
+ },
+ wait_ms_sum: {
+ name: "wait_ms_sum",
+ ...column("UInt64", {
+ description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.",
+ }),
+ },
+ wait_ms_count: {
+ name: "wait_ms_count",
+ ...column("UInt64", {
+ description: "Count of scheduling-delay samples. Aggregate with sum().",
+ }),
+ },
+ wait_quantiles: {
+ name: "wait_quantiles",
+ ...column("String", {
+ description:
+ "Scheduling-delay quantile state (TDigest). Read with quantilesTDigestMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ },
+ timeBucketThresholds: [
+ { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } },
+ { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } },
+ { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } },
+ { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } },
+ { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } },
+ { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } },
+ { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } },
+ { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } },
+ ] satisfies BucketThreshold[],
+ queryCache: { ttlSeconds: 30, alignSeconds: 30 },
+};
+
/**
* Schema definition for the llm_metrics table (trigger_dev.llm_metrics_v1)
*/
@@ -975,13 +1300,154 @@ export const llmModelsSchema: TableSchema = {
},
};
+/**
+ * Per-concurrency-key drill-down for queues that shard work with `concurrencyKey`
+ * (e.g. per-tenant fairness). Rows are activity-bound: a (queue, key, bucket) row exists
+ * only when that key had events, so key cardinality cannot inflate the table.
+ */
+export const queueMetricsByKeySchema: TableSchema = {
+ name: "queue_metrics_by_key",
+ clickhouseName: "trigger_dev.queue_metrics_ck_v1",
+ description: "Per-concurrency-key queue metrics: backlog, throughput, and wait by key",
+ hidden: true,
+ timeConstraint: "bucket_start",
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ columns: {
+ environment: {
+ name: "environment",
+ clickhouseName: "environment_id",
+ ...column("String", { description: "The environment slug", example: "prod" }),
+ fieldMapping: "environment",
+ customRenderType: "environment",
+ },
+ project: {
+ name: "project",
+ clickhouseName: "project_id",
+ ...column("String", {
+ description: "The project reference, they always start with `proj_`.",
+ example: "proj_howcnaxbfxdmwmxazktx",
+ }),
+ fieldMapping: "project",
+ customRenderType: "project",
+ },
+ queue: {
+ name: "queue",
+ clickhouseName: "queue_name",
+ ...column("LowCardinality(String)", {
+ description: "The queue name",
+ example: "my-queue",
+ coreColumn: true,
+ }),
+ },
+ concurrency_key: {
+ name: "concurrency_key",
+ ...column("String", {
+ description: "The concurrency key the run was sharded by (e.g. a tenant id)",
+ example: "tenant-42",
+ coreColumn: true,
+ }),
+ },
+ bucket_start: {
+ name: "bucket_start",
+ ...column("DateTime", {
+ description: "The start of the 10-second aggregation bucket",
+ example: "2024-01-15 09:30:00",
+ coreColumn: true,
+ }),
+ },
+ enqueue_delta: {
+ name: "enqueue_delta",
+ mergeGroupKey: ["queue", "concurrency_key"],
+ ...column("String", {
+ description:
+ "Runs enqueued for this key (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta) grouped by queue and concurrency_key, or with both pinned; never merge across keys.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ started_delta: {
+ name: "started_delta",
+ mergeGroupKey: ["queue", "concurrency_key"],
+ ...column("String", {
+ description:
+ "Runs dequeued/started for this key (throughput). Read with deltaSumTimestampMerge(started_delta) grouped by queue and concurrency_key, or with both pinned; never merge across keys.",
+ coreColumn: true,
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ ack_delta: {
+ name: "ack_delta",
+ mergeGroupKey: ["queue", "concurrency_key"],
+ ...column("String", {
+ description:
+ "Runs acked (completed) for this key. Read with deltaSumTimestampMerge(ack_delta) grouped by queue and concurrency_key, or with both pinned.",
+ }),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ max_queued: {
+ name: "max_queued",
+ ...column("UInt32", {
+ description: "Peak backlog for this key in the bucket. Aggregate with max().",
+ coreColumn: true,
+ fillMode: "carry",
+ }),
+ },
+ max_running: {
+ name: "max_running",
+ ...column("UInt32", {
+ description: "Peak running for this key in the bucket. Aggregate with max().",
+ fillMode: "carry",
+ }),
+ },
+ wait_ms_sum: {
+ name: "wait_ms_sum",
+ ...column("UInt64", {
+ description:
+ "Sum of scheduling delays (ms) for this key. Mean = wait_ms_sum/wait_ms_count.",
+ }),
+ },
+ wait_ms_count: {
+ name: "wait_ms_count",
+ ...column("UInt64", {
+ description: "Count of scheduling-delay samples for this key. Aggregate with sum().",
+ }),
+ },
+ },
+ timeBucketThresholds: [
+ { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } },
+ { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } },
+ { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } },
+ { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } },
+ { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } },
+ { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } },
+ { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } },
+ { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } },
+ ] satisfies BucketThreshold[],
+ queryCache: { ttlSeconds: 30, alignSeconds: 30 },
+};
+
export const querySchemas: TableSchema[] = [
runsSchema,
metricsSchema,
llmMetricsSchema,
llmModelsSchema,
+ queueMetricsSchema,
+ envMetricsSchema,
+ queueMetricsByKeySchema,
];
+/** Schemas shown in user-facing listings (editor autocomplete, schema docs, schema API). */
+export const visibleQuerySchemas: TableSchema[] = querySchemas.filter((s) => !s.hidden);
+
/**
* Default query for the query editor
*/
diff --git a/apps/webapp/app/v3/queueMetrics.server.ts b/apps/webapp/app/v3/queueMetrics.server.ts
new file mode 100644
index 00000000000..14d9c4dc93d
--- /dev/null
+++ b/apps/webapp/app/v3/queueMetrics.server.ts
@@ -0,0 +1,247 @@
+import { type ClickHouse, type QueueMetricsRawV1Input } from "@internal/clickhouse";
+import {
+ allStreamKeys,
+ CachedRedisFlag,
+ CachedRedisNumber,
+ MetricsStreamConsumer,
+ MetricsStreamEmitter,
+ probeShardStates,
+ type MetricDefinition,
+ type ShardState,
+ type StreamEntry,
+} from "@internal/metrics-pipeline";
+import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis";
+import os from "node:os";
+import { env } from "~/env.server";
+import { getDefaultClickhouseClient } from "~/services/clickhouse/clickhouseFactory.server";
+import { logger } from "~/services/logger.server";
+import { signalsEmitter } from "~/services/signals.server";
+import { singleton } from "~/utils/singleton";
+import { mapEntryToRows, QueueNameLimiter } from "./queueMetricsMapping";
+import { meter } from "./tracer.server";
+
+const FLAG_KEY = "queue_metrics:enabled";
+const SAMPLE_RATE_KEY = "queue_metrics:gauge_sample_rate";
+const TRUTHY = new Set(["1", "true", "on", "enabled", "yes"]);
+
+// Same physical Redis as the RunQueue (host/port/auth). Stream keys are kept out of the
+// keyPrefix on every access path, so only the connection details matter here.
+function runQueueRedisOptions(): RedisOptions {
+ return {
+ port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? undefined,
+ host: env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? undefined,
+ username: env.RUN_ENGINE_RUN_QUEUE_REDIS_USERNAME ?? undefined,
+ password: env.RUN_ENGINE_RUN_QUEUE_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.RUN_ENGINE_RUN_QUEUE_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+ };
+}
+
+// Metrics stream Redis: a dedicated instance when QUEUE_METRICS_REDIS_HOST is set (so the
+// metrics backlog never competes with the run queue), else the run-queue Redis. Carries BOTH
+// gauges and counters — gauges are read inside the queue-op Lua and returned on the reply,
+// then XADDed here by Node, so the run-queue Redis holds no metrics stream.
+function metricsRedisOptions(): RedisOptions {
+ if (!env.QUEUE_METRICS_REDIS_HOST) return runQueueRedisOptions();
+ return {
+ host: env.QUEUE_METRICS_REDIS_HOST,
+ port: env.QUEUE_METRICS_REDIS_PORT ?? undefined,
+ username: env.QUEUE_METRICS_REDIS_USERNAME ?? undefined,
+ password: env.QUEUE_METRICS_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.QUEUE_METRICS_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+ };
+}
+
+// One stream family on the metrics Redis carrying both gauge snapshots and cumulative
+// counter readings; one consumer group reads it.
+function metricsDefinition(): MetricDefinition {
+ // A stalled consumer holds up to maxLen entries per shard in Redis memory: cap lower
+ // by default when the stream shares the queue-critical run-queue Redis.
+ const defaultMaxLen = env.QUEUE_METRICS_REDIS_HOST ? 8_000_000 : 2_000_000;
+ return {
+ name: "queue_metrics",
+ shardCount: env.QUEUE_METRICS_STREAM_SHARD_COUNT,
+ consumerGroup: "queue_metrics_cg",
+ maxLen: env.QUEUE_METRICS_COUNTER_STREAM_MAXLEN ?? defaultMaxLen,
+ };
+}
+
+// Dedicated client for the admin read/write/probe surface — works regardless of whether
+// this instance runs the emitter/consumer. keyPrefix unset to match the raw control keys.
+function adminRedis(): Redis {
+ return singleton("queueMetricsAdminRedis", () =>
+ createRedisClient(
+ { ...runQueueRedisOptions(), keyPrefix: undefined },
+ { onError: (error) => logger.error("queue metrics admin redis error", { error }) }
+ )
+ );
+}
+
+function metricsAdminRedis(): Redis {
+ return singleton("queueMetricsCounterAdminRedis", () =>
+ createRedisClient(
+ { ...metricsRedisOptions(), keyPrefix: undefined },
+ { onError: (error) => logger.error("queue metrics counter admin redis error", { error }) }
+ )
+ );
+}
+
+export type QueueMetricsControls = {
+ enabled: boolean;
+ enabledKeySet: boolean;
+ sampleRate: number;
+ sampleRateKeySet: boolean;
+ sampleRateDefault: number;
+};
+
+export async function readQueueMetricsControls(): Promise<QueueMetricsControls> {
+  const [enabledRaw, rateRaw] = (await adminRedis().mget(FLAG_KEY, SAMPLE_RATE_KEY)) as (
+    | string
+    | null
+  )[];
+  const sampleRateDefault = env.QUEUE_METRICS_GAUGE_SAMPLE_RATE;
+  const parsed = rateRaw == null ? Number.NaN : Number(rateRaw); // NaN when unset or unparsable
+  return {
+    enabled: enabledRaw != null && TRUTHY.has(enabledRaw.trim().toLowerCase()),
+    enabledKeySet: enabledRaw != null,
+    sampleRate: Number.isFinite(parsed) ? Math.min(1, Math.max(0, parsed)) : sampleRateDefault,
+    sampleRateKeySet: rateRaw != null,
+    sampleRateDefault,
+  };
+}
+
+export async function writeQueueMetricsControls(update: {
+  enabled?: boolean;
+  sampleRate?: number;
+}): Promise<void> {
+  const client = adminRedis();
+  const ops: Promise<unknown>[] = []; // one SET per field present in the update
+  if (update.enabled !== undefined) {
+    ops.push(client.set(FLAG_KEY, update.enabled ? "1" : "0"));
+  }
+  if (update.sampleRate !== undefined) {
+    ops.push(client.set(SAMPLE_RATE_KEY, String(Math.min(1, Math.max(0, update.sampleRate)))));
+  }
+  await Promise.all(ops);
+}
+
+export type LabeledShardState = ShardState & { stream: "queue_metrics" };
+
+export async function probeQueueMetricsStreams(): Promise<LabeledShardState[]> {
+  const def = metricsDefinition();
+  const states = await probeShardStates(metricsAdminRedis(), allStreamKeys(def), def.consumerGroup);
+  return states.map((s) => ({ ...s, stream: "queue_metrics" as const })); // tag each shard state
+}
+
+/** Injected into the RunQueue when QUEUE_METRICS_EMIT_ENABLED=1; emits only while the flag is on. */
+export function getQueueMetricsEmitter(): MetricsStreamEmitter {
+ return singleton("queueMetricsEmitter", () => {
+ // Control keys stay on the run-queue Redis (the admin surface + docs point there).
+ const controlRedis = runQueueRedisOptions();
+ const flag = new CachedRedisFlag({ redis: controlRedis, key: FLAG_KEY, cacheTtlMs: 10_000 });
+ // Live-tunable (Redis key, 10s cache); the env value is the default when the key is unset.
+ const gaugeSampleRate = new CachedRedisNumber({
+ redis: controlRedis,
+ key: SAMPLE_RATE_KEY,
+ defaultValue: env.QUEUE_METRICS_GAUGE_SAMPLE_RATE,
+ min: 0,
+ max: 1,
+ cacheTtlMs: 10_000,
+ });
+ return new MetricsStreamEmitter({
+ redis: metricsRedisOptions(),
+ definition: metricsDefinition(),
+ flag,
+ meter,
+ gaugeSampleRate,
+ counterOdometerTtlMs: env.QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS * 1000,
+ });
+ });
+}
+
+const queueNameLimiter = singleton(
+ "queueMetricsQueueNameLimiter",
+ () => new QueueNameLimiter(env.QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV)
+);
+
+const concurrencyKeyLimiter = singleton(
+ "queueMetricsConcurrencyKeyLimiter",
+ () => new QueueNameLimiter(env.QUEUE_METRICS_MAX_CONCURRENCY_KEYS_PER_QUEUE, 50_000)
+);
+
+function mapEntry(entry: StreamEntry): QueueMetricsRawV1Input[] {
+ return mapEntryToRows(entry, {
+ queueNames: queueNameLimiter,
+ concurrencyKeys: concurrencyKeyLimiter,
+ });
+}
+
+function makeInsert(): (
+  rows: QueueMetricsRawV1Input[],
+  opts: { dedupToken: string }
+) => Promise<void> {
+  const ch: ClickHouse = getDefaultClickhouseClient();
+  const insertRaw = ch.queueMetrics.insertRaw;
+  return async (rows, { dedupToken }) => {
+    const [error] = await insertRaw(rows, {
+      params: {
+        clickhouse_settings: {
+          insert_deduplication_token: dedupToken,
+          async_insert: 0,
+          // Propagate the token through the MV so a raw-deduped retry can't leave
+          // queue_metrics_v1 short when the MV insert failed on the first attempt.
+          deduplicate_blocks_in_dependent_materialized_views: 1,
+        },
+      },
+    });
+    if (error) throw error; // insertRaw reports failure in its result tuple; rethrow it
+  };
+}
+
+function getQueueMetricsConsumers(): MetricsStreamConsumer[] {
+ return singleton("queueMetricsConsumers", () => {
+ const insert = makeInsert();
+ return [
+ new MetricsStreamConsumer({
+ consumerName: `${os.hostname()}-${process.pid}`,
+ batchSize: env.QUEUE_METRICS_CONSUMER_BATCH_SIZE,
+ meter,
+ mapEntry,
+ insert,
+ redis: metricsRedisOptions(),
+ definition: metricsDefinition(),
+ }),
+ ];
+ });
+}
+
+// Build the emitter eagerly at boot instead of on the first enqueue, so its flag has warmed
+// before any traffic; otherwise the first op after boot reads the default and is dropped.
+// Does nothing unless QUEUE_METRICS_EMIT_ENABLED is "1".
+export function initQueueMetricsEmitter(): void {
+  if (env.QUEUE_METRICS_EMIT_ENABLED === "1") getQueueMetricsEmitter();
+}
+
+declare global {
+ // eslint-disable-next-line no-var
+ var __queueMetricsConsumerRegistered__: boolean | undefined;
+}
+
+export function initQueueMetricsConsumer(): void {
+  const enabled = env.QUEUE_METRICS_CONSUMER_ENABLED === "1";
+  if (!enabled || global.__queueMetricsConsumerRegistered__) return;
+  global.__queueMetricsConsumerRegistered__ = true; // later calls return early
+
+  const consumers = getQueueMetricsConsumers();
+  const stopAll = () =>
+    Promise.all(consumers.map((consumer) => consumer.stop())).catch((error) =>
+      logger.error("queue metrics consumer stop failed", { error })
+    );
+  signalsEmitter.on("SIGTERM", stopAll);
+  signalsEmitter.on("SIGINT", stopAll);
+
+  Promise.all(consumers.map((consumer) => consumer.start()))
+    .then(() => logger.info("Queue metrics consumer started"))
+    .catch((error) => logger.error("queue metrics consumers failed to start", { error }));
+}
diff --git a/apps/webapp/app/v3/queueMetricsMapping.ts b/apps/webapp/app/v3/queueMetricsMapping.ts
new file mode 100644
index 00000000000..9433b361a88
--- /dev/null
+++ b/apps/webapp/app/v3/queueMetricsMapping.ts
@@ -0,0 +1,164 @@
+import { type QueueMetricsRawV1Input } from "@internal/clickhouse";
+import { entryOrderKey, entryTimeMs, type StreamEntry } from "@internal/metrics-pipeline";
+
+// Stream-entry `op` values that mapEntryToRows accepts; entries with any other op map to no rows.
+const OPS = new Set(["gauge", "enqueue", "started", "ack", "nack", "dlq"]);
+
+// Queue descriptors have the shape
+//   {org:ORGID}:proj:PROJECTID:env:ENVID:queue:QUEUENAME[:ck:CK]
+// The pattern is anchored instead of splitting on ":" so a queue name containing ":"
+// survives. The name capture is lazy, so it stops before an optional ":ck:" suffix, and
+// that suffix is captured in its own group.
+const DESCRIPTOR = /^\{org:([^}]+)\}:proj:([^:]+):env:([^:]+):queue:(.+?)(?::ck:(.+))?$/;
+
+/**
+ * Parses a run-queue descriptor string into its tenant / queue components.
+ *
+ * @param q descriptor string in the format shown above
+ * @returns the parsed ids, or `null` when `q` does not match. `concurrency_key` is ""
+ *   when no ":ck:" suffix is present or when the suffix is the "*" wildcard (used by
+ *   aggregate CK-dequeue gauges, which map to no specific key).
+ */
+export function descriptorFromQueue(q: string): {
+  organization_id: string;
+  project_id: string;
+  environment_id: string;
+  queue_name: string;
+  concurrency_key: string;
+} | null {
+  const groups = DESCRIPTOR.exec(q);
+  if (groups === null) return null;
+
+  const [, organizationId, projectId, environmentId, queueName, rawKey] = groups;
+  const hasSpecificKey = Boolean(rawKey) && rawKey !== "*";
+
+  return {
+    organization_id: organizationId!,
+    project_id: projectId!,
+    environment_id: environmentId!,
+    queue_name: queueName!,
+    concurrency_key: hasSpecificKey ? rawKey! : "",
+  };
+}
+
+/** Replacement name returned by QueueNameLimiter once a scope has reached its cap. */
+export const OVERFLOW_QUEUE_NAME = "__overflow__";
+
+/**
+ * Bounds per-scope name cardinality (both queue_name per env and concurrency_key per
+ * queue are user-controlled GROUP BY keys). Names beyond the cap map to OVERFLOW_QUEUE_NAME.
+ * Per-process and reset on restart, so the cap is approximate: a protective bound, not a quota.
+ *
+ * The number of tracked scopes is bounded too: once `maxScopes` scopes exist, admitting a
+ * new scope evicts the one that was inserted first, and its remembered names are forgotten.
+ */
+export class QueueNameLimiter {
+  /** scope -> names admitted for that scope (Map iteration order = insertion order). */
+  private readonly byScope = new Map<string, Set<string>>();
+
+  /**
+   * @param maxPerScope maximum distinct names per scope; `<= 0` disables limiting
+   * @param maxScopes maximum number of scopes remembered at once
+   */
+  constructor(
+    private readonly maxPerScope: number,
+    private readonly maxScopes = 10_000
+  ) {}
+
+  /**
+   * Returns `name` if it is already admitted for `scope` or there is still room for it,
+   * otherwise OVERFLOW_QUEUE_NAME.
+   */
+  limit(scope: string, name: string): string {
+    if (this.maxPerScope <= 0) return name;
+    let names = this.byScope.get(scope);
+    if (!names) {
+      if (this.byScope.size >= this.maxScopes) {
+        // Drop the earliest-inserted scope to make room for the new one.
+        const oldest = this.byScope.keys().next().value;
+        if (oldest !== undefined) this.byScope.delete(oldest);
+      }
+      names = new Set<string>();
+      this.byScope.set(scope, names);
+    }
+    if (names.has(name)) return name;
+    if (names.size >= this.maxPerScope) return OVERFLOW_QUEUE_NAME;
+    names.add(name);
+    return name;
+  }
+}
+
+/**
+ * Parses an optional stream field as a finite number.
+ *
+ * @returns the parsed number, or `undefined` when the field is absent, blank, or not a
+ *   finite number.
+ */
+function num(value: string | undefined): number | undefined {
+  if (value == null) return undefined;
+  // Number("") and Number("   ") are 0; a blank field is "no reading", not a zero reading.
+  if (value.trim() === "") return undefined;
+  const n = Number(value);
+  return Number.isFinite(n) ? n : undefined;
+}
+
+// Optional cardinality limiters applied by mapEntryToRows; either may be omitted.
+export type QueueMetricsLimiters = {
+ // Caps distinct queue names; mapEntryToRows scopes it by environment id.
+ queueNames?: QueueNameLimiter;
+ // Caps distinct concurrency keys; mapEntryToRows scopes it by "<environment id>:<queue name>".
+ concurrencyKeys?: QueueNameLimiter;
+};
+
+/**
+ * Maps one stream entry to 0..2 raw rows.
+ *
+ * - Unknown ops, entries without `q`, and unparsable descriptors map to no rows.
+ * - Gauges are single rows carrying their parsed concurrency_key.
+ * - A counter entry yields a base row when `cum` is present plus a per-key row when
+ *   `ck`/`ckcum` are present (the emitter's dual-odometer entry). Baseline entries carry
+ *   only one of the two, by design.
+ * - When the queue-name limiter overflows, the gauge is kept under OVERFLOW_QUEUE_NAME
+ *   (without a key) and counters are dropped.
+ *
+ * @param entry stream entry; `entry.id` supplies the event time and order key
+ * @param limiters optional cardinality limiters for queue names and concurrency keys
+ * @returns the raw rows to insert (possibly empty)
+ */
+export function mapEntryToRows(
+  entry: StreamEntry,
+  limiters?: QueueMetricsLimiters
+): QueueMetricsRawV1Input[] {
+  const f = entry.fields;
+  const op = f.op;
+  if (!op || !OPS.has(op) || !f.q) return [];
+  const descriptor = descriptorFromQueue(f.q);
+  if (!descriptor || !descriptor.queue_name) return [];
+
+  let queueOverflowed = false;
+  if (limiters?.queueNames) {
+    descriptor.queue_name = limiters.queueNames.limit(
+      descriptor.environment_id,
+      descriptor.queue_name
+    );
+    queueOverflowed = descriptor.queue_name === OVERFLOW_QUEUE_NAME;
+  }
+
+  // Counter entries carry the key as a field (q is base-normalized); gauges carry it in q.
+  let ck = descriptor.concurrency_key || (typeof f.ck === "string" ? f.ck : "");
+  if (queueOverflowed) {
+    // Overflowed queue names share one row; per-key attribution under them is meaningless.
+    // Skip the key limiter entirely so the shared overflow name does not occupy (or evict)
+    // a limiter scope that a real queue could use.
+    ck = "";
+  } else if (ck && limiters?.concurrencyKeys) {
+    const scope = `${descriptor.environment_id}:${descriptor.queue_name}`;
+    if (limiters.concurrencyKeys.limit(scope, ck) === OVERFLOW_QUEUE_NAME) ck = "";
+  }
+
+  // Event time comes from the stream id when it can be parsed, otherwise "now"; it is
+  // formatted as "YYYY-MM-DD HH:MM:SS" (UTC, second precision).
+  const eventMs = entryTimeMs(entry.id) ?? Date.now();
+  const eventTime = new Date(eventMs).toISOString().slice(0, 19).replace("T", " ");
+  const base = {
+    organization_id: descriptor.organization_id,
+    project_id: descriptor.project_id,
+    environment_id: descriptor.environment_id,
+    queue_name: descriptor.queue_name,
+    event_time: eventTime,
+    op: op as QueueMetricsRawV1Input["op"],
+  };
+
+  if (op === "gauge") {
+    return [
+      {
+        ...base,
+        concurrency_key: ck,
+        queued: num(f.ql),
+        running: num(f.cc),
+        queue_limit: num(f.lim),
+        env_queued: num(f.eql),
+        env_running: num(f.ec),
+        env_limit: num(f.elim),
+        throttled: num(f.thr),
+        ck_backlogged: num(f.ckq),
+        ck_max_wait_ms: num(f.ckw),
+      },
+    ];
+  }
+
+  // Overflowed names drop counters entirely: merging distinct odometers under one shared
+  // name produces garbage deltas (gauges above stay, max across the overflow set is
+  // still meaningful).
+  if (queueOverflowed) return [];
+
+  const rows: QueueMetricsRawV1Input[] = [];
+  const orderKey = entryOrderKey(entry.id);
+  // Only "started" entries carry a wait time.
+  const waitMs = op === "started" && f.wait != null ? num(f.wait) : undefined;
+  if (f.cum != null) {
+    rows.push({
+      ...base,
+      cumulative: num(f.cum),
+      order_key: orderKey,
+      ...(waitMs !== undefined ? { wait_ms: waitMs } : {}),
+    });
+  }
+  if (ck && f.ckcum != null) {
+    rows.push({
+      ...base,
+      concurrency_key: ck,
+      cumulative: num(f.ckcum),
+      order_key: orderKey,
+      ...(waitMs !== undefined ? { wait_ms: waitMs } : {}),
+    });
+  }
+  return rows;
+}
diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts
index 4d9e263d6be..85986933290 100644
--- a/apps/webapp/app/v3/runEngine.server.ts
+++ b/apps/webapp/app/v3/runEngine.server.ts
@@ -7,6 +7,7 @@ import { logger } from "~/services/logger.server";
import { defaultMachine, getCurrentPlan } from "~/services/platform.v3.server";
import { singleton } from "~/utils/singleton";
import { allMachines } from "./machinePresets.server";
+import { getQueueMetricsEmitter } from "./queueMetrics.server";
import { runEnginePendingVersionLookup } from "./runEnginePendingVersionLookup.server";
import { pickRunOpsStoreForCompletion } from "./runOpsMigration/crossSeamGuard.server";
import { runEngineControlPlaneResolver } from "./runOpsMigration/runEngineControlPlaneResolver.server";
@@ -83,6 +84,7 @@ function createRunEngine() {
tracer,
},
shardCount: env.RUN_ENGINE_RUN_QUEUE_SHARD_COUNT,
+ queueMetrics: env.QUEUE_METRICS_EMIT_ENABLED === "1" ? getQueueMetricsEmitter() : undefined,
processWorkerQueueDebounceMs: env.RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS,
dequeueBlockingTimeoutSeconds: env.RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS,
masterQueueConsumersIntervalMs: env.RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS,
diff --git a/apps/webapp/package.json b/apps/webapp/package.json
index 643093624b4..90dc92447f7 100644
--- a/apps/webapp/package.json
+++ b/apps/webapp/package.json
@@ -17,6 +17,7 @@
"typecheck": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" tsc --noEmit -p ./tsconfig.check.json",
"db:seed": "tsx seed.ts",
"db:seed:ai-spans": "tsx seed-ai-spans.mts",
+ "db:seed:queue-metrics": "tsx seed-queue-metrics.mts",
"upload:sourcemaps": "bash ./upload-sourcemaps.sh",
"test": "vitest --no-file-parallelism",
"eval:dev": "evalite watch"
@@ -57,6 +58,7 @@
"@internal/dashboard-agent": "workspace:*",
"@internal/dashboard-agent-db": "workspace:*",
"@internal/llm-model-catalog": "workspace:*",
+ "@internal/metrics-pipeline": "workspace:*",
"@internal/redis": "workspace:*",
"@internal/run-engine": "workspace:*",
"@internal/run-ops-database": "workspace:*",
diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts
new file mode 100644
index 00000000000..709ba8f25ed
--- /dev/null
+++ b/apps/webapp/seed-queue-metrics.mts
@@ -0,0 +1,947 @@
+import { prisma } from "./app/db.server";
+import { createOrganization } from "./app/models/organization.server";
+import { createProject } from "./app/models/project.server";
+import { ClickHouse } from "@internal/clickhouse";
+import type { QueueMetricsRawV1Input } from "@internal/clickhouse";
+import { generateFriendlyId } from "./app/v3/friendlyIdentifiers";
+
+// Queue metrics simulator: writes realistic raw rows into a synthetic tenant's
+// queue_metrics_raw_v1 and lets the MV build queue_metrics_v1 (the same path the real
+// consumer uses), so the dashboard can be built without the run engine. See TRI-10407.
+
+const ORG_TITLE = "Queue Metrics Dev";
+const PROJECT_NAME = "queue-metrics-demo";
+
+// Random source used throughout the simulator (see mulberry32, which yields values in [0, 1)).
+type Rng = () => number;
+// One simulated queue: its concurrency limit, arrival process and wait-time baseline.
+type QueueProfile = {
+ name: string;
+ // Queue concurrency limit for a given bucket index (emitted as the gauge's queue_limit).
+ limit: (bucket: number) => number;
+ arrivals: (bucket: number, rng: Rng) => number; // expected new runs enqueued this bucket
+ // Floor of the median wait in ms; simulateBucket adds a backlog-proportional term on top.
+ waitBaseMs: number;
+ sparse?: boolean; // emit no rows when the queue is fully idle (tests carry-forward gaps)
+ // Concurrency-key queue: adds CK-health gauge fields + live ckIndex staging (--usage)
+ ck?: {
+ backlogged: (bucket: number, rng: Rng) => number;
+ maxWaitMs: (bucket: number, rng: Rng) => number;
+ };
+};
+// A named simulation setup: an environment-wide concurrency limit plus the queues sharing it.
+type Scenario = {
+ description: string;
+ envLimit: (bucket: number) => number;
+ queues: QueueProfile[];
+};
+
+// ---------------------------------------------------------------------------
+// CLI args
+// ---------------------------------------------------------------------------
+
+/**
+ * Minimal `--flag [value]` parser.
+ *
+ * A token starting with "--" becomes a key (without the dashes). If the next token exists
+ * and does not itself start with "--", it is consumed as the value; otherwise the flag is
+ * recorded as the string "true". Tokens that are neither a flag nor a consumed value are
+ * ignored.
+ *
+ * @param argv arguments after the script name
+ * @returns map of flag name to its string value
+ */
+function parseArgs(argv: string[]) {
+  const flags: Record<string, string> = {};
+  for (let i = 0; i < argv.length; i++) {
+    const t = argv[i];
+    if (t.startsWith("--")) {
+      const k = t.slice(2);
+      const n = argv[i + 1];
+      if (n && !n.startsWith("--")) {
+        flags[k] = n;
+        i++; // the value token has been consumed
+      } else flags[k] = "true";
+    }
+  }
+  return flags;
+}
+
+/**
+ * Converts a duration such as "90", "30m", "6h" or "1d" to seconds.
+ * A bare number is taken as seconds; whitespace between number and unit is allowed.
+ *
+ * @throws Error when the input is not digits followed by an optional s/m/h/d suffix
+ */
+function parseDuration(s: string): number {
+  const parsed = s.match(/^(\d+)\s*(s|m|h|d)?$/);
+  if (parsed === null) throw new Error(`bad duration: ${s}`);
+
+  const [, digits, suffix = "s"] = parsed;
+  const secondsPerUnit: Record<string, number> = { s: 1, m: 60, h: 3600, d: 86400 };
+  return Number(digits) * secondsPerUnit[suffix]!;
+}
+
+// ---------------------------------------------------------------------------
+// Deterministic RNG + distributions
+// ---------------------------------------------------------------------------
+
+// Seeded pseudo-random generator: the same seed always yields the same sequence.
+// Each call advances a 32-bit state and returns a float in [0, 1) (a uint32 divided by 2^32).
+function mulberry32(seed: number): Rng {
+ let a = seed >>> 0; // coerce the seed to an unsigned 32-bit integer
+ return () => {
+ a |= 0;
+ a = (a + 0x6d2b79f5) | 0;
+ let t = Math.imul(a ^ (a >>> 15), 1 | a);
+ t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
+ return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
+ };
+}
+
+/**
+ * Draws one sample from the standard normal distribution using the Box–Muller transform.
+ * Both uniforms are re-drawn while they are exactly 0, so `Math.log` never sees 0.
+ */
+function standardNormal(rng: Rng): number {
+  let first: number;
+  do {
+    first = rng();
+  } while (first === 0);
+
+  let second: number;
+  do {
+    second = rng();
+  } while (second === 0);
+
+  const radius = Math.sqrt(-2 * Math.log(first));
+  return radius * Math.cos(2 * Math.PI * second);
+}
+
+/**
+ * Draws a log-normally distributed value whose median is `medianMs` (clamped to at least 1)
+ * and whose log-space standard deviation is `sigma`.
+ */
+function lognormal(medianMs: number, sigma: number, rng: Rng): number {
+  const logMedian = Math.log(Math.max(medianMs, 1));
+  const z = standardNormal(rng);
+  return Math.exp(logMedian + sigma * z);
+}
+
+/**
+ * Draws a Poisson-distributed count with mean `lambda`.
+ *
+ * - `lambda <= 0` returns 0 without consuming randomness.
+ * - `lambda > 30` uses a rounded normal approximation, clamped at 0.
+ * - Otherwise uniforms are multiplied together until the product falls to exp(-lambda)
+ *   or below; the result is the number of draws minus one.
+ */
+function poisson(lambda: number, rng: Rng): number {
+  if (lambda <= 0) return 0;
+
+  if (lambda > 30) {
+    const noise = standardNormal(rng) * Math.sqrt(lambda);
+    return Math.max(0, Math.round(lambda + noise));
+  }
+
+  const threshold = Math.exp(-lambda);
+  let product = rng();
+  let count = 0;
+  while (product > threshold) {
+    count++;
+    product *= rng();
+  }
+  return count;
+}
+
+/** Formats a date as "YYYY-MM-DD HH:MM:SS" in UTC, dropping milliseconds. */
+function formatChDateTime(date: Date): string {
+  const iso = date.toISOString();
+  const toTheSecond = iso.substring(0, 19);
+  return toTheSecond.replace("T", " ");
+}
+
+// ---------------------------------------------------------------------------
+// Scenarios
+// ---------------------------------------------------------------------------
+
+// Three constant-rate queues. Each tuple is [name, concurrency limit, mean arrivals per
+// bucket, waitBaseMs].
+const steady = (): QueueProfile[] => {
+  const specs: Array<[string, number, number, number]> = [
+    ["emails", 20, 12, 40],
+    ["webhooks", 15, 9, 40],
+    ["reports", 10, 5, 60],
+  ];
+  return specs.map(([name, limit, rate, waitBaseMs]) => ({
+    name,
+    limit: () => limit,
+    arrivals: (_b: number, r: Rng) => poisson(rate, r),
+    waitBaseMs,
+  }));
+};
+
+// Queue with periodic bursts: in the first 4 buckets of every 30-bucket cycle the mean
+// arrival rate is 5x `base`; the rest of the cycle it is `base`.
+const bursty = (name: string, limit: number, base: number): QueueProfile => {
+  const inBurst = (bucket: number) => bucket % 30 < 4;
+  return {
+    name,
+    limit: () => limit,
+    arrivals: (bucket, rng) => {
+      const mean = inBurst(bucket) ? base * 5 : base;
+      return poisson(mean, rng);
+    },
+    waitBaseMs: 50,
+  };
+};
+
+// Scenario builders keyed by the --scenario flag value. Each builder receives the total
+// number of buckets being simulated (and the bucket size in seconds) so a scenario can
+// shape itself relative to the whole window, e.g. "heavy for the first third".
+const scenarios: Record<string, (totalBuckets: number, bucketSec: number) => Scenario> = {
+  steady: () => ({
+    description: "all queues below capacity, no throttling",
+    envLimit: () => 60,
+    queues: steady(),
+  }),
+
+  burst: () => ({
+    description: "periodic arrival bursts -> backlog + wait spikes + throttling",
+    envLimit: () => 60,
+    queues: [bursty("ingest", 20, 6), bursty("transform", 20, 7)],
+  }),
+
+  // Tela case: sum of per-queue limits far exceeds the env limit, so queues compete.
+  "over-allocated-env": () => ({
+    description: "Sum(queue limits)=120 >> env limit=40; env saturates, queues env-limited",
+    envLimit: () => 40,
+    queues: Array.from({ length: 6 }, (_v, i) => ({
+      name: `worker-${i + 1}`,
+      limit: () => 20,
+      arrivals: (_b: number, r: Rng) => poisson(14, r),
+      waitBaseMs: 50,
+    })),
+  }),
+
+  "single-queue-starves-others": () => ({
+    description: "one greedy queue consumes most of a small env limit, starving the rest",
+    envLimit: () => 30,
+    queues: [
+      { name: "greedy", limit: () => 40, arrivals: (_b, r) => poisson(45, r), waitBaseMs: 60 },
+      { name: "polite-1", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
+      { name: "polite-2", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
+    ],
+  }),
+
+  "throttled-backlog": () => ({
+    description:
+      "arrival rate persistently above the queue limit -> permanent backlog + throttling",
+    envLimit: () => 50,
+    queues: [
+      { name: "overloaded", limit: () => 10, arrivals: (_b, r) => poisson(16, r), waitBaseMs: 80 },
+    ],
+  }),
+
+  "idle-sparse": () => ({
+    description: "sparse arrivals with many empty buckets (carry-forward gaps)",
+    envLimit: () => 50,
+    queues: Array.from({ length: 4 }, (_v, i) => ({
+      name: `sparse-${i + 1}`,
+      limit: () => 5,
+      arrivals: (_b: number, r: Rng) => (r() < 0.12 ? poisson(3, r) : 0),
+      waitBaseMs: 30,
+      sparse: true,
+    })),
+  }),
+
+  "spike-then-drain": (totalBuckets) => ({
+    description: "heavy arrivals for the first third, then zero; backlog builds then drains",
+    envLimit: () => 60,
+    queues: [
+      {
+        name: "batch-job",
+        limit: () => 15,
+        arrivals: (b, r) => (b < totalBuckets / 3 ? poisson(30, r) : 0),
+        waitBaseMs: 70,
+      },
+    ],
+  }),
+
+  // Pagination + relevance-ranking design surface: one runaway queue, a busy-but-healthy
+  // head, a bursty middle, and a long sparse tail across 61 queues (the list pages at 25).
+  "many-queues": () => ({
+    description:
+      "61 queues: one runaway, busy head, bursty middle, long sparse tail (pagination + ranking)",
+    envLimit: () => 150,
+    queues: [
+      { name: "imports", limit: () => 8, arrivals: (_b, r) => poisson(14, r), waitBaseMs: 80 },
+      ...["checkout", "notifications", "emails"].map((name, i) => ({
+        name,
+        limit: () => 15,
+        arrivals: (_b: number, r: Rng) => poisson(7 + i, r),
+        waitBaseMs: 60,
+      })),
+      ...Array.from({ length: 12 }, (_v, i) =>
+        bursty(`service-${String(i + 1).padStart(2, "0")}`, 10, 2)
+      ),
+      ...Array.from({ length: 20 }, (_v, i) => ({
+        name: `job-${String(i + 1).padStart(2, "0")}`,
+        limit: () => 5,
+        arrivals: (_b: number, r: Rng) => poisson(1, r),
+        waitBaseMs: 40,
+      })),
+      ...Array.from({ length: 25 }, (_v, i) => ({
+        name: `tenant-${String(i + 1).padStart(2, "0")}`,
+        limit: () => 3,
+        arrivals: (_b: number, r: Rng) => (r() < 0.05 ? poisson(2, r) : 0),
+        waitBaseMs: 30,
+        sparse: true,
+      })),
+    ],
+  }),
+
+  // Per-tenant concurrency keys: a hog tenant periodically floods the queue and starves
+  // the others, so the CK charts (keys with backlog, most-starved wait) and the live
+  // per-key table on the queue detail page have something to show. Use with --usage.
+  "tenant-hotspot": () => ({
+    description:
+      "CK queue where a hog tenant starves others: CK charts + live key table (use --usage)",
+    envLimit: () => 40,
+    queues: [
+      {
+        name: "per-tenant",
+        limit: () => 10,
+        arrivals: (b, r) => poisson(b % 60 < 20 ? 25 : 8, r),
+        waitBaseMs: 60,
+        ck: {
+          backlogged: (b, r) => (b % 60 < 20 ? 6 + Math.round(r() * 6) : Math.round(r() * 3)),
+          maxWaitMs: (b, r) =>
+            b % 60 < 20
+              ? Math.round(lognormal(90_000, 0.5, r))
+              : Math.round(lognormal(3_000, 0.6, r)),
+        },
+      },
+      { name: "background", limit: () => 10, arrivals: (_b, r) => poisson(5, r), waitBaseMs: 40 },
+    ],
+  }),
+
+  // Default: one env with a variety of queue behaviours + occasional env saturation.
+  mixed: (totalBuckets) => ({
+    description: "variety of queue profiles in one env, with occasional env saturation",
+    envLimit: (b) => (b % 40 < 12 ? 45 : 70), // dips low periodically to flip env saturation
+    queues: [
+      { name: "emails", limit: () => 20, arrivals: (_b, r) => poisson(12, r), waitBaseMs: 40 },
+      bursty("webhooks", 20, 6),
+      { name: "reports", limit: () => 10, arrivals: (_b, r) => poisson(8, r), waitBaseMs: 80 },
+      {
+        name: "cleanup",
+        limit: () => 5,
+        arrivals: (_b, r) => (r() < 0.12 ? poisson(3, r) : 0),
+        waitBaseMs: 30,
+        sparse: true,
+      },
+      {
+        name: "nightly-batch",
+        limit: () => 15,
+        arrivals: (b, r) => (b < totalBuckets / 5 ? poisson(18, r) : 0),
+        waitBaseMs: 70,
+      },
+    ],
+  }),
+};
+
+// ---------------------------------------------------------------------------
+// Simulation
+// ---------------------------------------------------------------------------
+
+// Tenant identifiers stamped onto every simulated row.
+type Ids = { organization_id: string; project_id: string; environment_id: string };
+// Log-space spread of simulated wait times (passed to lognormal as sigma).
+const WAIT_SIGMA = 0.6;
+// Share of started runs that end in a nack / dlq instead of an ack.
+const NACK_RATE = 0.02;
+const DLQ_RATE = 0.004;
+
+type CounterOp = "enqueue" | "started" | "ack" | "nack" | "dlq";
+// Per-(queue, op) odometers, mirroring the production emitter: cumulative readings with a
+// cum=0 baseline on the first one, so deltaSumTimestamp captures the 0->1 delta.
+// Indexed by the queue's position in the scenario's `queues` array.
+type CounterState = Record<CounterOp, number>[];
+
+/**
+ * Advances queue `q`'s odometer for `op` by one and returns the raw rows for that step.
+ *
+ * The first time an op is seen for a queue (odometer still 0), a `cumulative: 0` baseline
+ * row is emitted before the incremented reading, so the result has one or two rows.
+ * `orderKey` is called once per emitted row, baseline first. `wait_ms` is attached only
+ * to the incremented reading, and only when provided.
+ */
+function counterRows(
+  counters: CounterState,
+  q: number,
+  ids: Ids,
+  queueName: string,
+  eventTime: string,
+  orderKey: () => number,
+  op: CounterOp,
+  wait_ms?: number
+): QueueMetricsRawV1Input[] {
+  const odometer = counters[q];
+  const shared = { ...ids, queue_name: queueName, event_time: eventTime, op };
+  const emitted: QueueMetricsRawV1Input[] = [];
+
+  const needsBaseline = odometer[op] === 0;
+  if (needsBaseline) {
+    emitted.push({ ...shared, cumulative: 0, order_key: orderKey() });
+  }
+
+  odometer[op] += 1;
+  const reading: QueueMetricsRawV1Input = {
+    ...shared,
+    cumulative: odometer[op],
+    order_key: orderKey(),
+  };
+  if (wait_ms !== undefined) reading.wait_ms = wait_ms;
+  emitted.push(reading);
+
+  return emitted;
+}
+
+/** Creates `n` independent odometer records, one per queue, with every op at 0. */
+function newCounterState(n: number): CounterState {
+  const state: CounterState = [];
+  for (let i = 0; i < n; i++) {
+    state.push({ enqueue: 0, started: 0, ack: 0, nack: 0, dlq: 0 });
+  }
+  return state;
+}
+
+// Per-key simulation for CK profiles: 12 tenants (tenant-01 is the hog, matching
+// stageRedisUsage), per-tenant backlog drained round-robin, per-tenant odometers.
+const CK_TENANT_COUNT = 12;
+// `backlog[t]` is tenant t's queued count; `counters` holds tenant t's per-op odometers.
+type CkSimState = { backlog: number[]; counters: Map<number, Record<CounterOp, number>> };
+// Keyed by the queue's index in the scenario. Module-level, so it persists across buckets.
+const ckSim = new Map<number, CkSimState>();
+
+/** Maps a zero-based tenant index to its concurrency key, e.g. 0 -> "tenant-01". */
+function ckTenantName(t: number): string {
+  const ordinal = String(t + 1).padStart(2, "0");
+  return "tenant-" + ordinal;
+}
+
+/**
+ * Per-tenant counterpart of counterRows: advances `tenant`'s odometer for `op` by one and
+ * returns the rows, each tagged with that tenant's concurrency key.
+ *
+ * The tenant's odometer record is created on first use. As with the base counters, the
+ * first reading of an op is preceded by a `cumulative: 0` baseline row, `orderKey` is
+ * called once per emitted row, and `wait_ms` is attached only to the incremented reading.
+ */
+function ckCounterRows(
+  state: CkSimState,
+  tenant: number,
+  ids: Ids,
+  queueName: string,
+  eventTime: string,
+  orderKey: () => number,
+  op: CounterOp,
+  wait_ms?: number
+): QueueMetricsRawV1Input[] {
+  let odometer = state.counters.get(tenant);
+  if (odometer === undefined) {
+    odometer = { enqueue: 0, started: 0, ack: 0, nack: 0, dlq: 0 };
+    state.counters.set(tenant, odometer);
+  }
+
+  const shared = {
+    ...ids,
+    queue_name: queueName,
+    concurrency_key: ckTenantName(tenant),
+    event_time: eventTime,
+  };
+  const emitted: QueueMetricsRawV1Input[] = [];
+
+  if (odometer[op] === 0) {
+    emitted.push({ ...shared, op, cumulative: 0, order_key: orderKey() });
+  }
+
+  odometer[op] += 1;
+  const reading: QueueMetricsRawV1Input = {
+    ...shared,
+    op,
+    cumulative: odometer[op],
+    order_key: orderKey(),
+  };
+  if (wait_ms !== undefined) reading.wait_ms = wait_ms;
+  emitted.push(reading);
+
+  return emitted;
+}
+
+// Advance one bucket of the simulation for every queue, returning the raw rows to insert.
+// `backlog` and `counters` are mutated in place so state carries across buckets (and into
+// live mode).
+//
+// Per bucket, in order:
+//   1. every queue draws its arrivals (capped at 500) and computes how much concurrency
+//      it wants: min(queue limit, backlog including the new arrivals);
+//   2. if the queues together want more than the env limit, each share is scaled down
+//      proportionally and floored;
+//   3. each queue emits one gauge row, one enqueue counter step per arrival, and one
+//      started + ack/nack/dlq counter step per run that started;
+//   4. CK profiles additionally emit per-tenant counters and per-tenant gauges.
+// The order of `rng` and `orderKey` calls determines the output, so statements here are
+// order-sensitive.
+function simulateBucket(
+ scenario: Scenario,
+ bucket: number,
+ bucketSec: number,
+ eventTime: string,
+ bucketEpochSec: number,
+ ids: Ids,
+ backlog: number[],
+ counters: CounterState,
+ rng: Rng
+): QueueMetricsRawV1Input[] {
+ const envLimit = scenario.envLimit(bucket);
+ const n = scenario.queues.length;
+
+ const limit = new Array(n);
+ // Holds one { arrivals, prior, want } record per queue (hence the `any` casts below).
+ const desired = new Array(n);
+ for (let q = 0; q < n; q++) {
+ limit[q] = scenario.queues[q].limit(bucket);
+ const arrivals = Math.min(500, scenario.queues[q].arrivals(bucket, rng));
+ const prior = backlog[q]; // backlog carried from earlier buckets, before this bucket's arrivals
+ backlog[q] += arrivals; // arrivals join the backlog; recorded as enqueues below
+ (desired as any)[q] = { arrivals, prior, want: Math.min(limit[q], backlog[q]) };
+ }
+
+ // Env cap: if the queues collectively want more concurrency than the env allows, scale down.
+ const sumWant = desired.reduce((s: number, d: any) => s + d.want, 0);
+ const scale = sumWant > envLimit && sumWant > 0 ? envLimit / sumWant : 1;
+
+ const running = new Array(n);
+ const queued = new Array(n);
+ let envRunning = 0;
+ let envQueued = 0;
+ for (let q = 0; q < n; q++) {
+ const d = desired[q] as any;
+ running[q] = Math.floor(d.want * scale);
+ queued[q] = backlog[q] - running[q];
+ envRunning += running[q];
+ envQueued += queued[q];
+ }
+
+ // Order keys are time-based (like the production stream ids) so appended runs and live
+ // mode stay monotonic; the per-bucket sequence keeps them unique within a bucket.
+ let bucketSeq = 0;
+ const orderKey = () => bucketEpochSec * 1_000_000 + bucketSeq++;
+
+ const rows: QueueMetricsRawV1Input[] = [];
+ for (let q = 0; q < n; q++) {
+ const profile = scenario.queues[q];
+ // Every run counted as running in this bucket is treated as having started in it.
+ const started = running[q];
+ const arrivals = (desired[q] as any).arrivals as number;
+ const prior = (desired[q] as any).prior as number; // depth a starting run actually queued behind
+ backlog[q] = queued[q]; // carry the unserved remainder forward
+
+ if (profile.sparse && arrivals === 0 && started === 0 && prior === 0) {
+ continue; // fully idle: leave a gap so carry-forward is exercised
+ }
+
+ // CK-health fields stay coherent with the depth: no queued runs means no backlogged keys.
+ // When runs are queued, the profile's value is clamped to the range [1, queued].
+ const ckBacklogged = profile.ck
+ ? queued[q] > 0
+ ? Math.max(1, Math.min(profile.ck.backlogged(bucket, rng), queued[q]))
+ : 0
+ : undefined;
+ // maxWaitMs is only sampled when at least one key is backlogged.
+ const ckMaxWaitMs =
+ profile.ck && ckBacklogged ? Math.round(profile.ck.maxWaitMs(bucket, rng)) : undefined;
+
+ const gauge: QueueMetricsRawV1Input = {
+ ...ids,
+ queue_name: profile.name,
+ event_time: eventTime,
+ op: "gauge",
+ running: running[q],
+ queued: queued[q],
+ queue_limit: limit[q],
+ env_running: envRunning,
+ env_queued: envQueued,
+ env_limit: envLimit,
+ // Throttled = work is waiting AND the queue is at its own limit or the env cap applied.
+ throttled: queued[q] > 0 && (running[q] >= limit[q] || scale < 1) ? 1 : 0,
+ ...(ckBacklogged !== undefined
+ ? { ck_backlogged: ckBacklogged, ck_max_wait_ms: ckMaxWaitMs ?? 0 }
+ : {}),
+ };
+ rows.push(gauge);
+
+ for (let a = 0; a < arrivals; a++) {
+ rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, "enqueue"));
+ }
+
+ // Per-key rows for CK profiles: assign arrivals hog-weighted, drain round-robin
+ // (fair share), then emit per-tenant odometers + a per-key gauge per active tenant.
+ if (profile.ck) {
+ let ckq = ckSim.get(q);
+ if (!ckq) {
+ ckq = { backlog: new Array(CK_TENANT_COUNT).fill(0), counters: new Map() };
+ ckSim.set(q, ckq);
+ }
+ // Tenant 0 (the hog) receives 60% of arrivals in the first 20 buckets of every 60, else 15%.
+ const hogShare = bucket % 60 < 20 ? 0.6 : 0.15;
+ const arrivalsPerTenant = new Array(CK_TENANT_COUNT).fill(0);
+ for (let a = 0; a < arrivals; a++) {
+ const t = rng() < hogShare ? 0 : 1 + Math.floor(rng() * (CK_TENANT_COUNT - 1));
+ arrivalsPerTenant[t]++;
+ ckq.backlog[t]++;
+ }
+ const drainedPerTenant = new Array(CK_TENANT_COUNT).fill(0);
+ let remaining = started;
+ // Round-robin drain: one run per backlogged tenant per pass until the starts are used up.
+ while (remaining > 0 && ckq.backlog.some((v) => v > 0)) {
+ for (let t = 0; t < CK_TENANT_COUNT && remaining > 0; t++) {
+ if (ckq.backlog[t] > 0) {
+ ckq.backlog[t]--;
+ drainedPerTenant[t]++;
+ remaining--;
+ }
+ }
+ }
+ for (let t = 0; t < CK_TENANT_COUNT; t++) {
+ const fairShare = Math.max(1, limit[q] / CK_TENANT_COUNT);
+ // Median wait grows with the tenant's remaining backlog relative to its fair share.
+ const ckMedianWait = profile.waitBaseMs + (ckq.backlog[t] / fairShare) * bucketSec * 1000;
+ for (let a = 0; a < arrivalsPerTenant[t]; a++) {
+ rows.push(...ckCounterRows(ckq, t, ids, profile.name, eventTime, orderKey, "enqueue"));
+ }
+ for (let d = 0; d < drainedPerTenant[t]; d++) {
+ rows.push(
+ ...ckCounterRows(
+ ckq,
+ t,
+ ids,
+ profile.name,
+ eventTime,
+ orderKey,
+ "started",
+ Math.round(lognormal(ckMedianWait, WAIT_SIGMA, rng))
+ )
+ );
+ // Per-key runs always ack here; nack/dlq are only rolled for the queue-level counters below.
+ rows.push(...ckCounterRows(ckq, t, ids, profile.name, eventTime, orderKey, "ack"));
+ }
+ if (ckq.backlog[t] > 0 || drainedPerTenant[t] > 0) {
+ rows.push({
+ ...ids,
+ queue_name: profile.name,
+ concurrency_key: ckTenantName(t),
+ event_time: eventTime,
+ op: "gauge",
+ queued: ckq.backlog[t],
+ running: drainedPerTenant[t],
+ });
+ }
+ }
+ }
+
+ // Queue-level median wait: base plus time to work through the backlog that was already
+ // waiting before this bucket's arrivals, at the queue's limit per bucket.
+ const medianWait = profile.waitBaseMs + (prior / Math.max(limit[q], 1)) * bucketSec * 1000;
+ for (let s = 0; s < started; s++) {
+ rows.push(
+ ...counterRows(
+ counters,
+ q,
+ ids,
+ profile.name,
+ eventTime,
+ orderKey,
+ "started",
+ Math.round(lognormal(medianWait, WAIT_SIGMA, rng))
+ )
+ );
+ // One roll decides the outcome: dlq with DLQ_RATE, nack with NACK_RATE, otherwise ack.
+ const roll = rng();
+ const op: CounterOp = roll < DLQ_RATE ? "dlq" : roll < DLQ_RATE + NACK_RATE ? "nack" : "ack";
+ rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, op));
+ }
+ }
+ return rows;
+}
+
+// ---------------------------------------------------------------------------
+// ClickHouse
+// ---------------------------------------------------------------------------
+
+/**
+ * Builds the ClickHouse client used by the simulator.
+ *
+ * Reads CLICKHOUSE_URL (falling back to EVENTS_CLICKHOUSE_URL) and exits the process with
+ * code 1 when it is missing or points at a non-local host. The `secure` query parameter is
+ * removed from the URL before it is handed to the client.
+ */
+function clickhouse(): ClickHouse {
+  const clickhouseUrl = process.env.CLICKHOUSE_URL ?? process.env.EVENTS_CLICKHOUSE_URL;
+  if (!clickhouseUrl) {
+    console.error("CLICKHOUSE_URL not set");
+    process.exit(1);
+  }
+  const url = new URL(clickhouseUrl);
+  // Allowlist local hosts only (this script TRUNCATEs), and never echo the URL (it carries creds).
+  const localHosts = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]);
+  // URL.hostname keeps the square brackets of an IPv6 literal ("[::1]"), so strip them
+  // before the lookup; otherwise the "::1" entry can never match.
+  const hostname = url.hostname.replace(/^\[|\]$/g, "");
+  if (!localHosts.has(hostname)) {
+    console.error(`Refusing to run against a non-local ClickHouse host: ${url.hostname}`);
+    process.exit(1);
+  }
+  url.searchParams.delete("secure");
+  return new ClickHouse({ url: url.toString(), name: "queue-metrics-simulator" });
+}
+
+/**
+ * Inserts `rows` into the raw queue-metrics table in chunks of 25,000.
+ *
+ * Each chunk is sent with an insert deduplication token of the form "<nonce>:<offset>",
+ * where offset is the chunk's starting index in `rows`. The first failed insert is logged
+ * and the process exits with code 1.
+ */
+async function insertBatched(ch: ClickHouse, rows: QueueMetricsRawV1Input[], nonce: string) {
+  const BATCH = 25_000;
+  let offset = 0;
+  while (offset < rows.length) {
+    const chunk = rows.slice(offset, offset + BATCH);
+    const token = `${nonce}:${offset}`;
+    const [error] = await ch.queueMetrics.insertRaw(chunk, {
+      params: { clickhouse_settings: { insert_deduplication_token: token } },
+    });
+    if (error) {
+      console.error("insert failed:", error.message);
+      process.exit(1);
+    }
+    offset += BATCH;
+  }
+}
+
+/**
+ * Deletes every queue-metrics row belonging to `environmentId` from the raw table and the
+ * derived tables listed below, one table at a time.
+ *
+ * `environmentId` is interpolated directly into the SQL text, so it must be a trusted
+ * identifier (this script only passes ids it created or looked up itself).
+ */
+async function resetEnv(ch: ClickHouse, environmentId: string) {
+  // Reach through to the writer's underlying client to issue a raw command.
+  const raw = (
+    ch.writer as unknown as {
+      client: { command: (a: { query: string }) => Promise<unknown> };
+    }
+  ).client;
+  for (const table of [
+    "queue_metrics_raw_v1",
+    "queue_metrics_v1",
+    "queue_metrics_5m_v1",
+    "env_metrics_v1",
+    "queue_metrics_ck_v1",
+  ]) {
+    await raw.command({
+      query: `DELETE FROM trigger_dev.${table} WHERE environment_id = '${environmentId}'`,
+    });
+  }
+  console.log(`Reset queue metrics for environment ${environmentId}`);
+}
+
+// Fake running counts in the run-queue Redis (Running column + allocation usage bars).
+// Reconciled every run: staged with --usage, cleared otherwise.
+//
+// For every queue in the scenario the previously staged keys are deleted first
+// (currentDequeued, ckIndex and the per-key subqueues it lists, lengthCounter). With
+// `clear` set that is all that happens; otherwise fresh values are written. Only runs
+// against a local Redis host. Best-effort: any failure is reported as a warning and the
+// connection is always closed, so a Redis problem neither aborts nor hangs the seed.
+async function stageRedisUsage(scenario: Scenario, ids: Ids, seed: number, clear: boolean) {
+  const host = process.env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? process.env.REDIS_HOST ?? "localhost";
+  const port = Number(
+    process.env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? process.env.REDIS_PORT ?? 6379
+  );
+  const localHosts = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]);
+  if (!localHosts.has(host)) {
+    console.warn(`Skipping Redis usage staging on a non-local host: ${host}`);
+    return;
+  }
+  try {
+    const { createRedisClient } = await import("@internal/redis");
+    const redis = createRedisClient({ host, port });
+    try {
+      // Offset the seed so staging does not replay the simulation's random sequence.
+      const rng = mulberry32(seed + 1);
+      const prefix = "engine:runqueue:";
+      const logicalBase = `{org:${ids.organization_id}}:proj:${ids.project_id}:env:${ids.environment_id}:queue:`;
+      const base = `${prefix}${logicalBase}`;
+      for (const [q, profile] of scenario.queues.entries()) {
+        const key = `${base}${profile.name}:currentDequeued`;
+        await redis.del(key);
+
+        // CK staging (ckIndex + per-key subqueues) feeds the live per-key table on the queue
+        // detail page. Members are stored unprefixed, exactly like the run-queue Lua does.
+        const ckIndexKey = `${base}${profile.name}:ckIndex`;
+        const lengthCounterKey = `${base}${profile.name}:lengthCounter`;
+        const staleMembers = await redis.zrange(ckIndexKey, 0, -1);
+        for (const member of staleMembers) {
+          await redis.del(`${prefix}${member}`, `${prefix}${member}:currentConcurrency`);
+        }
+        await redis.del(ckIndexKey, lengthCounterKey);
+
+        if (clear) continue;
+        const limit = profile.limit(0);
+        // First queue rides at/over its limit, the rest at 30-90%, sparse mostly idle.
+        const count = profile.sparse
+          ? rng() < 0.3
+            ? 1
+            : 0
+          : q === 0
+            ? limit + Math.round(rng() * 2)
+            : Math.round(limit * (0.3 + 0.6 * rng()));
+        if (count > 0) {
+          await redis.sadd(key, ...Array.from({ length: count }, (_v, i) => `sim_run_${i}`));
+        }
+
+        if (profile.ck) {
+          const now = Date.now();
+          let totalCkQueued = 0;
+          // Same tenant set as the per-key simulation; the first tenant is the hog.
+          for (let t = 0; t < CK_TENANT_COUNT; t++) {
+            const tenant = ckTenantName(t);
+            const member = `${logicalBase}${profile.name}:ck:${tenant}`;
+            const hog = t === 0;
+            const queuedCount = hog ? 40 : 1 + Math.round(rng() * 5);
+            const runningCount = hog ? limit : Math.round(rng() * 2);
+            const oldestAgeMs = hog ? 15 * 60_000 : 5_000 + Math.round(rng() * 55_000);
+            // Alternating score, member pairs: queued runs spaced 250ms apart from the oldest.
+            const zargs: Array<string | number> = [];
+            for (let i = 0; i < queuedCount; i++) {
+              zargs.push(now - oldestAgeMs + i * 250, `sim_${tenant}_run_${i}`);
+            }
+            await redis.zadd(`${prefix}${member}`, ...zargs);
+            if (runningCount > 0) {
+              await redis.sadd(
+                `${prefix}${member}:currentConcurrency`,
+                ...Array.from({ length: runningCount }, (_v, i) => `sim_${tenant}_running_${i}`)
+              );
+            }
+            // The index is scored by the tenant's oldest queued timestamp.
+            await redis.zadd(ckIndexKey, now - oldestAgeMs, member);
+            totalCkQueued += queuedCount;
+          }
+          // The aggregate "Queued now" reads ZCARD(base) + this counter; keep them coherent.
+          await redis.set(lengthCounterKey, totalCkQueued, "EX", 24 * 3600);
+        }
+      }
+      await redis.quit();
+    } catch (error) {
+      // Close the socket immediately on failure: an open (or endlessly reconnecting)
+      // client would leak the connection and can keep the script from exiting.
+      redis.disconnect();
+      throw error;
+    }
+    console.log(
+      clear
+        ? "Cleared staged Redis usage."
+        : "Staged fake running counts in Redis (Running column + allocation usage bars)."
+    );
+  } catch (error) {
+    console.warn("Redis usage staging skipped:", error instanceof Error ? error.message : error);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+// Make the synthetic project a V2 engine project with a current dev worker + a Postgres
+// TaskQueue per simulated queue, so the /queues list renders the V2 table (it pages from
+// Postgres and gates on engine version; ClickHouse only holds the metrics).
+//
+// Safe to re-run: the worker and queues are upserted, and queues of this environment
+// that are not part of `scenario` are deleted.
+async function ensureTaskQueues(
+ scenario: Scenario,
+ projectId: string,
+ runtimeEnvironmentId: string
+) {
+ await prisma.project.update({ where: { id: projectId }, data: { engine: "V2" } });
+
+ // One fixed-version worker per environment; an existing one is left untouched (empty update).
+ await prisma.backgroundWorker.upsert({
+ where: {
+ projectId_runtimeEnvironmentId_version: {
+ projectId,
+ runtimeEnvironmentId,
+ version: "queue-metrics-sim",
+ },
+ },
+ update: {},
+ create: {
+ friendlyId: generateFriendlyId("worker"),
+ engine: "V2",
+ contentHash: "queue-metrics-sim",
+ sdkVersion: "4.0.0",
+ cliVersion: "4.0.0",
+ projectId,
+ runtimeEnvironmentId,
+ version: "queue-metrics-sim",
+ metadata: {},
+ },
+ });
+
+ for (const profile of scenario.queues) {
+ // The stored limit is the profile's limit at bucket 0; existing rows only get this refreshed.
+ const concurrencyLimit = profile.limit(0);
+ await prisma.taskQueue.upsert({
+ where: { runtimeEnvironmentId_name: { runtimeEnvironmentId, name: profile.name } },
+ create: {
+ friendlyId: generateFriendlyId("queue"),
+ version: "V2",
+ name: profile.name,
+ orderableName: profile.name,
+ concurrencyLimit,
+ runtimeEnvironmentId,
+ projectId,
+ type: "NAMED",
+ },
+ update: { concurrencyLimit },
+ });
+ }
+
+ // Drop queues left over from a previously seeded scenario so switching scenarios
+ // does not leave metric-less rows in the list.
+ const { count: pruned } = await prisma.taskQueue.deleteMany({
+ where: {
+ runtimeEnvironmentId,
+ name: { notIn: scenario.queues.map((q) => q.name) },
+ },
+ });
+ console.log(
+ `Ensured ${scenario.queues.length} task queues in Postgres${pruned > 0 ? `, pruned ${pruned} stale` : ""}.`
+ );
+}
+
+/**
+ * Prints the simulator's usage text to stdout: the supported flags, one line per
+ * registered scenario (name padded to 28 columns, followed by its description), and
+ * example invocations.
+ */
+function printHelp() {
+ // Each scenario is built once purely to read its description. (720, 10) matches the
+ // (totalBuckets, bucketSec) pair main() derives from the defaults: 2h window, 10s buckets.
+ const lines = Object.entries(scenarios).map(
+ ([name, build]) => ` ${name.padEnd(28)}${build(720, 10).description}`
+ );
+ console.log(`Queue metrics simulator: seeds a synthetic tenant with realistic queue metrics.
+
+Usage: pnpm --filter webapp run db:seed:queue-metrics -- [flags]
+
+Flags:
+ --scenario which scenario to seed (default: mixed)
+ --project project to seed into (default: ${PROJECT_NAME}); use one
+ project per scenario to browse them side by side
+ --window how much history to backfill, e.g. 30m, 6h, 1d (default: 2h)
+ --bucket seconds per simulated bucket (default: 10)
+ --seed RNG seed for reproducible data (default: 1)
+ --usage stage fake running counts in Redis so the Running column and
+ the Allocation tab's usage bars have data (cleared when omitted)
+ --live after backfilling, keep appending one bucket per interval
+ --reset clear this environment's metrics before seeding
+ --reset-only clear and exit without seeding
+ --help this text
+
+Scenarios:
+${lines.join("\n")}
+
+Example designer setup (one project per scenario):
+ pnpm --filter webapp run db:seed:queue-metrics -- --scenario mixed --reset
+ pnpm --filter webapp run db:seed:queue-metrics -- --scenario many-queues --project qm-many-queues --reset
+ pnpm --filter webapp run db:seed:queue-metrics -- --scenario throttled-backlog --project qm-throttled --reset
+ pnpm --filter webapp run db:seed:queue-metrics -- --scenario tenant-hotspot --project qm-tenants --usage --reset`);
+}
+
+/**
+ * CLI entry point for the queue metrics simulator.
+ *
+ * Validates the flags, resolves (creating when missing) the simulator organization and
+ * project for local@trigger.dev, optionally clears the environment's metrics, backfills
+ * the requested window of simulated buckets into ClickHouse, forces merges on the
+ * aggregated tables, and with --live keeps appending one bucket per interval until the
+ * process is interrupted.
+ *
+ * Terminates the process itself: exit code 0 after --help, --reset-only, or a completed
+ * backfill; exit code 1 on invalid flags or missing prerequisites.
+ */
+async function main() {
+  const flags = parseArgs(process.argv.slice(2));
+  if (flags.help === "true") {
+    printHelp();
+    process.exit(0);
+  }
+
+  const scenarioName = flags.scenario ?? "mixed";
+  const build = scenarios[scenarioName];
+  if (!build) {
+    console.error(
+      `Unknown scenario "${scenarioName}". Options: ${Object.keys(scenarios).join(", ")}`
+    );
+    process.exit(1);
+  }
+
+  const bucketSec = Number(flags.bucket ?? 10);
+  if (!Number.isFinite(bucketSec) || bucketSec <= 0) {
+    console.error(`--bucket must be a positive number of seconds, got: ${flags.bucket}`);
+    process.exit(1);
+  }
+
+  const windowSec = parseDuration(flags.window ?? "2h");
+  const totalBuckets = Math.floor(windowSec / bucketSec);
+  if (!Number.isFinite(totalBuckets) || totalBuckets <= 0) {
+    console.error(
+      `--window must be longer than --bucket (got ${windowSec}s window, ${bucketSec}s bucket)`
+    );
+    process.exit(1);
+  }
+
+  // Validate --seed like the other numeric flags: a non-numeric value would otherwise
+  // become NaN and leak into both the RNG seed and the insert dedup nonce.
+  const seed = Number(flags.seed ?? 1);
+  if (!Number.isFinite(seed)) {
+    console.error(`--seed must be a number, got: ${flags.seed}`);
+    process.exit(1);
+  }
+  const live = flags.live === "true";
+
+  const user = await prisma.user.findUnique({ where: { email: "local@trigger.dev" } });
+  if (!user) {
+    console.error("User local@trigger.dev not found. Run `pnpm run db:seed` first.");
+    process.exit(1);
+  }
+
+  // Reuse the simulator organization when this user already belongs to it.
+  let org = await prisma.organization.findFirst({
+    where: { title: ORG_TITLE, members: { some: { userId: user.id } } },
+  });
+  if (!org) {
+    org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" });
+  }
+
+  const projectName = flags.project ?? PROJECT_NAME;
+  let project = await prisma.project.findFirst({
+    where: { name: projectName, organizationId: org.id },
+  });
+  if (!project) {
+    project = await createProject({
+      organizationSlug: org.slug,
+      name: projectName,
+      userId: user.id,
+      version: "v3",
+    });
+  }
+
+  const runtimeEnv = await prisma.runtimeEnvironment.findFirst({
+    where: { projectId: project.id, type: "DEVELOPMENT" },
+  });
+  if (!runtimeEnv) {
+    console.error("No DEVELOPMENT environment found for project.");
+    process.exit(1);
+  }
+
+  const ids: Ids = {
+    organization_id: org.id,
+    project_id: project.id,
+    environment_id: runtimeEnv.id,
+  };
+  const ch = clickhouse();
+  const nonce = `qmsim-${Date.now()}-${seed}`;
+
+  if (flags.reset === "true" || flags["reset-only"] === "true") {
+    await resetEnv(ch, runtimeEnv.id);
+    if (flags["reset-only"] === "true") {
+      await ch.close();
+      process.exit(0);
+    }
+  }
+
+  const scenario = build(totalBuckets, bucketSec);
+  await ensureTaskQueues(scenario, project.id, runtimeEnv.id);
+  // Last argument is the "clear" switch: staged usage is removed unless --usage was passed.
+  await stageRedisUsage(scenario, ids, seed, flags.usage !== "true");
+  const rng = mulberry32(seed);
+  const backlog = new Array(scenario.queues.length).fill(0);
+
+  console.log(`Scenario "${scenarioName}": ${scenario.description}`);
+  console.log(
+    `Backfilling ${totalBuckets} x ${bucketSec}s buckets (${flags.window ?? "2h"}) for ${scenario.queues.length} queues...`
+  );
+
+  // Backfill: buckets from (now - window) up to now, aligned to the bucket grid.
+  const nowBucket = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec;
+  const startBucket = nowBucket - totalBuckets * bucketSec;
+  const counters = newCounterState(scenario.queues.length);
+  const rows: QueueMetricsRawV1Input[] = [];
+  for (let b = 0; b < totalBuckets; b++) {
+    const bucketEpochSec = startBucket + b * bucketSec;
+    const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000));
+    rows.push(
+      ...simulateBucket(
+        scenario,
+        b,
+        bucketSec,
+        eventTime,
+        bucketEpochSec,
+        ids,
+        backlog,
+        counters,
+        rng
+      )
+    );
+  }
+  await insertBatched(ch, rows, nonce);
+  console.log(`Inserted ${rows.length} raw rows.`);
+
+  // Merge the AggregatingMergeTree partials so argMax "current value" widgets read cleanly.
+  // The real pipeline relies on background merges; the simulator forces it for a tidy demo.
+  // The cast reaches the underlying client's command(); presumably the writer's public
+  // type does not expose it.
+  const raw = (
+    ch.writer as unknown as {
+      client: { command: (a: { query: string }) => Promise<unknown> };
+    }
+  ).client;
+  const aggregatedTables = [
+    "queue_metrics_v1",
+    "queue_metrics_5m_v1",
+    "env_metrics_v1",
+    "queue_metrics_ck_v1",
+  ];
+  for (const table of aggregatedTables) {
+    await raw.command({ query: `OPTIMIZE TABLE trigger_dev.${table} FINAL` });
+  }
+
+  const origin = process.env.APP_ORIGIN ?? "http://localhost:3030";
+  console.log(
+    `\nQueues dashboard: ${origin}/orgs/${org.slug}/projects/${project.slug}/env/dev/dashboards/queues`
+  );
+
+  if (live) {
+    console.log(`\nLive mode: appending one bucket every ${bucketSec}s (Ctrl-C to stop)...`);
+    // Continue the bucket index where the backfill stopped so the scenario keeps advancing.
+    let b = totalBuckets;
+    // eslint-disable-next-line no-constant-condition
+    while (true) {
+      await new Promise((r) => setTimeout(r, bucketSec * 1000));
+      const bucketEpochSec = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec;
+      const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000));
+      const liveRows = simulateBucket(
+        scenario,
+        b,
+        bucketSec,
+        eventTime,
+        bucketEpochSec,
+        ids,
+        backlog,
+        counters,
+        rng
+      );
+      await insertBatched(ch, liveRows, `${nonce}:live:${b}`);
+      console.log(`bucket ${b}: ${liveRows.length} rows @ ${eventTime}`);
+      b++;
+    }
+  }
+
+  await ch.close();
+  process.exit(0);
+}
+
+// Run the simulator; any rejection from main() is printed and turned into exit code 1.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/apps/webapp/test/queueMetricsMapping.test.ts b/apps/webapp/test/queueMetricsMapping.test.ts
new file mode 100644
index 00000000000..61e3893c7fb
--- /dev/null
+++ b/apps/webapp/test/queueMetricsMapping.test.ts
@@ -0,0 +1,239 @@
+import { describe, expect, it } from "vitest";
+import {
+ descriptorFromQueue,
+ mapEntryToRows,
+ OVERFLOW_QUEUE_NAME,
+ QueueNameLimiter,
+} from "~/v3/queueMetricsMapping";
+
+// descriptorFromQueue: splits a queue key into its id fields, or yields null for bad input.
+describe("descriptorFromQueue", () => {
+  it("parses a plain descriptor", () => {
+    const parsed = descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/my-task");
+    expect(parsed).toEqual({
+      organization_id: "o1",
+      project_id: "p1",
+      environment_id: "e1",
+      queue_name: "task/my-task",
+      concurrency_key: "",
+    });
+  });
+
+  it("captures a concurrency-key suffix", () => {
+    const parsed = descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/t:ck:tenant-3");
+    expect(parsed).toEqual(
+      expect.objectContaining({ queue_name: "task/t", concurrency_key: "tenant-3" })
+    );
+  });
+
+  it("maps the ck wildcard to no key", () => {
+    const parsed = descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/t:ck:*");
+    expect(parsed).toEqual(
+      expect.objectContaining({ queue_name: "task/t", concurrency_key: "" })
+    );
+  });
+
+  it("keeps colons inside the queue name", () => {
+    const parsed = descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:my:odd:queue");
+    expect(parsed).toEqual(
+      expect.objectContaining({ queue_name: "my:odd:queue", concurrency_key: "" })
+    );
+  });
+
+  it("keeps colons in the name while capturing a real ck suffix", () => {
+    const parsed = descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:a:b:ck:t9");
+    expect(parsed).toEqual(
+      expect.objectContaining({ queue_name: "a:b", concurrency_key: "t9" })
+    );
+  });
+
+  it("rejects malformed descriptors", () => {
+    // Checked in the same order as before; the first failing input aborts the test.
+    const malformedInputs = ["not-a-descriptor", "{org:o1}:proj:p1:env:e1", ""];
+    for (const malformed of malformedInputs) {
+      expect(descriptorFromQueue(malformed)).toBeNull();
+    }
+  });
+});
+
+// QueueNameLimiter: per-scope cap on distinct names, with a bounded number of scopes.
+describe("QueueNameLimiter", () => {
+  it("passes names through under the cap and overflows past it, per scope", () => {
+    const limiter = new QueueNameLimiter(2);
+    // [scope, name, expected result], applied in order against the same limiter.
+    const steps: [string, string, string][] = [
+      ["env1", "a", "a"],
+      ["env1", "b", "b"],
+      ["env1", "c", OVERFLOW_QUEUE_NAME],
+      ["env1", "a", "a"],
+      ["env2", "c", "c"],
+    ];
+    for (const [scope, name, expected] of steps) {
+      expect(limiter.limit(scope, name)).toBe(expected);
+    }
+  });
+
+  it("is unlimited when the cap is 0", () => {
+    const limiter = new QueueNameLimiter(0);
+    let index = 0;
+    while (index < 100) {
+      const name = `q${index}`;
+      expect(limiter.limit("env1", name)).toBe(name);
+      index += 1;
+    }
+  });
+
+  it("evicts the oldest scope when the scope map is full", () => {
+    const limiter = new QueueNameLimiter(1, 2);
+    // env1 is pushed out once env3 arrives, so it accepts a different name afterwards.
+    const steps: [string, string, string][] = [
+      ["env1", "a", "a"],
+      ["env2", "a", "a"],
+      ["env3", "a", "a"],
+      ["env1", "b", "b"],
+    ];
+    for (const [scope, name, expected] of steps) {
+      expect(limiter.limit(scope, name)).toBe(expected);
+    }
+  });
+});
+
+// mapEntryToRows: converts one stream entry ({ id, fields }) into zero or more raw metric
+// rows. Field values arrive as strings; the assertions below pin their numeric mapping.
+describe("mapEntryToRows", () => {
+ const q = "{org:o1}:proj:p1:env:e1:queue:task/t";
+
+ it("maps a gauge entry with numeric fields", () => {
+ const rows = mapEntryToRows({
+ id: "1700000000000-0",
+ fields: {
+ op: "gauge",
+ q,
+ ql: "5",
+ cc: "2",
+ lim: "10",
+ eql: "7",
+ ec: "3",
+ elim: "20",
+ thr: "1",
+ },
+ });
+ expect(rows).toHaveLength(1);
+ expect(rows[0]).toEqual(
+ expect.objectContaining({
+ op: "gauge",
+ organization_id: "o1",
+ queue_name: "task/t",
+ concurrency_key: "",
+ queued: 5,
+ running: 2,
+ queue_limit: 10,
+ env_queued: 7,
+ env_running: 3,
+ env_limit: 20,
+ throttled: 1,
+ })
+ );
+ // The millisecond part of the entry id (1700000000000) is 2023-11-14 22:13:20 UTC.
+ expect(rows[0]!.event_time).toBe("2023-11-14 22:13:20");
+ // No ckq/ckw fields were supplied, so the CK-health columns stay unset.
+ expect(rows[0]!.ck_backlogged).toBeUndefined();
+ expect(rows[0]!.ck_max_wait_ms).toBeUndefined();
+ });
+
+ it("keeps the key on per-subqueue gauges and maps the CK-health tail", () => {
+ const rows = mapEntryToRows({
+ id: "1700000000000-0",
+ fields: { op: "gauge", q: `${q}:ck:tenant-1`, ql: "4", ckq: "3", ckw: "2500" },
+ });
+ expect(rows).toHaveLength(1);
+ expect(rows[0]).toEqual(
+ expect.objectContaining({
+ op: "gauge",
+ queue_name: "task/t",
+ concurrency_key: "tenant-1",
+ queued: 4,
+ ck_backlogged: 3,
+ ck_max_wait_ms: 2500,
+ })
+ );
+ });
+
+ it("maps started with wait_ms + cumulative and drops unknown ops", () => {
+ const started = mapEntryToRows({
+ id: "1700000000000-0",
+ fields: { op: "started", q, wait: "48", cum: "512" },
+ });
+ expect(started).toHaveLength(1);
+ expect(started[0]).toEqual(
+ expect.objectContaining({
+ op: "started",
+ wait_ms: 48,
+ cumulative: 512,
+ // For id "1700000000000-0" the expected key is the millisecond part times 1e6.
+ order_key: (1700000000000n * 1000000n).toString(),
+ })
+ );
+ expect(mapEntryToRows({ id: "1-0", fields: { op: "ack", q, cum: "9" } })[0]).toEqual(
+ expect.objectContaining({ op: "ack", cumulative: 9 })
+ );
+ // An unrecognised op, or an entry without a queue descriptor, produces no rows.
+ expect(mapEntryToRows({ id: "1-0", fields: { op: "bogus", q } })).toEqual([]);
+ expect(mapEntryToRows({ id: "1-0", fields: { op: "ack" } })).toEqual([]);
+ });
+
+ it("expands a dual-odometer counter entry into base + per-key rows", () => {
+ const rows = mapEntryToRows({
+ id: "1700000000000-3",
+ fields: { op: "started", q, ck: "tenant-9", wait: "80", cum: "41", ckcum: "7" },
+ });
+ expect(rows).toHaveLength(2);
+ // Row 0 is the whole-queue row fed by `cum`; row 1 is the per-key row fed by `ckcum`.
+ expect(rows[0]).toEqual(
+ expect.objectContaining({ queue_name: "task/t", cumulative: 41, wait_ms: 80 })
+ );
+ expect(rows[0]!.concurrency_key).toBeUndefined();
+ expect(rows[1]).toEqual(
+ expect.objectContaining({
+ queue_name: "task/t",
+ concurrency_key: "tenant-9",
+ cumulative: 7,
+ wait_ms: 80,
+ })
+ );
+ // Both rows come from the same stream entry, so they share one order_key.
+ expect(rows[0]!.order_key).toBe(rows[1]!.order_key);
+
+ // Baseline entries carry exactly one odometer each.
+ const baseBaseline = mapEntryToRows({ id: "1-0", fields: { op: "started", q, cum: "0" } });
+ expect(baseBaseline).toHaveLength(1);
+ expect(baseBaseline[0]!.concurrency_key).toBeUndefined();
+ const ckBaseline = mapEntryToRows({
+ id: "1-1",
+ fields: { op: "started", q, ck: "tenant-9", ckcum: "0" },
+ });
+ expect(ckBaseline).toHaveLength(1);
+ expect(ckBaseline[0]).toEqual(
+ expect.objectContaining({ concurrency_key: "tenant-9", cumulative: 0 })
+ );
+ });
+
+ it("applies the queue-name limiter: gauges overflow, counters drop", () => {
+ // Cap of one queue name: "task/t" takes the slot, so "task/other" below overflows.
+ const limiters = { queueNames: new QueueNameLimiter(1) };
+ const first = mapEntryToRows({ id: "1-0", fields: { op: "ack", q, cum: "1" } }, limiters);
+ expect(first[0]!.queue_name).toBe("task/t");
+
+ // Overflowed gauges keep flowing under the shared name (max stays meaningful),
+ // with per-key attribution stripped.
+ const overflowGauge = mapEntryToRows(
+ {
+ id: "1-1",
+ fields: { op: "gauge", q: "{org:o1}:proj:p1:env:e1:queue:task/other:ck:t1", ql: "3" },
+ },
+ limiters
+ );
+ expect(overflowGauge[0]!.queue_name).toBe(OVERFLOW_QUEUE_NAME);
+ expect(overflowGauge[0]!.concurrency_key).toBe("");
+
+ // Overflowed counters are dropped: merging distinct odometers under one key
+ // produces garbage deltas.
+ const overflowCounter = mapEntryToRows(
+ { id: "1-2", fields: { op: "ack", q: "{org:o1}:proj:p1:env:e1:queue:task/other", cum: "4" } },
+ limiters
+ );
+ expect(overflowCounter).toEqual([]);
+ });
+
+ it("applies the concurrency-key limiter: overflow drops the per-key row, keeps base", () => {
+ // Cap of one concurrency key: "t1" takes the slot, so "t2" and "t3" below overflow.
+ const limiters = { concurrencyKeys: new QueueNameLimiter(1) };
+ const first = mapEntryToRows(
+ { id: "1-0", fields: { op: "ack", q, ck: "t1", cum: "5", ckcum: "2" } },
+ limiters
+ );
+ expect(first).toHaveLength(2);
+
+ const overflowed = mapEntryToRows(
+ { id: "1-1", fields: { op: "ack", q, ck: "t2", cum: "6", ckcum: "1" } },
+ limiters
+ );
+ expect(overflowed).toHaveLength(1);
+ expect(overflowed[0]!.cumulative).toBe(6);
+ expect(overflowed[0]!.concurrency_key).toBeUndefined();
+
+ // Gauge for an overflowed key keeps the row but loses the attribution.
+ const overflowGauge = mapEntryToRows(
+ { id: "1-2", fields: { op: "gauge", q: `${q}:ck:t3`, ql: "2" } },
+ limiters
+ );
+ expect(overflowGauge).toHaveLength(1);
+ expect(overflowGauge[0]!.concurrency_key).toBe("");
+ });
+});
diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql
new file mode 100644
index 00000000000..8ea1e65f09f
--- /dev/null
+++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql
@@ -0,0 +1,267 @@
+-- +goose Up
+
+-- Queue metrics: raw landing table -> MV -> aggregated read target (mirrors
+-- llm_model_aggregates_v1, migration 027). Raw rows feed an MV on insert, and
+-- reads hit the aggregated table.
+
+-- (1) Short-TTL raw landing table, one row per stream entry. non_replicated_deduplication_window
+-- makes consumer replays idempotent via insert_deduplication_token.
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1
+(
+ organization_id LowCardinality(String),
+ project_id LowCardinality(String),
+ environment_id String CODEC(ZSTD(1)),
+ queue_name String CODEC(ZSTD(1)),
+ concurrency_key String DEFAULT '' CODEC(ZSTD(1)), -- per-key attribution ('' = base/whole-queue row)
+ event_time DateTime CODEC(Delta(4), ZSTD(1)),
+ order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e6+seq), deltaSumTimestamp ordering key
+ op LowCardinality(String), -- gauge | enqueue | started | ack | nack | dlq
+ running UInt32 DEFAULT 0,
+ queued UInt32 DEFAULT 0,
+ queue_limit UInt32 DEFAULT 0,
+ env_running UInt32 DEFAULT 0,
+ env_queued UInt32 DEFAULT 0,
+ env_limit UInt32 DEFAULT 0,
+ throttled UInt8 DEFAULT 0, -- 1 on a gauge emission with running>=limit AND queued>0
+ ck_backlogged UInt32 DEFAULT 0, -- gauge on CK queues: distinct concurrency keys with queued work
+ ck_max_wait_ms UInt32 DEFAULT 0, -- gauge on CK queues: most-starved key's head-of-line wait
+ wait_ms UInt32 DEFAULT 0, -- set on op='started' (scheduling delay)
+ cumulative UInt64 DEFAULT 0 -- monotonic per-(queue,op) odometer on a counter op, diffed at read time
+)
+ENGINE = MergeTree()
+PARTITION BY toDate(event_time)
+ORDER BY (organization_id, project_id, environment_id, queue_name, event_time)
+TTL event_time + INTERVAL 6 HOUR
+SETTINGS non_replicated_deduplication_window = 1000, ttl_only_drop_parts = 1;
+
+-- (2) Aggregated read target (TRQL/dashboards query this).
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_v1
+(
+ organization_id LowCardinality(String),
+ project_id LowCardinality(String),
+ environment_id String CODEC(ZSTD(1)),
+ queue_name String CODEC(ZSTD(1)),
+ bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
+
+ -- Cumulative-counter deltas: each op maintains a monotonic odometer, and deltaSumTimestamp
+ -- sums positive consecutive deltas (ignoring resets) ordered by order_key (the stream-id
+ -- composite, not event_time), so a lost reading self-heals (the next surviving reading
+ -- restates the total). Read with deltaSumTimestampMerge(column), never sum().
+ enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ throttled_count SimpleAggregateFunction(sum, UInt64),
+
+ max_queued SimpleAggregateFunction(max, UInt32),
+ max_running SimpleAggregateFunction(max, UInt32),
+ max_limit SimpleAggregateFunction(max, UInt32),
+ max_env_queued SimpleAggregateFunction(max, UInt32),
+ max_env_running SimpleAggregateFunction(max, UInt32),
+ max_env_limit SimpleAggregateFunction(max, UInt32),
+ max_ck_backlogged SimpleAggregateFunction(max, UInt32),
+ max_ck_wait_ms SimpleAggregateFunction(max, UInt32),
+
+ wait_ms_sum SimpleAggregateFunction(sum, UInt64),
+ wait_ms_count SimpleAggregateFunction(sum, UInt64),
+ wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toDate(bucket_start)
+ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start)
+TTL bucket_start + INTERVAL 30 DAY
+SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000;
+
+-- (3) MV: raw -> aggregated, 10s buckets.
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_mv_v1
+TO trigger_dev.queue_metrics_v1 AS
+SELECT
+ organization_id, project_id, environment_id, queue_name,
+ toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue' AND concurrency_key = '') AS enqueue_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'started' AND concurrency_key = '') AS started_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'ack' AND concurrency_key = '') AS ack_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'nack' AND concurrency_key = '') AS nack_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq' AND concurrency_key = '') AS dlq_delta,
+ sum(throttled) AS throttled_count,
+ max(queued) AS max_queued,
+ max(running) AS max_running,
+ max(queue_limit) AS max_limit,
+ max(env_queued) AS max_env_queued,
+ max(env_running) AS max_env_running,
+ max(env_limit) AS max_env_limit,
+ max(ck_backlogged) AS max_ck_backlogged,
+ max(ck_max_wait_ms) AS max_ck_wait_ms,
+ sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum,
+ countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count,
+ quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles
+FROM trigger_dev.queue_metrics_raw_v1
+GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start;
+
+-- (4) Env-level 10s rollup (no queue dimension) for header tiles/saturation charts.
+-- Row count is queue-independent (~8640/day/env), so full granularity stays cheap at any range.
+-- No counter deltas on purpose: cross-queue deltaSumTimestamp state merges mix unrelated
+-- odometers (env totals must GROUP BY queue then sum). quantilesTDigest rather than reservoir
+-- quantiles() because one env-level state absorbs every wait sample in the environment.
+CREATE TABLE IF NOT EXISTS trigger_dev.env_metrics_v1
+(
+ organization_id LowCardinality(String),
+ project_id LowCardinality(String),
+ environment_id String CODEC(ZSTD(1)),
+ bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
+
+ max_env_queued SimpleAggregateFunction(max, UInt32),
+ max_env_running SimpleAggregateFunction(max, UInt32),
+ max_env_limit SimpleAggregateFunction(max, UInt32),
+ throttled_count SimpleAggregateFunction(sum, UInt64),
+
+ wait_ms_sum SimpleAggregateFunction(sum, UInt64),
+ wait_ms_count SimpleAggregateFunction(sum, UInt64),
+ wait_quantiles AggregateFunction(quantilesTDigest(0.5, 0.9, 0.95, 0.99), UInt32)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toDate(bucket_start)
+ORDER BY (organization_id, project_id, environment_id, bucket_start)
+TTL bucket_start + INTERVAL 30 DAY
+SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000;
+
+-- (5) MV: raw -> env rollup.
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.env_metrics_mv_v1
+TO trigger_dev.env_metrics_v1 AS
+SELECT
+ organization_id, project_id, environment_id,
+ toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start,
+ max(env_queued) AS max_env_queued,
+ max(env_running) AS max_env_running,
+ max(env_limit) AS max_env_limit,
+ sum(throttled) AS throttled_count,
+ sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum,
+ countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count,
+ quantilesTDigestStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles
+FROM trigger_dev.queue_metrics_raw_v1
+GROUP BY organization_id, project_id, environment_id, bucket_start;
+
+-- (6) Per-queue 5m rollup, exact column mirror of queue_metrics_v1, for ranking and
+-- env-wide GROUP BY queue reads at long ranges.
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_5m_v1
+(
+ organization_id LowCardinality(String),
+ project_id LowCardinality(String),
+ environment_id String CODEC(ZSTD(1)),
+ queue_name String CODEC(ZSTD(1)),
+ bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
+
+ enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ throttled_count SimpleAggregateFunction(sum, UInt64),
+
+ max_queued SimpleAggregateFunction(max, UInt32),
+ max_running SimpleAggregateFunction(max, UInt32),
+ max_limit SimpleAggregateFunction(max, UInt32),
+ max_env_queued SimpleAggregateFunction(max, UInt32),
+ max_env_running SimpleAggregateFunction(max, UInt32),
+ max_env_limit SimpleAggregateFunction(max, UInt32),
+ max_ck_backlogged SimpleAggregateFunction(max, UInt32),
+ max_ck_wait_ms SimpleAggregateFunction(max, UInt32),
+
+ wait_ms_sum SimpleAggregateFunction(sum, UInt64),
+ wait_ms_count SimpleAggregateFunction(sum, UInt64),
+ wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toDate(bucket_start)
+ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start)
+TTL bucket_start + INTERVAL 30 DAY
+SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000;
+
+-- (7) MV: raw -> 5m rollup. MUST read raw, never cascade off queue_metrics_v1 with
+-- -MergeState: MV GROUP BY merges states in hash order, and out-of-time-order
+-- deltaSumTimestamp merges double-count bridging spans (verified 3x inflation).
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_5m_mv_v1
+TO trigger_dev.queue_metrics_5m_v1 AS
+SELECT
+ organization_id, project_id, environment_id, queue_name,
+ toStartOfInterval(event_time, INTERVAL 5 MINUTE) AS bucket_start,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue' AND concurrency_key = '') AS enqueue_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'started' AND concurrency_key = '') AS started_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'ack' AND concurrency_key = '') AS ack_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'nack' AND concurrency_key = '') AS nack_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq' AND concurrency_key = '') AS dlq_delta,
+ sum(throttled) AS throttled_count,
+ max(queued) AS max_queued,
+ max(running) AS max_running,
+ max(queue_limit) AS max_limit,
+ max(env_queued) AS max_env_queued,
+ max(env_running) AS max_env_running,
+ max(env_limit) AS max_env_limit,
+ max(ck_backlogged) AS max_ck_backlogged,
+ max(ck_max_wait_ms) AS max_ck_wait_ms,
+ sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum,
+ countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count,
+ quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles
+FROM trigger_dev.queue_metrics_raw_v1
+GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start;
+
+
+-- (8) Per-concurrency-key 10s tier. Rows are activity-bound (a (queue, key, bucket) row
+-- exists only when that key had an event in that bucket), so user-controlled key
+-- cardinality cannot inflate it beyond event volume (~19 bytes/event measured).
+-- Lean columns: no nack/dlq deltas and no per-key quantile states (mean wait via sums).
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_ck_v1
+(
+ organization_id LowCardinality(String),
+ project_id LowCardinality(String),
+ environment_id String CODEC(ZSTD(1)),
+ queue_name String CODEC(ZSTD(1)),
+ concurrency_key String CODEC(ZSTD(1)),
+ bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
+
+ enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+ ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64),
+
+ max_queued SimpleAggregateFunction(max, UInt32),
+ max_running SimpleAggregateFunction(max, UInt32),
+
+ wait_ms_sum SimpleAggregateFunction(sum, UInt64),
+ wait_ms_count SimpleAggregateFunction(sum, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toDate(bucket_start)
+ORDER BY (organization_id, project_id, environment_id, queue_name, concurrency_key, bucket_start)
+TTL bucket_start + INTERVAL 30 DAY
+SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000;
+
+-- (9) MV: raw -> per-key tier. Only rows with a real key: per-key counter rows carry
+-- per-key odometers (safe to merge within their own (queue, key) group), and per-key
+-- gauge rows carry per-subqueue depth/running.
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_ck_mv_v1
+TO trigger_dev.queue_metrics_ck_v1 AS
+SELECT
+ organization_id, project_id, environment_id, queue_name, concurrency_key,
+ toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue') AS enqueue_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'started') AS started_delta,
+ deltaSumTimestampStateIf(cumulative, order_key, op = 'ack') AS ack_delta,
+ maxIf(queued, op = 'gauge') AS max_queued,
+ maxIf(running, op = 'gauge') AS max_running,
+ sumIf(wait_ms, op = 'started') AS wait_ms_sum,
+ countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count
+FROM trigger_dev.queue_metrics_raw_v1
+WHERE concurrency_key != ''
+GROUP BY organization_id, project_id, environment_id, queue_name, concurrency_key, bucket_start;
+
+-- +goose Down
+DROP VIEW IF EXISTS trigger_dev.queue_metrics_ck_mv_v1;
+DROP TABLE IF EXISTS trigger_dev.queue_metrics_ck_v1;
+DROP VIEW IF EXISTS trigger_dev.queue_metrics_5m_mv_v1;
+DROP TABLE IF EXISTS trigger_dev.queue_metrics_5m_v1;
+DROP VIEW IF EXISTS trigger_dev.env_metrics_mv_v1;
+DROP TABLE IF EXISTS trigger_dev.env_metrics_v1;
+DROP VIEW IF EXISTS trigger_dev.queue_metrics_mv_v1;
+DROP TABLE IF EXISTS trigger_dev.queue_metrics_v1;
+DROP TABLE IF EXISTS trigger_dev.queue_metrics_raw_v1;
diff --git a/internal-packages/clickhouse/src/client/tsql.ts b/internal-packages/clickhouse/src/client/tsql.ts
index c712820812f..ddf1d059b97 100644
--- a/internal-packages/clickhouse/src/client/tsql.ts
+++ b/internal-packages/clickhouse/src/client/tsql.ts
@@ -108,6 +108,11 @@ export interface ExecuteTSQLOptions {
* based on the span of the time range.
*/
timeRange?: TimeRange;
+ /**
+ * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query
+ * (counters zero-fill, gauges carry forward). Off by default.
+ */
+ fillGaps?: boolean;
}
/**
@@ -192,6 +197,7 @@ export async function executeTSQL(
fieldMappings: options.fieldMappings,
whereClauseFallback: options.whereClauseFallback,
timeRange: options.timeRange,
+ fillGaps: options.fillGaps,
});
generatedSql = sql;
diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts
index 0b252a98f67..97c2209b1cb 100644
--- a/internal-packages/clickhouse/src/index.ts
+++ b/internal-packages/clickhouse/src/index.ts
@@ -32,6 +32,14 @@ import {
} from "./taskEvents.js";
import { insertMetrics } from "./metrics.js";
import { insertLlmMetrics } from "./llmMetrics.js";
+import {
+ insertQueueMetricsRaw,
+ getQueueListMetricsSummary,
+ getQueueDepthSparklines,
+ getQueueRanking,
+ getQueueRankingNames,
+ getQueueRankingCount,
+} from "./queueMetrics.js";
import {
getSessionTagsQueryBuilder,
getSessionsCountQueryBuilder,
@@ -65,6 +73,7 @@ export type * from "./taskRuns.js";
export type * from "./taskEvents.js";
export type * from "./metrics.js";
export type * from "./llmMetrics.js";
+export type * from "./queueMetrics.js";
export type * from "./llmModelAggregates.js";
export type * from "./errors.js";
export type * from "./sessions.js";
@@ -260,6 +269,17 @@ export class ClickHouse {
};
}
+ /**
+  * Queue metrics operations: `insertRaw` is built from the writer client, the list
+  * summary, depth sparkline, and ranking queries from the reader client.
+  */
+ get queueMetrics() {
+   const operations = {
+     insertRaw: insertQueueMetricsRaw(this.writer),
+     listSummary: getQueueListMetricsSummary(this.reader),
+     depthSparklines: getQueueDepthSparklines(this.reader),
+     ranking: getQueueRanking(this.reader),
+     rankingNames: getQueueRankingNames(this.reader),
+     rankingCount: getQueueRankingCount(this.reader),
+   };
+   return operations;
+ }
+
get llmModelAggregates() {
return {
globalMetrics: getGlobalModelMetrics(this.reader),
diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts
new file mode 100644
index 00000000000..00532041e44
--- /dev/null
+++ b/internal-packages/clickhouse/src/queueMetrics.test.ts
@@ -0,0 +1,525 @@
+import { clickhouseTest } from "@internal/testcontainers";
+import { z } from "zod";
+import { ClickHouse } from "./index.js";
+import type { QueueMetricsRawV1Input } from "./queueMetrics.js";
+
+const ORG = "org_qm";
+const PROJECT = "project_qm";
+const ENV = "env_qm";
+const EVENT_TIME = "2026-06-30 12:00:05"; // all rows land in the 10s bucket starting 12:00:00
+
+/**
+ * Skeleton raw row for `op` on `queue`, stamped with the shared ORG / PROJECT / ENV ids and
+ * the fixed EVENT_TIME. Callers spread the result and layer op-specific fields on top
+ * (gauge readings, `cumulative`, `order_key`, `wait_ms`, ...).
+ */
+function base(op: QueueMetricsRawV1Input["op"], queue: string): QueueMetricsRawV1Input {
+  const tenant = { organization_id: ORG, project_id: PROJECT, environment_id: ENV };
+  const when = { queue_name: queue, event_time: EVENT_TIME };
+  return { ...tenant, ...when, op };
+}
+
+// Cumulative counters: each op keeps a monotonic per-(queue,op) odometer, so a counter row
+// carries the running total in `cumulative`. deltaSumTimestamp reconstructs the increase
+// (last - first) from a seeded cum=0 baseline; order_key orders readings within an op.
+let orderKey = 0;
+function counter(
+  op: QueueMetricsRawV1Input["op"],
+  queue: string,
+  total: number,
+  waits?: number[]
+): QueueMetricsRawV1Input[] {
+  // Reading 0 is the seeded baseline (never carries a wait); readings 1..total follow, each
+  // taking the next global order_key and, when `waits` is given, its wait_ms sample.
+  const readings = total >= 1 ? Math.floor(total) : 0;
+  return Array.from({ length: readings + 1 }, (_, cum) => {
+    const row: QueueMetricsRawV1Input = {
+      ...base(op, queue),
+      cumulative: cum,
+      order_key: orderKey++,
+    };
+    if (waits && cum > 0) row.wait_ms = waits[cum - 1];
+    return row;
+  });
+}
+
+const aggregatedRow = z.object({
+ enqueue_count: z.coerce.number(),
+ started_count: z.coerce.number(),
+ ack_count: z.coerce.number(),
+ nack_count: z.coerce.number(),
+ dlq_count: z.coerce.number(),
+ throttled_count: z.coerce.number(),
+ max_running: z.coerce.number(),
+ max_queued: z.coerce.number(),
+ max_limit: z.coerce.number(),
+ max_env_running: z.coerce.number(),
+ max_env_queued: z.coerce.number(),
+ max_env_limit: z.coerce.number(),
+ max_ck_backlogged: z.coerce.number(),
+ max_ck_wait_ms: z.coerce.number(),
+ wait_ms_sum: z.coerce.number(),
+ wait_ms_count: z.coerce.number(),
+ wait_p50: z.coerce.number(),
+ wait_p90: z.coerce.number(),
+ wait_p95: z.coerce.number(),
+ wait_p99: z.coerce.number(),
+});
+
+function readAggregated(ch: ClickHouse) { // query fn: merged aggregates for one queue_name
+ return ch.reader.query({
+ name: "read-queue-metrics-aggregated",
+ query: `SELECT
+ deltaSumTimestampMerge(enqueue_delta) AS enqueue_count,
+ deltaSumTimestampMerge(started_delta) AS started_count,
+ deltaSumTimestampMerge(ack_delta) AS ack_count,
+ deltaSumTimestampMerge(nack_delta) AS nack_count,
+ deltaSumTimestampMerge(dlq_delta) AS dlq_count,
+ sum(throttled_count) AS throttled_count,
+ max(max_running) AS max_running,
+ max(max_queued) AS max_queued,
+ max(max_limit) AS max_limit,
+ max(max_env_running) AS max_env_running,
+ max(max_env_queued) AS max_env_queued,
+ max(max_env_limit) AS max_env_limit,
+ max(max_ck_backlogged) AS max_ck_backlogged,
+ max(max_ck_wait_ms) AS max_ck_wait_ms,
+ sum(wait_ms_sum) AS wait_ms_sum,
+ sum(wait_ms_count) AS wait_ms_count,
+ quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles) AS wait_arr,
+ wait_arr[1] AS wait_p50,
+ wait_arr[2] AS wait_p90,
+ wait_arr[3] AS wait_p95,
+ wait_arr[4] AS wait_p99
+ FROM trigger_dev.queue_metrics_v1
+ WHERE queue_name = {queueName: String}
+ GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start`,
+ schema: aggregatedRow, // wait_arr is selected too but is not a key of this schema
+ params: z.object({ queueName: z.string() }), // the queue name is the only filter
+ });
+}
+
+// Synchronous insert so the MV-populated rows are queryable immediately.
+const SYNC = { params: { clickhouse_settings: { async_insert: 0 as const } } };
+
+describe("queue_metrics_v1", () => {
+ clickhouseTest(
+ "buckets counters, gauges and wait percentiles via the MV",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+ const queue = "queue-a";
+
+ const rows: QueueMetricsRawV1Input[] = [
+ ...counter("enqueue", queue, 3),
+ ...counter("started", queue, 10, [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]),
+ ...counter("ack", queue, 2),
+ ...counter("nack", queue, 1),
+ ...counter("dlq", queue, 1),
+ {
+ ...base("gauge", queue),
+ running: 8,
+ queued: 4,
+ queue_limit: 10,
+ env_running: 40,
+ env_queued: 10,
+ env_limit: 50,
+ throttled: 0,
+ ck_backlogged: 3,
+ ck_max_wait_ms: 2500,
+ },
+ {
+ ...base("gauge", queue),
+ running: 10,
+ queued: 6,
+ queue_limit: 10,
+ env_running: 50,
+ env_queued: 20,
+ env_limit: 50,
+ throttled: 1, // running >= limit AND queued > 0
+ ck_backlogged: 2,
+ ck_max_wait_ms: 1500,
+ },
+ ];
+
+ const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC);
+ expect(insertError).toBeNull();
+
+ const [queryError, result] = await readAggregated(ch)({ queueName: queue });
+ expect(queryError).toBeNull();
+ expect(result).toHaveLength(1);
+ const row = result![0]!;
+
+ expect(row.enqueue_count).toBe(3);
+ expect(row.started_count).toBe(10);
+ expect(row.ack_count).toBe(2);
+ expect(row.nack_count).toBe(1);
+ expect(row.dlq_count).toBe(1);
+ expect(row.throttled_count).toBe(1);
+
+ expect(row.max_running).toBe(10);
+ expect(row.max_queued).toBe(6);
+ expect(row.max_limit).toBe(10);
+ expect(row.max_env_running).toBe(50);
+ expect(row.max_env_queued).toBe(20);
+ expect(row.max_env_limit).toBe(50);
+ expect(row.max_ck_backlogged).toBe(3);
+ expect(row.max_ck_wait_ms).toBe(2500);
+
+ expect(row.wait_ms_sum).toBe(5500);
+ expect(row.wait_ms_count).toBe(10);
+
+ // Percentiles over [100..1000]: monotonic and within the value range.
+ expect(row.wait_p50).toBeGreaterThanOrEqual(400);
+ expect(row.wait_p50).toBeLessThanOrEqual(650);
+ expect(row.wait_p90).toBeGreaterThanOrEqual(row.wait_p50);
+ expect(row.wait_p95).toBeGreaterThanOrEqual(row.wait_p90);
+ expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p95);
+ expect(row.wait_p99).toBeLessThanOrEqual(1000);
+
+ await ch.close();
+ }
+ );
+
+ clickhouseTest(
+ "merges wait-quantile state across separate insert blocks",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+ const queue = "queue-b";
+
+ // Cumulative odometer continues across the two insert blocks (baseline 0, then 1..10);
+ // deltaSumTimestamp state and quantile state merge across the parts into one bucket.
+ const startedRow = (cum: number, wait_ms?: number): QueueMetricsRawV1Input => ({
+ ...base("started", queue),
+ cumulative: cum,
+ order_key: orderKey++,
+ ...(wait_ms !== undefined ? { wait_ms } : {}),
+ });
+
+ const [e1] = await ch.queueMetrics.insertRaw(
+ [startedRow(0), ...[100, 200, 300, 400, 500].map((w, i) => startedRow(i + 1, w))],
+ SYNC
+ );
+ expect(e1).toBeNull();
+ const [e2] = await ch.queueMetrics.insertRaw(
+ [600, 700, 800, 900, 1000].map((w, i) => startedRow(i + 6, w)),
+ SYNC
+ );
+ expect(e2).toBeNull();
+
+ const [queryError, result] = await readAggregated(ch)({ queueName: queue });
+ expect(queryError).toBeNull();
+ expect(result).toHaveLength(1);
+ const row = result![0]!;
+
+ // Both blocks contribute to one bucket: counts and sums add, quantile state merges.
+ expect(row.started_count).toBe(10);
+ expect(row.wait_ms_sum).toBe(5500);
+ expect(row.wait_ms_count).toBe(10);
+ expect(row.wait_p50).toBeGreaterThanOrEqual(400);
+ expect(row.wait_p50).toBeLessThanOrEqual(650);
+ expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p50);
+ expect(row.wait_p99).toBeLessThanOrEqual(1000);
+
+ await ch.close();
+ }
+ );
+
+ clickhouseTest(
+ "5m and env rollups agree with the 10s tier, and env buckets are 10s",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+
+ // Own org so the env-level read (no queue filter) stays isolated from other tests.
+ const rollOrg = "org_qm_roll";
+ const rows: QueueMetricsRawV1Input[] = [
+ ...counter("started", "roll-a", 7, [100, 150, 200, 250, 300, 350, 400]),
+ ...counter("started", "roll-b", 3, [500, 600, 700]),
+ {
+ ...base("gauge", "roll-a"),
+ running: 4,
+ queued: 9,
+ env_running: 30,
+ env_limit: 50,
+ ck_backlogged: 5,
+ ck_max_wait_ms: 9000,
+ },
+ { ...base("gauge", "roll-b"), running: 2, queued: 1, env_running: 45, env_limit: 50 },
+ {
+ ...base("gauge", "roll-a"),
+ event_time: "2026-06-30 12:00:15",
+ running: 1,
+ queued: 2,
+ env_running: 20,
+ env_limit: 50,
+ ck_backlogged: 2,
+ ck_max_wait_ms: 3000,
+ },
+ ].map((row) => ({ ...row, organization_id: rollOrg }));
+ const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC);
+ expect(insertError).toBeNull();
+
+ const perQueue = (table: string) =>
+ ch.reader.query({
+ name: "per-queue-both-tiers",
+ query: `SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started
+ FROM ${table}
+ WHERE queue_name IN ('roll-a', 'roll-b')
+ GROUP BY queue_name ORDER BY queue_name`,
+ schema: z.object({ queue_name: z.string(), started: z.coerce.number() }),
+ })({});
+ const [e10, rows10] = await perQueue("trigger_dev.queue_metrics_v1");
+ const [e5m, rows5m] = await perQueue("trigger_dev.queue_metrics_5m_v1");
+ expect(e10).toBeNull();
+ expect(e5m).toBeNull();
+ expect(rows10).toEqual([
+ { queue_name: "roll-a", started: 7 },
+ { queue_name: "roll-b", started: 3 },
+ ]);
+ expect(rows5m).toEqual(rows10);
+
+ // CK-health gauges roll into the 5m mirror too.
+ const [ckError, ckRows] = await ch.reader.query({
+ name: "ck-5m-read",
+ query: `SELECT max(max_ck_backlogged) AS ck_keys, max(max_ck_wait_ms) AS ck_wait
+ FROM trigger_dev.queue_metrics_5m_v1
+ WHERE queue_name = 'roll-a'`,
+ schema: z.object({ ck_keys: z.coerce.number(), ck_wait: z.coerce.number() }),
+ })({});
+ expect(ckError).toBeNull();
+ expect(ckRows![0]).toEqual({ ck_keys: 5, ck_wait: 9000 });
+
+ // Env-wide totals: sum of per-queue merges (a single merge across queues would mix
+ // odometers and double-count).
+ const [envTotalError, envTotal] = await ch.reader.query({
+ name: "env-total-per-queue-sum",
+ query: `SELECT sum(started) AS started FROM (
+ SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started
+ FROM trigger_dev.queue_metrics_5m_v1
+ WHERE queue_name IN ('roll-a', 'roll-b')
+ GROUP BY queue_name
+ )`,
+ schema: z.object({ started: z.coerce.number() }),
+ })({});
+ expect(envTotalError).toBeNull();
+ expect(envTotal![0]!.started).toBe(10);
+
+ const [envError, envRows] = await ch.reader.query({
+ name: "env-rollup-read",
+ query: `SELECT
+ max(max_env_running) AS max_env_running,
+ max(max_env_limit) AS max_env_limit,
+ uniqExact(bucket_start) AS buckets,
+ round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS wait_p99
+ FROM trigger_dev.env_metrics_v1
+ WHERE organization_id = {org: String}`,
+ schema: z.object({
+ max_env_running: z.coerce.number(),
+ max_env_limit: z.coerce.number(),
+ buckets: z.coerce.number(),
+ wait_p99: z.coerce.number(),
+ }),
+ params: z.object({ org: z.string() }),
+ })({ org: rollOrg });
+ expect(envError).toBeNull();
+ expect(envRows![0]!.max_env_running).toBe(45);
+ expect(envRows![0]!.max_env_limit).toBe(50);
+ // 12:00:05 and 12:00:15 land in separate 10s env buckets (12:00:00 and 12:00:10).
+ expect(envRows![0]!.buckets).toBe(2);
+ expect(envRows![0]!.wait_p99).toBeGreaterThanOrEqual(600);
+ expect(envRows![0]!.wait_p99).toBeLessThanOrEqual(1000);
+
+ await ch.close();
+ }
+ );
+
+ clickhouseTest(
+ "merged ranking returns the page and the windowed total in one query",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+
+ const gauge = (queue: string, queued: number, running: number): QueueMetricsRawV1Input => ({
+ ...base("gauge", queue),
+ queued,
+ running,
+ });
+ const [insertError] = await ch.queueMetrics.insertRaw(
+ [gauge("rank-low", 1, 0), gauge("rank-high", 50, 3), gauge("rank-mid", 10, 2)],
+ SYNC
+ );
+ expect(insertError).toBeNull();
+
+ const args = {
+ organizationId: ORG,
+ projectId: PROJECT,
+ environmentId: ENV,
+ startTime: "2026-06-30 11:50:00",
+ nameContains: "rank-",
+ byQueuedOnly: 0,
+ };
+ const [pageError, page] = await ch.queueMetrics.ranking({ ...args, limit: 2, offset: 0 });
+ expect(pageError).toBeNull();
+ expect(page).toEqual([
+ { queue_name: "rank-high", ranked_total: 3 },
+ { queue_name: "rank-mid", ranked_total: 3 },
+ ]);
+
+ const [countError, count] = await ch.queueMetrics.rankingCount(args);
+ expect(countError).toBeNull();
+ expect(count![0]!.ranked).toBe(3);
+
+ const [namesError, names] = await ch.queueMetrics.rankingNames({ ...args, limit: 10 });
+ expect(namesError).toBeNull();
+ expect(names!.map((r) => r.queue_name)).toEqual(["rank-high", "rank-mid", "rank-low"]);
+
+ await ch.close();
+ }
+ );
+});
+
+describe("consumer retry idempotency", () => {
+ clickhouseTest(
+ "re-inserting a batch with the same dedup token does not inflate any tier",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+
+ const dedupOrg = "org_qm_dedup";
+ const rows: QueueMetricsRawV1Input[] = [
+ ...counter("started", "dedup-q", 3, [100, 200, 300]),
+ { ...base("gauge", "dedup-q"), running: 2, queued: 1, env_running: 5, env_limit: 10 },
+ ].map((row) => ({ ...row, organization_id: dedupOrg }));
+
+ const retrySettings = {
+ params: {
+ clickhouse_settings: {
+ async_insert: 0 as const,
+ insert_deduplication_token: "qm-test-retry-batch",
+ deduplicate_blocks_in_dependent_materialized_views: 1 as const,
+ },
+ },
+ };
+ for (let attempt = 0; attempt < 3; attempt++) {
+ const [error] = await ch.queueMetrics.insertRaw(rows, retrySettings);
+ expect(error).toBeNull();
+ }
+
+ const [tiersError, tiers] = await ch.reader.query({
+ name: "dedup-tier-counts",
+ query: `SELECT
+ (SELECT count() FROM trigger_dev.queue_metrics_v1 WHERE organization_id = {org: String}) AS rows_10s,
+ (SELECT count() FROM trigger_dev.queue_metrics_5m_v1 WHERE organization_id = {org: String}) AS rows_5m,
+ (SELECT count() FROM trigger_dev.env_metrics_v1 WHERE organization_id = {org: String}) AS rows_env,
+ (SELECT sum(wait_ms_count) FROM trigger_dev.env_metrics_v1 WHERE organization_id = {org: String}) AS wait_count,
+ (SELECT deltaSumTimestampMerge(started_delta) FROM trigger_dev.queue_metrics_v1 WHERE organization_id = {org: String}) AS started`,
+ schema: z.object({
+ rows_10s: z.coerce.number(),
+ rows_5m: z.coerce.number(),
+ rows_env: z.coerce.number(),
+ wait_count: z.coerce.number(),
+ started: z.coerce.number(),
+ }),
+ params: z.object({ org: z.string() }),
+ })({ org: dedupOrg });
+ expect(tiersError).toBeNull();
+ const t = tiers![0]!;
+ // Without dedup windows on the MV targets, retries append copies: rows and sums triple.
+ expect(t.rows_10s).toBe(1);
+ expect(t.rows_5m).toBe(1);
+ expect(t.rows_env).toBe(1);
+ expect(t.wait_count).toBe(3);
+ expect(t.started).toBe(3);
+
+ await ch.close();
+ }
+ );
+});
+
+describe("per-concurrency-key tier", () => {
+ clickhouseTest(
+ "per-key rows feed the ck tier without polluting per-queue counters or waits",
+ async ({ clickhouseContainer }) => {
+ const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });
+ const ckOrg = "org_qm_ck";
+ const queue = "ck-tier-q";
+ const withCk = (row: QueueMetricsRawV1Input, ck: string): QueueMetricsRawV1Input => ({
+ ...row,
+ concurrency_key: ck,
+ });
+
+ // 5 started events on one queue across two keys (t1 x3, t2 x2). Each event lands as
+ // a base row (base odometer) + a per-key row (per-key odometer), both carrying wait,
+ // exactly like the consumer expansion. Baselines seed each odometer.
+ const rows: QueueMetricsRawV1Input[] = [];
+ let ok = 0;
+ const started = (cum: number, ck: string, ckcum: number, wait: number) => {
+ rows.push({ ...base("started", queue), cumulative: cum, order_key: ok, wait_ms: wait });
+ rows.push(
+ withCk({ ...base("started", queue), cumulative: ckcum, order_key: ok, wait_ms: wait }, ck)
+ );
+ ok++;
+ };
+ rows.push({ ...base("started", queue), cumulative: 0, order_key: ok++ });
+ rows.push(withCk({ ...base("started", queue), cumulative: 0, order_key: ok++ }, "t1"));
+ rows.push(withCk({ ...base("started", queue), cumulative: 0, order_key: ok++ }, "t2"));
+ started(1, "t1", 1, 100);
+ started(2, "t1", 2, 200);
+ started(3, "t2", 1, 300);
+ started(4, "t1", 3, 400);
+ started(5, "t2", 2, 500);
+ // Per-subqueue gauges carry the key.
+ rows.push(withCk({ ...base("gauge", queue), queued: 4, running: 1 }, "t1"));
+ rows.push(withCk({ ...base("gauge", queue), queued: 2, running: 0 }, "t2"));
+
+ const [insertError] = await ch.queueMetrics.insertRaw(
+ rows.map((r) => ({ ...r, organization_id: ckOrg })),
+ SYNC
+ );
+ expect(insertError).toBeNull();
+
+ const [perQueueError, perQueue] = await ch.reader.query({
+ name: "ck-per-queue-read",
+ query: `SELECT
+ deltaSumTimestampMerge(started_delta) AS started,
+ sum(wait_ms_sum) AS wait_sum,
+ sum(wait_ms_count) AS wait_count,
+ max(max_queued) AS peak_queued
+ FROM trigger_dev.queue_metrics_v1
+ WHERE organization_id = {org: String}`,
+ schema: z.object({
+ started: z.coerce.number(),
+ wait_sum: z.coerce.number(),
+ wait_count: z.coerce.number(),
+ peak_queued: z.coerce.number(),
+ }),
+ params: z.object({ org: z.string() }),
+ })({ org: ckOrg });
+ expect(perQueueError).toBeNull();
+ // Base rows only: 5 events (not 10), waits counted once, per-key gauges still max in.
+ expect(perQueue![0]).toEqual({ started: 5, wait_sum: 1500, wait_count: 5, peak_queued: 4 });
+
+ const [ckError, ckRows] = await ch.reader.query({
+ name: "ck-tier-read",
+ query: `SELECT concurrency_key,
+ deltaSumTimestampMerge(started_delta) AS started,
+ max(max_queued) AS peak_queued,
+ sum(wait_ms_sum) AS wait_sum
+ FROM trigger_dev.queue_metrics_ck_v1
+ WHERE organization_id = {org: String}
+ GROUP BY concurrency_key ORDER BY concurrency_key`,
+ schema: z.object({
+ concurrency_key: z.string(),
+ started: z.coerce.number(),
+ peak_queued: z.coerce.number(),
+ wait_sum: z.coerce.number(),
+ }),
+ params: z.object({ org: z.string() }),
+ })({ org: ckOrg });
+ expect(ckError).toBeNull();
+ expect(ckRows).toEqual([
+ { concurrency_key: "t1", started: 3, peak_queued: 4, wait_sum: 700 },
+ { concurrency_key: "t2", started: 2, peak_queued: 2, wait_sum: 800 },
+ ]);
+
+ await ch.close();
+ }
+ );
+});
diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts
new file mode 100644
index 00000000000..dce9323ef26
--- /dev/null
+++ b/internal-packages/clickhouse/src/queueMetrics.ts
@@ -0,0 +1,214 @@
+import { z } from "zod";
+import type { ClickhouseReader, ClickhouseWriter } from "./client/types.js";
+
+export const QueueMetricsRawV1Input = z.object({
+ organization_id: z.string(),
+ project_id: z.string(),
+ environment_id: z.string(),
+ queue_name: z.string(),
+ concurrency_key: z.string().optional(),
+ event_time: z.string(),
+ // Exact UInt64 ordering key; a string preserves precision past JS safe-integer range
+ // (see entryOrderKey). A plain number is still accepted for small test values.
+ order_key: z.union([z.string(), z.number()]).optional(),
+ op: z.enum(["gauge", "enqueue", "started", "ack", "nack", "dlq"]),
+ running: z.number().optional(),
+ queued: z.number().optional(),
+ queue_limit: z.number().optional(),
+ env_running: z.number().optional(),
+ env_queued: z.number().optional(),
+ env_limit: z.number().optional(),
+ throttled: z.number().optional(),
+ ck_backlogged: z.number().optional(),
+ ck_max_wait_ms: z.number().optional(),
+ wait_ms: z.number().optional(),
+ cumulative: z.number().optional(),
+});
+
+export type QueueMetricsRawV1Input = z.input<typeof QueueMetricsRawV1Input>; // pre-parse shape
+
+/** Insert function for raw queue-metric rows (`trigger_dev.queue_metrics_raw_v1`). */
+export function insertQueueMetricsRaw(ch: ClickhouseWriter) {
+  const name = "insertQueueMetricsRaw";
+  const table = "trigger_dev.queue_metrics_raw_v1";
+  return ch.insertUnsafe({ name, table });
+}
+
+// --- Reads (Queues list metrics + health) ---
+
+const QueueMetricsListParams = z.object({
+ organizationId: z.string(),
+ projectId: z.string(),
+ environmentId: z.string(),
+ queueNames: z.array(z.string()),
+ startTime: z.string(),
+ endTime: z.string(),
+});
+
+const QueueMetricsSummaryRow = z.object({
+ queue_name: z.string(),
+ p50_wait_ms: z.coerce.number(),
+ p95_wait_ms: z.coerce.number(),
+ peak_queued: z.coerce.number(),
+ started_count: z.coerce.number(),
+});
+
+// Callers align window bounds to the bucket grid so repeated loads share cache entries.
+const QUEUE_METRICS_CACHE_SETTINGS = {
+ use_query_cache: 1,
+ query_cache_ttl: 30,
+} as const;
+
+/**
+ * Per-queue window rollup (p50 / p95 wait, peak queued depth, started count) for a fixed queue list.
+ */
+export function getQueueListMetricsSummary(reader: ClickhouseReader) {
+  const query = `SELECT
+ queue_name,
+ round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[1]) AS p50_wait_ms,
+ round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95_wait_ms,
+ max(max_queued) AS peak_queued,
+ deltaSumTimestampMerge(started_delta) AS started_count
+ FROM trigger_dev.queue_metrics_v1
+ WHERE organization_id = {organizationId: String}
+ AND project_id = {projectId: String}
+ AND environment_id = {environmentId: String}
+ AND queue_name IN {queueNames: Array(String)}
+ AND bucket_start >= {startTime: DateTime}
+ AND bucket_start < {endTime: DateTime}
+ GROUP BY queue_name`;
+  const name = "getQueueListMetricsSummary";
+  const params = QueueMetricsListParams;
+  const schema = QueueMetricsSummaryRow;
+  return reader.query({ name, query, params, schema, settings: QUEUE_METRICS_CACHE_SETTINGS });
+}
+
+const QueueDepthSparklineParams = QueueMetricsListParams.extend({
+ bucketSeconds: z.number(),
+});
+
+const QueueDepthSparklineRow = z.object({
+ queue_name: z.string(),
+ bucket: z.string(),
+ depth: z.coerce.number(),
+});
+
+/**
+ * Peak queued depth per queue and per `bucketSeconds` interval, ordered by bucket, for sparklines.
+ */
+export function getQueueDepthSparklines(reader: ClickhouseReader) {
+  const query = `SELECT
+ queue_name,
+ toStartOfInterval(bucket_start, toIntervalSecond({bucketSeconds: UInt32})) AS bucket,
+ max(max_queued) AS depth
+ FROM trigger_dev.queue_metrics_v1
+ WHERE organization_id = {organizationId: String}
+ AND project_id = {projectId: String}
+ AND environment_id = {environmentId: String}
+ AND queue_name IN {queueNames: Array(String)}
+ AND bucket_start >= {startTime: DateTime}
+ AND bucket_start < {endTime: DateTime}
+ GROUP BY queue_name, bucket
+ ORDER BY bucket`;
+  const name = "getQueueDepthSparklines";
+  const params = QueueDepthSparklineParams;
+  const schema = QueueDepthSparklineRow;
+  return reader.query({ name, query, params, schema, settings: QUEUE_METRICS_CACHE_SETTINGS });
+}
+
+const QueueRankingParams = z.object({
+ organizationId: z.string(),
+ projectId: z.string(),
+ environmentId: z.string(),
+ startTime: z.string(),
+ /** 1 = rank by peak backlog only; 0 = backlog + running ("busiest"). */
+ byQueuedOnly: z.number(),
+ nameContains: z.string(),
+ limit: z.number(),
+ offset: z.number(),
+});
+
+const QueueRankingRow = z.object({
+ queue_name: z.string(),
+ ranked_total: z.coerce.number(),
+});
+
+// Ranking reads the 5m rollup: a 15-minute window there costs ~30x fewer rows than the
+// 10s table.
+const RANKING_WHERE = `organization_id = {organizationId: String}
+ AND project_id = {projectId: String}
+ AND environment_id = {environmentId: String}
+ AND bucket_start >= {startTime: DateTime}
+ AND queue_name != '__overflow__'
+ AND ({nameContains: String} = '' OR positionCaseInsensitive(queue_name, {nameContains: String}) > 0)`;
+
+/**
+ * One page of queue names ranked by recent activity; `count() OVER ()` puts the total ranked
+ * count on every row, so page + count cost one scan. Sorted in the outer query, next to LIMIT.
+ */
+export function getQueueRanking(reader: ClickhouseReader) {
+  return reader.query({
+    name: "getQueueRanking",
+    query: `SELECT queue_name, count() OVER () AS ranked_total
+      FROM (
+        SELECT
+          queue_name,
+          if({byQueuedOnly: UInt8} = 1, max(max_queued), max(max_queued) + max(max_running)) AS rank_key
+        FROM trigger_dev.queue_metrics_5m_v1
+        WHERE ${RANKING_WHERE}
+        GROUP BY queue_name
+      )
+      ORDER BY rank_key DESC, queue_name ASC
+      LIMIT {limit: UInt32} OFFSET {offset: UInt32}`,
+    params: QueueRankingParams,
+    schema: QueueRankingRow,
+    settings: QUEUE_METRICS_CACHE_SETTINGS,
+  });
+}
+
+const QueueRankingNamesParams = QueueRankingParams.omit({ byQueuedOnly: true, offset: true });
+
+const QueueRankingNameRow = z.object({
+ queue_name: z.string(),
+});
+
+/**
+ * Ranked queue names in activity order (peak queued + peak running), capped at `limit` rows.
+ */
+export function getQueueRankingNames(reader: ClickhouseReader) {
+  const query = `SELECT queue_name
+ FROM trigger_dev.queue_metrics_5m_v1
+ WHERE ${RANKING_WHERE}
+ GROUP BY queue_name
+ ORDER BY max(max_queued) + max(max_running) DESC, queue_name ASC
+ LIMIT {limit: UInt32}`;
+  const name = "getQueueRankingNames";
+  const params = QueueRankingNamesParams;
+  const schema = QueueRankingNameRow;
+  return reader.query({ name, query, params, schema, settings: QUEUE_METRICS_CACHE_SETTINGS });
+}
+
+const QueueRankingCountParams = QueueRankingParams.omit({
+ byQueuedOnly: true,
+ limit: true,
+ offset: true,
+});
+
+const QueueRankingCountRow = z.object({
+ ranked: z.coerce.number(),
+});
+
+/**
+ * Number of ranked queues on its own, via `uniq` (an approximate distinct count of queue_name).
+ */
+export function getQueueRankingCount(reader: ClickhouseReader) {
+  const query = `SELECT uniq(queue_name) AS ranked
+ FROM trigger_dev.queue_metrics_5m_v1
+ WHERE ${RANKING_WHERE}`;
+  const name = "getQueueRankingCount";
+  const params = QueueRankingCountParams;
+  const schema = QueueRankingCountRow;
+  return reader.query({ name, query, params, schema, settings: QUEUE_METRICS_CACHE_SETTINGS });
+}
+
+// (per-queue detail series is now fetched via TRQL + fillGaps from the metric resource route)
diff --git a/internal-packages/metrics-pipeline/package.json b/internal-packages/metrics-pipeline/package.json
new file mode 100644
index 00000000000..10a7c137a1f
--- /dev/null
+++ b/internal-packages/metrics-pipeline/package.json
@@ -0,0 +1,33 @@
+{
+ "name": "@internal/metrics-pipeline",
+ "private": true,
+ "version": "0.0.1",
+ "main": "./dist/src/index.js",
+ "types": "./dist/src/index.d.ts",
+ "type": "module",
+ "exports": {
+ ".": {
+ "@triggerdotdev/source": "./src/index.ts",
+ "import": "./dist/src/index.js",
+ "types": "./dist/src/index.d.ts",
+ "default": "./dist/src/index.js"
+ }
+ },
+ "dependencies": {
+ "@internal/redis": "workspace:*",
+ "@internal/tracing": "workspace:*",
+ "@trigger.dev/core": "workspace:*"
+ },
+ "devDependencies": {
+ "@internal/testcontainers": "workspace:*",
+ "rimraf": "6.0.1"
+ },
+ "scripts": {
+ "clean": "rimraf dist",
+ "typecheck": "tsc --noEmit -p tsconfig.build.json",
+ "test": "vitest --sequence.concurrent=false --no-file-parallelism",
+ "test:coverage": "vitest --sequence.concurrent=false --no-file-parallelism --coverage.enabled",
+ "build": "pnpm run clean && tsc -p tsconfig.build.json",
+ "dev": "tsc --watch -p tsconfig.build.json"
+ }
+}
diff --git a/internal-packages/metrics-pipeline/src/cachedValue.ts b/internal-packages/metrics-pipeline/src/cachedValue.ts
new file mode 100644
index 00000000000..7f7bbb07903
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/cachedValue.ts
@@ -0,0 +1,125 @@
+import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis";
+import { Logger } from "@trigger.dev/core/logger";
+
+export type CachedRedisValueOptions<T> = {
+  redis: RedisOptions; // connection options; any keyPrefix in here is dropped by the class
+  key: string; // Redis key to GET
+  parse: (raw: string | null) => T; // turns the GET result into the cached value
+  defaultValue: T; // served until a read succeeds
+  cacheTtlMs?: number; // staleness threshold in ms; defaults to 10_000
+  logger?: Logger;
+  loggerName?: string; // only used when `logger` is omitted
+};
+
+// Reads a Redis key behind a short time-based cache with a synchronous getter for hot paths.
+// Warms eagerly on construction; a stale get() serves the cached value and refreshes in the
+// background. Concurrent refreshes dedupe onto one GET, so awaiting one yields a finished read.
+export class CachedRedisValue<T> {
+  private readonly redis: Redis;
+  private readonly key: string;
+  private readonly parse: (raw: string | null) => T;
+  private readonly cacheTtlMs: number;
+  private readonly logger: Logger;
+  private value: T;
+  private lastFetchedAt = 0; // stays 0 until the first read attempt finishes
+  private refreshPromise?: Promise<T>; // in-flight read shared by concurrent refresh() calls
+
+  constructor(options: CachedRedisValueOptions<T>) {
+    this.logger = options.logger ?? new Logger(options.loggerName ?? "CachedRedisValue", "warn");
+    this.redis = createRedisClient(
+      { ...options.redis, keyPrefix: undefined }, // drops any configured keyPrefix
+      {
+        onError: (error) =>
+          this.logger.error("cached value redis error", { error, key: options.key }),
+      }
+    );
+    this.key = options.key;
+    this.parse = options.parse;
+    this.cacheTtlMs = options.cacheTtlMs ?? 10_000;
+    this.value = options.defaultValue;
+    void this.refresh(); // eager warm-up; read/parse failures are caught inside #doRefresh
+  }
+
+  get(): T {
+    if (Date.now() - this.lastFetchedAt > this.cacheTtlMs) {
+      void this.refresh(); // stale: refresh in the background, still return the cached value
+    }
+    return this.value;
+  }
+
+  async refresh(): Promise<T> {
+    if (this.refreshPromise) return this.refreshPromise; // join the read already in flight
+    this.refreshPromise = this.#doRefresh();
+    try {
+      return await this.refreshPromise;
+    } finally {
+      this.refreshPromise = undefined;
+    }
+  }
+
+  async #doRefresh(): Promise<T> {
+    try {
+      this.value = this.parse(await this.redis.get(this.key));
+    } catch (error) {
+      this.logger.debug("cached value refresh failed, keeping cached value", {
+        error,
+        key: this.key,
+      });
+    } finally {
+      this.lastFetchedAt = Date.now(); // stamped on failure too, so the retry waits one TTL
+    }
+    return this.value;
+  }
+
+  async close(): Promise<void> {
+    await this.redis.quit();
+  }
+}
+
+export type CachedRedisNumberOptions = {
+ redis: RedisOptions;
+ key: string;
+ defaultValue: number;
+ min?: number;
+ max?: number;
+ cacheTtlMs?: number;
+ logger?: Logger;
+};
+
+// Live-tunable numeric value, clamped to [min,max]; falls back to the (clamped) defaultValue on
+// a missing, blank or non-finite key. Exposes a synchronous value() for hot paths.
+export class CachedRedisNumber {
+  private readonly inner: CachedRedisValue<number>;
+
+  constructor(options: CachedRedisNumberOptions) {
+    const min = options.min ?? Number.NEGATIVE_INFINITY;
+    const max = options.max ?? Number.POSITIVE_INFINITY;
+    const clamp = (n: number) => Math.min(max, Math.max(min, n));
+    const fallback = clamp(options.defaultValue);
+    this.inner = new CachedRedisValue<number>({
+      redis: options.redis,
+      key: options.key,
+      parse: (raw) => {
+        // Number("") is 0 (not NaN), so treat blank/whitespace as missing => fallback.
+        const n = raw == null || raw.trim() === "" ? Number.NaN : Number(raw);
+        return Number.isFinite(n) ? clamp(n) : fallback;
+      },
+      defaultValue: fallback,
+      cacheTtlMs: options.cacheTtlMs,
+      logger: options.logger,
+      loggerName: "CachedRedisNumber",
+    });
+  }
+
+  value(): number {
+    return this.inner.get();
+  }
+
+  refresh(): Promise<number> {
+    return this.inner.refresh();
+  }
+
+  close(): Promise<void> {
+    return this.inner.close();
+  }
+}
diff --git a/internal-packages/metrics-pipeline/src/consumer.test.ts b/internal-packages/metrics-pipeline/src/consumer.test.ts
new file mode 100644
index 00000000000..672fa426999
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/consumer.test.ts
@@ -0,0 +1,392 @@
+import { createRedisClient } from "@internal/redis";
+import { redisTest } from "@internal/testcontainers";
+import { expect } from "vitest";
+import { CachedRedisFlag } from "./flag.js";
+import { CachedRedisNumber } from "./cachedValue.js";
+import { MetricsStreamConsumer } from "./consumer.js";
+import { MetricsStreamEmitter } from "./emitter.js";
+import { shardFor } from "./hash.js";
+import { streamKey, type MetricDefinition } from "./types.js";
+
+async function waitFor(cond: () => boolean, timeoutMs = 5000): Promise { // Resolves once cond() returns true.
+ const start = Date.now();
+ while (!cond()) {
+ if (Date.now() - start > timeoutMs) throw new Error("waitFor timed out"); // only checked while cond() is false
+ await new Promise((r) => setTimeout(r, 50)); // re-check roughly every 50ms
+ }
+}
+
+function definitionFor(suffix: string, shardCount = 2): MetricDefinition { // Test definition; the name embeds Date.now() and suffix.
+ return { name: `qm_${Date.now()}_${suffix}`, shardCount, consumerGroup: "cg", maxLen: 1000 };
+}
+
+redisTest(
+ "emitter -> consumer round trip maps rows, dedups, and acks",
+ async ({ redisOptions }) => {
+ const definition = definitionFor("rt");
+ const emitter = new MetricsStreamEmitter({
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ });
+ const inserted: Array<{ rows: Array>; dedupToken: string }> = [];
+
+ const consumer = new MetricsStreamConsumer>({
+ redis: redisOptions,
+ definition,
+ consumerName: "c1",
+ mapEntry: (e) => ({ id: e.id, ...e.fields }),
+ insert: async (rows, { dedupToken }) => {
+ inserted.push({ rows, dedupToken });
+ },
+ blockMs: 200,
+ });
+ await emitter.waitUntilReady(); // emit() drops events while the emitter's connection is not ready
+ await consumer.start();
+ emitter.emit("queueA", { op: "enqueue", q: "queueA" });
+ emitter.emit("queueB", { op: "started", q: "queueB", wait: 42 });
+ // A first emit writes two entries (cum=0 baseline + reading), so wait for both queues, not for 2 rows.
+ await waitFor(() => new Set(inserted.flatMap((i) => i.rows.map((r) => r.q))).size >= 2);
+ await consumer.stop();
+
+ const rows = inserted.flatMap((i) => i.rows);
+ expect(rows).toContainEqual(expect.objectContaining({ op: "enqueue", q: "queueA" }));
+ expect(rows).toContainEqual(
+ expect.objectContaining({ op: "started", q: "queueB", wait: "42" }) // stream field values come back as strings
+ );
+ expect(inserted[0]!.dedupToken).toMatch(/^[0-9a-f]{40}$/);
+
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ for (const key of consumer.streamKeys()) {
+ const pending = (await admin.xpending(key, definition.consumerGroup)) as [
+ number,
+ ...unknown[],
+ ];
+ expect(pending[0]).toBe(0); // every delivered entry was acked
+ }
+ await admin.quit();
+ await emitter.close();
+ }
+);
+
+redisTest("emit is a no-op when the flag is disabled", async ({ redisOptions }) => {
+ const definition = definitionFor("off");
+ const emitter = new MetricsStreamEmitter({
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => false },
+ });
+ await emitter.waitUntilReady(); // otherwise the not-ready drop in emit() would hide a broken flag gate
+ emitter.emit("q", { op: "enqueue", q: "q" });
+ await new Promise((r) => setTimeout(r, 200)); // give a wrongly-issued write time to land
+
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const len = await admin.xlen(streamKey(definition, shardFor("q", definition.shardCount)));
+ expect(len).toBe(0);
+ await admin.quit();
+ await emitter.close();
+});
+
+redisTest("reclaims stale pending entries from a dead consumer", async ({ redisOptions }) => {
+ const definition = definitionFor("claim", 1);
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const key = streamKey(definition, 0);
+
+ await admin.xgroup("CREATE", key, definition.consumerGroup, "$", "MKSTREAM"); // stream is empty here, so both XADDs below are new to the group
+ await admin.xadd(key, "*", "op", "ack", "q", "qZ");
+ await admin.xadd(key, "*", "op", "nack", "q", "qZ");
+ await admin.xreadgroup( // deliver both entries to consumer "zombie", which never acks them
+ "GROUP",
+ definition.consumerGroup,
+ "zombie",
+ "COUNT",
+ 10,
+ "STREAMS",
+ key,
+ ">"
+ );
+
+ const inserted: Array> = [];
+ const consumer = new MetricsStreamConsumer>({
+ redis: redisOptions,
+ definition,
+ consumerName: "live",
+ mapEntry: (e) => ({ id: e.id, ...e.fields }),
+ insert: async (rows) => {
+ inserted.push(...rows);
+ },
+ blockMs: 200,
+ claimIdleMs: 0, // min-idle 0: any pending entry is immediately claimable
+ });
+
+ await consumer.start();
+ await waitFor(() => inserted.length >= 2);
+ await consumer.stop();
+
+ expect(inserted.map((r) => r.op).sort()).toEqual(["ack", "nack"]);
+ const pending = (await admin.xpending(key, definition.consumerGroup)) as [number, ...unknown[]];
+ expect(pending[0]).toBe(0); // the reclaimed entries were acked by "live"
+ await admin.quit();
+});
+
+redisTest(
+ "per-stream batches: one insert + distinct dedup token per shard stream",
+ async ({ redisOptions }) => {
+ const definition = definitionFor("pershard", 2);
+ const emitter = new MetricsStreamEmitter({
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ });
+ // Two shard keys that land on different shards.
+ const a = "shardkey-a";
+ let b = "shardkey-b0";
+ for (let i = 1; shardFor(b, 2) === shardFor(a, 2); i++) b = `shardkey-b${i}`;
+
+ const inserted: Array<{ rows: Array>; dedupToken: string }> = [];
+ const consumer = new MetricsStreamConsumer>({
+ redis: redisOptions,
+ definition,
+ consumerName: "c1",
+ mapEntry: (e) => ({ id: e.id, ...e.fields }),
+ insert: async (rows, { dedupToken }) => {
+ inserted.push({ rows, dedupToken });
+ },
+ blockMs: 200,
+ });
+ await emitter.waitUntilReady(); // emit() drops events while the emitter's connection is not ready
+ await consumer.start();
+ emitter.emit(a, { op: "enqueue", q: a });
+ emitter.emit(b, { op: "enqueue", q: b });
+ // Wait for both shard batches: one shard alone already yields 2 rows (cum=0 baseline + reading).
+ await waitFor(() => inserted.filter((i) => i.rows.length > 0).length >= 2);
+ await consumer.stop();
+ await emitter.close();
+
+ // Each shard's batch is its own dedup block with its own (stream-scoped) token.
+ const batchesWithRows = inserted.filter((i) => i.rows.length > 0);
+ expect(batchesWithRows.length).toBe(2);
+ expect(new Set(batchesWithRows.map((i) => i.dedupToken)).size).toBe(2);
+ }
+);
+
+redisTest(
+ "probe reports lag as null (not 0) when Redis cannot compute it",
+ async ({ redisOptions }) => {
+ const definition = definitionFor("nillag", 1);
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const key = streamKey(definition, 0);
+
+ await admin.xgroup("CREATE", key, definition.consumerGroup, "0", "MKSTREAM");
+ const ids: string[] = [];
+ for (let i = 0; i < 5; i++) {
+ ids.push((await admin.xadd(key, "*", "op", "enqueue", "q", "qT")) as string);
+ }
+ // SETID to an arbitrary id makes the group's entries-read unknown => lag is nil
+ // (severe trimming can do the same in prod); the probe must NOT report that as 0.
+ await admin.xgroup("SETID", key, definition.consumerGroup, ids[2]!);
+
+ const consumer = new MetricsStreamConsumer>({
+ redis: redisOptions,
+ definition,
+ consumerName: "c1",
+ mapEntry: (e) => ({ id: e.id, ...e.fields }),
+ insert: async () => {},
+ });
+ try {
+ const states = await consumer.streamState(); // probe only; the consumer loop is never started
+ expect(states[0]!.lag).toBeNull();
+ } finally {
+ await consumer.stop(); // still needed to close the consumer's Redis clients
+ await admin.quit();
+ }
+ }
+);
+
+redisTest(
+ "emitGauge XADDs an op=gauge snapshot onto the shared metrics stream",
+ async ({ redisOptions }) => {
+ const definition = definitionFor("gauge", 2);
+ const emitter = new MetricsStreamEmitter({
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ });
+
+ // Emits before the connection is ready are dropped by design (loss-tolerant).
+ await emitter.waitUntilReady();
+ emitter.emitGauge("q1", {
+ op: "gauge",
+ q: "q1",
+ ql: 5,
+ cc: 2,
+ lim: 10,
+ eql: 3,
+ ec: 1,
+ elim: 20,
+ thr: 0,
+ });
+
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const key = streamKey(definition, shardFor("q1", 2));
+ // Plain XADD (no odometer, no cum=0 seed) => exactly one entry, unlike counter emit().
+ await waitFor2(async () => (await admin.xlen(key)) === 1);
+ const raw = (await admin.xrange(key, "-", "+")) as Array<[string, string[]]>;
+ const flat = raw[0]![1]; // flat [field, value, field, value, ...] list of the single entry
+ const fields: Record = {};
+ for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] = flat[i + 1]!;
+ expect(fields.op).toBe("gauge");
+ expect(fields.q).toBe("q1");
+ expect(fields.ql).toBe("5"); // numbers are stored as strings
+ expect(fields.thr).toBe("0");
+ await admin.quit();
+ await emitter.close();
+ }
+);
+
+async function waitFor2(cond: () => Promise, timeoutMs = 5000): Promise { // waitFor for an async predicate
+ const start = Date.now();
+ while (!(await cond())) {
+ if (Date.now() - start > timeoutMs) throw new Error("waitFor2 timed out"); // only checked while cond() is false
+ await new Promise((r) => setTimeout(r, 50)); // re-check roughly every 50ms
+ }
+}
+
+redisTest("sampledSync gates on both the flag and the sample rate", async ({ redisOptions }) => {
+ const definition = definitionFor("sample");
+ const off = new MetricsStreamEmitter({ // enabled, rate 0
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ gaugeSampleRate: 0,
+ });
+ const on = new MetricsStreamEmitter({ // enabled, rate 1
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ gaugeSampleRate: 1,
+ });
+ const disabled = new MetricsStreamEmitter({ // flag off, rate 1
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => false },
+ gaugeSampleRate: 1,
+ });
+
+ expect(off.sampledSync()).toBe(false); // rate 0 => never sampled in
+ expect(on.sampledSync()).toBe(true); // rate 1 + enabled => always
+ expect(disabled.sampledSync()).toBe(false); // disabled => never, regardless of rate
+ expect(on.enabledSync()).toBe(true); // enabledSync (counters) is unaffected by sampling
+
+ await Promise.all([off.close(), on.close(), disabled.close()]);
+});
+
+redisTest("sampledSync honors a live rate provider (no reconstruct)", async ({ redisOptions }) => {
+ const definition = definitionFor("live");
+ let rate = 1; // mutated below; the provider closure reads it on each sampledSync() call
+ const emitter = new MetricsStreamEmitter({
+ redis: redisOptions,
+ definition,
+ flag: { enabled: () => true },
+ gaugeSampleRate: { value: () => rate },
+ });
+ expect(emitter.sampledSync()).toBe(true);
+ rate = 0;
+ expect(emitter.sampledSync()).toBe(false); // same emitter instance, new rate
+ await emitter.close();
+});
+
+redisTest("CachedRedisNumber reads live, clamps, and falls back", async ({ redisOptions }) => {
+ const key = `rate_${Date.now()}`;
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const num = new CachedRedisNumber({ redis: redisOptions, key, defaultValue: 1, min: 0, max: 1 });
+
+ await num.refresh(); // each refresh() below forces a re-read before the assertion
+ expect(num.value()).toBe(1); // missing key => default
+ await admin.set(key, "0.25");
+ await num.refresh();
+ expect(num.value()).toBe(0.25); // in range => used as-is
+ await admin.set(key, "5");
+ await num.refresh();
+ expect(num.value()).toBe(1); // out of range => clamped
+ await admin.set(key, "nonsense");
+ await num.refresh();
+ expect(num.value()).toBe(1); // unparseable => default
+
+ await num.close();
+ await admin.quit();
+});
+
+redisTest("streamState reports depth, lag, and pending per shard", async ({ redisOptions }) => {
+ const definition = definitionFor("state", 1);
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const key = streamKey(definition, 0);
+
+ await admin.xgroup("CREATE", key, definition.consumerGroup, "$", "MKSTREAM");
+ await admin.xadd(key, "*", "op", "enqueue", "q", "qX");
+ await admin.xadd(key, "*", "op", "ack", "q", "qX");
+ // Read one entry as some consumer and leave it unacked -> 1 pending, 1 still undelivered.
+ await admin.xreadgroup(
+ "GROUP",
+ definition.consumerGroup,
+ "reader",
+ "COUNT",
+ 1,
+ "STREAMS",
+ key,
+ ">"
+ );
+
+ const consumer = new MetricsStreamConsumer>({
+ redis: redisOptions,
+ definition,
+ consumerName: "c1",
+ mapEntry: (e) => ({ id: e.id, ...e.fields }),
+ insert: async () => {},
+ });
+
+ try {
+ const states = await consumer.streamState(); // probe only; the consumer loop is never started
+ expect(states).toHaveLength(1);
+ expect(states[0]!.depth).toBe(2); // both entries are still in the stream
+ expect(states[0]!.pending).toBe(1); // the entry delivered to "reader" and never acked
+ expect(states[0]!.lag).toBe(1); // the entry not yet delivered to the group
+ } finally {
+ await consumer.stop();
+ await admin.quit();
+ }
+});
+
+redisTest("CachedRedisFlag reads a redis key with caching", async ({ redisOptions }) => {
+ const key = `flag_${Date.now()}`;
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ const flag = new CachedRedisFlag({ redis: redisOptions, key, cacheTtlMs: 10_000 }); // presumably long enough that only the explicit refresh() calls re-read the key
+
+ expect(flag.enabled()).toBe(false); // key not set yet
+ await flag.refresh();
+ expect(flag.enabled()).toBe(false);
+
+ await admin.set(key, "1");
+ await flag.refresh();
+ expect(flag.enabled()).toBe(true);
+
+ await admin.set(key, "0"); // "0" is not in the truthy set
+ await flag.refresh();
+ expect(flag.enabled()).toBe(false);
+
+ await flag.close();
+ await admin.quit();
+});
+
+redisTest("CachedRedisFlag warms eagerly on construction", async ({ redisOptions }) => {
+ const key = `flag_eager_${Date.now()}`;
+ const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+ await admin.set(key, "1"); // set before the flag object exists
+
+ const flag = new CachedRedisFlag({ redis: redisOptions, key });
+ // No manual refresh(): the constructor kicks one off so the first real read is warm.
+ await waitFor(() => flag.enabled() === true);
+ expect(flag.enabled()).toBe(true);
+
+ await flag.close();
+ await admin.quit();
+});
diff --git a/internal-packages/metrics-pipeline/src/consumer.ts b/internal-packages/metrics-pipeline/src/consumer.ts
new file mode 100644
index 00000000000..9e333e70ab1
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/consumer.ts
@@ -0,0 +1,336 @@
+import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis";
+import {
+ getMeter,
+ type Counter,
+ type Histogram,
+ type Meter,
+ type ObservableGauge,
+ ValueType,
+} from "@internal/tracing";
+import { Logger } from "@trigger.dev/core/logger";
+import { dedupTokenFromEntryIds } from "./idempotency.js";
+import { allStreamKeys, type MetricDefinition, type StreamEntry } from "./types.js";
+
+export type MetricsStreamConsumerOptions = {
+ redis: RedisOptions; // used with keyPrefix forced to undefined
+ definition: MetricDefinition;
+ /** Unique per process; distinct replicas MUST use distinct names (PEL ownership). */
+ consumerName: string;
+ /** Map a stream entry to a row, or null to drop it (still acked). */
+ mapEntry: (entry: StreamEntry) => TRow | TRow[] | null;
+ /** Insert a batch. Must be idempotent w.r.t. dedupToken; throw to retry the batch. */
+ insert: (rows: TRow[], opts: { dedupToken: string }) => Promise;
+ batchSize?: number; // COUNT for XREADGROUP and XAUTOCLAIM; default 1000
+ blockMs?: number; // XREADGROUP BLOCK timeout in ms; default 1000
+ claimIdleMs?: number; // min idle ms before XAUTOCLAIM takes over a pending entry; default 60_000
+ /** How often to scan for stale pending entries (XAUTOCLAIM); not every poll. */
+ reclaimIntervalMs?: number; // default 15_000
+ errorBackoffMs?: number; // sleep after a failed loop iteration; default 1000
+ logger?: Logger; // default: new Logger("MetricsStreamConsumer", "info")
+ meter?: Meter; // default: getMeter("metrics-pipeline")
+};
+
+type RawEntry = [id: string, fields: string[]]; // fields is a flat [field, value, ...] list
+type RawStream = [key: string, entries: RawEntry[]]; // one stream's slice of an XREADGROUP reply
+
+/** Per-shard stream health, surfaced as observable gauges and usable directly in tests.
+ * `lag: null` means lag is unknown: Redis reported none (e.g. entries trimmed past the group's
+ * read position — data loss), or XINFO GROUPS failed / did not list the group. Alert on it; it is NOT zero. */
+export type ShardState = { shard: number; depth: number; lag: number | null; pending: number };
+
+function parseFields(flat: string[]): Record { // Folds a flat [field, value, ...] list into an object.
+ const out: Record = {};
+ for (let i = 0; i + 1 < flat.length; i += 2) { // a trailing field without a value is ignored
+ out[flat[i]!] = flat[i + 1]!; // a repeated field keeps its last value
+ }
+ return out;
+}
+
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); // resolves after ~ms milliseconds
+
+/**
+ * Reads a sharded metrics stream via a consumer group, inserting each stream's poll-batch
+ * as its own dedup block (so an XAUTOCLAIM-reclaimed batch re-forms the same id set and
+ * token), acking only after a successful insert. Sequential read/insert/ack per process.
+ */
+export class MetricsStreamConsumer {
+ private readonly redis: Redis; // read/claim/ack connection (carries the blocking XREADGROUP)
+ private readonly probeRedis: Redis; // health-probe connection used by streamState()
+ private readonly def: MetricDefinition;
+ private readonly keys: string[]; // all shard stream keys; the array index is the shard number
+ private readonly consumerName: string;
+ private readonly batchSize: number;
+ private readonly blockMs: number;
+ private readonly claimIdleMs: number;
+ private readonly reclaimIntervalMs: number;
+ private lastReclaimAt = 0; // 0 => the first loop iteration always runs a reclaim scan
+ private readonly errorBackoffMs: number;
+ private readonly logger: Logger;
+ private readonly mapEntry: (entry: StreamEntry) => TRow | TRow[] | null;
+ private readonly insert: (rows: TRow[], opts: { dedupToken: string }) => Promise;
+
+ private readonly meter: Meter;
+ private readonly entriesCounter: Counter;
+ private readonly rowsCounter: Counter;
+ private readonly insertErrorCounter: Counter;
+ private readonly insertDuration: Histogram;
+ private readonly observables: ObservableGauge[];
+ private readonly batchCallback: Parameters[0]; // kept so stop() can unregister the same function
+
+ private running = false;
+ private loopPromise?: Promise; // set by start(); awaited by stop()
+
+ constructor(options: MetricsStreamConsumerOptions) {
+ this.logger = options.logger ?? new Logger("MetricsStreamConsumer", "info");
+ const redisConfig = { ...options.redis, keyPrefix: undefined }; // stream keys are used verbatim, never prefixed
+ this.redis = createRedisClient(redisConfig, {
+ onError: (error) => this.logger.error("consumer redis error", { error }),
+ });
+ // Separate client so the observable-gauge probes never queue behind the blocking XREADGROUP.
+ this.probeRedis = createRedisClient(redisConfig, {
+ onError: (error) => this.logger.error("consumer probe redis error", { error }),
+ });
+ this.def = options.definition;
+ this.keys = allStreamKeys(options.definition);
+ this.consumerName = options.consumerName;
+ this.batchSize = options.batchSize ?? 1000;
+ this.blockMs = options.blockMs ?? 1000;
+ this.claimIdleMs = options.claimIdleMs ?? 60_000;
+ this.reclaimIntervalMs = options.reclaimIntervalMs ?? 15_000;
+ this.errorBackoffMs = options.errorBackoffMs ?? 1000;
+ this.mapEntry = options.mapEntry;
+ this.insert = options.insert;
+
+ this.meter = options.meter ?? getMeter("metrics-pipeline");
+ this.entriesCounter = this.meter.createCounter("queue_metrics.consumer.entries", {
+ description: "Stream entries read (attr source=new|reclaimed)",
+ valueType: ValueType.INT,
+ });
+ this.rowsCounter = this.meter.createCounter("queue_metrics.consumer.rows_inserted", {
+ description: "Rows inserted into the sink",
+ valueType: ValueType.INT,
+ });
+ this.insertErrorCounter = this.meter.createCounter("queue_metrics.consumer.insert_errors", {
+ description: "Failed inserts (batch left pending for retry)",
+ valueType: ValueType.INT,
+ });
+ this.insertDuration = this.meter.createHistogram("queue_metrics.consumer.insert_duration", {
+ description: "Sink insert latency",
+ unit: "ms",
+ valueType: ValueType.INT,
+ });
+
+ const depthGauge = this.meter.createObservableGauge("queue_metrics.consumer.stream_depth", {
+ description: "Entries currently in each shard stream (approaches MAXLEN => trimming)",
+ valueType: ValueType.INT,
+ });
+ const lagGauge = this.meter.createObservableGauge("queue_metrics.consumer.group_lag", {
+ description: "Entries not yet delivered to the consumer group (consumer falling behind)",
+ valueType: ValueType.INT,
+ });
+ const pendingGauge = this.meter.createObservableGauge("queue_metrics.consumer.pending", {
+ description: "Unacked (in-flight or stuck) entries in the group PEL",
+ valueType: ValueType.INT,
+ });
+ const lagUnknownGauge = this.meter.createObservableGauge("queue_metrics.consumer.lag_unknown", {
+ description:
+ "1 when Redis cannot compute group lag (entries trimmed => data loss); alert on this",
+ valueType: ValueType.INT,
+ });
+ this.observables = [depthGauge, lagGauge, pendingGauge, lagUnknownGauge];
+ this.batchCallback = async (result) => { // one probe per collection feeds all four gauges
+ const states = await this.streamState();
+ for (const s of states) {
+ const attrs = { stream: this.def.name, shard: String(s.shard) };
+ result.observe(depthGauge, s.depth, attrs);
+ if (s.lag !== null) result.observe(lagGauge, s.lag, attrs); // unknown lag is not reported as a number
+ result.observe(lagUnknownGauge, s.lag === null ? 1 : 0, attrs);
+ result.observe(pendingGauge, s.pending, attrs);
+ }
+ };
+ this.meter.addBatchObservableCallback(this.batchCallback, this.observables); // registered at construction, not in start()
+ }
+
+ async start(): Promise { // No-op while running; otherwise ensures the groups exist, then starts the poll loop.
+ if (this.running) return;
+ await this.ensureGroups();
+ this.running = true;
+ this.loopPromise = this.loop(); // deliberately not awaited: the loop runs until stop()
+ }
+
+ async stop(): Promise { // Ends the loop after its current iteration and closes both Redis clients.
+ this.running = false;
+ this.meter.removeBatchObservableCallback(this.batchCallback, this.observables);
+ await this.loopPromise?.catch(() => {}); // undefined when start() was never called
+ await Promise.all([this.redis.quit().catch(() => {}), this.probeRedis.quit().catch(() => {})]); // quit errors are ignored
+ }
+
+ private async ensureGroups(): Promise { // Creates the consumer group (and stream) on every shard key.
+ for (const key of this.keys) {
+ try {
+ // "0" (not "$"): a brand-new stream's group must not skip entries emitted
+ // between emitter boot and the first consumer's group creation.
+ await this.redis.xgroup("CREATE", key, this.def.consumerGroup, "0", "MKSTREAM");
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ if (!message.includes("BUSYGROUP")) throw error; // BUSYGROUP = group already exists, which is fine
+ }
+ }
+ }
+
+ private async loop(): Promise { // Poll loop: periodic reclaim scan, then one blocking read, until running is false.
+ while (this.running) {
+ try {
+ if (Date.now() - this.lastReclaimAt >= this.reclaimIntervalMs) {
+ this.lastReclaimAt = Date.now(); // stamped before the scan: a failed scan waits for the next interval
+ await this.reclaimStale();
+ }
+ await this.readNew();
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ // Self-heal a missing group (stream trimmed to nothing / deleted / Redis flushed):
+ // recreate it rather than wedging the loop on NOGROUP forever.
+ if (message.includes("NOGROUP")) {
+ this.logger.warn("consumer group missing; recreating", { error });
+ await this.ensureGroups().catch(() => {}); // a failed recreate is simply retried on the next NOGROUP
+ } else {
+ this.logger.error("consumer loop iteration failed", { error });
+ }
+ await sleep(this.errorBackoffMs); // back off after any failed iteration
+ }
+ }
+ }
+
+ private async readNew(): Promise { // One blocking XREADGROUP across all shard streams, then process the reply.
+ const ids = this.keys.map(() => ">"); // ">" = only entries never delivered to this group
+ const response = (await this.redis.xreadgroup(
+ "GROUP",
+ this.def.consumerGroup,
+ this.consumerName,
+ "COUNT",
+ this.batchSize,
+ "BLOCK",
+ this.blockMs,
+ "STREAMS",
+ ...this.keys,
+ ...ids
+ )) as RawStream[] | null;
+
+ if (!response) return 0; // BLOCK timed out with nothing new
+ return this.processStreams(response, "new");
+ }
+
+ private async reclaimStale(): Promise { // Claims and reprocesses entries pending for at least claimIdleMs, shard by shard.
+ for (const key of this.keys) {
+ const result = (await this.redis.xautoclaim(
+ key,
+ this.def.consumerGroup,
+ this.consumerName,
+ this.claimIdleMs,
+ "0", // always scan from the start; the returned cursor (result[0]) is not followed
+ "COUNT",
+ this.batchSize
+ )) as [string, RawEntry[], string[]] | null;
+
+ const entries = result?.[1] ?? [];
+ if (entries.length === 0) continue;
+ await this.processStreams([[key, entries]], "reclaimed"); // an insert failure here throws and skips the remaining shards
+ }
+ }
+
+ // One insert (dedup block) and XACK per stream, so a reclaimed batch re-forms the
+ // original per-stream id set and token. On insert failure that stream's entries stay
+ // pending for a later XAUTOCLAIM; other streams still progress.
+ private async processStreams(streams: RawStream[], source: "new" | "reclaimed"): Promise {
+ let processed = 0; // count of entries acked
+ let firstError: unknown;
+
+ for (const [key, entries] of streams) {
+ if (entries.length === 0) continue;
+ const keyIds: string[] = [];
+ const rows: TRow[] = [];
+ for (const [id, flat] of entries) {
+ keyIds.push(id); // every entry id is acked later, including entries mapped to null
+ const mapped = this.mapEntry({ id, fields: parseFields(flat) });
+ if (Array.isArray(mapped)) rows.push(...mapped);
+ else if (mapped !== null) rows.push(mapped);
+ }
+ this.entriesCounter.add(keyIds.length, { source }); // counted on read, before the insert outcome is known
+
+ if (rows.length > 0) { // nothing to insert => go straight to the XACK
+ const startedAt = Date.now();
+ try {
+ await this.insert(rows, { dedupToken: dedupTokenFromEntryIds(keyIds, key) });
+ } catch (error) {
+ this.insertErrorCounter.add(1);
+ firstError ??= error; // only the first failure is rethrown; later ones are just counted
+ continue; // skip the XACK so this stream's entries stay pending
+ } finally {
+ this.insertDuration.record(Date.now() - startedAt); // recorded for failed inserts too
+ }
+ this.rowsCounter.add(rows.length);
+ }
+
+ await this.redis.xack(key, this.def.consumerGroup, ...keyIds);
+ processed += keyIds.length;
+ }
+
+ if (firstError !== undefined) throw firstError; // raised only after every stream was attempted
+ return processed;
+ }
+
+ /** Per-shard depth (XLEN), group lag, and pending — the consumer-health signals. */
+ async streamState(): Promise { // works without start(); uses the probe connection
+ return probeShardStates(this.probeRedis, this.keys, this.def.consumerGroup);
+ }
+
+ /** All shard stream keys this consumer reads (for diagnostics/tests). */
+ streamKeys(): string[] {
+ return this.keys.slice(); // copy, so callers cannot mutate the internal list
+ }
+}
+
+/**
+ * Per-shard depth/lag/pending for a metric stream — usable without a running consumer
+ * (e.g. from an admin route). `redis` should have keyPrefix unset, matching the stream keys.
+ */
+export async function probeShardStates(
+ redis: Redis,
+ keys: string[], // one key per shard; the array index is reported as `shard`
+ consumerGroup: string
+): Promise {
+ const out: ShardState[] = [];
+ for (let shard = 0; shard < keys.length; shard++) { // shards are probed sequentially
+ const key = keys[shard]!;
+ const depth = Number(await redis.xlen(key)) || 0; // outside the try: an XLEN error rejects the whole probe
+ // lag defaults to null (unknown) and only becomes a number when the group is found and
+ // Redis reports one: a nil lag (or a missing group on an existing stream) means we can't
+ // compute it, e.g. entries were trimmed past the group's read position (data loss).
+ let lag: number | null = null;
+ let pending = 0;
+ try {
+ const groups = (await redis.call("XINFO", "GROUPS", key)) as unknown[];
+ for (const raw of groups) {
+ const info = flatToMap(raw as unknown[]);
+ if (info.name === consumerGroup) {
+ const rawLag = info.lag;
+ lag = rawLag == null ? null : Number(rawLag);
+ if (lag !== null && !Number.isFinite(lag)) lag = null; // a non-numeric lag is reported as unknown
+ pending = Number(info.pending) || 0;
+ }
+ }
+ } catch {
+ // XINFO failed (e.g. stream/group not created yet): pending stays 0 and lag stays null.
+ }
+ out.push({ shard, depth, lag, pending });
+ }
+ return out;
+}
+
+function flatToMap(flat: unknown[]): Record { // Folds a flat [name, value, ...] reply into an object; values are left as-is.
+ const out: Record = {};
+ for (let i = 0; i + 1 < flat.length; i += 2) { // a trailing name without a value is ignored
+ out[String(flat[i])] = flat[i + 1];
+ }
+ return out;
+}
diff --git a/internal-packages/metrics-pipeline/src/emitter.ts b/internal-packages/metrics-pipeline/src/emitter.ts
new file mode 100644
index 00000000000..692956d98cb
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/emitter.ts
@@ -0,0 +1,242 @@
+import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis";
+import { getMeter, type Counter, type Meter, ValueType } from "@internal/tracing";
+import { Logger } from "@trigger.dev/core/logger";
+import { shardFor } from "./hash.js";
+import { streamKey, type MetricDefinition, type MetricFields } from "./types.js";
+
+export type MetricsStreamEmitterOptions = {
+ redis: RedisOptions; // used with keyPrefix forced to undefined
+ definition: MetricDefinition; // supplies the stream name, shardCount and maxLen
+ /** Synchronous enabled check (e.g. CachedRedisFlag); emits are no-ops when false. */
+ flag: { enabled(): boolean };
+ /** Probability (0..1) that a sampled emission fires; applies to `sampledSync()`, not
+ * `emit()`. Pass a `{ value() }` provider (e.g. CachedRedisNumber) to tune it live
+ * without a redeploy. A plain number is clamped to [0, 1]. Default 1 (always). */
+ gaugeSampleRate?: number | { value(): number };
+ /** TTL (ms) refreshed on every counter write on the per-(queue,op) odometer key.
+ * Active queues never expire; idle-past-TTL queues purge and self-heal on return.
+ * Default 7 days. */
+ counterOdometerTtlMs?: number;
+ /** TTL (ms) for per-concurrency-key odometers; short because key cardinality is
+ * user-controlled and cumulative counters make idle-gap expiry loss-free. Default 24h. */
+ ckOdometerTtlMs?: number;
+ logger?: Logger; // default: new Logger("MetricsStreamEmitter", "warn")
+ meter?: Meter; // default: getMeter("metrics-pipeline")
+};
+
+type CumulativeCommand = ( // call shape of the custom "qmEmitCumulative" command: 2 KEYS, then ARGV
+ odometerKey: string,
+ streamKey: string,
+ ttlMs: string,
+ maxLen: string,
+ op: string,
+ q: string,
+ ...extraFields: string[] // alternating field, value
+) => Promise;
+
+type CumulativeCkCommand = ( // call shape of the custom "qmEmitCumulativeCk" command: 3 KEYS, then ARGV
+ odometerKey: string,
+ ckOdometerKey: string,
+ streamKey: string,
+ ttlMs: string,
+ ckTtlMs: string,
+ maxLen: string,
+ op: string,
+ q: string,
+ ck: string,
+ ...extraFields: string[] // alternating field, value
+) => Promise;
+
+// INCR the odometer, refresh its TTL, and XADD the reading (new value as `cum`) in one round
+// trip. Refresh-on-write is load-bearing: only genuinely idle queues expire. On first creation
+// (v==1) XADD a cum=0 baseline first (smaller stream id => sorts first; carries no extra pairs)
+// so deltaSum captures the 0->1 transition and the total reconstructs exactly.
+// KEYS: [1]=odometer [2]=stream. ARGV: [1]=ttlMs [2]=maxLen (<=0 => no MAXLEN) [3]=op [4]=q [5..]=extra field/value pairs (e.g. wait).
+const CUMULATIVE_LUA = `
+local v = redis.call('INCR', KEYS[1])
+redis.call('PEXPIRE', KEYS[1], ARGV[1])
+local maxlen = tonumber(ARGV[2]) or 0
+local function xadd(cum, withExtra)
+ local x = {'XADD', KEYS[2]}
+ if maxlen > 0 then x[#x+1]='MAXLEN'; x[#x+1]='~'; x[#x+1]=ARGV[2] end
+ x[#x+1]='*'
+ x[#x+1]='op'; x[#x+1]=ARGV[3]
+ x[#x+1]='q'; x[#x+1]=ARGV[4]
+ if withExtra then for i=5,#ARGV do x[#x+1]=ARGV[i] end end
+ x[#x+1]='cum'; x[#x+1]=cum
+ redis.call(unpack(x))
+end
+if v == 1 then xadd(0, false) end
+xadd(v, true)
+`;
+
+// CK variant: advances base + per-key odometers, ONE reading entry carries both (cum +
+// ck/ckcum), so per-key attribution adds no stream volume. Baselines seed independently
+// (and carry no extra pairs): cum-only entry = base row, ck+ckcum-only entry = per-key row,
+// reading entry = both. KEYS: [1]=baseOdometer [2]=ckOdometer [3]=stream. ARGV: [1]=baseTtlMs
+// [2]=ckTtlMs [3]=maxLen (<=0 => no MAXLEN) [4]=op [5]=q [6]=ck [7..]=extra field/value pairs.
+const CUMULATIVE_CK_LUA = `
+local v = redis.call('INCR', KEYS[1])
+redis.call('PEXPIRE', KEYS[1], ARGV[1])
+local ckv = redis.call('INCR', KEYS[2])
+redis.call('PEXPIRE', KEYS[2], ARGV[2])
+local maxlen = tonumber(ARGV[3]) or 0
+local function xadd(fields, withExtra)
+ local x = {'XADD', KEYS[3]}
+ if maxlen > 0 then x[#x+1]='MAXLEN'; x[#x+1]='~'; x[#x+1]=ARGV[3] end
+ x[#x+1]='*'
+ x[#x+1]='op'; x[#x+1]=ARGV[4]
+ x[#x+1]='q'; x[#x+1]=ARGV[5]
+ if withExtra then for i=7,#ARGV do x[#x+1]=ARGV[i] end end
+ for i=1,#fields do x[#x+1]=fields[i] end
+ redis.call(unpack(x))
+end
+if v == 1 then xadd({'cum', 0}, false) end
+if ckv == 1 then xadd({'ck', ARGV[6], 'ckcum', 0}, false) end
+xadd({'ck', ARGV[6], 'cum', v, 'ckcum', ckv}, true)
+`;
+
+/** Node-side producer: XADDs events to a sharded metrics stream, gated on a flag. */
+export class MetricsStreamEmitter {
+ private readonly redis: Redis;
+ private readonly def: MetricDefinition;
+ private readonly flag: { enabled(): boolean };
+ private readonly sampleRate: () => number; // current gauge sample rate; evaluated on each sampledSync()
+ private readonly odometerTtlMs: number;
+ private readonly ckOdometerTtlMs: number;
+ private readonly logger: Logger;
+ private readonly emittedCounter: Counter;
+ private readonly errorCounter: Counter;
+
+ constructor(options: MetricsStreamEmitterOptions) {
+ this.logger = options.logger ?? new Logger("MetricsStreamEmitter", "warn");
+ this.redis = createRedisClient(
+ { ...options.redis, keyPrefix: undefined }, // stream/odometer keys are used verbatim, never prefixed
+ { onError: (error) => this.logger.error("emitter redis error", { error }) }
+ );
+ this.redis.defineCommand("qmEmitCumulative", { numberOfKeys: 2, lua: CUMULATIVE_LUA });
+ this.redis.defineCommand("qmEmitCumulativeCk", { numberOfKeys: 3, lua: CUMULATIVE_CK_LUA });
+ this.odometerTtlMs = options.counterOdometerTtlMs ?? 7 * 24 * 60 * 60 * 1000; // 7 days
+ this.ckOdometerTtlMs = options.ckOdometerTtlMs ?? 24 * 60 * 60 * 1000; // 24 hours
+ this.def = options.definition;
+ this.flag = options.flag;
+ const rate = options.gaugeSampleRate;
+ if (typeof rate === "object") {
+ this.sampleRate = () => rate.value(); // live provider: re-read on every call, not clamped here
+ } else {
+ const fixed = Math.min(1, Math.max(0, rate ?? 1)); // fixed rate clamped to [0, 1]; 1 when omitted
+ this.sampleRate = () => fixed;
+ }
+
+ const meter = options.meter ?? getMeter("metrics-pipeline");
+ this.emittedCounter = meter.createCounter("queue_metrics.emitter.emitted", {
+ description: "Node-side metric events XADDed to the stream",
+ valueType: ValueType.INT,
+ });
+ this.errorCounter = meter.createCounter("queue_metrics.emitter.errors", {
+ description: "Failed metric-event XADDs (dropped)",
+ valueType: ValueType.INT,
+ });
+ }
+
+ enabledSync(): boolean { // the flag only; sampling is not applied
+ return this.flag.enabled();
+ }
+
+ // Enabled AND (probabilistically) sampled-in. For high-frequency sampled emissions
+ // (e.g. Lua gauges); exact-count events use enabledSync()/emit() and are never sampled.
+ sampledSync(): boolean {
+ if (!this.flag.enabled()) return false;
+ const rate = this.sampleRate();
+ if (rate >= 1) return true; // skips Math.random(); also covers provider values above 1
+ if (rate <= 0) return false; // also covers negative provider values
+ return Math.random() < rate; // a NaN rate falls through to here and is never sampled in
+ }
+
+ // Fire-and-forget gauge emit: a plain XADD of an op=gauge snapshot (no odometer). The
+ // gauge value was read atomically inside the queue op's Lua and returned on the reply;
+ // this just lands it on the metrics stream. Loss-tolerant (sampled), never throws into
+ // the caller. Shares the counter stream (one stream family on the metrics Redis).
+ emitGauge(shardKey: string, fields: MetricFields): void {
+ if (!this.flag.enabled()) return;
+ // Drop rather than queue while the metrics Redis is unreachable: ioredis would hold
+ // every command in its offline queue until rejection, and metrics are loss-tolerant.
+ if (this.redis.status !== "ready") return;
+ const op = String(fields.op ?? "gauge"); // used only as the counter attribute; the entry carries fields as given
+ const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount));
+ const args: string[] = [];
+ if (this.def.maxLen) args.push("MAXLEN", "~", String(this.def.maxLen)); // approximate trim, only when maxLen is set and non-zero
+ args.push("*");
+ for (const [field, value] of Object.entries(fields)) {
+ args.push(field, String(value)); // every value is stringified
+ }
+ this.emittedCounter.add(1, { op }); // counted when the command is issued, before its outcome is known
+ this.redis.xadd(stream, ...(args as [string, ...string[]])).catch((error) => {
+ this.errorCounter.add(1);
+ this.logger.debug("metrics gauge emit failed", { error, stream });
+ });
+ }
+
+ // Fire-and-forget cumulative counter emit: advances the per-(queue,op) odometer and
+ // XADDs its new absolute value. No-op when disabled, never throws into the caller. A
+ // lost XADD self-heals (the next reading restates the total); the INCR is never sampled.
+ // A non-empty `fields.ck` also advances a per-concurrency-key odometer and rides the
+ // same entry as ck/ckcum (see CUMULATIVE_CK_LUA for the baseline/row mapping).
+ emit(shardKey: string, fields: MetricFields): void {
+ if (!this.flag.enabled()) return;
+ if (this.redis.status !== "ready") return; // dropped, not queued, while the connection is not ready
+ const op = String(fields.op ?? "unknown");
+ const q = String(fields.q ?? "");
+ const ck = fields.ck != null && String(fields.ck) !== "" ? String(fields.ck) : null; // null => plain (non-CK) script
+ const shard = shardFor(shardKey, this.def.shardCount);
+ const stream = streamKey(this.def, shard);
+ // The odometer carries the stream's {shard} hash tag so INCR + XADD stay in one
+ // Cluster slot (the shard is derived from the queue, so the mapping is stable).
+ // The key format is part of the rolling-deploy data shape: concurrent old/new
+ // emitters with different formats split an odometer and corrupt its deltas.
+ const odometerKey = `${this.def.name}_cum:{${shard}}:${op}:${q}`;
+ const extra: string[] = [];
+ for (const [field, value] of Object.entries(fields)) {
+ if (field === "op" || field === "q" || field === "ck") continue; // these travel as dedicated ARGV slots
+ extra.push(field, String(value));
+ }
+ this.emittedCounter.add(1, { op }); // counted when the command is issued, before its outcome is known
+ const maxLen = String(this.def.maxLen ?? 0); // "0" makes the script omit MAXLEN
+ const done = (error: unknown) => { // rejection handler only; a successful emit needs no follow-up
+ this.errorCounter.add(1);
+ this.logger.debug("metrics emit failed", { error, stream });
+ };
+ if (ck) {
+ const client = this.redis as unknown as { qmEmitCumulativeCk: CumulativeCkCommand }; // command added via defineCommand in the constructor
+ client
+ .qmEmitCumulativeCk(
+ odometerKey,
+ `${odometerKey}:ck:${ck}`, // per-key odometer; shares the base key's {shard} tag
+ stream,
+ String(this.odometerTtlMs),
+ String(this.ckOdometerTtlMs),
+ maxLen,
+ op,
+ q,
+ ck,
+ ...extra
+ )
+ .catch(done);
+ return;
+ }
+ const client = this.redis as unknown as { qmEmitCumulative: CumulativeCommand }; // command added via defineCommand in the constructor
+ client
+ .qmEmitCumulative(odometerKey, stream, String(this.odometerTtlMs), maxLen, op, q, ...extra)
+ .catch(done);
+ }
+
+ // Resolves once the metrics Redis connection is ready (emits before that are dropped).
+ waitUntilReady(): Promise {
+ if (this.redis.status === "ready") return Promise.resolve();
+ return new Promise((resolve) => this.redis.once("ready", () => resolve())); // no timeout: stays pending until "ready" fires
+ }
+
+ async close(): Promise { // Awaits quit() on the emitter's Redis client.
+ await this.redis.quit();
+ }
+}
diff --git a/internal-packages/metrics-pipeline/src/flag.ts b/internal-packages/metrics-pipeline/src/flag.ts
new file mode 100644
index 00000000000..5931e088939
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/flag.ts
@@ -0,0 +1,46 @@
+import type { RedisOptions } from "@internal/redis";
+import type { Logger } from "@trigger.dev/core/logger";
+import { CachedRedisValue } from "./cachedValue.js";
+
+export type CachedRedisFlagOptions = {
+ redis: RedisOptions;
+ /** Redis key holding the flag. "1"/"true"/"on"/"enabled"/"yes" (trimmed, case-insensitive) are truthy. */
+ key: string;
+ cacheTtlMs?: number; // forwarded unchanged to the underlying CachedRedisValue
+ defaultValue?: boolean; // CachedRedisFlag passes false when this is omitted
+ logger?: Logger;
+};
+
+const TRUTHY = new Set(["1", "true", "on", "enabled", "yes"]); // compared against the trimmed, lower-cased raw Redis value
+
+/**
+ * Boolean feature flag read from a Redis key through a short stale-while-revalidate cache.
+ * Exposes a synchronous getter for hot paths (building Lua ARGV on every op); a null raw value
+ * or anything outside the TRUTHY set (after trim + lower-casing) parses as false.
+ */
+export class CachedRedisFlag {
+ private readonly inner: CachedRedisValue; // NOTE(review): type argument appears to be missing (parse yields a boolean) — confirm
+
+ constructor(options: CachedRedisFlagOptions) {
+ this.inner = new CachedRedisValue({
+ redis: options.redis,
+ key: options.key,
+ parse: (raw) => raw != null && TRUTHY.has(raw.trim().toLowerCase()),
+ defaultValue: options.defaultValue ?? false,
+ cacheTtlMs: options.cacheTtlMs,
+ logger: options.logger,
+ loggerName: "CachedRedisFlag",
+ });
+ }
+
+ enabled(): boolean { // synchronous: returns whatever the cached value currently holds
+ return this.inner.get();
+ }
+
+ refresh(): Promise { // NOTE(review): Promise type argument appears to be missing — confirm against CachedRedisValue.refresh
+ return this.inner.refresh();
+ }
+
+ async close(): Promise { // NOTE(review): presumably Promise<void> (nothing is returned) — confirm
+ await this.inner.close();
+ }
+}
diff --git a/internal-packages/metrics-pipeline/src/hash.ts b/internal-packages/metrics-pipeline/src/hash.ts
new file mode 100644
index 00000000000..b14324c138a
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/hash.ts
@@ -0,0 +1,15 @@
+/** 32-bit FNV-1a folded over the string's UTF-16 code units. Deterministic across processes; used only for sharding. */
+export function fnv1a32(str: string): number {
+  const FNV_PRIME = 0x01000193;
+  let acc = 0x811c9dc5; // FNV offset basis
+  for (let idx = 0; idx < str.length; idx += 1) {
+    acc = Math.imul(acc ^ str.charCodeAt(idx), FNV_PRIME); // xor first, then a wrapping 32-bit multiply
+  }
+  return acc >>> 0; // reinterpret the signed 32-bit accumulator as unsigned
+}
+
+/** Stable shard index in [0, shardCount) for a key; a shard count of 1 or less always maps to shard 0. */
+export function shardFor(key: string, shardCount: number): number {
+  const unsharded = shardCount <= 1;
+  return unsharded ? 0 : fnv1a32(key) % shardCount;
+}
diff --git a/internal-packages/metrics-pipeline/src/idempotency.ts b/internal-packages/metrics-pipeline/src/idempotency.ts
new file mode 100644
index 00000000000..60cbd661f53
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/idempotency.ts
@@ -0,0 +1,11 @@
+import { createHash } from "node:crypto";
+
+// Builds a deterministic token for a batch of entry ids that does not depend on delivery order.
+// Redelivering the same batch reproduces the token, so ClickHouse's raw-table dedup window drops
+// the replay. `scope` (the stream key) keeps identical id sets on different streams apart.
+export function dedupTokenFromEntryIds(ids: string[], scope = ""): string {
+  const canonicalIds = ids.slice().sort().join(","); // sort a copy: the caller's array is left untouched
+  const hasher = createHash("sha1");
+  hasher.update(`${scope}|${canonicalIds}`);
+  return hasher.digest("hex");
+}
diff --git a/internal-packages/metrics-pipeline/src/index.ts b/internal-packages/metrics-pipeline/src/index.ts
new file mode 100644
index 00000000000..223c5feab17
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/index.ts
@@ -0,0 +1,26 @@
+export { CachedRedisFlag, type CachedRedisFlagOptions } from "./flag.js";
+export {
+ CachedRedisNumber,
+ type CachedRedisNumberOptions,
+ CachedRedisValue,
+ type CachedRedisValueOptions,
+} from "./cachedValue.js";
+export { MetricsStreamEmitter, type MetricsStreamEmitterOptions } from "./emitter.js";
+export {
+ MetricsStreamConsumer,
+ type MetricsStreamConsumerOptions,
+ type ShardState,
+ probeShardStates,
+} from "./consumer.js";
+export { createMetricsGaugeComputeLua, type GaugeComputeLuaParams } from "./lua.js";
+export { dedupTokenFromEntryIds } from "./idempotency.js";
+export { shardFor, fnv1a32 } from "./hash.js";
+export {
+ streamKey,
+ allStreamKeys,
+ entryTimeMs,
+ entryOrderKey,
+ type MetricDefinition,
+ type MetricFields,
+ type StreamEntry,
+} from "./types.js";
diff --git a/internal-packages/metrics-pipeline/src/lua.ts b/internal-packages/metrics-pipeline/src/lua.ts
new file mode 100644
index 00000000000..64f3b896c0d
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/lua.ts
@@ -0,0 +1,50 @@
+// Each field is a Lua expression spliced verbatim into the target script (numeric ones are wrapped in
+// `tonumber(...) or 0`). queueLimit/envLimit must be the EFFECTIVE enforced limit, else an unset limit reads as throttled.
+export type GaugeComputeLuaParams = {
+ // Lua boolean expression; when true the gauge is computed (else the extra reads are skipped).
+ enabledArg: string;
+ queued: string; // becomes local __ql
+ running: string; // becomes local __cc
+ queueLimit: string; // becomes local __lim
+ envQueued: string; // becomes local __eql
+ envRunning: string; // becomes local __ec
+ envLimit: string; // becomes local __elim
+ // Lua statements run first inside the pcall (e.g. to compute aggregate locals).
+ preamble?: string;
+ // Lua boolean expression (in __cc/__lim/__ql) for the throttled flag; defaults to "__cc >= __lim and __ql > 0".
+ // Pass "false" where cc >= lim is not a valid throttle signal (e.g. summed CK aggregates).
+ throttledExpr?: string;
+ // CK-health extras (both or neither; a lone one is ignored): appended as an optional gauge tail, gauge[8]/gauge[9].
+ ckBacklogged?: string;
+ ckMaxWaitMs?: string;
+};
+
+// Builds the Lua snippet that computes an op=gauge snapshot into the enclosing script's `__qm_g` local: a flat
+// {ql, cc, lim, eql, ec, elim, thr} array, plus a {ckq, ckw} tail when BOTH CK params are given. The script
+// RETURNS it and Node then XADDs it to the metrics Redis; no Redis write here (the run-queue Redis carries no
+// metrics stream). Gated on `enabledArg` and pcall-wrapped. The script MUST declare `local __qm_g` first.
+export function createMetricsGaugeComputeLua(params: GaugeComputeLuaParams): string {
+ const throttled = params.throttledExpr ?? "__cc >= __lim and __ql > 0"; // default throttle rule
+ const hasCk = params.ckBacklogged != null && params.ckMaxWaitMs != null; // tail only when both extras are set
+ const gauge = hasCk
+ ? ` local __ckq = tonumber(${params.ckBacklogged}) or 0
+ local __ckw = tonumber(${params.ckMaxWaitMs}) or 0
+ __qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr, __ckq, __ckw}`
+ : ` __qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}`;
+
+ return `
+if ${params.enabledArg} then
+ pcall(function()
+ ${params.preamble ?? ""}
+ local __ql = tonumber(${params.queued}) or 0
+ local __cc = tonumber(${params.running}) or 0
+ local __lim = tonumber(${params.queueLimit}) or 0
+ local __eql = tonumber(${params.envQueued}) or 0
+ local __ec = tonumber(${params.envRunning}) or 0
+ local __elim = tonumber(${params.envLimit}) or 0
+ local __thr = 0
+ if ${throttled} then __thr = 1 end
+${gauge}
+ end)
+end`;
+}
diff --git a/internal-packages/metrics-pipeline/src/pipeline.test.ts b/internal-packages/metrics-pipeline/src/pipeline.test.ts
new file mode 100644
index 00000000000..73979310798
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/pipeline.test.ts
@@ -0,0 +1,116 @@
+import { describe, expect, it } from "vitest";
+import { createMetricsGaugeComputeLua } from "./lua.js";
+import { dedupTokenFromEntryIds } from "./idempotency.js";
+import { fnv1a32, shardFor } from "./hash.js";
+import { allStreamKeys, entryOrderKey, entryTimeMs, streamKey } from "./types.js";
+
+describe("shardFor", () => {
+ it("is deterministic and in range", () => {
+ expect(shardFor("queueA", 1)).toBe(0); // a single shard always maps to index 0
+ const s = shardFor("queueA", 4);
+ expect(s).toBeGreaterThanOrEqual(0);
+ expect(s).toBeLessThan(4);
+ expect(shardFor("queueA", 4)).toBe(s); // same key + count gives the same shard
+ expect(fnv1a32("queueA")).toBe(fnv1a32("queueA"));
+ });
+});
+
+describe("dedupTokenFromEntryIds", () => {
+ it("is order-independent and set-sensitive", () => {
+ expect(dedupTokenFromEntryIds(["1-0", "2-0"])).toBe(dedupTokenFromEntryIds(["2-0", "1-0"])); // order must not matter
+ expect(dedupTokenFromEntryIds(["1-0"])).not.toBe(dedupTokenFromEntryIds(["2-0"])); // different id sets differ
+ expect(dedupTokenFromEntryIds(["1-0"])).toMatch(/^[0-9a-f]{40}$/); // hex SHA-1 digest
+ });
+});
+
+describe("stream keys", () => {
+ it("names and parses entry time", () => {
+ expect(streamKey({ name: "queue_metrics" }, 3)).toBe("queue_metrics:{3}");
+ expect(allStreamKeys({ name: "qm", shardCount: 2, consumerGroup: "cg" })).toEqual([
+ "qm:{0}",
+ "qm:{1}",
+ ]);
+ expect(entryTimeMs("1717000000000-5")).toBe(1717000000000);
+ expect(entryTimeMs("nope")).toBeNull();
+ });
+
+ it("entryOrderKey stays exact and strictly monotonic at real epoch magnitudes", () => {
+ const ms = 1783000000000; // ~2026: ms*1e6 is past the JS safe-integer range, so a number-typed key would lose precision
+ const k = (seq: number) => BigInt(entryOrderKey(`${ms}-${seq}`));
+ // adjacent seq within one ms must not collapse to the same key (the float bug)
+ expect(k(0)).toBe(BigInt(ms) * 1000000n);
+ expect(k(1) - k(0)).toBe(1n);
+ expect(k(2) - k(1)).toBe(1n);
+ // a later ms always outranks any seq of an earlier ms (up to the 1M/ms factor)
+ expect(BigInt(entryOrderKey(`${ms + 1}-0`))).toBeGreaterThan(k(999999));
+ });
+});
+
+describe("createMetricsGaugeComputeLua", () => {
+ it("assigns __qm_g inside a gated, pcall-wrapped block and never XADDs", () => {
+ const lua = createMetricsGaugeComputeLua({
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('ZCARD', KEYS[2])",
+ running: "queueCurrent",
+ queueLimit: "queueLimit",
+ envQueued: "redis.call('ZCARD', KEYS[8])",
+ envRunning: "envCurrent",
+ envLimit: "envLimit",
+ });
+
+ expect(lua).toContain("if ARGV[#ARGV] == '1' then");
+ expect(lua).toContain("pcall(function()");
+ expect(lua).toContain("__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}");
+ expect(lua).toContain("if __cc >= __lim and __ql > 0 then __thr = 1 end"); // default throttle expression
+ // The whole point of the refactor: no Redis write happens in the run-queue script.
+ expect(lua).not.toContain("XADD");
+ });
+
+ it("honors a custom throttled expression and preamble", () => {
+ const lua = createMetricsGaugeComputeLua({
+ enabledArg: "true",
+ preamble: "local agg = 1",
+ queued: "0",
+ running: "0",
+ queueLimit: "0",
+ envQueued: "0",
+ envRunning: "0",
+ envLimit: "0",
+ throttledExpr: "false",
+ });
+ expect(lua).toContain("local agg = 1");
+ expect(lua).toContain("if false then __thr = 1 end");
+ expect(lua).not.toContain("XADD");
+ });
+
+ it("appends the CK-health tail only when both CK params are set", () => {
+ const withCk = createMetricsGaugeComputeLua({
+ enabledArg: "true",
+ queued: "0",
+ running: "0",
+ queueLimit: "0",
+ envQueued: "0",
+ envRunning: "0",
+ envLimit: "0",
+ ckBacklogged: "redis.call('ZCARD', ckIndexKey)",
+ ckMaxWaitMs: "__ckwait",
+ });
+ expect(withCk).toContain(
+ "__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr, __ckq, __ckw}"
+ );
+ expect(withCk).toContain("local __ckq = tonumber(redis.call('ZCARD', ckIndexKey)) or 0");
+
+ const withoutCk = createMetricsGaugeComputeLua({ // only one of the two CK params: the tail must be omitted
+ enabledArg: "true",
+ queued: "0",
+ running: "0",
+ queueLimit: "0",
+ envQueued: "0",
+ envRunning: "0",
+ envLimit: "0",
+ ckBacklogged: "0",
+ });
+ expect(withoutCk).toContain("__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}");
+ expect(withoutCk).not.toContain("__ckq");
+ });
+});
diff --git a/internal-packages/metrics-pipeline/src/types.ts b/internal-packages/metrics-pipeline/src/types.ts
new file mode 100644
index 00000000000..d9e9e43f554
--- /dev/null
+++ b/internal-packages/metrics-pipeline/src/types.ts
@@ -0,0 +1,42 @@
+export type MetricFields = Record; // NOTE(review): Record needs key/value type arguments — confirm the intended field value types
+
+export type StreamEntry = {
+ id: string; // stream entry id; entryTimeMs/entryOrderKey parse it as "ms-seq"
+ fields: Record; // NOTE(review): Record needs key/value type arguments — confirm
+};
+
+export type MetricDefinition = {
+ /** Logical name, e.g. "queue_metrics". Used as the stream key prefix (`name:{shard}`). */
+ name: string;
+ shardCount: number; // allStreamKeys always yields at least one key, even when this is below 1
+ consumerGroup: string;
+ /** Approximate MAXLEN cap applied on XADD (`MAXLEN ~ N`). Omit for unbounded. */
+ maxLen?: number;
+};
+
+// Stream key for one shard: `name:{shard}`. Keys are used verbatim on every access path (Lua ARGV, emitter,
+// consumer), so they must NOT be subject to an ioredis keyPrefix. `{shard}` is a Cluster hash tag.
+export function streamKey(definition: Pick<MetricDefinition, "name">, shard: number): string {
+  return `${definition.name}:{${shard}}`;
+}
+
+export function allStreamKeys(definition: MetricDefinition): string[] {
+  const length = Math.max(1, definition.shardCount); // never fewer than one stream
+  const keyForShard = (_: unknown, shard: number): string => streamKey(definition, shard);
+  return Array.from({ length }, keyForShard); // one key per shard index, in ascending order
+}
+
+// The ms part of a stream entry id (`<ms>-<seq>`) is its emission time. Returns null unless the id starts with plain decimal digits (Number("") is 0, so an empty ms part must not pass).
+export function entryTimeMs(id: string): number | null {
+  const ms = Number(/^(\d+)(?:-|$)/.exec(id)?.[1] ?? NaN);
+  return Number.isFinite(ms) ? ms : null;
+}
+
+// Ordering key from a stream id (`<ms>-<seq>`) = ms*1e6+seq, for deltaSumTimestamp. BigInt +
+// string because ms*1e6 exceeds JS safe-integer range at real epoch magnitudes (a number would
+// collapse nearby seq values); the ClickHouse order_key column is UInt64 and takes the string.
+// The 1e6 factor (1M entries/ms/shard) stays within UInt64. Parts that are not plain decimal digits count as 0, never throw.
+export function entryOrderKey(id: string): string {
+  const [ms, seq] = id.split("-").map((part) => (/^\d+$/.test(part) ? BigInt(part) : 0n));
+  return ((ms ?? 0n) * 1000000n + (seq ?? 0n)).toString();
+}
diff --git a/internal-packages/metrics-pipeline/test/setup.ts b/internal-packages/metrics-pipeline/test/setup.ts
new file mode 100644
index 00000000000..b2bacd6baf5
--- /dev/null
+++ b/internal-packages/metrics-pipeline/test/setup.ts
@@ -0,0 +1,4 @@
+import { vi } from "vitest";
+
+// Set extended timeout for container tests. NOTE(review): vitest.config.ts declares testTimeout: 30000 — confirm which value is meant to apply.
+vi.setConfig({ testTimeout: 60_000 });
diff --git a/internal-packages/metrics-pipeline/tsconfig.build.json b/internal-packages/metrics-pipeline/tsconfig.build.json
new file mode 100644
index 00000000000..89c87a3dc67
--- /dev/null
+++ b/internal-packages/metrics-pipeline/tsconfig.build.json
@@ -0,0 +1,21 @@
+{
+ "include": ["src/**/*.ts"],
+ "exclude": ["src/**/*.test.ts"],
+ "compilerOptions": {
+ "composite": true,
+ "target": "ES2020",
+ "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"],
+ "outDir": "dist",
+ "module": "Node16",
+ "moduleResolution": "Node16",
+ "moduleDetection": "force",
+ "verbatimModuleSyntax": false,
+ "esModuleInterop": true,
+ "forceConsistentCasingInFileNames": true,
+ "isolatedModules": true,
+ "preserveWatchOutput": true,
+ "skipLibCheck": true,
+ "strict": true,
+ "declaration": true
+ }
+}
diff --git a/internal-packages/metrics-pipeline/tsconfig.json b/internal-packages/metrics-pipeline/tsconfig.json
new file mode 100644
index 00000000000..af630abe1f1
--- /dev/null
+++ b/internal-packages/metrics-pipeline/tsconfig.json
@@ -0,0 +1,8 @@
+{
+ "references": [{ "path": "./tsconfig.src.json" }, { "path": "./tsconfig.test.json" }],
+ "compilerOptions": {
+ "moduleResolution": "Node16",
+ "module": "Node16",
+ "customConditions": ["@triggerdotdev/source"]
+ }
+}
diff --git a/internal-packages/metrics-pipeline/tsconfig.src.json b/internal-packages/metrics-pipeline/tsconfig.src.json
new file mode 100644
index 00000000000..0df3d2d222f
--- /dev/null
+++ b/internal-packages/metrics-pipeline/tsconfig.src.json
@@ -0,0 +1,20 @@
+{
+ "include": ["src/**/*.ts"],
+ "exclude": ["node_modules", "src/**/*.test.ts"],
+ "compilerOptions": {
+ "composite": true,
+ "target": "ES2020",
+ "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"],
+ "module": "Node16",
+ "moduleResolution": "Node16",
+ "moduleDetection": "force",
+ "verbatimModuleSyntax": false,
+ "esModuleInterop": true,
+ "forceConsistentCasingInFileNames": true,
+ "isolatedModules": true,
+ "preserveWatchOutput": true,
+ "skipLibCheck": true,
+ "strict": true,
+ "customConditions": ["@triggerdotdev/source"]
+ }
+}
diff --git a/internal-packages/metrics-pipeline/tsconfig.test.json b/internal-packages/metrics-pipeline/tsconfig.test.json
new file mode 100644
index 00000000000..4c06c9f57bb
--- /dev/null
+++ b/internal-packages/metrics-pipeline/tsconfig.test.json
@@ -0,0 +1,21 @@
+{
+ "include": ["src/**/*.test.ts"],
+ "references": [{ "path": "./tsconfig.src.json" }],
+ "compilerOptions": {
+ "composite": true,
+ "target": "ES2020",
+ "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"],
+ "module": "Node16",
+ "moduleResolution": "Node16",
+ "moduleDetection": "force",
+ "verbatimModuleSyntax": false,
+ "types": ["vitest/globals"],
+ "esModuleInterop": true,
+ "forceConsistentCasingInFileNames": true,
+ "isolatedModules": true,
+ "preserveWatchOutput": true,
+ "skipLibCheck": true,
+ "strict": true,
+ "customConditions": ["@triggerdotdev/source"]
+ }
+}
diff --git a/internal-packages/metrics-pipeline/vitest.config.ts b/internal-packages/metrics-pipeline/vitest.config.ts
new file mode 100644
index 00000000000..daafd294fa8
--- /dev/null
+++ b/internal-packages/metrics-pipeline/vitest.config.ts
@@ -0,0 +1,17 @@
+import { defineConfig } from "vitest/config";
+import { DurationShardingSequencer } from "@internal/testcontainers/sequencer";
+
+export default defineConfig({
+ test: {
+ sequence: { sequencer: DurationShardingSequencer },
+ globals: true,
+ retry: process.env.CI ? 2 : 0, // retry failing tests twice on CI, never locally
+ environment: "node",
+ setupFiles: ["./test/setup.ts"], // NOTE(review): setup.ts sets testTimeout to 60_000 via vi.setConfig, unlike the 30000 below — confirm
+ testTimeout: 30000,
+ hookTimeout: 30000,
+ },
+ esbuild: {
+ target: "node18",
+ },
+});
diff --git a/internal-packages/run-engine/package.json b/internal-packages/run-engine/package.json
index 8d53974d10b..516e6a18696 100644
--- a/internal-packages/run-engine/package.json
+++ b/internal-packages/run-engine/package.json
@@ -21,6 +21,7 @@
},
"dependencies": {
"@internal/redis": "workspace:*",
+ "@internal/metrics-pipeline": "workspace:*",
"@internal/run-store": "workspace:*",
"@trigger.dev/redis-worker": "workspace:*",
"@internal/tracing": "workspace:*",
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
index f3091c93b88..c55184e594f 100644
--- a/internal-packages/run-engine/src/engine/index.ts
+++ b/internal-packages/run-engine/src/engine/index.ts
@@ -218,6 +218,7 @@ export class RunEngine {
callback: this.#concurrencySweeperCallback.bind(this),
},
shardCount: options.queue?.shardCount,
+ queueMetrics: options.queue?.queueMetrics,
masterQueueConsumersDisabled: options.queue?.masterQueueConsumersDisabled,
masterQueueConsumersIntervalMs: options.queue?.masterQueueConsumersIntervalMs,
processWorkerQueueDebounceMs: options.queue?.processWorkerQueueDebounceMs,
@@ -1628,6 +1629,14 @@ export class RunEngine {
return this.runQueue.currentConcurrencyOfQueues(environment, queues);
}
+ async concurrencyKeyBreakdown(
+ environment: MinimalAuthenticatedEnvironment,
+ queue: string,
+ options?: { limit?: number }
+ ) {
+ return this.runQueue.concurrencyKeyBreakdown(environment, queue, options); // thin pass-through: arguments are forwarded unchanged to the RunQueue
+ }
+
async removeEnvironmentQueuesFromMasterQueue({
runtimeEnvironmentId,
organizationId,
diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts
index dc9d029c38c..4b236aefc16 100644
--- a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts
+++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts
@@ -98,10 +98,16 @@ export class EnqueueSystem {
// Force development runs to use the environment id as the worker queue.
const workerQueue = env.type === "DEVELOPMENT" ? env.id : run.workerQueue;
- const timestamp = (run.queueTimestamp ?? run.createdAt).getTime() - run.priorityMs;
+ // Ordering keeps the run's original position; the scheduling-delay anchor is the
+ // trigger/delay time only on first enqueue (includeTtl). Re-enqueues anchor to now,
+ // else the wait metric absorbs the whole waitpoint/checkpoint duration.
+ const queuePositionMs = (run.queueTimestamp ?? run.createdAt).getTime();
+ const timestamp = queuePositionMs - run.priorityMs;
+ const eligibleAtMs = includeTtl ? queuePositionMs : Date.now();
- // Include TTL only when explicitly requested (first enqueue from trigger).
- // Re-enqueues (waitpoint, checkpoint, delayed, pending version) must not add TTL.
+ // Include TTL only when explicitly requested (first enqueue from trigger or the
+ // delayed-run system). Re-enqueues (waitpoint, checkpoint, pending version) must
+ // not add TTL.
let ttlExpiresAt: number | undefined;
if (includeTtl && run.ttl) {
const expireAt = parseNaturalLanguageDuration(run.ttl);
@@ -124,6 +130,7 @@ export class EnqueueSystem {
queue: run.queue,
concurrencyKey: run.concurrencyKey ?? undefined,
timestamp,
+ eligibleAtMs,
attempt: 0,
ttlExpiresAt,
},
diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts
index 949e47f8574..e33b361abdb 100644
--- a/internal-packages/run-engine/src/engine/tests/ttl.test.ts
+++ b/internal-packages/run-engine/src/engine/tests/ttl.test.ts
@@ -293,7 +293,12 @@ describe("RunEngine ttl", () => {
);
assertNonNullable(messageAfterTrigger);
expect(messageAfterTrigger.ttlExpiresAt).toBeDefined();
+ // First enqueue anchors the scheduling-delay clock at the trigger time.
+ expect(messageAfterTrigger.eligibleAtMs).toBe(
+ (run.queueTimestamp ?? run.createdAt).getTime()
+ );
+ const beforeReenqueue = Date.now();
await engine.enqueueSystem.enqueueRun({
run,
env: authenticatedEnvironment,
@@ -308,6 +313,10 @@ describe("RunEngine ttl", () => {
);
assertNonNullable(messageAfterReenqueue);
expect(messageAfterReenqueue.ttlExpiresAt).toBeUndefined();
+ // Re-enqueues anchor to now so the wait metric measures only this queue stint,
+ // while the ordering timestamp keeps the run's original position.
+ expect(messageAfterReenqueue.eligibleAtMs).toBeGreaterThanOrEqual(beforeReenqueue);
+ expect(messageAfterReenqueue.timestamp).toBe(messageAfterTrigger.timestamp);
} finally {
await engine.quit();
}
diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts
index bb1d6eb2fa9..f37ec7df50a 100644
--- a/internal-packages/run-engine/src/engine/types.ts
+++ b/internal-packages/run-engine/src/engine/types.ts
@@ -16,6 +16,7 @@ import {
} from "@trigger.dev/redis-worker";
import type { ControlPlaneResolver } from "./controlPlaneResolver.js";
import type { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js";
+import type { RunQueueMetricsEmitter } from "../run-queue/index.js";
import type { MinimalAuthenticatedEnvironment } from "../shared/index.js";
import type { LockRetryConfig } from "./locking.js";
import type { workerCatalog } from "./workerCatalog.js";
@@ -90,6 +91,8 @@ export type RunEngineOptions = {
defaultEnvConcurrency?: number;
defaultEnvConcurrencyBurstFactor?: number;
logLevel?: LogLevel;
+ /** Optional queue-metrics emitter; enables gauge + counter emission from the RunQueue. */
+ queueMetrics?: RunQueueMetricsEmitter;
queueSelectionStrategyOptions?: Pick<
FairQueueSelectionStrategyOptions,
"parentQueueLimit" | "tracer" | "biases" | "reuseSnapshotCount" | "maximumEnvCount"
diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts
index a0571206538..4e6ca89d847 100644
--- a/internal-packages/run-engine/src/run-queue/index.ts
+++ b/internal-packages/run-engine/src/run-queue/index.ts
@@ -5,6 +5,7 @@ import {
type RedisOptions,
type Result,
} from "@internal/redis";
+import { createMetricsGaugeComputeLua } from "@internal/metrics-pipeline";
import type {
Attributes,
Meter,
@@ -57,6 +58,99 @@ const SemanticAttributes = {
ORG_ID: "runqueue.orgId",
};
+// Prelude spliced at the top of every gauge-carrying script: declares the gauge slot (__qm_g, false until
+// a gauge splice fills it) and the return wrapper. Every return goes through __qmret so the reply
+// is always {original, gauge}. A nil original becomes false, else Lua drops it from the
+// multi-bulk reply (which would swallow the gauge on the dequeue throttle paths).
+const QUEUE_METRICS_GAUGE_PRELUDE = `
+local __qm_g = false
+local function __qmret(r) if r == nil then r = false end return {r, __qm_g} end`;
+
+// Fresh-read gauge for splice points with no reusable locals: enqueue slow-path (before
+// return 0) and the base dequeue top. Gated on the last ARGV ('1') so it is inert unless the
+// caller opts in. CK queues emit per-subqueue depth (queue_name aggregates via the MV).
+const QUEUE_METRICS_GAUGE_LUA = createMetricsGaugeComputeLua({
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('ZCARD', queueKey)",
+ running: "redis.call('SCARD', queueCurrentConcurrencyKey)",
+ queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", // an unset queue limit falls back to 1000000
+ envQueued: "redis.call('ZCARD', envQueueKey)",
+ envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)",
+ envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", // an unset env limit falls back to the script's default
+});
+
+// Enqueue fast-path gauge: the admission check already computed queueCurrent/envCurrent/
+// queueLimit/envLimit, so reuse those script locals (only the 2 ZCARDs stay fresh). Fast path was taken,
+// so cc < lim and thr is always 0 — reusing the effective queueLimit is fine (max() recovers raw).
+const QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA = createMetricsGaugeComputeLua({
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('ZCARD', queueKey)",
+ running: "queueCurrent",
+ queueLimit: "queueLimit",
+ envQueued: "redis.call('ZCARD', envQueueKey)",
+ envRunning: "envCurrent",
+ envLimit: "envLimit",
+});
+
+// CK-health extras: distinct backlogged keys (ZCARD of ckIndex) + most-starved head-of-line wait (ckIndex scores
+// are per-subqueue oldest timestamps). Needs ckIndexKey/currentTime locals; future scores clamp to a wait of 0.
+const QUEUE_METRICS_CK_GAUGE_EXTRAS = {
+ preamble: `local __ckhead = redis.call('ZRANGE', ckIndexKey, 0, 0, 'WITHSCORES')
+ local __ckwait = 0
+ if #__ckhead > 0 then __ckwait = math.floor(math.max(0, (tonumber(currentTime) or 0) - (tonumber(__ckhead[2]) or 0))) end`,
+ ckBacklogged: "redis.call('ZCARD', ckIndexKey)",
+ ckMaxWaitMs: "__ckwait", // local computed by the preamble above
+};
+
+// CK enqueue variant of the fresh-read gauge (same reads and fallbacks), extended with the CK-health tail.
+const QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA = createMetricsGaugeComputeLua({
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('ZCARD', queueKey)",
+ running: "redis.call('SCARD', queueCurrentConcurrencyKey)",
+ queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'",
+ envQueued: "redis.call('ZCARD', envQueueKey)",
+ envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)",
+ envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit",
+ ...QUEUE_METRICS_CK_GAUGE_EXTRAS,
+});
+
+const QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA = createMetricsGaugeComputeLua({ // CK fast-path variant: reuses the admission-check locals
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('ZCARD', queueKey)",
+ running: "queueCurrent",
+ queueLimit: "queueLimit",
+ envQueued: "redis.call('ZCARD', envQueueKey)",
+ envRunning: "envCurrent",
+ envLimit: "envLimit",
+ ...QUEUE_METRICS_CK_GAUGE_EXTRAS, // adds the ckq/ckw tail
+});
+
+// CK dequeue: depth/running from the per-base-queue aggregate counters the run-queue already
+// maintains (two O(1) GETs, not a per-variant scan; a missing counter reads as 0). thr suppressed — an aggregate
+// cc >= per-CK limit would over-report; per-CK throttle is caught by the per-subqueue enqueue gauges.
+const QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA = createMetricsGaugeComputeLua({
+ enabledArg: "ARGV[#ARGV] == '1'",
+ queued: "redis.call('GET', lengthCounterKey) or '0'",
+ running: "redis.call('GET', runningCounterKey) or '0'",
+ queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'",
+ envQueued: "redis.call('ZCARD', envQueueKey)",
+ envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)",
+ envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit",
+ throttledExpr: "false", // thr is always emitted as 0 on this path
+ ...QUEUE_METRICS_CK_GAUGE_EXTRAS,
+});
+
+/** Injected queue-metrics stream emitter; all calls are no-ops when metrics are disabled. */
+export interface RunQueueMetricsEmitter {
+ enabledSync(): boolean;
+ /** enabled AND sampled-in; gates high-frequency sampled emissions (the Lua gauge). */
+ sampledSync(): boolean;
+ /** Counter event (cumulative odometer). NOTE(review): `Record` below is missing its type arguments — confirm. */
+ emit(shardKey: string, fields: Record): void;
+ /** Gauge snapshot read inside the queue-op Lua and returned on the reply (fields carry op "gauge"). */
+ emitGauge(shardKey: string, fields: Record): void;
+}
+
export type RunQueueOptions = {
name: string;
tracer: Tracer;
@@ -93,6 +187,8 @@ export type RunQueueOptions = {
disabled?: boolean;
};
meter?: Meter;
+ /** When set, enqueue/dequeue/ack/nack/dlq emit queue-metrics events (gated on the emitter's flag). */
+ queueMetrics?: RunQueueMetricsEmitter;
dequeueBlockingTimeoutSeconds?: number;
concurrencySweeper?: {
scanSchedule?: string;
@@ -458,6 +554,65 @@ export class RunQueue {
);
}
+ /**
+  * Live per-concurrency-key breakdown of a queue's backlog, most-starved first.
+  * Reads the ckIndex zset (members = CK subqueue names, scores = oldest-message
+  * timestamps), so only keys with queued work appear; running-only keys do not.
+  * `options.limit` caps the keys returned (default 50, floored); a limit <= 0 returns no keys.
+  */
+ public async concurrencyKeyBreakdown(
+   env: MinimalAuthenticatedEnvironment,
+   queue: string,
+   options?: { limit?: number }
+ ): Promise<{
+   totalBackloggedKeys: number;
+   keys: Array<{
+     concurrencyKey: string;
+     queued: number;
+     running: number;
+     oldestEnqueuedAt: number;
+   }>;
+ }> {
+   const limit = Math.floor(options?.limit ?? 50);
+   const ckIndexKey = this.keys.ckIndexKeyFromQueue(this.keys.queueKey(env, queue));
+
+   const indexPipeline = this.redis.pipeline();
+   indexPipeline.zcard(ckIndexKey);
+   // ZRANGE 0 -1 means "the whole set", so a zero/negative limit must never reach Redis as a stop index.
+   if (limit > 0) indexPipeline.zrange(ckIndexKey, 0, limit - 1, "WITHSCORES");
+   const indexResults = await indexPipeline.exec();
+   if (!indexResults) return { totalBackloggedKeys: 0, keys: [] };
+
+   const [totalErr, totalVal] = indexResults[0];
+   const [rangeErr, rangeVal] = indexResults[1] ?? [null, null]; // absent when the range was not requested
+   const totalBackloggedKeys = totalErr || totalVal == null ? 0 : (totalVal as number);
+   const flat = rangeErr || rangeVal == null ? [] : (rangeVal as string[]);
+
+   const members: Array<{ member: string; score: number }> = [];
+   for (let i = 0; i < flat.length; i += 2) {
+     members.push({ member: flat[i], score: Number(flat[i + 1]) }); // WITHSCORES reply alternates member, score
+   }
+   if (members.length === 0) return { totalBackloggedKeys, keys: [] };
+
+   const statsPipeline = this.redis.pipeline();
+   for (const { member } of members) {
+     statsPipeline.zcard(member);
+     statsPipeline.scard(this.keys.queueCurrentConcurrencyKeyFromQueue(member));
+   }
+   const stats = await statsPipeline.exec();
+
+   const keys = members.map(({ member, score }, i) => {
+     const queuedResult = stats?.[i * 2]; // two replies per member: ZCARD then SCARD
+     const runningResult = stats?.[i * 2 + 1];
+     return {
+       concurrencyKey: this.#concurrencyKeyFromQueue(member) ?? "",
+       queued: queuedResult && !queuedResult[0] ? ((queuedResult[1] as number) ?? 0) : 0,
+       running: runningResult && !runningResult[0] ? ((runningResult[1] as number) ?? 0) : 0,
+       oldestEnqueuedAt: score,
+     };
+   });
+   return { totalBackloggedKeys, keys };
+ }
+
public async lengthOfEnvQueue(env: MinimalAuthenticatedEnvironment) {
return this.redis.zcard(this.keys.envQueueKey(env));
}
@@ -751,6 +906,8 @@ export class RunQueue {
span.setAttribute("fastPath", fastPathTaken);
+ this.#emitQueueMetric(queueKey, { op: "enqueue", q: queueKey });
+
if (!fastPathTaken && !skipDequeueProcessing) {
// Slow path: schedule the dequeue job to move the message from queue to worker queue
await this.worker.enqueueOnce({
@@ -810,6 +967,15 @@ export class RunQueue {
...flattenAttributes(dequeuedMessage.message, "message"),
});
+ const startedFields: Record = {
+ op: "started",
+ q: dequeuedMessage.message.queue,
+ };
+ if (typeof dequeuedMessage.message.eligibleAtMs === "number") {
+ startedFields.wait = Math.max(0, Date.now() - dequeuedMessage.message.eligibleAtMs);
+ }
+ this.#emitQueueMetric(dequeuedMessage.message.queue, startedFields);
+
return dequeuedMessage;
},
{
@@ -877,6 +1043,8 @@ export class RunQueue {
message,
removeFromWorkerQueue: options?.removeFromWorkerQueue,
});
+
+ this.#emitQueueMetric(message.queue, { op: "ack", q: message.queue });
},
{
kind: SpanKind.CONSUMER,
@@ -934,6 +1102,7 @@ export class RunQueue {
message.attempt = message.attempt + 1;
if (message.attempt >= maxAttempts) {
await this.#callMoveToDeadLetterQueue({ message });
+ this.#emitQueueMetric(message.queue, { op: "dlq", q: message.queue });
return false;
}
}
@@ -960,6 +1129,8 @@ export class RunQueue {
await this.#callNackMessage({ message, retryAt });
+ this.#emitQueueMetric(message.queue, { op: "nack", q: message.queue });
+
return true;
},
{
@@ -1831,6 +2002,57 @@ export class RunQueue {
*
* @returns true if the fast path was taken (message pushed directly to worker queue)
*/
+ // NOTE(review): the JSDoc block ending right above ("@returns true if the fast path was
+ // taken") reads like #callEnqueueMessage's documentation; with this helper inserted beneath
+ // it, that comment now attaches here. Confirm and move the helpers above the JSDoc.
+ //
+ // Gauge gate passed to the Lua scripts as ARGV: enabled AND sampled-in yields "1", anything
+ // else (including no queueMetrics configured) yields "0". Sampling applies to the gauge,
+ // not to counters.
+ #queueMetricsGaugeArg(): string {
+   const sampledIn = this.options.queueMetrics?.sampledSync() ?? false;
+   return sampledIn ? "1" : "0";
+ }
+
+ // Forwards a gauge reading that rode back on a script reply. The reading is a flat
+ // [ql, cc, lim, eql, ec, elim, thr] array, plus an optional [ckq, ckw] tail on CK-path
+ // scripts. Anything that is not an array of at least 7 entries is silently dropped.
+ // Unlike counters, gauges are NOT base-normalized: the q label keeps its :ck: suffix so
+ // the CK-aggregate and per-subqueue readings stay distinguishable; the consumer's mapEntry
+ // strips :ck: to the base queue_name and the MV maxes them into one row.
+ #emitGauge(queue: string, gauge: number[]): void {
+   if (!Array.isArray(gauge) || gauge.length < 7) return;
+   const [ql, cc, lim, eql, ec, elim, thr, ckq, ckw] = gauge;
+   const fields: Record<string, string | number> = {
+     op: "gauge",
+     q: queue,
+     ql,
+     cc,
+     lim,
+     eql,
+     ec,
+     elim,
+     thr,
+   };
+   // The CK-health tail is only attached when the script actually returned both values.
+   if (gauge.length >= 9) {
+     fields.ckq = ckq;
+     fields.ckw = ckw;
+   }
+   this.options.queueMetrics?.emitGauge(queue, fields);
+ }
+
+ // Returns whatever follows the first ":ck:" marker in a queue key, or undefined when the
+ // marker is absent or nothing follows it.
+ #concurrencyKeyFromQueue(queue: string): string | undefined {
+   const marker = ":ck:";
+   const markerAt = queue.indexOf(marker);
+   if (markerAt === -1) return undefined;
+   const keyStart = markerAt + marker.length;
+   return keyStart < queue.length ? queue.slice(keyStart) : undefined;
+ }
+
+ // Emits a counter event. Counters roll up per BASE queue: both the shard key and the `q`
+ // field are normalized from the CK-qualified queue to its base, so all concurrency keys
+ // share one monotonic odometer (and one shard/order key), matching the base queue_name the
+ // consumer buckets on. A real concurrency key (anything but the "*" wildcard) rides along
+ // as `ck`, driving a SEPARATE per-key odometer on the same entry (per-key history tier).
+ #emitQueueMetric(shardKey: string, fields: Record<string, string | number>): void {
+   const baseQueue = this.keys.baseQueueKeyFromQueue(shardKey);
+   let baseFields = fields;
+   if (typeof fields.q === "string") {
+     // Work on a copy so the caller's object is never mutated.
+     baseFields = { ...fields, q: this.keys.baseQueueKeyFromQueue(fields.q) };
+     const ck = this.#concurrencyKeyFromQueue(fields.q);
+     if (ck && ck !== "*") baseFields.ck = ck;
+   }
+   this.options.queueMetrics?.emit(baseQueue, baseFields);
+ }
+
async #callEnqueueMessage(
message: OutputPayloadV2,
ttlInfo?: {
@@ -1869,6 +2091,7 @@ export class RunQueue {
const messageScore = String(message.timestamp);
const currentTime = String(Date.now());
const enableFastPathArg = enableFastPath ? "1" : "0";
+ const metricsGaugeArg = this.#queueMetricsGaugeArg();
const defaultEnvConcurrencyLimit = String(this.options.defaultEnvConcurrency);
const defaultEnvConcurrencyBurstFactor = String(
this.options.defaultEnvConcurrencyBurstFactor ?? 1.0
@@ -1892,7 +2115,8 @@ export class RunQueue {
service: this.name,
});
- let result: number;
+ // Every gauge-carrying script returns a 2-tuple [originalReturn, gauge|null].
+ let result: [number, number[] | null];
// Use CK-aware enqueue for messages with concurrency keys
if (message.concurrencyKey) {
@@ -1935,7 +2159,8 @@ export class RunQueue {
currentTime,
enableFastPathArg,
ckKeyPrefix,
- String(this.counterTtlSeconds)
+ String(this.counterTtlSeconds),
+ metricsGaugeArg
);
} else {
result = await this.redis.enqueueMessageCkTracked(
@@ -1967,7 +2192,8 @@ export class RunQueue {
currentTime,
enableFastPathArg,
ckKeyPrefix,
- String(this.counterTtlSeconds)
+ String(this.counterTtlSeconds),
+ metricsGaugeArg
);
}
} else if (ttlInfo) {
@@ -1998,7 +2224,8 @@ export class RunQueue {
defaultEnvConcurrencyLimit,
defaultEnvConcurrencyBurstFactor,
currentTime,
- enableFastPathArg
+ enableFastPathArg,
+ metricsGaugeArg
);
} else {
result = await this.redis.enqueueMessage(
@@ -2024,11 +2251,14 @@ export class RunQueue {
defaultEnvConcurrencyLimit,
defaultEnvConcurrencyBurstFactor,
currentTime,
- enableFastPathArg
+ enableFastPathArg,
+ metricsGaugeArg
);
}
- return result === 1;
+ const [enqueueResult, gauge] = result;
+ if (gauge) this.#emitGauge(queueName, gauge);
+ return enqueueResult === 1;
}
async #callDequeueMessagesFromQueue({
@@ -2081,7 +2311,9 @@ export class RunQueue {
maxCount,
});
- const result = await this.redis.dequeueMessagesFromQueue(
+ const metricsGaugeArg = this.#queueMetricsGaugeArg();
+
+ const reply = await this.redis.dequeueMessagesFromQueue(
//keys
messageQueue,
queueConcurrencyLimitKey,
@@ -2099,9 +2331,16 @@ export class RunQueue {
String(this.options.defaultEnvConcurrency),
String(this.options.defaultEnvConcurrencyBurstFactor ?? 1),
this.options.redis.keyPrefix ?? "",
- String(maxCount)
+ String(maxCount),
+ metricsGaugeArg
);
+ // Reply is [flatMessages|null, gauge|null]: emit the gauge (read atomically inside
+ // the script, present on the throttle/empty paths too) and keep element 0 as the array.
+ const gauge = reply?.[1] ?? null;
+ if (gauge) this.#emitGauge(messageQueue, gauge);
+ const result = reply?.[0] ?? null;
+
if (!result) {
span.setAttribute("message_count", 0);
@@ -2202,8 +2441,11 @@ export class RunQueue {
});
const lengthCounterKey = this.keys.queueLengthCounterKeyFromQueue(ckWildcardQueue);
+ const runningCounterKey = this.keys.queueRunningCounterKeyFromQueue(ckWildcardQueue);
+
+ const metricsGaugeArg = this.#queueMetricsGaugeArg();
- const result = await this.redis.dequeueMessagesFromCkQueueTracked(
+ const reply = await this.redis.dequeueMessagesFromCkQueueTracked(
//keys
ckIndexKey,
queueConcurrencyLimitKey,
@@ -2215,15 +2457,22 @@ export class RunQueue {
masterQueueKey,
ttlQueueKey,
lengthCounterKey,
+ runningCounterKey,
//args
ckWildcardQueue,
String(Date.now()),
String(this.options.defaultEnvConcurrency),
String(this.options.defaultEnvConcurrencyBurstFactor ?? 1),
this.options.redis.keyPrefix ?? "",
- String(maxCount)
+ String(maxCount),
+ metricsGaugeArg
);
+ // Reply is [flatMessages|null, gauge|null]; the CK aggregate gauge rides here.
+ const gauge = reply?.[1] ?? null;
+ if (gauge) this.#emitGauge(ckWildcardQueue, gauge);
+ const result = reply?.[0] ?? null;
+
if (!result) {
span.setAttribute("message_count", 0);
return [];
@@ -3062,6 +3311,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[7]
local currentTime = ARGV[8]
local enableFastPath = ARGV[9]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3083,7 +3334,8 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
- return 1
+${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA}
+ return __qmret(1)
end
end
end
@@ -3113,8 +3365,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3153,6 +3406,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[9]
local currentTime = ARGV[10]
local enableFastPath = ARGV[11]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3174,8 +3429,9 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
+${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA}
-- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently
- return 1
+ return __qmret(1)
end
end
end
@@ -3208,8 +3464,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3246,6 +3503,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[8]
local currentTime = ARGV[9]
local enableFastPath = ARGV[10]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3268,7 +3527,8 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
- return 1
+${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA}
+ return __qmret(1)
end
end
end
@@ -3304,8 +3564,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3344,6 +3605,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[10]
local currentTime = ARGV[11]
local enableFastPath = ARGV[12]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3365,8 +3628,9 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
+${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA}
-- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently
- return 1
+ return __qmret(1)
end
end
end
@@ -3405,8 +3669,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3455,6 +3720,8 @@ local keyPrefix = ARGV[11]
-- TTL (seconds) applied to counter lazy-init SETs
local counterTtl = ARGV[12]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3476,10 +3743,11 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
+${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA}
-- Fast-path skips the CK variant zset entirely; lengthCounter is unchanged.
-- runningCounter is bumped later by dequeueMessageFromKeyTracked when the
-- worker pulls the message from the worker queue.
- return 1
+ return __qmret(1)
end
end
end
@@ -3531,8 +3799,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3576,6 +3845,8 @@ local keyPrefix = ARGV[13]
-- TTL (seconds) applied to counter lazy-init SETs
local counterTtl = ARGV[14]
+${QUEUE_METRICS_GAUGE_PRELUDE}
+
-- Fast path: check if we can skip the queue and go directly to worker queue
if enableFastPath == '1' then
local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1)
@@ -3597,7 +3868,8 @@ if enableFastPath == '1' then
redis.call('SADD', queueCurrentConcurrencyKey, messageId)
redis.call('SADD', envCurrentConcurrencyKey, messageId)
redis.call('RPUSH', workerQueueKey, messageKeyValue)
- return 1
+${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA}
+ return __qmret(1)
end
end
end
@@ -3645,8 +3917,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId)
redis.call('SREM', envCurrentConcurrencyKey, messageId)
redis.call('SREM', queueCurrentDequeuedKey, messageId)
redis.call('SREM', envCurrentDequeuedKey, messageId)
+${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA}
-return 0
+return __qmret(0)
`,
});
@@ -3891,6 +4164,8 @@ local defaultEnvConcurrencyLimit = ARGV[3]
local defaultEnvConcurrencyBurstFactor = ARGV[4]
local keyPrefix = ARGV[5]
local maxCount = tonumber(ARGV[6] or '1')
+${QUEUE_METRICS_GAUGE_PRELUDE}
+${QUEUE_METRICS_GAUGE_LUA}
-- Check current env concurrency against the limit
local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0')
@@ -3899,7 +4174,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency
local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor)
if envCurrentConcurrency >= envConcurrencyLimitWithBurstFactor then
- return nil
+ return __qmret(nil)
end
-- Check current queue concurrency against the limit
@@ -3909,7 +4184,7 @@ local totalQueueConcurrencyLimit = queueConcurrencyLimit
-- Check condition only if concurrencyLimit exists
if queueCurrentConcurrency >= totalQueueConcurrencyLimit then
- return nil
+ return __qmret(nil)
end
-- Calculate how many messages we can actually dequeue based on concurrency limits
@@ -3918,14 +4193,14 @@ local queueAvailableCapacity = totalQueueConcurrencyLimit - queueCurrentConcurre
local actualMaxCount = math.min(maxCount, envAvailableCapacity, queueAvailableCapacity)
if actualMaxCount <= 0 then
- return nil
+ return __qmret(nil)
end
-- Attempt to dequeue messages up to actualMaxCount
local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, actualMaxCount)
if #messages == 0 then
- return nil
+ return __qmret(nil)
end
local results = {}
@@ -3991,7 +4266,7 @@ else
end
-- Return results as a flat array: [messageId1, messageScore1, messagePayload1, messageId2, messageScore2, messagePayload2, ...]
-return results
+return __qmret(results)
`,
});
@@ -4145,7 +4420,7 @@ return results
// (normal dequeue, TTL-expired, or stale-orphan path — all of which were
// counted at enqueue time).
this.redis.defineCommand("dequeueMessagesFromCkQueueTracked", {
- numberOfKeys: 10,
+ numberOfKeys: 11,
lua: `
local ckIndexKey = KEYS[1]
local queueConcurrencyLimitKey = KEYS[2]
@@ -4157,6 +4432,7 @@ local envQueueKey = KEYS[7]
local masterQueueKey = KEYS[8]
local ttlQueueKey = KEYS[9]
local lengthCounterKey = KEYS[10]
+local runningCounterKey = KEYS[11]
local ckWildcardName = ARGV[1]
local currentTime = tonumber(ARGV[2])
@@ -4164,6 +4440,8 @@ local defaultEnvConcurrencyLimit = ARGV[3]
local defaultEnvConcurrencyBurstFactor = ARGV[4]
local keyPrefix = ARGV[5]
local maxCount = tonumber(ARGV[6] or '1')
+${QUEUE_METRICS_GAUGE_PRELUDE}
+${QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA}
local function decrLengthCounter()
if tonumber(redis.call('GET', lengthCounterKey) or '0') > 0 then
@@ -4178,7 +4456,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency
local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor)
if envCurrentConcurrency >= envConcurrencyLimitWithBurstFactor then
- return nil
+ return __qmret(nil)
end
local queueConcurrencyLimit = math.min(tonumber(redis.call('GET', queueConcurrencyLimitKey) or '1000000'), envConcurrencyLimit)
@@ -4187,7 +4465,7 @@ local envAvailableCapacity = envConcurrencyLimitWithBurstFactor - envCurrentConc
local actualMaxCount = math.min(maxCount, envAvailableCapacity)
if actualMaxCount <= 0 then
- return nil
+ return __qmret(nil)
end
local ckQueues = redis.call('ZRANGEBYSCORE', ckIndexKey, '-inf', tostring(currentTime), 'LIMIT', 0, actualMaxCount * 3)
@@ -4199,7 +4477,7 @@ if #ckQueues == 0 then
else
redis.call('ZADD', masterQueueKey, anyIdx[2], ckWildcardName)
end
- return nil
+ return __qmret(nil)
end
local results = {}
@@ -4281,7 +4559,7 @@ else
redis.call('ZADD', masterQueueKey, earliestIdx[2], ckWildcardName)
end
-return results
+return __qmret(results)
`,
});
@@ -5199,8 +5477,9 @@ declare module "@internal/redis" {
defaultEnvConcurrencyBurstFactor: string,
currentTime: string,
enableFastPath: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
enqueueMessageWithTtl(
//keys
@@ -5229,8 +5508,9 @@ declare module "@internal/redis" {
defaultEnvConcurrencyBurstFactor: string,
currentTime: string,
enableFastPath: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
expireTtlRuns(
//keys
@@ -5265,8 +5545,9 @@ declare module "@internal/redis" {
defaultEnvConcurrencyBurstFactor: string,
keyPrefix: string,
maxCount: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[string[] | null, number[] | null]>
+ ): Result<[string[] | null, number[] | null], Context>;
dequeueMessageFromWorkerQueueNonBlocking(
workerQueueKey: string,
@@ -5405,8 +5686,9 @@ declare module "@internal/redis" {
defaultEnvConcurrencyBurstFactor: string,
currentTime: string,
enableFastPath: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
enqueueMessageWithTtlCk(
//keys
@@ -5437,8 +5719,9 @@ declare module "@internal/redis" {
defaultEnvConcurrencyBurstFactor: string,
currentTime: string,
enableFastPath: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
dequeueMessagesFromCkQueue(
//keys
@@ -5551,8 +5834,9 @@ declare module "@internal/redis" {
enableFastPath: string,
keyPrefix: string,
counterTtl: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
enqueueMessageWithTtlCkTracked(
masterQueueKey: string,
@@ -5585,8 +5869,9 @@ declare module "@internal/redis" {
enableFastPath: string,
keyPrefix: string,
counterTtl: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[number, number[] | null]>
+ ): Result<[number, number[] | null], Context>;
dequeueMessagesFromCkQueueTracked(
ckIndexKey: string,
@@ -5599,14 +5884,16 @@ declare module "@internal/redis" {
masterQueueKey: string,
ttlQueueKey: string,
lengthCounterKey: string,
+ runningCounterKey: string,
ckWildcardName: string,
currentTime: string,
defaultEnvConcurrencyLimit: string,
defaultEnvConcurrencyBurstFactor: string,
keyPrefix: string,
maxCount: string,
- callback?: Callback
- ): Result;
+ metricsEnabled: string,
+ callback?: Callback<[string[] | null, number[] | null]>
+ ): Result<[string[] | null, number[] | null], Context>;
dequeueMessageFromKeyTracked(
messageKey: string,
diff --git a/internal-packages/run-engine/src/run-queue/keyProducer.ts b/internal-packages/run-engine/src/run-queue/keyProducer.ts
index b185435f6f6..18a2727b7e4 100644
--- a/internal-packages/run-engine/src/run-queue/keyProducer.ts
+++ b/internal-packages/run-engine/src/run-queue/keyProducer.ts
@@ -141,8 +141,7 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer {
}
queueConcurrencyLimitKeyFromQueue(queue: string) {
- const concurrencyQueueName = queue.replace(/:ck:.+$/, "");
- return `${concurrencyQueueName}:${constants.CONCURRENCY_LIMIT_PART}`;
+ // The concurrency-limit key hangs off the base queue, so any ":ck:<key>" suffix is dropped first.
+ const baseQueue = this.baseQueueKeyFromQueue(queue);
+ return `${baseQueue}:${constants.CONCURRENCY_LIMIT_PART}`;
}
queueCurrentConcurrencyKeyFromQueue(queue: string) {
@@ -313,12 +312,14 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer {
}
ckIndexKeyFromQueue(queue: string): string {
- const baseQueue = queue.replace(/:ck:.+$/, "");
- return `${baseQueue}:${constants.CK_INDEX_PART}`;
+ // The CK index key hangs off the base queue, so any ":ck:<key>" suffix is dropped first.
+ const baseQueue = this.baseQueueKeyFromQueue(queue);
+ return `${baseQueue}:${constants.CK_INDEX_PART}`;
}
+ // Strips the ":ck:<key>" suffix from a queue key, cutting at the first ":ck:" marker.
+ // Uses indexOf instead of /:ck:.+$/ (queue names are user-controlled; polynomial regex).
+ // A marker with nothing after it is left in place, as the old pattern did. The one
+ // difference: the old `.+` never matched line terminators, so a key containing one after
+ // the marker used to be returned unchanged and is now stripped.
baseQueueKeyFromQueue(queue: string): string {
- return queue.replace(/:ck:.+$/, "");
+ const marker = ":ck:";
+ const markerAt = queue.indexOf(marker);
+ if (markerAt === -1) return queue;
+ const hasKey = markerAt + marker.length < queue.length;
+ return hasKey ? queue.slice(0, markerAt) : queue;
}
queueLengthCounterKey(env: RunQueueKeyProducerEnvironment, queue: string): string {
@@ -342,7 +343,8 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer {
}
toCkWildcard(queue: string): string {
- return queue.replace(/:ck:.+$/, ":ck:*");
+ // Swap the concurrency key for "*"; a queue without a key comes back unchanged.
+ const baseQueue = this.baseQueueKeyFromQueue(queue);
+ if (baseQueue === queue) {
+ return queue;
+ }
+ return `${baseQueue}:ck:*`;
}
descriptorFromQueue(queue: string): QueueDescriptor {
diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts
new file mode 100644
index 00000000000..ebfc295470e
--- /dev/null
+++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts
@@ -0,0 +1,397 @@
+import { createRedisClient } from "@internal/redis";
+import { redisTest } from "@internal/testcontainers";
+import { trace } from "@internal/tracing";
+import {
+ allStreamKeys,
+ MetricsStreamEmitter,
+ type MetricDefinition,
+} from "@internal/metrics-pipeline";
+import { Logger } from "@trigger.dev/core/logger";
+import { Decimal } from "@trigger.dev/database";
+import { setTimeout } from "node:timers/promises";
+import { describe, expect } from "vitest";
+import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js";
+import { RunQueue } from "./index.js";
+import { RunQueueFullKeyProducer } from "./keyProducer.js";
+import type { InputPayload } from "./types.js";
+// Environment fixture passed as `env` to enqueueMessage; its id also serves as the workerQueue name and the message environmentId below.
+const authenticatedEnvDev = {
+ id: "e1234",
+ type: "DEVELOPMENT" as const,
+ maximumConcurrencyLimit: 10,
+ concurrencyLimitBurstFactor: new Decimal(1.0),
+ project: { id: "p1234" },
+ organization: { id: "o1234" },
+};
+
+// Reads every entry currently in the definition's metric streams (XRANGE "-" "+" on each key
+// from allStreamKeys) and turns XRANGE's flat [field, value, ...] arrays into plain objects.
+// Uses its own short-lived client with keyPrefix forced to undefined, overriding any prefix
+// carried in redisOptions.
+async function readAllEntries(
+  redisOptions: {
+    host: string;
+    port: number;
+  },
+  definition: MetricDefinition
+) {
+  const client = createRedisClient({ ...redisOptions, keyPrefix: undefined });
+  try {
+    const entries: Array<{ id: string; fields: Record<string, string> }> = [];
+    for (const key of allStreamKeys(definition)) {
+      const raw = (await client.xrange(key, "-", "+")) as Array<[string, string[]]>;
+      for (const [id, flat] of raw) {
+        const fields: Record<string, string> = {};
+        // A dangling field name without a value (odd-length array) is ignored.
+        for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] = flat[i + 1]!;
+        entries.push({ id, fields });
+      }
+    }
+    return entries;
+  } finally {
+    // Close the connection even when a read throws, so a failing test does not leak it.
+    await client.quit();
+  }
+}
+
+// Gauges land via a fire-and-forget Node XADD after the script reply (not synchronously
+// inside the Lua), so reads must poll until the expected entries appear.
+// Re-reads all entries every 50ms until `predicate` accepts them or `timeoutMs` has elapsed.
+// On timeout the latest snapshot is returned instead of throwing, which leaves it to the
+// caller's own assertions to fail.
+async function waitForEntries(
+  redisOptions: { host: string; port: number },
+  definition: MetricDefinition,
+  predicate: (entries: Array<{ id: string; fields: Record<string, string> }>) => boolean,
+  timeoutMs = 5000
+) {
+  const start = Date.now();
+  let entries = await readAllEntries(redisOptions, definition);
+  while (!predicate(entries)) {
+    if (Date.now() - start > timeoutMs) return entries;
+    await setTimeout(50);
+    entries = await readAllEntries(redisOptions, definition);
+  }
+  return entries;
+}
+// Runs RunQueue against the Redis supplied by redisTest and inspects the raw metric-stream entries written through MetricsStreamEmitter.
+describe("RunQueue queue-metrics emission", () => {
+ redisTest("emits gauge + enqueue/started/ack events when enabled", async ({ redisContainer }) => {
+ const redis = {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ };
+ const definition: MetricDefinition = {
+ name: `qm_test_${Date.now()}`,
+ shardCount: 2,
+ consumerGroup: "cg",
+ maxLen: 1000,
+ };
+ const emitter = new MetricsStreamEmitter({
+ redis,
+ definition,
+ flag: { enabled: () => true },
+ });
+
+ const queue = new RunQueue({
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ defaultEnvConcurrency: 25,
+ logger: new Logger("RunQueue", "error"),
+ keys: new RunQueueFullKeyProducer(),
+ queueSelectionStrategy: new FairQueueSelectionStrategy({
+ redis,
+ keys: new RunQueueFullKeyProducer(),
+ }),
+ redis,
+ queueMetrics: emitter,
+ });
+ // eligibleAtMs is set 500ms in the past so the "started" entry should carry a `wait` field.
+ const message: InputPayload = {
+ runId: "r-metrics",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: authenticatedEnvDev.id,
+ environmentType: "DEVELOPMENT",
+ queue: "task/my-task",
+ timestamp: Date.now(),
+ eligibleAtMs: Date.now() - 500,
+ attempt: 0,
+ };
+ // Flow: enqueue, pause (presumably so the message reaches the worker queue — TODO confirm), dequeue, ack, then poll the streams.
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message,
+ workerQueue: authenticatedEnvDev.id,
+ });
+ await setTimeout(1000);
+ const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id);
+ expect(dequeued?.messageId).toBe(message.runId);
+ await queue.acknowledgeMessage(message.orgId, message.runId);
+ await setTimeout(100);
+ // Poll until all four ops have shown up; if waitForEntries times out, the expects below fail instead.
+ const entries = await waitForEntries(redis, definition, (es) => {
+ const seen = es.map((e) => e.fields.op);
+ return ["enqueue", "gauge", "started", "ack"].every((o) => seen.includes(o));
+ });
+ const ops = entries.map((e) => e.fields.op);
+ expect(ops).toContain("enqueue");
+ expect(ops).toContain("gauge");
+ expect(ops).toContain("started");
+ expect(ops).toContain("ack");
+
+ const gauge = entries.find((e) => e.fields.op === "gauge");
+ assertGauge(gauge);
+ expect(gauge!.fields.q).toContain("task/my-task");
+ for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) {
+ expect(gauge!.fields[f]).toBeDefined();
+ }
+ // Non-CK scripts keep the 7-field gauge (no CK-health tail).
+ expect(gauge!.fields.ckq).toBeUndefined();
+ expect(gauge!.fields.ckw).toBeUndefined();
+
+ // The first counter emission also seeds a cum=0 baseline (no wait); the real reading
+ // carries wait. Pick the reading (cum > 0).
+ const started = entries.find((e) => e.fields.op === "started" && Number(e.fields.cum) > 0);
+ expect(started!.fields.wait).toBeDefined();
+ expect(Number(started!.fields.wait)).toBeGreaterThanOrEqual(0);
+ expect(Number(started!.fields.cum)).toBeGreaterThan(0);
+ } finally {
+ await queue.quit();
+ await emitter.close();
+ }
+ });
+
+ redisTest(
+ "emits a fast-path gauge reusing the admission-check locals",
+ async ({ redisContainer }) => {
+ const redis = {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ };
+ const definition: MetricDefinition = {
+ name: `qm_fp_${Date.now()}`,
+ shardCount: 2,
+ consumerGroup: "cg",
+ maxLen: 1000,
+ };
+ const emitter = new MetricsStreamEmitter({
+ redis,
+ definition,
+ flag: { enabled: () => true },
+ });
+ const queue = new RunQueue({
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ defaultEnvConcurrency: 25,
+ logger: new Logger("RunQueue", "error"),
+ keys: new RunQueueFullKeyProducer(),
+ queueSelectionStrategy: new FairQueueSelectionStrategy({
+ redis,
+ keys: new RunQueueFullKeyProducer(),
+ }),
+ redis,
+ queueMetrics: emitter,
+ });
+ // No eligibleAtMs on this message; the test only inspects the gauge and enqueue entries.
+ const message: InputPayload = {
+ runId: "r-fastpath",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: authenticatedEnvDev.id,
+ environmentType: "DEVELOPMENT",
+ queue: "task/my-task",
+ timestamp: Date.now(),
+ attempt: 0,
+ };
+
+ try {
+ // enableFastPath + empty queue + zero concurrency => the Lua takes the fast path,
+ // so the gauge runs the reuse snippet (queueCurrent/envCurrent/queueLimit/envLimit).
+ await queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message,
+ workerQueue: authenticatedEnvDev.id,
+ enableFastPath: true,
+ });
+ const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id);
+ expect(dequeued?.messageId).toBe(message.runId);
+
+ const entries = await waitForEntries(
+ redis,
+ definition,
+ (es) =>
+ es.some((e) => e.fields.op === "gauge") && es.some((e) => e.fields.op === "enqueue")
+ );
+ const gauge = entries.find((e) => e.fields.op === "gauge");
+ assertGauge(gauge);
+ for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) {
+ expect(gauge!.fields[f]).toBeDefined();
+ }
+ // Fast path was taken => capacity was available => not throttled.
+ expect(gauge!.fields.thr).toBe("0");
+ expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true);
+ } finally {
+ await queue.quit();
+ await emitter.close();
+ }
+ }
+ );
+
+ redisTest("emits an aggregate gauge for CK queues at dequeue", async ({ redisContainer }) => {
+ const redis = {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ };
+ const definition: MetricDefinition = {
+ name: `qm_ck_${Date.now()}`,
+ shardCount: 2,
+ consumerGroup: "cg",
+ maxLen: 1000,
+ };
+ const emitter = new MetricsStreamEmitter({ redis, definition, flag: { enabled: () => true } });
+ const queue = new RunQueue({
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ defaultEnvConcurrency: 25,
+ logger: new Logger("RunQueue", "error"),
+ keys: new RunQueueFullKeyProducer(),
+ queueSelectionStrategy: new FairQueueSelectionStrategy({
+ redis,
+ keys: new RunQueueFullKeyProducer(),
+ }),
+ redis,
+ queueMetrics: emitter,
+ });
+ // Setting concurrencyKey sends this message down the CK-aware enqueue path.
+ const message: InputPayload = {
+ runId: "r-ck",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: authenticatedEnvDev.id,
+ environmentType: "DEVELOPMENT",
+ queue: "task/my-task",
+ concurrencyKey: "tenant-1",
+ timestamp: Date.now(),
+ eligibleAtMs: Date.now() - 300,
+ attempt: 0,
+ };
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message,
+ workerQueue: authenticatedEnvDev.id,
+ });
+ await setTimeout(1000);
+ const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id);
+ expect(dequeued?.messageId).toBe(message.runId);
+
+ const entries = await waitForEntries(redis, definition, (es) =>
+ es.some(
+ (e) => e.fields.op === "gauge" && e.fields.q.includes(":ck:") && e.fields.thr === "0"
+ )
+ );
+ const gauges = entries.filter((e) => e.fields.op === "gauge");
+ expect(gauges.length).toBeGreaterThan(0);
+ // The aggregate CK dequeue gauge targets the CK wildcard and never sets thr.
+ const aggregate = gauges.find((e) => e.fields.q.includes(":ck:") && e.fields.thr === "0");
+ assertGauge(aggregate);
+ expect(Number(aggregate!.fields.ql)).toBeGreaterThanOrEqual(0);
+ expect(Number(aggregate!.fields.cc)).toBeGreaterThanOrEqual(0);
+
+ // Every CK-path gauge carries the CK-health tail; the enqueue-time reading (and the
+ // pre-dequeue aggregate reading) sees the backlogged key.
+ const ckGauges = gauges.filter((e) => e.fields.q.includes(":ck:"));
+ for (const g of ckGauges) {
+ expect(g.fields.ckq).toBeDefined();
+ expect(g.fields.ckw).toBeDefined();
+ expect(Number(g.fields.ckw)).toBeGreaterThanOrEqual(0);
+ }
+ expect(ckGauges.some((g) => Number(g.fields.ckq) >= 1)).toBe(true);
+
+ // CK counter entries carry both odometers: the reading has cum + ck/ckcum, and each
+ // odometer seeds its own baseline entry (cum-only vs ck+ckcum-only).
+ const enqueues = entries.filter((e) => e.fields.op === "enqueue");
+ const reading = enqueues.find((e) => e.fields.cum != null && e.fields.ckcum != null);
+ expect(reading).toBeDefined();
+ expect(reading!.fields.ck).toBe("tenant-1");
+ expect(reading!.fields.q).not.toContain(":ck:");
+ expect(Number(reading!.fields.cum)).toBe(1);
+ expect(Number(reading!.fields.ckcum)).toBe(1);
+ const baseBaseline = enqueues.find((e) => e.fields.cum === "0" && e.fields.ck == null);
+ expect(baseBaseline).toBeDefined();
+ const ckBaseline = enqueues.find((e) => e.fields.ckcum === "0" && e.fields.cum == null);
+ expect(ckBaseline).toBeDefined();
+ expect(ckBaseline!.fields.ck).toBe("tenant-1");
+ } finally {
+ await queue.quit();
+ await emitter.close();
+ }
+ });
+
+ redisTest("gauge sampling gates gauges but not counters", async ({ redisContainer }) => {
+ const redis = {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ };
+ const definition: MetricDefinition = {
+ name: `qm_sample_${Date.now()}`,
+ shardCount: 2,
+ consumerGroup: "cg",
+ maxLen: 1000,
+ };
+ // gaugeSampleRate 0 => sampledSync() always false => Lua gauge never fires; counters still do.
+ const emitter = new MetricsStreamEmitter({
+ redis,
+ definition,
+ flag: { enabled: () => true },
+ gaugeSampleRate: 0,
+ });
+ const queue = new RunQueue({
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ defaultEnvConcurrency: 25,
+ logger: new Logger("RunQueue", "error"),
+ keys: new RunQueueFullKeyProducer(),
+ queueSelectionStrategy: new FairQueueSelectionStrategy({
+ redis,
+ keys: new RunQueueFullKeyProducer(),
+ }),
+ redis,
+ queueMetrics: emitter,
+ });
+
+ const message: InputPayload = {
+ runId: "r-sample",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: authenticatedEnvDev.id,
+ environmentType: "DEVELOPMENT",
+ queue: "task/my-task",
+ timestamp: Date.now(),
+ attempt: 0,
+ };
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message,
+ workerQueue: authenticatedEnvDev.id,
+ });
+ await setTimeout(1000);
+ await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id);
+
+ // Poll until the counter (enqueue) lands; by then a gauge would have too, if sampled in.
+ const entries = await waitForEntries(redis, definition, (es) =>
+ es.some((e) => e.fields.op === "enqueue")
+ );
+ expect(entries.some((e) => e.fields.op === "gauge")).toBe(false);
+ expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true);
+ } finally {
+ await queue.quit();
+ await emitter.close();
+ }
+ });
+});
+
+// Assertion helper: throws unless a gauge entry was found, and narrows `gauge` to truthy for
+// the code that follows the call.
+function assertGauge(gauge: unknown): asserts gauge {
+  if (gauge) return;
+  throw new Error("expected a gauge entry");
+}
diff --git a/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts b/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts
index 224540f4efb..4eb47d59bc0 100644
--- a/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts
+++ b/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts
@@ -471,4 +471,46 @@ describe("CK Index", () => {
await queue.quit();
}
});
+
+ redisTest(
+ "concurrencyKeyBreakdown lists backlogged keys most-starved first",
+ async ({ redisContainer }) => {
+ const queue = createQueue(redisContainer);
+ try {
+ const now = Date.now();
+ const enqueue = (runId: string, concurrencyKey: string, timestamp: number) =>
+ queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message: makeMessage({ runId, concurrencyKey, timestamp }),
+ workerQueue: authenticatedEnvDev.id,
+ skipDequeueProcessing: true,
+ });
+
+ // ck-a has the oldest head (most starved) and 2 queued; ck-b has 1.
+ await enqueue("r1", "ck-a", now - 10_000);
+ await enqueue("r2", "ck-a", now - 5_000);
+ await enqueue("r3", "ck-b", now - 2_000);
+
+ const breakdown = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/my-task");
+ expect(breakdown.totalBackloggedKeys).toBe(2);
+ expect(breakdown.keys).toEqual([
+ { concurrencyKey: "ck-a", queued: 2, running: 0, oldestEnqueuedAt: now - 10_000 },
+ { concurrencyKey: "ck-b", queued: 1, running: 0, oldestEnqueuedAt: now - 2_000 },
+ ]);
+
+ const limited = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/my-task", {
+ limit: 1,
+ });
+ expect(limited.totalBackloggedKeys).toBe(2);
+ expect(limited.keys).toHaveLength(1);
+ expect(limited.keys[0]!.concurrencyKey).toBe("ck-a");
+
+ // Queues with no CK backlog return an empty breakdown.
+ const empty = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/other-task");
+ expect(empty).toEqual({ totalBackloggedKeys: 0, keys: [] });
+ } finally {
+ await queue.quit();
+ }
+ }
+ );
});
diff --git a/internal-packages/run-engine/src/run-queue/types.ts b/internal-packages/run-engine/src/run-queue/types.ts
index 0905f3971de..8a7d3c93ec5 100644
--- a/internal-packages/run-engine/src/run-queue/types.ts
+++ b/internal-packages/run-engine/src/run-queue/types.ts
@@ -13,6 +13,9 @@ export const InputPayload = z.object({
queue: z.string(),
concurrencyKey: z.string().optional(),
timestamp: z.number(),
+ // Unix ms the run became eligible (delayUntil if set, else triggered-at), pre-priority.
+ // Dequeue scheduling delay = dequeueTime - eligibleAtMs. Optional for old-payload compat.
+ eligibleAtMs: z.number().optional(),
attempt: z.number(),
/** TTL expiration timestamp (unix ms). If set, run will be expired when this time is reached. */
ttlExpiresAt: z.number().optional(),
diff --git a/internal-packages/tsql/src/index.test.ts b/internal-packages/tsql/src/index.test.ts
index f9aca2f236d..ce358e6ac08 100644
--- a/internal-packages/tsql/src/index.test.ts
+++ b/internal-packages/tsql/src/index.test.ts
@@ -231,6 +231,26 @@ describe("injectFallbackConditions", () => {
expect(modified.where.expression_type).toBe("and");
}
});
+
+ it("should inject into a FROM subquery, where the fallback column's table lives", () => {
+ const ast = parseTSQLSelect(
+ "SELECT t, sum(total) AS total FROM (SELECT time AS t, status, count(*) AS total FROM task_runs GROUP BY t, status) GROUP BY t"
+ );
+ const fallbacks: Record = {
+ time: { op: "gte", value: "2024-01-01" },
+ };
+
+ const modified = injectFallbackConditions(ast, fallbacks);
+ expect(modified.expression_type).toBe("select_query");
+ if (modified.expression_type === "select_query") {
+ expect(modified.where).toBeUndefined();
+ const inner = modified.select_from?.table;
+ expect(inner?.expression_type).toBe("select_query");
+ if (inner?.expression_type === "select_query") {
+ expect(isColumnReferencedInExpression(inner.where, "time")).toBe(true);
+ }
+ }
+ });
});
describe("compileTSQL with whereClauseFallback", () => {
diff --git a/internal-packages/tsql/src/index.ts b/internal-packages/tsql/src/index.ts
index 1d8759c108c..1ebd1a60a5d 100644
--- a/internal-packages/tsql/src/index.ts
+++ b/internal-packages/tsql/src/index.ts
@@ -429,6 +429,24 @@ export function injectFallbackConditions(
// Handle SelectQuery
const selectQuery = ast as SelectQuery;
+
+ // When the FROM is a subquery, the fallback columns belong to the inner query's
+ // table, not this level; descend so e.g. a time fallback lands next to the table ref.
+ const fromTable = selectQuery.select_from?.table;
+ if (
+ fromTable &&
+ (fromTable.expression_type === "select_query" ||
+ fromTable.expression_type === "select_set_query")
+ ) {
+ return {
+ ...selectQuery,
+ select_from: {
+ ...selectQuery.select_from!,
+ table: injectFallbackConditions(fromTable, fallbacks) as SelectQuery | SelectSetQuery,
+ },
+ };
+ }
+
const existingWhere = selectQuery.where;
// Collect fallback expressions for columns not already in WHERE
@@ -541,6 +559,12 @@ export interface CompileTSQLOptions {
* ```
*/
timeRange?: TimeRange;
+ /**
+ * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query.
+ * Counters zero-fill, gauges (columns with `fillMode: "carry"`) carry forward.
+ * Off by default; output is unchanged when not set.
+ */
+ fillGaps?: boolean;
}
/**
@@ -599,6 +623,7 @@ export function compileTSQL(query: string, options: CompileTSQLOptions): PrintRe
fieldMappings: options.fieldMappings,
enforcedWhereClause,
timeRange: options.timeRange,
+ fillGaps: options.fillGaps,
});
// 6. Print the AST to ClickHouse SQL (enforced conditions applied at printer level)
diff --git a/internal-packages/tsql/src/query/functions.ts b/internal-packages/tsql/src/query/functions.ts
index 2f2b9278454..a6dadf0f609 100644
--- a/internal-packages/tsql/src/query/functions.ts
+++ b/internal-packages/tsql/src/query/functions.ts
@@ -645,11 +645,24 @@ export const TSQL_AGGREGATIONS: Record = {
maxParams: 1,
aggregate: true,
},
+ quantilesTDigestMerge: {
+ clickhouseName: "quantilesTDigestMerge",
+ minArgs: 1,
+ maxArgs: 1,
+ minParams: 1,
+ aggregate: true,
+ },
sumMerge: { clickhouseName: "sumMerge", minArgs: 1, maxArgs: 1, aggregate: true },
avgMerge: { clickhouseName: "avgMerge", minArgs: 1, maxArgs: 1, aggregate: true },
countMerge: { clickhouseName: "countMerge", minArgs: 1, maxArgs: 1, aggregate: true },
minMerge: { clickhouseName: "minMerge", minArgs: 1, maxArgs: 1, aggregate: true },
maxMerge: { clickhouseName: "maxMerge", minArgs: 1, maxArgs: 1, aggregate: true },
+ deltaSumTimestampMerge: {
+ clickhouseName: "deltaSumTimestampMerge",
+ minArgs: 1,
+ maxArgs: 1,
+ aggregate: true,
+ },
// Statistical functions
simpleLinearRegression: {
diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts
index 0efa0d34fc4..dbc14818cae 100644
--- a/internal-packages/tsql/src/query/printer.test.ts
+++ b/internal-packages/tsql/src/query/printer.test.ts
@@ -3831,3 +3831,388 @@ describe("timeBucket()", () => {
});
});
});
+
+// ============================================================
+// fillGaps Tests
+// ============================================================
+
+describe("timeBucket() fillGaps", () => {
+ // Schema with a gauge column (fillMode: "carry"), a counter, and a groupable dim.
+ const metricsSchema: TableSchema = {
+ name: "metrics",
+ clickhouseName: "trigger_dev.queue_metrics_v1",
+ timeConstraint: "bucket_at",
+ columns: {
+ bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") },
+ queue_name: { name: "queue_name", ...column("String") },
+ max_running: { name: "max_running", ...column("UInt64"), fillMode: "carry" },
+ enqueued: { name: "enqueued", ...column("UInt64"), fillMode: "zero" },
+ organization_id: { name: "organization_id", ...column("String") },
+ project_id: { name: "project_id", ...column("String") },
+ environment_id: { name: "environment_id", ...column("String") },
+ },
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ };
+
+ // 7-day range -> 6 HOUR buckets (same as the timeBucket() block).
+ const sevenDayRange = {
+ from: new Date("2024-01-01T00:00:00Z"),
+ to: new Date("2024-01-08T00:00:00Z"),
+ };
+
+ function ctx(fillGaps: boolean): PrinterContext {
+ return createPrinterContext({
+ schema: createSchemaRegistry([metricsSchema]),
+ enforcedWhereClause: {
+ organization_id: { op: "eq", value: "org_test123" },
+ project_id: { op: "eq", value: "proj_test456" },
+ environment_id: { op: "eq", value: "env_test789" },
+ },
+ timeRange: sevenDayRange,
+ fillGaps,
+ });
+ }
+
+ function run(query: string, fillGaps: boolean) {
+ const context = ctx(fillGaps);
+ const result = printToClickHouse(parseTSQLSelect(query), context);
+ return { ...result, warnings: context.warnings };
+ }
+
+ it("emits no WITH FILL when fillGaps is off (unchanged)", () => {
+ const query =
+ "SELECT timeBucket(), max(max_running), count() FROM metrics GROUP BY timeBucket ORDER BY timeBucket";
+ const { sql } = run(query, false);
+ expect(sql).not.toContain("WITH FILL");
+ expect(sql).not.toContain("INTERPOLATE");
+ });
+
+ it("single-series gauge + counter: WITH FILL plus INTERPOLATE for the gauge only", () => {
+ const query =
+ "SELECT timeBucket(), max(max_running) AS max_running, count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket";
+ const { sql, params } = run(query, true);
+
+ // STEP matches the 6 HOUR bucket interval, FROM/TO snapped + parameterized.
+ expect(sql).toContain("WITH FILL FROM toStartOfInterval({");
+ expect(sql).toContain("STEP INTERVAL 6 HOUR");
+ expect(sql).toMatch(/TO toStartOfInterval\(\{[^}]+: DateTime64\(6\)\}, INTERVAL 6 HOUR\)/);
+
+ // Gauge carried forward; counter omitted (defaults to 0).
+ expect(sql).toContain("INTERPOLATE (max_running AS max_running)");
+ expect(sql).not.toContain("runs AS runs");
+
+ // FROM/TO bounds are real parameters carrying the time range.
+ const dateParams = Object.values(params).filter((v) => v instanceof Date);
+ expect(dateParams).toContainEqual(sevenDayRange.from);
+ expect(dateParams).toContainEqual(sevenDayRange.to);
+ });
+
+ it("single-series counter only: WITH FILL but no INTERPOLATE", () => {
+ const query =
+ "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket";
+ const { sql } = run(query, true);
+ expect(sql).toContain("WITH FILL FROM toStartOfInterval({");
+ expect(sql).toContain("STEP INTERVAL 6 HOUR");
+ expect(sql).not.toContain("INTERPOLATE");
+ });
+
+ it("grouped counter only: group dim first, then WITH FILL, no INTERPOLATE", () => {
+ const query =
+ "SELECT timeBucket(), queue_name, count() AS runs FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket";
+ const { sql } = run(query, true);
+ expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/);
+ expect(sql).toContain("STEP INTERVAL 6 HOUR");
+ expect(sql).not.toContain("INTERPOLATE");
+ });
+
+ it("grouped + carry gauge: per-group LOCF via window functions, no INTERPOLATE", () => {
+ const query =
+ "SELECT timeBucket(), queue_name, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket";
+ const { sql, warnings } = run(query, true);
+
+ // Inner query densifies per group (dims first, then the bucket WITH FILL) + sentinel.
+ expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/);
+ expect(sql).toContain("STEP INTERVAL 6 HOUR");
+ expect(sql).toContain("1 AS __tsql_present");
+
+ // Block id increments at each real row, partitioned by the group dim.
+ expect(sql).toContain(
+ "sum(__tsql_present) OVER (PARTITION BY queue_name ORDER BY timebucket ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS __tsql_block"
+ );
+
+ // Gauge carried within each (group, block); never INTERPOLATE (which bleeds across groups).
+ expect(sql).toContain(
+ "max(if(__tsql_present = 1, max_running, NULL)) OVER (PARTITION BY queue_name, __tsql_block) AS max_running"
+ );
+ expect(sql).not.toContain("INTERPOLATE");
+
+ // Final result re-ordered by the user's ORDER BY, and not skipped.
+ expect(sql).toMatch(/\)\s*ORDER BY timebucket ASC$/);
+ expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(false);
+ });
+
+ it("grouped + carry gauge with a non-plain group dim: fill is skipped", () => {
+ const query =
+ "SELECT timeBucket(), upper(queue_name) AS q, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, upper(queue_name) ORDER BY timeBucket";
+ const { sql, warnings } = run(query, true);
+ expect(sql).not.toContain("WITH FILL");
+ expect(sql).not.toContain("__tsql_block");
+ expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(true);
+ });
+
+ it("user ORDER BY not led by timeBucket: fill is skipped", () => {
+ const query =
+ "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY runs DESC";
+ const { sql } = run(query, true);
+ expect(sql).not.toContain("WITH FILL");
+ expect(sql).not.toContain("INTERPOLATE");
+ });
+
+ it("bucket-led ORDER BY DESC: fill is skipped (ascending fill would be invalid)", () => {
+ const query =
+ "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket DESC";
+ const { sql } = run(query, true);
+ expect(sql).not.toContain("WITH FILL");
+ expect(sql).not.toContain("INTERPOLATE");
+ // The plain descending order still stands.
+ expect(sql).toContain("ORDER BY timebucket DESC");
+ });
+});
+
+describe("cross-queue counter totals via subquery (env-wide throughput shape)", () => {
+ // deltaSumTimestamp states must merge per queue, then sum outside; this is the
+ // supported shape for env-wide totals.
+ const metricsSchema: TableSchema = {
+ name: "metrics",
+ clickhouseName: "trigger_dev.queue_metrics_v1",
+ timeConstraint: "bucket_at",
+ columns: {
+ bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") },
+ queue_name: { name: "queue_name", ...column("String") },
+ started_delta: {
+ name: "started_delta",
+ mergeGroupKey: "queue_name",
+ ...column("String"),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ organization_id: { name: "organization_id", ...column("String") },
+ project_id: { name: "project_id", ...column("String") },
+ environment_id: { name: "environment_id", ...column("String") },
+ },
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ };
+
+ function runSubquery(query: string) {
+ const context = createPrinterContext({
+ schema: createSchemaRegistry([metricsSchema]),
+ enforcedWhereClause: {
+ organization_id: { op: "eq", value: "org_test123" },
+ },
+ timeRange: {
+ from: new Date("2024-01-01T00:00:00Z"),
+ to: new Date("2024-01-08T00:00:00Z"),
+ },
+ });
+ const result = printToClickHouse(parseTSQLSelect(query), context);
+ return { ...result, warnings: context.warnings };
+ }
+
+ it("compiles per-queue merge + outer sum, with tenant scoping inside the subquery", () => {
+ const { sql, params } = runSubquery(`
+ SELECT t, sum(started) AS started
+ FROM (
+ SELECT timeBucket() AS t, queue_name, deltaSumTimestampMerge(started_delta) AS started
+ FROM metrics
+ GROUP BY t, queue_name
+ )
+ GROUP BY t
+ ORDER BY t
+ `);
+
+ expect(sql).toContain("deltaSumTimestampMerge(started_delta)");
+ expect(sql).toContain("toStartOfInterval(created_at, INTERVAL 6 HOUR)");
+ const subqueryStart = sql.indexOf("FROM (");
+ const tenantFilter = sql.indexOf("organization_id");
+ expect(subqueryStart).toBeGreaterThan(-1);
+ expect(tenantFilter).toBeGreaterThan(subqueryStart);
+ expect(Object.values(params)).toContain("org_test123");
+ });
+});
+
+describe("mergeGroupKey validation", () => {
+ const metricsSchema: TableSchema = {
+ name: "metrics",
+ clickhouseName: "trigger_dev.queue_metrics_v1",
+ timeConstraint: "bucket_at",
+ columns: {
+ bucket_at: { name: "bucket_at", ...column("DateTime64") },
+ queue: { name: "queue", clickhouseName: "queue_name", ...column("String") },
+ started_delta: {
+ name: "started_delta",
+ mergeGroupKey: "queue",
+ ...column("String"),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ organization_id: { name: "organization_id", ...column("String") },
+ project_id: { name: "project_id", ...column("String") },
+ environment_id: { name: "environment_id", ...column("String") },
+ },
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ };
+
+ function compile(
+ query: string,
+ enforced: Record = { organization_id: { op: "eq", value: "org_x" } }
+ ) {
+ const context = createPrinterContext({
+ schema: createSchemaRegistry([metricsSchema]),
+ enforcedWhereClause: enforced as never,
+ timeRange: {
+ from: new Date("2024-01-01T00:00:00Z"),
+ to: new Date("2024-01-08T00:00:00Z"),
+ },
+ });
+ return printToClickHouse(parseTSQLSelect(query), context);
+ }
+
+ it("rejects an ungrouped, unpinned merge with an actionable message", () => {
+ expect(() =>
+ compile(
+ "SELECT timeBucket() AS t, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t"
+ )
+ ).toThrowError(
+ /Merging 'started_delta' across every queue[\s\S]*GROUP BY queue\)[\s\S]*WHERE queue = 'my-queue'[\s\S]*inner GROUP BY t, queue and outer GROUP BY t/
+ );
+ });
+
+ it("allows the merge when queue is in the GROUP BY", () => {
+ const { sql } = compile(
+ "SELECT timeBucket() AS t, queue, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t, queue"
+ );
+ expect(sql).toContain("deltaSumTimestampMerge(started_delta)");
+ });
+
+ it("allows the merge when queue is pinned by an equality filter", () => {
+ const { sql } = compile(
+ "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics WHERE queue = 'emails'"
+ );
+ expect(sql).toContain("deltaSumTimestampMerge(started_delta)");
+ });
+
+ it("allows the merge when the enforced clause pins queue to one value", () => {
+ const { sql } = compile(
+ "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics",
+ { organization_id: { op: "eq", value: "org_x" }, queue: { op: "in", values: ["emails"] } }
+ );
+ expect(sql).toContain("deltaSumTimestampMerge(started_delta)");
+ });
+
+ it("rejects the merge when the enforced clause spans several queues", () => {
+ expect(() =>
+ compile("SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics", {
+ organization_id: { op: "eq", value: "org_x" },
+ queue: { op: "in", values: ["emails", "webhooks"] },
+ })
+ ).toThrowError(/only combine correctly within one queue/);
+ });
+
+ it("allows a grouped inner merge summed by the outer query", () => {
+ const { sql } = compile(
+ "SELECT t, sum(started) AS started FROM (SELECT timeBucket() AS t, queue, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t, queue) GROUP BY t ORDER BY t"
+ );
+ expect(sql).toContain("GROUP BY t, queue_name");
+ });
+
+ it("rejects an ungrouped merge inside a subquery", () => {
+ expect(() =>
+ compile(
+ "SELECT t, sum(started) AS started FROM (SELECT timeBucket() AS t, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t) GROUP BY t"
+ )
+ ).toThrowError(/only combine correctly within one queue/);
+ });
+});
+
+describe("compound mergeGroupKey validation", () => {
+ const byKeySchema: TableSchema = {
+ name: "metrics_by_key",
+ clickhouseName: "trigger_dev.queue_metrics_ck_v1",
+ timeConstraint: "bucket_at",
+ columns: {
+ bucket_at: { name: "bucket_at", ...column("DateTime64") },
+ queue: { name: "queue", clickhouseName: "queue_name", ...column("String") },
+ concurrency_key: { name: "concurrency_key", ...column("String") },
+ started_delta: {
+ name: "started_delta",
+ mergeGroupKey: ["queue", "concurrency_key"],
+ ...column("String"),
+ groupable: false,
+ sortable: false,
+ filterable: false,
+ },
+ organization_id: { name: "organization_id", ...column("String") },
+ project_id: { name: "project_id", ...column("String") },
+ environment_id: { name: "environment_id", ...column("String") },
+ },
+ tenantColumns: {
+ organizationId: "organization_id",
+ projectId: "project_id",
+ environmentId: "environment_id",
+ },
+ };
+
+ function compile(query: string) {
+ const context = createPrinterContext({
+ schema: createSchemaRegistry([byKeySchema]),
+ enforcedWhereClause: { organization_id: { op: "eq", value: "org_x" } } as never,
+ timeRange: {
+ from: new Date("2024-01-01T00:00:00Z"),
+ to: new Date("2024-01-08T00:00:00Z"),
+ },
+ });
+ return printToClickHouse(parseTSQLSelect(query), context);
+ }
+
+ it("requires EVERY listed key grouped or pinned", () => {
+ expect(() =>
+ compile(
+ "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails'"
+ )
+ ).toThrowError(/only combine correctly within one concurrency_key/);
+ expect(() =>
+ compile(
+ "SELECT concurrency_key, deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key GROUP BY concurrency_key"
+ )
+ ).toThrowError(/only combine correctly within one queue/);
+ });
+
+ it("allows pin + group combinations covering both keys", () => {
+ const grouped = compile(
+ "SELECT concurrency_key, deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails' GROUP BY concurrency_key"
+ );
+ expect(grouped.sql).toContain("deltaSumTimestampMerge(started_delta)");
+ const pinned = compile(
+ "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails' AND concurrency_key = 't1'"
+ );
+ expect(pinned.sql).toContain("deltaSumTimestampMerge(started_delta)");
+ const bothGrouped = compile(
+ "SELECT queue, concurrency_key, deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key GROUP BY queue, concurrency_key"
+ );
+ expect(bothGrouped.sql).toContain("GROUP BY queue_name, concurrency_key");
+ });
+});
diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts
index 82d97f5491b..3ee9a0ab76a 100644
--- a/internal-packages/tsql/src/query/printer.ts
+++ b/internal-packages/tsql/src/query/printer.ts
@@ -385,6 +385,8 @@ export class ClickHousePrinter {
nextJoin = nextJoin.next_join;
}
+ this.validateMergeScopedColumns(node);
+
// Extract SELECT column aliases BEFORE visiting columns
// This allows ORDER BY/HAVING to reference aliased columns
const savedAliases = this.selectAliases;
@@ -459,6 +461,25 @@ export class ClickHousePrinter {
this.inProjectionContext = false;
}
+ // Opt-in gap-fill: emit rows for empty time buckets via WITH FILL / INTERPOLATE.
+ // No-op unless enabled, top-level, and the query is fill-eligible.
+ let interpolateClause: string | null = null;
+ let groupedFillWrap: ((inner: string) => string) | null = null;
+ if (this.context.fillGaps && isTopLevelQuery) {
+ const fill = this.buildGapFill(node, orderBy, groupBy);
+ if (fill) {
+ orderBy = fill.orderBy;
+ if (fill.kind === "inline") {
+ interpolateClause = fill.interpolate;
+ } else {
+ // Grouped per-group LOCF: add the `present` sentinel to this (now inner) query
+ // and wrap the rendered SQL in the block-id + carry window layers below.
+ columns.push(fill.presentColumn);
+ groupedFillWrap = fill.wrap;
+ }
+ }
+ }
+
// Process ARRAY JOIN
let arrayJoin = "";
if (node.array_join_op) {
@@ -487,6 +508,8 @@ export class ClickHousePrinter {
having ? `HAVING${space}${having}` : null,
windowClause ? `WINDOW${space}${windowClause}` : null,
orderBy && orderBy.length > 0 ? `ORDER BY${space}${orderBy.join(comma)}` : null,
+ // INTERPOLATE must follow the full ORDER BY (including WITH FILL)
+ interpolateClause,
];
// Process LIMIT
@@ -549,6 +572,11 @@ export class ClickHousePrinter {
response = this.pretty ? `(${response.trim()})` : `(${response})`;
}
+ // Grouped per-group gap fill wraps this query in the block-id + carry window layers.
+ if (groupedFillWrap) {
+ response = groupedFillWrap(response);
+ }
+
// Restore saved contexts (for nested queries)
this.selectAliases = savedAliases;
this.queryHasGroupBy = savedQueryHasGroupBy;
@@ -559,6 +587,183 @@ export class ClickHousePrinter {
return response;
}
+ /**
+ * Build the gap-fill transformation for a top-level time-bucketed query. Returns
+ * null when the query is not fill-eligible (correct-by-construction: emit nothing
+ * extra rather than risk wrong values).
+ *
+ * Eligibility: a timeRange, a time-constraint table, exactly one timeBucket() column
+ * in SELECT, and an ORDER BY led by that column and not DESC. Ungrouped ("inline"):
+ * WITH FILL, gauges LOCF'd via INTERPOLATE, counters zero-fill. Grouped counters: WITH
+ * FILL ordered by the dims. Grouped gauges ("wrap"): per-group LOCF via window functions;
+ * skipped with a warning unless every GROUP BY dim is a plain column.
+ */
+ private buildGapFill(
+ node: SelectQuery,
+ orderBy: string[] | null,
+ groupBy: string[] | null
+ ):
+ | { kind: "inline"; orderBy: string[]; interpolate: string | null }
+ | { kind: "wrap"; orderBy: string[]; presentColumn: string; wrap: (inner: string) => string }
+ | null {
+ if (!orderBy || orderBy.length === 0 || !node.select || node.select.length === 0) {
+ return null;
+ }
+
+ const timeRange = this.context.timeRange;
+ if (!timeRange) {
+ return null;
+ }
+
+ // Need a time-constraint table to derive the bucket column + interval.
+ const tableWithConstraint = this.findTimeConstraintTable();
+ if (!tableWithConstraint) {
+ return null;
+ }
+ const { tableSchema, clickhouseColumnName } = tableWithConstraint;
+ const interval = calculateTimeBucketInterval(
+ timeRange.from,
+ timeRange.to,
+ tableSchema.timeBucketThresholds
+ );
+ const bucketSql = `toStartOfInterval(${escapeClickHouseIdentifier(clickhouseColumnName)}, INTERVAL ${interval.value} ${interval.unit})`;
+
+ // Find exactly one timeBucket() column in SELECT and its output alias.
+ let bucketAlias: string | null = null;
+ let bucketCount = 0;
+ for (const col of node.select) {
+ const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col;
+ if (
+ (inner as Call).expression_type === "call" &&
+ (inner as Call).name.toLowerCase() === "timebucket"
+ ) {
+ bucketCount++;
+ bucketAlias =
+ (col as Alias).expression_type === "alias" ? (col as Alias).alias : "timebucket";
+ }
+ }
+ if (bucketCount !== 1 || !bucketAlias) {
+ return null;
+ }
+
+ // ORDER BY must be led by the timeBucket column (alias or full expression).
+ // Don't fight a user ordering like `ORDER BY count DESC`.
+ const leadTerm = orderBy[0];
+ // Strip a trailing ASC/DESC direction without a regex: an unanchored `\s+` before the
+ // keyword backtracks polynomially across start positions on whitespace runs (CodeQL
+ // js/polynomial-redos). endsWith + slice is linear.
+ const trimmedLead = leadTerm.trim();
+ const upperLead = trimmedLead.toUpperCase();
+ const isDescending = upperLead.endsWith(" DESC");
+ const leadExpr = upperLead.endsWith(" ASC")
+ ? trimmedLead.slice(0, -4).trimEnd()
+ : isDescending
+ ? trimmedLead.slice(0, -5).trimEnd()
+ : trimmedLead;
+ const matchesBucket = (expr: string): boolean =>
+ expr.toLowerCase() === bucketAlias!.toLowerCase() || expr === bucketSql;
+ if (!matchesBucket(leadExpr)) {
+ return null;
+ }
+ // WITH FILL is emitted with ascending bounds and a positive STEP, which is
+ // only valid for an ascending bucket order. A descending order would need
+ // swapped bounds and a negative step (newer ClickHouse only), so skip the
+ // gap-fill rewrite and let the plain descending ORDER BY stand.
+ if (isDescending) {
+ return null;
+ }
+
+ // Group dims = GROUP BY expressions that are NOT the timeBucket column.
+ const groupDims = (groupBy ?? []).filter((g) => !matchesBucket(g.trim()));
+
+ // Classify each SELECT output column. Carry (gauge) columns survive through
+ // aliases + value-preserving aggregates (see analyzeSelectColumn). A bare column
+ // that isn't the bucket is a GROUP BY dimension; everything else is a counter or
+ // derived value that zero-fills.
+ const carryAliases: string[] = [];
+ const dimNames: string[] = [];
+ const orderedOutputs: Array<{ name: string; carry: boolean }> = [];
+ for (const col of node.select) {
+ const { outputName, sourceColumn } = this.analyzeSelectColumn(col);
+ if (!outputName) continue;
+ const carry = sourceColumn?.fillMode === "carry";
+ orderedOutputs.push({ name: outputName, carry });
+ if (carry) carryAliases.push(outputName);
+ const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col;
+ if (!matchesBucket(outputName) && (inner as Field).expression_type === "field") {
+ dimNames.push(outputName);
+ }
+ }
+
+ // Snap FROM/TO to the bucket grid and parameterize the bounds.
+ const fromBound = this.context.addValue(timeRange.from);
+ const toBound = this.context.addValue(timeRange.to);
+ const withFill =
+ `WITH FILL FROM toStartOfInterval(${fromBound}, INTERVAL ${interval.value} ${interval.unit})` +
+ ` TO toStartOfInterval(${toBound}, INTERVAL ${interval.value} ${interval.unit})` +
+ ` STEP INTERVAL ${interval.value} ${interval.unit}`;
+
+ const esc = escapeClickHouseIdentifier;
+
+ // Single series: WITH FILL on the bucket + INTERPOLATE the carry columns (LOCF);
+ // counters omitted from INTERPOLATE so they zero-fill.
+ if (groupDims.length === 0) {
+ const newOrderBy = [...orderBy];
+ newOrderBy[0] = `${leadTerm} ${withFill}`;
+ const interpolate =
+ carryAliases.length > 0
+ ? `INTERPOLATE (${carryAliases.map((a) => `${esc(a)} AS ${esc(a)}`).join(", ")})`
+ : null;
+ return { kind: "inline", orderBy: newOrderBy, interpolate };
+ }
+
+ // Grouped, counters only: per-group zero-fill via WITH FILL ordered by the dims.
+ if (carryAliases.length === 0) {
+ return {
+ kind: "inline",
+ orderBy: [...groupDims, `${leadTerm} ${withFill}`],
+ interpolate: null,
+ };
+ }
+
+ // Grouped + gauge: per-group LOCF. INTERPOLATE bleeds across groups, so densify per
+ // group (WITH FILL + a `present` sentinel that is 0 on filled rows), assign a block id
+ // that increments at each real row, then carry the block's real value via window max.
+ // Only safe when every GROUP BY dim is a plain column we can PARTITION BY.
+ if (dimNames.length !== groupDims.length) {
+ this.context.addWarning(
+ "fill_skipped_grouped_gauge",
+ "fillGaps was skipped: per-group gap fill needs every GROUP BY dimension to be a plain column."
+ );
+ return null;
+ }
+
+ const userOrderBy = [...orderBy];
+ const presentCol = "__tsql_present";
+ const blockCol = "__tsql_block";
+ const partitionDims = dimNames.map(esc).join(", ");
+ const blockExpr =
+ `sum(${esc(presentCol)}) OVER (PARTITION BY ${partitionDims} ORDER BY ${esc(bucketAlias)}` +
+ ` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ${esc(blockCol)}`;
+ const finalColumns = orderedOutputs.map(({ name, carry }) =>
+ carry
+ ? `max(if(${esc(presentCol)} = 1, ${esc(name)}, NULL)) OVER (PARTITION BY ${partitionDims}, ${esc(
+ blockCol
+ )}) AS ${esc(name)}`
+ : esc(name)
+ );
+ const finalOrderBy = userOrderBy.length > 0 ? ` ORDER BY ${userOrderBy.join(", ")}` : "";
+ const wrap = (inner: string): string =>
+ `SELECT ${finalColumns.join(", ")} FROM (SELECT *, ${blockExpr} FROM (${inner.trim()}))${finalOrderBy}`;
+
+ return {
+ kind: "wrap",
+ orderBy: [...dimNames.map(esc), `${leadTerm} ${withFill}`],
+ presentColumn: `1 AS ${esc(presentCol)}`,
+ wrap,
+ };
+ }
+
/**
* Extract column aliases from a SELECT expression.
* Handles explicit aliases (AS name) and implicit names from aggregations/functions.
@@ -1014,11 +1219,12 @@ export class ClickHousePrinter {
if ((firstArg as Field).expression_type === "field") {
const field = firstArg as Field;
const columnInfo = this.resolveFieldToColumn(field.chain);
- // Only propagate customRenderType, not the full column schema
- if (columnInfo.column?.customRenderType) {
+ // Propagate customRenderType and fillMode (gauge-ness), not the full column schema
+ if (columnInfo.column?.customRenderType || columnInfo.column?.fillMode) {
sourceColumn = {
type: inferredType,
customRenderType: columnInfo.column.customRenderType,
+ fillMode: columnInfo.column.fillMode,
};
}
}
@@ -1679,6 +1885,138 @@ export class ClickHousePrinter {
// Note: projectId and environmentId are optional - no validation needed
}
+ /**
+  * Guard against merging a scope-keyed aggregate state column (`mergeGroupKey`) across
+  * different values of its key, which silently returns wrong numbers.
+  *
+  * For every column of every table in scope that declares `mergeGroupKey` and is
+  * referenced by this SELECT, each key column must be covered in one of three ways:
+  * listed in GROUP BY, pinned by the query's WHERE, or pinned by the enforced clause.
+  * Runs per SELECT scope; subqueries validate themselves.
+  *
+  * @param node - The SELECT scope to validate.
+  * @throws QueryError for the first key that is not covered, with example rewrites.
+  */
+ private validateMergeScopedColumns(node: SelectQuery): void {
+   for (const table of this.tableContexts.values()) {
+     for (const col of Object.values(table.columns)) {
+       const scopeKeys = col.mergeGroupKey;
+       if (!scopeKeys) continue;
+       const requiredKeys = Array.isArray(scopeKeys) ? scopeKeys : [scopeKeys];
+       // Columns this SELECT never mentions cannot be merged by it.
+       if (!this.scopeReferencesColumn(node, col.name)) continue;
+       for (const key of requiredKeys) {
+         // Checks short-circuit in this order: GROUP BY, then WHERE, then enforced clause.
+         const covered =
+           this.groupByIncludesColumn(node, key) ||
+           this.wherePinsColumn(node.where, key) ||
+           this.enforcedPinsColumn(table, key);
+         if (covered) continue;
+         const message =
+           `Merging '${col.name}' across every ${key} returns wrong totals: its aggregate ` +
+           `states are kept per ${key} and only combine correctly within one ${key}. Either ` +
+           `add '${key}' to the GROUP BY and sum the per-${key} results in an outer query, ` +
+           `for example: SELECT sum(v) AS total FROM (SELECT ${key}, ` +
+           `deltaSumTimestampMerge(${col.name}) AS v FROM ${table.name} ` +
+           `GROUP BY ${key}). Or filter to a single ${key}, for example: ` +
+           `WHERE ${key} = 'my-${key}'. For a time series, bucket both layers: ` +
+           `inner GROUP BY t, ${key} and outer GROUP BY t.`;
+         throw new QueryError(message);
+       }
+     }
+   }
+ }
+
+ /**
+  * Check whether any clause of this SELECT (select list, PREWHERE, WHERE, GROUP BY,
+  * HAVING, ORDER BY) mentions a column called `name`.
+  *
+  * @param node - The SELECT scope whose clauses are searched.
+  * @param name - Column name to look for.
+  * @returns true as soon as one clause references the column.
+  */
+ private scopeReferencesColumn(node: SelectQuery, name: string): boolean {
+   const clauses: unknown[] = [
+     node.select,
+     node.prewhere,
+     node.where,
+     node.group_by,
+     node.having,
+     node.order_by,
+   ];
+   for (const clause of clauses) {
+     // Each clause is searched with its own fresh visited-set.
+     if (this.expressionReferencesColumn(clause, name)) return true;
+   }
+   return false;
+ }
+
+ /**
+  * Recursively search an AST fragment for a field whose last chain segment is `name`.
+  *
+  * Arrays are searched element by element and objects property by property, skipping
+  * the `type` and `parent` properties. Nested `select_query` / `select_set_query` nodes
+  * are not descended into. Objects already visited (tracked in `seen`) are skipped, so
+  * cyclic references terminate.
+  *
+  * @param expr - Any value; non-objects and null never match.
+  * @param name - Column name to look for.
+  * @param seen - Visited-object set shared across the recursion.
+  * @returns true when a matching field node is found.
+  */
+ private expressionReferencesColumn(
+   expr: unknown,
+   name: string,
+   seen = new WeakSet()
+ ): boolean {
+   if (typeof expr !== "object" || expr === null) return false;
+   if (seen.has(expr)) return false;
+   seen.add(expr);
+   if (Array.isArray(expr)) {
+     for (const item of expr) {
+       if (this.expressionReferencesColumn(item, name, seen)) return true;
+     }
+     return false;
+   }
+   const astNode = expr as { expression_type?: string; chain?: unknown[] };
+   const kind = astNode.expression_type;
+   // Subqueries are a separate scope and are validated on their own.
+   if (kind === "select_query" || kind === "select_set_query") {
+     return false;
+   }
+   if (kind === "field" && Array.isArray(astNode.chain)) {
+     if (astNode.chain[astNode.chain.length - 1] === name) return true;
+   }
+   for (const [property, value] of Object.entries(expr)) {
+     if (property === "type" || property === "parent") continue;
+     if (this.expressionReferencesColumn(value, name, seen)) return true;
+   }
+   return false;
+ }
+
+ /**
+  * Check whether GROUP BY lists `name` as a plain field. Only direct field expressions
+  * count; a field wrapped in any other expression does not.
+  *
+  * @param node - The SELECT scope whose GROUP BY is inspected.
+  * @param name - Column name, matched against the last segment of the field chain.
+  * @returns true when some GROUP BY entry is a field ending in `name`.
+  */
+ private groupByIncludesColumn(node: SelectQuery, name: string): boolean {
+   const namesColumn = (expr: unknown): boolean => {
+     const candidate = expr as Field;
+     if (candidate.expression_type !== "field") return false;
+     if (!Array.isArray(candidate.chain)) return false;
+     return candidate.chain[candidate.chain.length - 1] === name;
+   };
+   const groupExprs = node.group_by ?? [];
+   return groupExprs.some(namesColumn);
+ }
+
+ /**
+  * Report whether `where` restricts column `name` to a single value.
+  *
+  * Pins only count on the top-level AND chain: a pin inside an OR (or any other
+  * expression kind) guarantees nothing. A conjunct counts as a pin when it is
+  * - an equality between the column and an operand that is not itself a column
+  *   reference (either operand order), or
+  * - `IN` / `GLOBAL IN` between the column and a constant or a one-element tuple.
+  *
+  * Column-to-column equality such as `name = name` or `a.name = b.name` is not a pin:
+  * it does not restrict the column to one value, so accepting it would let a
+  * cross-key merge through.
+  *
+  * @param where - WHERE expression of the SELECT scope, if any.
+  * @param name - Column name, matched against the last segment of each field chain.
+  * @returns true when some top-level conjunct pins the column.
+  */
+ private wherePinsColumn(where: Expression | undefined, name: string): boolean {
+   if (!where) return false;
+   if (where.expression_type === "and") {
+     return (where as And).exprs.some((expr) => this.wherePinsColumn(expr, name));
+   }
+   if (where.expression_type !== "compare_operation") return false;
+   const cmp = where as CompareOperation;
+   const isKeyField = (side: Expression) => {
+     const field = side as Field;
+     return (
+       field.expression_type === "field" &&
+       Array.isArray(field.chain) &&
+       field.chain[field.chain.length - 1] === name
+     );
+   };
+   const fieldSide = [cmp.left, cmp.right].find(isKeyField);
+   if (!fieldSide) return false;
+   const other = fieldSide === cmp.left ? cmp.right : cmp.left;
+   if (cmp.op === CompareOperationOp.Eq) {
+     // Equality against another column (including itself) pins nothing.
+     return (other as Field).expression_type !== "field";
+   }
+   if (cmp.op === CompareOperationOp.In || cmp.op === CompareOperationOp.GlobalIn) {
+     if ((other as Constant).expression_type === "constant") return true;
+     const tuple = other as Tuple;
+     return tuple.expression_type === "tuple" && tuple.exprs.length === 1;
+   }
+   return false;
+ }
+
+ /**
+  * Check whether the enforced WHERE clause pins `key` to a single value.
+  *
+  * The enforced clause is looked up under the column's own name and, when the schema
+  * declares one, under its `clickhouseName`. An entry pins the column when its `op` is
+  * `"eq"`, or `"in"` with exactly one value.
+  *
+  * @param tableSchema - Schema of the table that owns the key column.
+  * @param key - Name of the key column.
+  * @returns true when an enforced condition pins the column.
+  */
+ private enforcedPinsColumn(tableSchema: TableSchema, key: string): boolean {
+   const physicalName = tableSchema.columns[key]?.clickhouseName;
+   const lookupNames = physicalName ? [key, physicalName] : [key];
+   return lookupNames.some((lookupName) => {
+     const enforced = this.context.enforcedWhereClause[lookupName] as
+       | { op?: string; values?: unknown[] }
+       | undefined;
+     if (!enforced) return false;
+     return (
+       enforced.op === "eq" || (enforced.op === "in" && enforced.values?.length === 1)
+     );
+   });
+ }
+
/**
* Format a Date as a ClickHouse-compatible DateTime64 string.
* ClickHouse expects format: 'YYYY-MM-DD HH:MM:SS.mmm' (in UTC)
diff --git a/internal-packages/tsql/src/query/printer_context.ts b/internal-packages/tsql/src/query/printer_context.ts
index d0fb41b5327..a964e2e04af 100644
--- a/internal-packages/tsql/src/query/printer_context.ts
+++ b/internal-packages/tsql/src/query/printer_context.ts
@@ -125,6 +125,9 @@ export class PrinterContext {
*/
readonly timeRange?: TimeRange;
+ /** When true, time-bucketed queries emit rows for empty buckets (opt-in). */
+ readonly fillGaps?: boolean;
+
constructor(
/** Schema registry containing allowed tables and columns */
public readonly schema: SchemaRegistry,
@@ -138,13 +141,16 @@ export class PrinterContext {
*/
enforcedWhereClause: Record = {},
/** Time range for timeBucket() interval calculation */
- timeRange?: TimeRange
+ timeRange?: TimeRange,
+ /** Opt-in gap-fill for time-bucketed queries */
+ fillGaps?: boolean
) {
// Initialize with default settings
this.settings = { ...DEFAULT_QUERY_SETTINGS, ...settings };
this.fieldMappings = fieldMappings;
this.enforcedWhereClause = enforcedWhereClause;
this.timeRange = timeRange;
+ this.fillGaps = fillGaps;
}
/**
@@ -225,7 +231,8 @@ export class PrinterContext {
this.settings,
this.fieldMappings,
this.enforcedWhereClause,
- this.timeRange
+ this.timeRange,
+ this.fillGaps
);
// Share the same values map so parameters are unified
child.values = this.values;
@@ -277,6 +284,8 @@ export interface PrinterContextOptions {
* When provided, `timeBucket()` uses this to determine the appropriate bucket size.
*/
timeRange?: TimeRange;
+ /** When true, time-bucketed queries emit rows for empty buckets (opt-in). */
+ fillGaps?: boolean;
}
/**
@@ -288,6 +297,7 @@ export function createPrinterContext(options: PrinterContextOptions): PrinterCon
options.settings,
options.fieldMappings,
options.enforcedWhereClause,
- options.timeRange
+ options.timeRange,
+ options.fillGaps
);
}
diff --git a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts
index 9a1e2d2ddfe..a32b8ea142c 100644
--- a/internal-packages/tsql/src/query/schema.ts
+++ b/internal-packages/tsql/src/query/schema.ts
@@ -122,6 +122,18 @@ export interface ColumnSchema {
* ```
*/
customRenderType?: string;
+ /**
+ * Gap-fill behavior when the opt-in `fillGaps` feature emits rows for empty
+ * time buckets: `"carry"` = gauge (LOCF via INTERPOLATE), `"zero"` (default)
+ * = counter (missing buckets get 0).
+ */
+ fillMode?: "zero" | "carry";
+ /**
+ * Aggregate-state column whose states only merge correctly within one value of the
+ * named column(s) (e.g. per-queue counter states). Queries referencing it must GROUP BY
+ * every listed column or pin each to a single value; other shapes fail to compile.
+ */
+ mergeGroupKey?: string | string[];
/**
* Example value for documentation purposes.
*
@@ -409,6 +421,21 @@ export interface TableSchema {
* is needed to get correct results. Not needed for plain MergeTree tables.
*/
useFinal?: boolean;
+ /**
+ * Coarser physical rollups with an identical logical schema, substituted by callers
+ * (not the printer) when the timeBucket() interval is at least minIntervalSeconds.
+ */
+ rollups?: Array<{ minIntervalSeconds: number; clickhouseName: string }>;
+ /**
+ * Opt into the ClickHouse query cache; callers align time bounds to alignSeconds
+ * so repeated auto-refresh queries share cache entries.
+ */
+ queryCache?: { ttlSeconds: number; alignSeconds: number };
+ /**
+ * Excluded from user-facing listings (query editor, schema docs, schema API) by
+ * callers; the engine still compiles queries against it.
+ */
+ hidden?: boolean;
}
/**
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 1a56a054f42..a49afc04da5 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -365,6 +365,9 @@ importers:
'@internal/llm-model-catalog':
specifier: workspace:*
version: link:../../internal-packages/llm-model-catalog
+ '@internal/metrics-pipeline':
+ specifier: workspace:*
+ version: link:../../internal-packages/metrics-pipeline
'@internal/redis':
specifier: workspace:*
version: link:../../internal-packages/redis
@@ -1255,6 +1258,25 @@ importers:
specifier: 4.1.7
version: 4.1.7(@opentelemetry/api@1.9.1)(@types/node@22.20.0)(@vitest/coverage-v8@4.1.7)(vite@6.4.2(@types/node@22.20.0)(jiti@2.6.1)(lightningcss@1.29.2)(terser@5.46.1)(tsx@4.22.4)(yaml@2.9.0))
+ internal-packages/metrics-pipeline:
+ dependencies:
+ '@internal/redis':
+ specifier: workspace:*
+ version: link:../redis
+ '@internal/tracing':
+ specifier: workspace:*
+ version: link:../tracing
+ '@trigger.dev/core':
+ specifier: workspace:*
+ version: link:../../packages/core
+ devDependencies:
+ '@internal/testcontainers':
+ specifier: workspace:*
+ version: link:../testcontainers
+ rimraf:
+ specifier: 6.0.1
+ version: 6.0.1
+
internal-packages/otlp-importer:
dependencies:
long:
@@ -1335,6 +1357,9 @@ importers:
'@internal/cache':
specifier: workspace:*
version: link:../cache
+ '@internal/metrics-pipeline':
+ specifier: workspace:*
+ version: link:../metrics-pipeline
'@internal/redis':
specifier: workspace:*
version: link:../redis