diff --git a/.server-changes/queue-metrics-dashboard.md b/.server-changes/queue-metrics-dashboard.md new file mode 100644 index 00000000000..37baffc7aaa --- /dev/null +++ b/.server-changes/queue-metrics-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Queue metrics and health on the Queues page: per-queue depth, throughput, concurrency, throttling, and scheduling-delay charts, plus a per-queue detail view. Off by default; enabled per organization. diff --git a/apps/webapp/app/components/primitives/UsageSparkline.tsx b/apps/webapp/app/components/primitives/UsageSparkline.tsx index 2ffc1936a1d..7c4bbd5d262 100644 --- a/apps/webapp/app/components/primitives/UsageSparkline.tsx +++ b/apps/webapp/app/components/primitives/UsageSparkline.tsx @@ -27,6 +27,8 @@ export type UsageSparklineProps = { color?: string; /** Unit shown in the tooltip (e.g. calls, tokens). */ unitLabel?: UnitLabel; + /** Trailing scalar shown after the chart. Defaults to the sum of buckets (override for gauges, e.g. peak). */ + total?: number; /** Format the trailing total. Defaults to `toLocaleString`. */ formatTotal?: (total: number) => string; /** Class for the trailing total label. */ @@ -44,14 +46,16 @@ export function UsageSparkline({ bucketIntervalMs, color = "#3B82F6", unitLabel = { singular: "call", plural: "calls" }, + total: totalOverride, formatTotal, totalClassName = "text-blue-400", }: UsageSparklineProps) { - if (!data || data.every((v) => v === 0)) { + const hasTotalOverride = totalOverride !== undefined; + if (!data || data.length === 0 || (data.every((v) => v === 0) && !hasTotalOverride)) { return ; } - const total = data.reduce((a, b) => a + b, 0); + const total = totalOverride ?? 
data.reduce((a, b) => a + b, 0); const max = Math.max(...data); // Map each bucket to a dated point so the tooltip can show the window it diff --git a/apps/webapp/app/components/primitives/charts/Chart.tsx b/apps/webapp/app/components/primitives/charts/Chart.tsx index 57a2692e677..8894c2da34d 100644 --- a/apps/webapp/app/components/primitives/charts/Chart.tsx +++ b/apps/webapp/app/components/primitives/charts/Chart.tsx @@ -216,7 +216,7 @@ const ChartTooltipContent = React.forwardRef< )}
diff --git a/apps/webapp/app/components/primitives/charts/ChartLine.tsx b/apps/webapp/app/components/primitives/charts/ChartLine.tsx index 1edd5a2357e..5d5fb95ecce 100644 --- a/apps/webapp/app/components/primitives/charts/ChartLine.tsx +++ b/apps/webapp/app/components/primitives/charts/ChartLine.tsx @@ -4,6 +4,7 @@ import { CartesianGrid, Line, LineChart, + ReferenceLine, XAxis, YAxis, type XAxisProps, @@ -48,12 +49,38 @@ export type ChartLineRendererProps = { tooltipLabelFormatter?: (label: string, payload: any[]) => string; /** Optional formatter for numeric tooltip values (e.g. bytes, duration) */ tooltipValueFormatter?: (value: number) => string; + /** Draw a dot at each data point. Defaults to true; turn off for dense/compact charts. */ + showDots?: boolean; + /** Horizontal reference lines (e.g. limits); the y-domain extends to include them. */ + referenceLines?: Array<{ y: number; label?: string; color?: string }>; /** Width injected by ResponsiveContainer */ width?: number; /** Height injected by ResponsiveContainer */ height?: number; }; +/** Reference-line label: right-aligned just below the line (recharts injects viewBox). */ +function ReferenceLineLabel({ + viewBox, + value, +}: { + viewBox?: { x: number; y: number; width: number }; + value: string; +}) { + if (!viewBox) return null; + return ( + + {value} + + ); +} + /** * Line chart renderer for the compound component system. * Must be used within a Chart.Root. 
@@ -73,6 +100,8 @@ export function ChartLineRenderer({ stacked = false, tooltipLabelFormatter, tooltipValueFormatter, + showDots = true, + referenceLines, width, height, }: ChartLineRendererProps) { @@ -176,6 +205,17 @@ export function ChartLineRenderer({ labelFormatter={tooltipLabelFormatter} /> {/* Note: Legend is now rendered by ChartRoot outside the chart container */} + {referenceLines?.map((line) => ( + : undefined} + /> + ))} {visibleSeries.map((key) => ( {/* Note: Legend is now rendered by ChartRoot outside the chart container */} + {referenceLines?.map((line) => ( + : undefined} + /> + ))} {visibleSeries.map((key) => ( diff --git a/apps/webapp/app/components/query/QueryEditor.tsx b/apps/webapp/app/components/query/QueryEditor.tsx index 8520d2a7a0b..eb9fd08ffb1 100644 --- a/apps/webapp/app/components/query/QueryEditor.tsx +++ b/apps/webapp/app/components/query/QueryEditor.tsx @@ -72,7 +72,7 @@ import type { action as titleAction } from "~/routes/resources.orgs.$organizatio import type { QueryScope } from "~/services/queryService.server"; import { downloadFile, rowsToCSV, rowsToJSON } from "~/utils/dataExport"; import { organizationBillingPath } from "~/utils/pathBuilder"; -import { querySchemas } from "~/v3/querySchemas"; +import { visibleQuerySchemas } from "~/v3/querySchemas"; /** Convert a Date or ISO string to ISO string format */ function toISOString(value: Date | string): string { @@ -245,7 +245,7 @@ const QueryEditorForm = forwardRef< { initMollifierDrainerWorker(); initMollifierStaleSweepWorker(); initBillingLimitWorker(); +initQueueMetricsEmitter(); +initQueueMetricsConsumer(); bootstrap().catch((error) => { logError(error); diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index f5e0f0a671c..26e178d18a6 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -883,6 +883,31 @@ const EnvironmentSchema = z RUN_ENGINE_REUSE_SNAPSHOT_COUNT: z.coerce.number().int().default(0), 
RUN_ENGINE_MAXIMUM_ENV_COUNT: z.coerce.number().int().optional(), RUN_ENGINE_RUN_QUEUE_SHARD_COUNT: z.coerce.number().int().default(4), + // Queue metrics ingestion (Redis Stream -> ClickHouse). The runtime on/off is the + // `queue_metrics:enabled` Redis key; these gate emitter construction + consumer boot. + QUEUE_METRICS_EMIT_ENABLED: z.string().default("0"), + QUEUE_METRICS_CONSUMER_ENABLED: z.string().default("0"), + QUEUE_METRICS_STREAM_SHARD_COUNT: z.coerce.number().int().default(4), + QUEUE_METRICS_CONSUMER_BATCH_SIZE: z.coerce.number().int().default(1000), + // Counter stream (exact counts, loss-intolerant). Unset host => the run-queue Redis; + // set it to a dedicated instance so counter backlog never competes with the run queue. + QUEUE_METRICS_REDIS_HOST: z.string().optional(), + QUEUE_METRICS_REDIS_PORT: z.coerce.number().optional(), + QUEUE_METRICS_REDIS_USERNAME: z.string().optional(), + QUEUE_METRICS_REDIS_PASSWORD: z.string().optional(), + QUEUE_METRICS_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), + // Default depends on where the stream lives: see metricsDefinition() in + // queueMetrics.server.ts (2M on the shared run-queue Redis, 8M on a dedicated one). + QUEUE_METRICS_COUNTER_STREAM_MAXLEN: z.coerce.number().int().optional(), + // TTL (seconds) on the per-(queue,op) cumulative odometer key, refreshed on every write. + // Idle-past-TTL queues purge and self-heal (restart from 1) on return; default 7 days. + QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS: z.coerce.number().int().default(604_800), + // Per-env distinct queue_name cap (0 = unlimited); overflow maps to "__overflow__". + QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV: z.coerce.number().int().default(1000), + QUEUE_METRICS_MAX_CONCURRENCY_KEYS_PER_QUEUE: z.coerce.number().int().default(10_000), + // Fraction (0..1) of ops that emit a gauge; counters are never sampled. 
Dial below 1 + // only if EngineCPU is too high in slow-path-heavy regions (hurts low-traffic queues). + QUEUE_METRICS_GAUGE_SAMPLE_RATE: z.coerce.number().min(0).max(1).default(1), RUN_ENGINE_WORKER_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().default(60_000), RUN_ENGINE_RETRY_WARM_START_THRESHOLD_MS: z.coerce.number().int().default(30_000), RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS: z.coerce.number().int().default(200), diff --git a/apps/webapp/app/hooks/useMetricResourceQuery.ts b/apps/webapp/app/hooks/useMetricResourceQuery.ts new file mode 100644 index 00000000000..8cb8faec507 --- /dev/null +++ b/apps/webapp/app/hooks/useMetricResourceQuery.ts @@ -0,0 +1,109 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { useInterval } from "./useInterval"; + +export type MetricResourceRow = Record; + +type MetricResourceResponse = + | { success: true; data: { rows: MetricResourceRow[] } } + | { success: false; error: string }; + +export type MetricResourceTimeRange = { + period: string | null; + from: string | null; + to: string | null; +}; + +export type MetricResourceQueryOptions = { + organizationId: string; + projectId: string; + environmentId: string; + timeRange: MetricResourceTimeRange; + defaultPeriod: string; + queues?: string[]; + fillGaps?: boolean; + refreshIntervalMs?: number; +}; + +/** + * Client-fetch a TRQL query from the metric resource route (like the dashboard + * widgets): own loading state, interval + on-focus refresh, abort on change/unmount. 
+ */ +export function useMetricResourceQuery(query: string, opts: MetricResourceQueryOptions) { + const [rows, setRows] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [failed, setFailed] = useState(false); + const abortRef = useRef(null); + + const { + organizationId, + projectId, + environmentId, + defaultPeriod, + fillGaps, + refreshIntervalMs = 60_000, + } = opts; + const { period, from, to } = opts.timeRange; + const queuesKey = opts.queues && opts.queues.length > 0 ? opts.queues.join(",") : undefined; + + const load = useCallback(() => { + abortRef.current?.abort(); + const controller = new AbortController(); + abortRef.current = controller; + setIsLoading(true); + fetch("/resources/metric", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query, + scope: "environment", + period: period ?? (from || to ? null : defaultPeriod), + from, + to, + fillGaps: !!fillGaps, + organizationId, + projectId, + environmentId, + ...(queuesKey !== undefined ? { queues: queuesKey.split(",") } : {}), + }), + signal: controller.signal, + }) + .then((res) => res.json() as Promise) + .then((data) => { + if (controller.signal.aborted) return; + if (data.success) { + setRows(data.data.rows); + setFailed(false); + } else { + setFailed(true); + } + setIsLoading(false); + }) + .catch((error) => { + if (error instanceof DOMException && error.name === "AbortError") return; + if (!controller.signal.aborted) { + setFailed(true); + setIsLoading(false); + } + }); + }, [ + query, + period, + from, + to, + defaultPeriod, + fillGaps, + organizationId, + projectId, + environmentId, + queuesKey, + ]); + + useEffect(() => { + load(); + return () => abortRef.current?.abort(); + }, [load]); + + useInterval({ interval: refreshIntervalMs, onLoad: false, onFocus: true, callback: load }); + + return { rows: rows ?? 
[], isLoading, showLoading: isLoading && !rows, failed }; +} diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts index 971fc9a3033..d831568248d 100644 --- a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts +++ b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts @@ -550,7 +550,186 @@ const llmDashboard: BuiltInDashboard = { }, }; -const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard]; +const queuesDashboard: BuiltInDashboard = { + key: "queues", + title: "Queues", + filters: ["queues"], + layout: { + version: "1", + layout: [ + { i: "env-used", x: 0, y: 0, w: 3, h: 4 }, + { i: "env-limit", x: 3, y: 0, w: 3, h: 4 }, + { i: "env-avail", x: 6, y: 0, w: 3, h: 4 }, + { i: "env-sat", x: 9, y: 0, w: 3, h: 4 }, + { i: "sat-time", x: 0, y: 4, w: 6, h: 9 }, + { i: "used-limit", x: 6, y: 4, w: 6, h: 9 }, + { i: "t-pressure", x: 0, y: 13, w: 12, h: 2, minH: 2, maxH: 2 }, + { i: "pressure", x: 0, y: 15, w: 12, h: 11 }, + { i: "t-trends", x: 0, y: 26, w: 12, h: 2, minH: 2, maxH: 2 }, + { i: "running-q", x: 0, y: 28, w: 6, h: 9 }, + { i: "queued-q", x: 6, y: 28, w: 6, h: 9 }, + { i: "throttled-q", x: 0, y: 37, w: 6, h: 9 }, + { i: "throughput", x: 6, y: 37, w: 6, h: 9 }, + { i: "wait-pct", x: 0, y: 46, w: 12, h: 9 }, + ], + widgets: { + "env-used": { + title: "Concurrency in use", + query: `SELECT argMax(max_env_running, bucket_start) AS in_use\nFROM env_metrics`, + display: { type: "bignumber", column: "in_use", aggregation: "max", abbreviate: false }, + }, + "env-limit": { + title: "Environment limit", + query: `SELECT argMax(max_env_limit, bucket_start) AS env_limit\nFROM env_metrics`, + display: { type: "bignumber", column: "env_limit", aggregation: "max", abbreviate: false }, + }, + "env-avail": { + title: "Available slots", + query: `SELECT argMax(max_env_limit, bucket_start) - argMax(max_env_running, bucket_start) AS available\nFROM env_metrics`, + 
display: { type: "bignumber", column: "available", aggregation: "max", abbreviate: false }, + }, + "env-sat": { + title: "Env saturation", + query: `SELECT round(argMax(max_env_running, bucket_start) * 100.0 / nullIf(argMax(max_env_limit, bucket_start), 0), 1) AS saturation\nFROM env_metrics`, + display: { + type: "bignumber", + column: "saturation", + aggregation: "max", + abbreviate: false, + suffix: "%", + }, + }, + "sat-time": { + title: "Environment saturation over time", + query: `SELECT timeBucket() AS t,\n round(max(max_env_running) * 100.0 / nullIf(max(max_env_limit), 0), 1) AS saturation\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["saturation"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "used-limit": { + title: "Concurrency used vs limit", + query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + // Single-series gauge: carry the last known used/limit across idle buckets instead of dropping to 0. 
+ fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["used", "limit"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "t-pressure": { title: "Queue pressure", query: "", display: { type: "title" } }, + pressure: { + title: "Queue pressure", + query: `SELECT queue,\n argMax(max_running, bucket_start) AS running,\n argMax(max_queued, bucket_start) AS queued,\n argMax(max_limit, bucket_start) AS limit,\n running + queued AS demand,\n max(max_queued) AS peak_queued,\n sum(throttled_count) AS throttled,\n multiIf(running >= limit AND queued > 0, 'queue-limited', queued > 0, 'backlogged', 'healthy') AS status\nFROM queue_metrics\nGROUP BY queue\nORDER BY peak_queued DESC`, + display: { + type: "table", + prettyFormatting: true, + sorting: [{ id: "peak_queued", desc: true }], + }, + }, + "t-trends": { title: "Per-queue trends", query: "", display: { type: "title" } }, + "running-q": { + title: "Running by queue", + query: `SELECT timeBucket() AS t, queue, max(max_running) AS running\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped gauge: carry each queue's running across idle buckets (per-group LOCF). + fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["running"], + groupByColumn: "queue", + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "queued-q": { + title: "Queue depth (backlog) by queue", + query: `SELECT timeBucket() AS t, queue, max(max_queued) AS queued\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped gauge: carry each queue's backlog across idle buckets (per-group LOCF). 
+ fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["queued"], + groupByColumn: "queue", + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "throttled-q": { + title: "Throttled buckets by queue", + query: `SELECT timeBucket() AS t, queue, sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped counter: per-group zero-fill so idle buckets read 0, not a gap. + fillGaps: true, + display: { + type: "chart", + chartType: "bar", + xAxisColumn: "t", + yAxisColumns: ["throttled"], + groupByColumn: "queue", + stacked: true, + sortByColumn: null, + sortDirection: "asc", + aggregation: "sum", + }, + }, + throughput: { + title: "Enqueued vs started", + // Counter states merge per queue, then sum outside: a single merge across queues + // mixes unrelated odometers and returns wrong totals. + query: `SELECT t, sum(enq) AS enqueued, sum(st) AS started\nFROM (\n SELECT timeBucket() AS t, queue,\n deltaSumTimestampMerge(enqueue_delta) AS enq,\n deltaSumTimestampMerge(started_delta) AS st\n FROM queue_metrics\n GROUP BY t, queue\n)\nGROUP BY t\nORDER BY t`, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["enqueued", "started"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "sum", + }, + }, + "wait-pct": { + title: "Scheduling delay p50/p95/p99 (ms)", + query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS p99\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["p50", "p95", "p99"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + 
sortDirection: "asc", + aggregation: "max", + }, + }, + }, + }, +}; + +const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard, queuesDashboard]; export function builtInDashboardList(): BuiltInDashboard[] { return builtInDashboards; diff --git a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts index df43864b53a..0b84e971b2f 100644 --- a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts @@ -37,6 +37,9 @@ export const Widget = z.object({ title: z.string(), query: z.string().default(""), display: QueryWidgetConfig, + // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters). + // Top-level rather than in `display` because display config is client-only and never reaches the query POST. + fillGaps: z.boolean().optional(), }); export type Widget = z.infer; diff --git a/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts new file mode 100644 index 00000000000..c7a8166b6a3 --- /dev/null +++ b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts @@ -0,0 +1,94 @@ +import { TaskQueueType, type Prisma } from "@trigger.dev/database"; +import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { engine } from "~/v3/runEngine.server"; +import { BasePresenter } from "./basePresenter.server"; + +const MAX_ALLOCATION_QUEUES = 500; + +export type QueueAllocationItem = { + id: string; + name: string; + type: "task" | "custom"; + running: number; + queued: number; + paused: boolean; + /** Explicit per-queue limit; null means the queue floats up to the env limit. 
*/ + limit: number | null; + overridden: boolean; +}; + +export type QueueAllocation = { + queues: QueueAllocationItem[]; + totalQueues: number; + truncated: boolean; + /** Sum of explicit limits, each clamped to the env limit. */ + allocated: number; + unlimitedCount: number; +}; + +/** Every queue in the environment (capped) with live counts, for the allocation view. */ +export class QueueAllocationPresenter extends BasePresenter { + public async call({ + environment, + }: { + environment: AuthenticatedEnvironment; + }): Promise { + const where: Prisma.TaskQueueWhereInput = { + runtimeEnvironmentId: environment.id, + version: "V2", + }; + + const [totalQueues, queues] = await Promise.all([ + this._replica.taskQueue.count({ where }), + this._replica.taskQueue.findMany({ + where, + select: { + friendlyId: true, + name: true, + type: true, + paused: true, + concurrencyLimit: true, + concurrencyLimitOverriddenAt: true, + }, + orderBy: { orderableName: "asc" }, + take: MAX_ALLOCATION_QUEUES, + }), + ]); + + const names = queues.map((q) => q.name); + const [queuedByQueue, runningByQueue] = await Promise.all([ + engine.lengthOfQueues(environment, names), + engine.currentConcurrencyOfQueues(environment, names), + ]); + + const envLimit = environment.maximumConcurrencyLimit; + let allocated = 0; + let unlimitedCount = 0; + + const items: QueueAllocationItem[] = queues.map((queue) => { + if (queue.concurrencyLimit === null) { + unlimitedCount++; + } else { + allocated += Math.min(queue.concurrencyLimit, envLimit); + } + return { + id: queue.friendlyId, + name: queue.name.replace(/^task\//, ""), + type: queue.type === TaskQueueType.VIRTUAL ? ("task" as const) : ("custom" as const), + running: runningByQueue[queue.name] ?? 0, + queued: queuedByQueue[queue.name] ?? 
0, + paused: queue.paused, + limit: queue.concurrencyLimit, + overridden: queue.concurrencyLimitOverriddenAt !== null, + }; + }); + + return { + queues: items, + totalQueues, + truncated: totalQueues > queues.length, + allocated, + unlimitedCount, + }; + } +} diff --git a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts index 024a1342b0a..751d4b0a602 100644 --- a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts @@ -3,6 +3,8 @@ import type { Prisma } from "@trigger.dev/database"; import { TaskQueueType } from "@trigger.dev/database"; import { type PrismaClientOrTransaction } from "~/db.server"; import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { logger } from "~/services/logger.server"; import { determineEngineVersion } from "~/v3/engineVersion.server"; import { engine } from "~/v3/runEngine.server"; import { BasePresenter } from "./basePresenter.server"; @@ -13,6 +15,12 @@ type QueueListEngine = Pick = { task: TaskQueueType.VIRTUAL, custom: TaskQueueType.NAMED, @@ -30,6 +38,38 @@ const queueListSelect = { paused: true, } satisfies Prisma.TaskQueueSelect; +type QueueListRow = Prisma.TaskQueueGetPayload<{ select: typeof queueListSelect }>; + +type QueueListItem = ReturnType; + +type QueueListPagination = + | { mode: "filtered"; currentPage: number; hasMore: boolean } + | { mode: "unfiltered"; currentPage: number; totalPages: number; count: number }; + +// The `?: undefined` markers keep every key reachable across the union, so consumers +// can destructure before narrowing on `success`. 
+export type QueueListResult = + | { + success: false; + code: string; + totalQueues: number; + hasFilters: boolean; + queues?: undefined; + pagination?: undefined; + } + | { + success: true; + queues: QueueListItem[]; + pagination: QueueListPagination; + totalQueues?: number; + hasFilters: boolean; + code?: undefined; + }; + +function formatClickhouseDateTime(date: Date): string { + return date.toISOString().slice(0, 19).replace("T", " "); +} + function buildQueueListWhere( environmentId: string, query: string | undefined, @@ -70,13 +110,15 @@ export class QueueListPresenter extends BasePresenter { query, page, type, + sort = "name", }: { environment: AuthenticatedEnvironment; query?: string; page: number; perPage?: number; type?: "task" | "custom"; - }) { + sort?: QueueListSort; + }): Promise { const hasFilters = Boolean(query?.trim()) || type !== undefined; const engineVersion = await determineEngineVersion({ environment }); @@ -110,6 +152,18 @@ export class QueueListPresenter extends BasePresenter { }; } + if (sort !== "name") { + // Ranking is additive: any failure or unsupported input falls back to name order. + try { + const ranked = await this.getRankedQueues(environment, query, page, type, sort); + if (ranked) { + return ranked; + } + } catch (error) { + logger.warn("Queue ranking unavailable, falling back to name order", { error }); + } + } + if (hasFilters) { const { queues, hasMore } = await this.getFilteredQueues(environment, query, page, type); @@ -143,6 +197,132 @@ export class QueueListPresenter extends BasePresenter { }; } + /** + * ClickHouse ranks queues by recent activity and returns the requested page of names; + * queues with no recent metrics follow in name order. Null when ranking does not apply. 
+ */ + private async getRankedQueues( + environment: AuthenticatedEnvironment, + query: string | undefined, + page: number, + type: "task" | "custom" | undefined, + sort: Exclude + ) { + if (type !== undefined) { + return null; + } + + const clickhouse = await clickhouseFactory.getClickhouseForOrganization( + environment.organizationId, + "query" + ); + + // The window start is aligned to the minute so repeated page loads produce identical + // query text and can share ClickHouse query-cache entries. + const windowStartMs = + Math.floor((Date.now() - QUEUE_RANKING_WINDOW_MINUTES * 60 * 1000) / 60_000) * 60_000; + const rankingArgs = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + startTime: formatClickhouseDateTime(new Date(windowStartMs)), + nameContains: query?.trim() ?? "", + }; + + const offset = (page - 1) * this.perPage; + + // One scan returns the page and the total ranked count (window function). + const [pageError, pageRows] = await clickhouse.queueMetrics.ranking({ + ...rankingArgs, + byQueuedOnly: sort === "queued" ? 1 : 0, + limit: this.perPage, + offset, + }); + if (pageError) { + throw pageError; + } + + let ranked = pageRows?.[0]?.ranked_total ?? 0; + if (ranked === 0 && offset > 0) { + // Empty page past the ranked head: fetch the count alone for the tail slot math. + const [countError, countRows] = await clickhouse.queueMetrics.rankingCount(rankingArgs); + if (countError) { + throw countError; + } + ranked = countRows?.[0]?.ranked ?? 0; + } + if (ranked > MAX_RANKED_QUEUES) { + return null; + } + + const where = buildQueueListWhere(environment.id, query, type); + const totalQueues = await this._replica.taskQueue.count({ where }); + + let rankedPageQueues: QueueListRow[] = []; + if ((pageRows?.length ?? 0) > 0) { + const rankedNames = (pageRows ?? 
[]).map((row) => row.queue_name); + rankedPageQueues = await this.findQueuesByNames(where, rankedNames); + } + + // Tail of the page: name-ordered queues that have no recent metrics. Slot math uses the + // ClickHouse counts so pages never overlap, even if some ranked names no longer exist. + const rankedSlots = Math.min(Math.max(ranked - offset, 0), this.perPage); + const tailNeeded = this.perPage - rankedSlots; + let tailQueues: QueueListRow[] = []; + if (tailNeeded > 0) { + let excludedNames: string[] = []; + if (ranked > 0) { + const [allError, allRows] = await clickhouse.queueMetrics.rankingNames({ + ...rankingArgs, + limit: MAX_RANKED_QUEUES, + }); + if (allError) { + throw allError; + } + excludedNames = (allRows ?? []).map((row) => row.queue_name); + } + // AND keeps the search's name filter intact alongside the exclusion (a spread + // would overwrite one name condition with the other). + tailQueues = await this._replica.taskQueue.findMany({ + where: { AND: [where, { name: { notIn: excludedNames } }] }, + select: queueListSelect, + orderBy: { + orderableName: "asc", + }, + skip: Math.max(0, offset - ranked), + take: tailNeeded, + }); + } + + return { + success: true as const, + queues: await this.enrichQueues(environment, [...rankedPageQueues, ...tailQueues]), + pagination: { + mode: "unfiltered" as const, + currentPage: page, + totalPages: Math.max(1, Math.ceil(totalQueues / this.perPage)), + count: totalQueues, + }, + totalQueues, + hasFilters: Boolean(query?.trim()) || type !== undefined, + }; + } + + private async findQueuesByNames( + where: Prisma.TaskQueueWhereInput, + names: string[] + ): Promise { + if (names.length === 0) { + return []; + } + const queues = await this._replica.taskQueue.findMany({ + where: { AND: [where, { name: { in: names } }] }, + select: queueListSelect, + }); + const byName = new Map(queues.map((queue) => [queue.name, queue])); + return names.flatMap((name) => byName.get(name) ?? 
[]); + } + private async getFilteredQueues( environment: AuthenticatedEnvironment, query: string | undefined, diff --git a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts new file mode 100644 index 00000000000..a36c402dda7 --- /dev/null +++ b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts @@ -0,0 +1,139 @@ +import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { logger } from "~/services/logger.server"; + +export type QueueListMetric = { + p50WaitMs: number | null; + p95WaitMs: number | null; + peakQueued: number; + /** Equal-width buckets, oldest first, carry-forward filled across idle gaps. */ + depthSparkline: number[]; +}; + +export type QueueListMetrics = { + bucketStartMs: number; + bucketIntervalMs: number; + byQueue: Map; +}; + +const SPARKLINE_POINTS = 48; + +function formatClickhouseDateTime(date: Date): string { + return date.toISOString().slice(0, 19).replace("T", " "); +} + +function finiteOrNull(value: number): number | null { + return Number.isFinite(value) ? value : null; +} + +export class QueueMetricsPresenter { + /** + * Per-queue metrics over a time range for a fixed set of queues (the visible list page), + * scoped to one ClickHouse query window so cost is independent of total queue count. + * Degrades to an empty map if ClickHouse is unavailable so the live list still renders. 
+ */ + public async getQueueListMetrics({ + environment, + queueNames, + from, + to, + }: { + environment: AuthenticatedEnvironment; + queueNames: string[]; + from: Date; + to: Date; + }): Promise { + const rangeSeconds = Math.max(60, Math.round((to.getTime() - from.getTime()) / 1000)); + const bucketSeconds = Math.max(60, Math.round(rangeSeconds / SPARKLINE_POINTS)); + const numBuckets = Math.max(1, Math.ceil(rangeSeconds / bucketSeconds)); + const gridStartSeconds = + Math.floor(Math.floor(from.getTime() / 1000) / bucketSeconds) * bucketSeconds; + const bucketStartMs = gridStartSeconds * 1000; + const bucketIntervalMs = bucketSeconds * 1000; + + const empty: QueueListMetrics = { + bucketStartMs, + bucketIntervalMs, + byQueue: new Map(), + }; + + if (queueNames.length === 0) { + return empty; + } + + try { + const clickhouse = await clickhouseFactory.getClickhouseForOrganization( + environment.organizationId, + "query" + ); + + // End bound snaps up to the bucket grid so repeated loads within a bucket produce + // identical params and share ClickHouse query-cache entries. 
+ const endMs = Math.ceil(to.getTime() / bucketIntervalMs) * bucketIntervalMs; + const ids = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + queueNames, + startTime: formatClickhouseDateTime(new Date(bucketStartMs)), + endTime: formatClickhouseDateTime(new Date(endMs)), + }; + + const [summaryResult, sparklineResult] = await Promise.all([ + clickhouse.queueMetrics.listSummary(ids), + clickhouse.queueMetrics.depthSparklines({ ...ids, bucketSeconds }), + ]); + + const [summaryError, summaryRows] = summaryResult; + const [sparklineError, sparklineRows] = sparklineResult; + + if (summaryError || sparklineError) { + logger.warn("QueueMetricsPresenter: clickhouse query failed", { + summaryError: summaryError?.message, + sparklineError: sparklineError?.message, + }); + return empty; + } + + // Bucket -> depth per queue, mapped onto the aligned grid and forward-filled. + const depthsByQueue = new Map>(); + for (const row of sparklineRows ?? []) { + const bucketMs = Date.parse(row.bucket.replace(" ", "T") + "Z"); + if (Number.isNaN(bucketMs)) continue; + const index = Math.round((bucketMs - bucketStartMs) / bucketIntervalMs); + if (index < 0 || index >= numBuckets) continue; + let byIndex = depthsByQueue.get(row.queue_name); + if (!byIndex) { + byIndex = new Map(); + depthsByQueue.set(row.queue_name, byIndex); + } + byIndex.set(index, row.depth); + } + + const byQueue = new Map(); + for (const row of summaryRows ?? 
[]) { + const byIndex = depthsByQueue.get(row.queue_name); + const sparkline: number[] = new Array(numBuckets); + let last = 0; + for (let i = 0; i < numBuckets; i++) { + const value = byIndex?.get(i); + if (value !== undefined) last = value; + sparkline[i] = last; + } + byQueue.set(row.queue_name, { + p50WaitMs: finiteOrNull(row.p50_wait_ms), + p95WaitMs: finiteOrNull(row.p95_wait_ms), + peakQueued: row.peak_queued, + depthSparkline: sparkline, + }); + } + + return { bucketStartMs, bucketIntervalMs, byQueue }; + } catch (error) { + logger.warn("QueueMetricsPresenter: failed to load queue metrics", { + error: error instanceof Error ? error.message : String(error), + }); + return empty; + } + } +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx index 5fa237cee6e..d529fdf0d22 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx @@ -38,6 +38,7 @@ import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstan import { requireUser } from "~/services/session.server"; import { cn } from "~/utils/cn"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; import { QueryScopeSchema } from "~/v3/querySchemas"; import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; import { MetricWidget } from "../resources.metric"; @@ -50,6 +51,15 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const user = await requireUser(request); const { projectParam, organizationSlug, envParam, dashboardKey } = ParamSchema.parse(params); 
+ // The built-in "queues" dashboard is part of the metrics UI (unlinked, but reachable by + // URL), so gate it per-org like the rest of the Queue Metrics view. + if ( + dashboardKey === "queues" && + !(await canAccessQueueMetricsUi({ userId: user.id, organizationSlug })) + ) { + throw new Response(undefined, { status: 404, statusText: "Not found" }); + } + const project = await findProjectBySlug(organizationSlug, projectParam, user.id); if (!project) { throw new Response(undefined, { @@ -376,6 +386,7 @@ export function MetricDashboard({ promptSlugs={prompts.length > 0 ? prompts : undefined} operations={operations.length > 0 ? operations : undefined} providers={providers.length > 0 ? providers : undefined} + fillGaps={widget.fillGaps} config={widget.display} organizationId={organization.id} projectId={project.id} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx index 05b4f4d9b62..3188b5409a6 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/ExamplesContent.tsx @@ -3,7 +3,7 @@ import { Header3 } from "~/components/primitives/Headers"; import { Paragraph } from "~/components/primitives/Paragraph"; import SegmentedControl from "~/components/primitives/SegmentedControl"; import type { QueryScope } from "~/services/queryService.server"; -import { querySchemas } from "~/v3/querySchemas"; +import { visibleQuerySchemas } from "~/v3/querySchemas"; import { TryableCodeBlock } from "./TRQLGuideContent"; // Example queries for the Examples tab @@ -211,14 +211,14 @@ LIMIT 20`, }, ]; -const tableOptions = querySchemas.map((s) => ({ label: s.name, value: s.name })); +const tableOptions = visibleQuerySchemas.map((s) 
=> ({ label: s.name, value: s.name })); export function ExamplesContent({ onTryExample, }: { onTryExample: (query: string, scope: QueryScope) => void; }) { - const [selectedTable, setSelectedTable] = useState(querySchemas[0].name); + const [selectedTable, setSelectedTable] = useState(visibleQuerySchemas[0].name); const filtered = exampleQueries.filter((e) => e.table === selectedTable); return ( diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx index 285a1f68731..9fc6ec32923 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/TableSchemaContent.tsx @@ -4,7 +4,7 @@ import { Badge } from "~/components/primitives/Badge"; import { CopyableText } from "~/components/primitives/CopyableText"; import { Paragraph } from "~/components/primitives/Paragraph"; import SegmentedControl from "~/components/primitives/SegmentedControl"; -import { querySchemas } from "~/v3/querySchemas"; +import { visibleQuerySchemas } from "~/v3/querySchemas"; function ColumnHelpItem({ col }: { col: ColumnSchema }) { return ( @@ -43,11 +43,11 @@ function ColumnHelpItem({ col }: { col: ColumnSchema }) { ); } -const tableOptions = querySchemas.map((s) => ({ label: s.name, value: s.name })); +const tableOptions = visibleQuerySchemas.map((s) => ({ label: s.name, value: s.name })); export function TableSchemaContent() { - const [selectedTable, setSelectedTable] = useState(querySchemas[0].name); - const table = querySchemas.find((s) => s.name === selectedTable) ?? 
querySchemas[0]; + const [selectedTable, setSelectedTable] = useState(visibleQuerySchemas[0].name); + const table = visibleQuerySchemas.find((s) => s.name === selectedTable) ?? visibleQuerySchemas[0]; return (
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx new file mode 100644 index 00000000000..8b13afbcb87 --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx @@ -0,0 +1,490 @@ +import { Form, useNavigation } from "@remix-run/react"; +import { type ReactNode, useEffect, useMemo, useState } from "react"; +import { BigNumber } from "~/components/metrics/BigNumber"; +import { Badge } from "~/components/primitives/Badge"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "~/components/primitives/Dialog"; +import { Input } from "~/components/primitives/Input"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { + Table, + TableBody, + TableCell, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; +import { SimpleTooltip } from "~/components/primitives/Tooltip"; +import { getSeriesColor } from "~/components/code/chartColors"; +import { QueueName } from "~/components/runs/v3/QueueName"; +import { type Environment } from "~/presenters/v3/EnvironmentQueuePresenter.server"; +import { + type QueueAllocation, + type QueueAllocationItem, +} from "~/presenters/v3/QueueAllocationPresenter.server"; +import { cn } from "~/utils/cn"; + +type Drafts = Record; + +export function AllocationView({ + allocation, + environment, +}: { + allocation: QueueAllocation; + environment: Environment; +}) { + const [drafts, setDrafts] = useState({}); + const [reviewOpen, setReviewOpen] = useState(false); + const navigation = useNavigation(); + const isSubmitting = navigation.state !== "idle"; + + const envLimit = environment.concurrencyLimit; 
+ const burstLimit = Math.round(envLimit * environment.burstFactor); + + useEffect(() => { + if (navigation.state === "loading" || navigation.state === "idle") { + setReviewOpen(false); + } + }, [navigation.state]); + + // After an apply revalidates the loader, drop drafts that now match the saved limits. + useEffect(() => { + setDrafts((prev) => { + const next = { ...prev }; + for (const queue of allocation.queues) { + if (next[queue.id] !== undefined && next[queue.id] === queue.limit) { + delete next[queue.id]; + } + } + return next; + }); + }, [allocation]); + + const draftLimit = (queue: QueueAllocationItem): number | null => drafts[queue.id] ?? queue.limit; + + const draftAllocated = allocation.queues.reduce((sum, queue) => { + const limit = draftLimit(queue); + return limit === null ? sum : sum + Math.min(limit, envLimit); + }, 0); + + const changes = allocation.queues.filter( + (queue) => drafts[queue.id] !== undefined && drafts[queue.id] !== queue.limit + ); + + const unlimitedCount = allocation.queues.filter((queue) => draftLimit(queue) === null).length; + const allocationPct = envLimit > 0 ? 
Math.round((draftAllocated / envLimit) * 100) : 0; + const overAllocated = draftAllocated > envLimit; + + const setDraft = (queue: QueueAllocationItem, value: string) => { + setDrafts((prev) => { + const next = { ...prev }; + if (value.trim() === "") { + delete next[queue.id]; + return next; + } + const parsed = parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed < 0) return prev; + if (parsed === queue.limit) { + delete next[queue.id]; + } else { + next[queue.id] = parsed; + } + return next; + }); + }; + + const changesPayload = useMemo( + () => + JSON.stringify(changes.map((queue) => ({ friendlyId: queue.id, limit: drafts[queue.id] }))), + [changes, drafts] + ); + + const colorByQueue = useMemo(() => { + const map = new Map(); + allocation.queues.forEach((queue, i) => map.set(queue.id, getSeriesColor(i))); + return map; + }, [allocation.queues]); + const colorFor = (id: string) => colorByQueue.get(id) ?? "#878C99"; + + // Busiest first: the queues you'd rebalance are the ones under load. Colors stay + // keyed to the loader order so they don't shift as counts change. + const tableQueues = useMemo( + () => [...allocation.queues].sort((a, b) => b.running + b.queued - (a.running + a.queued)), + [allocation.queues] + ); + + return ( +
+
+ 1 ? `bursts up to ${burstLimit}` : undefined} + suffixClassName="text-text-dimmed" + /> + + 0 + ? `${unlimitedCount} without a limit (can use up to ${envLimit})` + : "all have limits" + } + suffixClassName="text-text-dimmed" + /> +
+ + + + {overAllocated && ( + + The queue limits add up to more than the environment limit, so queues will compete for + concurrency when the environment saturates. Reduce limits to guarantee each queue its + allocation. + + )} + + {allocation.truncated && ( + + Showing the first {allocation.queues.length} of {allocation.totalQueues} queues. + Allocation totals only include the queues shown. + + )} + +
+ +
+ + + + + + Apply queue limits +
+ + + + Queue + Current + New + + + + {changes.map((queue) => ( + + + + + {queue.limit ?? "–"} + {drafts[queue.id]} + + ))} + +
+
+ + Limits apply immediately and are set as overrides, so they survive deploys until + removed. + +
+ + + +
+
+
+
+ + + + + Name + Running + Queued + + Limit + + + + + {tableQueues.map((queue) => { + const changed = drafts[queue.id] !== undefined && drafts[queue.id] !== queue.limit; + return ( + + + + + + {queue.paused && ( + + Paused + + )} + {queue.overridden && ( + + Override + + )} + + + {queue.running} + {queue.queued} + + + {changed && ( + + {queue.limit ?? "–"} → {drafts[queue.id]} + + )} + setDraft(queue, e.target.value)} + disabled={isSubmitting} + className="w-24" + variant="small" + /> + + + + ); + })} + +
+
+ ); +} + +const MAX_BAR_SEGMENTS = 24; + +function AllocationBar({ + queues, + draftLimit, + envLimit, + burstLimit, + draftAllocated, + colorFor, +}: { + queues: QueueAllocationItem[]; + draftLimit: (queue: QueueAllocationItem) => number | null; + envLimit: number; + burstLimit: number; + draftAllocated: number; + colorFor: (id: string) => string; +}) { + const limited = queues + .map((queue) => ({ queue, limit: draftLimit(queue) })) + .filter( + (entry): entry is { queue: QueueAllocationItem; limit: number } => + typeof entry.limit === "number" && entry.limit > 0 + ) + .sort((a, b) => b.limit - a.limit); + + const top = limited.slice(0, MAX_BAR_SEGMENTS); + const rest = limited.slice(MAX_BAR_SEGMENTS); + const restTotal = rest.reduce((sum, entry) => sum + entry.limit, 0); + const restRunning = rest.reduce( + (sum, entry) => sum + Math.min(entry.queue.running, entry.limit), + 0 + ); + + const hasBurst = burstLimit > envLimit; + // The axis runs to the burst ceiling: allocations are guaranteed up to the env + // limit, and everything between the limit and burst is shared overflow headroom. + const scale = Math.max(draftAllocated, envLimit, burstLimit); + if (scale === 0) return null; + + const free = Math.max(0, envLimit - draftAllocated); + const limitMarkerPct = (envLimit / scale) * 100; + const burstZoneWidthPct = ((Math.min(burstLimit, scale) - envLimit) / scale) * 100; + + return ( +
+
+
+ {hasBurst && ( + + } + content={`Shared burst headroom: beyond the environment limit, queues can burst up to ${burstLimit} combined`} + disableHoverableContent + /> + )} +
+ {top.map((entry) => ( + + } + /> + ))} + {restTotal > 0 && ( + + )} +
+
+
+
+
+ + {draftAllocated} allocated + {free > 0 ? ` · ${free} unallocated` : ""} + + {hasBurst ? ( + <> + + Environment limit {envLimit} + + Burst {burstLimit} + + ) : ( + Environment limit {envLimit} + )} +
+
+ ); +} + +function QueueSegmentTooltip({ + queue, + limit, + envLimit, + color, +}: { + queue: QueueAllocationItem; + limit: number; + envLimit: number; + color: string; +}) { + const utilizationPct = limit > 0 ? Math.round((queue.running / limit) * 100) : 0; + const sharePct = envLimit > 0 ? Math.round((limit / envLimit) * 100) : 0; + return ( +
+ + + + {queue.paused && ( + + Paused + + )} + +
+ Running + + {queue.running} of {limit} ({utilizationPct}%) + + Queued + {queue.queued} + Allocation + + {sharePct}% of the environment limit + +
+
+ ); +} + +/** One queue's slice of the capacity bar: dim fill = allocation, solid fill = current usage. */ +function BarSegment({ + color, + widthPct, + usagePct, + tooltip, +}: { + color: string; + widthPct: number; + usagePct: number; + tooltip: ReactNode; +}) { + return ( + + {usagePct > 0 && ( +
+ )} +
+ } + content={tooltip} + disableHoverableContent + /> + ); +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 877b1235a97..24fe2212953 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -7,11 +7,11 @@ import { RectangleStackIcon, } from "@heroicons/react/20/solid"; import { DialogClose } from "@radix-ui/react-dialog"; -import { Form, useNavigation, type MetaFunction } from "@remix-run/react"; +import { Form, Link, useNavigation, type MetaFunction } from "@remix-run/react"; import { type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/server-runtime"; import type { QueueItem } from "@trigger.dev/core/v3/schemas"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { useEffect, useState } from "react"; +import { type ReactNode, useEffect, useMemo, useState } from "react"; import { typedjson, useTypedLoaderData } from "remix-typedjson"; import { z } from "zod"; import { ConcurrencyIcon } from "~/assets/icons/ConcurrencyIcon"; @@ -21,7 +21,6 @@ import { AdminDebugTooltip } from "~/components/admin/debugTooltip"; import { QueuesHasNoTasks } from "~/components/BlankStatePanels"; import { environmentFullTitle } from "~/components/environments/EnvironmentLabel"; import { PageBody, PageContainer } from "~/components/layout/AppLayout"; -import { BigNumber } from "~/components/metrics/BigNumber"; import { Badge } from "~/components/primitives/Badge"; import { Button, LinkButton, type ButtonVariant } from "~/components/primitives/Buttons"; import { Callout } from "~/components/primitives/Callout"; @@ -55,6 +54,7 @@ import { import { QueueName } from 
"~/components/runs/v3/QueueName"; import { env } from "~/env.server"; import { useAutoRevalidate } from "~/hooks/useAutoRevalidate"; +import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; @@ -64,6 +64,24 @@ import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { getUserById } from "~/models/user.server"; import { EnvironmentQueuePresenter } from "~/presenters/v3/EnvironmentQueuePresenter.server"; import { QueueListPresenter } from "~/presenters/v3/QueueListPresenter.server"; +import { + QueueMetricsPresenter, + type QueueListMetric, +} from "~/presenters/v3/QueueMetricsPresenter.server"; +import * as Ariakit from "@ariakit/react"; +import { AppliedFilter } from "~/components/primitives/AppliedFilter"; +import { SelectItem, SelectPopover, SelectProvider } from "~/components/primitives/Select"; +import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; +import { useSearchParams } from "~/hooks/useSearchParam"; +import { parseFiniteInt } from "~/utils/searchParams"; +import { UsageSparkline } from "~/components/primitives/UsageSparkline"; +import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis"; +import { Chart, type ChartConfig } from "~/components/primitives/charts/ChartCompound"; +import { + useMetricResourceQuery, + type MetricResourceTimeRange, +} from "~/hooks/useMetricResourceQuery"; +import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; import { ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT } from "~/utils/environmentPauseSource"; @@ -72,18 +90,36 @@ import { docsPath, EnvironmentParamSchema, v3BillingPath, + v3QueuePath, v3RunsPath, } from "~/utils/pathBuilder"; import { concurrencySystem } from 
"~/v3/services/concurrencySystemInstance.server"; import { PauseEnvironmentService } from "~/v3/services/pauseEnvironment.server"; import { PauseQueueService } from "~/v3/services/pauseQueue.server"; import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; +import { BigNumber } from "~/components/metrics/BigNumber"; +import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; +import { QueueAllocationPresenter } from "~/presenters/v3/QueueAllocationPresenter.server"; +import { TabButton, TabContainer } from "~/components/primitives/Tabs"; +import { AllocationView } from "./AllocationView"; const SearchParamsSchema = z.object({ query: z.string().optional(), page: z.coerce.number().min(1).default(1), + period: z.string().optional(), + from: z.string().optional(), + to: z.string().optional(), + view: z.string().optional(), + sort: z.enum(["busiest", "queued", "name"]).optional(), }); +const AllocationChangesSchema = z + .array(z.object({ friendlyId: z.string(), limit: z.number().int().min(0) })) + .min(1) + .max(200); + +const QUEUE_METRICS_DEFAULT_PERIOD = "1d"; + export const meta: MetaFunction = () => { return [ { @@ -97,7 +133,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params); const url = new URL(request.url); - const { page, query } = SearchParamsSchema.parse(Object.fromEntries(url.searchParams)); + const { page, query, period, from, to, view, sort } = SearchParamsSchema.parse( + Object.fromEntries(url.searchParams) + ); const project = await findProjectBySlug(organizationSlug, projectParam, userId); if (!project) { @@ -115,22 +153,82 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); } + // Per-org gate for the metrics UI. When off, this org gets the classic Queues page and + // no metrics query fires. 
+ const queueMetricsUiEnabled = await canAccessQueueMetricsUi({ userId, organizationSlug }); + try { const queueListPresenter = new QueueListPresenter(); const queues = await queueListPresenter.call({ environment, query, page, + // Relevance ordering rides the metrics pipeline, so it is part of the gated UI. + sort: queueMetricsUiEnabled ? (sort ?? "busiest") : "name", }); const environmentQueuePresenter = new EnvironmentQueuePresenter(); const autoReloadPollIntervalMs = env.QUEUES_AUTORELOAD_POLL_INTERVAL_MS; + // Per-queue list metrics (Delay p95 + backlog sparkline columns) are SSR'd with the table. + // The environment header tiles are fetched client-side per card (see QueueEnvMetricTile) so a + // slow ClickHouse query never blocks the queues list from rendering. + let metrics: { + bucketStartMs: number; + bucketIntervalMs: number; + byQueue: Record; + } | null = null; + + const allocationView = queueMetricsUiEnabled && view === "allocation"; + + if (queueMetricsUiEnabled && queues.success && !allocationView) { + // Metrics are additive observability; a ClickHouse hiccup must not take down queue + // management. Fail open to metrics: null instead of bubbling to the page-level 400. + try { + const presenter = new QueueMetricsPresenter(); + const queueNames = queues.queues.map((q) => + q.type === "task" ? `task/${q.name}` : q.name + ); + const timeRange = timeFilterFromTo({ + period, + from: parseFiniteInt(from), + to: parseFiniteInt(to), + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + }); + const queueMetrics = + queueNames.length > 0 + ? 
await presenter.getQueueListMetrics({ + environment, + queueNames, + from: timeRange.from, + to: timeRange.to, + }) + : null; + if (queueMetrics) { + metrics = { + bucketStartMs: queueMetrics.bucketStartMs, + bucketIntervalMs: queueMetrics.bucketIntervalMs, + byQueue: Object.fromEntries(queueMetrics.byQueue), + }; + } + } catch (error) { + logger.warn("Queue list metrics unavailable, rendering without them", { error }); + } + } + + const allocation = + allocationView && queues.success + ? await new QueueAllocationPresenter().call({ environment }) + : null; + return typedjson({ ...queues, environment: await environmentQueuePresenter.call(environment), autoReloadPollIntervalMs, + metrics, + allocation, + queueMetricsUiEnabled, }); } catch (error) { console.error(error); @@ -293,12 +391,61 @@ export const action = async ({ request, params }: ActionFunctionArgs) => { return redirectWithSuccessMessage(redirectPath, request, "Queue concurrency limit reset"); } + case "allocation-apply": { + if (!(await canAccessQueueMetricsUi({ userId, organizationSlug }))) { + return redirectWithErrorMessage(redirectPath, request, "Not available"); + } + + let changes; + try { + changes = AllocationChangesSchema.parse(JSON.parse(String(formData.get("changes")))); + } catch { + return redirectWithErrorMessage(redirectPath, request, "Invalid changes"); + } + + const user = await getUserById(userId); + if (!user) { + return redirectWithErrorMessage(redirectPath, request, "User not found"); + } + + let failed = 0; + for (const change of changes) { + const result = await concurrencySystem.queues.overrideQueueConcurrencyLimit( + environment, + change.friendlyId, + change.limit, + user + ); + if (!result.isOk()) failed++; + } + + if (failed > 0) { + return redirectWithErrorMessage( + redirectPath, + request, + `Failed to update ${failed} of ${changes.length} queue limits` + ); + } + + return redirectWithSuccessMessage( + redirectPath, + request, + `Updated ${changes.length} queue 
limit${changes.length === 1 ? "" : "s"}` + ); + } default: return redirectWithErrorMessage(redirectPath, request, "Something went wrong"); } }; export default function Page() { + // Per-org flag decides which whole page renders. Off => the classic Queues page, + // byte-for-byte the pre-metrics UI. Each branch is its own component (own hooks). + const { queueMetricsUiEnabled } = useTypedLoaderData(); + return queueMetricsUiEnabled ? : ; +} + +function QueuesWithMetricsView() { const { environment, queues, @@ -308,24 +455,28 @@ export default function Page() { totalQueues, hasFilters, autoReloadPollIntervalMs, + metrics, + allocation, } = useTypedLoaderData(); + const metricsByQueue = metrics?.byQueue ?? {}; + const organization = useOrganization(); const project = useProject(); const env = useEnvironment(); const plan = useCurrentPlan(); + const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number; - useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); - - const limitStatus = - environment.running === environment.concurrencyLimit * environment.burstFactor - ? "limit" - : environment.running > environment.concurrencyLimit - ? "burst" - : "within"; + // The header tiles fetch client-side with the same period/from/to the TimeFilter writes. + const { value, replace } = useSearchParams(); + const timeRange = { + period: value("period") ?? null, + from: value("from") ?? null, + to: value("to") ?? null, + }; + const view = value("view") === "allocation" ? ("allocation" as const) : ("queues" as const); - const limitClassName = - limitStatus === "burst" ? "text-warning" : limitStatus === "limit" ? "text-error" : undefined; + useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); return ( @@ -333,6 +484,30 @@ export default function Page() { + {plan ? ( + plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? 
( + + Increase limit + + ) : ( + + Increase limit + + ) + ) : null} + {environment.runsEnabled && env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( + + ) : null} -
-
- paused : undefined} - animate - accessory={ -
- {environment.runsEnabled && - env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( - - ) : null} - -
- } - valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"} - compactThreshold={1000000} - /> - - Including {environment.running - environment.concurrencyLimit} burst runs{" "} - - - ) : limitStatus === "limit" ? ( - "At concurrency limit" - ) : undefined - } - accessory={ - - } - compactThreshold={1000000} - /> - 1 ? ( - - Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "} - - - ) : undefined - } - accessory={ - plan ? ( - plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? ( - - Increase limit - - ) : ( - - Increase limit - - ) - ) : null - } - /> +
+
+ {QUEUE_HEADER_TILES.map((tile) => ( + 1 + ? [ + { + y: Math.round(environment.burstFactor * 100), + label: `Burst ${Math.round( + environment.concurrencyLimit * environment.burstFactor + )}`, + }, + ] + : []), + ] + : undefined + } + /> + ))}
{success ? ( + + replace({ view: undefined })} + > + Queues + + replace({ view: "allocation", page: undefined })} + > + Allocation + + + ) : ( +
+ )} + + {success && view === "allocation" ? ( + allocation ? ( + + ) : ( +
+ +
+ ) + ) : success ? (
- +
+ + + +
Limited by + Health + + Delay p95 + + Backlog Pause/resume @@ -518,11 +669,19 @@ export default function Page() { const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${ queue.name }`; + const queueMetric = metricsByQueue[queueFilterableName]; return ( - + + + {queue.concurrency?.overriddenAt ? ( + + + + + {queueMetric && queueMetric.p95WaitMs !== null ? ( + = 60_000 + ? "text-warning" + : "text-text-bright" + )} + > + {formatWaitMs(queueMetric.p95WaitMs)} + + ) : ( + + )} + + + v.toLocaleString()} + /> + - +
{hasFilters @@ -1059,6 +1253,709 @@ export function QueueFilters() { return ; } +const QUEUE_SORT_OPTIONS = [ + { value: "busiest", label: "Busiest" }, + { value: "queued", label: "Backlog" }, + { value: "name", label: "Name" }, +] as const; + +type QueueSortValue = (typeof QUEUE_SORT_OPTIONS)[number]["value"]; + +function QueueSortFilter() { + const { value, replace } = useSearchParams(); + const sort: QueueSortValue = (value("sort") as QueueSortValue) ?? "busiest"; + const label = QUEUE_SORT_OPTIONS.find((option) => option.value === sort)?.label ?? "Busiest"; + + return ( + + replace({ sort: next === "busiest" ? undefined : (next as string), page: undefined }) + } + > + }> + + + + {QUEUE_SORT_OPTIONS.map((option) => ( + + {option.label} + + ))} + + + ); +} + +type MetricTileRow = Record; + +type QueueHeaderTile = { + id: string; + label: string; + color: string; + query: string; + /** Formats a single bucket's value in the chart tooltip. */ + formatValue?: (value: number) => string; + derive: (rows: MetricTileRow[]) => { + sparkline: number[]; + total: number; + formatTotal?: (total: number) => string; + totalClassName?: string; + }; +}; + +function tileNumber(value: number | string | null): number { + const n = typeof value === "number" ? value : Number(value); + return Number.isFinite(n) ? n : 0; +} + +function tileTimeToMs(value: number | string | null): number { + const s = String(value).replace(" ", "T"); + return Date.parse(s.endsWith("Z") ? s : `${s}Z`); +} + +// Header tiles fetch their own TRQL query client-side (resources.metric) with fillGaps, mirroring the +// metrics dashboard widgets: the gauges (saturation inputs, backlog) carry, counters/p95 zero-fill. 
+const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ + { + id: "saturation", + label: "Env saturation", + color: "#6366F1", + query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + formatValue: (v) => `${v}%`, + derive: (rows) => { + const sparkline = rows.map((r) => { + const limit = tileNumber(r.env_limit); + return limit > 0 ? Math.round((tileNumber(r.used) / limit) * 100) : 0; + }); + const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); + return { sparkline, total: peak, formatTotal: (v) => `${v}% peak` }; + }, + }, + { + id: "backlog", + label: "Backlog", + color: "#A78BFA", + query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + derive: (rows) => { + const sparkline = rows.map((r) => tileNumber(r.queued)); + const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); + return { sparkline, total: peak, formatTotal: (v) => `${v.toLocaleString()} peak` }; + }, + }, + { + id: "p95", + label: "Scheduling delay p95", + color: "#F59E0B", + query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + formatValue: formatWaitMs, + derive: (rows) => { + const sparkline = rows.map((r) => tileNumber(r.p95)); + const worst = sparkline.reduce((max, v) => Math.max(max, v), 0); + return { + sparkline, + total: worst, + formatTotal: (v) => (v > 0 ? formatWaitMs(v) : "–"), + totalClassName: worst >= 60_000 ? 
"text-warning" : undefined, + }; + }, + }, + { + id: "throttled", + label: "Throttled", + color: "#F59E0B", + query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM env_metrics\nGROUP BY t\nORDER BY t`, + derive: (rows) => { + const sparkline = rows.map((r) => tileNumber(r.throttled)); + const total = sparkline.reduce((sum, v) => sum + v, 0); + return { + sparkline, + total, + totalClassName: total > 0 ? "text-warning" : undefined, + }; + }, + }, +]; + +type TileTimeRange = MetricResourceTimeRange; + +function QueueEnvMetricTile({ + tile, + timeRange, + referenceLines, +}: { + tile: QueueHeaderTile; + timeRange: TileTimeRange; + referenceLines?: Array<{ y: number; label?: string }>; +}) { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + const { rows, isLoading, showLoading, failed } = useMetricResourceQuery(tile.query, { + organizationId: organization.id, + projectId: project.id, + environmentId: environment.id, + timeRange, + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + fillGaps: true, + }); + + const { sparkline, total, formatTotal, totalClassName } = tile.derive(rows); + + // Same point shape the full-size charts use so the shared axis/tooltip helpers apply. + const data = rows + .map((r, i) => ({ bucket: tileTimeToMs(r.t), [tile.id]: sparkline[i] ?? 0 })) + .filter((p) => Number.isFinite(p.bucket)); + + const chartConfig = useMemo( + () => ({ [tile.id]: { label: tile.label, color: tile.color } }), + [tile.id, tile.label, tile.color] + ); + + const { tooltipLabelFormatter } = useMemo(() => buildActivityTimeAxis(data), [data]); + const hasData = data.length > 0 && sparkline.some((v) => v > 0); + + return ( + + ) : failed ? undefined : formatTotal ? ( + formatTotal(total) + ) : ( + total.toLocaleString() + ) + } + valueClassName={totalClassName} + > + + {showLoading ? ( +
+ ) : failed ? ( +
+ Unable to load metrics +
+ ) : hasData ? ( +
+ + + +
+ ) : ( +
No activity
+ )} + + ); +} + +function HeaderTile({ + label, + value, + valueClassName, + children, +}: { + label: ReactNode; + value?: ReactNode; + valueClassName?: string; + children: ReactNode; +}) { + return ( +
+
+ {label} + {value !== undefined ? ( + + {value} + + ) : null} +
+ {children} +
+ ); +} + +function QueueHealthBadge({ + paused, + running, + queued, + limit, +}: { + paused: boolean; + running: number; + queued: number; + limit: number; +}) { + if (paused) { + return ( + + Paused + + ); + } + if (running >= limit && queued > 0) { + return ( + + At capacity + + ); + } + if (queued > 0) { + return ( + + Backlogged + + ); + } + if (running > 0) { + return ( + + Active + + ); + } + return ( + + Idle + + ); +} + +function formatWaitMs(ms: number): string { + if (ms < 1000) return `${Math.round(ms)}ms`; + if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`; + if (ms < 3_600_000) return `${(ms / 60_000).toFixed(1)}m`; + return `${(ms / 3_600_000).toFixed(1)}h`; +} + +// Classic Queues page, restored verbatim from before the Queue Metrics feature. Rendered +// when queueMetricsUiEnabled is off so a gated org sees exactly the pre-metrics UI. +function ClassicQueuesView() { + const { + environment, + queues, + success, + pagination, + code, + totalQueues, + hasFilters, + autoReloadPollIntervalMs, + } = useTypedLoaderData(); + + const organization = useOrganization(); + const project = useProject(); + const env = useEnvironment(); + const plan = useCurrentPlan(); + + useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); + + const limitStatus = + environment.running === environment.concurrencyLimit * environment.burstFactor + ? "limit" + : environment.running > environment.concurrencyLimit + ? "burst" + : "within"; + + const limitClassName = + limitStatus === "burst" ? "text-warning" : limitStatus === "limit" ? "text-error" : undefined; + + return ( + + + + + + + Queues docs + + + + +
+
+ paused : undefined} + animate + accessory={ +
+ {environment.runsEnabled && + env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( + + ) : null} + +
+ } + valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"} + compactThreshold={1000000} + /> + + Including {environment.running - environment.concurrencyLimit} burst runs{" "} + + + ) : limitStatus === "limit" ? ( + "At concurrency limit" + ) : undefined + } + accessory={ + + } + compactThreshold={1000000} + /> + 1 ? ( + + Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "} + + + ) : undefined + } + accessory={ + plan ? ( + plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? ( + + Increase limit + + ) : ( + + Increase limit + + ) + ) : null + } + /> +
+ + {success ? ( +
+
+ + +
+ + + + Name + Queued + Running + Limit + +
+ Environment + + This queue is limited by your environment's concurrency limit of{" "} + {environment.concurrencyLimit}. + +
+
+ User + + This queue is limited by a concurrency limit set in your code. + +
+
+ Override + + This queue's concurrency limit has been manually overridden from the + dashboard or API. + +
+ + } + > + Limited by +
+ + Pause/resume + +
+
+ + {queues.length > 0 ? ( + queues.map((queue) => { + const limit = queue.concurrencyLimit ?? environment.concurrencyLimit; + const isAtConcurrencyLimit = queue.running >= limit; + const isAtQueueLimit = + environment.queueSizeLimit !== null && + queue.queued >= environment.queueSizeLimit; + const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${ + queue.name + }`; + return ( + + + + + {queue.concurrency?.overriddenAt ? ( + + Concurrency limit overridden + + } + content="This queue's concurrency limit has been manually overridden from the dashboard or API." + className="max-w-xs" + disableHoverableContent + /> + ) : null} + {queue.paused ? ( + + Paused + + ) : null} + {isAtQueueLimit ? ( + + At queue limit + + ) : null} + {isAtConcurrencyLimit ? ( + + At concurrency limit + + ) : null} + + + + {queue.queued} + + 0 && "text-text-bright", + isAtConcurrencyLimit && "text-warning" + )} + > + {queue.running} + + + {limit} + + + {queue.concurrency?.overriddenAt ? ( + Override + ) : queue.concurrencyLimit ? ( + "User" + ) : ( + "Environment" + )} + + + } + hiddenButtons={ + !queue.paused && + } + popoverContent={ + <> + {queue.paused ? ( + + ) : ( + + )} + + + + + + + } + /> + + ); + }) + ) : ( + + +
+ + {hasFilters + ? "No queues found matching your filters" + : "No queues found"} + +
+
+
+ )} +
+
+
+ ) : ( +
+ {totalQueues === 0 ? ( +
+ +
+ ) : code === "engine-version" ? ( + + ) : ( + Something went wrong + )} +
+ )} +
+
+
+ ); +} + function BurstFactorTooltip({ environment, }: { diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx new file mode 100644 index 00000000000..e6a21c6514f --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx @@ -0,0 +1,783 @@ +import { type MetaFunction } from "@remix-run/react"; +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { useMemo } from "react"; +import { typedjson, useTypedLoaderData } from "remix-typedjson"; +import { z } from "zod"; +import { PageBody, PageContainer } from "~/components/layout/AppLayout"; +import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; +import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis"; +import { + Chart, + type ChartConfig, + type ChartState, +} from "~/components/primitives/charts/ChartCompound"; +import { ChartCard } from "~/components/primitives/charts/ChartCard"; +import { + useMetricResourceQuery, + type MetricResourceTimeRange, +} from "~/hooks/useMetricResourceQuery"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { QueueRetrievePresenter } from "~/presenters/v3/QueueRetrievePresenter.server"; +import { + Table, + TableBody, + TableCell, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; +import { TabButton, TabContainer } from "~/components/primitives/Tabs"; +import { engine } from "~/v3/runEngine.server"; +import { TimeFilter } from "~/components/runs/v3/SharedFilters"; +import { useSearchParams } from "~/hooks/useSearchParam"; +import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; +import { 
canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; +import { requireUserId } from "~/services/session.server"; +import { cn } from "~/utils/cn"; +import { EnvironmentParamSchema } from "~/utils/pathBuilder"; + +export const meta: MetaFunction = () => [{ title: `Queue metrics | Trigger.dev` }]; + +const ParamsSchema = EnvironmentParamSchema.extend({ queueParam: z.string() }); + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const userId = await requireUserId(request); + const { organizationSlug, projectParam, envParam, queueParam } = ParamsSchema.parse(params); + + // This whole page is part of the metrics UI; gate it per-org (the list already hides + // the only link to it, this is defense in depth). + if (!(await canAccessQueueMetricsUi({ userId, organizationSlug }))) { + throw new Response(undefined, { status: 404, statusText: "Not found" }); + } + + const url = new URL(request.url); + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) throw new Response(undefined, { status: 404, statusText: "Project not found" }); + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) + throw new Response(undefined, { status: 404, statusText: "Environment not found" }); + + const retrieve = await new QueueRetrievePresenter().call({ environment, queueInput: queueParam }); + if (!retrieve.success) { + throw new Response(undefined, { status: 404, statusText: "Queue not found" }); + } + + const queue = retrieve.queue; + const fullName = queue.type === "task" ? `task/${queue.name}` : queue.name; + + const ckBreakdown = await engine.concurrencyKeyBreakdown(environment, fullName, { + limit: CK_LIVE_LIMIT, + }); + + // Charts + CH-derived stats are fetched client-side per card (see QueueDetailChartCard / + // useQueueMetric) so the drill-down renders instantly. 
The loader only returns the live + // "now" counts + identifiers the client fetches need. + return typedjson({ + queue, + fullName, + ckBreakdown, + loadedAt: Date.now(), + backPath: url.pathname.replace(/\/[^/]+$/, ""), + ids: { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + }, + }); +}; + +const COLORS = { + running: "#6366F1", + limit: "#4D525B", + queued: "#A78BFA", + p50: "#22D3EE", + p95: "#F59E0B", + p99: "#EF4444", + throttled: "#F59E0B", + ckKeys: "#34D399", + ckWait: "#F59E0B", +}; + +const CK_LIVE_LIMIT = 50; + +type Ids = { organizationId: string; projectId: string; environmentId: string }; + +type TimeRangeParams = MetricResourceTimeRange; + +const QUEUE_METRICS_DEFAULT_PERIOD = "1d"; + +export default function Page() { + const { queue, fullName, ckBreakdown, loadedAt, backPath, ids } = + useTypedLoaderData(); + const plan = useCurrentPlan(); + const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number; + + const { value, replace } = useSearchParams(); + const timeRange: TimeRangeParams = { + period: value("period") ?? null, + from: value("from") ?? null, + to: value("to") ?? null, + }; + + // The Concurrency keys tab exists only for queues with key activity: live keys in the + // ckIndex, or nonzero CK history in the selected range (one cached scalar query decides). + const { rows: gateRows, showLoading: gateLoading } = useQueueMetric( + `SELECT max(max_ck_backlogged) AS peak_keys, max(max_ck_wait_ms) AS peak_wait\nFROM queue_metrics`, + { ids, timeRange, queueName: fullName } + ); + const gateRow = gateRows[0]; + const hasHistory = gateRow + ? toNumber(gateRow.peak_keys) > 0 || toNumber(gateRow.peak_wait) > 0 + : false; + const showKeysTab = ckBreakdown.keys.length > 0 || (!gateLoading && hasHistory); + const view = value("view") === "keys" && showKeysTab ? "keys" : "overview"; + + return ( + + + + + +
+
+ + +
+ + {showKeysTab && ( + + replace({ view: undefined, key: undefined })} + > + Overview + + replace({ view: "keys" })} + > + Concurrency keys + + + )} + + {view === "keys" ? ( + + ) : ( + + )} +
+
+
+ ); +} + +function OverviewCharts({ + ids, + timeRange, + queueName, +}: { + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}) { + return ( + <> + + + + + + ); +} + +type CkBreakdown = { + totalBackloggedKeys: number; + keys: Array<{ + concurrencyKey: string; + queued: number; + running: number; + oldestEnqueuedAt: number; + }>; +}; + +function ConcurrencyKeysView({ + breakdown, + loadedAt, + ids, + timeRange, + queueName, +}: { + breakdown: CkBreakdown; + loadedAt: number; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}) { + return ( + <> + + + + + + + ); +} + +// TRQL string literal escape (standard SQL doubling). +function trqlString(value: string): string { + return value.replace(/'/g, "''"); +} + +const KEY_SERIES_COLORS = [ + "#34D399", + "#6366F1", + "#F59E0B", + "#22D3EE", + "#A78BFA", + "#EF4444", + "#F472B6", + "#84CC16", +]; + +type GroupedKeyChartProps = { + title: string; + /** Aggregate expression ranking keys over the whole range (top 8 charted). */ + rankExpr: string; + /** Aggregate expression charted per (bucket, key). */ + seriesExpr: string; + fillGaps?: boolean; + valueFormat?: (value: number) => string; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}; + +// Two-step top-N: rank keys over the range, then chart those keys as grouped series +// (the per-key table is activity-bound, so ranking is a cheap scan). 
+function GroupedKeyChartCard(props: GroupedKeyChartProps) { + const { rows, showLoading, failed } = useQueueMetric( + `SELECT concurrency_key, ${props.rankExpr} AS peak\nFROM queue_metrics_by_key\nGROUP BY concurrency_key\nORDER BY peak DESC\nLIMIT 8`, + { ids: props.ids, timeRange: props.timeRange, queueName: props.queueName } + ); + const keys = useMemo( + () => rows.filter((r) => toNumber(r.peak) > 0).map((r) => String(r.concurrency_key)), + [rows] + ); + + if (showLoading || failed || keys.length === 0) return null; + return ; +} + +function GroupedKeySeries({ + keys, + title, + seriesExpr, + fillGaps, + valueFormat, + ids, + timeRange, + queueName, +}: GroupedKeyChartProps & { keys: string[] }) { + const inList = keys.map((k) => `'${trqlString(k)}'`).join(", "); + const { rows, showLoading, failed } = useQueueMetric( + `SELECT timeBucket() AS t, concurrency_key, ${seriesExpr} AS v\nFROM queue_metrics_by_key\nWHERE concurrency_key IN (${inList})\nGROUP BY t, concurrency_key\nORDER BY t`, + { ids, timeRange, queueName, fillGaps } + ); + + const data = useMemo(() => { + const buckets = new Map>(); + for (const r of rows) { + const bucket = clickhouseTimeToMs(r.t); + if (!Number.isFinite(bucket)) continue; + let point = buckets.get(bucket); + if (!point) { + point = { bucket } as { bucket: number } & Record; + buckets.set(bucket, point); + } + point[String(r.concurrency_key)] = toNumber(r.v); + } + return [...buckets.values()].sort((a, b) => a.bucket - b.bucket); + }, [rows]); + + const chartConfig = useMemo(() => { + const cfg: ChartConfig = {}; + keys.forEach((k, i) => { + cfg[k] = { label: k, color: KEY_SERIES_COLORS[i % KEY_SERIES_COLORS.length]! }; + }); + return cfg; + }, [keys]); + + const { tickFormatter, tooltipLabelFormatter } = useMemo( + () => buildActivityTimeAxis(data), + [data] + ); + const state: ChartState = showLoading ? "loading" : failed ? "invalid" : undefined; + + return ( +
+ + + valueFormat(v) } : undefined} + tooltipLabelFormatter={tooltipLabelFormatter} + tooltipValueFormatter={valueFormat} + /> + + +
+ ); +} + +type KeyRangeStats = { started: number; peakBacklog: number; meanWaitMs: number }; + +// Live breakdown (queued/running now, oldest wait) merged with per-key range stats from +// the history tier; keys with history but no live backlog still appear. Clicking a key +// pins the drill-down charts via the `key` search param. +function KeyStatsTable({ + breakdown, + loadedAt, + ids, + timeRange, + queueName, +}: { + breakdown: CkBreakdown; + loadedAt: number; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}) { + const { value, replace, del } = useSearchParams(); + const selectedKey = value("key"); + + const { rows, showLoading } = useQueueMetric( + `SELECT concurrency_key,\n deltaSumTimestampMerge(started_delta) AS started,\n max(max_queued) AS peak_backlog,\n if(sum(wait_ms_count) > 0, round(sum(wait_ms_sum) / sum(wait_ms_count)), 0) AS mean_wait\nFROM queue_metrics_by_key\nGROUP BY concurrency_key\nORDER BY peak_backlog DESC\nLIMIT 50`, + { ids, timeRange, queueName } + ); + + const merged = useMemo(() => { + const range = new Map(); + for (const r of rows) { + range.set(String(r.concurrency_key), { + started: toNumber(r.started), + peakBacklog: toNumber(r.peak_backlog), + meanWaitMs: toNumber(r.mean_wait), + }); + } + const liveKeys = new Set(breakdown.keys.map((k) => k.concurrencyKey)); + const live = breakdown.keys.map((k) => ({ + key: k.concurrencyKey, + queued: k.queued, + running: k.running, + oldestWaitMs: Math.max(0, loadedAt - k.oldestEnqueuedAt), + range: range.get(k.concurrencyKey), + })); + const historyOnly = [...range.entries()] + .filter(([key]) => !liveKeys.has(key)) + .map(([key, stats]) => ({ + key, + queued: 0, + running: 0, + oldestWaitMs: null as number | null, + range: stats, + })); + return [...live, ...historyOnly].slice(0, 50); + }, [rows, breakdown, loadedAt]); + + if (merged.length === 0) return null; + + return ( + <> +
+
+
Concurrency keys
+
+ {breakdown.totalBackloggedKeys > 0 + ? `${breakdown.totalBackloggedKeys.toLocaleString()} ${ + breakdown.totalBackloggedKeys === 1 ? "key" : "keys" + } with queued runs now` + : "No keys with queued runs right now"} +
+
+ + + + Key + Queued now + Running now + Oldest wait + Started + Peak backlog + Mean delay + + + + {merged.map((row) => ( + (selectedKey === row.key ? del("key") : replace({ key: row.key }))} + > + {row.key} + {row.queued.toLocaleString()} + {row.running.toLocaleString()} + + {row.oldestWaitMs === null ? "–" : formatWaitMs(row.oldestWaitMs)} + + + {row.range ? row.range.started.toLocaleString() : showLoading ? "…" : "–"} + + + {row.range ? row.range.peakBacklog.toLocaleString() : showLoading ? "…" : "–"} + + + {row.range && row.range.meanWaitMs > 0 ? formatWaitMs(row.range.meanWaitMs) : "–"} + + + ))} + +
+
+ {selectedKey && ( + + )} + + ); +} + +function KeyDrilldown({ + keyName, + ids, + timeRange, + queueName, +}: { + keyName: string; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}) { + const pin = `concurrency_key = '${trqlString(keyName)}'`; + return ( + <> + + + 0, round(sum(wait_ms_sum) / sum(wait_ms_count)), 0) AS wait\nFROM queue_metrics_by_key\nWHERE ${pin}\nGROUP BY t\nORDER BY t`} + ids={ids} + timeRange={timeRange} + queueName={queueName} + valueFormat={formatWaitMs} + series={[{ key: "wait", label: "Mean delay", color: COLORS.p95 }]} + /> + + ); +} + +function useQueueMetric( + query: string, + opts: { ids: Ids; timeRange: TimeRangeParams; queueName: string; fillGaps?: boolean } +) { + return useMetricResourceQuery(query, { + ...opts.ids, + timeRange: opts.timeRange, + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + queues: [opts.queueName], + fillGaps: opts.fillGaps, + }); +} + +function toNumber(value: number | string | null | undefined): number { + const n = typeof value === "number" ? value : Number(value); + return Number.isFinite(n) ? n : 0; +} + +function clickhouseTimeToMs(value: unknown): number { + const s = String(value).replace(" ", "T"); + return Date.parse(s.endsWith("Z") ? 
s : `${s}Z`); +} + +type SeriesConfig = { key: string; label: string; color: string }; + +function QueueDetailChartCard({ + title, + query, + series, + ids, + timeRange, + queueName, + valueFormat, + fillGaps, +}: { + title: string; + query: string; + series: SeriesConfig[]; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; + valueFormat?: (value: number) => string; + fillGaps?: boolean; +}) { + const { rows, showLoading, failed } = useQueueMetric(query, { + ids, + timeRange, + queueName, + fillGaps, + }); + + const data = useMemo(() => { + return rows + .map((r) => { + const point: { bucket: number } & Record = { + bucket: clickhouseTimeToMs(r.t), + }; + for (const s of series) point[s.key] = toNumber(r[s.key]); + return point; + }) + .filter((p) => Number.isFinite(p.bucket)); + }, [rows, series]); + + const chartConfig = useMemo(() => { + const cfg: ChartConfig = {}; + for (const s of series) cfg[s.key] = { label: s.label, color: s.color }; + return cfg; + }, [series]); + + const { tickFormatter, tooltipLabelFormatter } = useMemo( + () => buildActivityTimeAxis(data), + [data] + ); + + const state: ChartState = showLoading ? "loading" : failed ? "invalid" : undefined; + + return ( +
+ + s.key)} + state={state} + fillContainer + > + valueFormat(v) } : undefined} + tooltipLabelFormatter={tooltipLabelFormatter} + tooltipValueFormatter={valueFormat} + /> + + +
+ ); +} + +function QueueStats({ + queue, + ids, + timeRange, + queueName, +}: { + queue: { running: number; queued: number }; + ids: Ids; + timeRange: TimeRangeParams; + queueName: string; +}) { + // One scalar query feeds the CH-derived stats; the "now" counts come from the loader (live). + const { rows, showLoading } = useQueueMetric( + `SELECT max(max_limit) AS lim, max(max_queued) AS peak_queued, deltaSumTimestampMerge(started_delta) AS started,\n round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS worst_p95\nFROM queue_metrics`, + { ids, timeRange, queueName } + ); + const row = rows[0]; + const worstP95 = row ? toNumber(row.worst_p95) : 0; + + return ( +
+ + + + + + 0 ? formatWaitMs(worstP95) : "–"} + loading={showLoading} + className={worstP95 >= 60_000 ? "text-warning" : undefined} + /> +
+ ); +} + +function Stat({ + label, + value, + className, + loading, +}: { + label: string; + value: string; + className?: string; + loading?: boolean; +}) { + return ( +
+
{label}
+ {loading ? ( +
+ ) : ( +
{value}
+ )} +
+ ); +} + +function formatWaitMs(ms: number): string { + if (ms < 1000) return `${Math.round(ms)}ms`; + if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`; + if (ms < 3_600_000) return `${(ms / 60_000).toFixed(1)}m`; + return `${(ms / 3_600_000).toFixed(1)}h`; +} diff --git a/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts new file mode 100644 index 00000000000..69e4e8c1fac --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts @@ -0,0 +1,45 @@ +import { type ActionFunctionArgs, type LoaderFunctionArgs, json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { + probeQueueMetricsStreams, + readQueueMetricsControls, + writeQueueMetricsControls, +} from "~/v3/queueMetrics.server"; + +export async function loader({ request }: LoaderFunctionArgs) { + await requireAdminApiRequest(request); + const [controls, streams] = await Promise.all([ + readQueueMetricsControls(), + probeQueueMetricsStreams(), + ]); + return json({ controls, streams }); +} + +const BodySchema = z.object({ + enabled: z.boolean().optional(), + sampleRate: z.number().min(0).max(1).optional(), +}); + +export async function action({ request }: ActionFunctionArgs) { + await requireAdminApiRequest(request); + + if (request.method !== "POST") { + return json({ error: "Method not allowed" }, { status: 405 }); + } + + let body: unknown; + try { + body = await request.json(); + } catch { + return json({ error: "Invalid JSON body" }, { status: 400 }); + } + + const parsed = BodySchema.safeParse(body); + if (!parsed.success) { + return json({ error: "Invalid payload", details: parsed.error.issues }, { status: 400 }); + } + + await writeQueueMetricsControls(parsed.data); + return json({ ok: true, controls: await readQueueMetricsControls() }); +} diff --git a/apps/webapp/app/routes/admin.queue-metrics.tsx 
b/apps/webapp/app/routes/admin.queue-metrics.tsx new file mode 100644 index 00000000000..6deaedce66e --- /dev/null +++ b/apps/webapp/app/routes/admin.queue-metrics.tsx @@ -0,0 +1,190 @@ +import { useFetcher, useRevalidator } from "@remix-run/react"; +import { json } from "@remix-run/server-runtime"; +import { useEffect, useState } from "react"; +import { typedjson, useTypedLoaderData } from "remix-typedjson"; +import { z } from "zod"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { Header1, Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { + Table, + TableBody, + TableCell, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; +import { dashboardAction, dashboardLoader } from "~/services/routeBuilders/dashboardBuilder"; +import { + probeQueueMetricsStreams, + readQueueMetricsControls, + writeQueueMetricsControls, +} from "~/v3/queueMetrics.server"; + +export const loader = dashboardLoader({ authorization: { requireSuper: true } }, async () => { + const [controls, streams] = await Promise.all([ + readQueueMetricsControls(), + probeQueueMetricsStreams(), + ]); + return typedjson({ controls, streams }); +}); + +const BodySchema = z.object({ + enabled: z.boolean().optional(), + sampleRate: z.number().min(0).max(1).optional(), +}); + +export const action = dashboardAction( + { authorization: { requireSuper: true } }, + async ({ request }) => { + let body: unknown; + try { + body = await request.json(); + } catch { + return json({ error: "Invalid JSON body" }, { status: 400 }); + } + const parsed = BodySchema.safeParse(body); + if (!parsed.success) { + return json({ error: "Invalid payload" }, { status: 400 }); + } + await writeQueueMetricsControls(parsed.data); + return json({ success: true }); + } +); + +export default function 
AdminQueueMetricsRoute() { + const { controls, streams } = useTypedLoaderData(); + const saveFetcher = useFetcher<{ success?: boolean; error?: string }>(); + const revalidator = useRevalidator(); + + const [enabled, setEnabled] = useState(controls.enabled); + const [sampleRate, setSampleRate] = useState(String(controls.sampleRate)); + const [error, setError] = useState(null); + + useEffect(() => { + setEnabled(controls.enabled); + setSampleRate(String(controls.sampleRate)); + }, [controls.enabled, controls.sampleRate]); + + useEffect(() => { + if (saveFetcher.data?.success) { + setError(null); + revalidator.revalidate(); + } else if (saveFetcher.data?.error) { + setError(saveFetcher.data.error); + } + }, [saveFetcher.data]); + + const isSaving = saveFetcher.state === "submitting"; + + const handleSave = () => { + const rate = Number(sampleRate); + if (!Number.isFinite(rate) || rate < 0 || rate > 1) { + setError("Sample rate must be a number between 0 and 1"); + return; + } + saveFetcher.submit(JSON.stringify({ enabled, sampleRate: rate }), { + method: "POST", + encType: "application/json", + }); + }; + + const totalLag = streams.reduce((sum, s) => sum + (s.lag ?? 0), 0); + const lagUnknownCount = streams.filter((s) => s.lag === null).length; + + return ( +
+
+ Queue metrics ingest + + Live controls for the queue-metrics ingest pipeline on the run-queue Redis. Changes take + effect within ~10s across all instances (no redeploy). Watch EngineCPU on the run-queue + Redis when enabling or raising the sample rate. + + +
+ Controls + +
+ + setSampleRate(e.target.value)} + className="w-32" + /> +
+ {error && {error}} +
+ +
+
+ +
+
+ Stream health{totalLag > 0 ? ` (lag ${totalLag})` : ""} + +
+ + Depth = entries buffered in the shard stream; Lag = entries not yet delivered to the + consumer group (rising = consumer falling behind; "unknown" = entries were trimmed past + the group, i.e. data was lost); Pending = unacked entries. Gauges and counters share one + stream family on the metrics Redis. + + {lagUnknownCount > 0 && ( + + Lag is unknown on {lagUnknownCount} shard{lagUnknownCount === 1 ? "" : "s"}: entries + were trimmed past the consumer group's read position, so stream data was lost. Check + consumer health. + + )} + + + + Stream + Shard + Depth + Lag + Pending + + + + {streams.map((s) => ( + + {s.stream} + {s.shard} + {s.depth} + {s.lag ?? "unknown"} + {s.pending} + + ))} + +
+
+
+
+ ); +} diff --git a/apps/webapp/app/routes/admin.tsx b/apps/webapp/app/routes/admin.tsx index a95b016ca5b..7d24fe312fa 100644 --- a/apps/webapp/app/routes/admin.tsx +++ b/apps/webapp/app/routes/admin.tsx @@ -38,6 +38,10 @@ export default function Page() { label: "Global Feature Flags", to: "/admin/feature-flags", }, + { + label: "Queue Metrics", + to: "/admin/queue-metrics", + }, { label: "Notifications", to: "/admin/notifications", diff --git a/apps/webapp/app/routes/api.v1.query.schema.ts b/apps/webapp/app/routes/api.v1.query.schema.ts index 3e95d16818d..976fa72b267 100644 --- a/apps/webapp/app/routes/api.v1.query.schema.ts +++ b/apps/webapp/app/routes/api.v1.query.schema.ts @@ -1,7 +1,7 @@ import { json } from "@remix-run/server-runtime"; import type { ColumnSchema, TableSchema } from "@internal/tsql"; import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; -import { querySchemas } from "~/v3/querySchemas"; +import { visibleQuerySchemas } from "~/v3/querySchemas"; function serializeColumn(col: ColumnSchema) { const result: Record = { @@ -51,7 +51,7 @@ export const loader = createLoaderApiRoute( }, }, async () => { - const tables = querySchemas.map(serializeTable); + const tables = visibleQuerySchemas.map(serializeTable); return json({ tables }); } ); diff --git a/apps/webapp/app/routes/resources.metric.tsx b/apps/webapp/app/routes/resources.metric.tsx index d456ba1ce1b..5bf0ed693ad 100644 --- a/apps/webapp/app/routes/resources.metric.tsx +++ b/apps/webapp/app/routes/resources.metric.tsx @@ -50,6 +50,8 @@ const MetricWidgetQuery = z.object({ operations: z.array(z.string()).optional(), providers: z.array(z.string()).optional(), tags: z.array(z.string()).optional(), + // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters). 
+ fillGaps: z.boolean().optional(), }); export const action = async ({ request }: ActionFunctionArgs) => { @@ -85,6 +87,7 @@ export const action = async ({ request }: ActionFunctionArgs) => { operations, providers, tags: _tags, + fillGaps, } = submission.data; // Check they should be able to access it @@ -122,6 +125,7 @@ export const action = async ({ request }: ActionFunctionArgs) => { promptVersions, operations, providers, + fillGaps, // Set higher concurrency if many widgets are on screen at once customOrgConcurrencyLimit: env.METRIC_WIDGET_DEFAULT_ORG_CONCURRENCY_LIMIT, }); diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx index c1626b966d2..4a9ab462dcf 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query.ai-generate.tsx @@ -8,7 +8,7 @@ import type { AITimeFilter } from "~/routes/_app.orgs.$organizationSlug.projects import { requireUserId } from "~/services/session.server"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; import { AIQueryService } from "~/v3/services/aiQueryService.server"; -import { querySchemas } from "~/v3/querySchemas"; +import { visibleQuerySchemas } from "~/v3/querySchemas"; const RequestSchema = z.object({ prompt: z.string().min(1, "Prompt is required"), @@ -85,7 +85,7 @@ export async function action({ request, params }: ActionFunctionArgs) { const { prompt, mode, currentQuery } = submission.data; const service = new AIQueryService( - querySchemas, + visibleQuerySchemas, openai(env.AI_RUN_FILTER_MODEL ?? 
"gpt-4o-mini") ); diff --git a/apps/webapp/app/services/queryService.server.ts b/apps/webapp/app/services/queryService.server.ts index 57b877ed876..70af40ec89f 100644 --- a/apps/webapp/app/services/queryService.server.ts +++ b/apps/webapp/app/services/queryService.server.ts @@ -7,7 +7,12 @@ import { type TSQLQueryResult, } from "@internal/clickhouse"; import type { CustomerQuerySource } from "@trigger.dev/database"; -import type { TableSchema, WhereClauseCondition } from "@internal/tsql"; +import { + calculateTimeBucketInterval, + type TableSchema, + type TimeBucketInterval, + type WhereClauseCondition, +} from "@internal/tsql"; import { z } from "zod"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; @@ -110,6 +115,41 @@ export type ExecuteQueryResult = } | { success: false; error: Error }; +const INTERVAL_UNIT_SECONDS: Record = { + SECOND: 1, + MINUTE: 60, + HOUR: 3_600, + DAY: 86_400, + WEEK: 604_800, + MONTH: 2_592_000, +}; + +function floorToSeconds(date: Date, alignSeconds: number): Date { + const ms = alignSeconds * 1000; + return new Date(Math.floor(date.getTime() / ms) * ms); +} + +/** + * Swap a table for one of its rollups when the query's bucket interval is at least the + * rollup's granularity. The rollup has identical logical columns, so only the physical + * table (and therefore rows read) changes. + */ +function resolveRollup(schema: TableSchema, timeRange: { from: Date; to: Date }): TableSchema { + if (!schema.rollups || schema.rollups.length === 0) { + return schema; + } + const interval = calculateTimeBucketInterval( + timeRange.from, + timeRange.to, + schema.timeBucketThresholds + ); + const intervalSeconds = interval.value * INTERVAL_UNIT_SECONDS[interval.unit]; + const best = [...schema.rollups] + .sort((a, b) => b.minIntervalSeconds - a.minIntervalSeconds) + .find((r) => r.minIntervalSeconds <= intervalSeconds); + return best ? 
{ ...schema, clickhouseName: best.clickhouseName } : schema; +} + export async function getDefaultPeriod(organizationId: string): Promise { const idealDefaultPeriodDays = 7; const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30); @@ -183,6 +223,14 @@ export async function executeQuery( defaultPeriod, }); + // Align the time bounds so repeated auto-refresh queries produce identical query + // params and can share ClickHouse query-cache entries (params are part of the key). + const alignSeconds = matchedSchema?.queryCache?.alignSeconds; + if (alignSeconds) { + if (timeFilter.from) timeFilter.from = floorToSeconds(timeFilter.from, alignSeconds); + if (timeFilter.to) timeFilter.to = floorToSeconds(timeFilter.to, alignSeconds); + } + // Calculate the effective "from" date the user is requesting (for period clipping check) // This is null only when the user specifies just a "to" date (rare case) let requestedFromDate: Date | null = null; @@ -192,6 +240,9 @@ export async function executeQuery( // Period specified (or default) - calculate from now const periodMs = parse(timeFilter.period ?? defaultPeriod) ?? 
7 * 24 * 60 * 60 * 1000; requestedFromDate = new Date(Date.now() - periodMs); + if (alignSeconds) { + requestedFromDate = floorToSeconds(requestedFromDate, alignSeconds); + } } // Build the fallback WHERE condition based on what the user specified @@ -207,7 +258,10 @@ export async function executeQuery( } const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30); - const maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000); + let maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000); + if (alignSeconds) { + maxQueryPeriodDate = floorToSeconds(maxQueryPeriodDate, alignSeconds); + } // Check if the requested time period exceeds the plan limit const periodClipped = requestedFromDate !== null && requestedFromDate < maxQueryPeriodDate; @@ -255,6 +309,10 @@ export async function executeQuery( to: to ?? undefined, defaultPeriod, }); + if (alignSeconds) { + timeRange.from = floorToSeconds(timeRange.from, alignSeconds); + timeRange.to = floorToSeconds(timeRange.to, alignSeconds); + } try { // Build field mappings for project_ref → project_id and environment_id → slug translation @@ -277,10 +335,19 @@ export async function executeQuery( organizationId, "query" ); + // Serve coarse-bucket queries from the table's rollup when one qualifies. + const effectiveSchemas = matchedSchema?.rollups + ? querySchemas.map((s) => (s === matchedSchema ? resolveRollup(s, timeRange) : s)) + : querySchemas; + + const queryCacheSettings: ClickHouseSettings = matchedSchema?.queryCache + ? 
{ use_query_cache: 1, query_cache_ttl: matchedSchema.queryCache.ttlSeconds } + : {}; + const result = await executeTSQL(queryClickhouse.reader, { ...baseOptions, schema: z.record(z.any()), - tableSchema: querySchemas, + tableSchema: effectiveSchemas, transformValues: true, enforcedWhereClause, fieldMappings, @@ -290,6 +357,7 @@ export async function executeQuery( timeRange, clickhouseSettings: { ...getDefaultClickhouseSettings(), + ...queryCacheSettings, ...baseOptions.clickhouseSettings, // Allow caller overrides if needed }, querySettings: { diff --git a/apps/webapp/app/utils/pathBuilder.ts b/apps/webapp/app/utils/pathBuilder.ts index 187bc50b549..edd65f8bde4 100644 --- a/apps/webapp/app/utils/pathBuilder.ts +++ b/apps/webapp/app/utils/pathBuilder.ts @@ -522,6 +522,15 @@ export function v3QueuesPath( return `${v3EnvironmentPath(organization, project, environment)}/queues`; } +export function v3QueuePath( + organization: OrgForPath, + project: ProjectForPath, + environment: EnvironmentForPath, + queue: { friendlyId: string } +) { + return `${v3QueuesPath(organization, project, environment)}/${queue.friendlyId}`; +} + export function v3WaitpointTokensPath( organization: OrgForPath, project: ProjectForPath, diff --git a/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts new file mode 100644 index 00000000000..0e3c142b272 --- /dev/null +++ b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts @@ -0,0 +1,26 @@ +import { prisma } from "~/db.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; + +// Per-org gate for the Queue Metrics dashboard UI. Org override wins over the global +// FeatureFlag table value, which wins over the off-by-default. Ingestion/emission is a +// separate global flag; this only decides whether an org sees the metrics view. 
+export async function canAccessQueueMetricsUi(options: { + userId: string; + organizationSlug: string; +}): Promise { + const org = await prisma.organization.findFirst({ + where: { + slug: options.organizationSlug, + members: { some: { userId: options.userId } }, + }, + select: { featureFlags: true }, + }); + + const flag = makeFlag(); + return flag({ + key: FEATURE_FLAG.queueMetricsUiEnabled, + defaultValue: false, + overrides: (org?.featureFlags as Record) ?? {}, + }); +} diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 637830aef06..fa17e504a37 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -19,6 +19,7 @@ export const FEATURE_FLAG = { computeMigrationRequireTemplate: "computeMigrationRequireTemplate", devBranchesEnabled: "devBranchesEnabled", runOpsMintKind: "runOpsMintKind", + queueMetricsUiEnabled: "queueMetricsUiEnabled", } as const; export const FeatureFlagCatalog = { @@ -54,6 +55,9 @@ export const FeatureFlagCatalog = { // Per-org run-ops-id mint cutover. Defaults to "cuid"; only honored when // RUN_OPS_MINT_ENABLED is on AND isSplitEnabled() is true. [FEATURE_FLAG.runOpsMintKind]: z.enum(["cuid", "runOpsId"]), + // Per-org access to the Queue Metrics dashboard UI (view only; emission is global and + // separate). Off unless enabled for the org. + [FEATURE_FLAG.queueMetricsUiEnabled]: z.coerce.boolean(), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 4784ad75629..540ae670091 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -614,8 +614,333 @@ export const metricsSchema: TableSchema = { }; /** - * All available schemas for the query editor + * Schema definition for the queue_metrics table (trigger_dev.queue_metrics_v1). + * Pre-aggregated into 10-second buckets. 
Counter columns re-aggregate with sum(), + * gauges with max(), and wait_quantiles with quantilesMerge() — never FINAL. */ +export const queueMetricsSchema: TableSchema = { + name: "queue_metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + description: "Per-queue depth, concurrency, throttling, and scheduling-delay metrics", + timeConstraint: "bucket_start", + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + columns: { + environment: { + name: "environment", + clickhouseName: "environment_id", + ...column("String", { description: "The environment slug", example: "prod" }), + fieldMapping: "environment", + customRenderType: "environment", + }, + project: { + name: "project", + clickhouseName: "project_id", + ...column("String", { + description: "The project reference, they always start with `proj_`.", + example: "proj_howcnaxbfxdmwmxazktx", + }), + fieldMapping: "project", + customRenderType: "project", + }, + queue: { + name: "queue", + clickhouseName: "queue_name", + ...column("LowCardinality(String)", { + description: "The queue name", + example: "my-queue", + coreColumn: true, + }), + }, + bucket_start: { + name: "bucket_start", + ...column("DateTime", { + description: "The start of the 10-second aggregation bucket", + example: "2024-01-15 09:30:00", + coreColumn: true, + }), + }, + // Cumulative-counter delta states. Read with deltaSumTimestampMerge() (loss-tolerant, + // reset-safe), never sum(); opaque like wait_quantiles. Merging across queues is + // invalid (mixes unrelated odometers): totals must GROUP BY queue, then sum outside. + enqueue_delta: { + name: "enqueue_delta", + mergeGroupKey: "queue", + ...column("String", { + description: + "Runs enqueued (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta) grouped by queue. For totals across queues, sum the per-queue results in an outer query, never merge across queues. 
Per-bucket values can undercount by one inter-reading delta at bucket boundaries (the bridge lives in the prior bucket's state); totals over the whole range are exact.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + started_delta: { + name: "started_delta", + mergeGroupKey: "queue", + ...column("String", { + description: + "Runs dequeued/started (throughput). Read with deltaSumTimestampMerge(started_delta) grouped by queue. For totals across queues, sum the per-queue results in an outer query, never merge across queues. Per-bucket values can undercount by one inter-reading delta at bucket boundaries (the bridge lives in the prior bucket's state); totals over the whole range are exact.", + coreColumn: true, + }), + groupable: false, + sortable: false, + filterable: false, + }, + ack_delta: { + name: "ack_delta", + mergeGroupKey: "queue", + ...column("String", { + description: + "Runs acked (completed). Read with deltaSumTimestampMerge(ack_delta) grouped by queue; sum per-queue results for totals.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + nack_delta: { + name: "nack_delta", + mergeGroupKey: "queue", + ...column("String", { + description: + "Runs nacked. Read with deltaSumTimestampMerge(nack_delta) grouped by queue; sum per-queue results for totals.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + dlq_delta: { + name: "dlq_delta", + mergeGroupKey: "queue", + ...column("String", { + description: + "Runs dead-lettered. Read with deltaSumTimestampMerge(dlq_delta) grouped by queue; sum per-queue results for totals.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + throttled_count: { + name: "throttled_count", + ...column("UInt64", { + description: "Gauge emissions where running>=limit and queued>0. Aggregate with sum().", + coreColumn: true, + }), + }, + max_queued: { + name: "max_queued", + ...column("UInt32", { + description: "Peak queue depth in the bucket. 
Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_running: { + name: "max_running", + ...column("UInt32", { + description: "Peak running (concurrency) in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_limit: { + name: "max_limit", + ...column("UInt32", { + description: "The queue concurrency limit. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_env_queued: { + name: "max_env_queued", + ...column("UInt32", { + description: "Peak environment-wide queued in the bucket. Aggregate with max().", + fillMode: "carry", + }), + }, + max_env_running: { + name: "max_env_running", + ...column("UInt32", { + description: "Peak environment-wide running in the bucket. Aggregate with max().", + fillMode: "carry", + }), + }, + max_env_limit: { + name: "max_env_limit", + ...column("UInt32", { + description: "The environment concurrency limit. Aggregate with max().", + fillMode: "carry", + }), + }, + max_ck_backlogged: { + name: "max_ck_backlogged", + ...column("UInt32", { + description: + "Peak number of distinct concurrency keys with queued runs in the bucket. Aggregate with max(). Zero for queues that do not use concurrency keys.", + fillMode: "carry", + }), + }, + max_ck_wait_ms: { + name: "max_ck_wait_ms", + ...column("UInt32", { + description: + "Worst head-of-line wait (ms) across concurrency keys in the bucket: how long the most-starved key's oldest queued run has been waiting. Aggregate with max(). Zero for queues that do not use concurrency keys.", + fillMode: "carry", + }), + }, + wait_ms_sum: { + name: "wait_ms_sum", + ...column("UInt64", { + description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.", + }), + }, + wait_ms_count: { + name: "wait_ms_count", + ...column("UInt64", { + description: "Count of scheduling-delay samples. 
Aggregate with sum().", + }), + }, + wait_quantiles: { + name: "wait_quantiles", + ...column("String", { + description: + "Scheduling-delay (dequeue minus eligible-at) quantile state. Read with quantilesMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].", + }), + groupable: false, + sortable: false, + filterable: false, + }, + }, + timeBucketThresholds: [ + { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } }, + { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, + { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, + { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, + { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } }, + { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } }, + { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, + { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, + ] satisfies BucketThreshold[], + // Ranges whose bucket interval is >= 5 minutes read the 5m rollup instead (same + // logical columns, ~30x fewer rows). + rollups: [{ minIntervalSeconds: 300, clickhouseName: "trigger_dev.queue_metrics_5m_v1" }], + queryCache: { ttlSeconds: 30, alignSeconds: 30 }, +}; + +/** + * Schema definition for the env_metrics table (trigger_dev.env_metrics_v1). + * Environment-level rollup of queue_metrics with the queue dimension dropped, so + * header tiles and saturation charts cost the same regardless of how many queues + * the environment has. Keeps the full 10-second granularity: row count is + * queue-independent, so even 30-day ranges stay small. 
+ */ +export const envMetricsSchema: TableSchema = { + name: "env_metrics", + clickhouseName: "trigger_dev.env_metrics_v1", + description: + "Environment-level concurrency, saturation, throttling, and scheduling-delay metrics (10-second buckets)", + timeConstraint: "bucket_start", + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + columns: { + environment: { + name: "environment", + clickhouseName: "environment_id", + ...column("String", { description: "The environment slug", example: "prod" }), + fieldMapping: "environment", + customRenderType: "environment", + }, + project: { + name: "project", + clickhouseName: "project_id", + ...column("String", { + description: "The project reference, they always start with `proj_`.", + example: "proj_howcnaxbfxdmwmxazktx", + }), + fieldMapping: "project", + customRenderType: "project", + }, + bucket_start: { + name: "bucket_start", + ...column("DateTime", { + description: "The start of the 10-second aggregation bucket", + example: "2024-01-15 09:30:00", + coreColumn: true, + }), + }, + max_env_queued: { + name: "max_env_queued", + ...column("UInt32", { + description: "Peak environment-wide queued in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_env_running: { + name: "max_env_running", + ...column("UInt32", { + description: "Peak environment-wide running in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_env_limit: { + name: "max_env_limit", + ...column("UInt32", { + description: "The environment concurrency limit. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + throttled_count: { + name: "throttled_count", + ...column("UInt64", { + description: + "Gauge emissions where a queue was at its limit with work queued. 
Aggregate with sum().", + coreColumn: true, + }), + }, + wait_ms_sum: { + name: "wait_ms_sum", + ...column("UInt64", { + description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.", + }), + }, + wait_ms_count: { + name: "wait_ms_count", + ...column("UInt64", { + description: "Count of scheduling-delay samples. Aggregate with sum().", + }), + }, + wait_quantiles: { + name: "wait_quantiles", + ...column("String", { + description: + "Scheduling-delay quantile state (TDigest). Read with quantilesTDigestMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].", + }), + groupable: false, + sortable: false, + filterable: false, + }, + }, + timeBucketThresholds: [ + { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } }, + { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, + { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, + { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, + { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } }, + { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } }, + { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, + { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, + ] satisfies BucketThreshold[], + queryCache: { ttlSeconds: 30, alignSeconds: 30 }, +}; + /** * Schema definition for the llm_metrics table (trigger_dev.llm_metrics_v1) */ @@ -975,13 +1300,154 @@ export const llmModelsSchema: TableSchema = { }, }; +/** + * Per-concurrency-key drill-down for queues that shard work with `concurrencyKey` + * (e.g. per-tenant fairness). Rows are activity-bound: a (queue, key, bucket) row exists + * only when that key had events, so key cardinality cannot inflate the table. 
+ */ +export const queueMetricsByKeySchema: TableSchema = { + name: "queue_metrics_by_key", + clickhouseName: "trigger_dev.queue_metrics_ck_v1", + description: "Per-concurrency-key queue metrics: backlog, throughput, and wait by key", + hidden: true, + timeConstraint: "bucket_start", + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + columns: { + environment: { + name: "environment", + clickhouseName: "environment_id", + ...column("String", { description: "The environment slug", example: "prod" }), + fieldMapping: "environment", + customRenderType: "environment", + }, + project: { + name: "project", + clickhouseName: "project_id", + ...column("String", { + description: "The project reference, they always start with `proj_`.", + example: "proj_howcnaxbfxdmwmxazktx", + }), + fieldMapping: "project", + customRenderType: "project", + }, + queue: { + name: "queue", + clickhouseName: "queue_name", + ...column("LowCardinality(String)", { + description: "The queue name", + example: "my-queue", + coreColumn: true, + }), + }, + concurrency_key: { + name: "concurrency_key", + ...column("String", { + description: "The concurrency key the run was sharded by (e.g. a tenant id)", + example: "tenant-42", + coreColumn: true, + }), + }, + bucket_start: { + name: "bucket_start", + ...column("DateTime", { + description: "The start of the 10-second aggregation bucket", + example: "2024-01-15 09:30:00", + coreColumn: true, + }), + }, + enqueue_delta: { + name: "enqueue_delta", + mergeGroupKey: ["queue", "concurrency_key"], + ...column("String", { + description: + "Runs enqueued for this key (cumulative-counter delta). 
Read with deltaSumTimestampMerge(enqueue_delta) grouped by queue and concurrency_key, or with both pinned; never merge across keys.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + started_delta: { + name: "started_delta", + mergeGroupKey: ["queue", "concurrency_key"], + ...column("String", { + description: + "Runs dequeued/started for this key (throughput). Read with deltaSumTimestampMerge(started_delta) grouped by queue and concurrency_key, or with both pinned; never merge across keys.", + coreColumn: true, + }), + groupable: false, + sortable: false, + filterable: false, + }, + ack_delta: { + name: "ack_delta", + mergeGroupKey: ["queue", "concurrency_key"], + ...column("String", { + description: + "Runs acked (completed) for this key. Read with deltaSumTimestampMerge(ack_delta) grouped by queue and concurrency_key, or with both pinned.", + }), + groupable: false, + sortable: false, + filterable: false, + }, + max_queued: { + name: "max_queued", + ...column("UInt32", { + description: "Peak backlog for this key in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_running: { + name: "max_running", + ...column("UInt32", { + description: "Peak running for this key in the bucket. Aggregate with max().", + fillMode: "carry", + }), + }, + wait_ms_sum: { + name: "wait_ms_sum", + ...column("UInt64", { + description: + "Sum of scheduling delays (ms) for this key. Mean = wait_ms_sum/wait_ms_count.", + }), + }, + wait_ms_count: { + name: "wait_ms_count", + ...column("UInt64", { + description: "Count of scheduling-delay samples for this key. 
Aggregate with sum().", + }), + }, + }, + timeBucketThresholds: [ + { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } }, + { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, + { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, + { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, + { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } }, + { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } }, + { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, + { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, + ] satisfies BucketThreshold[], + queryCache: { ttlSeconds: 30, alignSeconds: 30 }, +}; + export const querySchemas: TableSchema[] = [ runsSchema, metricsSchema, llmMetricsSchema, llmModelsSchema, + queueMetricsSchema, + envMetricsSchema, + queueMetricsByKeySchema, ]; +/** Schemas shown in user-facing listings (editor autocomplete, schema docs, schema API). 
*/ +export const visibleQuerySchemas: TableSchema[] = querySchemas.filter((s) => !s.hidden); + /** * Default query for the query editor */ diff --git a/apps/webapp/app/v3/queueMetrics.server.ts b/apps/webapp/app/v3/queueMetrics.server.ts new file mode 100644 index 00000000000..14d9c4dc93d --- /dev/null +++ b/apps/webapp/app/v3/queueMetrics.server.ts @@ -0,0 +1,247 @@ +import { type ClickHouse, type QueueMetricsRawV1Input } from "@internal/clickhouse"; +import { + allStreamKeys, + CachedRedisFlag, + CachedRedisNumber, + MetricsStreamConsumer, + MetricsStreamEmitter, + probeShardStates, + type MetricDefinition, + type ShardState, + type StreamEntry, +} from "@internal/metrics-pipeline"; +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import os from "node:os"; +import { env } from "~/env.server"; +import { getDefaultClickhouseClient } from "~/services/clickhouse/clickhouseFactory.server"; +import { logger } from "~/services/logger.server"; +import { signalsEmitter } from "~/services/signals.server"; +import { singleton } from "~/utils/singleton"; +import { mapEntryToRows, QueueNameLimiter } from "./queueMetricsMapping"; +import { meter } from "./tracer.server"; + +const FLAG_KEY = "queue_metrics:enabled"; +const SAMPLE_RATE_KEY = "queue_metrics:gauge_sample_rate"; +const TRUTHY = new Set(["1", "true", "on", "enabled", "yes"]); + +// Same physical Redis as the RunQueue (host/port/auth). Stream keys are kept out of the +// keyPrefix on every access path, so only the connection details matter here. +function runQueueRedisOptions(): RedisOptions { + return { + port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? undefined, + host: env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? undefined, + username: env.RUN_ENGINE_RUN_QUEUE_REDIS_USERNAME ?? undefined, + password: env.RUN_ENGINE_RUN_QUEUE_REDIS_PASSWORD ?? undefined, + enableAutoPipelining: true, + ...(env.RUN_ENGINE_RUN_QUEUE_REDIS_TLS_DISABLED === "true" ? 
{} : { tls: {} }), + }; +} + +// Metrics stream Redis: a dedicated instance when QUEUE_METRICS_REDIS_HOST is set (so the +// metrics backlog never competes with the run queue), else the run-queue Redis. Carries BOTH +// gauges and counters — gauges are read inside the queue-op Lua and returned on the reply, +// then XADDed here by Node, so the run-queue Redis holds no metrics stream. +function metricsRedisOptions(): RedisOptions { + if (!env.QUEUE_METRICS_REDIS_HOST) return runQueueRedisOptions(); + return { + host: env.QUEUE_METRICS_REDIS_HOST, + port: env.QUEUE_METRICS_REDIS_PORT ?? undefined, + username: env.QUEUE_METRICS_REDIS_USERNAME ?? undefined, + password: env.QUEUE_METRICS_REDIS_PASSWORD ?? undefined, + enableAutoPipelining: true, + ...(env.QUEUE_METRICS_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }; +} + +// One stream family on the metrics Redis carrying both gauge snapshots and cumulative +// counter readings; one consumer group reads it. +function metricsDefinition(): MetricDefinition { + // A stalled consumer holds up to maxLen entries per shard in Redis memory: cap lower + // by default when the stream shares the queue-critical run-queue Redis. + const defaultMaxLen = env.QUEUE_METRICS_REDIS_HOST ? 8_000_000 : 2_000_000; + return { + name: "queue_metrics", + shardCount: env.QUEUE_METRICS_STREAM_SHARD_COUNT, + consumerGroup: "queue_metrics_cg", + maxLen: env.QUEUE_METRICS_COUNTER_STREAM_MAXLEN ?? defaultMaxLen, + }; +} + +// Dedicated client for the admin read/write/probe surface — works regardless of whether +// this instance runs the emitter/consumer. keyPrefix unset to match the raw control keys. 
+function adminRedis(): Redis { + return singleton("queueMetricsAdminRedis", () => + createRedisClient( + { ...runQueueRedisOptions(), keyPrefix: undefined }, + { onError: (error) => logger.error("queue metrics admin redis error", { error }) } + ) + ); +} + +function metricsAdminRedis(): Redis { + return singleton("queueMetricsCounterAdminRedis", () => + createRedisClient( + { ...metricsRedisOptions(), keyPrefix: undefined }, + { onError: (error) => logger.error("queue metrics counter admin redis error", { error }) } + ) + ); +} + +export type QueueMetricsControls = { + enabled: boolean; + enabledKeySet: boolean; + sampleRate: number; + sampleRateKeySet: boolean; + sampleRateDefault: number; +}; + +export async function readQueueMetricsControls(): Promise { + const [enabledRaw, rateRaw] = (await adminRedis().mget(FLAG_KEY, SAMPLE_RATE_KEY)) as ( + | string + | null + )[]; + const sampleRateDefault = env.QUEUE_METRICS_GAUGE_SAMPLE_RATE; + const parsed = rateRaw == null ? Number.NaN : Number(rateRaw); + return { + enabled: enabledRaw != null && TRUTHY.has(enabledRaw.trim().toLowerCase()), + enabledKeySet: enabledRaw != null, + sampleRate: Number.isFinite(parsed) ? Math.min(1, Math.max(0, parsed)) : sampleRateDefault, + sampleRateKeySet: rateRaw != null, + sampleRateDefault, + }; +} + +export async function writeQueueMetricsControls(update: { + enabled?: boolean; + sampleRate?: number; +}): Promise { + const client = adminRedis(); + const ops: Promise[] = []; + if (update.enabled !== undefined) { + ops.push(client.set(FLAG_KEY, update.enabled ? 
"1" : "0")); + } + if (update.sampleRate !== undefined) { + ops.push(client.set(SAMPLE_RATE_KEY, String(Math.min(1, Math.max(0, update.sampleRate))))); + } + await Promise.all(ops); +} + +export type LabeledShardState = ShardState & { stream: "queue_metrics" }; + +export async function probeQueueMetricsStreams(): Promise { + const def = metricsDefinition(); + const states = await probeShardStates(metricsAdminRedis(), allStreamKeys(def), def.consumerGroup); + return states.map((s) => ({ ...s, stream: "queue_metrics" as const })); +} + +/** Injected into the RunQueue when QUEUE_METRICS_EMIT_ENABLED=1; emits only while the flag is on. */ +export function getQueueMetricsEmitter(): MetricsStreamEmitter { + return singleton("queueMetricsEmitter", () => { + // Control keys stay on the run-queue Redis (the admin surface + docs point there). + const controlRedis = runQueueRedisOptions(); + const flag = new CachedRedisFlag({ redis: controlRedis, key: FLAG_KEY, cacheTtlMs: 10_000 }); + // Live-tunable (Redis key, 10s cache); the env value is the default when the key is unset. 
+ const gaugeSampleRate = new CachedRedisNumber({ + redis: controlRedis, + key: SAMPLE_RATE_KEY, + defaultValue: env.QUEUE_METRICS_GAUGE_SAMPLE_RATE, + min: 0, + max: 1, + cacheTtlMs: 10_000, + }); + return new MetricsStreamEmitter({ + redis: metricsRedisOptions(), + definition: metricsDefinition(), + flag, + meter, + gaugeSampleRate, + counterOdometerTtlMs: env.QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS * 1000, + }); + }); +} + +const queueNameLimiter = singleton( + "queueMetricsQueueNameLimiter", + () => new QueueNameLimiter(env.QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV) +); + +const concurrencyKeyLimiter = singleton( + "queueMetricsConcurrencyKeyLimiter", + () => new QueueNameLimiter(env.QUEUE_METRICS_MAX_CONCURRENCY_KEYS_PER_QUEUE, 50_000) +); + +function mapEntry(entry: StreamEntry): QueueMetricsRawV1Input[] { + return mapEntryToRows(entry, { + queueNames: queueNameLimiter, + concurrencyKeys: concurrencyKeyLimiter, + }); +} + +function makeInsert(): ( + rows: QueueMetricsRawV1Input[], + opts: { dedupToken: string } +) => Promise { + const ch: ClickHouse = getDefaultClickhouseClient(); + const insertRaw = ch.queueMetrics.insertRaw; + return async (rows, { dedupToken }) => { + const [error] = await insertRaw(rows, { + params: { + clickhouse_settings: { + insert_deduplication_token: dedupToken, + async_insert: 0, + // Propagate the token through the MV so a raw-deduped retry can't leave + // queue_metrics_v1 short when the MV insert failed on the first attempt. 
+ deduplicate_blocks_in_dependent_materialized_views: 1, + }, + }, + }); + if (error) throw error; + }; +} + +function getQueueMetricsConsumers(): MetricsStreamConsumer[] { + return singleton("queueMetricsConsumers", () => { + const insert = makeInsert(); + return [ + new MetricsStreamConsumer({ + consumerName: `${os.hostname()}-${process.pid}`, + batchSize: env.QUEUE_METRICS_CONSUMER_BATCH_SIZE, + meter, + mapEntry, + insert, + redis: metricsRedisOptions(), + definition: metricsDefinition(), + }), + ]; + }); +} + +// Construct the emitter at boot (not lazily on the first enqueue) so its flag has warmed +// before any traffic — otherwise the first op after boot reads the default and is dropped. +export function initQueueMetricsEmitter(): void { + if (env.QUEUE_METRICS_EMIT_ENABLED !== "1") return; + getQueueMetricsEmitter(); +} + +declare global { + // eslint-disable-next-line no-var + var __queueMetricsConsumerRegistered__: boolean | undefined; +} + +export function initQueueMetricsConsumer(): void { + if (env.QUEUE_METRICS_CONSUMER_ENABLED !== "1") return; + if (global.__queueMetricsConsumerRegistered__) return; + global.__queueMetricsConsumerRegistered__ = true; + + const consumers = getQueueMetricsConsumers(); + const stop = () => + Promise.all(consumers.map((c) => c.stop())).catch((error) => + logger.error("queue metrics consumer stop failed", { error }) + ); + signalsEmitter.on("SIGTERM", stop); + signalsEmitter.on("SIGINT", stop); + + Promise.all(consumers.map((c) => c.start())) + .then(() => logger.info("Queue metrics consumer started")) + .catch((error) => logger.error("queue metrics consumers failed to start", { error })); +} diff --git a/apps/webapp/app/v3/queueMetricsMapping.ts b/apps/webapp/app/v3/queueMetricsMapping.ts new file mode 100644 index 00000000000..9433b361a88 --- /dev/null +++ b/apps/webapp/app/v3/queueMetricsMapping.ts @@ -0,0 +1,164 @@ +import { type QueueMetricsRawV1Input } from "@internal/clickhouse"; +import { entryOrderKey, 
entryTimeMs, type StreamEntry } from "@internal/metrics-pipeline"; + +const OPS = new Set(["gauge", "enqueue", "started", "ack", "nack", "dlq"]); + +// {org:ORGID}:proj:PROJECTID:env:ENVID:queue:QUEUENAME[:ck:CK]. Anchored (not a +// positional split) so a queue name containing ":" survives; the lazy name capture +// stops before an optional ":ck:" suffix, which is captured (the ":ck:*" wildcard of +// aggregate CK-dequeue gauges maps to no key). +const DESCRIPTOR = /^\{org:([^}]+)\}:proj:([^:]+):env:([^:]+):queue:(.+?)(?::ck:(.+))?$/; + +export function descriptorFromQueue(q: string): { + organization_id: string; + project_id: string; + environment_id: string; + queue_name: string; + concurrency_key: string; +} | null { + const match = DESCRIPTOR.exec(q); + if (!match) return null; + const ck = match[5]; + return { + organization_id: match[1]!, + project_id: match[2]!, + environment_id: match[3]!, + queue_name: match[4]!, + concurrency_key: ck && ck !== "*" ? ck : "", + }; +} + +export const OVERFLOW_QUEUE_NAME = "__overflow__"; + +/** + * Bounds per-scope name cardinality (both queue_name per env and concurrency_key per + * queue are user-controlled GROUP BY keys). Names beyond the cap map to OVERFLOW_QUEUE_NAME. + * Per-process and reset on restart, so the cap is approximate: a protective bound, not a quota. 
+ */ +export class QueueNameLimiter { + private readonly byScope = new Map>(); + + constructor( + private readonly maxPerScope: number, + private readonly maxScopes = 10_000 + ) {} + + limit(scope: string, name: string): string { + if (this.maxPerScope <= 0) return name; + let names = this.byScope.get(scope); + if (!names) { + if (this.byScope.size >= this.maxScopes) { + const oldest = this.byScope.keys().next().value; + if (oldest !== undefined) this.byScope.delete(oldest); + } + names = new Set(); + this.byScope.set(scope, names); + } + if (names.has(name)) return name; + if (names.size >= this.maxPerScope) return OVERFLOW_QUEUE_NAME; + names.add(name); + return name; + } +} + +function num(value: string | undefined): number | undefined { + if (value == null) return undefined; + const n = Number(value); + return Number.isFinite(n) ? n : undefined; +} + +export type QueueMetricsLimiters = { + queueNames?: QueueNameLimiter; + concurrencyKeys?: QueueNameLimiter; +}; + +/** + * One stream entry maps to 1..2 raw rows: gauges are single rows carrying their parsed + * concurrency_key; a counter entry yields a base row when `cum` is present plus a per-key + * row when `ck`/`ckcum` are present (the emitter's dual-odometer entry). Baseline entries + * carry only one of the two, by design. + */ +export function mapEntryToRows( + entry: StreamEntry, + limiters?: QueueMetricsLimiters +): QueueMetricsRawV1Input[] { + const f = entry.fields; + const op = f.op; + if (!op || !OPS.has(op) || !f.q) return []; + const descriptor = descriptorFromQueue(f.q); + if (!descriptor || !descriptor.queue_name) return []; + + let queueOverflowed = false; + if (limiters?.queueNames) { + descriptor.queue_name = limiters.queueNames.limit( + descriptor.environment_id, + descriptor.queue_name + ); + queueOverflowed = descriptor.queue_name === OVERFLOW_QUEUE_NAME; + } + + // Counter entries carry the key as a field (q is base-normalized); gauges carry it in q. 
+ let ck = descriptor.concurrency_key || (typeof f.ck === "string" ? f.ck : ""); + if (ck && limiters?.concurrencyKeys) { + const scope = `${descriptor.environment_id}:${descriptor.queue_name}`; + if (limiters.concurrencyKeys.limit(scope, ck) === OVERFLOW_QUEUE_NAME) ck = ""; + } + // Overflowed queue names share one row; per-key attribution under them is meaningless. + if (queueOverflowed) ck = ""; + + const eventMs = entryTimeMs(entry.id) ?? Date.now(); + const eventTime = new Date(eventMs).toISOString().slice(0, 19).replace("T", " "); + const base = { + organization_id: descriptor.organization_id, + project_id: descriptor.project_id, + environment_id: descriptor.environment_id, + queue_name: descriptor.queue_name, + event_time: eventTime, + op: op as QueueMetricsRawV1Input["op"], + }; + + if (op === "gauge") { + return [ + { + ...base, + concurrency_key: ck, + queued: num(f.ql), + running: num(f.cc), + queue_limit: num(f.lim), + env_queued: num(f.eql), + env_running: num(f.ec), + env_limit: num(f.elim), + throttled: num(f.thr), + ck_backlogged: num(f.ckq), + ck_max_wait_ms: num(f.ckw), + }, + ]; + } + + // Overflowed names drop counters entirely: merging distinct odometers under one shared + // name produces garbage deltas (gauges above stay, max across the overflow set is + // still meaningful). + if (queueOverflowed) return []; + + const rows: QueueMetricsRawV1Input[] = []; + const orderKey = entryOrderKey(entry.id); + const waitMs = op === "started" && f.wait != null ? num(f.wait) : undefined; + if (f.cum != null) { + rows.push({ + ...base, + cumulative: num(f.cum), + order_key: orderKey, + ...(waitMs !== undefined ? { wait_ms: waitMs } : {}), + }); + } + if (ck && f.ckcum != null) { + rows.push({ + ...base, + concurrency_key: ck, + cumulative: num(f.ckcum), + order_key: orderKey, + ...(waitMs !== undefined ? 
{ wait_ms: waitMs } : {}), + }); + } + return rows; +} diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts index 4d9e263d6be..85986933290 100644 --- a/apps/webapp/app/v3/runEngine.server.ts +++ b/apps/webapp/app/v3/runEngine.server.ts @@ -7,6 +7,7 @@ import { logger } from "~/services/logger.server"; import { defaultMachine, getCurrentPlan } from "~/services/platform.v3.server"; import { singleton } from "~/utils/singleton"; import { allMachines } from "./machinePresets.server"; +import { getQueueMetricsEmitter } from "./queueMetrics.server"; import { runEnginePendingVersionLookup } from "./runEnginePendingVersionLookup.server"; import { pickRunOpsStoreForCompletion } from "./runOpsMigration/crossSeamGuard.server"; import { runEngineControlPlaneResolver } from "./runOpsMigration/runEngineControlPlaneResolver.server"; @@ -83,6 +84,7 @@ function createRunEngine() { tracer, }, shardCount: env.RUN_ENGINE_RUN_QUEUE_SHARD_COUNT, + queueMetrics: env.QUEUE_METRICS_EMIT_ENABLED === "1" ? 
getQueueMetricsEmitter() : undefined, processWorkerQueueDebounceMs: env.RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS, dequeueBlockingTimeoutSeconds: env.RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS, masterQueueConsumersIntervalMs: env.RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS, diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 643093624b4..90dc92447f7 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -17,6 +17,7 @@ "typecheck": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" tsc --noEmit -p ./tsconfig.check.json", "db:seed": "tsx seed.ts", "db:seed:ai-spans": "tsx seed-ai-spans.mts", + "db:seed:queue-metrics": "tsx seed-queue-metrics.mts", "upload:sourcemaps": "bash ./upload-sourcemaps.sh", "test": "vitest --no-file-parallelism", "eval:dev": "evalite watch" @@ -57,6 +58,7 @@ "@internal/dashboard-agent": "workspace:*", "@internal/dashboard-agent-db": "workspace:*", "@internal/llm-model-catalog": "workspace:*", + "@internal/metrics-pipeline": "workspace:*", "@internal/redis": "workspace:*", "@internal/run-engine": "workspace:*", "@internal/run-ops-database": "workspace:*", diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts new file mode 100644 index 00000000000..709ba8f25ed --- /dev/null +++ b/apps/webapp/seed-queue-metrics.mts @@ -0,0 +1,947 @@ +import { prisma } from "./app/db.server"; +import { createOrganization } from "./app/models/organization.server"; +import { createProject } from "./app/models/project.server"; +import { ClickHouse } from "@internal/clickhouse"; +import type { QueueMetricsRawV1Input } from "@internal/clickhouse"; +import { generateFriendlyId } from "./app/v3/friendlyIdentifiers"; + +// Queue metrics simulator: writes realistic raw rows into a synthetic tenant's +// queue_metrics_raw_v1 and lets the MV build queue_metrics_v1 (the same path the real +// consumer uses), so the dashboard can be built without the run engine. See TRI-10407. 
const ORG_TITLE = "Queue Metrics Dev";
const PROJECT_NAME = "queue-metrics-demo";

/** Source of uniform random floats; every generator in this file is built by `mulberry32`. */
type Rng = () => number;

/** Behaviour of one simulated queue. */
type QueueProfile = {
  name: string;
  /** Queue concurrency limit for the given bucket index. */
  limit: (bucket: number) => number;
  /** Expected new runs enqueued in the given bucket. */
  arrivals: (bucket: number, rng: Rng) => number;
  /** Floor of the median scheduling delay, in milliseconds, before backlog is factored in. */
  waitBaseMs: number;
  /** Emit no rows when the queue is fully idle (tests carry-forward gaps). */
  sparse?: boolean;
  /** Concurrency-key queue: adds CK-health gauge fields + live ckIndex staging (--usage). */
  ck?: {
    backlogged: (bucket: number, rng: Rng) => number;
    maxWaitMs: (bucket: number, rng: Rng) => number;
  };
};

/** A named set of queue profiles sharing one environment concurrency limit. */
type Scenario = {
  description: string;
  envLimit: (bucket: number) => number;
  queues: QueueProfile[];
};

// ---------------------------------------------------------------------------
// CLI args
// ---------------------------------------------------------------------------

/**
 * Parses `--flag value`, `--flag=value` and bare `--flag` tokens into a string map.
 *
 * A bare flag (one followed by another `--` token, an empty token, or nothing) is
 * stored as the string "true". Tokens that do not start with `--` and were not
 * consumed as a value are ignored.
 *
 * @param argv - Command-line tokens, typically `process.argv.slice(2)`.
 * @returns Flag name (without the leading dashes) to raw string value.
 */
function parseArgs(argv: string[]) {
  const flags: Record<string, string> = {};
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (!token.startsWith("--")) continue;
    const key = token.slice(2);

    // `--key=value`: previously stored as a flag literally named "key=value".
    const eq = key.indexOf("=");
    if (eq > 0) {
      flags[key.slice(0, eq)] = key.slice(eq + 1);
      continue;
    }

    const next = argv[i + 1];
    if (next && !next.startsWith("--")) {
      flags[key] = next;
      i++; // the value token has been consumed
    } else {
      flags[key] = "true";
    }
  }
  return flags;
}

/** Seconds per duration unit accepted by `parseDuration`. */
const DURATION_UNIT_SECONDS = { s: 1, m: 60, h: 3600, d: 86400 } as const;

/**
 * Converts a duration such as `30m`, `6h`, `1d` or a bare `45` into seconds.
 *
 * @param s - Whole number optionally followed by whitespace and one of `s|m|h|d`;
 *   a missing unit means seconds.
 * @returns The duration in seconds.
 * @throws Error with message `bad duration: <input>` when the text does not match.
 */
function parseDuration(s: string): number {
  const m = /^(\d+)\s*(s|m|h|d)?$/.exec(s);
  if (!m) throw new Error(`bad duration: ${s}`);
  // The regex only admits these four letters, so the key assertion is sound.
  const unit = (m[2] ?? "s") as keyof typeof DURATION_UNIT_SECONDS;
  return Number(m[1]) * DURATION_UNIT_SECONDS[unit];
}

// ---------------------------------------------------------------------------
// Deterministic RNG + distributions
// ---------------------------------------------------------------------------

/**
 * Builds a seeded generator of floats in [0, 1); the same seed always yields the
 * same sequence, which is what makes `--seed` reproducible.
 */
function mulberry32(seed: number): Rng {
  let a = seed >>> 0;
  return () => {
    a |= 0;
    a = (a + 0x6d2b79f5) | 0;
    let t = Math.imul(a ^ (a >>> 15), 1 | a);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/** Draws one standard-normal sample via the Box-Muller transform (two uniform draws, zeros rejected). */
function standardNormal(rng: Rng): number {
  let u = 0;
  let v = 0;
  while (u === 0) u = rng(); // log(0) is -Infinity, so redraw
  while (v === 0) v = rng();
  return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
}

/** Draws a log-normal sample whose median is `medianMs` (clamped to at least 1) with shape `sigma`. */
function lognormal(medianMs: number, sigma: number, rng: Rng): number {
  return Math.exp(Math.log(Math.max(medianMs, 1)) + sigma * standardNormal(rng));
}

/**
 * Draws a Poisson-distributed count with mean `lambda`.
 *
 * Returns 0 for non-positive means. Above 30 it switches to a rounded normal
 * approximation clamped at 0; otherwise it multiplies uniform draws until the
 * product falls to `exp(-lambda)`.
 */
function poisson(lambda: number, rng: Rng): number {
  if (lambda <= 0) return 0;
  if (lambda > 30) return Math.max(0, Math.round(lambda + standardNormal(rng) * Math.sqrt(lambda)));
  const L = Math.exp(-lambda);
  let k = 0;
  let p = 1;
  do {
    k++;
    p *= rng();
  } while (p > L);
  return k - 1;
}

/** Formats a date as `YYYY-MM-DD HH:MM:SS` taken from its UTC ISO string (seconds precision). */
function formatChDateTime(date: Date): string {
  return date.toISOString().slice(0, 19).replace("T", " ");
}

// ---------------------------------------------------------------------------
// Scenarios
// ---------------------------------------------------------------------------

const steady = (): QueueProfile[] => [
  { name: "emails", limit: () => 20, arrivals: (_b, r) => poisson(12, r), waitBaseMs: 40 },
  { name: "webhooks", limit: () => 15, arrivals: (_b, r) => poisson(9, r), waitBaseMs: 40 },
  { name: "reports", limit: () => 10, arrivals: (_b, r) => poisson(5, r), waitBaseMs: 60 },
];

// periodic bursts every ~30 buckets
const bursty = (name: string, limit: number, base: number): QueueProfile => ({
  name,
  limit: () => limit,
  arrivals: (b, r) => poisson(b % 30 < 4 ? base * 5 : base, r),
  waitBaseMs: 50,
});

/**
 * Scenario builders keyed by the `--scenario` name. Each receives the number of
 * backfilled buckets and the bucket length in seconds; most ignore both.
 */
const scenarios: Record<string, (totalBuckets: number, bucketSec: number) => Scenario> = {
  steady: () => ({
    description: "all queues below capacity, no throttling",
    envLimit: () => 60,
    queues: steady(),
  }),

  burst: () => ({
    description: "periodic arrival bursts -> backlog + wait spikes + throttling",
    envLimit: () => 60,
    queues: [bursty("ingest", 20, 6), bursty("transform", 20, 7)],
  }),

  // Tela case: sum of per-queue limits far exceeds the env limit, so queues compete.
  "over-allocated-env": () => ({
    description: "Sum(queue limits)=120 >> env limit=40; env saturates, queues env-limited",
    envLimit: () => 40,
    queues: Array.from({ length: 6 }, (_v, i) => ({
      name: `worker-${i + 1}`,
      limit: () => 20,
      arrivals: (_b: number, r: Rng) => poisson(14, r),
      waitBaseMs: 50,
    })),
  }),

  "single-queue-starves-others": () => ({
    description: "one greedy queue consumes most of a small env limit, starving the rest",
    envLimit: () => 30,
    queues: [
      { name: "greedy", limit: () => 40, arrivals: (_b, r) => poisson(45, r), waitBaseMs: 60 },
      { name: "polite-1", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
      { name: "polite-2", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
    ],
  }),

  "throttled-backlog": () => ({
    description:
      "arrival rate persistently above the queue limit -> permanent backlog + throttling",
    envLimit: () => 50,
    queues: [
      { name: "overloaded", limit: () => 10, arrivals: (_b, r) => poisson(16, r), waitBaseMs: 80 },
    ],
  }),

  "idle-sparse": () => ({
    description: "sparse arrivals with many empty buckets (carry-forward gaps)",
    envLimit: () => 50,
    queues: Array.from({ length: 4 }, (_v, i) => ({
      name: `sparse-${i + 1}`,
      limit: () => 5,
      arrivals: (_b: number, r: Rng) => (r() < 0.12 ? poisson(3, r) : 0),
      waitBaseMs: 30,
      sparse: true,
    })),
  }),

  "spike-then-drain": (totalBuckets) => ({
    description: "heavy arrivals for the first third, then zero; backlog builds then drains",
    envLimit: () => 60,
    queues: [
      {
        name: "batch-job",
        limit: () => 15,
        arrivals: (b, r) => (b < totalBuckets / 3 ? poisson(30, r) : 0),
        waitBaseMs: 70,
      },
    ],
  }),

  // Pagination + relevance-ranking design surface: one runaway queue, a busy-but-healthy
  // head, a bursty middle, and a long sparse tail across 61 queues (the list pages at 25).
  "many-queues": () => ({
    description:
      "61 queues: one runaway, busy head, bursty middle, long sparse tail (pagination + ranking)",
    envLimit: () => 150,
    queues: [
      { name: "imports", limit: () => 8, arrivals: (_b, r) => poisson(14, r), waitBaseMs: 80 },
      ...["checkout", "notifications", "emails"].map((name, i) => ({
        name,
        limit: () => 15,
        arrivals: (_b: number, r: Rng) => poisson(7 + i, r),
        waitBaseMs: 60,
      })),
      ...Array.from({ length: 12 }, (_v, i) =>
        bursty(`service-${String(i + 1).padStart(2, "0")}`, 10, 2)
      ),
      ...Array.from({ length: 20 }, (_v, i) => ({
        name: `job-${String(i + 1).padStart(2, "0")}`,
        limit: () => 5,
        arrivals: (_b: number, r: Rng) => poisson(1, r),
        waitBaseMs: 40,
      })),
      ...Array.from({ length: 25 }, (_v, i) => ({
        name: `tenant-${String(i + 1).padStart(2, "0")}`,
        limit: () => 3,
        arrivals: (_b: number, r: Rng) => (r() < 0.05 ? poisson(2, r) : 0),
        waitBaseMs: 30,
        sparse: true,
      })),
    ],
  }),

  // Per-tenant concurrency keys: a hog tenant periodically floods the queue and starves
  // the others, so the CK charts (keys with backlog, most-starved wait) and the live
  // per-key table on the queue detail page have something to show. Use with --usage.
  "tenant-hotspot": () => ({
    description:
      "CK queue where a hog tenant starves others: CK charts + live key table (use --usage)",
    envLimit: () => 40,
    queues: [
      {
        name: "per-tenant",
        limit: () => 10,
        arrivals: (b, r) => poisson(b % 60 < 20 ? 25 : 8, r),
        waitBaseMs: 60,
        ck: {
          backlogged: (b, r) => (b % 60 < 20 ? 6 + Math.round(r() * 6) : Math.round(r() * 3)),
          maxWaitMs: (b, r) =>
            b % 60 < 20
              ? Math.round(lognormal(90_000, 0.5, r))
              : Math.round(lognormal(3_000, 0.6, r)),
        },
      },
      { name: "background", limit: () => 10, arrivals: (_b, r) => poisson(5, r), waitBaseMs: 40 },
    ],
  }),

  // Default: one env with a variety of queue behaviours + occasional env saturation.
  mixed: (totalBuckets) => ({
    description: "variety of queue profiles in one env, with occasional env saturation",
    envLimit: (b) => (b % 40 < 12 ? 45 : 70), // dips low periodically to flip env saturation
    queues: [
      { name: "emails", limit: () => 20, arrivals: (_b, r) => poisson(12, r), waitBaseMs: 40 },
      bursty("webhooks", 20, 6),
      { name: "reports", limit: () => 10, arrivals: (_b, r) => poisson(8, r), waitBaseMs: 80 },
      {
        name: "cleanup",
        limit: () => 5,
        arrivals: (_b, r) => (r() < 0.12 ? poisson(3, r) : 0),
        waitBaseMs: 30,
        sparse: true,
      },
      {
        name: "nightly-batch",
        limit: () => 15,
        arrivals: (b, r) => (b < totalBuckets / 5 ? poisson(18, r) : 0),
        waitBaseMs: 70,
      },
    ],
  }),
};

// ---------------------------------------------------------------------------
// Simulation
// ---------------------------------------------------------------------------

type Ids = { organization_id: string; project_id: string; environment_id: string };
const WAIT_SIGMA = 0.6;
const NACK_RATE = 0.02;
const DLQ_RATE = 0.004;

type CounterOp = "enqueue" | "started" | "ack" | "nack" | "dlq";
// Per-(queue, op) odometers, mirroring the production emitter: cumulative readings with a
// cum=0 baseline on the first one, so deltaSumTimestamp captures the 0->1 delta.
type CounterState = Record<CounterOp, number>[];

/**
 * Advances the `op` odometer of queue `q` by one and returns the raw rows recording it.
 *
 * The first event for a given (queue, op) also emits a `cumulative: 0` baseline row
 * ahead of the reading, so one call yields either one or two rows.
 *
 * @param counters - Odometers indexed by queue; mutated in place.
 * @param q - Index of the queue within `counters`.
 * @param orderKey - Called once per emitted row to obtain its `order_key`.
 * @param wait_ms - When given, attached to the reading row only (never the baseline).
 */
function counterRows(
  counters: CounterState,
  q: number,
  ids: Ids,
  queueName: string,
  eventTime: string,
  orderKey: () => number,
  op: CounterOp,
  wait_ms?: number
): QueueMetricsRawV1Input[] {
  const rows: QueueMetricsRawV1Input[] = [];
  if (counters[q][op] === 0) {
    rows.push({
      ...ids,
      queue_name: queueName,
      event_time: eventTime,
      op,
      cumulative: 0,
      order_key: orderKey(),
    });
  }
  counters[q][op] += 1;
  rows.push({
    ...ids,
    queue_name: queueName,
    event_time: eventTime,
    op,
    cumulative: counters[q][op],
    order_key: orderKey(),
    ...(wait_ms !== undefined ? { wait_ms } : {}),
  });
  return rows;
}

/** Creates `n` zeroed odometer sets, one per simulated queue. */
function newCounterState(n: number): CounterState {
  return Array.from({ length: n }, () => ({ enqueue: 0, started: 0, ack: 0, nack: 0, dlq: 0 }));
}

// Per-key simulation for CK profiles: 12 tenants (tenant-01 is the hog, matching
// stageRedisUsage), per-tenant backlog drained round-robin, per-tenant odometers.
const CK_TENANT_COUNT = 12;

/** Per-queue concurrency-key state: backlog per tenant index plus per-tenant odometers. */
type CkSimState = { backlog: number[]; counters: Map<number, Record<CounterOp, number>> };

/** CK simulation state keyed by queue index; module-level so it persists between buckets. */
const ckSim = new Map<number, CkSimState>();

/** Maps a zero-based tenant index to its key name: 0 -> "tenant-01", 11 -> "tenant-12". */
function ckTenantName(t: number): string {
  return `tenant-${String(t + 1).padStart(2, "0")}`;
}

/**
 * Advances one tenant's `op` odometer by one and returns the raw rows recording it,
 * each tagged with the tenant's concurrency key.
 *
 * The tenant's odometer set is created on first use. The first event for a given
 * (tenant, op) also emits a `cumulative: 0` baseline row ahead of the reading, so one
 * call yields either one or two rows.
 *
 * @param state - CK state of the queue; its `counters` map is mutated in place.
 * @param tenant - Zero-based tenant index.
 * @param orderKey - Called once per emitted row to obtain its `order_key`.
 * @param wait_ms - When given, attached to the reading row only (never the baseline).
 */
function ckCounterRows(
  state: CkSimState,
  tenant: number,
  ids: Ids,
  queueName: string,
  eventTime: string,
  orderKey: () => number,
  op: CounterOp,
  wait_ms?: number
): QueueMetricsRawV1Input[] {
  let c = state.counters.get(tenant);
  if (!c) {
    c = { enqueue: 0, started: 0, ack: 0, nack: 0, dlq: 0 };
    state.counters.set(tenant, c);
  }
  const common = {
    ...ids,
    queue_name: queueName,
    concurrency_key: ckTenantName(tenant),
    event_time: eventTime,
  };
  const rows: QueueMetricsRawV1Input[] = [];
  if (c[op] === 0) rows.push({ ...common, op, cumulative: 0, order_key: orderKey() });
  c[op] += 1;
  rows.push({
    ...common,
    op,
    cumulative: c[op],
    order_key: orderKey(),
    ...(wait_ms !== undefined ? { wait_ms } : {}),
  });
  return rows;
}

// Advance one bucket of the simulation for every queue, returning the raw rows to insert.
// `backlog` and `counters` are mutated in place so state carries across buckets (and into
// live mode).
/**
 * Simulates one bucket for every queue in the scenario and returns the raw rows for it.
 *
 * Per queue it adds this bucket's arrivals (capped at 500) to the carried backlog, grants
 * concurrency up to the queue limit, scales every grant down proportionally when the
 * total exceeds the environment limit, then emits one gauge row plus odometer rows for
 * each enqueue / start / completion. Concurrency-key profiles additionally emit
 * per-tenant odometers and per-tenant gauges.
 *
 * @param bucket - Bucket index passed to the scenario's limit/arrival functions.
 * @param bucketSec - Bucket length in seconds; scales the simulated wait times.
 * @param eventTime - Timestamp string stamped on every row of this bucket.
 * @param bucketEpochSec - Bucket start in epoch seconds; base of the row order keys.
 * @param backlog - Unserved runs per queue; mutated in place so it carries forward.
 * @param counters - Per-queue odometers; mutated in place.
 * @param rng - Random source; the order of draws is part of the reproducible output.
 */
function simulateBucket(
  scenario: Scenario,
  bucket: number,
  bucketSec: number,
  eventTime: string,
  bucketEpochSec: number,
  ids: Ids,
  backlog: number[],
  counters: CounterState,
  rng: Rng
): QueueMetricsRawV1Input[] {
  type QueuePlan = {
    limit: number;
    arrivals: number;
    prior: number; // backlog carried from earlier buckets, before this bucket's arrivals
    want: number; // concurrency the queue would use if the env had no cap
    running: number;
    queued: number;
  };

  const envLimit = scenario.envLimit(bucket);

  const plans: QueuePlan[] = [];
  for (const [q, profile] of scenario.queues.entries()) {
    const limit = profile.limit(bucket);
    const arrivals = Math.min(500, profile.arrivals(bucket, rng));
    const prior = backlog[q];
    backlog[q] = prior + arrivals; // arrivals join the backlog; recorded as enqueues below
    plans.push({
      limit,
      arrivals,
      prior,
      want: Math.min(limit, backlog[q]),
      running: 0,
      queued: 0,
    });
  }

  // Env cap: if the queues collectively want more concurrency than the env allows, scale down.
  const sumWant = plans.reduce((sum, plan) => sum + plan.want, 0);
  const scale = sumWant > envLimit && sumWant > 0 ? envLimit / sumWant : 1;

  let envRunning = 0;
  let envQueued = 0;
  for (const [q, plan] of plans.entries()) {
    plan.running = Math.floor(plan.want * scale);
    plan.queued = backlog[q] - plan.running;
    envRunning += plan.running;
    envQueued += plan.queued;
  }

  // Order keys are time-based (like the production stream ids) so appended runs and live
  // mode stay monotonic; the per-bucket sequence keeps them unique within a bucket.
  let bucketSeq = 0;
  const orderKey = () => bucketEpochSec * 1_000_000 + bucketSeq++;

  const rows: QueueMetricsRawV1Input[] = [];
  for (const [q, profile] of scenario.queues.entries()) {
    const { limit, arrivals, prior, running: started, queued } = plans[q];
    backlog[q] = queued; // carry the unserved remainder forward

    if (profile.sparse && arrivals === 0 && started === 0 && prior === 0) {
      continue; // fully idle: leave a gap so carry-forward is exercised
    }

    // CK-health fields stay coherent with the depth: no queued runs means no backlogged keys.
    const ckBacklogged = profile.ck
      ? queued > 0
        ? Math.max(1, Math.min(profile.ck.backlogged(bucket, rng), queued))
        : 0
      : undefined;
    const ckMaxWaitMs =
      profile.ck && ckBacklogged ? Math.round(profile.ck.maxWaitMs(bucket, rng)) : undefined;

    const gauge: QueueMetricsRawV1Input = {
      ...ids,
      queue_name: profile.name,
      event_time: eventTime,
      op: "gauge",
      running: started,
      queued,
      queue_limit: limit,
      env_running: envRunning,
      env_queued: envQueued,
      env_limit: envLimit,
      throttled: queued > 0 && (started >= limit || scale < 1) ? 1 : 0,
      ...(ckBacklogged !== undefined
        ? { ck_backlogged: ckBacklogged, ck_max_wait_ms: ckMaxWaitMs ?? 0 }
        : {}),
    };
    rows.push(gauge);

    for (let a = 0; a < arrivals; a++) {
      rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, "enqueue"));
    }

    // Per-key rows for CK profiles: assign arrivals hog-weighted, drain round-robin
    // (fair share), then emit per-tenant odometers + a per-key gauge per active tenant.
    if (profile.ck) {
      let ckq = ckSim.get(q);
      if (!ckq) {
        ckq = { backlog: new Array<number>(CK_TENANT_COUNT).fill(0), counters: new Map() };
        ckSim.set(q, ckq);
      }
      const hogShare = bucket % 60 < 20 ? 0.6 : 0.15;
      const arrivalsPerTenant = new Array<number>(CK_TENANT_COUNT).fill(0);
      for (let a = 0; a < arrivals; a++) {
        // Tenant 0 is the hog; everyone else is picked uniformly from the remainder.
        const t = rng() < hogShare ? 0 : 1 + Math.floor(rng() * (CK_TENANT_COUNT - 1));
        arrivalsPerTenant[t]++;
        ckq.backlog[t]++;
      }
      const drainedPerTenant = new Array<number>(CK_TENANT_COUNT).fill(0);
      let remaining = started;
      while (remaining > 0 && ckq.backlog.some((v) => v > 0)) {
        for (let t = 0; t < CK_TENANT_COUNT && remaining > 0; t++) {
          if (ckq.backlog[t] > 0) {
            ckq.backlog[t]--;
            drainedPerTenant[t]++;
            remaining--;
          }
        }
      }
      // Same for every tenant of this queue, so computed once rather than per tenant.
      const fairShare = Math.max(1, limit / CK_TENANT_COUNT);
      for (let t = 0; t < CK_TENANT_COUNT; t++) {
        const ckMedianWait = profile.waitBaseMs + (ckq.backlog[t] / fairShare) * bucketSec * 1000;
        for (let a = 0; a < arrivalsPerTenant[t]; a++) {
          rows.push(...ckCounterRows(ckq, t, ids, profile.name, eventTime, orderKey, "enqueue"));
        }
        for (let d = 0; d < drainedPerTenant[t]; d++) {
          rows.push(
            ...ckCounterRows(
              ckq,
              t,
              ids,
              profile.name,
              eventTime,
              orderKey,
              "started",
              Math.round(lognormal(ckMedianWait, WAIT_SIGMA, rng))
            )
          );
          rows.push(...ckCounterRows(ckq, t, ids, profile.name, eventTime, orderKey, "ack"));
        }
        if (ckq.backlog[t] > 0 || drainedPerTenant[t] > 0) {
          rows.push({
            ...ids,
            queue_name: profile.name,
            concurrency_key: ckTenantName(t),
            event_time: eventTime,
            op: "gauge",
            queued: ckq.backlog[t],
            running: drainedPerTenant[t],
          });
        }
      }
    }

    // `prior` is the depth a starting run actually queued behind.
    const medianWait = profile.waitBaseMs + (prior / Math.max(limit, 1)) * bucketSec * 1000;
    for (let s = 0; s < started; s++) {
      rows.push(
        ...counterRows(
          counters,
          q,
          ids,
          profile.name,
          eventTime,
          orderKey,
          "started",
          Math.round(lognormal(medianWait, WAIT_SIGMA, rng))
        )
      );
      const roll = rng();
      const op: CounterOp = roll < DLQ_RATE ? "dlq" : roll < DLQ_RATE + NACK_RATE ? "nack" : "ack";
      rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, op));
    }
  }
  return rows;
}

// ---------------------------------------------------------------------------
// ClickHouse
// ---------------------------------------------------------------------------

/** Hostnames the simulator is allowed to write to (and delete from). */
const LOCAL_CLICKHOUSE_HOSTS = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]);

/**
 * Reports whether a URL hostname is on the local allowlist.
 *
 * `URL.hostname` serialises IPv6 literals with surrounding brackets ("[::1]"), so the
 * brackets are removed before the lookup; without that the "::1" entry can never match.
 */
function isLocalHostname(hostname: string): boolean {
  const bare =
    hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;
  return LOCAL_CLICKHOUSE_HOSTS.has(bare.toLowerCase());
}

/**
 * Builds the ClickHouse client from `CLICKHOUSE_URL` (falling back to
 * `EVENTS_CLICKHOUSE_URL`), exiting the process with code 1 when the variable is
 * missing, unparsable, or points at a non-local host.
 */
function clickhouse(): ClickHouse {
  const clickhouseUrl = process.env.CLICKHOUSE_URL ?? process.env.EVENTS_CLICKHOUSE_URL;
  if (!clickhouseUrl) {
    console.error("CLICKHOUSE_URL not set");
    process.exit(1);
  }
  let url: URL;
  try {
    url = new URL(clickhouseUrl);
  } catch {
    // Deliberately generic: the thrown error carries the raw URL, which holds creds.
    console.error("CLICKHOUSE_URL is not a valid URL");
    process.exit(1);
  }
  // Allowlist local hosts only (this script deletes rows), and never echo the URL (it carries creds).
  if (!isLocalHostname(url.hostname)) {
    console.error(`Refusing to run against a non-local ClickHouse host: ${url.hostname}`);
    process.exit(1);
  }
  url.searchParams.delete("secure");
  return new ClickHouse({ url: url.toString(), name: "queue-metrics-simulator" });
}

/**
 * Inserts rows in slices of 25,000, tagging each slice with the deduplication token
 * `<nonce>:<slice offset>`. Logs the error message and exits with code 1 on the first
 * failed insert.
 */
async function insertBatched(
  ch: ClickHouse,
  rows: QueueMetricsRawV1Input[],
  nonce: string
): Promise<void> {
  const BATCH = 25_000;
  for (let i = 0; i < rows.length; i += BATCH) {
    const slice = rows.slice(i, i + BATCH);
    const [error] = await ch.queueMetrics.insertRaw(slice, {
      params: { clickhouse_settings: { insert_deduplication_token: `${nonce}:${i}` } },
    });
    if (error) {
      console.error("insert failed:", error.message);
      process.exit(1);
    }
  }
}

/**
 * Deletes one environment's rows from the raw table and each queue-metrics table
 * listed below, one `DELETE` statement per table.
 */
async function resetEnv(ch: ClickHouse, environmentId: string): Promise<void> {
  // The cast reaches the writer's underlying `client` to issue raw commands.
  // NOTE(review): relies on `ch.writer` exposing a `client.command` method — confirm
  // against @internal/clickhouse if that package changes.
  const raw = (
    ch.writer as unknown as {
      client: { command: (a: { query: string }) => Promise<unknown> };
    }
  ).client;
  for (const table of [
    "queue_metrics_raw_v1",
    "queue_metrics_v1",
    "queue_metrics_5m_v1",
    "env_metrics_v1",
    "queue_metrics_ck_v1",
  ]) {
    await raw.command({
      query: `DELETE FROM trigger_dev.${table} WHERE environment_id = '${environmentId}'`,
    });
  }
  console.log(`Reset queue metrics for environment ${environmentId}`);
}

//
Fake running counts in the run-queue Redis (Running column + allocation usage bars). +// Reconciled every run: staged with --usage, cleared otherwise. +async function stageRedisUsage(scenario: Scenario, ids: Ids, seed: number, clear: boolean) { + const host = process.env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? process.env.REDIS_HOST ?? "localhost"; + const port = Number( + process.env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? process.env.REDIS_PORT ?? 6379 + ); + const localHosts = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]); + if (!localHosts.has(host)) { + console.warn(`Skipping Redis usage staging on a non-local host: ${host}`); + return; + } + try { + const { createRedisClient } = await import("@internal/redis"); + const redis = createRedisClient({ host, port }); + const rng = mulberry32(seed + 1); + const prefix = "engine:runqueue:"; + const logicalBase = `{org:${ids.organization_id}}:proj:${ids.project_id}:env:${ids.environment_id}:queue:`; + const base = `${prefix}${logicalBase}`; + for (const [q, profile] of scenario.queues.entries()) { + const key = `${base}${profile.name}:currentDequeued`; + await redis.del(key); + + // CK staging (ckIndex + per-key subqueues) feeds the live per-key table on the queue + // detail page. Members are stored unprefixed, exactly like the run-queue Lua does. + const ckIndexKey = `${base}${profile.name}:ckIndex`; + const lengthCounterKey = `${base}${profile.name}:lengthCounter`; + const staleMembers = await redis.zrange(ckIndexKey, 0, -1); + for (const member of staleMembers) { + await redis.del(`${prefix}${member}`, `${prefix}${member}:currentConcurrency`); + } + await redis.del(ckIndexKey, lengthCounterKey); + + if (clear) continue; + const limit = profile.limit(0); + // First queue rides at/over its limit, the rest at 30-90%, sparse mostly idle. + const count = profile.sparse + ? rng() < 0.3 + ? 1 + : 0 + : q === 0 + ? 
limit + Math.round(rng() * 2) + : Math.round(limit * (0.3 + 0.6 * rng())); + if (count > 0) { + await redis.sadd(key, ...Array.from({ length: count }, (_v, i) => `sim_run_${i}`)); + } + + if (profile.ck) { + const now = Date.now(); + const tenants = 12; + let totalCkQueued = 0; + for (let t = 1; t <= tenants; t++) { + const tenant = `tenant-${String(t).padStart(2, "0")}`; + const member = `${logicalBase}${profile.name}:ck:${tenant}`; + const hog = t === 1; + const queuedCount = hog ? 40 : 1 + Math.round(rng() * 5); + const runningCount = hog ? limit : Math.round(rng() * 2); + const oldestAgeMs = hog ? 15 * 60_000 : 5_000 + Math.round(rng() * 55_000); + const zargs: Array = []; + for (let i = 0; i < queuedCount; i++) { + zargs.push(now - oldestAgeMs + i * 250, `sim_${tenant}_run_${i}`); + } + await redis.zadd(`${prefix}${member}`, ...zargs); + if (runningCount > 0) { + await redis.sadd( + `${prefix}${member}:currentConcurrency`, + ...Array.from({ length: runningCount }, (_v, i) => `sim_${tenant}_running_${i}`) + ); + } + await redis.zadd(ckIndexKey, now - oldestAgeMs, member); + totalCkQueued += queuedCount; + } + // The aggregate "Queued now" reads ZCARD(base) + this counter; keep them coherent. + await redis.set(lengthCounterKey, totalCkQueued, "EX", 24 * 3600); + } + } + await redis.quit(); + console.log( + clear + ? "Cleared staged Redis usage." + : "Staged fake running counts in Redis (Running column + allocation usage bars)." + ); + } catch (error) { + console.warn("Redis usage staging skipped:", error instanceof Error ? 
error.message : error); + } +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +// Make the synthetic project a V2 engine project with a current dev worker + a Postgres +// TaskQueue per simulated queue, so the /queues list renders the V2 table (it pages from +// Postgres and gates on engine version; ClickHouse only holds the metrics). +async function ensureTaskQueues( + scenario: Scenario, + projectId: string, + runtimeEnvironmentId: string +) { + await prisma.project.update({ where: { id: projectId }, data: { engine: "V2" } }); + + await prisma.backgroundWorker.upsert({ + where: { + projectId_runtimeEnvironmentId_version: { + projectId, + runtimeEnvironmentId, + version: "queue-metrics-sim", + }, + }, + update: {}, + create: { + friendlyId: generateFriendlyId("worker"), + engine: "V2", + contentHash: "queue-metrics-sim", + sdkVersion: "4.0.0", + cliVersion: "4.0.0", + projectId, + runtimeEnvironmentId, + version: "queue-metrics-sim", + metadata: {}, + }, + }); + + for (const profile of scenario.queues) { + const concurrencyLimit = profile.limit(0); + await prisma.taskQueue.upsert({ + where: { runtimeEnvironmentId_name: { runtimeEnvironmentId, name: profile.name } }, + create: { + friendlyId: generateFriendlyId("queue"), + version: "V2", + name: profile.name, + orderableName: profile.name, + concurrencyLimit, + runtimeEnvironmentId, + projectId, + type: "NAMED", + }, + update: { concurrencyLimit }, + }); + } + + // Drop queues left over from a previously seeded scenario so switching scenarios + // does not leave metric-less rows in the list. + const { count: pruned } = await prisma.taskQueue.deleteMany({ + where: { + runtimeEnvironmentId, + name: { notIn: scenario.queues.map((q) => q.name) }, + }, + }); + console.log( + `Ensured ${scenario.queues.length} task queues in Postgres${pruned > 0 ? 
`, pruned ${pruned} stale` : ""}.` + ); +} + +function printHelp() { + const lines = Object.entries(scenarios).map( + ([name, build]) => ` ${name.padEnd(28)}${build(720, 10).description}` + ); + console.log(`Queue metrics simulator: seeds a synthetic tenant with realistic queue metrics. + +Usage: pnpm --filter webapp run db:seed:queue-metrics -- [flags] + +Flags: + --scenario which scenario to seed (default: mixed) + --project project to seed into (default: ${PROJECT_NAME}); use one + project per scenario to browse them side by side + --window how much history to backfill, e.g. 30m, 6h, 1d (default: 2h) + --bucket seconds per simulated bucket (default: 10) + --seed RNG seed for reproducible data (default: 1) + --usage stage fake running counts in Redis so the Running column and + the Allocation tab's usage bars have data (cleared when omitted) + --live after backfilling, keep appending one bucket per interval + --reset clear this environment's metrics before seeding + --reset-only clear and exit without seeding + --help this text + +Scenarios: +${lines.join("\n")} + +Example designer setup (one project per scenario): + pnpm --filter webapp run db:seed:queue-metrics -- --scenario mixed --reset + pnpm --filter webapp run db:seed:queue-metrics -- --scenario many-queues --project qm-many-queues --reset + pnpm --filter webapp run db:seed:queue-metrics -- --scenario throttled-backlog --project qm-throttled --reset + pnpm --filter webapp run db:seed:queue-metrics -- --scenario tenant-hotspot --project qm-tenants --usage --reset`); +} + +async function main() { + const flags = parseArgs(process.argv.slice(2)); + if (flags.help === "true") { + printHelp(); + process.exit(0); + } + const scenarioName = flags.scenario ?? "mixed"; + const build = scenarios[scenarioName]; + if (!build) { + console.error( + `Unknown scenario "${scenarioName}". Options: ${Object.keys(scenarios).join(", ")}` + ); + process.exit(1); + } + const bucketSec = Number(flags.bucket ?? 
10); + if (!Number.isFinite(bucketSec) || bucketSec <= 0) { + console.error(`--bucket must be a positive number of seconds, got: ${flags.bucket}`); + process.exit(1); + } + const windowSec = parseDuration(flags.window ?? "2h"); + const totalBuckets = Math.floor(windowSec / bucketSec); + if (!Number.isFinite(totalBuckets) || totalBuckets <= 0) { + console.error( + `--window must be longer than --bucket (got ${windowSec}s window, ${bucketSec}s bucket)` + ); + process.exit(1); + } + const seed = Number(flags.seed ?? 1); + const live = flags.live === "true"; + + const user = await prisma.user.findUnique({ where: { email: "local@trigger.dev" } }); + if (!user) { + console.error("User local@trigger.dev not found. Run `pnpm run db:seed` first."); + process.exit(1); + } + + let org = await prisma.organization.findFirst({ + where: { title: ORG_TITLE, members: { some: { userId: user.id } } }, + }); + if (!org) + org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" }); + + const projectName = flags.project ?? 
PROJECT_NAME; + let project = await prisma.project.findFirst({ + where: { name: projectName, organizationId: org.id }, + }); + if (!project) { + project = await createProject({ + organizationSlug: org.slug, + name: projectName, + userId: user.id, + version: "v3", + }); + } + + const runtimeEnv = await prisma.runtimeEnvironment.findFirst({ + where: { projectId: project.id, type: "DEVELOPMENT" }, + }); + if (!runtimeEnv) { + console.error("No DEVELOPMENT environment found for project."); + process.exit(1); + } + + const ids: Ids = { + organization_id: org.id, + project_id: project.id, + environment_id: runtimeEnv.id, + }; + const ch = clickhouse(); + const nonce = `qmsim-${Date.now()}-${seed}`; + + if (flags.reset === "true" || flags["reset-only"] === "true") { + await resetEnv(ch, runtimeEnv.id); + if (flags["reset-only"] === "true") { + await ch.close(); + process.exit(0); + } + } + + const scenario = build(totalBuckets, bucketSec); + await ensureTaskQueues(scenario, project.id, runtimeEnv.id); + await stageRedisUsage(scenario, ids, seed, flags.usage !== "true"); + const rng = mulberry32(seed); + const backlog = new Array(scenario.queues.length).fill(0); + + console.log(`Scenario "${scenarioName}": ${scenario.description}`); + console.log( + `Backfilling ${totalBuckets} x ${bucketSec}s buckets (${flags.window ?? "2h"}) for ${scenario.queues.length} queues...` + ); + + // Backfill: buckets from (now - window) up to now, aligned to the bucket grid. 
+ const nowBucket = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; + const startBucket = nowBucket - totalBuckets * bucketSec; + const counters = newCounterState(scenario.queues.length); + const rows: QueueMetricsRawV1Input[] = []; + for (let b = 0; b < totalBuckets; b++) { + const bucketEpochSec = startBucket + b * bucketSec; + const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000)); + rows.push( + ...simulateBucket( + scenario, + b, + bucketSec, + eventTime, + bucketEpochSec, + ids, + backlog, + counters, + rng + ) + ); + } + await insertBatched(ch, rows, nonce); + console.log(`Inserted ${rows.length} raw rows.`); + + // Merge the AggregatingMergeTree partials so argMax "current value" widgets read cleanly. + // The real pipeline relies on background merges; the simulator forces it for a tidy demo. + const raw = ( + ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } } + ).client; + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_v1 FINAL` }); + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_5m_v1 FINAL` }); + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.env_metrics_v1 FINAL` }); + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_ck_v1 FINAL` }); + + const origin = process.env.APP_ORIGIN ?? 
"http://localhost:3030"; + console.log( + `\nQueues dashboard: ${origin}/orgs/${org.slug}/projects/${project.slug}/env/dev/dashboards/queues` + ); + + if (live) { + console.log(`\nLive mode: appending one bucket every ${bucketSec}s (Ctrl-C to stop)...`); + let b = totalBuckets; + // eslint-disable-next-line no-constant-condition + while (true) { + await new Promise((r) => setTimeout(r, bucketSec * 1000)); + const bucketEpochSec = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; + const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000)); + const liveRows = simulateBucket( + scenario, + b, + bucketSec, + eventTime, + bucketEpochSec, + ids, + backlog, + counters, + rng + ); + await insertBatched(ch, liveRows, `${nonce}:live:${b}`); + console.log(`bucket ${b}: ${liveRows.length} rows @ ${eventTime}`); + b++; + } + } + + await ch.close(); + process.exit(0); +} + +main().catch((e) => { + console.error(e); + process.exit(1); +}); diff --git a/apps/webapp/test/queueMetricsMapping.test.ts b/apps/webapp/test/queueMetricsMapping.test.ts new file mode 100644 index 00000000000..61e3893c7fb --- /dev/null +++ b/apps/webapp/test/queueMetricsMapping.test.ts @@ -0,0 +1,239 @@ +import { describe, expect, it } from "vitest"; +import { + descriptorFromQueue, + mapEntryToRows, + OVERFLOW_QUEUE_NAME, + QueueNameLimiter, +} from "~/v3/queueMetricsMapping"; + +describe("descriptorFromQueue", () => { + it("parses a plain descriptor", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/my-task")).toEqual({ + organization_id: "o1", + project_id: "p1", + environment_id: "e1", + queue_name: "task/my-task", + concurrency_key: "", + }); + }); + + it("captures a concurrency-key suffix", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/t:ck:tenant-3")).toEqual( + expect.objectContaining({ queue_name: "task/t", concurrency_key: "tenant-3" }) + ); + }); + + it("maps the ck wildcard to no key", () => { + 
expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/t:ck:*")).toEqual( + expect.objectContaining({ queue_name: "task/t", concurrency_key: "" }) + ); + }); + + it("keeps colons inside the queue name", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:my:odd:queue")).toEqual( + expect.objectContaining({ queue_name: "my:odd:queue", concurrency_key: "" }) + ); + }); + + it("keeps colons in the name while capturing a real ck suffix", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:a:b:ck:t9")).toEqual( + expect.objectContaining({ queue_name: "a:b", concurrency_key: "t9" }) + ); + }); + + it("rejects malformed descriptors", () => { + expect(descriptorFromQueue("not-a-descriptor")).toBeNull(); + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1")).toBeNull(); + expect(descriptorFromQueue("")).toBeNull(); + }); +}); + +describe("QueueNameLimiter", () => { + it("passes names through under the cap and overflows past it, per scope", () => { + const limiter = new QueueNameLimiter(2); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env1", "b")).toBe("b"); + expect(limiter.limit("env1", "c")).toBe(OVERFLOW_QUEUE_NAME); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env2", "c")).toBe("c"); + }); + + it("is unlimited when the cap is 0", () => { + const limiter = new QueueNameLimiter(0); + for (let i = 0; i < 100; i++) { + expect(limiter.limit("env1", `q${i}`)).toBe(`q${i}`); + } + }); + + it("evicts the oldest scope when the scope map is full", () => { + const limiter = new QueueNameLimiter(1, 2); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env2", "a")).toBe("a"); + expect(limiter.limit("env3", "a")).toBe("a"); + expect(limiter.limit("env1", "b")).toBe("b"); + }); +}); + +describe("mapEntryToRows", () => { + const q = "{org:o1}:proj:p1:env:e1:queue:task/t"; + + it("maps a gauge entry with numeric fields", () => { + const rows = mapEntryToRows({ + id: 
"1700000000000-0", + fields: { + op: "gauge", + q, + ql: "5", + cc: "2", + lim: "10", + eql: "7", + ec: "3", + elim: "20", + thr: "1", + }, + }); + expect(rows).toHaveLength(1); + expect(rows[0]).toEqual( + expect.objectContaining({ + op: "gauge", + organization_id: "o1", + queue_name: "task/t", + concurrency_key: "", + queued: 5, + running: 2, + queue_limit: 10, + env_queued: 7, + env_running: 3, + env_limit: 20, + throttled: 1, + }) + ); + expect(rows[0]!.event_time).toBe("2023-11-14 22:13:20"); + expect(rows[0]!.ck_backlogged).toBeUndefined(); + expect(rows[0]!.ck_max_wait_ms).toBeUndefined(); + }); + + it("keeps the key on per-subqueue gauges and maps the CK-health tail", () => { + const rows = mapEntryToRows({ + id: "1700000000000-0", + fields: { op: "gauge", q: `${q}:ck:tenant-1`, ql: "4", ckq: "3", ckw: "2500" }, + }); + expect(rows).toHaveLength(1); + expect(rows[0]).toEqual( + expect.objectContaining({ + op: "gauge", + queue_name: "task/t", + concurrency_key: "tenant-1", + queued: 4, + ck_backlogged: 3, + ck_max_wait_ms: 2500, + }) + ); + }); + + it("maps started with wait_ms + cumulative and drops unknown ops", () => { + const started = mapEntryToRows({ + id: "1700000000000-0", + fields: { op: "started", q, wait: "48", cum: "512" }, + }); + expect(started).toHaveLength(1); + expect(started[0]).toEqual( + expect.objectContaining({ + op: "started", + wait_ms: 48, + cumulative: 512, + order_key: (1700000000000n * 1000000n).toString(), + }) + ); + expect(mapEntryToRows({ id: "1-0", fields: { op: "ack", q, cum: "9" } })[0]).toEqual( + expect.objectContaining({ op: "ack", cumulative: 9 }) + ); + expect(mapEntryToRows({ id: "1-0", fields: { op: "bogus", q } })).toEqual([]); + expect(mapEntryToRows({ id: "1-0", fields: { op: "ack" } })).toEqual([]); + }); + + it("expands a dual-odometer counter entry into base + per-key rows", () => { + const rows = mapEntryToRows({ + id: "1700000000000-3", + fields: { op: "started", q, ck: "tenant-9", wait: "80", cum: "41", 
ckcum: "7" }, + }); + expect(rows).toHaveLength(2); + expect(rows[0]).toEqual( + expect.objectContaining({ queue_name: "task/t", cumulative: 41, wait_ms: 80 }) + ); + expect(rows[0]!.concurrency_key).toBeUndefined(); + expect(rows[1]).toEqual( + expect.objectContaining({ + queue_name: "task/t", + concurrency_key: "tenant-9", + cumulative: 7, + wait_ms: 80, + }) + ); + expect(rows[0]!.order_key).toBe(rows[1]!.order_key); + + // Baseline entries carry exactly one odometer each. + const baseBaseline = mapEntryToRows({ id: "1-0", fields: { op: "started", q, cum: "0" } }); + expect(baseBaseline).toHaveLength(1); + expect(baseBaseline[0]!.concurrency_key).toBeUndefined(); + const ckBaseline = mapEntryToRows({ + id: "1-1", + fields: { op: "started", q, ck: "tenant-9", ckcum: "0" }, + }); + expect(ckBaseline).toHaveLength(1); + expect(ckBaseline[0]).toEqual( + expect.objectContaining({ concurrency_key: "tenant-9", cumulative: 0 }) + ); + }); + + it("applies the queue-name limiter: gauges overflow, counters drop", () => { + const limiters = { queueNames: new QueueNameLimiter(1) }; + const first = mapEntryToRows({ id: "1-0", fields: { op: "ack", q, cum: "1" } }, limiters); + expect(first[0]!.queue_name).toBe("task/t"); + + // Overflowed gauges keep flowing under the shared name (max stays meaningful), + // with per-key attribution stripped. + const overflowGauge = mapEntryToRows( + { + id: "1-1", + fields: { op: "gauge", q: "{org:o1}:proj:p1:env:e1:queue:task/other:ck:t1", ql: "3" }, + }, + limiters + ); + expect(overflowGauge[0]!.queue_name).toBe(OVERFLOW_QUEUE_NAME); + expect(overflowGauge[0]!.concurrency_key).toBe(""); + + // Overflowed counters are dropped: merging distinct odometers under one key + // produces garbage deltas. 
+ const overflowCounter = mapEntryToRows( + { id: "1-2", fields: { op: "ack", q: "{org:o1}:proj:p1:env:e1:queue:task/other", cum: "4" } }, + limiters + ); + expect(overflowCounter).toEqual([]); + }); + + it("applies the concurrency-key limiter: overflow drops the per-key row, keeps base", () => { + const limiters = { concurrencyKeys: new QueueNameLimiter(1) }; + const first = mapEntryToRows( + { id: "1-0", fields: { op: "ack", q, ck: "t1", cum: "5", ckcum: "2" } }, + limiters + ); + expect(first).toHaveLength(2); + + const overflowed = mapEntryToRows( + { id: "1-1", fields: { op: "ack", q, ck: "t2", cum: "6", ckcum: "1" } }, + limiters + ); + expect(overflowed).toHaveLength(1); + expect(overflowed[0]!.cumulative).toBe(6); + expect(overflowed[0]!.concurrency_key).toBeUndefined(); + + // Gauge for an overflowed key keeps the row but loses the attribution. + const overflowGauge = mapEntryToRows( + { id: "1-2", fields: { op: "gauge", q: `${q}:ck:t3`, ql: "2" } }, + limiters + ); + expect(overflowGauge).toHaveLength(1); + expect(overflowGauge[0]!.concurrency_key).toBe(""); + }); +}); diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql new file mode 100644 index 00000000000..8ea1e65f09f --- /dev/null +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -0,0 +1,267 @@ +-- +goose Up + +-- Queue metrics: raw landing table -> MV -> aggregated read target (mirrors +-- llm_model_aggregates_v1, migration 027). Raw rows feed an MV on insert, and +-- reads hit the aggregated table. + +-- Short-TTL raw landing, one row per stream entry. non_replicated_deduplication_window +-- makes consumer replays idempotent via insert_deduplication_token. 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + concurrency_key String DEFAULT '' CODEC(ZSTD(1)), -- per-key attribution ('' = base/whole-queue row) + event_time DateTime CODEC(Delta(4), ZSTD(1)), + order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e6+seq), deltaSumTimestamp ordering key + op LowCardinality(String), -- gauge | enqueue | started | ack | nack | dlq + running UInt32 DEFAULT 0, + queued UInt32 DEFAULT 0, + queue_limit UInt32 DEFAULT 0, + env_running UInt32 DEFAULT 0, + env_queued UInt32 DEFAULT 0, + env_limit UInt32 DEFAULT 0, + throttled UInt8 DEFAULT 0, -- 1 on a gauge emission with running>=limit AND queued>0 + ck_backlogged UInt32 DEFAULT 0, -- gauge on CK queues: distinct concurrency keys with queued work + ck_max_wait_ms UInt32 DEFAULT 0, -- gauge on CK queues: most-starved key's head-of-line wait + wait_ms UInt32 DEFAULT 0, -- set on op='started' (scheduling delay) + cumulative UInt64 DEFAULT 0 -- monotonic per-(queue,op) odometer on a counter op, diffed at read time +) +ENGINE = MergeTree() +PARTITION BY toDate(event_time) +ORDER BY (organization_id, project_id, environment_id, queue_name, event_time) +TTL event_time + INTERVAL 6 HOUR +SETTINGS non_replicated_deduplication_window = 1000, ttl_only_drop_parts = 1; + +-- (2) Aggregated read target (TRQL/dashboards query this). 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + -- Cumulative-counter deltas: each op maintains a monotonic odometer, and deltaSumTimestamp + -- sums positive consecutive deltas (ignoring resets) ordered by order_key, so a lost + -- reading self-heals (the next surviving reading restates the total). Read with + -- deltaSumTimestampMerge(), never sum(). + enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + throttled_count SimpleAggregateFunction(sum, UInt64), + + max_queued SimpleAggregateFunction(max, UInt32), + max_running SimpleAggregateFunction(max, UInt32), + max_limit SimpleAggregateFunction(max, UInt32), + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + max_ck_backlogged SimpleAggregateFunction(max, UInt32), + max_ck_wait_ms SimpleAggregateFunction(max, UInt32), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000; + +-- (3) MV: raw -> aggregated, 10s buckets. 
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_mv_v1 +TO trigger_dev.queue_metrics_v1 AS +SELECT + organization_id, project_id, environment_id, queue_name, + toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start, + deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue' AND concurrency_key = '') AS enqueue_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'started' AND concurrency_key = '') AS started_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'ack' AND concurrency_key = '') AS ack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'nack' AND concurrency_key = '') AS nack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq' AND concurrency_key = '') AS dlq_delta, + sum(throttled) AS throttled_count, + max(queued) AS max_queued, + max(running) AS max_running, + max(queue_limit) AS max_limit, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + max(ck_backlogged) AS max_ck_backlogged, + max(ck_max_wait_ms) AS max_ck_wait_ms, + sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count, + quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; + +-- (4) Env-level 10s rollup (no queue dimension) for header tiles/saturation charts. +-- Row count is queue-independent (~8640/day/env), so full granularity stays cheap at any range. +-- No counter deltas on purpose: cross-queue deltaSumTimestamp state merges mix unrelated +-- odometers (env totals must GROUP BY queue then sum). TDigest because an env-level +-- reservoir absorbs every sample in the environment. 
+CREATE TABLE IF NOT EXISTS trigger_dev.env_metrics_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + throttled_count SimpleAggregateFunction(sum, UInt64), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantilesTDigest(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000; + +-- (5) MV: raw -> env rollup. +CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.env_metrics_mv_v1 +TO trigger_dev.env_metrics_v1 AS +SELECT + organization_id, project_id, environment_id, + toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + sum(throttled) AS throttled_count, + sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count, + quantilesTDigestStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, bucket_start; + +-- (6) Per-queue 5m rollup, exact column mirror of queue_metrics_v1, for ranking and +-- env-wide GROUP BY queue reads at long ranges. 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_5m_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + throttled_count SimpleAggregateFunction(sum, UInt64), + + max_queued SimpleAggregateFunction(max, UInt32), + max_running SimpleAggregateFunction(max, UInt32), + max_limit SimpleAggregateFunction(max, UInt32), + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + max_ck_backlogged SimpleAggregateFunction(max, UInt32), + max_ck_wait_ms SimpleAggregateFunction(max, UInt32), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000; + +-- (7) MV: raw -> 5m rollup. MUST read raw, never cascade off queue_metrics_v1 with +-- -MergeState: MV GROUP BY merges states in hash order, and out-of-time-order +-- deltaSumTimestamp merges double-count bridging spans (verified 3x inflation). 
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_5m_mv_v1 +TO trigger_dev.queue_metrics_5m_v1 AS +SELECT + organization_id, project_id, environment_id, queue_name, + toStartOfInterval(event_time, INTERVAL 5 MINUTE) AS bucket_start, + deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue' AND concurrency_key = '') AS enqueue_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'started' AND concurrency_key = '') AS started_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'ack' AND concurrency_key = '') AS ack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'nack' AND concurrency_key = '') AS nack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq' AND concurrency_key = '') AS dlq_delta, + sum(throttled) AS throttled_count, + max(queued) AS max_queued, + max(running) AS max_running, + max(queue_limit) AS max_limit, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + max(ck_backlogged) AS max_ck_backlogged, + max(ck_max_wait_ms) AS max_ck_wait_ms, + sumIf(wait_ms, op = 'started' AND concurrency_key = '') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_ms_count, + quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0 AND concurrency_key = '') AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; + + +-- (8) Per-concurrency-key 10s tier. Rows are activity-bound (a (queue, key, bucket) row +-- exists only when that key had an event in that bucket), so user-controlled key +-- cardinality cannot inflate it beyond event volume (~19 bytes/event measured). +-- Lean columns: no nack/dlq deltas and no per-key quantile states (mean wait via sums). 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_ck_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + concurrency_key String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + + max_queued SimpleAggregateFunction(max, UInt32), + max_running SimpleAggregateFunction(max, UInt32), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, queue_name, concurrency_key, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000; + +-- (9) MV: raw -> per-key tier. Only rows with a real key: per-key counter rows carry +-- per-key odometers (safe to merge within their own (queue, key) group), and per-key +-- gauge rows carry per-subqueue depth/running. 
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_ck_mv_v1 +TO trigger_dev.queue_metrics_ck_v1 AS +SELECT + organization_id, project_id, environment_id, queue_name, concurrency_key, + toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start, + deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue') AS enqueue_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'started') AS started_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'ack') AS ack_delta, + maxIf(queued, op = 'gauge') AS max_queued, + maxIf(running, op = 'gauge') AS max_running, + sumIf(wait_ms, op = 'started') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count +FROM trigger_dev.queue_metrics_raw_v1 +WHERE concurrency_key != '' +GROUP BY organization_id, project_id, environment_id, queue_name, concurrency_key, bucket_start; + +-- +goose Down +DROP VIEW IF EXISTS trigger_dev.queue_metrics_ck_mv_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_ck_v1; +DROP VIEW IF EXISTS trigger_dev.queue_metrics_5m_mv_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_5m_v1; +DROP VIEW IF EXISTS trigger_dev.env_metrics_mv_v1; +DROP TABLE IF EXISTS trigger_dev.env_metrics_v1; +DROP VIEW IF EXISTS trigger_dev.queue_metrics_mv_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_raw_v1; diff --git a/internal-packages/clickhouse/src/client/tsql.ts b/internal-packages/clickhouse/src/client/tsql.ts index c712820812f..ddf1d059b97 100644 --- a/internal-packages/clickhouse/src/client/tsql.ts +++ b/internal-packages/clickhouse/src/client/tsql.ts @@ -108,6 +108,11 @@ export interface ExecuteTSQLOptions { * based on the span of the time range. */ timeRange?: TimeRange; + /** + * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query + * (counters zero-fill, gauges carry forward). Off by default. 
+ */ + fillGaps?: boolean; } /** @@ -192,6 +197,7 @@ export async function executeTSQL( fieldMappings: options.fieldMappings, whereClauseFallback: options.whereClauseFallback, timeRange: options.timeRange, + fillGaps: options.fillGaps, }); generatedSql = sql; diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 0b252a98f67..97c2209b1cb 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -32,6 +32,14 @@ import { } from "./taskEvents.js"; import { insertMetrics } from "./metrics.js"; import { insertLlmMetrics } from "./llmMetrics.js"; +import { + insertQueueMetricsRaw, + getQueueListMetricsSummary, + getQueueDepthSparklines, + getQueueRanking, + getQueueRankingNames, + getQueueRankingCount, +} from "./queueMetrics.js"; import { getSessionTagsQueryBuilder, getSessionsCountQueryBuilder, @@ -65,6 +73,7 @@ export type * from "./taskRuns.js"; export type * from "./taskEvents.js"; export type * from "./metrics.js"; export type * from "./llmMetrics.js"; +export type * from "./queueMetrics.js"; export type * from "./llmModelAggregates.js"; export type * from "./errors.js"; export type * from "./sessions.js"; @@ -260,6 +269,17 @@ export class ClickHouse { }; } + get queueMetrics() { + return { + insertRaw: insertQueueMetricsRaw(this.writer), + listSummary: getQueueListMetricsSummary(this.reader), + depthSparklines: getQueueDepthSparklines(this.reader), + ranking: getQueueRanking(this.reader), + rankingNames: getQueueRankingNames(this.reader), + rankingCount: getQueueRankingCount(this.reader), + }; + } + get llmModelAggregates() { return { globalMetrics: getGlobalModelMetrics(this.reader), diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts new file mode 100644 index 00000000000..00532041e44 --- /dev/null +++ b/internal-packages/clickhouse/src/queueMetrics.test.ts @@ -0,0 +1,525 @@ +import { 
clickhouseTest } from "@internal/testcontainers"; +import { z } from "zod"; +import { ClickHouse } from "./index.js"; +import type { QueueMetricsRawV1Input } from "./queueMetrics.js"; + +const ORG = "org_qm"; +const PROJECT = "project_qm"; +const ENV = "env_qm"; +const EVENT_TIME = "2026-06-30 12:00:05"; // all rows land in the 10s bucket starting 12:00:00 + +function base(op: QueueMetricsRawV1Input["op"], queue: string): QueueMetricsRawV1Input { + return { + organization_id: ORG, + project_id: PROJECT, + environment_id: ENV, + queue_name: queue, + event_time: EVENT_TIME, + op, + }; +} + +// Cumulative counters: each op keeps a monotonic per-(queue,op) odometer, so a counter row +// carries the running total in `cumulative`. deltaSumTimestamp reconstructs the increase +// (last - first) from a seeded cum=0 baseline; order_key orders readings within an op. +let orderKey = 0; +function counter( + op: QueueMetricsRawV1Input["op"], + queue: string, + total: number, + waits?: number[] +): QueueMetricsRawV1Input[] { + const rows: QueueMetricsRawV1Input[] = [ + { ...base(op, queue), cumulative: 0, order_key: orderKey++ }, + ]; + for (let cum = 1; cum <= total; cum++) { + rows.push({ + ...base(op, queue), + cumulative: cum, + order_key: orderKey++, + ...(waits ? 
{ wait_ms: waits[cum - 1] } : {}), + }); + } + return rows; +} + +const aggregatedRow = z.object({ + enqueue_count: z.coerce.number(), + started_count: z.coerce.number(), + ack_count: z.coerce.number(), + nack_count: z.coerce.number(), + dlq_count: z.coerce.number(), + throttled_count: z.coerce.number(), + max_running: z.coerce.number(), + max_queued: z.coerce.number(), + max_limit: z.coerce.number(), + max_env_running: z.coerce.number(), + max_env_queued: z.coerce.number(), + max_env_limit: z.coerce.number(), + max_ck_backlogged: z.coerce.number(), + max_ck_wait_ms: z.coerce.number(), + wait_ms_sum: z.coerce.number(), + wait_ms_count: z.coerce.number(), + wait_p50: z.coerce.number(), + wait_p90: z.coerce.number(), + wait_p95: z.coerce.number(), + wait_p99: z.coerce.number(), +}); + +function readAggregated(ch: ClickHouse) { + return ch.reader.query({ + name: "read-queue-metrics-aggregated", + query: `SELECT + deltaSumTimestampMerge(enqueue_delta) AS enqueue_count, + deltaSumTimestampMerge(started_delta) AS started_count, + deltaSumTimestampMerge(ack_delta) AS ack_count, + deltaSumTimestampMerge(nack_delta) AS nack_count, + deltaSumTimestampMerge(dlq_delta) AS dlq_count, + sum(throttled_count) AS throttled_count, + max(max_running) AS max_running, + max(max_queued) AS max_queued, + max(max_limit) AS max_limit, + max(max_env_running) AS max_env_running, + max(max_env_queued) AS max_env_queued, + max(max_env_limit) AS max_env_limit, + max(max_ck_backlogged) AS max_ck_backlogged, + max(max_ck_wait_ms) AS max_ck_wait_ms, + sum(wait_ms_sum) AS wait_ms_sum, + sum(wait_ms_count) AS wait_ms_count, + quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles) AS wait_arr, + wait_arr[1] AS wait_p50, + wait_arr[2] AS wait_p90, + wait_arr[3] AS wait_p95, + wait_arr[4] AS wait_p99 + FROM trigger_dev.queue_metrics_v1 + WHERE queue_name = {queueName: String} + GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start`, + schema: aggregatedRow, + params: z.object({ 
queueName: z.string() }), + }); +} + +// Synchronous insert so the MV-populated rows are queryable immediately. +const SYNC = { params: { clickhouse_settings: { async_insert: 0 as const } } }; + +describe("queue_metrics_v1", () => { + clickhouseTest( + "buckets counters, gauges and wait percentiles via the MV", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + const queue = "queue-a"; + + const rows: QueueMetricsRawV1Input[] = [ + ...counter("enqueue", queue, 3), + ...counter("started", queue, 10, [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]), + ...counter("ack", queue, 2), + ...counter("nack", queue, 1), + ...counter("dlq", queue, 1), + { + ...base("gauge", queue), + running: 8, + queued: 4, + queue_limit: 10, + env_running: 40, + env_queued: 10, + env_limit: 50, + throttled: 0, + ck_backlogged: 3, + ck_max_wait_ms: 2500, + }, + { + ...base("gauge", queue), + running: 10, + queued: 6, + queue_limit: 10, + env_running: 50, + env_queued: 20, + env_limit: 50, + throttled: 1, // running >= limit AND queued > 0 + ck_backlogged: 2, + ck_max_wait_ms: 1500, + }, + ]; + + const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC); + expect(insertError).toBeNull(); + + const [queryError, result] = await readAggregated(ch)({ queueName: queue }); + expect(queryError).toBeNull(); + expect(result).toHaveLength(1); + const row = result![0]!; + + expect(row.enqueue_count).toBe(3); + expect(row.started_count).toBe(10); + expect(row.ack_count).toBe(2); + expect(row.nack_count).toBe(1); + expect(row.dlq_count).toBe(1); + expect(row.throttled_count).toBe(1); + + expect(row.max_running).toBe(10); + expect(row.max_queued).toBe(6); + expect(row.max_limit).toBe(10); + expect(row.max_env_running).toBe(50); + expect(row.max_env_queued).toBe(20); + expect(row.max_env_limit).toBe(50); + expect(row.max_ck_backlogged).toBe(3); + expect(row.max_ck_wait_ms).toBe(2500); + + 
expect(row.wait_ms_sum).toBe(5500); + expect(row.wait_ms_count).toBe(10); + + // Percentiles over [100..1000]: monotonic and within the value range. + expect(row.wait_p50).toBeGreaterThanOrEqual(400); + expect(row.wait_p50).toBeLessThanOrEqual(650); + expect(row.wait_p90).toBeGreaterThanOrEqual(row.wait_p50); + expect(row.wait_p95).toBeGreaterThanOrEqual(row.wait_p90); + expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p95); + expect(row.wait_p99).toBeLessThanOrEqual(1000); + + await ch.close(); + } + ); + + clickhouseTest( + "merges wait-quantile state across separate insert blocks", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + const queue = "queue-b"; + + // Cumulative odometer continues across the two insert blocks (baseline 0, then 1..10); + // deltaSumTimestamp state and quantile state merge across the parts into one bucket. + const startedRow = (cum: number, wait_ms?: number): QueueMetricsRawV1Input => ({ + ...base("started", queue), + cumulative: cum, + order_key: orderKey++, + ...(wait_ms !== undefined ? { wait_ms } : {}), + }); + + const [e1] = await ch.queueMetrics.insertRaw( + [startedRow(0), ...[100, 200, 300, 400, 500].map((w, i) => startedRow(i + 1, w))], + SYNC + ); + expect(e1).toBeNull(); + const [e2] = await ch.queueMetrics.insertRaw( + [600, 700, 800, 900, 1000].map((w, i) => startedRow(i + 6, w)), + SYNC + ); + expect(e2).toBeNull(); + + const [queryError, result] = await readAggregated(ch)({ queueName: queue }); + expect(queryError).toBeNull(); + expect(result).toHaveLength(1); + const row = result![0]!; + + // Both blocks contribute to one bucket: counts and sums add, quantile state merges. 
+ expect(row.started_count).toBe(10); + expect(row.wait_ms_sum).toBe(5500); + expect(row.wait_ms_count).toBe(10); + expect(row.wait_p50).toBeGreaterThanOrEqual(400); + expect(row.wait_p50).toBeLessThanOrEqual(650); + expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p50); + expect(row.wait_p99).toBeLessThanOrEqual(1000); + + await ch.close(); + } + ); + + clickhouseTest( + "5m and env rollups agree with the 10s tier, and env buckets are 10s", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + + // Own org so the env-level read (no queue filter) stays isolated from other tests. + const rollOrg = "org_qm_roll"; + const rows: QueueMetricsRawV1Input[] = [ + ...counter("started", "roll-a", 7, [100, 150, 200, 250, 300, 350, 400]), + ...counter("started", "roll-b", 3, [500, 600, 700]), + { + ...base("gauge", "roll-a"), + running: 4, + queued: 9, + env_running: 30, + env_limit: 50, + ck_backlogged: 5, + ck_max_wait_ms: 9000, + }, + { ...base("gauge", "roll-b"), running: 2, queued: 1, env_running: 45, env_limit: 50 }, + { + ...base("gauge", "roll-a"), + event_time: "2026-06-30 12:00:15", + running: 1, + queued: 2, + env_running: 20, + env_limit: 50, + ck_backlogged: 2, + ck_max_wait_ms: 3000, + }, + ].map((row) => ({ ...row, organization_id: rollOrg })); + const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC); + expect(insertError).toBeNull(); + + const perQueue = (table: string) => + ch.reader.query({ + name: "per-queue-both-tiers", + query: `SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM ${table} + WHERE queue_name IN ('roll-a', 'roll-b') + GROUP BY queue_name ORDER BY queue_name`, + schema: z.object({ queue_name: z.string(), started: z.coerce.number() }), + })({}); + const [e10, rows10] = await perQueue("trigger_dev.queue_metrics_v1"); + const [e5m, rows5m] = await perQueue("trigger_dev.queue_metrics_5m_v1"); + expect(e10).toBeNull(); + 
expect(e5m).toBeNull(); + expect(rows10).toEqual([ + { queue_name: "roll-a", started: 7 }, + { queue_name: "roll-b", started: 3 }, + ]); + expect(rows5m).toEqual(rows10); + + // CK-health gauges roll into the 5m mirror too. + const [ckError, ckRows] = await ch.reader.query({ + name: "ck-5m-read", + query: `SELECT max(max_ck_backlogged) AS ck_keys, max(max_ck_wait_ms) AS ck_wait + FROM trigger_dev.queue_metrics_5m_v1 + WHERE queue_name = 'roll-a'`, + schema: z.object({ ck_keys: z.coerce.number(), ck_wait: z.coerce.number() }), + })({}); + expect(ckError).toBeNull(); + expect(ckRows![0]).toEqual({ ck_keys: 5, ck_wait: 9000 }); + + // Env-wide totals: sum of per-queue merges (a single merge across queues would mix + // odometers and double-count). + const [envTotalError, envTotal] = await ch.reader.query({ + name: "env-total-per-queue-sum", + query: `SELECT sum(started) AS started FROM ( + SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM trigger_dev.queue_metrics_5m_v1 + WHERE queue_name IN ('roll-a', 'roll-b') + GROUP BY queue_name + )`, + schema: z.object({ started: z.coerce.number() }), + })({}); + expect(envTotalError).toBeNull(); + expect(envTotal![0]!.started).toBe(10); + + const [envError, envRows] = await ch.reader.query({ + name: "env-rollup-read", + query: `SELECT + max(max_env_running) AS max_env_running, + max(max_env_limit) AS max_env_limit, + uniqExact(bucket_start) AS buckets, + round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS wait_p99 + FROM trigger_dev.env_metrics_v1 + WHERE organization_id = {org: String}`, + schema: z.object({ + max_env_running: z.coerce.number(), + max_env_limit: z.coerce.number(), + buckets: z.coerce.number(), + wait_p99: z.coerce.number(), + }), + params: z.object({ org: z.string() }), + })({ org: rollOrg }); + expect(envError).toBeNull(); + expect(envRows![0]!.max_env_running).toBe(45); + expect(envRows![0]!.max_env_limit).toBe(50); + // 12:00:05 and 12:00:15 land in separate 10s 
  // Exercises the three ranking reads (page, count, names) against three queues that only
  // have gauge rows and differ in their queued/running values.
  clickhouseTest(
    "merged ranking returns the page and the windowed total in one query",
    async ({ clickhouseContainer }) => {
      const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" });

      // One gauge row per queue, stamped with the shared ORG/PROJECT/ENV and EVENT_TIME via base().
      const gauge = (queue: string, queued: number, running: number): QueueMetricsRawV1Input => ({
        ...base("gauge", queue),
        queued,
        running,
      });
      // Inserted out of rank order so the expected ordering must come from the query itself.
      const [insertError] = await ch.queueMetrics.insertRaw(
        [gauge("rank-low", 1, 0), gauge("rank-high", 50, 3), gauge("rank-mid", 10, 2)],
        SYNC
      );
      expect(insertError).toBeNull();

      // nameContains restricts the reads to this test's "rank-" queues; byQueuedOnly: 0 ranks
      // by queued + running (53, 12 and 1 for high, mid and low).
      const args = {
        organizationId: ORG,
        projectId: PROJECT,
        environmentId: ENV,
        startTime: "2026-06-30 11:50:00",
        nameContains: "rank-",
        byQueuedOnly: 0,
      };
      // The page is limited to 2 rows, yet every row reports the full ranked total of 3.
      const [pageError, page] = await ch.queueMetrics.ranking({ ...args, limit: 2, offset: 0 });
      expect(pageError).toBeNull();
      expect(page).toEqual([
        { queue_name: "rank-high", ranked_total: 3 },
        { queue_name: "rank-mid", ranked_total: 3 },
      ]);

      // The standalone count agrees with the windowed total.
      const [countError, count] = await ch.queueMetrics.rankingCount(args);
      expect(countError).toBeNull();
      expect(count![0]!.ranked).toBe(3);

      // With a limit above the ranked count, all three names come back in activity order.
      const [namesError, names] = await ch.queueMetrics.rankingNames({ ...args, limit: 10 });
      expect(namesError).toBeNull();
      expect(names!.map((r) => r.queue_name)).toEqual(["rank-high", "rank-mid", "rank-low"]);

      await ch.close();
    }
  );
[100, 200, 300]), + { ...base("gauge", "dedup-q"), running: 2, queued: 1, env_running: 5, env_limit: 10 }, + ].map((row) => ({ ...row, organization_id: dedupOrg })); + + const retrySettings = { + params: { + clickhouse_settings: { + async_insert: 0 as const, + insert_deduplication_token: "qm-test-retry-batch", + deduplicate_blocks_in_dependent_materialized_views: 1 as const, + }, + }, + }; + for (let attempt = 0; attempt < 3; attempt++) { + const [error] = await ch.queueMetrics.insertRaw(rows, retrySettings); + expect(error).toBeNull(); + } + + const [tiersError, tiers] = await ch.reader.query({ + name: "dedup-tier-counts", + query: `SELECT + (SELECT count() FROM trigger_dev.queue_metrics_v1 WHERE organization_id = {org: String}) AS rows_10s, + (SELECT count() FROM trigger_dev.queue_metrics_5m_v1 WHERE organization_id = {org: String}) AS rows_5m, + (SELECT count() FROM trigger_dev.env_metrics_v1 WHERE organization_id = {org: String}) AS rows_env, + (SELECT sum(wait_ms_count) FROM trigger_dev.env_metrics_v1 WHERE organization_id = {org: String}) AS wait_count, + (SELECT deltaSumTimestampMerge(started_delta) FROM trigger_dev.queue_metrics_v1 WHERE organization_id = {org: String}) AS started`, + schema: z.object({ + rows_10s: z.coerce.number(), + rows_5m: z.coerce.number(), + rows_env: z.coerce.number(), + wait_count: z.coerce.number(), + started: z.coerce.number(), + }), + params: z.object({ org: z.string() }), + })({ org: dedupOrg }); + expect(tiersError).toBeNull(); + const t = tiers![0]!; + // Without dedup windows on the MV targets, retries append copies: rows and sums triple. 
+ expect(t.rows_10s).toBe(1); + expect(t.rows_5m).toBe(1); + expect(t.rows_env).toBe(1); + expect(t.wait_count).toBe(3); + expect(t.started).toBe(3); + + await ch.close(); + } + ); +}); + +describe("per-concurrency-key tier", () => { + clickhouseTest( + "per-key rows feed the ck tier without polluting per-queue counters or waits", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + const ckOrg = "org_qm_ck"; + const queue = "ck-tier-q"; + const withCk = (row: QueueMetricsRawV1Input, ck: string): QueueMetricsRawV1Input => ({ + ...row, + concurrency_key: ck, + }); + + // 5 started events on one queue across two keys (t1 x3, t2 x2). Each event lands as + // a base row (base odometer) + a per-key row (per-key odometer), both carrying wait, + // exactly like the consumer expansion. Baselines seed each odometer. + const rows: QueueMetricsRawV1Input[] = []; + let ok = 0; + const started = (cum: number, ck: string, ckcum: number, wait: number) => { + rows.push({ ...base("started", queue), cumulative: cum, order_key: ok, wait_ms: wait }); + rows.push( + withCk({ ...base("started", queue), cumulative: ckcum, order_key: ok, wait_ms: wait }, ck) + ); + ok++; + }; + rows.push({ ...base("started", queue), cumulative: 0, order_key: ok++ }); + rows.push(withCk({ ...base("started", queue), cumulative: 0, order_key: ok++ }, "t1")); + rows.push(withCk({ ...base("started", queue), cumulative: 0, order_key: ok++ }, "t2")); + started(1, "t1", 1, 100); + started(2, "t1", 2, 200); + started(3, "t2", 1, 300); + started(4, "t1", 3, 400); + started(5, "t2", 2, 500); + // Per-subqueue gauges carry the key. 
+ rows.push(withCk({ ...base("gauge", queue), queued: 4, running: 1 }, "t1")); + rows.push(withCk({ ...base("gauge", queue), queued: 2, running: 0 }, "t2")); + + const [insertError] = await ch.queueMetrics.insertRaw( + rows.map((r) => ({ ...r, organization_id: ckOrg })), + SYNC + ); + expect(insertError).toBeNull(); + + const [perQueueError, perQueue] = await ch.reader.query({ + name: "ck-per-queue-read", + query: `SELECT + deltaSumTimestampMerge(started_delta) AS started, + sum(wait_ms_sum) AS wait_sum, + sum(wait_ms_count) AS wait_count, + max(max_queued) AS peak_queued + FROM trigger_dev.queue_metrics_v1 + WHERE organization_id = {org: String}`, + schema: z.object({ + started: z.coerce.number(), + wait_sum: z.coerce.number(), + wait_count: z.coerce.number(), + peak_queued: z.coerce.number(), + }), + params: z.object({ org: z.string() }), + })({ org: ckOrg }); + expect(perQueueError).toBeNull(); + // Base rows only: 5 events (not 10), waits counted once, per-key gauges still max in. + expect(perQueue![0]).toEqual({ started: 5, wait_sum: 1500, wait_count: 5, peak_queued: 4 }); + + const [ckError, ckRows] = await ch.reader.query({ + name: "ck-tier-read", + query: `SELECT concurrency_key, + deltaSumTimestampMerge(started_delta) AS started, + max(max_queued) AS peak_queued, + sum(wait_ms_sum) AS wait_sum + FROM trigger_dev.queue_metrics_ck_v1 + WHERE organization_id = {org: String} + GROUP BY concurrency_key ORDER BY concurrency_key`, + schema: z.object({ + concurrency_key: z.string(), + started: z.coerce.number(), + peak_queued: z.coerce.number(), + wait_sum: z.coerce.number(), + }), + params: z.object({ org: z.string() }), + })({ org: ckOrg }); + expect(ckError).toBeNull(); + expect(ckRows).toEqual([ + { concurrency_key: "t1", started: 3, peak_queued: 4, wait_sum: 700 }, + { concurrency_key: "t2", started: 2, peak_queued: 2, wait_sum: 800 }, + ]); + + await ch.close(); + } + ); +}); diff --git a/internal-packages/clickhouse/src/queueMetrics.ts 
/**
 * Shape of one row written to `trigger_dev.queue_metrics_raw_v1`.
 *
 * The identity fields, `event_time` and `op` are required; every measurement field is
 * optional so a row only carries the values relevant to its `op`.
 */
export const QueueMetricsRawV1Input = z.object({
  organization_id: z.string(),
  project_id: z.string(),
  environment_id: z.string(),
  queue_name: z.string(),
  concurrency_key: z.string().optional(),
  event_time: z.string(),
  // Exact UInt64 ordering key; a string preserves precision past JS safe-integer range
  // (see entryOrderKey). A plain number is still accepted for small test values.
  order_key: z.union([z.string(), z.number()]).optional(),
  op: z.enum(["gauge", "enqueue", "started", "ack", "nack", "dlq"]),
  running: z.number().optional(),
  queued: z.number().optional(),
  queue_limit: z.number().optional(),
  env_running: z.number().optional(),
  env_queued: z.number().optional(),
  env_limit: z.number().optional(),
  throttled: z.number().optional(),
  ck_backlogged: z.number().optional(),
  ck_max_wait_ms: z.number().optional(),
  wait_ms: z.number().optional(),
  cumulative: z.number().optional(),
});

/** Input (pre-parse) type of {@link QueueMetricsRawV1Input}; shares the schema's name. */
export type QueueMetricsRawV1Input = z.input<typeof QueueMetricsRawV1Input>;
/**
 * Builds the per-queue summary read for a fixed set of queues (the visible page).
 *
 * For each requested queue inside `[startTime, endTime)` the query returns the rounded
 * p50 and p95 wait, the peak queued depth and the started count, read from the 10s
 * `queue_metrics_v1` table with the shared query-cache settings.
 */
export function getQueueListMetricsSummary(reader: ClickhouseReader) {
  const summarySql = `SELECT
        queue_name,
        round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[1]) AS p50_wait_ms,
        round(quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[3]) AS p95_wait_ms,
        max(max_queued) AS peak_queued,
        deltaSumTimestampMerge(started_delta) AS started_count
      FROM trigger_dev.queue_metrics_v1
      WHERE organization_id = {organizationId: String}
        AND project_id = {projectId: String}
        AND environment_id = {environmentId: String}
        AND queue_name IN {queueNames: Array(String)}
        AND bucket_start >= {startTime: DateTime}
        AND bucket_start < {endTime: DateTime}
      GROUP BY queue_name`;

  return reader.query({
    name: "getQueueListMetricsSummary",
    query: summarySql,
    params: QueueMetricsListParams,
    schema: QueueMetricsSummaryRow,
    settings: QUEUE_METRICS_CACHE_SETTINGS,
  });
}
/**
 * Builds the sparkline read: peak queued depth per queue and per `bucketSeconds`-wide
 * bucket inside `[startTime, endTime)`, ordered by bucket. Buckets with no rows are
 * simply absent; filling them is left to the caller.
 */
export function getQueueDepthSparklines(reader: ClickhouseReader) {
  const sparklineSql = `SELECT
        queue_name,
        toStartOfInterval(bucket_start, toIntervalSecond({bucketSeconds: UInt32})) AS bucket,
        max(max_queued) AS depth
      FROM trigger_dev.queue_metrics_v1
      WHERE organization_id = {organizationId: String}
        AND project_id = {projectId: String}
        AND environment_id = {environmentId: String}
        AND queue_name IN {queueNames: Array(String)}
        AND bucket_start >= {startTime: DateTime}
        AND bucket_start < {endTime: DateTime}
      GROUP BY queue_name, bucket
      ORDER BY bucket`;

  return reader.query({
    name: "getQueueDepthSparklines",
    query: sparklineSql,
    params: QueueDepthSparklineParams,
    schema: QueueDepthSparklineRow,
    settings: QUEUE_METRICS_CACHE_SETTINGS,
  });
}
/**
 * One page of queue names ranked by recent activity, with the total ranked count on
 * every row (window function), so page + count cost a single scan.
 *
 * The inner query aggregates each queue to a single `rank_score` (peak queued when
 * `byQueuedOnly` is 1, otherwise peak queued + peak running). The ordering is applied on
 * the OUTER query, next to LIMIT/OFFSET: an ORDER BY inside a subquery is not guaranteed
 * to be preserved by the enclosing query, which would make paging unstable. The queue
 * name is the tie-breaker so equal scores still page deterministically.
 *
 * `count() OVER ()` is evaluated before LIMIT/OFFSET, so `ranked_total` is the number of
 * ranked queues in the window, not the page size.
 */
export function getQueueRanking(reader: ClickhouseReader) {
  return reader.query({
    name: "getQueueRanking",
    query: `SELECT queue_name, count() OVER () AS ranked_total
      FROM (
        SELECT
          queue_name,
          if({byQueuedOnly: UInt8} = 1, max(max_queued), max(max_queued) + max(max_running)) AS rank_score
        FROM trigger_dev.queue_metrics_5m_v1
        WHERE ${RANKING_WHERE}
        GROUP BY queue_name
      )
      ORDER BY rank_score DESC, queue_name ASC
      LIMIT {limit: UInt32} OFFSET {offset: UInt32}`,
    params: QueueRankingParams,
    schema: QueueRankingRow,
    settings: QUEUE_METRICS_CACHE_SETTINGS,
  });
}
/**
 * Builds the count-only ranking read: the number of distinct queue names matching the
 * shared ranking filter in the 5m rollup. Uses `uniq`, so the result is approximate.
 */
export function getQueueRankingCount(reader: ClickhouseReader) {
  const countSql = `SELECT uniq(queue_name) AS ranked
      FROM trigger_dev.queue_metrics_5m_v1
      WHERE ${RANKING_WHERE}`;

  return reader.query({
    name: "getQueueRankingCount",
    query: countSql,
    params: QueueRankingCountParams,
    schema: QueueRankingCountRow,
    settings: QUEUE_METRICS_CACHE_SETTINGS,
  });
}
/** Construction options for {@link CachedRedisValue}. */
export type CachedRedisValueOptions<T> = {
  redis: RedisOptions;
  key: string;
  /** Converts the raw Redis string (or `null` when the key is missing) into the cached value. */
  parse: (raw: string | null) => T;
  /** Value served until the first successful read. */
  defaultValue: T;
  /** Age in ms after which `get()` triggers a background refresh. Defaults to 10 000. */
  cacheTtlMs?: number;
  logger?: Logger;
  /** Name for the default logger; only used when `logger` is not supplied. */
  loggerName?: string;
};

// Reads a Redis key with a short stale-while-revalidate cache and a synchronous getter for
// hot paths. Warms eagerly on construction; concurrent refreshes dedupe onto one GET so an
// awaited refresh always resolves to a completed read.
export class CachedRedisValue<T> {
  private readonly redis: Redis;
  private readonly key: string;
  private readonly parse: (raw: string | null) => T;
  private readonly cacheTtlMs: number;
  private readonly logger: Logger;
  private value: T;
  private lastFetchedAt = 0;
  private refreshPromise?: Promise<T>;

  constructor(options: CachedRedisValueOptions<T>) {
    this.logger = options.logger ?? new Logger(options.loggerName ?? "CachedRedisValue", "warn");
    // The key is read exactly as given: any keyPrefix on the supplied options is dropped.
    this.redis = createRedisClient(
      { ...options.redis, keyPrefix: undefined },
      {
        onError: (error) =>
          this.logger.error("cached value redis error", { error, key: options.key }),
      }
    );
    this.key = options.key;
    this.parse = options.parse;
    this.cacheTtlMs = options.cacheTtlMs ?? 10_000;
    this.value = options.defaultValue;
    // Eager warm-up; refresh() never rejects because #doRefresh catches its own errors.
    void this.refresh();
  }

  /**
   * Returns the cached value synchronously. When the last fetch attempt is older than
   * `cacheTtlMs`, a background refresh is started and the current (stale) value is
   * returned in the meantime.
   */
  get(): T {
    if (Date.now() - this.lastFetchedAt > this.cacheTtlMs) {
      void this.refresh();
    }
    return this.value;
  }

  /**
   * Reads the key from Redis and resolves to the value held afterwards. Calls made while
   * a read is in flight share that read instead of issuing another GET.
   */
  async refresh(): Promise<T> {
    if (this.refreshPromise) return this.refreshPromise;
    this.refreshPromise = this.#doRefresh();
    try {
      return await this.refreshPromise;
    } finally {
      this.refreshPromise = undefined;
    }
  }

  async #doRefresh(): Promise<T> {
    try {
      this.value = this.parse(await this.redis.get(this.key));
    } catch (error) {
      this.logger.debug("cached value refresh failed, keeping cached value", {
        error,
        key: this.key,
      });
    } finally {
      // Stamped on failure too, so the next attempt waits a full TTL rather than
      // retrying on every get().
      this.lastFetchedAt = Date.now();
    }
    return this.value;
  }

  /** Closes the underlying Redis connection. */
  async close(): Promise<void> {
    await this.redis.quit();
  }
}

/** Construction options for {@link CachedRedisNumber}. */
export type CachedRedisNumberOptions = {
  redis: RedisOptions;
  key: string;
  defaultValue: number;
  min?: number;
  max?: number;
  cacheTtlMs?: number;
  logger?: Logger;
};

// Live-tunable numeric value, clamped to [min,max]; falls back to defaultValue on a
// missing/unparseable key. Exposes a synchronous value() for hot paths.
export class CachedRedisNumber {
  private readonly inner: CachedRedisValue<number>;

  constructor(options: CachedRedisNumberOptions) {
    const min = options.min ?? Number.NEGATIVE_INFINITY;
    const max = options.max ?? Number.POSITIVE_INFINITY;
    const clamp = (n: number) => Math.min(max, Math.max(min, n));
    // The default is clamped as well, so the fallback always lies inside [min,max].
    const fallback = clamp(options.defaultValue);
    this.inner = new CachedRedisValue<number>({
      redis: options.redis,
      key: options.key,
      parse: (raw) => {
        // Number("") is 0 (not NaN), so treat blank/whitespace as missing => fallback.
        const n = raw == null || raw.trim() === "" ? Number.NaN : Number(raw);
        return Number.isFinite(n) ? clamp(n) : fallback;
      },
      defaultValue: fallback,
      cacheTtlMs: options.cacheTtlMs,
      logger: options.logger,
      loggerName: "CachedRedisNumber",
    });
  }

  /** Current cached number (synchronous; may start a background refresh). */
  value(): number {
    return this.inner.get();
  }

  /** Forces a read from Redis and resolves to the resulting number. */
  refresh(): Promise<number> {
    return this.inner.refresh();
  }

  /** Closes the underlying Redis connection. */
  close(): Promise<void> {
    return this.inner.close();
  }
}
// Verifies that a disabled flag suppresses emit(): nothing may land on the shard stream.
redisTest("emit is a no-op when the flag is disabled", async ({ redisOptions }) => {
  const definition = definitionFor("off");
  const emitter = new MetricsStreamEmitter({
    redis: redisOptions,
    definition,
    flag: { enabled: () => false },
  });
  const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });

  try {
    // emit() also drops events while the connection is not yet "ready". Wait for
    // readiness so an empty stream can only be explained by the flag gate, not by
    // the not-ready guard (otherwise this test passes even if the gate is broken).
    await emitter.waitUntilReady();

    emitter.emit("q", { op: "enqueue", q: "q" });
    // emit() is fire-and-forget; give a (wrongly) issued write time to land.
    await new Promise((r) => setTimeout(r, 200));

    const len = await admin.xlen(streamKey(definition, shardFor("q", definition.shardCount)));
    expect(len).toBe(0);
  } finally {
    // Release both connections even when the assertion fails.
    await admin.quit();
    await emitter.close();
  }
});
"*", "op", "nack", "q", "qZ"); + await admin.xreadgroup( + "GROUP", + definition.consumerGroup, + "zombie", + "COUNT", + 10, + "STREAMS", + key, + ">" + ); + + const inserted: Array> = []; + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "live", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows) => { + inserted.push(...rows); + }, + blockMs: 200, + claimIdleMs: 0, + }); + + await consumer.start(); + await waitFor(() => inserted.length >= 2); + await consumer.stop(); + + expect(inserted.map((r) => r.op).sort()).toEqual(["ack", "nack"]); + const pending = (await admin.xpending(key, definition.consumerGroup)) as [number, ...unknown[]]; + expect(pending[0]).toBe(0); + await admin.quit(); +}); + +redisTest( + "per-stream batches: one insert + distinct dedup token per shard stream", + async ({ redisOptions }) => { + const definition = definitionFor("pershard", 2); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + }); + // Two shard keys that land on different shards. + const a = "shardkey-a"; + let b = "shardkey-b0"; + for (let i = 1; shardFor(b, 2) === shardFor(a, 2); i++) b = `shardkey-b${i}`; + + const inserted: Array<{ rows: Array>; dedupToken: string }> = []; + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows, { dedupToken }) => { + inserted.push({ rows, dedupToken }); + }, + blockMs: 200, + }); + + await consumer.start(); + emitter.emit(a, { op: "enqueue", q: a }); + emitter.emit(b, { op: "enqueue", q: b }); + await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); + await consumer.stop(); + await emitter.close(); + + // Each shard's batch is its own dedup block with its own (stream-scoped) token. 
// streamState() must surface an uncomputable group lag as null rather than 0.
redisTest(
  "probe reports lag as null (not 0) when Redis cannot compute it",
  async ({ redisOptions }) => {
    const definition = definitionFor("nillag", 1);
    const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
    const key = streamKey(definition, 0);

    await admin.xgroup("CREATE", key, definition.consumerGroup, "0", "MKSTREAM");
    const ids: string[] = [];
    for (let i = 0; i < 5; i++) {
      ids.push((await admin.xadd(key, "*", "op", "enqueue", "q", "qT")) as string);
    }
    // SETID to an arbitrary id makes the group's entries-read unknown => lag is nil
    // (severe trimming can do the same in prod); the probe must NOT report that as 0.
    await admin.xgroup("SETID", key, definition.consumerGroup, ids[2]!);

    // The consumer is never started: streamState() only needs its probe connection.
    const consumer = new MetricsStreamConsumer<Record<string, string>>({
      redis: redisOptions,
      definition,
      consumerName: "c1",
      mapEntry: (e) => ({ id: e.id, ...e.fields }),
      insert: async () => {},
    });
    try {
      const states = await consumer.streamState();
      expect(states[0]!.lag).toBeNull();
    } finally {
      await consumer.stop();
      await admin.quit();
    }
  }
);
/**
 * Polls an async condition every 50ms until it resolves to true.
 *
 * @param cond Async predicate; polling stops as soon as it resolves truthy.
 * @param timeoutMs Budget in milliseconds, checked after each failed poll.
 * @throws Error("waitFor2 timed out") once a failed poll finds the budget exceeded.
 */
async function waitFor2(cond: () => Promise<boolean>, timeoutMs = 5000): Promise<void> {
  const start = Date.now();
  while (!(await cond())) {
    if (Date.now() - start > timeoutMs) throw new Error("waitFor2 timed out");
    await new Promise<void>((resolve) => setTimeout(resolve, 50));
  }
}
// Exercises every CachedRedisNumber parse outcome against a real key:
// missing, in range, above max, below min, blank, and unparseable.
redisTest("CachedRedisNumber reads live, clamps, and falls back", async ({ redisOptions }) => {
  const key = `rate_${Date.now()}`;
  const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
  const num = new CachedRedisNumber({ redis: redisOptions, key, defaultValue: 1, min: 0, max: 1 });

  try {
    await num.refresh();
    expect(num.value()).toBe(1); // missing key => default
    await admin.set(key, "0.25");
    await num.refresh();
    expect(num.value()).toBe(0.25);
    await admin.set(key, "5");
    await num.refresh();
    expect(num.value()).toBe(1); // above max => clamped
    await admin.set(key, "-3");
    await num.refresh();
    expect(num.value()).toBe(0); // below min => clamped
    await admin.set(key, "   ");
    await num.refresh();
    expect(num.value()).toBe(1); // blank => default (NOT Number("") === 0)
    await admin.set(key, "nonsense");
    await num.refresh();
    expect(num.value()).toBe(1); // unparseable => default
  } finally {
    // Release both connections even when an assertion fails.
    await num.close();
    await admin.quit();
  }
});
// Checks that CachedRedisFlag tracks the Redis key: each explicit refresh() picks up
// the value written through the separate admin connection.
redisTest("CachedRedisFlag reads a redis key with caching", async ({ redisOptions }) => {
  const key = `flag_${Date.now()}`;
  const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined });
  const flag = new CachedRedisFlag({ redis: redisOptions, key, cacheTtlMs: 10_000 });

  // The key does not exist yet, so the flag is false both before and after a refresh.
  expect(flag.enabled()).toBe(false);
  await flag.refresh();
  expect(flag.enabled()).toBe(false);

  // "1" is one of the truthy values.
  await admin.set(key, "1");
  await flag.refresh();
  expect(flag.enabled()).toBe(true);

  // Any value outside the truthy set reads as disabled.
  await admin.set(key, "0");
  await flag.refresh();
  expect(flag.enabled()).toBe(false);

  await flag.close();
  await admin.quit();
});
/**
 * Construction options for `MetricsStreamConsumer`.
 *
 * @typeParam TRow Row shape produced by `mapEntry` and handed to `insert`.
 */
export type MetricsStreamConsumerOptions<TRow> = {
  /** Connection options for the Redis that holds the metric streams. */
  redis: RedisOptions;
  /** Stream family to read (name, shard count, consumer group, max length). */
  definition: MetricDefinition;
  /** Unique per process; distinct replicas MUST use distinct names (PEL ownership). */
  consumerName: string;
  /** Map a stream entry to a row, or null to drop it (still acked). */
  mapEntry: (entry: StreamEntry) => TRow | TRow[] | null;
  /** Insert a batch. Must be idempotent w.r.t. dedupToken; throw to retry the batch. */
  insert: (rows: TRow[], opts: { dedupToken: string }) => Promise<void>;
  /** Max entries requested per read / reclaim call. */
  batchSize?: number;
  /** How long a read blocks waiting for new entries, in ms. */
  blockMs?: number;
  /** Minimum idle time (ms) before a pending entry may be reclaimed. */
  claimIdleMs?: number;
  /** How often to scan for stale pending entries (XAUTOCLAIM); not every poll. */
  reclaimIntervalMs?: number;
  /** Pause (ms) after a failed loop iteration. */
  errorBackoffMs?: number;
  logger?: Logger;
  meter?: Meter;
};
/** Per-shard stream health, surfaced as observable gauges and usable directly in tests.
 * `lag: null` means Redis could not compute it (entries trimmed past the group's read
 * position) — treat as an alert, NOT as zero: it coincides with data loss. */
export type ShardState = { shard: number; depth: number; lag: number | null; pending: number };

/**
 * Turns a flat `[field, value, field, value, ...]` stream reply into an object.
 * A dangling trailing field without a value is ignored.
 */
function parseFields(flat: string[]): Record<string, string> {
  const out: Record<string, string> = {};
  for (let i = 0; i + 1 < flat.length; i += 2) {
    out[flat[i]!] = flat[i + 1]!;
  }
  return out;
}

/** Resolves after `ms` milliseconds. */
const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

/**
 * Reads a sharded metrics stream via a consumer group, inserting each stream's poll-batch
 * as its own dedup block (so an XAUTOCLAIM-reclaimed batch re-forms the same id set and
 * token), acking only after a successful insert. Sequential read/insert/ack per process.
 *
 * @typeParam TRow Row shape produced by `mapEntry` and passed to `insert`.
 */
export class MetricsStreamConsumer<TRow> {
  private readonly redis: Redis;
  private readonly probeRedis: Redis;
  private readonly def: MetricDefinition;
  private readonly keys: string[];
  private readonly consumerName: string;
  private readonly batchSize: number;
  private readonly blockMs: number;
  private readonly claimIdleMs: number;
  private readonly reclaimIntervalMs: number;
  private lastReclaimAt = 0;
  private readonly errorBackoffMs: number;
  private readonly logger: Logger;
  private readonly mapEntry: (entry: StreamEntry) => TRow | TRow[] | null;
  private readonly insert: (rows: TRow[], opts: { dedupToken: string }) => Promise<void>;

  private readonly meter: Meter;
  private readonly entriesCounter: Counter;
  private readonly rowsCounter: Counter;
  private readonly insertErrorCounter: Counter;
  private readonly insertDuration: Histogram;
  private readonly observables: ObservableGauge[];
  private readonly batchCallback: Parameters<Meter["addBatchObservableCallback"]>[0];

  private running = false;
  private loopPromise?: Promise<void>;

  /**
   * Opens the two Redis connections and registers the metric instruments.
   * Nothing is read from the streams until `start()` is called.
   */
  constructor(options: MetricsStreamConsumerOptions<TRow>) {
    this.logger = options.logger ?? new Logger("MetricsStreamConsumer", "info");
    const redisConfig = { ...options.redis, keyPrefix: undefined };
    this.redis = createRedisClient(redisConfig, {
      onError: (error) => this.logger.error("consumer redis error", { error }),
    });
    // Separate client so the observable-gauge probes never queue behind the blocking XREADGROUP.
    this.probeRedis = createRedisClient(redisConfig, {
      onError: (error) => this.logger.error("consumer probe redis error", { error }),
    });
    this.def = options.definition;
    this.keys = allStreamKeys(options.definition);
    this.consumerName = options.consumerName;
    this.batchSize = options.batchSize ?? 1000;
    this.blockMs = options.blockMs ?? 1000;
    this.claimIdleMs = options.claimIdleMs ?? 60_000;
    this.reclaimIntervalMs = options.reclaimIntervalMs ?? 15_000;
    this.errorBackoffMs = options.errorBackoffMs ?? 1000;
    this.mapEntry = options.mapEntry;
    this.insert = options.insert;

    this.meter = options.meter ?? getMeter("metrics-pipeline");
    this.entriesCounter = this.meter.createCounter("queue_metrics.consumer.entries", {
      description: "Stream entries read (attr source=new|reclaimed)",
      valueType: ValueType.INT,
    });
    this.rowsCounter = this.meter.createCounter("queue_metrics.consumer.rows_inserted", {
      description: "Rows inserted into the sink",
      valueType: ValueType.INT,
    });
    this.insertErrorCounter = this.meter.createCounter("queue_metrics.consumer.insert_errors", {
      description: "Failed inserts (batch left pending for retry)",
      valueType: ValueType.INT,
    });
    this.insertDuration = this.meter.createHistogram("queue_metrics.consumer.insert_duration", {
      description: "Sink insert latency",
      unit: "ms",
      valueType: ValueType.INT,
    });

    const depthGauge = this.meter.createObservableGauge("queue_metrics.consumer.stream_depth", {
      description: "Entries currently in each shard stream (approaches MAXLEN => trimming)",
      valueType: ValueType.INT,
    });
    const lagGauge = this.meter.createObservableGauge("queue_metrics.consumer.group_lag", {
      description: "Entries not yet delivered to the consumer group (consumer falling behind)",
      valueType: ValueType.INT,
    });
    const pendingGauge = this.meter.createObservableGauge("queue_metrics.consumer.pending", {
      description: "Unacked (in-flight or stuck) entries in the group PEL",
      valueType: ValueType.INT,
    });
    const lagUnknownGauge = this.meter.createObservableGauge("queue_metrics.consumer.lag_unknown", {
      description:
        "1 when Redis cannot compute group lag (entries trimmed => data loss); alert on this",
      valueType: ValueType.INT,
    });
    this.observables = [depthGauge, lagGauge, pendingGauge, lagUnknownGauge];
    this.batchCallback = async (result) => {
      const states = await this.streamState();
      for (const s of states) {
        const attrs = { stream: this.def.name, shard: String(s.shard) };
        result.observe(depthGauge, s.depth, attrs);
        // An unknown lag is reported through lag_unknown only, never as a number.
        if (s.lag !== null) result.observe(lagGauge, s.lag, attrs);
        result.observe(lagUnknownGauge, s.lag === null ? 1 : 0, attrs);
        result.observe(pendingGauge, s.pending, attrs);
      }
    };
    this.meter.addBatchObservableCallback(this.batchCallback, this.observables);
  }

  /** Ensures the consumer group exists on every shard, then starts the read loop. No-op if already running. */
  async start(): Promise<void> {
    if (this.running) return;
    await this.ensureGroups();
    this.running = true;
    this.loopPromise = this.loop();
  }

  /** Stops the loop, unregisters the gauge callback, and closes both Redis connections. */
  async stop(): Promise<void> {
    this.running = false;
    this.meter.removeBatchObservableCallback(this.batchCallback, this.observables);
    await this.loopPromise?.catch(() => {});
    await Promise.all([this.redis.quit().catch(() => {}), this.probeRedis.quit().catch(() => {})]);
  }

  /** Creates the consumer group (and stream) on each shard; an already-existing group is fine. */
  private async ensureGroups(): Promise<void> {
    for (const key of this.keys) {
      try {
        // "0" (not "$"): a brand-new stream's group must not skip entries emitted
        // between emitter boot and the first consumer's group creation.
        await this.redis.xgroup("CREATE", key, this.def.consumerGroup, "0", "MKSTREAM");
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        if (!message.includes("BUSYGROUP")) throw error;
      }
    }
  }

  /** Main loop: periodic reclaim of stale pending entries, then a blocking read of new ones. */
  private async loop(): Promise<void> {
    while (this.running) {
      try {
        if (Date.now() - this.lastReclaimAt >= this.reclaimIntervalMs) {
          // Stamped before the scan, so a failing reclaim is retried after the interval, not every poll.
          this.lastReclaimAt = Date.now();
          await this.reclaimStale();
        }
        await this.readNew();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        // Self-heal a missing group (stream trimmed to nothing / deleted / Redis flushed):
        // recreate it rather than wedging the loop on NOGROUP forever.
        if (message.includes("NOGROUP")) {
          this.logger.warn("consumer group missing; recreating", { error });
          await this.ensureGroups().catch(() => {});
        } else {
          this.logger.error("consumer loop iteration failed", { error });
        }
        await sleep(this.errorBackoffMs);
      }
    }
  }

  /** Blocking XREADGROUP across all shards; returns the number of entries processed. */
  private async readNew(): Promise<number> {
    const ids = this.keys.map(() => ">");
    const response = (await this.redis.xreadgroup(
      "GROUP",
      this.def.consumerGroup,
      this.consumerName,
      "COUNT",
      this.batchSize,
      "BLOCK",
      this.blockMs,
      "STREAMS",
      ...this.keys,
      ...ids
    )) as RawStream[] | null;

    if (!response) return 0;
    return this.processStreams(response, "new");
  }

  /** Claims up to `batchSize` entries idle for `claimIdleMs` on each shard and processes them. */
  private async reclaimStale(): Promise<void> {
    for (const key of this.keys) {
      const result = (await this.redis.xautoclaim(
        key,
        this.def.consumerGroup,
        this.consumerName,
        this.claimIdleMs,
        "0",
        "COUNT",
        this.batchSize
      )) as [string, RawEntry[], string[]] | null;

      const entries = result?.[1] ?? [];
      if (entries.length === 0) continue;
      await this.processStreams([[key, entries]], "reclaimed");
    }
  }

  // One insert (dedup block) and XACK per stream, so a reclaimed batch re-forms the
  // original per-stream id set and token. On insert failure that stream's entries stay
  // pending for a later XAUTOCLAIM; other streams still progress.
  // Returns the number of acked entries; rethrows the first insert error after all
  // streams have been attempted.
  private async processStreams(streams: RawStream[], source: "new" | "reclaimed"): Promise<number> {
    let processed = 0;
    let firstError: unknown;

    for (const [key, entries] of streams) {
      if (entries.length === 0) continue;
      const keyIds: string[] = [];
      const rows: TRow[] = [];
      for (const [id, flat] of entries) {
        keyIds.push(id);
        const mapped = this.mapEntry({ id, fields: parseFields(flat) });
        if (Array.isArray(mapped)) rows.push(...mapped);
        else if (mapped !== null) rows.push(mapped);
      }
      this.entriesCounter.add(keyIds.length, { source });

      if (rows.length > 0) {
        const startedAt = Date.now();
        try {
          await this.insert(rows, { dedupToken: dedupTokenFromEntryIds(keyIds, key) });
        } catch (error) {
          this.insertErrorCounter.add(1);
          firstError ??= error;
          continue;
        } finally {
          // Recorded for failed inserts too.
          this.insertDuration.record(Date.now() - startedAt);
        }
        this.rowsCounter.add(rows.length);
      }

      // Entries that mapped to no rows are acked as well (dropped by design).
      await this.redis.xack(key, this.def.consumerGroup, ...keyIds);
      processed += keyIds.length;
    }

    if (firstError !== undefined) throw firstError;
    return processed;
  }

  /** Per-shard depth (XLEN), group lag, and pending — the consumer-health signals. */
  async streamState(): Promise<ShardState[]> {
    return probeShardStates(this.probeRedis, this.keys, this.def.consumerGroup);
  }

  /** All shard stream keys this consumer reads (for diagnostics/tests). */
  streamKeys(): string[] {
    return this.keys.slice();
  }
}
/**
 * Per-shard depth/lag/pending for a metric stream — usable without a running consumer
 * (e.g. from an admin route). `redis` should have keyPrefix unset, matching the stream keys.
 *
 * Shards are independent, so they are probed concurrently; the result keeps the order
 * of `keys` (index == shard number).
 *
 * @param redis Client used for the XLEN / XINFO GROUPS probes.
 * @param keys Stream key of each shard, indexed by shard number.
 * @param consumerGroup Group whose lag and pending counts are reported.
 * @returns One `ShardState` per key. Rejects if an XLEN call fails.
 */
export async function probeShardStates(
  redis: Redis,
  keys: string[],
  consumerGroup: string
): Promise<ShardState[]> {
  return Promise.all(keys.map((key, shard) => probeShard(redis, key, shard, consumerGroup)));
}

/** Probes a single shard stream: XLEN for depth, XINFO GROUPS for lag and pending. */
async function probeShard(
  redis: Redis,
  key: string,
  shard: number,
  consumerGroup: string
): Promise<ShardState> {
  const depth = Number(await redis.xlen(key)) || 0;
  // lag defaults to null (unknown) and only becomes a number when the group is found and
  // Redis reports one: a nil lag (or a missing group on an existing stream) means we can't
  // compute it, e.g. entries were trimmed past the group's read position (data loss).
  let lag: number | null = null;
  let pending = 0;
  try {
    const groups = (await redis.call("XINFO", "GROUPS", key)) as unknown[];
    for (const raw of groups) {
      const info = flatToMap(raw as unknown[]);
      if (info.name !== consumerGroup) continue;
      const parsedLag = info.lag == null ? null : Number(info.lag);
      lag = parsedLag !== null && Number.isFinite(parsedLag) ? parsedLag : null;
      pending = Number(info.pending) || 0;
      break; // group names are unique within a stream
    }
  } catch {
    // Stream/group may not exist yet (XINFO fails): keep lag unknown (null) and pending 0.
  }
  return { shard, depth, lag, pending };
}

/** Converts a flat `[name, value, name, value, ...]` reply into an object keyed by name. */
function flatToMap(flat: unknown[]): Record<string, unknown> {
  const out: Record<string, unknown> = {};
  for (let i = 0; i + 1 < flat.length; i += 2) {
    out[String(flat[i])] = flat[i + 1];
  }
  return out;
}
*/ + ckOdometerTtlMs?: number; + logger?: Logger; + meter?: Meter; +}; + +type CumulativeCommand = ( + odometerKey: string, + streamKey: string, + ttlMs: string, + maxLen: string, + op: string, + q: string, + ...extraFields: string[] +) => Promise; + +type CumulativeCkCommand = ( + odometerKey: string, + ckOdometerKey: string, + streamKey: string, + ttlMs: string, + ckTtlMs: string, + maxLen: string, + op: string, + q: string, + ck: string, + ...extraFields: string[] +) => Promise; + +// INCR the odometer, refresh its TTL, and XADD the reading (new value as `cum`) in one round +// trip. Refresh-on-write is load-bearing: only genuinely idle queues expire. On first creation +// (v==1) XADD a cum=0 baseline first (smaller stream id => sorts first) so deltaSum captures the +// 0->1 transition and the total reconstructs exactly. +// ARGV: [1]=ttlMs [2]=maxLen [3]=op [4]=q [5..]=extra field/value pairs (e.g. wait). +const CUMULATIVE_LUA = ` +local v = redis.call('INCR', KEYS[1]) +redis.call('PEXPIRE', KEYS[1], ARGV[1]) +local maxlen = tonumber(ARGV[2]) or 0 +local function xadd(cum, withExtra) + local x = {'XADD', KEYS[2]} + if maxlen > 0 then x[#x+1]='MAXLEN'; x[#x+1]='~'; x[#x+1]=ARGV[2] end + x[#x+1]='*' + x[#x+1]='op'; x[#x+1]=ARGV[3] + x[#x+1]='q'; x[#x+1]=ARGV[4] + if withExtra then for i=5,#ARGV do x[#x+1]=ARGV[i] end end + x[#x+1]='cum'; x[#x+1]=cum + redis.call(unpack(x)) +end +if v == 1 then xadd(0, false) end +xadd(v, true) +`; + +// CK variant: advances base + per-key odometers, ONE reading entry carries both (cum + +// ck/ckcum), so per-key attribution adds no stream volume. Baselines seed independently: +// cum-only entry = base row, ck+ckcum-only entry = per-key row, reading entry = both. +// KEYS: [1]=baseOdometer [2]=ckOdometer [3]=stream. ARGV: [1]=baseTtlMs [2]=ckTtlMs +// [3]=maxLen [4]=op [5]=q [6]=ck [7..]=extra field/value pairs. 
/**
 * Node-side producer: XADDs events to a sharded metrics stream, gated on a flag.
 *
 * Every emit is fire-and-forget and loss-tolerant: it never throws into the caller,
 * and is dropped (not queued) while the metrics Redis connection is not ready.
 */
export class MetricsStreamEmitter {
  private readonly redis: Redis;
  private readonly def: MetricDefinition;
  private readonly flag: { enabled(): boolean };
  private readonly sampleRate: () => number;
  private readonly odometerTtlMs: number;
  private readonly ckOdometerTtlMs: number;
  private readonly logger: Logger;
  private readonly emittedCounter: Counter;
  private readonly errorCounter: Counter;

  /** Opens the Redis connection, registers the two Lua commands and the emit counters. */
  constructor(options: MetricsStreamEmitterOptions) {
    this.logger = options.logger ?? new Logger("MetricsStreamEmitter", "warn");
    this.redis = createRedisClient(
      { ...options.redis, keyPrefix: undefined },
      { onError: (error) => this.logger.error("emitter redis error", { error }) }
    );
    this.redis.defineCommand("qmEmitCumulative", { numberOfKeys: 2, lua: CUMULATIVE_LUA });
    this.redis.defineCommand("qmEmitCumulativeCk", { numberOfKeys: 3, lua: CUMULATIVE_CK_LUA });
    this.odometerTtlMs = options.counterOdometerTtlMs ?? 7 * 24 * 60 * 60 * 1000;
    this.ckOdometerTtlMs = options.ckOdometerTtlMs ?? 24 * 60 * 60 * 1000;
    this.def = options.definition;
    this.flag = options.flag;
    const rate = options.gaugeSampleRate;
    // `typeof null` is "object" too, so exclude it before treating `rate` as a provider.
    if (typeof rate === "object" && rate !== null) {
      // Live provider: read on every sampledSync() call, so it can change without a rebuild.
      this.sampleRate = () => rate.value();
    } else {
      const fixed = Math.min(1, Math.max(0, rate ?? 1));
      this.sampleRate = () => fixed;
    }

    const meter = options.meter ?? getMeter("metrics-pipeline");
    this.emittedCounter = meter.createCounter("queue_metrics.emitter.emitted", {
      description: "Node-side metric events XADDed to the stream",
      valueType: ValueType.INT,
    });
    this.errorCounter = meter.createCounter("queue_metrics.emitter.errors", {
      description: "Failed metric-event XADDs (dropped)",
      valueType: ValueType.INT,
    });
  }

  /** Whether emission is currently enabled (flag only; never sampled). */
  enabledSync(): boolean {
    return this.flag.enabled();
  }

  // Enabled AND (probabilistically) sampled-in. For high-frequency sampled emissions
  // (e.g. Lua gauges); exact-count events use enabledSync()/emit() and are never sampled.
  sampledSync(): boolean {
    if (!this.flag.enabled()) return false;
    const rate = this.sampleRate();
    if (rate >= 1) return true;
    if (rate <= 0) return false;
    return Math.random() < rate;
  }

  // Fire-and-forget gauge emit: a plain XADD of an op=gauge snapshot (no odometer). The
  // gauge value was read atomically inside the queue op's Lua and returned on the reply;
  // this just lands it on the metrics stream. Loss-tolerant (sampled), never throws into
  // the caller. Shares the counter stream (one stream family on the metrics Redis).
  emitGauge(shardKey: string, fields: MetricFields): void {
    if (!this.flag.enabled()) return;
    // Drop rather than queue while the metrics Redis is unreachable: ioredis would hold
    // every command in its offline queue until rejection, and metrics are loss-tolerant.
    if (this.redis.status !== "ready") return;
    const op = String(fields.op ?? "gauge");
    const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount));
    const args: string[] = [];
    if (this.def.maxLen) args.push("MAXLEN", "~", String(this.def.maxLen));
    args.push("*");
    for (const [field, value] of Object.entries(fields)) {
      // An absent value must not reach the stream as the literal "undefined"/"null".
      if (value == null) continue;
      args.push(field, String(value));
    }
    this.emittedCounter.add(1, { op });
    this.redis.xadd(stream, ...(args as [string, ...string[]])).catch((error) => {
      this.errorCounter.add(1);
      this.logger.debug("metrics gauge emit failed", { error, stream });
    });
  }

  // Fire-and-forget cumulative counter emit: advances the per-(queue,op) odometer and
  // XADDs its new absolute value. No-op when disabled, never throws into the caller. A
  // lost XADD self-heals (the next reading restates the total); the INCR is never sampled.
  // A non-empty `fields.ck` also advances a per-concurrency-key odometer and rides the
  // same entry as ck/ckcum (see CUMULATIVE_CK_LUA for the baseline/row mapping).
  emit(shardKey: string, fields: MetricFields): void {
    if (!this.flag.enabled()) return;
    if (this.redis.status !== "ready") return;
    const op = String(fields.op ?? "unknown");
    const q = String(fields.q ?? "");
    const ck = fields.ck != null && String(fields.ck) !== "" ? String(fields.ck) : null;
    const shard = shardFor(shardKey, this.def.shardCount);
    const stream = streamKey(this.def, shard);
    // The odometer carries the stream's {shard} hash tag so INCR + XADD stay in one
    // Cluster slot (the shard is derived from the queue, so the mapping is stable).
    // The key format is part of the rolling-deploy data shape: concurrent old/new
    // emitters with different formats split an odometer and corrupt its deltas.
    const odometerKey = `${this.def.name}_cum:{${shard}}:${op}:${q}`;
    const extra: string[] = [];
    for (const [field, value] of Object.entries(fields)) {
      // op/q/ck travel as dedicated script arguments, not as extra pairs.
      if (field === "op" || field === "q" || field === "ck") continue;
      // An absent value must not reach the stream as the literal "undefined"/"null".
      if (value == null) continue;
      extra.push(field, String(value));
    }
    this.emittedCounter.add(1, { op });
    const maxLen = String(this.def.maxLen ?? 0);
    const onError = (error: unknown) => {
      this.errorCounter.add(1);
      this.logger.debug("metrics emit failed", { error, stream });
    };
    if (ck) {
      const client = this.redis as unknown as { qmEmitCumulativeCk: CumulativeCkCommand };
      client
        .qmEmitCumulativeCk(
          odometerKey,
          `${odometerKey}:ck:${ck}`,
          stream,
          String(this.odometerTtlMs),
          String(this.ckOdometerTtlMs),
          maxLen,
          op,
          q,
          ck,
          ...extra
        )
        .catch(onError);
      return;
    }
    const client = this.redis as unknown as { qmEmitCumulative: CumulativeCommand };
    client
      .qmEmitCumulative(odometerKey, stream, String(this.odometerTtlMs), maxLen, op, q, ...extra)
      .catch(onError);
  }

  // Resolves once the metrics Redis connection is ready (emits before that are dropped).
  waitUntilReady(): Promise<void> {
    if (this.redis.status === "ready") return Promise.resolve();
    return new Promise<void>((resolve) => this.redis.once("ready", () => resolve()));
  }

  /** Closes the metrics Redis connection. */
  async close(): Promise<void> {
    await this.redis.quit();
  }
}
/**
 * FNV-1a 32-bit hash over the string's UTF-16 code units.
 *
 * Deterministic across processes; used only for sharding. The exact bit pattern is
 * load-bearing: changing it would remap every key to a different shard.
 *
 * @param str - Key to hash.
 * @returns Unsigned 32-bit hash in [0, 2^32).
 */
export function fnv1a32(str: string): number {
  let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
  for (let i = 0; i < str.length; i++) {
    hash ^= str.charCodeAt(i);
    // Math.imul keeps the multiply in 32-bit integer space (FNV prime 0x01000193).
    hash = Math.imul(hash, 0x01000193);
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return hash >>> 0;
}

/**
 * Deterministic shard index in [0, shardCount) for a key.
 *
 * A non-finite shard count (NaN, Infinity) is treated as a single shard and a
 * fractional count is floored, so the result is always a valid integer index.
 * Previously those inputs produced NaN, a fractional index, or the raw hash.
 *
 * @param key - Value to shard on.
 * @param shardCount - Number of shards; anything <= 1 (or non-finite) maps to shard 0.
 * @returns Integer shard index.
 */
export function shardFor(key: string, shardCount: number): number {
  const shards = Number.isFinite(shardCount) ? Math.floor(shardCount) : 1;
  if (shards <= 1) return 0;
  return fnv1a32(key) % shards;
}
// Every field below is a Lua expression that is spliced verbatim into the target script.
// queueLimit/envLimit must be the EFFECTIVE enforced limit, otherwise an unset limit
// reads as throttled.
export type GaugeComputeLuaParams = {
  // Lua boolean expression; the gauge (and its extra reads) is only computed when true.
  enabledArg: string;
  queued: string;
  running: string;
  queueLimit: string;
  envQueued: string;
  envRunning: string;
  envLimit: string;
  // Lua statements executed first inside the pcall (e.g. to compute aggregate locals).
  preamble?: string;
  // Lua boolean expression over __cc/__lim/__ql deciding the throttled flag. Pass "false"
  // where cc >= lim is not a valid throttle signal (e.g. summed CK aggregates).
  throttledExpr?: string;
  // CK-health extras (both or neither): emitted as an optional tail, gauge[8]/gauge[9].
  ckBacklogged?: string;
  ckMaxWaitMs?: string;
};

/**
 * Builds a Lua fragment that computes an op=gauge snapshot into the enclosing script's
 * `__qm_g` local as a flat {ql, cc, lim, eql, ec, elim, thr} array, optionally followed
 * by {ckq, ckw} when BOTH CK params are supplied.
 *
 * The fragment performs no Redis write; it only assigns `__qm_g` so the script can
 * return it. It is gated on `enabledArg` and wrapped in `pcall`. The enclosing script
 * MUST declare `local __qm_g` before the fragment.
 *
 * @param params - Lua expressions to splice into the fragment.
 * @returns The Lua source fragment.
 */
export function createMetricsGaugeComputeLua(params: GaugeComputeLuaParams): string {
  const { ckBacklogged, ckMaxWaitMs } = params;
  const throttledExpr = params.throttledExpr ?? "__cc >= __lim and __ql > 0";
  const preamble = params.preamble ?? "";

  // The CK tail is all-or-nothing: a lone ckBacklogged or ckMaxWaitMs is ignored.
  let gaugeAssignment: string;
  if (ckBacklogged != null && ckMaxWaitMs != null) {
    gaugeAssignment = `    local __ckq = tonumber(${ckBacklogged}) or 0
    local __ckw = tonumber(${ckMaxWaitMs}) or 0
    __qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr, __ckq, __ckw}`;
  } else {
    gaugeAssignment = `    __qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}`;
  }

  return `
if ${params.enabledArg} then
  pcall(function()
    ${preamble}
    local __ql = tonumber(${params.queued}) or 0
    local __cc = tonumber(${params.running}) or 0
    local __lim = tonumber(${params.queueLimit}) or 0
    local __eql = tonumber(${params.envQueued}) or 0
    local __ec = tonumber(${params.envRunning}) or 0
    local __elim = tonumber(${params.envLimit}) or 0
    local __thr = 0
    if ${throttledExpr} then __thr = 1 end
${gaugeAssignment}
  end)
end`;
}
keys", () => { + it("names and parses entry time", () => { + expect(streamKey({ name: "queue_metrics" }, 3)).toBe("queue_metrics:{3}"); + expect(allStreamKeys({ name: "qm", shardCount: 2, consumerGroup: "cg" })).toEqual([ + "qm:{0}", + "qm:{1}", + ]); + expect(entryTimeMs("1717000000000-5")).toBe(1717000000000); + expect(entryTimeMs("nope")).toBeNull(); + }); + + it("entryOrderKey stays exact and strictly monotonic at real epoch magnitudes", () => { + const ms = 1783000000000; // ~2026: ms*1e6 is past JS safe-integer range, so a number key + const k = (seq: number) => BigInt(entryOrderKey(`${ms}-${seq}`)); + // adjacent seq within one ms must not collapse to the same key (the float bug) + expect(k(0)).toBe(BigInt(ms) * 1000000n); + expect(k(1) - k(0)).toBe(1n); + expect(k(2) - k(1)).toBe(1n); + // a later ms always outranks any seq of an earlier ms (up to the 1M/ms factor) + expect(BigInt(entryOrderKey(`${ms + 1}-0`))).toBeGreaterThan(k(999999)); + }); +}); + +describe("createMetricsGaugeComputeLua", () => { + it("assigns __qm_g inside a gated, pcall-wrapped block and never XADDs", () => { + const lua = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', KEYS[2])", + running: "queueCurrent", + queueLimit: "queueLimit", + envQueued: "redis.call('ZCARD', KEYS[8])", + envRunning: "envCurrent", + envLimit: "envLimit", + }); + + expect(lua).toContain("if ARGV[#ARGV] == '1' then"); + expect(lua).toContain("pcall(function()"); + expect(lua).toContain("__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}"); + expect(lua).toContain("if __cc >= __lim and __ql > 0 then __thr = 1 end"); + // The whole point of the refactor: no Redis write happens in the run-queue script. 
+ expect(lua).not.toContain("XADD"); + }); + + it("honors a custom throttled expression and preamble", () => { + const lua = createMetricsGaugeComputeLua({ + enabledArg: "true", + preamble: "local agg = 1", + queued: "0", + running: "0", + queueLimit: "0", + envQueued: "0", + envRunning: "0", + envLimit: "0", + throttledExpr: "false", + }); + expect(lua).toContain("local agg = 1"); + expect(lua).toContain("if false then __thr = 1 end"); + expect(lua).not.toContain("XADD"); + }); + + it("appends the CK-health tail only when both CK params are set", () => { + const withCk = createMetricsGaugeComputeLua({ + enabledArg: "true", + queued: "0", + running: "0", + queueLimit: "0", + envQueued: "0", + envRunning: "0", + envLimit: "0", + ckBacklogged: "redis.call('ZCARD', ckIndexKey)", + ckMaxWaitMs: "__ckwait", + }); + expect(withCk).toContain( + "__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr, __ckq, __ckw}" + ); + expect(withCk).toContain("local __ckq = tonumber(redis.call('ZCARD', ckIndexKey)) or 0"); + + const withoutCk = createMetricsGaugeComputeLua({ + enabledArg: "true", + queued: "0", + running: "0", + queueLimit: "0", + envQueued: "0", + envRunning: "0", + envLimit: "0", + ckBacklogged: "0", + }); + expect(withoutCk).toContain("__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}"); + expect(withoutCk).not.toContain("__ckq"); + }); +}); diff --git a/internal-packages/metrics-pipeline/src/types.ts b/internal-packages/metrics-pipeline/src/types.ts new file mode 100644 index 00000000000..d9e9e43f554 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/types.ts @@ -0,0 +1,42 @@ +export type MetricFields = Record; + +export type StreamEntry = { + id: string; + fields: Record; +}; + +export type MetricDefinition = { + /** Logical name, e.g. "queue_metrics". Used as the stream key prefix. */ + name: string; + shardCount: number; + consumerGroup: string; + /** Approximate MAXLEN cap applied on XADD (`MAXLEN ~ N`). Omit for unbounded. 
// A stream id part (`<ms>` or `<seq>`) is a run of decimal digits and nothing else.
const STREAM_ID_DIGITS = /^\d+$/;

// Parses one stream-id part as an exact bigint; a missing or malformed part counts as 0.
function streamIdPartToBigInt(part: string | undefined): bigint {
  return part !== undefined && STREAM_ID_DIGITS.test(part) ? BigInt(part) : 0n;
}

/**
 * Emission time of a stream entry: the `<ms>` part of its `<ms>-<seq>` id.
 *
 * @param id - Stream entry id.
 * @returns Milliseconds since epoch, or null when the ms part is missing or not a
 *   plain decimal number. An empty ms part (e.g. id `""`) is rejected rather than
 *   read as epoch 0.
 */
export function entryTimeMs(id: string): number | null {
  const msPart = id.split("-")[0];
  if (msPart === undefined || !STREAM_ID_DIGITS.test(msPart)) return null;
  const ms = Number(msPart);
  return Number.isFinite(ms) ? ms : null;
}

/**
 * Ordering key from a stream id (`<ms>-<seq>`) = ms*1e6 + seq, for deltaSumTimestamp.
 *
 * Computed in BigInt and returned as a decimal string because ms*1e6 exceeds the JS
 * safe-integer range at real epoch magnitudes (a number would collapse nearby seq
 * values); the ClickHouse order_key column is UInt64 and takes the string.
 *
 * Both parts are parsed straight from their digits, so a seq beyond 2^53 stays exact
 * and a malformed part (fractional, non-numeric, missing) contributes 0 instead of
 * making BigInt() throw.
 *
 * @param id - Stream entry id.
 * @returns The ordering key as a decimal string.
 */
export function entryOrderKey(id: string): string {
  const [msPart, seqPart] = id.split("-");
  return (streamIdPartToBigInt(msPart) * 1000000n + streamIdPartToBigInt(seqPart)).toString();
}
/dev/null +++ b/internal-packages/metrics-pipeline/tsconfig.src.json @@ -0,0 +1,20 @@ +{ + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "src/**/*.test.ts"], + "compilerOptions": { + "composite": true, + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "strict": true, + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/internal-packages/metrics-pipeline/tsconfig.test.json b/internal-packages/metrics-pipeline/tsconfig.test.json new file mode 100644 index 00000000000..4c06c9f57bb --- /dev/null +++ b/internal-packages/metrics-pipeline/tsconfig.test.json @@ -0,0 +1,21 @@ +{ + "include": ["src/**/*.test.ts"], + "references": [{ "path": "./tsconfig.src.json" }], + "compilerOptions": { + "composite": true, + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "types": ["vitest/globals"], + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "strict": true, + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/internal-packages/metrics-pipeline/vitest.config.ts b/internal-packages/metrics-pipeline/vitest.config.ts new file mode 100644 index 00000000000..daafd294fa8 --- /dev/null +++ b/internal-packages/metrics-pipeline/vitest.config.ts @@ -0,0 +1,17 @@ +import { defineConfig } from "vitest/config"; +import { DurationShardingSequencer } from "@internal/testcontainers/sequencer"; + +export default defineConfig({ + test: { + sequence: { sequencer: DurationShardingSequencer }, + 
globals: true, + retry: process.env.CI ? 2 : 0, + environment: "node", + setupFiles: ["./test/setup.ts"], + testTimeout: 30000, + hookTimeout: 30000, + }, + esbuild: { + target: "node18", + }, +}); diff --git a/internal-packages/run-engine/package.json b/internal-packages/run-engine/package.json index 8d53974d10b..516e6a18696 100644 --- a/internal-packages/run-engine/package.json +++ b/internal-packages/run-engine/package.json @@ -21,6 +21,7 @@ }, "dependencies": { "@internal/redis": "workspace:*", + "@internal/metrics-pipeline": "workspace:*", "@internal/run-store": "workspace:*", "@trigger.dev/redis-worker": "workspace:*", "@internal/tracing": "workspace:*", diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index f3091c93b88..c55184e594f 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -218,6 +218,7 @@ export class RunEngine { callback: this.#concurrencySweeperCallback.bind(this), }, shardCount: options.queue?.shardCount, + queueMetrics: options.queue?.queueMetrics, masterQueueConsumersDisabled: options.queue?.masterQueueConsumersDisabled, masterQueueConsumersIntervalMs: options.queue?.masterQueueConsumersIntervalMs, processWorkerQueueDebounceMs: options.queue?.processWorkerQueueDebounceMs, @@ -1628,6 +1629,14 @@ export class RunEngine { return this.runQueue.currentConcurrencyOfQueues(environment, queues); } + async concurrencyKeyBreakdown( + environment: MinimalAuthenticatedEnvironment, + queue: string, + options?: { limit?: number } + ) { + return this.runQueue.concurrencyKeyBreakdown(environment, queue, options); + } + async removeEnvironmentQueuesFromMasterQueue({ runtimeEnvironmentId, organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts index dc9d029c38c..4b236aefc16 100644 --- 
a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts @@ -98,10 +98,16 @@ export class EnqueueSystem { // Force development runs to use the environment id as the worker queue. const workerQueue = env.type === "DEVELOPMENT" ? env.id : run.workerQueue; - const timestamp = (run.queueTimestamp ?? run.createdAt).getTime() - run.priorityMs; + // Ordering keeps the run's original position; the scheduling-delay anchor is the + // trigger/delay time only on first enqueue (includeTtl). Re-enqueues anchor to now, + // else the wait metric absorbs the whole waitpoint/checkpoint duration. + const queuePositionMs = (run.queueTimestamp ?? run.createdAt).getTime(); + const timestamp = queuePositionMs - run.priorityMs; + const eligibleAtMs = includeTtl ? queuePositionMs : Date.now(); - // Include TTL only when explicitly requested (first enqueue from trigger). - // Re-enqueues (waitpoint, checkpoint, delayed, pending version) must not add TTL. + // Include TTL only when explicitly requested (first enqueue from trigger or the + // delayed-run system). Re-enqueues (waitpoint, checkpoint, pending version) must + // not add TTL. let ttlExpiresAt: number | undefined; if (includeTtl && run.ttl) { const expireAt = parseNaturalLanguageDuration(run.ttl); @@ -124,6 +130,7 @@ export class EnqueueSystem { queue: run.queue, concurrencyKey: run.concurrencyKey ?? 
undefined, timestamp, + eligibleAtMs, attempt: 0, ttlExpiresAt, }, diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts index 949e47f8574..e33b361abdb 100644 --- a/internal-packages/run-engine/src/engine/tests/ttl.test.ts +++ b/internal-packages/run-engine/src/engine/tests/ttl.test.ts @@ -293,7 +293,12 @@ describe("RunEngine ttl", () => { ); assertNonNullable(messageAfterTrigger); expect(messageAfterTrigger.ttlExpiresAt).toBeDefined(); + // First enqueue anchors the scheduling-delay clock at the trigger time. + expect(messageAfterTrigger.eligibleAtMs).toBe( + (run.queueTimestamp ?? run.createdAt).getTime() + ); + const beforeReenqueue = Date.now(); await engine.enqueueSystem.enqueueRun({ run, env: authenticatedEnvironment, @@ -308,6 +313,10 @@ describe("RunEngine ttl", () => { ); assertNonNullable(messageAfterReenqueue); expect(messageAfterReenqueue.ttlExpiresAt).toBeUndefined(); + // Re-enqueues anchor to now so the wait metric measures only this queue stint, + // while the ordering timestamp keeps the run's original position. 
+ expect(messageAfterReenqueue.eligibleAtMs).toBeGreaterThanOrEqual(beforeReenqueue); + expect(messageAfterReenqueue.timestamp).toBe(messageAfterTrigger.timestamp); } finally { await engine.quit(); } diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index bb1d6eb2fa9..f37ec7df50a 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -16,6 +16,7 @@ import { } from "@trigger.dev/redis-worker"; import type { ControlPlaneResolver } from "./controlPlaneResolver.js"; import type { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js"; +import type { RunQueueMetricsEmitter } from "../run-queue/index.js"; import type { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import type { LockRetryConfig } from "./locking.js"; import type { workerCatalog } from "./workerCatalog.js"; @@ -90,6 +91,8 @@ export type RunEngineOptions = { defaultEnvConcurrency?: number; defaultEnvConcurrencyBurstFactor?: number; logLevel?: LogLevel; + /** Optional queue-metrics emitter; enables gauge + counter emission from the RunQueue. 
*/ + queueMetrics?: RunQueueMetricsEmitter; queueSelectionStrategyOptions?: Pick< FairQueueSelectionStrategyOptions, "parentQueueLimit" | "tracer" | "biases" | "reuseSnapshotCount" | "maximumEnvCount" diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts index a0571206538..4e6ca89d847 100644 --- a/internal-packages/run-engine/src/run-queue/index.ts +++ b/internal-packages/run-engine/src/run-queue/index.ts @@ -5,6 +5,7 @@ import { type RedisOptions, type Result, } from "@internal/redis"; +import { createMetricsGaugeComputeLua } from "@internal/metrics-pipeline"; import type { Attributes, Meter, @@ -57,6 +58,99 @@ const SemanticAttributes = { ORG_ID: "runqueue.orgId", }; +// Prelude spliced at the top of every gauge-carrying script: declares the gauge slot and +// the return wrapper. A splice fills __qm_g; every return goes through __qmret so the reply +// is always {original, gauge}. A nil original becomes false, else Lua drops it from the +// multi-bulk reply (which would swallow the gauge on the dequeue throttle paths). +const QUEUE_METRICS_GAUGE_PRELUDE = ` +local __qm_g = false +local function __qmret(r) if r == nil then r = false end return {r, __qm_g} end`; + +// Fresh-read gauge for splice points with no reusable locals: enqueue slow-path (before +// return 0) and the base dequeue top. Gated on the last ARGV so it is inert unless the +// caller opts in. CK queues emit per-subqueue depth (queue_name aggregates via the MV). 
+const QUEUE_METRICS_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "redis.call('SCARD', queueCurrentConcurrencyKey)", + queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)", + envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", +}); + +// Enqueue fast-path gauge: the admission check already computed queueCurrent/envCurrent/ +// queueLimit/envLimit, so reuse them (only 2 ZCARDs stay fresh). Fast path was taken, so +// cc < lim and thr is always 0 — reusing the effective queueLimit is fine (max() recovers raw). +const QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "queueCurrent", + queueLimit: "queueLimit", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "envCurrent", + envLimit: "envLimit", +}); + +// CK-health extras: distinct backlogged keys + most-starved head-of-line wait (ckIndex scores +// are per-subqueue oldest timestamps). Needs ckIndexKey/currentTime locals; clamps future scores. +const QUEUE_METRICS_CK_GAUGE_EXTRAS = { + preamble: `local __ckhead = redis.call('ZRANGE', ckIndexKey, 0, 0, 'WITHSCORES') + local __ckwait = 0 + if #__ckhead > 0 then __ckwait = math.floor(math.max(0, (tonumber(currentTime) or 0) - (tonumber(__ckhead[2]) or 0))) end`, + ckBacklogged: "redis.call('ZCARD', ckIndexKey)", + ckMaxWaitMs: "__ckwait", +}; + +// CK enqueue variants of the two gauges above, extended with the CK-health tail. 
+const QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "redis.call('SCARD', queueCurrentConcurrencyKey)", + queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)", + envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", + ...QUEUE_METRICS_CK_GAUGE_EXTRAS, +}); + +const QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "queueCurrent", + queueLimit: "queueLimit", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "envCurrent", + envLimit: "envLimit", + ...QUEUE_METRICS_CK_GAUGE_EXTRAS, +}); + +// CK dequeue: depth/running from the per-base-queue aggregate counters the run-queue already +// maintains (two O(1) GETs, not a per-variant scan). thr suppressed — an aggregate cc >= per-CK +// limit would over-report; per-CK throttle is caught by the per-subqueue enqueue gauges. +const QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('GET', lengthCounterKey) or '0'", + running: "redis.call('GET', runningCounterKey) or '0'", + queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)", + envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", + throttledExpr: "false", + ...QUEUE_METRICS_CK_GAUGE_EXTRAS, +}); + +/** Injected queue-metrics stream emitter; all calls are no-ops when metrics are disabled. */ +export interface RunQueueMetricsEmitter { + enabledSync(): boolean; + /** enabled AND sampled-in; gates high-frequency sampled emissions (the Lua gauge). 
/**
 * Live per-concurrency-key breakdown of a queue's backlog, most-starved first.
 *
 * Reads the ckIndex zset (members = CK subqueue names, scores = oldest-message
 * timestamps), so only keys with queued work appear; running-only keys do not.
 *
 * @param env - Environment owning the queue.
 * @param queue - Queue name.
 * @param options - `limit` caps how many keys are returned (default 50). It is floored
 *   to an integer; zero or a negative value returns no keys (the total is still
 *   reported); a non-finite value falls back to the default.
 * @returns `totalBackloggedKeys` (cardinality of the index) and up to `limit` per-key
 *   rows with queued/running counts and the oldest enqueue timestamp.
 */
public async concurrencyKeyBreakdown(
  env: MinimalAuthenticatedEnvironment,
  queue: string,
  options?: { limit?: number }
): Promise<{
  totalBackloggedKeys: number;
  keys: Array<{
    concurrencyKey: string;
    queued: number;
    running: number;
    oldestEnqueuedAt: number;
  }>;
}> {
  const requestedLimit = options?.limit ?? 50;
  const limit = Number.isFinite(requestedLimit) ? Math.max(0, Math.floor(requestedLimit)) : 50;
  const ckIndexKey = this.keys.ckIndexKeyFromQueue(this.keys.queueKey(env, queue));

  const indexPipeline = this.redis.pipeline();
  indexPipeline.zcard(ckIndexKey);
  // ZRANGE stop is inclusive and negative stops count from the end, so `limit - 1`
  // with limit <= 0 would select (nearly) the whole index. Skip the read instead.
  if (limit > 0) {
    indexPipeline.zrange(ckIndexKey, 0, limit - 1, "WITHSCORES");
  }
  const indexResults = await indexPipeline.exec();
  if (!indexResults) return { totalBackloggedKeys: 0, keys: [] };

  const [totalErr, totalVal] = indexResults[0];
  const totalBackloggedKeys = totalErr || totalVal == null ? 0 : (totalVal as number);

  const rangeResult = limit > 0 ? indexResults[1] : undefined;
  const flat =
    !rangeResult || rangeResult[0] || rangeResult[1] == null ? [] : (rangeResult[1] as string[]);

  // WITHSCORES replies are a flat [member, score, member, score, ...] list.
  const members: Array<{ member: string; score: number }> = [];
  for (let i = 0; i < flat.length; i += 2) {
    members.push({ member: flat[i], score: Number(flat[i + 1]) });
  }
  if (members.length === 0) return { totalBackloggedKeys, keys: [] };

  // Two commands per member, in order: queued depth (ZCARD) then running set size (SCARD).
  const statsPipeline = this.redis.pipeline();
  for (const { member } of members) {
    statsPipeline.zcard(member);
    statsPipeline.scard(this.keys.queueCurrentConcurrencyKeyFromQueue(member));
  }
  const stats = await statsPipeline.exec();

  const keys = members.map(({ member, score }, i) => {
    const queuedResult = stats?.[i * 2];
    const runningResult = stats?.[i * 2 + 1];
    return {
      concurrencyKey: this.#concurrencyKeyFromQueue(member) ?? "",
      // A failed or missing per-key read degrades to 0 rather than failing the breakdown.
      queued: queuedResult && !queuedResult[0] ? ((queuedResult[1] as number) ?? 0) : 0,
      running: runningResult && !runningResult[0] ? ((runningResult[1] as number) ?? 0) : 0,
      oldestEnqueuedAt: score,
    };
  });

  return { totalBackloggedKeys, keys };
}
options?.removeFromWorkerQueue, }); + + this.#emitQueueMetric(message.queue, { op: "ack", q: message.queue }); }, { kind: SpanKind.CONSUMER, @@ -934,6 +1102,7 @@ export class RunQueue { message.attempt = message.attempt + 1; if (message.attempt >= maxAttempts) { await this.#callMoveToDeadLetterQueue({ message }); + this.#emitQueueMetric(message.queue, { op: "dlq", q: message.queue }); return false; } } @@ -960,6 +1129,8 @@ export class RunQueue { await this.#callNackMessage({ message, retryAt }); + this.#emitQueueMetric(message.queue, { op: "nack", q: message.queue }); + return true; }, { @@ -1831,6 +2002,57 @@ export class RunQueue { * * @returns true if the fast path was taken (message pushed directly to worker queue) */ + #queueMetricsGaugeArg(): string { + // Gauge gate ARGV: enabled AND sampled-in (sampling applies to the gauge, not counters). + return this.options.queueMetrics?.sampledSync() ? "1" : "0"; + } + + // Gauge returned on a script reply as a flat [ql, cc, lim, eql, ec, elim, thr] array, + // plus an optional [ckq, ckw] tail on CK-path scripts. + // Unlike counters, gauges are NOT base-normalized: the q label keeps its :ck: suffix so + // the CK-aggregate and per-subqueue readings stay distinguishable; the consumer's mapEntry + // strips :ck: to the base queue_name and the MV maxes them into one row. + #emitGauge(queue: string, gauge: number[]): void { + if (!Array.isArray(gauge) || gauge.length < 7) return; + const [ql, cc, lim, eql, ec, elim, thr, ckq, ckw] = gauge; + const fields: Record = { + op: "gauge", + q: queue, + ql, + cc, + lim, + eql, + ec, + elim, + thr, + }; + if (gauge.length >= 9) { + fields.ckq = ckq; + fields.ckw = ckw; + } + this.options.queueMetrics?.emitGauge(queue, fields); + } + + #concurrencyKeyFromQueue(queue: string): string | undefined { + const idx = queue.indexOf(":ck:"); + return idx === -1 || idx + 4 >= queue.length ? 
undefined : queue.slice(idx + 4); + } + + #emitQueueMetric(shardKey: string, fields: Record): void { + // Counters roll up per BASE queue: normalize the CK-qualified queue to its base so all + // concurrency keys share one monotonic odometer (and one shard/order key), matching the + // base queue_name the consumer buckets on. A real concurrency key rides along as `ck`, + // driving a SEPARATE per-key odometer on the same entry (per-key history tier). + const baseQueue = this.keys.baseQueueKeyFromQueue(shardKey); + let baseFields = fields; + if (typeof fields.q === "string") { + baseFields = { ...fields, q: this.keys.baseQueueKeyFromQueue(fields.q) }; + const ck = this.#concurrencyKeyFromQueue(fields.q); + if (ck && ck !== "*") baseFields.ck = ck; + } + this.options.queueMetrics?.emit(baseQueue, baseFields); + } + async #callEnqueueMessage( message: OutputPayloadV2, ttlInfo?: { @@ -1869,6 +2091,7 @@ export class RunQueue { const messageScore = String(message.timestamp); const currentTime = String(Date.now()); const enableFastPathArg = enableFastPath ? "1" : "0"; + const metricsGaugeArg = this.#queueMetricsGaugeArg(); const defaultEnvConcurrencyLimit = String(this.options.defaultEnvConcurrency); const defaultEnvConcurrencyBurstFactor = String( this.options.defaultEnvConcurrencyBurstFactor ?? 1.0 @@ -1892,7 +2115,8 @@ export class RunQueue { service: this.name, }); - let result: number; + // Every gauge-carrying script returns a 2-tuple [originalReturn, gauge|null]. 
+ let result: [number, number[] | null]; // Use CK-aware enqueue for messages with concurrency keys if (message.concurrencyKey) { @@ -1935,7 +2159,8 @@ export class RunQueue { currentTime, enableFastPathArg, ckKeyPrefix, - String(this.counterTtlSeconds) + String(this.counterTtlSeconds), + metricsGaugeArg ); } else { result = await this.redis.enqueueMessageCkTracked( @@ -1967,7 +2192,8 @@ export class RunQueue { currentTime, enableFastPathArg, ckKeyPrefix, - String(this.counterTtlSeconds) + String(this.counterTtlSeconds), + metricsGaugeArg ); } } else if (ttlInfo) { @@ -1998,7 +2224,8 @@ export class RunQueue { defaultEnvConcurrencyLimit, defaultEnvConcurrencyBurstFactor, currentTime, - enableFastPathArg + enableFastPathArg, + metricsGaugeArg ); } else { result = await this.redis.enqueueMessage( @@ -2024,11 +2251,14 @@ export class RunQueue { defaultEnvConcurrencyLimit, defaultEnvConcurrencyBurstFactor, currentTime, - enableFastPathArg + enableFastPathArg, + metricsGaugeArg ); } - return result === 1; + const [enqueueResult, gauge] = result; + if (gauge) this.#emitGauge(queueName, gauge); + return enqueueResult === 1; } async #callDequeueMessagesFromQueue({ @@ -2081,7 +2311,9 @@ export class RunQueue { maxCount, }); - const result = await this.redis.dequeueMessagesFromQueue( + const metricsGaugeArg = this.#queueMetricsGaugeArg(); + + const reply = await this.redis.dequeueMessagesFromQueue( //keys messageQueue, queueConcurrencyLimitKey, @@ -2099,9 +2331,16 @@ export class RunQueue { String(this.options.defaultEnvConcurrency), String(this.options.defaultEnvConcurrencyBurstFactor ?? 1), this.options.redis.keyPrefix ?? "", - String(maxCount) + String(maxCount), + metricsGaugeArg ); + // Reply is [flatMessages|null, gauge|null]: emit the gauge (read atomically inside + // the script, present on the throttle/empty paths too) and keep element 0 as the array. + const gauge = reply?.[1] ?? 
null; + if (gauge) this.#emitGauge(messageQueue, gauge); + const result = reply?.[0] ?? null; + if (!result) { span.setAttribute("message_count", 0); @@ -2202,8 +2441,11 @@ export class RunQueue { }); const lengthCounterKey = this.keys.queueLengthCounterKeyFromQueue(ckWildcardQueue); + const runningCounterKey = this.keys.queueRunningCounterKeyFromQueue(ckWildcardQueue); + + const metricsGaugeArg = this.#queueMetricsGaugeArg(); - const result = await this.redis.dequeueMessagesFromCkQueueTracked( + const reply = await this.redis.dequeueMessagesFromCkQueueTracked( //keys ckIndexKey, queueConcurrencyLimitKey, @@ -2215,15 +2457,22 @@ export class RunQueue { masterQueueKey, ttlQueueKey, lengthCounterKey, + runningCounterKey, //args ckWildcardQueue, String(Date.now()), String(this.options.defaultEnvConcurrency), String(this.options.defaultEnvConcurrencyBurstFactor ?? 1), this.options.redis.keyPrefix ?? "", - String(maxCount) + String(maxCount), + metricsGaugeArg ); + // Reply is [flatMessages|null, gauge|null]; the CK aggregate gauge rides here. + const gauge = reply?.[1] ?? null; + if (gauge) this.#emitGauge(ckWildcardQueue, gauge); + const result = reply?.[0] ?? 
null; + if (!result) { span.setAttribute("message_count", 0); return []; @@ -3062,6 +3311,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[7] local currentTime = ARGV[8] local enableFastPath = ARGV[9] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3083,7 +3334,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3113,8 +3365,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3153,6 +3406,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[9] local currentTime = ARGV[10] local enableFastPath = ARGV[11] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3174,8 +3429,9 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} -- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently - return 1 + return __qmret(1) end end end @@ -3208,8 +3464,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) 
redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3246,6 +3503,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[8] local currentTime = ARGV[9] local enableFastPath = ARGV[10] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3268,7 +3527,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3304,8 +3564,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3344,6 +3605,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[10] local currentTime = ARGV[11] local enableFastPath = ARGV[12] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3365,8 +3628,9 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA} -- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently - return 1 + return __qmret(1) end end end @@ -3405,8 +3669,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, 
messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3455,6 +3720,8 @@ local keyPrefix = ARGV[11] -- TTL (seconds) applied to counter lazy-init SETs local counterTtl = ARGV[12] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3476,10 +3743,11 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA} -- Fast-path skips the CK variant zset entirely; lengthCounter is unchanged. -- runningCounter is bumped later by dequeueMessageFromKeyTracked when the -- worker pulls the message from the worker queue. 
- return 1 + return __qmret(1) end end end @@ -3531,8 +3799,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3576,6 +3845,8 @@ local keyPrefix = ARGV[13] -- TTL (seconds) applied to counter lazy-init SETs local counterTtl = ARGV[14] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3597,7 +3868,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_CK_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3645,8 +3917,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_CK_ENQUEUE_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3891,6 +4164,8 @@ local defaultEnvConcurrencyLimit = ARGV[3] local defaultEnvConcurrencyBurstFactor = ARGV[4] local keyPrefix = ARGV[5] local maxCount = tonumber(ARGV[6] or '1') +${QUEUE_METRICS_GAUGE_PRELUDE} +${QUEUE_METRICS_GAUGE_LUA} -- Check current env concurrency against the limit local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0') @@ -3899,7 +4174,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor) if envCurrentConcurrency >= 
envConcurrencyLimitWithBurstFactor then - return nil + return __qmret(nil) end -- Check current queue concurrency against the limit @@ -3909,7 +4184,7 @@ local totalQueueConcurrencyLimit = queueConcurrencyLimit -- Check condition only if concurrencyLimit exists if queueCurrentConcurrency >= totalQueueConcurrencyLimit then - return nil + return __qmret(nil) end -- Calculate how many messages we can actually dequeue based on concurrency limits @@ -3918,14 +4193,14 @@ local queueAvailableCapacity = totalQueueConcurrencyLimit - queueCurrentConcurre local actualMaxCount = math.min(maxCount, envAvailableCapacity, queueAvailableCapacity) if actualMaxCount <= 0 then - return nil + return __qmret(nil) end -- Attempt to dequeue messages up to actualMaxCount local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, actualMaxCount) if #messages == 0 then - return nil + return __qmret(nil) end local results = {} @@ -3991,7 +4266,7 @@ else end -- Return results as a flat array: [messageId1, messageScore1, messagePayload1, messageId2, messageScore2, messagePayload2, ...] -return results +return __qmret(results) `, }); @@ -4145,7 +4420,7 @@ return results // (normal dequeue, TTL-expired, or stale-orphan path — all of which were // counted at enqueue time). 
this.redis.defineCommand("dequeueMessagesFromCkQueueTracked", { - numberOfKeys: 10, + numberOfKeys: 11, lua: ` local ckIndexKey = KEYS[1] local queueConcurrencyLimitKey = KEYS[2] @@ -4157,6 +4432,7 @@ local envQueueKey = KEYS[7] local masterQueueKey = KEYS[8] local ttlQueueKey = KEYS[9] local lengthCounterKey = KEYS[10] +local runningCounterKey = KEYS[11] local ckWildcardName = ARGV[1] local currentTime = tonumber(ARGV[2]) @@ -4164,6 +4440,8 @@ local defaultEnvConcurrencyLimit = ARGV[3] local defaultEnvConcurrencyBurstFactor = ARGV[4] local keyPrefix = ARGV[5] local maxCount = tonumber(ARGV[6] or '1') +${QUEUE_METRICS_GAUGE_PRELUDE} +${QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA} local function decrLengthCounter() if tonumber(redis.call('GET', lengthCounterKey) or '0') > 0 then @@ -4178,7 +4456,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor) if envCurrentConcurrency >= envConcurrencyLimitWithBurstFactor then - return nil + return __qmret(nil) end local queueConcurrencyLimit = math.min(tonumber(redis.call('GET', queueConcurrencyLimitKey) or '1000000'), envConcurrencyLimit) @@ -4187,7 +4465,7 @@ local envAvailableCapacity = envConcurrencyLimitWithBurstFactor - envCurrentConc local actualMaxCount = math.min(maxCount, envAvailableCapacity) if actualMaxCount <= 0 then - return nil + return __qmret(nil) end local ckQueues = redis.call('ZRANGEBYSCORE', ckIndexKey, '-inf', tostring(currentTime), 'LIMIT', 0, actualMaxCount * 3) @@ -4199,7 +4477,7 @@ if #ckQueues == 0 then else redis.call('ZADD', masterQueueKey, anyIdx[2], ckWildcardName) end - return nil + return __qmret(nil) end local results = {} @@ -4281,7 +4559,7 @@ else redis.call('ZADD', masterQueueKey, earliestIdx[2], ckWildcardName) end -return results +return __qmret(results) `, }); @@ -5199,8 +5477,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, 
currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtl( //keys @@ -5229,8 +5508,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; expireTtlRuns( //keys @@ -5265,8 +5545,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, keyPrefix: string, maxCount: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[string[] | null, number[] | null]> + ): Result<[string[] | null, number[] | null], Context>; dequeueMessageFromWorkerQueueNonBlocking( workerQueueKey: string, @@ -5405,8 +5686,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtlCk( //keys @@ -5437,8 +5719,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; dequeueMessagesFromCkQueue( //keys @@ -5551,8 +5834,9 @@ declare module "@internal/redis" { enableFastPath: string, keyPrefix: string, counterTtl: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtlCkTracked( masterQueueKey: string, @@ -5585,8 +5869,9 @@ declare module 
"@internal/redis" { enableFastPath: string, keyPrefix: string, counterTtl: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; dequeueMessagesFromCkQueueTracked( ckIndexKey: string, @@ -5599,14 +5884,16 @@ declare module "@internal/redis" { masterQueueKey: string, ttlQueueKey: string, lengthCounterKey: string, + runningCounterKey: string, ckWildcardName: string, currentTime: string, defaultEnvConcurrencyLimit: string, defaultEnvConcurrencyBurstFactor: string, keyPrefix: string, maxCount: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[string[] | null, number[] | null]> + ): Result<[string[] | null, number[] | null], Context>; dequeueMessageFromKeyTracked( messageKey: string, diff --git a/internal-packages/run-engine/src/run-queue/keyProducer.ts b/internal-packages/run-engine/src/run-queue/keyProducer.ts index b185435f6f6..18a2727b7e4 100644 --- a/internal-packages/run-engine/src/run-queue/keyProducer.ts +++ b/internal-packages/run-engine/src/run-queue/keyProducer.ts @@ -141,8 +141,7 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { } queueConcurrencyLimitKeyFromQueue(queue: string) { - const concurrencyQueueName = queue.replace(/:ck:.+$/, ""); - return `${concurrencyQueueName}:${constants.CONCURRENCY_LIMIT_PART}`; + return `${this.baseQueueKeyFromQueue(queue)}:${constants.CONCURRENCY_LIMIT_PART}`; } queueCurrentConcurrencyKeyFromQueue(queue: string) { @@ -313,12 +312,14 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { } ckIndexKeyFromQueue(queue: string): string { - const baseQueue = queue.replace(/:ck:.+$/, ""); - return `${baseQueue}:${constants.CK_INDEX_PART}`; + return `${this.baseQueueKeyFromQueue(queue)}:${constants.CK_INDEX_PART}`; } + // indexOf instead of /:ck:.+$/ (queue names are user-controlled; polynomial regex). 
+ // Only strips when at least one character follows ":ck:", matching the old semantics. baseQueueKeyFromQueue(queue: string): string { - return queue.replace(/:ck:.+$/, ""); + const idx = queue.indexOf(":ck:"); + return idx === -1 || idx + 4 >= queue.length ? queue : queue.slice(0, idx); } queueLengthCounterKey(env: RunQueueKeyProducerEnvironment, queue: string): string { @@ -342,7 +343,8 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { } toCkWildcard(queue: string): string { - return queue.replace(/:ck:.+$/, ":ck:*"); + const base = this.baseQueueKeyFromQueue(queue); + return base === queue ? queue : `${base}:ck:*`; } descriptorFromQueue(queue: string): QueueDescriptor { diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts new file mode 100644 index 00000000000..ebfc295470e --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts @@ -0,0 +1,397 @@ +import { createRedisClient } from "@internal/redis"; +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { + allStreamKeys, + MetricsStreamEmitter, + type MetricDefinition, +} from "@internal/metrics-pipeline"; +import { Logger } from "@trigger.dev/core/logger"; +import { Decimal } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { describe, expect } from "vitest"; +import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js"; +import { RunQueue } from "./index.js"; +import { RunQueueFullKeyProducer } from "./keyProducer.js"; +import type { InputPayload } from "./types.js"; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + concurrencyLimitBurstFactor: new Decimal(1.0), + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +async function readAllEntries( + redisOptions: { + host: string; + port: number; + }, + 
definition: MetricDefinition +) { + const client = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const entries: Array<{ id: string; fields: Record }> = []; + for (const key of allStreamKeys(definition)) { + const raw = (await client.xrange(key, "-", "+")) as Array<[string, string[]]>; + for (const [id, flat] of raw) { + const fields: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] = flat[i + 1]!; + entries.push({ id, fields }); + } + } + await client.quit(); + return entries; +} + +// Gauges now land via a fire-and-forget Node XADD after the script reply (not synchronously +// inside the Lua), so reads must poll until the expected entries appear. +async function waitForEntries( + redisOptions: { host: string; port: number }, + definition: MetricDefinition, + predicate: (entries: Array<{ id: string; fields: Record }>) => boolean, + timeoutMs = 5000 +) { + const start = Date.now(); + let entries = await readAllEntries(redisOptions, definition); + while (!predicate(entries)) { + if (Date.now() - start > timeoutMs) return entries; + await setTimeout(50); + entries = await readAllEntries(redisOptions, definition); + } + return entries; +} + +describe("RunQueue queue-metrics emission", () => { + redisTest("emits gauge + enqueue/started/ack events when enabled", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_test_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ + redis, + definition, + flag: { enabled: () => true }, + }); + + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new 
RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-metrics", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + eligibleAtMs: Date.now() - 500, + attempt: 0, + }; + + try { + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); + await setTimeout(1000); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + await queue.acknowledgeMessage(message.orgId, message.runId); + await setTimeout(100); + + const entries = await waitForEntries(redis, definition, (es) => { + const seen = es.map((e) => e.fields.op); + return ["enqueue", "gauge", "started", "ack"].every((o) => seen.includes(o)); + }); + const ops = entries.map((e) => e.fields.op); + expect(ops).toContain("enqueue"); + expect(ops).toContain("gauge"); + expect(ops).toContain("started"); + expect(ops).toContain("ack"); + + const gauge = entries.find((e) => e.fields.op === "gauge"); + assertGauge(gauge); + expect(gauge!.fields.q).toContain("task/my-task"); + for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { + expect(gauge!.fields[f]).toBeDefined(); + } + // Non-CK scripts keep the 7-field gauge (no CK-health tail). + expect(gauge!.fields.ckq).toBeUndefined(); + expect(gauge!.fields.ckw).toBeUndefined(); + + // The first counter emission also seeds a cum=0 baseline (no wait); the real reading + // carries wait. Pick the reading (cum > 0). 
+ const started = entries.find((e) => e.fields.op === "started" && Number(e.fields.cum) > 0); + expect(started!.fields.wait).toBeDefined(); + expect(Number(started!.fields.wait)).toBeGreaterThanOrEqual(0); + expect(Number(started!.fields.cum)).toBeGreaterThan(0); + } finally { + await queue.quit(); + await emitter.close(); + } + }); + + redisTest( + "emits a fast-path gauge reusing the admission-check locals", + async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_fp_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ + redis, + definition, + flag: { enabled: () => true }, + }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-fastpath", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, + }; + + try { + // enableFastPath + empty queue + zero concurrency => the Lua takes the fast path, + // so the gauge runs the reuse snippet (queueCurrent/envCurrent/queueLimit/envLimit). 
+ await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + enableFastPath: true, + }); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + + const entries = await waitForEntries( + redis, + definition, + (es) => + es.some((e) => e.fields.op === "gauge") && es.some((e) => e.fields.op === "enqueue") + ); + const gauge = entries.find((e) => e.fields.op === "gauge"); + assertGauge(gauge); + for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { + expect(gauge!.fields[f]).toBeDefined(); + } + // Fast path was taken => capacity was available => not throttled. + expect(gauge!.fields.thr).toBe("0"); + expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); + } finally { + await queue.quit(); + await emitter.close(); + } + } + ); + + redisTest("emits an aggregate gauge for CK queues at dequeue", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_ck_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ redis, definition, flag: { enabled: () => true } }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-ck", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + concurrencyKey: "tenant-1", + timestamp: Date.now(), + eligibleAtMs: 
Date.now() - 300, + attempt: 0, + }; + + try { + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); + await setTimeout(1000); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + + const entries = await waitForEntries(redis, definition, (es) => + es.some( + (e) => e.fields.op === "gauge" && e.fields.q.includes(":ck:") && e.fields.thr === "0" + ) + ); + const gauges = entries.filter((e) => e.fields.op === "gauge"); + expect(gauges.length).toBeGreaterThan(0); + // The aggregate CK dequeue gauge targets the CK wildcard and never sets thr. + const aggregate = gauges.find((e) => e.fields.q.includes(":ck:") && e.fields.thr === "0"); + assertGauge(aggregate); + expect(Number(aggregate!.fields.ql)).toBeGreaterThanOrEqual(0); + expect(Number(aggregate!.fields.cc)).toBeGreaterThanOrEqual(0); + + // Every CK-path gauge carries the CK-health tail; the enqueue-time reading (and the + // pre-dequeue aggregate reading) sees the backlogged key. + const ckGauges = gauges.filter((e) => e.fields.q.includes(":ck:")); + for (const g of ckGauges) { + expect(g.fields.ckq).toBeDefined(); + expect(g.fields.ckw).toBeDefined(); + expect(Number(g.fields.ckw)).toBeGreaterThanOrEqual(0); + } + expect(ckGauges.some((g) => Number(g.fields.ckq) >= 1)).toBe(true); + + // CK counter entries carry both odometers: the reading has cum + ck/ckcum, and each + // odometer seeds its own baseline entry (cum-only vs ck+ckcum-only). 
+ const enqueues = entries.filter((e) => e.fields.op === "enqueue"); + const reading = enqueues.find((e) => e.fields.cum != null && e.fields.ckcum != null); + expect(reading).toBeDefined(); + expect(reading!.fields.ck).toBe("tenant-1"); + expect(reading!.fields.q).not.toContain(":ck:"); + expect(Number(reading!.fields.cum)).toBe(1); + expect(Number(reading!.fields.ckcum)).toBe(1); + const baseBaseline = enqueues.find((e) => e.fields.cum === "0" && e.fields.ck == null); + expect(baseBaseline).toBeDefined(); + const ckBaseline = enqueues.find((e) => e.fields.ckcum === "0" && e.fields.cum == null); + expect(ckBaseline).toBeDefined(); + expect(ckBaseline!.fields.ck).toBe("tenant-1"); + } finally { + await queue.quit(); + await emitter.close(); + } + }); + + redisTest("gauge sampling gates gauges but not counters", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_sample_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + // gaugeSampleRate 0 => sampledSync() always false => Lua gauge never fires; counters still do. 
+ const emitter = new MetricsStreamEmitter({ + redis, + definition, + flag: { enabled: () => true }, + gaugeSampleRate: 0, + }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-sample", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, + }; + + try { + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); + await setTimeout(1000); + await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + + // Poll until the counter (enqueue) lands; by then a gauge would have too, if sampled in. 
+ const entries = await waitForEntries(redis, definition, (es) => + es.some((e) => e.fields.op === "enqueue") + ); + expect(entries.some((e) => e.fields.op === "gauge")).toBe(false); + expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); + } finally { + await queue.quit(); + await emitter.close(); + } + }); +}); + +function assertGauge(gauge: unknown): asserts gauge { + if (!gauge) throw new Error("expected a gauge entry"); +} diff --git a/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts b/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts index 224540f4efb..4eb47d59bc0 100644 --- a/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/ckIndex.test.ts @@ -471,4 +471,46 @@ describe("CK Index", () => { await queue.quit(); } }); + + redisTest( + "concurrencyKeyBreakdown lists backlogged keys most-starved first", + async ({ redisContainer }) => { + const queue = createQueue(redisContainer); + try { + const now = Date.now(); + const enqueue = (runId: string, concurrencyKey: string, timestamp: number) => + queue.enqueueMessage({ + env: authenticatedEnvDev, + message: makeMessage({ runId, concurrencyKey, timestamp }), + workerQueue: authenticatedEnvDev.id, + skipDequeueProcessing: true, + }); + + // ck-a has the oldest head (most starved) and 2 queued; ck-b has 1. 
+ await enqueue("r1", "ck-a", now - 10_000); + await enqueue("r2", "ck-a", now - 5_000); + await enqueue("r3", "ck-b", now - 2_000); + + const breakdown = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/my-task"); + expect(breakdown.totalBackloggedKeys).toBe(2); + expect(breakdown.keys).toEqual([ + { concurrencyKey: "ck-a", queued: 2, running: 0, oldestEnqueuedAt: now - 10_000 }, + { concurrencyKey: "ck-b", queued: 1, running: 0, oldestEnqueuedAt: now - 2_000 }, + ]); + + const limited = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/my-task", { + limit: 1, + }); + expect(limited.totalBackloggedKeys).toBe(2); + expect(limited.keys).toHaveLength(1); + expect(limited.keys[0]!.concurrencyKey).toBe("ck-a"); + + // Queues with no CK backlog return an empty breakdown. + const empty = await queue.concurrencyKeyBreakdown(authenticatedEnvDev, "task/other-task"); + expect(empty).toEqual({ totalBackloggedKeys: 0, keys: [] }); + } finally { + await queue.quit(); + } + } + ); }); diff --git a/internal-packages/run-engine/src/run-queue/types.ts b/internal-packages/run-engine/src/run-queue/types.ts index 0905f3971de..8a7d3c93ec5 100644 --- a/internal-packages/run-engine/src/run-queue/types.ts +++ b/internal-packages/run-engine/src/run-queue/types.ts @@ -13,6 +13,9 @@ export const InputPayload = z.object({ queue: z.string(), concurrencyKey: z.string().optional(), timestamp: z.number(), + // Unix ms the run became eligible (delayUntil if set, else triggered-at), pre-priority. + // Dequeue scheduling delay = dequeueTime - eligibleAtMs. Optional for old-payload compat. + eligibleAtMs: z.number().optional(), attempt: z.number(), /** TTL expiration timestamp (unix ms). If set, run will be expired when this time is reached. 
*/ ttlExpiresAt: z.number().optional(), diff --git a/internal-packages/tsql/src/index.test.ts b/internal-packages/tsql/src/index.test.ts index f9aca2f236d..ce358e6ac08 100644 --- a/internal-packages/tsql/src/index.test.ts +++ b/internal-packages/tsql/src/index.test.ts @@ -231,6 +231,26 @@ describe("injectFallbackConditions", () => { expect(modified.where.expression_type).toBe("and"); } }); + + it("should inject into a FROM subquery, where the fallback column's table lives", () => { + const ast = parseTSQLSelect( + "SELECT t, sum(total) AS total FROM (SELECT time AS t, status, count(*) AS total FROM task_runs GROUP BY t, status) GROUP BY t" + ); + const fallbacks: Record = { + time: { op: "gte", value: "2024-01-01" }, + }; + + const modified = injectFallbackConditions(ast, fallbacks); + expect(modified.expression_type).toBe("select_query"); + if (modified.expression_type === "select_query") { + expect(modified.where).toBeUndefined(); + const inner = modified.select_from?.table; + expect(inner?.expression_type).toBe("select_query"); + if (inner?.expression_type === "select_query") { + expect(isColumnReferencedInExpression(inner.where, "time")).toBe(true); + } + } + }); }); describe("compileTSQL with whereClauseFallback", () => { diff --git a/internal-packages/tsql/src/index.ts b/internal-packages/tsql/src/index.ts index 1d8759c108c..1ebd1a60a5d 100644 --- a/internal-packages/tsql/src/index.ts +++ b/internal-packages/tsql/src/index.ts @@ -429,6 +429,24 @@ export function injectFallbackConditions( // Handle SelectQuery const selectQuery = ast as SelectQuery; + + // When the FROM is a subquery, the fallback columns belong to the inner query's + // table, not this level; descend so e.g. a time fallback lands next to the table ref. 
+ const fromTable = selectQuery.select_from?.table; + if ( + fromTable && + (fromTable.expression_type === "select_query" || + fromTable.expression_type === "select_set_query") + ) { + return { + ...selectQuery, + select_from: { + ...selectQuery.select_from!, + table: injectFallbackConditions(fromTable, fallbacks) as SelectQuery | SelectSetQuery, + }, + }; + } + const existingWhere = selectQuery.where; // Collect fallback expressions for columns not already in WHERE @@ -541,6 +559,12 @@ export interface CompileTSQLOptions { * ``` */ timeRange?: TimeRange; + /** + * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query. + * Counters zero-fill, gauges (columns with `fillMode: "carry"`) carry forward. + * Off by default; output is unchanged when not set. + */ + fillGaps?: boolean; } /** @@ -599,6 +623,7 @@ export function compileTSQL(query: string, options: CompileTSQLOptions): PrintRe fieldMappings: options.fieldMappings, enforcedWhereClause, timeRange: options.timeRange, + fillGaps: options.fillGaps, }); // 6. 
Print the AST to ClickHouse SQL (enforced conditions applied at printer level) diff --git a/internal-packages/tsql/src/query/functions.ts b/internal-packages/tsql/src/query/functions.ts index 2f2b9278454..a6dadf0f609 100644 --- a/internal-packages/tsql/src/query/functions.ts +++ b/internal-packages/tsql/src/query/functions.ts @@ -645,11 +645,24 @@ export const TSQL_AGGREGATIONS: Record = { maxParams: 1, aggregate: true, }, + quantilesTDigestMerge: { + clickhouseName: "quantilesTDigestMerge", + minArgs: 1, + maxArgs: 1, + minParams: 1, + aggregate: true, + }, sumMerge: { clickhouseName: "sumMerge", minArgs: 1, maxArgs: 1, aggregate: true }, avgMerge: { clickhouseName: "avgMerge", minArgs: 1, maxArgs: 1, aggregate: true }, countMerge: { clickhouseName: "countMerge", minArgs: 1, maxArgs: 1, aggregate: true }, minMerge: { clickhouseName: "minMerge", minArgs: 1, maxArgs: 1, aggregate: true }, maxMerge: { clickhouseName: "maxMerge", minArgs: 1, maxArgs: 1, aggregate: true }, + deltaSumTimestampMerge: { + clickhouseName: "deltaSumTimestampMerge", + minArgs: 1, + maxArgs: 1, + aggregate: true, + }, // Statistical functions simpleLinearRegression: { diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index 0efa0d34fc4..dbc14818cae 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -3831,3 +3831,388 @@ describe("timeBucket()", () => { }); }); }); + +// ============================================================ +// fillGaps Tests +// ============================================================ + +describe("timeBucket() fillGaps", () => { + // Schema with a gauge column (fillMode: "carry"), a counter, and a groupable dim. 
+ const metricsSchema: TableSchema = { + name: "metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") }, + queue_name: { name: "queue_name", ...column("String") }, + max_running: { name: "max_running", ...column("UInt64"), fillMode: "carry" }, + enqueued: { name: "enqueued", ...column("UInt64"), fillMode: "zero" }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + // 7-day range -> 6 HOUR buckets (same as the timeBucket() block). + const sevenDayRange = { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }; + + function ctx(fillGaps: boolean): PrinterContext { + return createPrinterContext({ + schema: createSchemaRegistry([metricsSchema]), + enforcedWhereClause: { + organization_id: { op: "eq", value: "org_test123" }, + project_id: { op: "eq", value: "proj_test456" }, + environment_id: { op: "eq", value: "env_test789" }, + }, + timeRange: sevenDayRange, + fillGaps, + }); + } + + function run(query: string, fillGaps: boolean) { + const context = ctx(fillGaps); + const result = printToClickHouse(parseTSQLSelect(query), context); + return { ...result, warnings: context.warnings }; + } + + it("emits no WITH FILL when fillGaps is off (unchanged)", () => { + const query = + "SELECT timeBucket(), max(max_running), count() FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql } = run(query, false); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("single-series gauge + counter: WITH FILL plus INTERPOLATE for the gauge only", () => { + const query = + "SELECT 
timeBucket(), max(max_running) AS max_running, count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql, params } = run(query, true); + + // STEP matches the 6 HOUR bucket interval, FROM/TO snapped + parameterized. + expect(sql).toContain("WITH FILL FROM toStartOfInterval({"); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).toMatch(/TO toStartOfInterval\(\{[^}]+: DateTime64\(6\)\}, INTERVAL 6 HOUR\)/); + + // Gauge carried forward; counter omitted (defaults to 0). + expect(sql).toContain("INTERPOLATE (max_running AS max_running)"); + expect(sql).not.toContain("runs AS runs"); + + // FROM/TO bounds are real parameters carrying the time range. + const dateParams = Object.values(params).filter((v) => v instanceof Date); + expect(dateParams).toContainEqual(sevenDayRange.from); + expect(dateParams).toContainEqual(sevenDayRange.to); + }); + + it("single-series counter only: WITH FILL but no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql } = run(query, true); + expect(sql).toContain("WITH FILL FROM toStartOfInterval({"); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("grouped counter only: group dim first, then WITH FILL, no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), queue_name, count() AS runs FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket"; + const { sql } = run(query, true); + expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("grouped + carry gauge: per-group LOCF via window functions, no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), queue_name, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket"; + const { sql, warnings } = run(query, true); + + // 
Inner query densifies per group (dims first, then the bucket WITH FILL) + sentinel. + expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).toContain("1 AS __tsql_present"); + + // Block id increments at each real row, partitioned by the group dim. + expect(sql).toContain( + "sum(__tsql_present) OVER (PARTITION BY queue_name ORDER BY timebucket ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS __tsql_block" + ); + + // Gauge carried within each (group, block); never INTERPOLATE (which bleeds across groups). + expect(sql).toContain( + "max(if(__tsql_present = 1, max_running, NULL)) OVER (PARTITION BY queue_name, __tsql_block) AS max_running" + ); + expect(sql).not.toContain("INTERPOLATE"); + + // Final result re-ordered by the user's ORDER BY, and not skipped. + expect(sql).toMatch(/\)\s*ORDER BY timebucket ASC$/); + expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(false); + }); + + it("grouped + carry gauge with a non-plain group dim: fill is skipped", () => { + const query = + "SELECT timeBucket(), upper(queue_name) AS q, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, upper(queue_name) ORDER BY timeBucket"; + const { sql, warnings } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("__tsql_block"); + expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(true); + }); + + it("user ORDER BY not led by timeBucket: fill is skipped", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY runs DESC"; + const { sql } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("bucket-led ORDER BY DESC: fill is skipped (ascending fill would be invalid)", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket DESC"; + 
const { sql } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + // The plain descending order still stands. + expect(sql).toContain("ORDER BY timebucket DESC"); + }); +}); + +describe("cross-queue counter totals via subquery (env-wide throughput shape)", () => { + // deltaSumTimestamp states must merge per queue, then sum outside; this is the + // supported shape for env-wide totals. + const metricsSchema: TableSchema = { + name: "metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") }, + queue_name: { name: "queue_name", ...column("String") }, + started_delta: { + name: "started_delta", + mergeGroupKey: "queue_name", + ...column("String"), + groupable: false, + sortable: false, + filterable: false, + }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + function runSubquery(query: string) { + const context = createPrinterContext({ + schema: createSchemaRegistry([metricsSchema]), + enforcedWhereClause: { + organization_id: { op: "eq", value: "org_test123" }, + }, + timeRange: { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }, + }); + const result = printToClickHouse(parseTSQLSelect(query), context); + return { ...result, warnings: context.warnings }; + } + + it("compiles per-queue merge + outer sum, with tenant scoping inside the subquery", () => { + const { sql, params } = runSubquery(` + SELECT t, sum(started) AS started + FROM ( + SELECT timeBucket() AS t, queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM metrics + GROUP BY t, queue_name + 
) + GROUP BY t + ORDER BY t + `); + + expect(sql).toContain("deltaSumTimestampMerge(started_delta)"); + expect(sql).toContain("toStartOfInterval(created_at, INTERVAL 6 HOUR)"); + const subqueryStart = sql.indexOf("FROM ("); + const tenantFilter = sql.indexOf("organization_id"); + expect(subqueryStart).toBeGreaterThan(-1); + expect(tenantFilter).toBeGreaterThan(subqueryStart); + expect(Object.values(params)).toContain("org_test123"); + }); +}); + +describe("mergeGroupKey validation", () => { + const metricsSchema: TableSchema = { + name: "metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", ...column("DateTime64") }, + queue: { name: "queue", clickhouseName: "queue_name", ...column("String") }, + started_delta: { + name: "started_delta", + mergeGroupKey: "queue", + ...column("String"), + groupable: false, + sortable: false, + filterable: false, + }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + function compile( + query: string, + enforced: Record = { organization_id: { op: "eq", value: "org_x" } } + ) { + const context = createPrinterContext({ + schema: createSchemaRegistry([metricsSchema]), + enforcedWhereClause: enforced as never, + timeRange: { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }, + }); + return printToClickHouse(parseTSQLSelect(query), context); + } + + it("rejects an ungrouped, unpinned merge with an actionable message", () => { + expect(() => + compile( + "SELECT timeBucket() AS t, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t" + ) + ).toThrowError( + /Merging 'started_delta' across every queue[\s\S]*GROUP BY 
queue\)[\s\S]*WHERE queue = 'my-queue'[\s\S]*inner GROUP BY t, queue and outer GROUP BY t/ + ); + }); + + it("allows the merge when queue is in the GROUP BY", () => { + const { sql } = compile( + "SELECT timeBucket() AS t, queue, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t, queue" + ); + expect(sql).toContain("deltaSumTimestampMerge(started_delta)"); + }); + + it("allows the merge when queue is pinned by an equality filter", () => { + const { sql } = compile( + "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics WHERE queue = 'emails'" + ); + expect(sql).toContain("deltaSumTimestampMerge(started_delta)"); + }); + + it("allows the merge when the enforced clause pins queue to one value", () => { + const { sql } = compile( + "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics", + { organization_id: { op: "eq", value: "org_x" }, queue: { op: "in", values: ["emails"] } } + ); + expect(sql).toContain("deltaSumTimestampMerge(started_delta)"); + }); + + it("rejects the merge when the enforced clause spans several queues", () => { + expect(() => + compile("SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics", { + organization_id: { op: "eq", value: "org_x" }, + queue: { op: "in", values: ["emails", "webhooks"] }, + }) + ).toThrowError(/only combine correctly within one queue/); + }); + + it("allows a grouped inner merge summed by the outer query", () => { + const { sql } = compile( + "SELECT t, sum(started) AS started FROM (SELECT timeBucket() AS t, queue, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t, queue) GROUP BY t ORDER BY t" + ); + expect(sql).toContain("GROUP BY t, queue_name"); + }); + + it("rejects an ungrouped merge inside a subquery", () => { + expect(() => + compile( + "SELECT t, sum(started) AS started FROM (SELECT timeBucket() AS t, deltaSumTimestampMerge(started_delta) AS started FROM metrics GROUP BY t) GROUP BY t" + ) + ).toThrowError(/only combine 
correctly within one queue/); + }); +}); + +describe("compound mergeGroupKey validation", () => { + const byKeySchema: TableSchema = { + name: "metrics_by_key", + clickhouseName: "trigger_dev.queue_metrics_ck_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", ...column("DateTime64") }, + queue: { name: "queue", clickhouseName: "queue_name", ...column("String") }, + concurrency_key: { name: "concurrency_key", ...column("String") }, + started_delta: { + name: "started_delta", + mergeGroupKey: ["queue", "concurrency_key"], + ...column("String"), + groupable: false, + sortable: false, + filterable: false, + }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + function compile(query: string) { + const context = createPrinterContext({ + schema: createSchemaRegistry([byKeySchema]), + enforcedWhereClause: { organization_id: { op: "eq", value: "org_x" } } as never, + timeRange: { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }, + }); + return printToClickHouse(parseTSQLSelect(query), context); + } + + it("requires EVERY listed key grouped or pinned", () => { + expect(() => + compile( + "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails'" + ) + ).toThrowError(/only combine correctly within one concurrency_key/); + expect(() => + compile( + "SELECT concurrency_key, deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key GROUP BY concurrency_key" + ) + ).toThrowError(/only combine correctly within one queue/); + }); + + it("allows pin + group combinations covering both keys", () => { + const grouped = compile( + "SELECT concurrency_key, 
deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails' GROUP BY concurrency_key" + ); + expect(grouped.sql).toContain("deltaSumTimestampMerge(started_delta)"); + const pinned = compile( + "SELECT deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key WHERE queue = 'emails' AND concurrency_key = 't1'" + ); + expect(pinned.sql).toContain("deltaSumTimestampMerge(started_delta)"); + const bothGrouped = compile( + "SELECT queue, concurrency_key, deltaSumTimestampMerge(started_delta) AS started FROM metrics_by_key GROUP BY queue, concurrency_key" + ); + expect(bothGrouped.sql).toContain("GROUP BY queue_name, concurrency_key"); + }); +}); diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 82d97f5491b..3ee9a0ab76a 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -385,6 +385,8 @@ export class ClickHousePrinter { nextJoin = nextJoin.next_join; } + this.validateMergeScopedColumns(node); + // Extract SELECT column aliases BEFORE visiting columns // This allows ORDER BY/HAVING to reference aliased columns const savedAliases = this.selectAliases; @@ -459,6 +461,25 @@ export class ClickHousePrinter { this.inProjectionContext = false; } + // Opt-in gap-fill: emit rows for empty time buckets via WITH FILL / INTERPOLATE. + // No-op unless enabled, top-level, and the query is fill-eligible. + let interpolateClause: string | null = null; + let groupedFillWrap: ((inner: string) => string) | null = null; + if (this.context.fillGaps && isTopLevelQuery) { + const fill = this.buildGapFill(node, orderBy, groupBy); + if (fill) { + orderBy = fill.orderBy; + if (fill.kind === "inline") { + interpolateClause = fill.interpolate; + } else { + // Grouped per-group LOCF: add the `present` sentinel to this (now inner) query + // and wrap the rendered SQL in the block-id + carry window layers below. 
+ columns.push(fill.presentColumn); + groupedFillWrap = fill.wrap; + } + } + } + // Process ARRAY JOIN let arrayJoin = ""; if (node.array_join_op) { @@ -487,6 +508,8 @@ export class ClickHousePrinter { having ? `HAVING${space}${having}` : null, windowClause ? `WINDOW${space}${windowClause}` : null, orderBy && orderBy.length > 0 ? `ORDER BY${space}${orderBy.join(comma)}` : null, + // INTERPOLATE must follow the full ORDER BY (including WITH FILL) + interpolateClause, ]; // Process LIMIT @@ -549,6 +572,11 @@ export class ClickHousePrinter { response = this.pretty ? `(${response.trim()})` : `(${response})`; } + // Grouped per-group gap fill wraps this query in the block-id + carry window layers. + if (groupedFillWrap) { + response = groupedFillWrap(response); + } + // Restore saved contexts (for nested queries) this.selectAliases = savedAliases; this.queryHasGroupBy = savedQueryHasGroupBy; @@ -559,6 +587,183 @@ export class ClickHousePrinter { return response; } + /** + * Build the gap-fill transformation (WITH FILL + optional INTERPOLATE) for a + * top-level time-bucketed query. Returns null when the query is not + * fill-eligible (correct-by-construction: emit nothing extra rather than risk + * wrong values). + * + * Eligibility: exactly one timeBucket() column in SELECT, and ORDER BY led by + * that timeBucket column. Carry (gauge) columns are LOCF'd via INTERPOLATE; + * counters zero-fill via WITH FILL's default. Grouped gauge queries are unsafe + * (INTERPOLATE bleeds across groups) and are skipped with a warning. 
+ */ + private buildGapFill( + node: SelectQuery, + orderBy: string[] | null, + groupBy: string[] | null + ): + | { kind: "inline"; orderBy: string[]; interpolate: string | null } + | { kind: "wrap"; orderBy: string[]; presentColumn: string; wrap: (inner: string) => string } + | null { + if (!orderBy || orderBy.length === 0 || !node.select || node.select.length === 0) { + return null; + } + + const timeRange = this.context.timeRange; + if (!timeRange) { + return null; + } + + // Need a time-constraint table to derive the bucket column + interval. + const tableWithConstraint = this.findTimeConstraintTable(); + if (!tableWithConstraint) { + return null; + } + const { tableSchema, clickhouseColumnName } = tableWithConstraint; + const interval = calculateTimeBucketInterval( + timeRange.from, + timeRange.to, + tableSchema.timeBucketThresholds + ); + const bucketSql = `toStartOfInterval(${escapeClickHouseIdentifier(clickhouseColumnName)}, INTERVAL ${interval.value} ${interval.unit})`; + + // Find exactly one timeBucket() column in SELECT and its output alias. + let bucketAlias: string | null = null; + let bucketCount = 0; + for (const col of node.select) { + const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col; + if ( + (inner as Call).expression_type === "call" && + (inner as Call).name.toLowerCase() === "timebucket" + ) { + bucketCount++; + bucketAlias = + (col as Alias).expression_type === "alias" ? (col as Alias).alias : "timebucket"; + } + } + if (bucketCount !== 1 || !bucketAlias) { + return null; + } + + // ORDER BY must be led by the timeBucket column (alias or full expression). + // Don't fight a user ordering like `ORDER BY count DESC`. + const leadTerm = orderBy[0]; + // Strip a trailing ASC/DESC direction without a regex: an unanchored `\s+` before the + // keyword backtracks polynomially across start positions on whitespace runs (CodeQL + // js/polynomial-redos). endsWith + slice is linear. 
+ const trimmedLead = leadTerm.trim(); + const upperLead = trimmedLead.toUpperCase(); + const isDescending = upperLead.endsWith(" DESC"); + const leadExpr = upperLead.endsWith(" ASC") + ? trimmedLead.slice(0, -4).trimEnd() + : isDescending + ? trimmedLead.slice(0, -5).trimEnd() + : trimmedLead; + const matchesBucket = (expr: string): boolean => + expr.toLowerCase() === bucketAlias!.toLowerCase() || expr === bucketSql; + if (!matchesBucket(leadExpr)) { + return null; + } + // WITH FILL is emitted with ascending bounds and a positive STEP, which is + // only valid for an ascending bucket order. A descending order would need + // swapped bounds and a negative step (newer ClickHouse only), so skip the + // gap-fill rewrite and let the plain descending ORDER BY stand. + if (isDescending) { + return null; + } + + // Group dims = GROUP BY expressions that are NOT the timeBucket column. + const groupDims = (groupBy ?? []).filter((g) => !matchesBucket(g.trim())); + + // Classify each SELECT output column. Carry (gauge) columns survive through + // aliases + value-preserving aggregates (see analyzeSelectColumn). A bare column + // that isn't the bucket is a GROUP BY dimension; everything else is a counter or + // derived value that zero-fills. + const carryAliases: string[] = []; + const dimNames: string[] = []; + const orderedOutputs: Array<{ name: string; carry: boolean }> = []; + for (const col of node.select) { + const { outputName, sourceColumn } = this.analyzeSelectColumn(col); + if (!outputName) continue; + const carry = sourceColumn?.fillMode === "carry"; + orderedOutputs.push({ name: outputName, carry }); + if (carry) carryAliases.push(outputName); + const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col; + if (!matchesBucket(outputName) && (inner as Field).expression_type === "field") { + dimNames.push(outputName); + } + } + + // Snap FROM/TO to the bucket grid and parameterize the bounds. 
+ const fromBound = this.context.addValue(timeRange.from); + const toBound = this.context.addValue(timeRange.to); + const withFill = + `WITH FILL FROM toStartOfInterval(${fromBound}, INTERVAL ${interval.value} ${interval.unit})` + + ` TO toStartOfInterval(${toBound}, INTERVAL ${interval.value} ${interval.unit})` + + ` STEP INTERVAL ${interval.value} ${interval.unit}`; + + const esc = escapeClickHouseIdentifier; + + // Single series: WITH FILL on the bucket + INTERPOLATE the carry columns (LOCF); + // counters omitted from INTERPOLATE so they zero-fill. + if (groupDims.length === 0) { + const newOrderBy = [...orderBy]; + newOrderBy[0] = `${leadTerm} ${withFill}`; + const interpolate = + carryAliases.length > 0 + ? `INTERPOLATE (${carryAliases.map((a) => `${esc(a)} AS ${esc(a)}`).join(", ")})` + : null; + return { kind: "inline", orderBy: newOrderBy, interpolate }; + } + + // Grouped, counters only: per-group zero-fill via WITH FILL ordered by the dims. + if (carryAliases.length === 0) { + return { + kind: "inline", + orderBy: [...groupDims, `${leadTerm} ${withFill}`], + interpolate: null, + }; + } + + // Grouped + gauge: per-group LOCF. INTERPOLATE bleeds across groups, so densify per + // group (WITH FILL + a `present` sentinel that is 0 on filled rows), assign a block id + // that increments at each real row, then carry the block's real value via window max. + // Only safe when every GROUP BY dim is a plain column we can PARTITION BY. + if (dimNames.length !== groupDims.length) { + this.context.addWarning( + "fill_skipped_grouped_gauge", + "fillGaps was skipped: per-group gap fill needs every GROUP BY dimension to be a plain column." 
+ ); + return null; + } + + const userOrderBy = [...orderBy]; + const presentCol = "__tsql_present"; + const blockCol = "__tsql_block"; + const partitionDims = dimNames.map(esc).join(", "); + const blockExpr = + `sum(${esc(presentCol)}) OVER (PARTITION BY ${partitionDims} ORDER BY ${esc(bucketAlias)}` + + ` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ${esc(blockCol)}`; + const finalColumns = orderedOutputs.map(({ name, carry }) => + carry + ? `max(if(${esc(presentCol)} = 1, ${esc(name)}, NULL)) OVER (PARTITION BY ${partitionDims}, ${esc( + blockCol + )}) AS ${esc(name)}` + : esc(name) + ); + const finalOrderBy = userOrderBy.length > 0 ? ` ORDER BY ${userOrderBy.join(", ")}` : ""; + const wrap = (inner: string): string => + `SELECT ${finalColumns.join(", ")} FROM (SELECT *, ${blockExpr} FROM (${inner.trim()}))${finalOrderBy}`; + + return { + kind: "wrap", + orderBy: [...dimNames.map(esc), `${leadTerm} ${withFill}`], + presentColumn: `1 AS ${esc(presentCol)}`, + wrap, + }; + } + /** * Extract column aliases from a SELECT expression. * Handles explicit aliases (AS name) and implicit names from aggregations/functions. 
@@ -1014,11 +1219,12 @@ export class ClickHousePrinter { if ((firstArg as Field).expression_type === "field") { const field = firstArg as Field; const columnInfo = this.resolveFieldToColumn(field.chain); - // Only propagate customRenderType, not the full column schema - if (columnInfo.column?.customRenderType) { + // Propagate customRenderType and fillMode (gauge-ness), not the full column schema + if (columnInfo.column?.customRenderType || columnInfo.column?.fillMode) { sourceColumn = { type: inferredType, customRenderType: columnInfo.column.customRenderType, + fillMode: columnInfo.column.fillMode, }; } } @@ -1679,6 +1885,165 @@ export class ClickHousePrinter { // Note: projectId and environmentId are optional - no validation needed } + /** * Reject queries that merge a scope-keyed aggregate state column (`mergeGroupKey`) + * across values of its key: such merges silently return wrong numbers. Valid shapes + * group by the key column or pin it to a single value (in the query's WHERE or via + * the enforced clause). Runs per SELECT scope; subqueries validate themselves. + */ + private validateMergeScopedColumns(node: SelectQuery): void { + for (const tableSchema of this.tableContexts.values()) { + for (const column of Object.values(tableSchema.columns)) { + if (!column.mergeGroupKey) continue; + const keys = Array.isArray(column.mergeGroupKey) + ? column.mergeGroupKey + : [column.mergeGroupKey]; + if (!this.scopeReferencesColumn(node, column.name)) continue; + for (const key of keys) { + if (this.groupByIncludesColumn(node, key)) continue; + if (this.wherePinsColumn(node.where, key)) continue; + if (this.enforcedPinsColumn(tableSchema, key)) continue; + throw new QueryError( + `Merging '${column.name}' across every ${key} returns wrong totals: its aggregate ` + + `states are kept per ${key} and only combine correctly within one ${key}. 
Either ` + `add '${key}' to the GROUP BY and sum the per-${key} results in an outer query, ` + `for example: SELECT sum(v) AS total FROM (SELECT ${key}, ` + `deltaSumTimestampMerge(${column.name}) AS v FROM ${tableSchema.name} ` + `GROUP BY ${key}). Or filter to a single ${key}, for example: ` + `WHERE ${key} = 'my-${key}'. For a time series, bucket both layers: ` + `inner GROUP BY t, ${key} and outer GROUP BY t.` + ); + } + } + } + } + + /** + * True when any clause of this SELECT scope (select, prewhere, where, group by, having, + * order by) references a field named name. Nested SELECTs are not searched. + */ + private scopeReferencesColumn(node: SelectQuery, name: string): boolean { + const parts: unknown[] = [ + node.select, + node.prewhere, + node.where, + node.group_by, + node.having, + node.order_by, + ]; + return parts.some((part) => this.expressionReferencesColumn(part, name)); + } + + /** + * Recursively walks an AST value (node, array, or plain object) looking for a field whose + * last chain segment equals name. Nested select_query / select_set_query nodes are skipped, + * the type and parent properties are never followed, and seen stops revisiting an object + * that was already walked, so cyclic references terminate. + */ + private expressionReferencesColumn( + expr: unknown, + name: string, + seen = new WeakSet() + ): boolean { + if (expr === null || typeof expr !== "object") return false; + if (seen.has(expr)) return false; + seen.add(expr); + if (Array.isArray(expr)) { + return expr.some((item) => this.expressionReferencesColumn(item, name, seen)); + } + const candidate = expr as { expression_type?: string; chain?: unknown[] }; + if ( + candidate.expression_type === "select_query" || + candidate.expression_type === "select_set_query" + ) { + return false; + } + if ( + candidate.expression_type === "field" && + Array.isArray(candidate.chain) && + candidate.chain[candidate.chain.length - 1] === name + ) { + return true; + } + return Object.entries(expr).some( + ([property, value]) => + property !== "type" && + property !== "parent" && + this.expressionReferencesColumn(value, name, seen) + ); + } + + /** + * True when GROUP BY lists the column as a bare field; the key wrapped in a function or + * other expression does not count. + */ + private groupByIncludesColumn(node: SelectQuery, name: string): boolean { + return (node.group_by ?? 
[]).some((expr) => { + const field = expr as Field; + return ( + field.expression_type === "field" && + Array.isArray(field.chain) && + field.chain[field.chain.length - 1] === name + ); + }); + } + + /** + * True when the WHERE clause restricts name to a single value. Pins only count on the + * top-level AND chain: a pin inside an OR guarantees nothing. Accepted shapes are the key + * compared by equality with a literal, or the key in an IN / GLOBAL IN against a literal + * or a one-element tuple. + */ + private wherePinsColumn(where: Expression | undefined, name: string): boolean { + if (!where) return false; + if (where.expression_type === "and") { + return (where as And).exprs.some((expr) => this.wherePinsColumn(expr, name)); + } + if (where.expression_type !== "compare_operation") return false; + const cmp = where as CompareOperation; + const isKeyField = (side: Expression) => { + const field = side as Field; + return ( + field.expression_type === "field" && + Array.isArray(field.chain) && + field.chain[field.chain.length - 1] === name + ); + }; + const fieldSide = [cmp.left, cmp.right].find(isKeyField); + if (!fieldSide) return false; + const other = fieldSide === cmp.left ? cmp.right : cmp.left; + const otherIsLiteral = (other as Constant).expression_type === "constant"; + // Equality pins only against a literal: key = other_column (or key = key) filters rows + // without restricting the key to one value. + if (cmp.op === CompareOperationOp.Eq) return otherIsLiteral; + if (cmp.op === CompareOperationOp.In || cmp.op === CompareOperationOp.GlobalIn) { + if (otherIsLiteral) return true; + const tuple = other as Tuple; + return tuple.expression_type === "tuple" && tuple.exprs.length === 1; + } + return false; + } + + /** + * True when the enforced WHERE clause pins key to one value: an eq condition, or an in + * condition with exactly one value. The clause is looked up by the column's logical name + * and, when the schema defines one, its clickhouseName. + */ + private enforcedPinsColumn(tableSchema: TableSchema, key: string): boolean { + const names = [key]; + const clickhouseName = tableSchema.columns[key]?.clickhouseName; + if (clickhouseName) names.push(clickhouseName); + for (const name of names) { + const condition = this.context.enforcedWhereClause[name] as + | { op?: string; values?: unknown[] } + | undefined; + if (!condition) continue; + if (condition.op === "eq") return true; + if (condition.op === "in" && condition.values?.length === 1) return true; + } + return false; + } + /** * Format a Date as a ClickHouse-compatible DateTime64 string. 
* ClickHouse expects format: 'YYYY-MM-DD HH:MM:SS.mmm' (in UTC) diff --git a/internal-packages/tsql/src/query/printer_context.ts b/internal-packages/tsql/src/query/printer_context.ts index d0fb41b5327..a964e2e04af 100644 --- a/internal-packages/tsql/src/query/printer_context.ts +++ b/internal-packages/tsql/src/query/printer_context.ts @@ -125,6 +125,9 @@ export class PrinterContext { */ readonly timeRange?: TimeRange; + /** When true, time-bucketed queries emit rows for empty buckets (opt-in). */ + readonly fillGaps?: boolean; + constructor( /** Schema registry containing allowed tables and columns */ public readonly schema: SchemaRegistry, @@ -138,13 +141,16 @@ export class PrinterContext { */ enforcedWhereClause: Record = {}, /** Time range for timeBucket() interval calculation */ - timeRange?: TimeRange + timeRange?: TimeRange, + /** Opt-in gap-fill for time-bucketed queries */ + fillGaps?: boolean ) { // Initialize with default settings this.settings = { ...DEFAULT_QUERY_SETTINGS, ...settings }; this.fieldMappings = fieldMappings; this.enforcedWhereClause = enforcedWhereClause; this.timeRange = timeRange; + this.fillGaps = fillGaps; } /** @@ -225,7 +231,8 @@ export class PrinterContext { this.settings, this.fieldMappings, this.enforcedWhereClause, - this.timeRange + this.timeRange, + this.fillGaps ); // Share the same values map so parameters are unified child.values = this.values; @@ -277,6 +284,8 @@ export interface PrinterContextOptions { * When provided, `timeBucket()` uses this to determine the appropriate bucket size. */ timeRange?: TimeRange; + /** When true, time-bucketed queries emit rows for empty buckets (opt-in). 
*/ + fillGaps?: boolean; } /** @@ -288,6 +297,7 @@ export function createPrinterContext(options: PrinterContextOptions): PrinterCon options.settings, options.fieldMappings, options.enforcedWhereClause, - options.timeRange + options.timeRange, + options.fillGaps ); } diff --git a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts index 9a1e2d2ddfe..a32b8ea142c 100644 --- a/internal-packages/tsql/src/query/schema.ts +++ b/internal-packages/tsql/src/query/schema.ts @@ -122,6 +122,18 @@ export interface ColumnSchema { * ``` */ customRenderType?: string; + /** + * Gap-fill behavior when the opt-in `fillGaps` feature emits rows for empty + * time buckets: `"carry"` = gauge (LOCF via INTERPOLATE), `"zero"` (default) + * = counter (missing buckets get 0). + */ + fillMode?: "zero" | "carry"; + /** + * Aggregate-state column whose states only merge correctly within one value of the + * named column(s) (e.g. per-queue counter states). Queries referencing it must GROUP BY + * every listed column or pin each to a single value; other shapes fail to compile. + */ + mergeGroupKey?: string | string[]; /** * Example value for documentation purposes. * @@ -409,6 +421,21 @@ export interface TableSchema { * is needed to get correct results. Not needed for plain MergeTree tables. */ useFinal?: boolean; + /** + * Coarser physical rollups with an identical logical schema, substituted by callers + * (not the printer) when the timeBucket() interval is at least minIntervalSeconds. + */ + rollups?: Array<{ minIntervalSeconds: number; clickhouseName: string }>; + /** + * Opt into the ClickHouse query cache; callers align time bounds to alignSeconds + * so repeated auto-refresh queries share cache entries. + */ + queryCache?: { ttlSeconds: number; alignSeconds: number }; + /** + * Excluded from user-facing listings (query editor, schema docs, schema API) by + * callers; the engine still compiles queries against it. 
+ */ + hidden?: boolean; } /** diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1a56a054f42..a49afc04da5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -365,6 +365,9 @@ importers: '@internal/llm-model-catalog': specifier: workspace:* version: link:../../internal-packages/llm-model-catalog + '@internal/metrics-pipeline': + specifier: workspace:* + version: link:../../internal-packages/metrics-pipeline '@internal/redis': specifier: workspace:* version: link:../../internal-packages/redis @@ -1255,6 +1258,25 @@ importers: specifier: 4.1.7 version: 4.1.7(@opentelemetry/api@1.9.1)(@types/node@22.20.0)(@vitest/coverage-v8@4.1.7)(vite@6.4.2(@types/node@22.20.0)(jiti@2.6.1)(lightningcss@1.29.2)(terser@5.46.1)(tsx@4.22.4)(yaml@2.9.0)) + internal-packages/metrics-pipeline: + dependencies: + '@internal/redis': + specifier: workspace:* + version: link:../redis + '@internal/tracing': + specifier: workspace:* + version: link:../tracing + '@trigger.dev/core': + specifier: workspace:* + version: link:../../packages/core + devDependencies: + '@internal/testcontainers': + specifier: workspace:* + version: link:../testcontainers + rimraf: + specifier: 6.0.1 + version: 6.0.1 + internal-packages/otlp-importer: dependencies: long: @@ -1335,6 +1357,9 @@ importers: '@internal/cache': specifier: workspace:* version: link:../cache + '@internal/metrics-pipeline': + specifier: workspace:* + version: link:../metrics-pipeline '@internal/redis': specifier: workspace:* version: link:../redis