From c0d5a540a77e38ccb2d4cbf30b797e9fa1fb62c4 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:29:35 +0100 Subject: [PATCH 01/37] feat(metrics-pipeline): generic Redis-stream to ClickHouse metrics pipeline --- .../metrics-pipeline/package.json | 33 ++ .../metrics-pipeline/src/cachedValue.ts | 121 +++++++ .../metrics-pipeline/src/consumer.test.ts | 332 +++++++++++++++++ .../metrics-pipeline/src/consumer.ts | 336 ++++++++++++++++++ .../metrics-pipeline/src/emitter.ts | 167 +++++++++ .../metrics-pipeline/src/flag.ts | 46 +++ .../metrics-pipeline/src/hash.ts | 15 + .../metrics-pipeline/src/idempotency.ts | 9 + .../metrics-pipeline/src/index.ts | 26 ++ internal-packages/metrics-pipeline/src/lua.ts | 41 +++ .../metrics-pipeline/src/pipeline.test.ts | 74 ++++ .../metrics-pipeline/src/types.ts | 40 +++ .../metrics-pipeline/test/setup.ts | 4 + .../metrics-pipeline/tsconfig.build.json | 21 ++ .../metrics-pipeline/tsconfig.json | 8 + .../metrics-pipeline/tsconfig.src.json | 20 ++ .../metrics-pipeline/tsconfig.test.json | 21 ++ .../metrics-pipeline/vitest.config.ts | 17 + pnpm-lock.yaml | 25 ++ 19 files changed, 1356 insertions(+) create mode 100644 internal-packages/metrics-pipeline/package.json create mode 100644 internal-packages/metrics-pipeline/src/cachedValue.ts create mode 100644 internal-packages/metrics-pipeline/src/consumer.test.ts create mode 100644 internal-packages/metrics-pipeline/src/consumer.ts create mode 100644 internal-packages/metrics-pipeline/src/emitter.ts create mode 100644 internal-packages/metrics-pipeline/src/flag.ts create mode 100644 internal-packages/metrics-pipeline/src/hash.ts create mode 100644 internal-packages/metrics-pipeline/src/idempotency.ts create mode 100644 internal-packages/metrics-pipeline/src/index.ts create mode 100644 internal-packages/metrics-pipeline/src/lua.ts create mode 100644 internal-packages/metrics-pipeline/src/pipeline.test.ts create mode 100644 internal-packages/metrics-pipeline/src/types.ts create mode 
100644 internal-packages/metrics-pipeline/test/setup.ts create mode 100644 internal-packages/metrics-pipeline/tsconfig.build.json create mode 100644 internal-packages/metrics-pipeline/tsconfig.json create mode 100644 internal-packages/metrics-pipeline/tsconfig.src.json create mode 100644 internal-packages/metrics-pipeline/tsconfig.test.json create mode 100644 internal-packages/metrics-pipeline/vitest.config.ts diff --git a/internal-packages/metrics-pipeline/package.json b/internal-packages/metrics-pipeline/package.json new file mode 100644 index 00000000000..10a7c137a1f --- /dev/null +++ b/internal-packages/metrics-pipeline/package.json @@ -0,0 +1,33 @@ +{ + "name": "@internal/metrics-pipeline", + "private": true, + "version": "0.0.1", + "main": "./dist/src/index.js", + "types": "./dist/src/index.d.ts", + "type": "module", + "exports": { + ".": { + "@triggerdotdev/source": "./src/index.ts", + "import": "./dist/src/index.js", + "types": "./dist/src/index.d.ts", + "default": "./dist/src/index.js" + } + }, + "dependencies": { + "@internal/redis": "workspace:*", + "@internal/tracing": "workspace:*", + "@trigger.dev/core": "workspace:*" + }, + "devDependencies": { + "@internal/testcontainers": "workspace:*", + "rimraf": "6.0.1" + }, + "scripts": { + "clean": "rimraf dist", + "typecheck": "tsc --noEmit -p tsconfig.build.json", + "test": "vitest --sequence.concurrent=false --no-file-parallelism", + "test:coverage": "vitest --sequence.concurrent=false --no-file-parallelism --coverage.enabled", + "build": "pnpm run clean && tsc -p tsconfig.build.json", + "dev": "tsc --watch -p tsconfig.build.json" + } +} diff --git a/internal-packages/metrics-pipeline/src/cachedValue.ts b/internal-packages/metrics-pipeline/src/cachedValue.ts new file mode 100644 index 00000000000..6183d62c072 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/cachedValue.ts @@ -0,0 +1,121 @@ +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import { Logger } from 
"@trigger.dev/core/logger"; + +export type CachedRedisValueOptions = { + redis: RedisOptions; + key: string; + parse: (raw: string | null) => T; + defaultValue: T; + cacheTtlMs?: number; + logger?: Logger; + loggerName?: string; +}; + +// Reads a Redis key with a short stale-while-revalidate cache and a synchronous getter for +// hot paths. Warms eagerly on construction; concurrent refreshes dedupe onto one GET so an +// awaited refresh always resolves to a completed read. +export class CachedRedisValue { + private readonly redis: Redis; + private readonly key: string; + private readonly parse: (raw: string | null) => T; + private readonly cacheTtlMs: number; + private readonly logger: Logger; + private value: T; + private lastFetchedAt = 0; + private refreshPromise?: Promise; + + constructor(options: CachedRedisValueOptions) { + this.logger = options.logger ?? new Logger(options.loggerName ?? "CachedRedisValue", "warn"); + this.redis = createRedisClient( + { ...options.redis, keyPrefix: undefined }, + { onError: (error) => this.logger.error("cached value redis error", { error, key: options.key }) } + ); + this.key = options.key; + this.parse = options.parse; + this.cacheTtlMs = options.cacheTtlMs ?? 
10_000; + this.value = options.defaultValue; + void this.refresh(); + } + + get(): T { + if (Date.now() - this.lastFetchedAt > this.cacheTtlMs) { + void this.refresh(); + } + return this.value; + } + + async refresh(): Promise { + if (this.refreshPromise) return this.refreshPromise; + this.refreshPromise = this.#doRefresh(); + try { + return await this.refreshPromise; + } finally { + this.refreshPromise = undefined; + } + } + + async #doRefresh(): Promise { + try { + this.value = this.parse(await this.redis.get(this.key)); + } catch (error) { + this.logger.debug("cached value refresh failed, keeping cached value", { + error, + key: this.key, + }); + } finally { + this.lastFetchedAt = Date.now(); + } + return this.value; + } + + async close(): Promise { + await this.redis.quit(); + } +} + +export type CachedRedisNumberOptions = { + redis: RedisOptions; + key: string; + defaultValue: number; + min?: number; + max?: number; + cacheTtlMs?: number; + logger?: Logger; +}; + +// Live-tunable numeric value, clamped to [min,max]; falls back to defaultValue on a +// missing/unparseable key. Exposes a synchronous value() for hot paths. +export class CachedRedisNumber { + private readonly inner: CachedRedisValue; + + constructor(options: CachedRedisNumberOptions) { + const min = options.min ?? Number.NEGATIVE_INFINITY; + const max = options.max ?? Number.POSITIVE_INFINITY; + const clamp = (n: number) => Math.min(max, Math.max(min, n)); + const fallback = clamp(options.defaultValue); + this.inner = new CachedRedisValue({ + redis: options.redis, + key: options.key, + parse: (raw) => { + const n = raw == null ? Number.NaN : Number(raw); + return Number.isFinite(n) ? 
clamp(n) : fallback; + }, + defaultValue: fallback, + cacheTtlMs: options.cacheTtlMs, + logger: options.logger, + loggerName: "CachedRedisNumber", + }); + } + + value(): number { + return this.inner.get(); + } + + refresh(): Promise { + return this.inner.refresh(); + } + + close(): Promise { + return this.inner.close(); + } +} diff --git a/internal-packages/metrics-pipeline/src/consumer.test.ts b/internal-packages/metrics-pipeline/src/consumer.test.ts new file mode 100644 index 00000000000..ff4406ed449 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/consumer.test.ts @@ -0,0 +1,332 @@ +import { createRedisClient } from "@internal/redis"; +import { redisTest } from "@internal/testcontainers"; +import { expect } from "vitest"; +import { CachedRedisFlag } from "./flag.js"; +import { CachedRedisNumber } from "./cachedValue.js"; +import { MetricsStreamConsumer } from "./consumer.js"; +import { MetricsStreamEmitter } from "./emitter.js"; +import { shardFor } from "./hash.js"; +import { streamKey, type MetricDefinition } from "./types.js"; + +async function waitFor(cond: () => boolean, timeoutMs = 5000): Promise { + const start = Date.now(); + while (!cond()) { + if (Date.now() - start > timeoutMs) throw new Error("waitFor timed out"); + await new Promise((r) => setTimeout(r, 50)); + } +} + +function definitionFor(suffix: string, shardCount = 2): MetricDefinition { + return { name: `qm_${Date.now()}_${suffix}`, shardCount, consumerGroup: "cg", maxLen: 1000 }; +} + +redisTest("emitter -> consumer round trip maps rows, dedups, and acks", async ({ redisOptions }) => { + const definition = definitionFor("rt"); + const emitter = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true } }); + const inserted: Array<{ rows: Array>; dedupToken: string }> = []; + + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows, { 
dedupToken }) => { + inserted.push({ rows, dedupToken }); + }, + blockMs: 200, + }); + + await consumer.start(); + emitter.emit("queueA", { op: "enqueue", q: "queueA" }); + emitter.emit("queueB", { op: "started", q: "queueB", wait: 42 }); + + await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); + await consumer.stop(); + + const rows = inserted.flatMap((i) => i.rows); + expect(rows).toContainEqual(expect.objectContaining({ op: "enqueue", q: "queueA" })); + expect(rows).toContainEqual(expect.objectContaining({ op: "started", q: "queueB", wait: "42" })); + expect(inserted[0]!.dedupToken).toMatch(/^[0-9a-f]{40}$/); + + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + for (const key of consumer.streamKeys()) { + const pending = (await admin.xpending(key, definition.consumerGroup)) as [number, ...unknown[]]; + expect(pending[0]).toBe(0); + } + await admin.quit(); + await emitter.close(); +}); + +redisTest("emit is a no-op when the flag is disabled", async ({ redisOptions }) => { + const definition = definitionFor("off"); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => false }, + }); + + emitter.emit("q", { op: "enqueue", q: "q" }); + await new Promise((r) => setTimeout(r, 200)); + + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const len = await admin.xlen(streamKey(definition, shardFor("q", definition.shardCount))); + expect(len).toBe(0); + await admin.quit(); + await emitter.close(); +}); + +redisTest("reclaims stale pending entries from a dead consumer", async ({ redisOptions }) => { + const definition = definitionFor("claim", 1); + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, 0); + + await admin.xgroup("CREATE", key, definition.consumerGroup, "$", "MKSTREAM"); + await admin.xadd(key, "*", "op", "ack", "q", "qZ"); + await admin.xadd(key, "*", "op", "nack", "q", "qZ"); + 
await admin.xreadgroup("GROUP", definition.consumerGroup, "zombie", "COUNT", 10, "STREAMS", key, ">"); + + const inserted: Array> = []; + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "live", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows) => { + inserted.push(...rows); + }, + blockMs: 200, + claimIdleMs: 0, + }); + + await consumer.start(); + await waitFor(() => inserted.length >= 2); + await consumer.stop(); + + expect(inserted.map((r) => r.op).sort()).toEqual(["ack", "nack"]); + const pending = (await admin.xpending(key, definition.consumerGroup)) as [number, ...unknown[]]; + expect(pending[0]).toBe(0); + await admin.quit(); +}); + +redisTest("per-stream batches: one insert + distinct dedup token per shard stream", async ({ redisOptions }) => { + const definition = definitionFor("pershard", 2); + const emitter = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true } }); + // Two shard keys that land on different shards. + const a = "shardkey-a"; + let b = "shardkey-b0"; + for (let i = 1; shardFor(b, 2) === shardFor(a, 2); i++) b = `shardkey-b${i}`; + + const inserted: Array<{ rows: Array>; dedupToken: string }> = []; + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows, { dedupToken }) => { + inserted.push({ rows, dedupToken }); + }, + blockMs: 200, + }); + + await consumer.start(); + emitter.emit(a, { op: "enqueue", q: a }); + emitter.emit(b, { op: "enqueue", q: b }); + await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); + await consumer.stop(); + await emitter.close(); + + // Each shard's batch is its own dedup block with its own (stream-scoped) token. 
+ const batchesWithRows = inserted.filter((i) => i.rows.length > 0); + expect(batchesWithRows.length).toBe(2); + expect(new Set(batchesWithRows.map((i) => i.dedupToken)).size).toBe(2); +}); + +redisTest("probe reports lag as null (not 0) when Redis cannot compute it", async ({ redisOptions }) => { + const definition = definitionFor("nillag", 1); + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, 0); + + await admin.xgroup("CREATE", key, definition.consumerGroup, "0", "MKSTREAM"); + const ids: string[] = []; + for (let i = 0; i < 5; i++) { + ids.push((await admin.xadd(key, "*", "op", "enqueue", "q", "qT")) as string); + } + // SETID to an arbitrary id makes the group's entries-read unknown => lag is nil + // (severe trimming can do the same in prod); the probe must NOT report that as 0. + await admin.xgroup("SETID", key, definition.consumerGroup, ids[2]!); + + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async () => {}, + }); + try { + const states = await consumer.streamState(); + expect(states[0]!.lag).toBeNull(); + } finally { + await consumer.stop(); + await admin.quit(); + } +}); + +redisTest("emitGauge XADDs an op=gauge snapshot onto the shared metrics stream", async ({ redisOptions }) => { + const definition = definitionFor("gauge", 2); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + }); + + emitter.emitGauge("q1", { + op: "gauge", + q: "q1", + ql: 5, + cc: 2, + lim: 10, + eql: 3, + ec: 1, + elim: 20, + thr: 0, + }); + + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, shardFor("q1", 2)); + // Plain XADD (no odometer, no cum=0 seed) => exactly one entry, unlike counter emit(). 
+ await waitFor2(async () => (await admin.xlen(key)) === 1); + const raw = (await admin.xrange(key, "-", "+")) as Array<[string, string[]]>; + const flat = raw[0]![1]; + const fields: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] = flat[i + 1]!; + expect(fields.op).toBe("gauge"); + expect(fields.q).toBe("q1"); + expect(fields.ql).toBe("5"); + expect(fields.thr).toBe("0"); + await admin.quit(); + await emitter.close(); +}); + +async function waitFor2(cond: () => Promise, timeoutMs = 5000): Promise { + const start = Date.now(); + while (!(await cond())) { + if (Date.now() - start > timeoutMs) throw new Error("waitFor2 timed out"); + await new Promise((r) => setTimeout(r, 50)); + } +} + +redisTest("sampledSync gates on both the flag and the sample rate", async ({ redisOptions }) => { + const definition = definitionFor("sample"); + const off = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true }, gaugeSampleRate: 0 }); + const on = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true }, gaugeSampleRate: 1 }); + const disabled = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => false }, gaugeSampleRate: 1 }); + + expect(off.sampledSync()).toBe(false); // rate 0 => never sampled in + expect(on.sampledSync()).toBe(true); // rate 1 + enabled => always + expect(disabled.sampledSync()).toBe(false); // disabled => never, regardless of rate + expect(on.enabledSync()).toBe(true); // enabledSync (counters) is unaffected by sampling + + await Promise.all([off.close(), on.close(), disabled.close()]); +}); + +redisTest("sampledSync honors a live rate provider (no reconstruct)", async ({ redisOptions }) => { + const definition = definitionFor("live"); + let rate = 1; + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + gaugeSampleRate: { value: () => rate }, + }); + 
expect(emitter.sampledSync()).toBe(true); + rate = 0; + expect(emitter.sampledSync()).toBe(false); + await emitter.close(); +}); + +redisTest("CachedRedisNumber reads live, clamps, and falls back", async ({ redisOptions }) => { + const key = `rate_${Date.now()}`; + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const num = new CachedRedisNumber({ redis: redisOptions, key, defaultValue: 1, min: 0, max: 1 }); + + await num.refresh(); + expect(num.value()).toBe(1); // missing key => default + await admin.set(key, "0.25"); + await num.refresh(); + expect(num.value()).toBe(0.25); + await admin.set(key, "5"); + await num.refresh(); + expect(num.value()).toBe(1); // out of range => clamped + await admin.set(key, "nonsense"); + await num.refresh(); + expect(num.value()).toBe(1); // unparseable => default + + await num.close(); + await admin.quit(); +}); + +redisTest("streamState reports depth, lag, and pending per shard", async ({ redisOptions }) => { + const definition = definitionFor("state", 1); + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, 0); + + await admin.xgroup("CREATE", key, definition.consumerGroup, "$", "MKSTREAM"); + await admin.xadd(key, "*", "op", "enqueue", "q", "qX"); + await admin.xadd(key, "*", "op", "ack", "q", "qX"); + // Read one entry as some consumer and leave it unacked -> 1 pending, 1 still undelivered. 
+ await admin.xreadgroup("GROUP", definition.consumerGroup, "reader", "COUNT", 1, "STREAMS", key, ">"); + + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async () => {}, + }); + + try { + const states = await consumer.streamState(); + expect(states).toHaveLength(1); + expect(states[0]!.depth).toBe(2); + expect(states[0]!.pending).toBe(1); + expect(states[0]!.lag).toBe(1); + } finally { + await consumer.stop(); + await admin.quit(); + } +}); + +redisTest("CachedRedisFlag reads a redis key with caching", async ({ redisOptions }) => { + const key = `flag_${Date.now()}`; + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const flag = new CachedRedisFlag({ redis: redisOptions, key, cacheTtlMs: 10_000 }); + + expect(flag.enabled()).toBe(false); + await flag.refresh(); + expect(flag.enabled()).toBe(false); + + await admin.set(key, "1"); + await flag.refresh(); + expect(flag.enabled()).toBe(true); + + await admin.set(key, "0"); + await flag.refresh(); + expect(flag.enabled()).toBe(false); + + await flag.close(); + await admin.quit(); +}); + +redisTest("CachedRedisFlag warms eagerly on construction", async ({ redisOptions }) => { + const key = `flag_eager_${Date.now()}`; + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + await admin.set(key, "1"); + + const flag = new CachedRedisFlag({ redis: redisOptions, key }); + // No manual refresh(): the constructor kicks one off so the first real read is warm. 
+ await waitFor(() => flag.enabled() === true); + expect(flag.enabled()).toBe(true); + + await flag.close(); + await admin.quit(); +}); diff --git a/internal-packages/metrics-pipeline/src/consumer.ts b/internal-packages/metrics-pipeline/src/consumer.ts new file mode 100644 index 00000000000..c088f56f14e --- /dev/null +++ b/internal-packages/metrics-pipeline/src/consumer.ts @@ -0,0 +1,336 @@ +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import { + getMeter, + type Counter, + type Histogram, + type Meter, + type ObservableGauge, + ValueType, +} from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { dedupTokenFromEntryIds } from "./idempotency.js"; +import { allStreamKeys, type MetricDefinition, type StreamEntry } from "./types.js"; + +export type MetricsStreamConsumerOptions = { + redis: RedisOptions; + definition: MetricDefinition; + /** Unique per process; distinct replicas MUST use distinct names (PEL ownership). */ + consumerName: string; + /** Map a stream entry to a row, or null to drop it (still acked). */ + mapEntry: (entry: StreamEntry) => TRow | null; + /** Insert a batch. Must be idempotent w.r.t. dedupToken; throw to retry the batch. */ + insert: (rows: TRow[], opts: { dedupToken: string }) => Promise; + batchSize?: number; + blockMs?: number; + claimIdleMs?: number; + /** How often to scan for stale pending entries (XAUTOCLAIM); not every poll. */ + reclaimIntervalMs?: number; + errorBackoffMs?: number; + logger?: Logger; + meter?: Meter; +}; + +type RawEntry = [id: string, fields: string[]]; +type RawStream = [key: string, entries: RawEntry[]]; + +/** Per-shard stream health, surfaced as observable gauges and usable directly in tests. + * `lag: null` means Redis could not compute it (entries trimmed past the group's read + * position) — treat as an alert, NOT as zero: it coincides with data loss. 
*/ +export type ShardState = { shard: number; depth: number; lag: number | null; pending: number }; + +function parseFields(flat: string[]): Record { + const out: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) { + out[flat[i]!] = flat[i + 1]!; + } + return out; +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Reads a sharded metrics stream via a consumer group, inserting each stream's poll-batch + * as its own dedup block (so an XAUTOCLAIM-reclaimed batch re-forms the same id set and + * token), acking only after a successful insert. Sequential read/insert/ack per process. + */ +export class MetricsStreamConsumer { + private readonly redis: Redis; + private readonly probeRedis: Redis; + private readonly def: MetricDefinition; + private readonly keys: string[]; + private readonly consumerName: string; + private readonly batchSize: number; + private readonly blockMs: number; + private readonly claimIdleMs: number; + private readonly reclaimIntervalMs: number; + private lastReclaimAt = 0; + private readonly errorBackoffMs: number; + private readonly logger: Logger; + private readonly mapEntry: (entry: StreamEntry) => TRow | null; + private readonly insert: (rows: TRow[], opts: { dedupToken: string }) => Promise; + + private readonly meter: Meter; + private readonly entriesCounter: Counter; + private readonly rowsCounter: Counter; + private readonly insertErrorCounter: Counter; + private readonly insertDuration: Histogram; + private readonly observables: ObservableGauge[]; + private readonly batchCallback: Parameters[0]; + + private running = false; + private loopPromise?: Promise; + + constructor(options: MetricsStreamConsumerOptions) { + this.logger = options.logger ?? 
new Logger("MetricsStreamConsumer", "info"); + const redisConfig = { ...options.redis, keyPrefix: undefined }; + this.redis = createRedisClient(redisConfig, { + onError: (error) => this.logger.error("consumer redis error", { error }), + }); + // Separate client so the observable-gauge probes never queue behind the blocking XREADGROUP. + this.probeRedis = createRedisClient(redisConfig, { + onError: (error) => this.logger.error("consumer probe redis error", { error }), + }); + this.def = options.definition; + this.keys = allStreamKeys(options.definition); + this.consumerName = options.consumerName; + this.batchSize = options.batchSize ?? 1000; + this.blockMs = options.blockMs ?? 1000; + this.claimIdleMs = options.claimIdleMs ?? 60_000; + this.reclaimIntervalMs = options.reclaimIntervalMs ?? 15_000; + this.errorBackoffMs = options.errorBackoffMs ?? 1000; + this.mapEntry = options.mapEntry; + this.insert = options.insert; + + this.meter = options.meter ?? getMeter("metrics-pipeline"); + this.entriesCounter = this.meter.createCounter("queue_metrics.consumer.entries", { + description: "Stream entries read (attr source=new|reclaimed)", + valueType: ValueType.INT, + }); + this.rowsCounter = this.meter.createCounter("queue_metrics.consumer.rows_inserted", { + description: "Rows inserted into the sink", + valueType: ValueType.INT, + }); + this.insertErrorCounter = this.meter.createCounter("queue_metrics.consumer.insert_errors", { + description: "Failed inserts (batch left pending for retry)", + valueType: ValueType.INT, + }); + this.insertDuration = this.meter.createHistogram("queue_metrics.consumer.insert_duration_ms", { + description: "Sink insert latency", + unit: "ms", + valueType: ValueType.INT, + }); + + const depthGauge = this.meter.createObservableGauge("queue_metrics.consumer.stream_depth", { + description: "Entries currently in each shard stream (approaches MAXLEN => trimming)", + valueType: ValueType.INT, + }); + const lagGauge = 
this.meter.createObservableGauge("queue_metrics.consumer.group_lag", { + description: "Entries not yet delivered to the consumer group (consumer falling behind)", + valueType: ValueType.INT, + }); + const pendingGauge = this.meter.createObservableGauge("queue_metrics.consumer.pending", { + description: "Unacked (in-flight or stuck) entries in the group PEL", + valueType: ValueType.INT, + }); + const lagUnknownGauge = this.meter.createObservableGauge("queue_metrics.consumer.lag_unknown", { + description: "1 when Redis cannot compute group lag (entries trimmed => data loss); alert on this", + valueType: ValueType.INT, + }); + this.observables = [depthGauge, lagGauge, pendingGauge, lagUnknownGauge]; + this.batchCallback = async (result) => { + const states = await this.streamState(); + for (const s of states) { + const attrs = { stream: this.def.name, shard: String(s.shard) }; + result.observe(depthGauge, s.depth, attrs); + if (s.lag !== null) result.observe(lagGauge, s.lag, attrs); + result.observe(lagUnknownGauge, s.lag === null ? 1 : 0, attrs); + result.observe(pendingGauge, s.pending, attrs); + } + }; + this.meter.addBatchObservableCallback(this.batchCallback, this.observables); + } + + async start(): Promise { + if (this.running) return; + await this.ensureGroups(); + this.running = true; + this.loopPromise = this.loop(); + } + + async stop(): Promise { + this.running = false; + this.meter.removeBatchObservableCallback(this.batchCallback, this.observables); + await this.loopPromise?.catch(() => {}); + await Promise.all([ + this.redis.quit().catch(() => {}), + this.probeRedis.quit().catch(() => {}), + ]); + } + + private async ensureGroups(): Promise { + for (const key of this.keys) { + try { + // "0" (not "$"): a brand-new stream's group must not skip entries emitted + // between emitter boot and the first consumer's group creation. 
+ await this.redis.xgroup("CREATE", key, this.def.consumerGroup, "0", "MKSTREAM"); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (!message.includes("BUSYGROUP")) throw error; + } + } + } + + private async loop(): Promise { + while (this.running) { + try { + if (Date.now() - this.lastReclaimAt >= this.reclaimIntervalMs) { + this.lastReclaimAt = Date.now(); + await this.reclaimStale(); + } + await this.readNew(); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + // Self-heal a missing group (stream trimmed to nothing / deleted / Redis flushed): + // recreate it rather than wedging the loop on NOGROUP forever. + if (message.includes("NOGROUP")) { + this.logger.warn("consumer group missing; recreating", { error }); + await this.ensureGroups().catch(() => {}); + } else { + this.logger.error("consumer loop iteration failed", { error }); + } + await sleep(this.errorBackoffMs); + } + } + } + + private async readNew(): Promise { + const ids = this.keys.map(() => ">"); + const response = (await this.redis.xreadgroup( + "GROUP", + this.def.consumerGroup, + this.consumerName, + "COUNT", + this.batchSize, + "BLOCK", + this.blockMs, + "STREAMS", + ...this.keys, + ...ids + )) as RawStream[] | null; + + if (!response) return 0; + return this.processStreams(response, "new"); + } + + private async reclaimStale(): Promise { + for (const key of this.keys) { + const result = (await this.redis.xautoclaim( + key, + this.def.consumerGroup, + this.consumerName, + this.claimIdleMs, + "0", + "COUNT", + this.batchSize + )) as [string, RawEntry[], string[]] | null; + + const entries = result?.[1] ?? []; + if (entries.length === 0) continue; + await this.processStreams([[key, entries]], "reclaimed"); + } + } + + // One insert (dedup block) and XACK per stream, so a reclaimed batch re-forms the + // original per-stream id set and token. 
On insert failure that stream's entries stay + // pending for a later XAUTOCLAIM; other streams still progress. + private async processStreams(streams: RawStream[], source: "new" | "reclaimed"): Promise { + let processed = 0; + let firstError: unknown; + + for (const [key, entries] of streams) { + if (entries.length === 0) continue; + const keyIds: string[] = []; + const rows: TRow[] = []; + for (const [id, flat] of entries) { + keyIds.push(id); + const row = this.mapEntry({ id, fields: parseFields(flat) }); + if (row !== null) rows.push(row); + } + this.entriesCounter.add(keyIds.length, { source }); + + if (rows.length > 0) { + const startedAt = Date.now(); + try { + await this.insert(rows, { dedupToken: dedupTokenFromEntryIds(keyIds, key) }); + } catch (error) { + this.insertErrorCounter.add(1); + firstError ??= error; + continue; + } finally { + this.insertDuration.record(Date.now() - startedAt); + } + this.rowsCounter.add(rows.length); + } + + await this.redis.xack(key, this.def.consumerGroup, ...keyIds); + processed += keyIds.length; + } + + if (firstError !== undefined) throw firstError; + return processed; + } + + /** Per-shard depth (XLEN), group lag, and pending — the consumer-health signals. */ + async streamState(): Promise { + return probeShardStates(this.probeRedis, this.keys, this.def.consumerGroup); + } + + /** All shard stream keys this consumer reads (for diagnostics/tests). */ + streamKeys(): string[] { + return this.keys.slice(); + } +} + +/** + * Per-shard depth/lag/pending for a metric stream — usable without a running consumer + * (e.g. from an admin route). `redis` should have keyPrefix unset, matching the stream keys. 
+ */ +export async function probeShardStates( + redis: Redis, + keys: string[], + consumerGroup: string +): Promise { + const out: ShardState[] = []; + for (let shard = 0; shard < keys.length; shard++) { + const key = keys[shard]!; + const depth = Number(await redis.xlen(key)) || 0; + // lag defaults to null (unknown) and only becomes a number when Redis reports one: + // a nil lag means entries were trimmed past the group's read position (data loss). + let lag: number | null = 0; + let pending = 0; + try { + const groups = (await redis.call("XINFO", "GROUPS", key)) as unknown[]; + for (const raw of groups) { + const info = flatToMap(raw as unknown[]); + if (info.name === consumerGroup) { + const rawLag = info.lag; + lag = rawLag == null ? null : Number(rawLag); + if (lag !== null && !Number.isFinite(lag)) lag = null; + pending = Number(info.pending) || 0; + } + } + } catch { + // Stream/group may not exist yet; treat as zero. + } + out.push({ shard, depth, lag, pending }); + } + return out; +} + +function flatToMap(flat: unknown[]): Record { + const out: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) { + out[String(flat[i])] = flat[i + 1]; + } + return out; +} diff --git a/internal-packages/metrics-pipeline/src/emitter.ts b/internal-packages/metrics-pipeline/src/emitter.ts new file mode 100644 index 00000000000..7bab52176b6 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/emitter.ts @@ -0,0 +1,167 @@ +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import { getMeter, type Counter, type Meter, ValueType } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { shardFor } from "./hash.js"; +import { streamKey, type MetricDefinition, type MetricFields } from "./types.js"; + +export type MetricsStreamEmitterOptions = { + redis: RedisOptions; + definition: MetricDefinition; + /** Synchronous enabled check (e.g. CachedRedisFlag); emits are no-ops when false. 
*/ + flag: { enabled(): boolean }; + /** Probability (0..1) that a sampled emission fires; applies to `sampledSync()`, not + * `emit()`. Pass a `{ value() }` provider (e.g. CachedRedisNumber) to tune it live + * without a redeploy. Default 1 (always). */ + gaugeSampleRate?: number | { value(): number }; + /** TTL (ms) refreshed on every counter write on the per-(queue,op) odometer key. + * Active queues never expire; idle-past-TTL queues purge and self-heal on return. + * Default 7 days. */ + counterOdometerTtlMs?: number; + logger?: Logger; + meter?: Meter; +}; + +type CumulativeCommand = ( + odometerKey: string, + streamKey: string, + ttlMs: string, + maxLen: string, + op: string, + q: string, + ...extraFields: string[] +) => Promise; + +// INCR the odometer, refresh its TTL, and XADD the reading (new value as `cum`) in one round +// trip. Refresh-on-write is load-bearing: only genuinely idle queues expire. On first creation +// (v==1) XADD a cum=0 baseline first (smaller stream id => sorts first) so deltaSum captures the +// 0->1 transition and the total reconstructs exactly. +// ARGV: [1]=ttlMs [2]=maxLen [3]=op [4]=q [5..]=extra field/value pairs (e.g. wait). +const CUMULATIVE_LUA = ` +local v = redis.call('INCR', KEYS[1]) +redis.call('PEXPIRE', KEYS[1], ARGV[1]) +local maxlen = tonumber(ARGV[2]) or 0 +local function xadd(cum, withExtra) + local x = {'XADD', KEYS[2]} + if maxlen > 0 then x[#x+1]='MAXLEN'; x[#x+1]='~'; x[#x+1]=ARGV[2] end + x[#x+1]='*' + x[#x+1]='op'; x[#x+1]=ARGV[3] + x[#x+1]='q'; x[#x+1]=ARGV[4] + if withExtra then for i=5,#ARGV do x[#x+1]=ARGV[i] end end + x[#x+1]='cum'; x[#x+1]=cum + redis.call(unpack(x)) +end +if v == 1 then xadd(0, false) end +xadd(v, true) +`; + +/** Node-side producer: XADDs events to a sharded metrics stream, gated on a flag. 
*/ +export class MetricsStreamEmitter { + private readonly redis: Redis; + private readonly def: MetricDefinition; + private readonly flag: { enabled(): boolean }; + private readonly sampleRate: () => number; + private readonly odometerTtlMs: number; + private readonly logger: Logger; + private readonly emittedCounter: Counter; + private readonly errorCounter: Counter; + + constructor(options: MetricsStreamEmitterOptions) { + this.logger = options.logger ?? new Logger("MetricsStreamEmitter", "warn"); + this.redis = createRedisClient( + { ...options.redis, keyPrefix: undefined }, + { onError: (error) => this.logger.error("emitter redis error", { error }) } + ); + this.redis.defineCommand("qmEmitCumulative", { numberOfKeys: 2, lua: CUMULATIVE_LUA }); + this.odometerTtlMs = options.counterOdometerTtlMs ?? 7 * 24 * 60 * 60 * 1000; + this.def = options.definition; + this.flag = options.flag; + const rate = options.gaugeSampleRate; + if (typeof rate === "object") { + this.sampleRate = () => rate.value(); + } else { + const fixed = Math.min(1, Math.max(0, rate ?? 1)); + this.sampleRate = () => fixed; + } + + const meter = options.meter ?? getMeter("metrics-pipeline"); + this.emittedCounter = meter.createCounter("queue_metrics.emitter.emitted", { + description: "Node-side metric events XADDed to the stream", + valueType: ValueType.INT, + }); + this.errorCounter = meter.createCounter("queue_metrics.emitter.errors", { + description: "Failed metric-event XADDs (dropped)", + valueType: ValueType.INT, + }); + } + + enabledSync(): boolean { + return this.flag.enabled(); + } + + // Enabled AND (probabilistically) sampled-in. For high-frequency sampled emissions + // (e.g. Lua gauges); exact-count events use enabledSync()/emit() and are never sampled. 
+ sampledSync(): boolean { + if (!this.flag.enabled()) return false; + const rate = this.sampleRate(); + if (rate >= 1) return true; + if (rate <= 0) return false; + return Math.random() < rate; + } + + // Fire-and-forget gauge emit: a plain XADD of an op=gauge snapshot (no odometer). The + // gauge value was read atomically inside the queue op's Lua and returned on the reply; + // this just lands it on the metrics stream. Loss-tolerant (sampled), never throws into + // the caller. Shares the counter stream (one stream family on the metrics Redis). + emitGauge(shardKey: string, fields: MetricFields): void { + if (!this.flag.enabled()) return; + const op = String(fields.op ?? "gauge"); + const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount)); + const args: string[] = []; + if (this.def.maxLen) args.push("MAXLEN", "~", String(this.def.maxLen)); + args.push("*"); + for (const [field, value] of Object.entries(fields)) { + args.push(field, String(value)); + } + this.emittedCounter.add(1, { op }); + this.redis.xadd(stream, ...(args as [string, ...string[]])).catch((error) => { + this.errorCounter.add(1); + this.logger.debug("metrics gauge emit failed", { error, stream }); + }); + } + + // Fire-and-forget cumulative counter emit: advances the per-(queue,op) odometer and + // XADDs its new absolute value. No-op when disabled, never throws into the caller. A + // lost XADD self-heals (the next reading restates the total); the INCR is never sampled. + emit(shardKey: string, fields: MetricFields): void { + if (!this.flag.enabled()) return; + const op = String(fields.op ?? "unknown"); + const q = String(fields.q ?? 
""); + const odometerKey = `queue_metrics_cum:${op}:${q}`; + const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount)); + const extra: string[] = []; + for (const [field, value] of Object.entries(fields)) { + if (field === "op" || field === "q") continue; + extra.push(field, String(value)); + } + this.emittedCounter.add(1, { op }); + const client = this.redis as unknown as { qmEmitCumulative: CumulativeCommand }; + client + .qmEmitCumulative( + odometerKey, + stream, + String(this.odometerTtlMs), + String(this.def.maxLen ?? 0), + op, + q, + ...extra + ) + .catch((error) => { + this.errorCounter.add(1); + this.logger.debug("metrics emit failed", { error, stream }); + }); + } + + async close(): Promise { + await this.redis.quit(); + } +} diff --git a/internal-packages/metrics-pipeline/src/flag.ts b/internal-packages/metrics-pipeline/src/flag.ts new file mode 100644 index 00000000000..6573e55789f --- /dev/null +++ b/internal-packages/metrics-pipeline/src/flag.ts @@ -0,0 +1,46 @@ +import type { RedisOptions } from "@internal/redis"; +import { Logger } from "@trigger.dev/core/logger"; +import { CachedRedisValue } from "./cachedValue.js"; + +export type CachedRedisFlagOptions = { + redis: RedisOptions; + /** Redis key holding the flag. A value of "1"/"true"/"on"/"enabled"/"yes" (trimmed, case-insensitive) is truthy. */ + key: string; + cacheTtlMs?: number; + defaultValue?: boolean; + logger?: Logger; +}; + +const TRUTHY = new Set(["1", "true", "on", "enabled", "yes"]); + +/** + * Boolean feature flag from a Redis key with a short stale-while-revalidate cache, + * exposing a synchronous getter for hot paths (building Lua ARGV on every op). + */ +export class CachedRedisFlag { + private readonly inner: CachedRedisValue; + + constructor(options: CachedRedisFlagOptions) { + this.inner = new CachedRedisValue({ + redis: options.redis, + key: options.key, + parse: (raw) => raw != null && TRUTHY.has(raw.trim().toLowerCase()), + defaultValue: options.defaultValue ?? 
false, + cacheTtlMs: options.cacheTtlMs, + logger: options.logger, + loggerName: "CachedRedisFlag", + }); + } + + enabled(): boolean { + return this.inner.get(); + } + + refresh(): Promise { + return this.inner.refresh(); + } + + async close(): Promise { + await this.inner.close(); + } +} diff --git a/internal-packages/metrics-pipeline/src/hash.ts b/internal-packages/metrics-pipeline/src/hash.ts new file mode 100644 index 00000000000..b14324c138a --- /dev/null +++ b/internal-packages/metrics-pipeline/src/hash.ts @@ -0,0 +1,15 @@ +/** FNV-1a 32-bit hash. Deterministic across processes; used only for sharding. */ +export function fnv1a32(str: string): number { + let hash = 0x811c9dc5; + for (let i = 0; i < str.length; i++) { + hash ^= str.charCodeAt(i); + hash = Math.imul(hash, 0x01000193); + } + return hash >>> 0; +} + +/** Deterministic shard index in [0, shardCount) for a key. */ +export function shardFor(key: string, shardCount: number): number { + if (shardCount <= 1) return 0; + return fnv1a32(key) % shardCount; +} diff --git a/internal-packages/metrics-pipeline/src/idempotency.ts b/internal-packages/metrics-pipeline/src/idempotency.ts new file mode 100644 index 00000000000..b9f5ead0a5e --- /dev/null +++ b/internal-packages/metrics-pipeline/src/idempotency.ts @@ -0,0 +1,9 @@ +import { createHash } from "node:crypto"; + +// Deterministic, order-independent token over a batch of entry ids. A redelivered +// batch yields the same token, so ClickHouse's raw-table dedup window drops the replay. +// `scope` (the stream key) disambiguates id sets that could collide across streams. 
+export function dedupTokenFromEntryIds(ids: string[], scope = ""): string { + const sorted = [...ids].sort(); + return createHash("sha1").update(`${scope}|${sorted.join(",")}`).digest("hex"); +} diff --git a/internal-packages/metrics-pipeline/src/index.ts b/internal-packages/metrics-pipeline/src/index.ts new file mode 100644 index 00000000000..223c5feab17 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/index.ts @@ -0,0 +1,26 @@ +export { CachedRedisFlag, type CachedRedisFlagOptions } from "./flag.js"; +export { + CachedRedisNumber, + type CachedRedisNumberOptions, + CachedRedisValue, + type CachedRedisValueOptions, +} from "./cachedValue.js"; +export { MetricsStreamEmitter, type MetricsStreamEmitterOptions } from "./emitter.js"; +export { + MetricsStreamConsumer, + type MetricsStreamConsumerOptions, + type ShardState, + probeShardStates, +} from "./consumer.js"; +export { createMetricsGaugeComputeLua, type GaugeComputeLuaParams } from "./lua.js"; +export { dedupTokenFromEntryIds } from "./idempotency.js"; +export { shardFor, fnv1a32 } from "./hash.js"; +export { + streamKey, + allStreamKeys, + entryTimeMs, + entryOrderKey, + type MetricDefinition, + type MetricFields, + type StreamEntry, +} from "./types.js"; diff --git a/internal-packages/metrics-pipeline/src/lua.ts b/internal-packages/metrics-pipeline/src/lua.ts new file mode 100644 index 00000000000..b4940c3cef6 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/lua.ts @@ -0,0 +1,41 @@ +// Each field is a Lua expression evaluated inside the target script. queueLimit/ +// envLimit must be the EFFECTIVE enforced limit, else an unset limit reads as throttled. +export type GaugeComputeLuaParams = { + // Lua boolean expression; when true the gauge is computed (else the extra reads are skipped). + enabledArg: string; + queued: string; + running: string; + queueLimit: string; + envQueued: string; + envRunning: string; + envLimit: string; + // Lua statements run first inside the pcall (e.g. 
to compute aggregate locals). + preamble?: string; + // Lua boolean expression (in __cc/__lim/__ql) for the throttled flag. Pass "false" + // where cc >= lim is not a valid throttle signal (e.g. summed CK aggregates). + throttledExpr?: string; +}; + +// Computes an op=gauge snapshot into the enclosing script's `__qm_g` local (a flat +// {ql, cc, lim, eql, ec, elim, thr} array) so the script can RETURN it; Node then XADDs it +// to the metrics Redis. No Redis write here (the run-queue Redis carries no metrics stream). +// Gated on the sample flag and pcall-wrapped. The script MUST declare `local __qm_g` first. +export function createMetricsGaugeComputeLua(params: GaugeComputeLuaParams): string { + const throttled = params.throttledExpr ?? "__cc >= __lim and __ql > 0"; + + return ` +if ${params.enabledArg} then + pcall(function() + ${params.preamble ?? ""} + local __ql = tonumber(${params.queued}) or 0 + local __cc = tonumber(${params.running}) or 0 + local __lim = tonumber(${params.queueLimit}) or 0 + local __eql = tonumber(${params.envQueued}) or 0 + local __ec = tonumber(${params.envRunning}) or 0 + local __elim = tonumber(${params.envLimit}) or 0 + local __thr = 0 + if ${throttled} then __thr = 1 end + __qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr} + end) +end`; +} diff --git a/internal-packages/metrics-pipeline/src/pipeline.test.ts b/internal-packages/metrics-pipeline/src/pipeline.test.ts new file mode 100644 index 00000000000..195d0102ff6 --- /dev/null +++ b/internal-packages/metrics-pipeline/src/pipeline.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from "vitest"; +import { createMetricsGaugeComputeLua } from "./lua.js"; +import { dedupTokenFromEntryIds } from "./idempotency.js"; +import { fnv1a32, shardFor } from "./hash.js"; +import { allStreamKeys, entryTimeMs, streamKey } from "./types.js"; + +describe("shardFor", () => { + it("is deterministic and in range", () => { + expect(shardFor("queueA", 1)).toBe(0); + const s = 
shardFor("queueA", 4); + expect(s).toBeGreaterThanOrEqual(0); + expect(s).toBeLessThan(4); + expect(shardFor("queueA", 4)).toBe(s); + expect(fnv1a32("queueA")).toBe(fnv1a32("queueA")); + }); +}); + +describe("dedupTokenFromEntryIds", () => { + it("is order-independent and set-sensitive", () => { + expect(dedupTokenFromEntryIds(["1-0", "2-0"])).toBe(dedupTokenFromEntryIds(["2-0", "1-0"])); + expect(dedupTokenFromEntryIds(["1-0"])).not.toBe(dedupTokenFromEntryIds(["2-0"])); + expect(dedupTokenFromEntryIds(["1-0"])).toMatch(/^[0-9a-f]{40}$/); + }); +}); + +describe("stream keys", () => { + it("names and parses entry time", () => { + expect(streamKey({ name: "queue_metrics" }, 3)).toBe("queue_metrics:{3}"); + expect(allStreamKeys({ name: "qm", shardCount: 2, consumerGroup: "cg" })).toEqual([ + "qm:{0}", + "qm:{1}", + ]); + expect(entryTimeMs("1717000000000-5")).toBe(1717000000000); + expect(entryTimeMs("nope")).toBeNull(); + }); +}); + +describe("createMetricsGaugeComputeLua", () => { + it("assigns __qm_g inside a gated, pcall-wrapped block and never XADDs", () => { + const lua = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', KEYS[2])", + running: "queueCurrent", + queueLimit: "queueLimit", + envQueued: "redis.call('ZCARD', KEYS[8])", + envRunning: "envCurrent", + envLimit: "envLimit", + }); + + expect(lua).toContain("if ARGV[#ARGV] == '1' then"); + expect(lua).toContain("pcall(function()"); + expect(lua).toContain("__qm_g = {__ql, __cc, __lim, __eql, __ec, __elim, __thr}"); + expect(lua).toContain("if __cc >= __lim and __ql > 0 then __thr = 1 end"); + // The whole point of the refactor: no Redis write happens in the run-queue script. 
+ expect(lua).not.toContain("XADD"); + }); + + it("honors a custom throttled expression and preamble", () => { + const lua = createMetricsGaugeComputeLua({ + enabledArg: "true", + preamble: "local agg = 1", + queued: "0", + running: "0", + queueLimit: "0", + envQueued: "0", + envRunning: "0", + envLimit: "0", + throttledExpr: "false", + }); + expect(lua).toContain("local agg = 1"); + expect(lua).toContain("if false then __thr = 1 end"); + expect(lua).not.toContain("XADD"); + }); +}); diff --git a/internal-packages/metrics-pipeline/src/types.ts b/internal-packages/metrics-pipeline/src/types.ts new file mode 100644 index 00000000000..c5336efa5eb --- /dev/null +++ b/internal-packages/metrics-pipeline/src/types.ts @@ -0,0 +1,40 @@ +export type MetricFields = Record; + +export type StreamEntry = { + id: string; + fields: Record; +}; + +export type MetricDefinition = { + /** Logical name, e.g. "queue_metrics". Used as the stream key prefix. */ + name: string; + shardCount: number; + consumerGroup: string; + /** Approximate MAXLEN cap applied on XADD (`MAXLEN ~ N`). Omit for unbounded. */ + maxLen?: number; +}; + +// Keys are used verbatim on every access path (Lua ARGV, emitter, consumer), so +// they must NOT be subject to an ioredis keyPrefix. `{shard}` is a Cluster hash tag. +export function streamKey(definition: Pick, shard: number): string { + return `${definition.name}:{${shard}}`; +} + +export function allStreamKeys(definition: MetricDefinition): string[] { + return Array.from({ length: Math.max(1, definition.shardCount) }, (_, shard) => + streamKey(definition, shard) + ); +} + +// The ms part of a stream entry id is its emission time. +export function entryTimeMs(id: string): number | null { + const ms = Number(id.split("-")[0]); + return Number.isFinite(ms) ? ms : null; +} + +// Ordering key from a stream id (`ms-seq`): ms*1e5+seq, meant to keep within-ms readings in stream order for deltaSumTimestamp. 
+// NOTE(review): for epoch-ms ids (~1.7e12) the product is ~1.7e17 > Number.MAX_SAFE_INTEGER (~9.0e15), so low seq values in one ms can round to the same key — confirm ties are tolerated. 
+export function entryOrderKey(id: string): number { + const [ms, seq] = id.split("-"); + return (Number(ms) || 0) * 100000 + (Number(seq) || 0); +} diff --git a/internal-packages/metrics-pipeline/test/setup.ts b/internal-packages/metrics-pipeline/test/setup.ts new file mode 100644 index 00000000000..b2bacd6baf5 --- /dev/null +++ b/internal-packages/metrics-pipeline/test/setup.ts @@ -0,0 +1,4 @@ +import { vi } from "vitest"; + +// Set extended timeout for container tests +vi.setConfig({ testTimeout: 60_000 }); diff --git a/internal-packages/metrics-pipeline/tsconfig.build.json b/internal-packages/metrics-pipeline/tsconfig.build.json new file mode 100644 index 00000000000..89c87a3dc67 --- /dev/null +++ b/internal-packages/metrics-pipeline/tsconfig.build.json @@ -0,0 +1,21 @@ +{ + "include": ["src/**/*.ts"], + "exclude": ["src/**/*.test.ts"], + "compilerOptions": { + "composite": true, + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "outDir": "dist", + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "strict": true, + "declaration": true + } +} diff --git a/internal-packages/metrics-pipeline/tsconfig.json b/internal-packages/metrics-pipeline/tsconfig.json new file mode 100644 index 00000000000..af630abe1f1 --- /dev/null +++ b/internal-packages/metrics-pipeline/tsconfig.json @@ -0,0 +1,8 @@ +{ + "references": [{ "path": "./tsconfig.src.json" }, { "path": "./tsconfig.test.json" }], + "compilerOptions": { + "moduleResolution": "Node16", + "module": "Node16", + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/internal-packages/metrics-pipeline/tsconfig.src.json b/internal-packages/metrics-pipeline/tsconfig.src.json new file mode 100644 index 00000000000..0df3d2d222f --- /dev/null +++ 
b/internal-packages/metrics-pipeline/tsconfig.src.json @@ -0,0 +1,20 @@ +{ + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "src/**/*.test.ts"], + "compilerOptions": { + "composite": true, + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "strict": true, + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/internal-packages/metrics-pipeline/tsconfig.test.json b/internal-packages/metrics-pipeline/tsconfig.test.json new file mode 100644 index 00000000000..4c06c9f57bb --- /dev/null +++ b/internal-packages/metrics-pipeline/tsconfig.test.json @@ -0,0 +1,21 @@ +{ + "include": ["src/**/*.test.ts"], + "references": [{ "path": "./tsconfig.src.json" }], + "compilerOptions": { + "composite": true, + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "types": ["vitest/globals"], + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "strict": true, + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/internal-packages/metrics-pipeline/vitest.config.ts b/internal-packages/metrics-pipeline/vitest.config.ts new file mode 100644 index 00000000000..daafd294fa8 --- /dev/null +++ b/internal-packages/metrics-pipeline/vitest.config.ts @@ -0,0 +1,17 @@ +import { defineConfig } from "vitest/config"; +import { DurationShardingSequencer } from "@internal/testcontainers/sequencer"; + +export default defineConfig({ + test: { + sequence: { sequencer: DurationShardingSequencer }, + globals: true, + 
retry: process.env.CI ? 2 : 0, + environment: "node", + setupFiles: ["./test/setup.ts"], + testTimeout: 30000, + hookTimeout: 30000, + }, + esbuild: { + target: "node18", + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1a56a054f42..a49afc04da5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -365,6 +365,9 @@ importers: '@internal/llm-model-catalog': specifier: workspace:* version: link:../../internal-packages/llm-model-catalog + '@internal/metrics-pipeline': + specifier: workspace:* + version: link:../../internal-packages/metrics-pipeline '@internal/redis': specifier: workspace:* version: link:../../internal-packages/redis @@ -1255,6 +1258,25 @@ importers: specifier: 4.1.7 version: 4.1.7(@opentelemetry/api@1.9.1)(@types/node@22.20.0)(@vitest/coverage-v8@4.1.7)(vite@6.4.2(@types/node@22.20.0)(jiti@2.6.1)(lightningcss@1.29.2)(terser@5.46.1)(tsx@4.22.4)(yaml@2.9.0)) + internal-packages/metrics-pipeline: + dependencies: + '@internal/redis': + specifier: workspace:* + version: link:../redis + '@internal/tracing': + specifier: workspace:* + version: link:../tracing + '@trigger.dev/core': + specifier: workspace:* + version: link:../../packages/core + devDependencies: + '@internal/testcontainers': + specifier: workspace:* + version: link:../testcontainers + rimraf: + specifier: 6.0.1 + version: 6.0.1 + internal-packages/otlp-importer: dependencies: long: @@ -1335,6 +1357,9 @@ importers: '@internal/cache': specifier: workspace:* version: link:../cache + '@internal/metrics-pipeline': + specifier: workspace:* + version: link:../metrics-pipeline '@internal/redis': specifier: workspace:* version: link:../redis From 09ab1b11325afff152c81552a81672d098bcfd20 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:30:16 +0100 Subject: [PATCH 02/37] feat(clickhouse): queue metrics tables and read queries --- .../schema/035_create_queue_metrics_v1.sql | 98 ++++++++++ internal-packages/clickhouse/src/index.ts | 14 ++ .../clickhouse/src/queueMetrics.test.ts | 
183 ++++++++++++++++++ .../clickhouse/src/queueMetrics.ts | 103 ++++++++++ 4 files changed, 398 insertions(+) create mode 100644 internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql create mode 100644 internal-packages/clickhouse/src/queueMetrics.test.ts create mode 100644 internal-packages/clickhouse/src/queueMetrics.ts diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql new file mode 100644 index 00000000000..28e076fb1ae --- /dev/null +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -0,0 +1,98 @@ +-- +goose Up + +-- Queue metrics: raw landing table -> MV -> aggregated read target (mirrors +-- llm_model_aggregates_v1, migration 027). Raw rows feed an MV on insert, and +-- reads hit the aggregated table. + +-- Short-TTL raw landing, one row per stream entry. non_replicated_deduplication_window +-- makes consumer replays idempotent via insert_deduplication_token. 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + event_time DateTime CODEC(Delta(4), ZSTD(1)), + order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e5+seq); deltaSumTimestamp ordering key + op LowCardinality(String), -- gauge | enqueue | started | ack | nack | dlq + running UInt32 DEFAULT 0, + queued UInt32 DEFAULT 0, + queue_limit UInt32 DEFAULT 0, + env_running UInt32 DEFAULT 0, + env_queued UInt32 DEFAULT 0, + env_limit UInt32 DEFAULT 0, + throttled UInt8 DEFAULT 0, -- 1 on a gauge emission with running>=limit AND queued>0 + wait_ms UInt32 DEFAULT 0, -- set on op='started' (scheduling delay) + cumulative UInt64 DEFAULT 0 -- monotonic per-(queue,op) odometer on a counter op; diffed at read time +) +ENGINE = MergeTree() +PARTITION BY toDate(event_time) +ORDER BY (organization_id, project_id, environment_id, queue_name, event_time) +TTL event_time + INTERVAL 6 HOUR +SETTINGS non_replicated_deduplication_window = 1000, ttl_only_drop_parts = 1; + +-- (2) Aggregated read target (TRQL/dashboards query this). +CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + -- Cumulative-counter deltas: each op maintains a monotonic odometer; deltaSumTimestamp + -- sums positive consecutive deltas (ignoring resets) ordered by order_key (the raw row's stream-id composite), so a lost + -- reading self-heals (the next surviving reading restates the total). Read with + -- deltaSumTimestampMerge(), never sum(). 
+ enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + throttled_count SimpleAggregateFunction(sum, UInt64), + + max_queued SimpleAggregateFunction(max, UInt32), + max_running SimpleAggregateFunction(max, UInt32), + max_limit SimpleAggregateFunction(max, UInt32), + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1; + +-- (3) MV: raw -> aggregated, 10s buckets. 
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_mv_v1 +TO trigger_dev.queue_metrics_v1 AS +SELECT + organization_id, project_id, environment_id, queue_name, + toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start, + deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue') AS enqueue_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'started') AS started_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'ack') AS ack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'nack') AS nack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq') AS dlq_delta, + sum(throttled) AS throttled_count, + max(queued) AS max_queued, + max(running) AS max_running, + max(queue_limit) AS max_limit, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + sumIf(wait_ms, op = 'started') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count, + quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started') AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; + +-- +goose Down +DROP VIEW IF EXISTS trigger_dev.queue_metrics_mv_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_raw_v1; diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 0b252a98f67..52fc3507def 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -32,6 +32,11 @@ import { } from "./taskEvents.js"; import { insertMetrics } from "./metrics.js"; import { insertLlmMetrics } from "./llmMetrics.js"; +import { + insertQueueMetricsRaw, + getQueueListMetricsSummary, + getQueueDepthSparklines, +} from "./queueMetrics.js"; import { getSessionTagsQueryBuilder, getSessionsCountQueryBuilder, @@ -65,6 +70,7 @@ export type * from 
"./taskRuns.js"; export type * from "./taskEvents.js"; export type * from "./metrics.js"; export type * from "./llmMetrics.js"; +export type * from "./queueMetrics.js"; export type * from "./llmModelAggregates.js"; export type * from "./errors.js"; export type * from "./sessions.js"; @@ -260,6 +266,14 @@ export class ClickHouse { }; } + get queueMetrics() { + return { + insertRaw: insertQueueMetricsRaw(this.writer), + listSummary: getQueueListMetricsSummary(this.reader), + depthSparklines: getQueueDepthSparklines(this.reader), + }; + } + get llmModelAggregates() { return { globalMetrics: getGlobalModelMetrics(this.reader), diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts new file mode 100644 index 00000000000..ef18d48288e --- /dev/null +++ b/internal-packages/clickhouse/src/queueMetrics.test.ts @@ -0,0 +1,183 @@ +import { clickhouseTest } from "@internal/testcontainers"; +import { z } from "zod"; +import { ClickHouse } from "./index.js"; +import type { QueueMetricsRawV1Input } from "./queueMetrics.js"; + +const ORG = "org_qm"; +const PROJECT = "project_qm"; +const ENV = "env_qm"; +const EVENT_TIME = "2026-06-30 12:00:05"; // all rows land in the 10s bucket starting 12:00:00 + +function base(op: QueueMetricsRawV1Input["op"], queue: string): QueueMetricsRawV1Input { + return { + organization_id: ORG, + project_id: PROJECT, + environment_id: ENV, + queue_name: queue, + event_time: EVENT_TIME, + op, + }; +} + +const aggregatedRow = z.object({ + enqueue_count: z.coerce.number(), + started_count: z.coerce.number(), + ack_count: z.coerce.number(), + nack_count: z.coerce.number(), + dlq_count: z.coerce.number(), + throttled_count: z.coerce.number(), + max_running: z.coerce.number(), + max_queued: z.coerce.number(), + max_limit: z.coerce.number(), + max_env_running: z.coerce.number(), + max_env_queued: z.coerce.number(), + max_env_limit: z.coerce.number(), + wait_ms_sum: z.coerce.number(), + 
+ wait_ms_count: z.coerce.number(), + wait_p50: z.coerce.number(), + wait_p90: z.coerce.number(), + wait_p95: z.coerce.number(), + wait_p99: z.coerce.number(), +}); + +/* NOTE(review): migration 035 in this patch defines enqueue_delta/started_delta/ack_delta/nack_delta/dlq_delta (deltaSumTimestamp states) on queue_metrics_v1, not the enqueue_count/started_count/ack_count/nack_count/dlq_count columns summed below — confirm this query against the schema. */ +function readAggregated(ch: ClickHouse) { + return ch.reader.query({ + name: "read-queue-metrics-aggregated", + query: `SELECT + sum(enqueue_count) AS enqueue_count, + sum(started_count) AS started_count, + sum(ack_count) AS ack_count, + sum(nack_count) AS nack_count, + sum(dlq_count) AS dlq_count, + sum(throttled_count) AS throttled_count, + max(max_running) AS max_running, + max(max_queued) AS max_queued, + max(max_limit) AS max_limit, + max(max_env_running) AS max_env_running, + max(max_env_queued) AS max_env_queued, + max(max_env_limit) AS max_env_limit, + sum(wait_ms_sum) AS wait_ms_sum, + sum(wait_ms_count) AS wait_ms_count, + quantilesMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles) AS wait_arr, + wait_arr[1] AS wait_p50, + wait_arr[2] AS wait_p90, + wait_arr[3] AS wait_p95, + wait_arr[4] AS wait_p99 + FROM trigger_dev.queue_metrics_v1 + WHERE queue_name = {queueName: String} + GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start`, + schema: aggregatedRow, + params: z.object({ queueName: z.string() }), + }); +} + +// Synchronous insert so the MV-populated rows are queryable immediately. 
+const SYNC = { params: { clickhouse_settings: { async_insert: 0 as const } } }; + +describe("queue_metrics_v1", () => { + clickhouseTest( + "buckets counters, gauges and wait percentiles via the MV", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + const queue = "queue-a"; + + const rows: QueueMetricsRawV1Input[] = [ + ...Array.from({ length: 3 }, () => base("enqueue", queue)), + ...[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000].map((wait_ms) => ({ + ...base("started", queue), + wait_ms, + })), + ...Array.from({ length: 2 }, () => base("ack", queue)), + base("nack", queue), + base("dlq", queue), + { + ...base("gauge", queue), + running: 8, + queued: 4, + queue_limit: 10, + env_running: 40, + env_queued: 10, + env_limit: 50, + throttled: 0, + }, + { + ...base("gauge", queue), + running: 10, + queued: 6, + queue_limit: 10, + env_running: 50, + env_queued: 20, + env_limit: 50, + throttled: 1, // running >= limit AND queued > 0 + }, + ]; + + const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC); + expect(insertError).toBeNull(); + + const [queryError, result] = await readAggregated(ch)({ queueName: queue }); + expect(queryError).toBeNull(); + expect(result).toHaveLength(1); + const row = result![0]!; + + expect(row.enqueue_count).toBe(3); + expect(row.started_count).toBe(10); + expect(row.ack_count).toBe(2); + expect(row.nack_count).toBe(1); + expect(row.dlq_count).toBe(1); + expect(row.throttled_count).toBe(1); + + expect(row.max_running).toBe(10); + expect(row.max_queued).toBe(6); + expect(row.max_limit).toBe(10); + expect(row.max_env_running).toBe(50); + expect(row.max_env_queued).toBe(20); + expect(row.max_env_limit).toBe(50); + + expect(row.wait_ms_sum).toBe(5500); + expect(row.wait_ms_count).toBe(10); + + // Percentiles over [100..1000]: monotonic and within the value range. 
+ expect(row.wait_p50).toBeGreaterThanOrEqual(400); + expect(row.wait_p50).toBeLessThanOrEqual(650); + expect(row.wait_p90).toBeGreaterThanOrEqual(row.wait_p50); + expect(row.wait_p95).toBeGreaterThanOrEqual(row.wait_p90); + expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p95); + expect(row.wait_p99).toBeLessThanOrEqual(1000); + + await ch.close(); + } + ); + + clickhouseTest( + "merges wait-quantile state across separate insert blocks", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + const queue = "queue-b"; + + const block = (waits: number[]) => + waits.map((wait_ms) => ({ ...base("started", queue), wait_ms })); + + const [e1] = await ch.queueMetrics.insertRaw(block([100, 200, 300, 400, 500]), SYNC); + expect(e1).toBeNull(); + const [e2] = await ch.queueMetrics.insertRaw(block([600, 700, 800, 900, 1000]), SYNC); + expect(e2).toBeNull(); + + const [queryError, result] = await readAggregated(ch)({ queueName: queue }); + expect(queryError).toBeNull(); + expect(result).toHaveLength(1); + const row = result![0]!; + + // Both blocks contribute to one bucket: counts and sums add, quantile state merges. 
+ expect(row.started_count).toBe(10); + expect(row.wait_ms_sum).toBe(5500); + expect(row.wait_ms_count).toBe(10); + expect(row.wait_p50).toBeGreaterThanOrEqual(400); + expect(row.wait_p50).toBeLessThanOrEqual(650); + expect(row.wait_p99).toBeGreaterThanOrEqual(row.wait_p50); + expect(row.wait_p99).toBeLessThanOrEqual(1000); + + await ch.close(); + } + ); +}); diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts new file mode 100644 index 00000000000..81870c96ba5 --- /dev/null +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -0,0 +1,103 @@ +import { z } from "zod"; +import { ClickhouseReader, ClickhouseWriter } from "./client/types.js"; + +export const QueueMetricsRawV1Input = z.object({ + organization_id: z.string(), + project_id: z.string(), + environment_id: z.string(), + queue_name: z.string(), + event_time: z.string(), + order_key: z.number().optional(), + op: z.enum(["gauge", "enqueue", "started", "ack", "nack", "dlq"]), + running: z.number().optional(), + queued: z.number().optional(), + queue_limit: z.number().optional(), + env_running: z.number().optional(), + env_queued: z.number().optional(), + env_limit: z.number().optional(), + throttled: z.number().optional(), + wait_ms: z.number().optional(), + cumulative: z.number().optional(), +}); + +export type QueueMetricsRawV1Input = z.input<typeof QueueMetricsRawV1Input>; + +export function insertQueueMetricsRaw(ch: ClickhouseWriter) { + return ch.insertUnsafe({ + name: "insertQueueMetricsRaw", + table: "trigger_dev.queue_metrics_raw_v1", + }); +} + +// --- Reads (Queues list metrics + health) --- + +const QueueMetricsListParams = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + queueNames: z.array(z.string()), + startTime: z.string(), +}); + +const QueueMetricsSummaryRow = z.object({ + queue_name: z.string(), + p50_wait_ms: z.coerce.number(), + p95_wait_ms: z.coerce.number(), + peak_queued: z.coerce.number(), + started_count:
z.coerce.number(), +}); + +/** Per-queue rollups over a window, for a fixed set of queues (the visible page). */ +export function getQueueListMetricsSummary(reader: ClickhouseReader) { + return reader.query({ + name: "getQueueListMetricsSummary", + query: `SELECT + queue_name, + round(quantilesMerge(0.5, 0.95)(wait_quantiles)[1]) AS p50_wait_ms, + round(quantilesMerge(0.5, 0.95)(wait_quantiles)[2]) AS p95_wait_ms, + max(max_queued) AS peak_queued, + deltaSumTimestampMerge(started_delta) AS started_count + FROM trigger_dev.queue_metrics_v1 + WHERE organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND queue_name IN {queueNames: Array(String)} + AND bucket_start >= {startTime: DateTime} + GROUP BY queue_name`, + params: QueueMetricsListParams, + schema: QueueMetricsSummaryRow, + }); +} + +const QueueDepthSparklineParams = QueueMetricsListParams.extend({ + bucketSeconds: z.number(), +}); + +const QueueDepthSparklineRow = z.object({ + queue_name: z.string(), + bucket: z.string(), + depth: z.coerce.number(), +}); + +/** Per-queue, per-bucket peak depth for inline sparklines (carry-forward filled by the caller). 
*/ +export function getQueueDepthSparklines(reader: ClickhouseReader) { + return reader.query({ + name: "getQueueDepthSparklines", + query: `SELECT + queue_name, + toStartOfInterval(bucket_start, toIntervalSecond({bucketSeconds: UInt32})) AS bucket, + max(max_queued) AS depth + FROM trigger_dev.queue_metrics_v1 + WHERE organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND queue_name IN {queueNames: Array(String)} + AND bucket_start >= {startTime: DateTime} + GROUP BY queue_name, bucket + ORDER BY bucket`, + params: QueueDepthSparklineParams, + schema: QueueDepthSparklineRow, + }); +} + +// (per-queue detail series is now fetched via TRQL + fillGaps from the metric resource route) From c32a1cdd021c1e250176c21fc59faface60391a1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:30:30 +0100 Subject: [PATCH 03/37] feat(run-engine): emit queue depth, throughput, and scheduling-delay signals Gauges are read inside the enqueue/dequeue Lua and returned on the script reply as a 2-tuple; counters are cumulative odometers. The run-queue Redis carries no metrics stream of its own. 
--- internal-packages/run-engine/package.json | 1 + .../run-engine/src/engine/index.ts | 1 + .../src/engine/systems/enqueueSystem.ts | 4 +- .../run-engine/src/engine/types.ts | 3 + .../run-engine/src/run-queue/index.ts | 277 +++++++++++--- .../run-engine/src/run-queue/metrics.test.ts | 345 ++++++++++++++++++ .../run-engine/src/run-queue/types.ts | 3 + 7 files changed, 585 insertions(+), 49 deletions(-) create mode 100644 internal-packages/run-engine/src/run-queue/metrics.test.ts diff --git a/internal-packages/run-engine/package.json b/internal-packages/run-engine/package.json index 8d53974d10b..516e6a18696 100644 --- a/internal-packages/run-engine/package.json +++ b/internal-packages/run-engine/package.json @@ -21,6 +21,7 @@ }, "dependencies": { "@internal/redis": "workspace:*", + "@internal/metrics-pipeline": "workspace:*", "@internal/run-store": "workspace:*", "@trigger.dev/redis-worker": "workspace:*", "@internal/tracing": "workspace:*", diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index f3091c93b88..fcb29333fdd 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -218,6 +218,7 @@ export class RunEngine { callback: this.#concurrencySweeperCallback.bind(this), }, shardCount: options.queue?.shardCount, + queueMetrics: options.queue?.queueMetrics, masterQueueConsumersDisabled: options.queue?.masterQueueConsumersDisabled, masterQueueConsumersIntervalMs: options.queue?.masterQueueConsumersIntervalMs, processWorkerQueueDebounceMs: options.queue?.processWorkerQueueDebounceMs, diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts index dc9d029c38c..7ae551d7c00 100644 --- a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts @@ -98,7 +98,8 @@ export class 
EnqueueSystem { // Force development runs to use the environment id as the worker queue. const workerQueue = env.type === "DEVELOPMENT" ? env.id : run.workerQueue; - const timestamp = (run.queueTimestamp ?? run.createdAt).getTime() - run.priorityMs; + const eligibleAtMs = (run.queueTimestamp ?? run.createdAt).getTime(); + const timestamp = eligibleAtMs - run.priorityMs; // Include TTL only when explicitly requested (first enqueue from trigger). // Re-enqueues (waitpoint, checkpoint, delayed, pending version) must not add TTL. @@ -124,6 +125,7 @@ export class EnqueueSystem { queue: run.queue, concurrencyKey: run.concurrencyKey ?? undefined, timestamp, + eligibleAtMs, attempt: 0, ttlExpiresAt, }, diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index bb1d6eb2fa9..f37ec7df50a 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -16,6 +16,7 @@ import { } from "@trigger.dev/redis-worker"; import type { ControlPlaneResolver } from "./controlPlaneResolver.js"; import type { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js"; +import type { RunQueueMetricsEmitter } from "../run-queue/index.js"; import type { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import type { LockRetryConfig } from "./locking.js"; import type { workerCatalog } from "./workerCatalog.js"; @@ -90,6 +91,8 @@ export type RunEngineOptions = { defaultEnvConcurrency?: number; defaultEnvConcurrencyBurstFactor?: number; logLevel?: LogLevel; + /** Optional queue-metrics emitter; enables gauge + counter emission from the RunQueue. 
*/ + queueMetrics?: RunQueueMetricsEmitter; queueSelectionStrategyOptions?: Pick< FairQueueSelectionStrategyOptions, "parentQueueLimit" | "tracer" | "biases" | "reuseSnapshotCount" | "maximumEnvCount" diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts index a0571206538..3c9d271a471 100644 --- a/internal-packages/run-engine/src/run-queue/index.ts +++ b/internal-packages/run-engine/src/run-queue/index.ts @@ -5,6 +5,7 @@ import { type RedisOptions, type Result, } from "@internal/redis"; +import { createMetricsGaugeComputeLua } from "@internal/metrics-pipeline"; import type { Attributes, Meter, @@ -57,6 +58,65 @@ const SemanticAttributes = { ORG_ID: "runqueue.orgId", }; +// Prelude spliced at the top of every gauge-carrying script: declares the gauge slot and +// the return wrapper. A splice fills __qm_g; every return goes through __qmret so the reply +// is always {original, gauge}. A nil original becomes false, else Lua drops it from the +// multi-bulk reply (which would swallow the gauge on the dequeue throttle paths). +const QUEUE_METRICS_GAUGE_PRELUDE = ` +local __qm_g = false +local function __qmret(r) if r == nil then r = false end return {r, __qm_g} end`; + +// Fresh-read gauge for splice points with no reusable locals: enqueue slow-path (before +// return 0) and the base dequeue top. Gated on the last ARGV so it is inert unless the +// caller opts in. CK queues emit per-subqueue depth (queue_name aggregates via the MV). 
+const QUEUE_METRICS_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "redis.call('SCARD', queueCurrentConcurrencyKey)", + queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)", + envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", +}); + +// Enqueue fast-path gauge: the admission check already computed queueCurrent/envCurrent/ +// queueLimit/envLimit, so reuse them (only 2 ZCARDs stay fresh). Fast path was taken, so +// cc < lim and thr is always 0 — reusing the effective queueLimit is fine (max() recovers raw). +const QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('ZCARD', queueKey)", + running: "queueCurrent", + queueLimit: "queueLimit", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "envCurrent", + envLimit: "envLimit", +}); + +// CK dequeue: depth/running from the per-base-queue aggregate counters the run-queue already +// maintains (two O(1) GETs, not a per-variant scan). thr suppressed — an aggregate cc >= per-CK +// limit would over-report; per-CK throttle is caught by the per-subqueue enqueue gauges. +const QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA = createMetricsGaugeComputeLua({ + enabledArg: "ARGV[#ARGV] == '1'", + queued: "redis.call('GET', lengthCounterKey) or '0'", + running: "redis.call('GET', runningCounterKey) or '0'", + queueLimit: "redis.call('GET', queueConcurrencyLimitKey) or '1000000'", + envQueued: "redis.call('ZCARD', envQueueKey)", + envRunning: "redis.call('SCARD', envCurrentConcurrencyKey)", + envLimit: "redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit", + throttledExpr: "false", +}); + +/** Injected queue-metrics stream emitter; all calls are no-ops when metrics are disabled. 
*/ +export interface RunQueueMetricsEmitter { + enabledSync(): boolean; + /** enabled AND sampled-in; gates high-frequency sampled emissions (the Lua gauge). */ + sampledSync(): boolean; + /** Counter event (cumulative odometer). */ + emit(shardKey: string, fields: Record): void; + /** Gauge snapshot read inside the queue-op Lua and returned on the reply. */ + emitGauge(shardKey: string, fields: Record): void; +} + export type RunQueueOptions = { name: string; tracer: Tracer; @@ -93,6 +153,8 @@ export type RunQueueOptions = { disabled?: boolean; }; meter?: Meter; + /** When set, enqueue/dequeue/ack/nack/dlq emit queue-metrics events (gated on the emitter's flag). */ + queueMetrics?: RunQueueMetricsEmitter; dequeueBlockingTimeoutSeconds?: number; concurrencySweeper?: { scanSchedule?: string; @@ -751,6 +813,8 @@ export class RunQueue { span.setAttribute("fastPath", fastPathTaken); + this.#emitQueueMetric(queueKey, { op: "enqueue", q: queueKey }); + if (!fastPathTaken && !skipDequeueProcessing) { // Slow path: schedule the dequeue job to move the message from queue to worker queue await this.worker.enqueueOnce({ @@ -810,6 +874,15 @@ export class RunQueue { ...flattenAttributes(dequeuedMessage.message, "message"), }); + const startedFields: Record = { + op: "started", + q: dequeuedMessage.message.queue, + }; + if (typeof dequeuedMessage.message.eligibleAtMs === "number") { + startedFields.wait = Math.max(0, Date.now() - dequeuedMessage.message.eligibleAtMs); + } + this.#emitQueueMetric(dequeuedMessage.message.queue, startedFields); + return dequeuedMessage; }, { @@ -877,6 +950,8 @@ export class RunQueue { message, removeFromWorkerQueue: options?.removeFromWorkerQueue, }); + + this.#emitQueueMetric(message.queue, { op: "ack", q: message.queue }); }, { kind: SpanKind.CONSUMER, @@ -934,6 +1009,7 @@ export class RunQueue { message.attempt = message.attempt + 1; if (message.attempt >= maxAttempts) { await this.#callMoveToDeadLetterQueue({ message }); + 
this.#emitQueueMetric(message.queue, { op: "dlq", q: message.queue }); return false; } } @@ -960,6 +1036,8 @@ export class RunQueue { await this.#callNackMessage({ message, retryAt }); + this.#emitQueueMetric(message.queue, { op: "nack", q: message.queue }); + return true; }, { @@ -1831,6 +1909,44 @@ export class RunQueue { * * @returns true if the fast path was taken (message pushed directly to worker queue) */ + #queueMetricsGaugeArg(): string { + // Gauge gate ARGV: enabled AND sampled-in (sampling applies to the gauge, not counters). + return this.options.queueMetrics?.sampledSync() ? "1" : "0"; + } + + // Gauge returned on a script reply as a flat [ql, cc, lim, eql, ec, elim, thr] array. + // Unlike counters, gauges are NOT base-normalized: the q label keeps its :ck: suffix so + // the CK-aggregate and per-subqueue readings stay distinguishable; the consumer's mapEntry + // strips :ck: to the base queue_name and the MV maxes them into one row. + #emitGauge(queue: string, gauge: number[]): void { + if (!Array.isArray(gauge) || gauge.length < 7) return; + const [ql, cc, lim, eql, ec, elim, thr] = gauge; + this.options.queueMetrics?.emitGauge(queue, { + op: "gauge", + q: queue, + ql, + cc, + lim, + eql, + ec, + elim, + thr, + }); + } + + #emitQueueMetric(shardKey: string, fields: Record): void { + // Counters roll up per BASE queue: normalize the CK-qualified queue to its base so all + // concurrency keys share one monotonic odometer (and one shard/order key), matching the + // base queue_name the consumer buckets on. Otherwise per-CK odometers would collide in + // the deltaSum reconstruction. + const baseQueue = this.keys.baseQueueKeyFromQueue(shardKey); + const baseFields = + typeof fields.q === "string" + ? 
{ ...fields, q: this.keys.baseQueueKeyFromQueue(fields.q) } + : fields; + this.options.queueMetrics?.emit(baseQueue, baseFields); + } + async #callEnqueueMessage( message: OutputPayloadV2, ttlInfo?: { @@ -1869,6 +1985,7 @@ export class RunQueue { const messageScore = String(message.timestamp); const currentTime = String(Date.now()); const enableFastPathArg = enableFastPath ? "1" : "0"; + const metricsGaugeArg = this.#queueMetricsGaugeArg(); const defaultEnvConcurrencyLimit = String(this.options.defaultEnvConcurrency); const defaultEnvConcurrencyBurstFactor = String( this.options.defaultEnvConcurrencyBurstFactor ?? 1.0 @@ -1892,7 +2009,8 @@ export class RunQueue { service: this.name, }); - let result: number; + // Every gauge-carrying script returns a 2-tuple [originalReturn, gauge|null]. + let result: [number, number[] | null]; // Use CK-aware enqueue for messages with concurrency keys if (message.concurrencyKey) { @@ -1935,7 +2053,8 @@ export class RunQueue { currentTime, enableFastPathArg, ckKeyPrefix, - String(this.counterTtlSeconds) + String(this.counterTtlSeconds), + metricsGaugeArg ); } else { result = await this.redis.enqueueMessageCkTracked( @@ -1967,7 +2086,8 @@ export class RunQueue { currentTime, enableFastPathArg, ckKeyPrefix, - String(this.counterTtlSeconds) + String(this.counterTtlSeconds), + metricsGaugeArg ); } } else if (ttlInfo) { @@ -1998,7 +2118,8 @@ export class RunQueue { defaultEnvConcurrencyLimit, defaultEnvConcurrencyBurstFactor, currentTime, - enableFastPathArg + enableFastPathArg, + metricsGaugeArg ); } else { result = await this.redis.enqueueMessage( @@ -2024,11 +2145,14 @@ export class RunQueue { defaultEnvConcurrencyLimit, defaultEnvConcurrencyBurstFactor, currentTime, - enableFastPathArg + enableFastPathArg, + metricsGaugeArg ); } - return result === 1; + const [enqueueResult, gauge] = result; + if (gauge) this.#emitGauge(queueName, gauge); + return enqueueResult === 1; } async #callDequeueMessagesFromQueue({ @@ -2081,7 +2205,9 @@ 
export class RunQueue { maxCount, }); - const result = await this.redis.dequeueMessagesFromQueue( + const metricsGaugeArg = this.#queueMetricsGaugeArg(); + + const reply = await this.redis.dequeueMessagesFromQueue( //keys messageQueue, queueConcurrencyLimitKey, @@ -2099,9 +2225,16 @@ export class RunQueue { String(this.options.defaultEnvConcurrency), String(this.options.defaultEnvConcurrencyBurstFactor ?? 1), this.options.redis.keyPrefix ?? "", - String(maxCount) + String(maxCount), + metricsGaugeArg ); + // Reply is [flatMessages|null, gauge|null]: emit the gauge (read atomically inside + // the script, present on the throttle/empty paths too) and keep element 0 as the array. + const gauge = reply?.[1] ?? null; + if (gauge) this.#emitGauge(messageQueue, gauge); + const result = reply?.[0] ?? null; + if (!result) { span.setAttribute("message_count", 0); @@ -2202,8 +2335,11 @@ export class RunQueue { }); const lengthCounterKey = this.keys.queueLengthCounterKeyFromQueue(ckWildcardQueue); + const runningCounterKey = this.keys.queueRunningCounterKeyFromQueue(ckWildcardQueue); - const result = await this.redis.dequeueMessagesFromCkQueueTracked( + const metricsGaugeArg = this.#queueMetricsGaugeArg(); + + const reply = await this.redis.dequeueMessagesFromCkQueueTracked( //keys ckIndexKey, queueConcurrencyLimitKey, @@ -2215,15 +2351,22 @@ export class RunQueue { masterQueueKey, ttlQueueKey, lengthCounterKey, + runningCounterKey, //args ckWildcardQueue, String(Date.now()), String(this.options.defaultEnvConcurrency), String(this.options.defaultEnvConcurrencyBurstFactor ?? 1), this.options.redis.keyPrefix ?? "", - String(maxCount) + String(maxCount), + metricsGaugeArg ); + // Reply is [flatMessages|null, gauge|null]; the CK aggregate gauge rides here. + const gauge = reply?.[1] ?? null; + if (gauge) this.#emitGauge(ckWildcardQueue, gauge); + const result = reply?.[0] ?? 
null; + if (!result) { span.setAttribute("message_count", 0); return []; @@ -3062,6 +3205,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[7] local currentTime = ARGV[8] local enableFastPath = ARGV[9] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3083,7 +3228,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3113,8 +3259,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3153,6 +3300,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[9] local currentTime = ARGV[10] local enableFastPath = ARGV[11] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3174,8 +3323,9 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} -- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently - return 1 + return __qmret(1) end end end @@ -3208,8 +3358,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) 
redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3246,6 +3397,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[8] local currentTime = ARGV[9] local enableFastPath = ARGV[10] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3268,7 +3421,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3304,8 +3458,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3344,6 +3499,8 @@ local defaultEnvConcurrencyBurstFactor = ARGV[10] local currentTime = ARGV[11] local enableFastPath = ARGV[12] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3365,8 +3522,9 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} -- Skip TTL sorted set: the expireRun worker job handles TTL expiry independently - return 1 + return __qmret(1) end end end @@ -3405,8 +3563,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) 
redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3455,6 +3614,8 @@ local keyPrefix = ARGV[11] -- TTL (seconds) applied to counter lazy-init SETs local counterTtl = ARGV[12] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3476,10 +3637,11 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} -- Fast-path skips the CK variant zset entirely; lengthCounter is unchanged. -- runningCounter is bumped later by dequeueMessageFromKeyTracked when the -- worker pulls the message from the worker queue. 
- return 1 + return __qmret(1) end end end @@ -3531,8 +3693,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3576,6 +3739,8 @@ local keyPrefix = ARGV[13] -- TTL (seconds) applied to counter lazy-init SETs local counterTtl = ARGV[14] +${QUEUE_METRICS_GAUGE_PRELUDE} + -- Fast path: check if we can skip the queue and go directly to worker queue if enableFastPath == '1' then local available = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'LIMIT', 0, 1) @@ -3597,7 +3762,8 @@ if enableFastPath == '1' then redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('RPUSH', workerQueueKey, messageKeyValue) - return 1 +${QUEUE_METRICS_ENQUEUE_FASTPATH_GAUGE_LUA} + return __qmret(1) end end end @@ -3645,8 +3811,9 @@ redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) redis.call('SREM', queueCurrentDequeuedKey, messageId) redis.call('SREM', envCurrentDequeuedKey, messageId) +${QUEUE_METRICS_GAUGE_LUA} -return 0 +return __qmret(0) `, }); @@ -3891,6 +4058,8 @@ local defaultEnvConcurrencyLimit = ARGV[3] local defaultEnvConcurrencyBurstFactor = ARGV[4] local keyPrefix = ARGV[5] local maxCount = tonumber(ARGV[6] or '1') +${QUEUE_METRICS_GAUGE_PRELUDE} +${QUEUE_METRICS_GAUGE_LUA} -- Check current env concurrency against the limit local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0') @@ -3899,7 +4068,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor) if envCurrentConcurrency >= envConcurrencyLimitWithBurstFactor then - 
return nil + return __qmret(nil) end -- Check current queue concurrency against the limit @@ -3909,7 +4078,7 @@ local totalQueueConcurrencyLimit = queueConcurrencyLimit -- Check condition only if concurrencyLimit exists if queueCurrentConcurrency >= totalQueueConcurrencyLimit then - return nil + return __qmret(nil) end -- Calculate how many messages we can actually dequeue based on concurrency limits @@ -3918,14 +4087,14 @@ local queueAvailableCapacity = totalQueueConcurrencyLimit - queueCurrentConcurre local actualMaxCount = math.min(maxCount, envAvailableCapacity, queueAvailableCapacity) if actualMaxCount <= 0 then - return nil + return __qmret(nil) end -- Attempt to dequeue messages up to actualMaxCount local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, actualMaxCount) if #messages == 0 then - return nil + return __qmret(nil) end local results = {} @@ -3991,7 +4160,7 @@ else end -- Return results as a flat array: [messageId1, messageScore1, messagePayload1, messageId2, messageScore2, messagePayload2, ...] -return results +return __qmret(results) `, }); @@ -4145,7 +4314,7 @@ return results // (normal dequeue, TTL-expired, or stale-orphan path — all of which were // counted at enqueue time). 
this.redis.defineCommand("dequeueMessagesFromCkQueueTracked", { - numberOfKeys: 10, + numberOfKeys: 11, lua: ` local ckIndexKey = KEYS[1] local queueConcurrencyLimitKey = KEYS[2] @@ -4157,6 +4326,7 @@ local envQueueKey = KEYS[7] local masterQueueKey = KEYS[8] local ttlQueueKey = KEYS[9] local lengthCounterKey = KEYS[10] +local runningCounterKey = KEYS[11] local ckWildcardName = ARGV[1] local currentTime = tonumber(ARGV[2]) @@ -4164,6 +4334,8 @@ local defaultEnvConcurrencyLimit = ARGV[3] local defaultEnvConcurrencyBurstFactor = ARGV[4] local keyPrefix = ARGV[5] local maxCount = tonumber(ARGV[6] or '1') +${QUEUE_METRICS_GAUGE_PRELUDE} +${QUEUE_METRICS_CK_DEQUEUE_GAUGE_LUA} local function decrLengthCounter() if tonumber(redis.call('GET', lengthCounterKey) or '0') > 0 then @@ -4178,7 +4350,7 @@ local envConcurrencyLimitBurstFactor = tonumber(redis.call('GET', envConcurrency local envConcurrencyLimitWithBurstFactor = math.floor(envConcurrencyLimit * envConcurrencyLimitBurstFactor) if envCurrentConcurrency >= envConcurrencyLimitWithBurstFactor then - return nil + return __qmret(nil) end local queueConcurrencyLimit = math.min(tonumber(redis.call('GET', queueConcurrencyLimitKey) or '1000000'), envConcurrencyLimit) @@ -4187,7 +4359,7 @@ local envAvailableCapacity = envConcurrencyLimitWithBurstFactor - envCurrentConc local actualMaxCount = math.min(maxCount, envAvailableCapacity) if actualMaxCount <= 0 then - return nil + return __qmret(nil) end local ckQueues = redis.call('ZRANGEBYSCORE', ckIndexKey, '-inf', tostring(currentTime), 'LIMIT', 0, actualMaxCount * 3) @@ -4199,7 +4371,7 @@ if #ckQueues == 0 then else redis.call('ZADD', masterQueueKey, anyIdx[2], ckWildcardName) end - return nil + return __qmret(nil) end local results = {} @@ -4281,7 +4453,7 @@ else redis.call('ZADD', masterQueueKey, earliestIdx[2], ckWildcardName) end -return results +return __qmret(results) `, }); @@ -5199,8 +5371,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, 
currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtl( //keys @@ -5229,8 +5402,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; expireTtlRuns( //keys @@ -5265,8 +5439,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, keyPrefix: string, maxCount: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[string[] | null, number[] | null]> + ): Result<[string[] | null, number[] | null], Context>; dequeueMessageFromWorkerQueueNonBlocking( workerQueueKey: string, @@ -5405,8 +5580,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtlCk( //keys @@ -5437,8 +5613,9 @@ declare module "@internal/redis" { defaultEnvConcurrencyBurstFactor: string, currentTime: string, enableFastPath: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; dequeueMessagesFromCkQueue( //keys @@ -5551,8 +5728,9 @@ declare module "@internal/redis" { enableFastPath: string, keyPrefix: string, counterTtl: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; enqueueMessageWithTtlCkTracked( masterQueueKey: string, @@ -5585,8 +5763,9 @@ declare module 
"@internal/redis" { enableFastPath: string, keyPrefix: string, counterTtl: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[number, number[] | null]> + ): Result<[number, number[] | null], Context>; dequeueMessagesFromCkQueueTracked( ckIndexKey: string, @@ -5599,14 +5778,16 @@ declare module "@internal/redis" { masterQueueKey: string, ttlQueueKey: string, lengthCounterKey: string, + runningCounterKey: string, ckWildcardName: string, currentTime: string, defaultEnvConcurrencyLimit: string, defaultEnvConcurrencyBurstFactor: string, keyPrefix: string, maxCount: string, - callback?: Callback - ): Result; + metricsEnabled: string, + callback?: Callback<[string[] | null, number[] | null]> + ): Result<[string[] | null, number[] | null], Context>; dequeueMessageFromKeyTracked( messageKey: string, diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts new file mode 100644 index 00000000000..44c692b577c --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts @@ -0,0 +1,345 @@ +import { createRedisClient } from "@internal/redis"; +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { + allStreamKeys, + MetricsStreamEmitter, + type MetricDefinition, +} from "@internal/metrics-pipeline"; +import { Logger } from "@trigger.dev/core/logger"; +import { Decimal } from "@trigger.dev/database"; +import { describe } from "node:test"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js"; +import { RunQueue } from "./index.js"; +import { RunQueueFullKeyProducer } from "./keyProducer.js"; +import { InputPayload } from "./types.js"; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + concurrencyLimitBurstFactor: new 
Decimal(1.0), + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +async function readAllEntries(redisOptions: { + host: string; + port: number; +}, definition: MetricDefinition) { + const client = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const entries: Array<{ id: string; fields: Record }> = []; + for (const key of allStreamKeys(definition)) { + const raw = (await client.xrange(key, "-", "+")) as Array<[string, string[]]>; + for (const [id, flat] of raw) { + const fields: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] = flat[i + 1]!; + entries.push({ id, fields }); + } + } + await client.quit(); + return entries; +} + +// Gauges now land via a fire-and-forget Node XADD after the script reply (not synchronously +// inside the Lua), so reads must poll until the expected entries appear. +async function waitForEntries( + redisOptions: { host: string; port: number }, + definition: MetricDefinition, + predicate: (entries: Array<{ id: string; fields: Record }>) => boolean, + timeoutMs = 5000 +) { + const start = Date.now(); + let entries = await readAllEntries(redisOptions, definition); + while (!predicate(entries)) { + if (Date.now() - start > timeoutMs) return entries; + await setTimeout(50); + entries = await readAllEntries(redisOptions, definition); + } + return entries; +} + +describe("RunQueue queue-metrics emission", () => { + redisTest("emits gauge + enqueue/started/ack events when enabled", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_test_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ + redis, + definition, + flag: { enabled: () => true }, + }); + + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: 
new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-metrics", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + eligibleAtMs: Date.now() - 500, + attempt: 0, + }; + + try { + await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await setTimeout(1000); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + await queue.acknowledgeMessage(message.orgId, message.runId); + await setTimeout(100); + + const entries = await waitForEntries(redis, definition, (es) => { + const seen = es.map((e) => e.fields.op); + return ["enqueue", "gauge", "started", "ack"].every((o) => seen.includes(o)); + }); + const ops = entries.map((e) => e.fields.op); + expect(ops).toContain("enqueue"); + expect(ops).toContain("gauge"); + expect(ops).toContain("started"); + expect(ops).toContain("ack"); + + const gauge = entries.find((e) => e.fields.op === "gauge"); + assertGauge(gauge); + expect(gauge!.fields.q).toContain("task/my-task"); + for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { + expect(gauge!.fields[f]).toBeDefined(); + } + + // The first counter emission also seeds a cum=0 baseline (no wait); the real reading + // carries wait. Pick the reading (cum > 0). 
+ const started = entries.find((e) => e.fields.op === "started" && Number(e.fields.cum) > 0); + expect(started!.fields.wait).toBeDefined(); + expect(Number(started!.fields.wait)).toBeGreaterThanOrEqual(0); + expect(Number(started!.fields.cum)).toBeGreaterThan(0); + } finally { + await queue.quit(); + await emitter.close(); + } + }); + + redisTest("emits a fast-path gauge reusing the admission-check locals", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_fp_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ redis, definition, flag: { enabled: () => true } }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-fastpath", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, + }; + + try { + // enableFastPath + empty queue + zero concurrency => the Lua takes the fast path, + // so the gauge runs the reuse snippet (queueCurrent/envCurrent/queueLimit/envLimit). 
+ await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + enableFastPath: true, + }); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + + const entries = await waitForEntries( + redis, + definition, + (es) => es.some((e) => e.fields.op === "gauge") && es.some((e) => e.fields.op === "enqueue") + ); + const gauge = entries.find((e) => e.fields.op === "gauge"); + assertGauge(gauge); + for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { + expect(gauge!.fields[f]).toBeDefined(); + } + // Fast path was taken => capacity was available => not throttled. + expect(gauge!.fields.thr).toBe("0"); + expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); + } finally { + await queue.quit(); + await emitter.close(); + } + }); + + redisTest("emits an aggregate gauge for CK queues at dequeue", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_ck_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ redis, definition, flag: { enabled: () => true } }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-ck", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + concurrencyKey: "tenant-1", + timestamp: Date.now(), + eligibleAtMs: Date.now() - 
300, + attempt: 0, + }; + + try { + await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await setTimeout(1000); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); + + const entries = await waitForEntries(redis, definition, (es) => + es.some((e) => e.fields.op === "gauge" && e.fields.q.includes(":ck:") && e.fields.thr === "0") + ); + const gauges = entries.filter((e) => e.fields.op === "gauge"); + expect(gauges.length).toBeGreaterThan(0); + // The aggregate CK dequeue gauge targets the CK wildcard and never sets thr. + const aggregate = gauges.find((e) => e.fields.q.includes(":ck:") && e.fields.thr === "0"); + assertGauge(aggregate); + expect(Number(aggregate!.fields.ql)).toBeGreaterThanOrEqual(0); + expect(Number(aggregate!.fields.cc)).toBeGreaterThanOrEqual(0); + } finally { + await queue.quit(); + await emitter.close(); + } + }); + + redisTest("gauge sampling gates gauges but not counters", async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_sample_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + // gaugeSampleRate 0 => sampledSync() always false => Lua gauge never fires; counters still do. 
+ const emitter = new MetricsStreamEmitter({ + redis, + definition, + flag: { enabled: () => true }, + gaugeSampleRate: 0, + }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), + keys: new RunQueueFullKeyProducer(), + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); + + const message: InputPayload = { + runId: "r-sample", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, + }; + + try { + await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await setTimeout(1000); + await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + + // Poll until the counter (enqueue) lands; by then a gauge would have too, if sampled in. 
+ const entries = await waitForEntries(redis, definition, (es) => + es.some((e) => e.fields.op === "enqueue") + ); + expect(entries.some((e) => e.fields.op === "gauge")).toBe(false); + expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); + } finally { + await queue.quit(); + await emitter.close(); + } + }); +}); + +function assertGauge(gauge: unknown): asserts gauge { + if (!gauge) throw new Error("expected a gauge entry"); +} diff --git a/internal-packages/run-engine/src/run-queue/types.ts b/internal-packages/run-engine/src/run-queue/types.ts index 0905f3971de..8a7d3c93ec5 100644 --- a/internal-packages/run-engine/src/run-queue/types.ts +++ b/internal-packages/run-engine/src/run-queue/types.ts @@ -13,6 +13,9 @@ export const InputPayload = z.object({ queue: z.string(), concurrencyKey: z.string().optional(), timestamp: z.number(), + // Unix ms the run became eligible (delayUntil if set, else triggered-at), pre-priority. + // Dequeue scheduling delay = dequeueTime - eligibleAtMs. Optional for old-payload compat. + eligibleAtMs: z.number().optional(), attempt: z.number(), /** TTL expiration timestamp (unix ms). If set, run will be expired when this time is reached. 
*/ ttlExpiresAt: z.number().optional(), From 2d3d32dde85e4518dc2de40445d27cdd29a03af3 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:30:45 +0100 Subject: [PATCH 04/37] feat(webapp): queue metrics ingestion, admin controls, and emission switch --- apps/webapp/app/entry.server.tsx | 3 + apps/webapp/app/env.server.ts | 23 ++ .../app/routes/admin.api.v1.queue-metrics.ts | 45 ++++ .../webapp/app/routes/admin.queue-metrics.tsx | 181 +++++++++++++ apps/webapp/app/routes/admin.tsx | 4 + apps/webapp/app/v3/queueMetrics.server.ts | 240 ++++++++++++++++++ apps/webapp/app/v3/queueMetricsMapping.ts | 101 ++++++++ apps/webapp/app/v3/runEngine.server.ts | 2 + apps/webapp/package.json | 2 + apps/webapp/test/queueMetricsMapping.test.ts | 125 +++++++++ 10 files changed, 726 insertions(+) create mode 100644 apps/webapp/app/routes/admin.api.v1.queue-metrics.ts create mode 100644 apps/webapp/app/routes/admin.queue-metrics.tsx create mode 100644 apps/webapp/app/v3/queueMetrics.server.ts create mode 100644 apps/webapp/app/v3/queueMetricsMapping.ts create mode 100644 apps/webapp/test/queueMetricsMapping.test.ts diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 091f2f28ccf..1cc842eb916 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -11,6 +11,7 @@ import * as Worker from "~/services/worker.server"; import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server"; import { initMollifierStaleSweepWorker } from "~/v3/mollifierStaleSweepWorker.server"; import { initBillingLimitWorker } from "~/v3/billingLimitWorker.server"; +import { initQueueMetricsConsumer, initQueueMetricsEmitter } from "~/v3/queueMetrics.server"; import { bootstrap } from "./bootstrap"; import { LocaleContextProvider } from "./components/primitives/LocaleProvider"; import type { OperatingSystemPlatform } from "./components/primitives/OperatingSystemProvider"; @@ -234,6 +235,8 @@ Worker.init().catch((error) => { 
initMollifierDrainerWorker(); initMollifierStaleSweepWorker(); initBillingLimitWorker(); +initQueueMetricsEmitter(); +initQueueMetricsConsumer(); bootstrap().catch((error) => { logError(error); diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index dc189720821..e7591e177b4 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -883,6 +883,29 @@ const EnvironmentSchema = z RUN_ENGINE_REUSE_SNAPSHOT_COUNT: z.coerce.number().int().default(0), RUN_ENGINE_MAXIMUM_ENV_COUNT: z.coerce.number().int().optional(), RUN_ENGINE_RUN_QUEUE_SHARD_COUNT: z.coerce.number().int().default(4), + // Queue metrics ingestion (Redis Stream -> ClickHouse). The runtime on/off is the + // `queue_metrics:enabled` Redis key; these gate emitter construction + consumer boot. + QUEUE_METRICS_EMIT_ENABLED: z.string().default("0"), + QUEUE_METRICS_CONSUMER_ENABLED: z.string().default("0"), + QUEUE_METRICS_STREAM_SHARD_COUNT: z.coerce.number().int().default(4), + QUEUE_METRICS_STREAM_MAXLEN: z.coerce.number().int().default(2_000_000), + QUEUE_METRICS_CONSUMER_BATCH_SIZE: z.coerce.number().int().default(1000), + // Counter stream (exact counts, loss-intolerant). Unset host => the run-queue Redis; + // set it to a dedicated instance so counter backlog never competes with the run queue. + QUEUE_METRICS_REDIS_HOST: z.string().optional(), + QUEUE_METRICS_REDIS_PORT: z.coerce.number().optional(), + QUEUE_METRICS_REDIS_USERNAME: z.string().optional(), + QUEUE_METRICS_REDIS_PASSWORD: z.string().optional(), + QUEUE_METRICS_REDIS_TLS_DISABLED: z.string().default("false"), + QUEUE_METRICS_COUNTER_STREAM_MAXLEN: z.coerce.number().int().default(8_000_000), + // TTL (seconds) on the per-(queue,op) cumulative odometer key, refreshed on every write. + // Idle-past-TTL queues purge and self-heal (restart from 1) on return; default 7 days. 
+ QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS: z.coerce.number().int().default(604_800), + // Per-env distinct queue_name cap (0 = unlimited); overflow maps to "__overflow__". + QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV: z.coerce.number().int().default(1000), + // Fraction (0..1) of ops that emit a gauge; counters are never sampled. Dial below 1 + // only if EngineCPU is too high in slow-path-heavy regions (hurts low-traffic queues). + QUEUE_METRICS_GAUGE_SAMPLE_RATE: z.coerce.number().min(0).max(1).default(1), RUN_ENGINE_WORKER_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().default(60_000), RUN_ENGINE_RETRY_WARM_START_THRESHOLD_MS: z.coerce.number().int().default(30_000), RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS: z.coerce.number().int().default(200), diff --git a/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts new file mode 100644 index 00000000000..69e4e8c1fac --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.queue-metrics.ts @@ -0,0 +1,45 @@ +import { type ActionFunctionArgs, type LoaderFunctionArgs, json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { + probeQueueMetricsStreams, + readQueueMetricsControls, + writeQueueMetricsControls, +} from "~/v3/queueMetrics.server"; + +export async function loader({ request }: LoaderFunctionArgs) { + await requireAdminApiRequest(request); + const [controls, streams] = await Promise.all([ + readQueueMetricsControls(), + probeQueueMetricsStreams(), + ]); + return json({ controls, streams }); +} + +const BodySchema = z.object({ + enabled: z.boolean().optional(), + sampleRate: z.number().min(0).max(1).optional(), +}); + +export async function action({ request }: ActionFunctionArgs) { + await requireAdminApiRequest(request); + + if (request.method !== "POST") { + return json({ error: "Method not allowed" }, { status: 405 }); + } + + let body: unknown; + try { + 
body = await request.json(); + } catch { + return json({ error: "Invalid JSON body" }, { status: 400 }); + } + + const parsed = BodySchema.safeParse(body); + if (!parsed.success) { + return json({ error: "Invalid payload", details: parsed.error.issues }, { status: 400 }); + } + + await writeQueueMetricsControls(parsed.data); + return json({ ok: true, controls: await readQueueMetricsControls() }); +} diff --git a/apps/webapp/app/routes/admin.queue-metrics.tsx b/apps/webapp/app/routes/admin.queue-metrics.tsx new file mode 100644 index 00000000000..883a3bdb2d3 --- /dev/null +++ b/apps/webapp/app/routes/admin.queue-metrics.tsx @@ -0,0 +1,181 @@ +import { useFetcher, useRevalidator } from "@remix-run/react"; +import { json } from "@remix-run/server-runtime"; +import { useEffect, useState } from "react"; +import { typedjson, useTypedLoaderData } from "remix-typedjson"; +import { z } from "zod"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { Header1, Header2 } from "~/components/primitives/Headers"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { dashboardAction, dashboardLoader } from "~/services/routeBuilders/dashboardBuilder"; +import { + probeQueueMetricsStreams, + readQueueMetricsControls, + writeQueueMetricsControls, +} from "~/v3/queueMetrics.server"; + +export const loader = dashboardLoader({ authorization: { requireSuper: true } }, async () => { + const [controls, streams] = await Promise.all([ + readQueueMetricsControls(), + probeQueueMetricsStreams(), + ]); + return typedjson({ controls, streams }); +}); + +const BodySchema = z.object({ + enabled: z.boolean().optional(), + sampleRate: z.number().min(0).max(1).optional(), +}); + +export const action = dashboardAction( + { authorization: { requireSuper: true } }, + async ({ request }) => { + let body: unknown; + try { + body = await request.json(); + } catch { + return json({ error: "Invalid JSON body" }, { 
status: 400 }); + } + const parsed = BodySchema.safeParse(body); + if (!parsed.success) { + return json({ error: "Invalid payload" }, { status: 400 }); + } + await writeQueueMetricsControls(parsed.data); + return json({ success: true }); + } +); + +export default function AdminQueueMetricsRoute() { + const { controls, streams } = useTypedLoaderData(); + const saveFetcher = useFetcher<{ success?: boolean; error?: string }>(); + const revalidator = useRevalidator(); + + const [enabled, setEnabled] = useState(controls.enabled); + const [sampleRate, setSampleRate] = useState(String(controls.sampleRate)); + const [error, setError] = useState(null); + + useEffect(() => { + setEnabled(controls.enabled); + setSampleRate(String(controls.sampleRate)); + }, [controls.enabled, controls.sampleRate]); + + useEffect(() => { + if (saveFetcher.data?.success) { + setError(null); + revalidator.revalidate(); + } else if (saveFetcher.data?.error) { + setError(saveFetcher.data.error); + } + }, [saveFetcher.data]); + + const isSaving = saveFetcher.state === "submitting"; + + const handleSave = () => { + const rate = Number(sampleRate); + if (!Number.isFinite(rate) || rate < 0 || rate > 1) { + setError("Sample rate must be a number between 0 and 1"); + return; + } + saveFetcher.submit(JSON.stringify({ enabled, sampleRate: rate }), { + method: "POST", + encType: "application/json", + }); + }; + + const totalLag = streams.reduce((sum, s) => sum + (s.lag ?? 0), 0); + const lagUnknownCount = streams.filter((s) => s.lag === null).length; + + return ( +
+
+ Queue metrics ingest + + Live controls for the queue-metrics ingest pipeline on the run-queue Redis. Changes take + effect within ~10s across all instances (no redeploy). Watch EngineCPU on the run-queue + Redis when enabling or raising the sample rate. + + +
+ Controls + +
+ + setSampleRate(e.target.value)} + className="w-32 rounded border border-grid-bright bg-charcoal-750 px-2 py-1 text-text-bright" + /> +
+ {error && {error}} +
+ +
+
+ +
+
+ Stream health{totalLag > 0 ? ` — lag ${totalLag}` : ""} + +
+ + Depth = entries buffered in the shard stream; Lag = entries not yet delivered to the + consumer group (rising = consumer falling behind; "unknown" = entries were trimmed + past the group, i.e. data was lost); Pending = unacked entries. Gauges and counters + share one stream family on the metrics Redis. + + {lagUnknownCount > 0 && ( + + Lag is unknown on {lagUnknownCount} shard{lagUnknownCount === 1 ? "" : "s"}: entries + were trimmed past the consumer group's read position, so stream data was lost. Check + consumer health. + + )} + + + + + + + + + + + + {streams.map((s) => ( + + + + + + + + ))} + +
StreamShardDepthLagPending
{s.stream}{s.shard}{s.depth}{s.lag ?? "unknown"}{s.pending}
+
+
+
+ ); +} diff --git a/apps/webapp/app/routes/admin.tsx b/apps/webapp/app/routes/admin.tsx index a95b016ca5b..7d24fe312fa 100644 --- a/apps/webapp/app/routes/admin.tsx +++ b/apps/webapp/app/routes/admin.tsx @@ -38,6 +38,10 @@ export default function Page() { label: "Global Feature Flags", to: "/admin/feature-flags", }, + { + label: "Queue Metrics", + to: "/admin/queue-metrics", + }, { label: "Notifications", to: "/admin/notifications", diff --git a/apps/webapp/app/v3/queueMetrics.server.ts b/apps/webapp/app/v3/queueMetrics.server.ts new file mode 100644 index 00000000000..971b0aabd83 --- /dev/null +++ b/apps/webapp/app/v3/queueMetrics.server.ts @@ -0,0 +1,240 @@ +import { type ClickHouse, type QueueMetricsRawV1Input } from "@internal/clickhouse"; +import { + allStreamKeys, + CachedRedisFlag, + CachedRedisNumber, + MetricsStreamConsumer, + MetricsStreamEmitter, + probeShardStates, + type MetricDefinition, + type ShardState, + type StreamEntry, +} from "@internal/metrics-pipeline"; +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import os from "node:os"; +import { env } from "~/env.server"; +import { getDefaultClickhouseClient } from "~/services/clickhouse/clickhouseFactory.server"; +import { logger } from "~/services/logger.server"; +import { signalsEmitter } from "~/services/signals.server"; +import { singleton } from "~/utils/singleton"; +import { mapEntryToRow, QueueNameLimiter } from "./queueMetricsMapping"; +import { meter } from "./tracer.server"; + +const FLAG_KEY = "queue_metrics:enabled"; +const SAMPLE_RATE_KEY = "queue_metrics:gauge_sample_rate"; +const TRUTHY = new Set(["1", "true", "on", "enabled", "yes"]); + +// Same physical Redis as the RunQueue (host/port/auth). Stream keys are kept out of the +// keyPrefix on every access path, so only the connection details matter here. +function runQueueRedisOptions(): RedisOptions { + return { + port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? 
undefined, + host: env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? undefined, + username: env.RUN_ENGINE_RUN_QUEUE_REDIS_USERNAME ?? undefined, + password: env.RUN_ENGINE_RUN_QUEUE_REDIS_PASSWORD ?? undefined, + enableAutoPipelining: true, + ...(env.RUN_ENGINE_RUN_QUEUE_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }; +} + +// Metrics stream Redis: a dedicated instance when QUEUE_METRICS_REDIS_HOST is set (so the +// metrics backlog never competes with the run queue), else the run-queue Redis. Carries BOTH +// gauges and counters — gauges are read inside the queue-op Lua and returned on the reply, +// then XADDed here by Node, so the run-queue Redis holds no metrics stream. +function metricsRedisOptions(): RedisOptions { + if (!env.QUEUE_METRICS_REDIS_HOST) return runQueueRedisOptions(); + return { + host: env.QUEUE_METRICS_REDIS_HOST, + port: env.QUEUE_METRICS_REDIS_PORT ?? undefined, + username: env.QUEUE_METRICS_REDIS_USERNAME ?? undefined, + password: env.QUEUE_METRICS_REDIS_PASSWORD ?? undefined, + enableAutoPipelining: true, + ...(env.QUEUE_METRICS_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }; +} + +// One stream family on the metrics Redis carrying both gauge snapshots and cumulative +// counter readings; one consumer group reads it. +function metricsDefinition(): MetricDefinition { + return { + name: "queue_metrics", + shardCount: env.QUEUE_METRICS_STREAM_SHARD_COUNT, + consumerGroup: "queue_metrics_cg", + maxLen: env.QUEUE_METRICS_COUNTER_STREAM_MAXLEN, + }; +} + +// Dedicated client for the admin read/write/probe surface — works regardless of whether +// this instance runs the emitter/consumer. keyPrefix unset to match the raw control keys. 
/**
 * Redis client for the control keys (`FLAG_KEY`, `SAMPLE_RATE_KEY`).
 * Built from the run-queue connection options with `keyPrefix` cleared, and
 * obtained through `singleton` under the name "queueMetricsAdminRedis".
 */
function adminRedis(): Redis {
  return singleton("queueMetricsAdminRedis", () =>
    createRedisClient(
      { ...runQueueRedisOptions(), keyPrefix: undefined },
      { onError: (error) => logger.error("queue metrics admin redis error", { error }) }
    )
  );
}

/**
 * Redis client for probing the metrics streams. Same construction as
 * `adminRedis`, but on the metrics Redis connection options.
 */
function metricsAdminRedis(): Redis {
  return singleton("queueMetricsCounterAdminRedis", () =>
    createRedisClient(
      { ...metricsRedisOptions(), keyPrefix: undefined },
      { onError: (error) => logger.error("queue metrics counter admin redis error", { error }) }
    )
  );
}

/** Snapshot of the live control keys as returned by `readQueueMetricsControls`. */
export type QueueMetricsControls = {
  /** True when the flag key holds one of the TRUTHY spellings. */
  enabled: boolean;
  /** True when the flag key exists at all. */
  enabledKeySet: boolean;
  /** Effective gauge sample rate, clamped to [0, 1]. */
  sampleRate: number;
  /** True when the sample-rate key exists at all. */
  sampleRateKeySet: boolean;
  /** The env-configured rate used when the key is unset or not a finite number. */
  sampleRateDefault: number;
};

/**
 * Reads both control keys with a single MGET and normalises them.
 *
 * @returns the enabled flag and sample rate, plus whether each key was present.
 */
export async function readQueueMetricsControls(): Promise<QueueMetricsControls> {
  const [enabledRaw, rateRaw] = (await adminRedis().mget(FLAG_KEY, SAMPLE_RATE_KEY)) as (
    | string
    | null
  )[];
  const sampleRateDefault = env.QUEUE_METRICS_GAUGE_SAMPLE_RATE;
  // A missing key becomes NaN so it takes the same fallback path as an unparsable value.
  const parsed = rateRaw == null ? Number.NaN : Number(rateRaw);
  return {
    enabled: enabledRaw != null && TRUTHY.has(enabledRaw.trim().toLowerCase()),
    enabledKeySet: enabledRaw != null,
    sampleRate: Number.isFinite(parsed) ? Math.min(1, Math.max(0, parsed)) : sampleRateDefault,
    sampleRateKeySet: rateRaw != null,
    sampleRateDefault,
  };
}

/**
 * Writes whichever control values are present in `update`; omitted fields are left untouched.
 *
 * @param update.enabled stored as "1" / "0" under the flag key.
 * @param update.sampleRate clamped to [0, 1] before being stored.
 */
export async function writeQueueMetricsControls(update: {
  enabled?: boolean;
  sampleRate?: number;
}): Promise<void> {
  const client = adminRedis();
  const ops: Promise<unknown>[] = [];
  if (update.enabled !== undefined) {
    ops.push(client.set(FLAG_KEY, update.enabled ? "1" : "0"));
  }
  if (update.sampleRate !== undefined) {
    ops.push(client.set(SAMPLE_RATE_KEY, String(Math.min(1, Math.max(0, update.sampleRate)))));
  }
  await Promise.all(ops);
}

/** A shard probe result tagged with the stream family it belongs to. */
export type LabeledShardState = ShardState & { stream: "queue_metrics" };

/**
 * Probes every shard stream of the metrics definition for its consumer group.
 *
 * @returns one labelled state per shard stream key.
 */
export async function probeQueueMetricsStreams(): Promise<LabeledShardState[]> {
  const def = metricsDefinition();
  const states = await probeShardStates(
    metricsAdminRedis(),
    allStreamKeys(def),
    def.consumerGroup
  );
  return states.map((s) => ({ ...s, stream: "queue_metrics" as const }));
}

/** Injected into the RunQueue when QUEUE_METRICS_EMIT_ENABLED=1; emits only while the flag is on. */
export function getQueueMetricsEmitter(): MetricsStreamEmitter {
  return singleton("queueMetricsEmitter", () => {
    // Control keys stay on the run-queue Redis (the admin surface + docs point there).
    const controlRedis = runQueueRedisOptions();
    const flag = new CachedRedisFlag({ redis: controlRedis, key: FLAG_KEY, cacheTtlMs: 10_000 });
    // Live-tunable (Redis key, 10s cache); the env value is the default when the key is unset.
    const gaugeSampleRate = new CachedRedisNumber({
      redis: controlRedis,
      key: SAMPLE_RATE_KEY,
      defaultValue: env.QUEUE_METRICS_GAUGE_SAMPLE_RATE,
      min: 0,
      max: 1,
      cacheTtlMs: 10_000,
    });
    return new MetricsStreamEmitter({
      redis: metricsRedisOptions(),
      definition: metricsDefinition(),
      flag,
      meter,
      gaugeSampleRate,
      // The env value is in seconds; the emitter option is in milliseconds.
      counterOdometerTtlMs: env.QUEUE_METRICS_COUNTER_ODOMETER_TTL_SECONDS * 1000,
    });
  });
}

// Shared limiter handed to every mapEntry call, capped by the per-env setting.
const queueNameLimiter = singleton(
  "queueMetricsQueueNameLimiter",
  () => new QueueNameLimiter(env.QUEUE_METRICS_MAX_QUEUE_NAMES_PER_ENV)
);

/** Maps one stream entry to a ClickHouse row (or null), applying the shared queue-name limiter. */
function mapEntry(entry: StreamEntry): QueueMetricsRawV1Input | null {
  return mapEntryToRow(entry, queueNameLimiter);
}

/**
 * Builds the insert callback given to the stream consumer.
 *
 * @returns a function that inserts `rows` with `dedupToken` as the insert
 *   deduplication token, and throws when the insert reports an error.
 */
function makeInsert(): (
  rows: QueueMetricsRawV1Input[],
  opts: { dedupToken: string }
) => Promise<void> {
  const ch: ClickHouse = getDefaultClickhouseClient();
  const insertRaw = ch.queueMetrics.insertRaw;
  return async (rows, { dedupToken }) => {
    const [error] = await insertRaw(rows, {
      params: {
        clickhouse_settings: {
          insert_deduplication_token: dedupToken,
          async_insert: 0,
          // Propagate the token through the MV so a raw-deduped retry can't leave
          // queue_metrics_v1 short when the MV insert failed on the first attempt.
          deduplicate_blocks_in_dependent_materialized_views: 1,
        },
      },
    });
    if (error) throw error;
  };
}

/** Returns the consumer list (currently a single consumer), created once via `singleton`. */
function getQueueMetricsConsumers(): MetricsStreamConsumer[] {
  return singleton("queueMetricsConsumers", () => {
    const insert = makeInsert();
    return [
      new MetricsStreamConsumer({
        // Host name plus pid as the consumer name within the group.
        consumerName: `${os.hostname()}-${process.pid}`,
        batchSize: env.QUEUE_METRICS_CONSUMER_BATCH_SIZE,
        meter,
        mapEntry,
        insert,
        redis: metricsRedisOptions(),
        definition: metricsDefinition(),
      }),
    ];
  });
}

// Construct the emitter at boot (not lazily on the first enqueue) so its flag has warmed
// before any traffic — otherwise the first op after boot reads the default and is dropped.
/** Builds the queue-metrics emitter up front, but only when QUEUE_METRICS_EMIT_ENABLED is "1". */
export function initQueueMetricsEmitter(): void {
  const emitEnabled = env.QUEUE_METRICS_EMIT_ENABLED === "1";
  if (emitEnabled) {
    getQueueMetricsEmitter();
  }
}

declare global {
  // eslint-disable-next-line no-var
  var __queueMetricsConsumerRegistered__: boolean | undefined;
}

/**
 * Starts the queue-metrics consumers when QUEUE_METRICS_CONSUMER_ENABLED is "1".
 * A global flag makes repeat calls in the same process a no-op. Stop handlers are
 * registered for SIGTERM and SIGINT before the consumers are started.
 */
export function initQueueMetricsConsumer(): void {
  const consumerEnabled = env.QUEUE_METRICS_CONSUMER_ENABLED === "1";
  if (!consumerEnabled || global.__queueMetricsConsumerRegistered__) return;
  global.__queueMetricsConsumerRegistered__ = true;

  const consumers = getQueueMetricsConsumers();

  // Stop every consumer; a failure is logged rather than rethrown.
  const stopAll = () => {
    const stopping = consumers.map((consumer) => consumer.stop());
    return Promise.all(stopping).catch((error) =>
      logger.error("queue metrics consumer stop failed", { error })
    );
  };
  signalsEmitter.on("SIGTERM", stopAll);
  signalsEmitter.on("SIGINT", stopAll);

  // Fire-and-forget start: the outcome is only logged.
  const starting = consumers.map((consumer) => consumer.start());
  Promise.all(starting)
    .then(() => logger.info("Queue metrics consumer started"))
    .catch((error) => logger.error("queue metrics consumers failed to start", { error }));
}
diff --git a/apps/webapp/app/v3/queueMetricsMapping.ts b/apps/webapp/app/v3/queueMetricsMapping.ts
new file mode 100644
index 00000000000..afc30b5aaf7
--- /dev/null
+++ b/apps/webapp/app/v3/queueMetricsMapping.ts
@@ -0,0 +1,101 @@
import { type QueueMetricsRawV1Input } from "@internal/clickhouse";
import { entryOrderKey, entryTimeMs, type StreamEntry } from "@internal/metrics-pipeline";

const OPS = new Set(["gauge", "enqueue", "started", "ack", "nack", "dlq"]);

// {org:ORGID}:proj:PROJECTID:env:ENVID:queue:QUEUENAME[:ck:CK]. Anchored (not a
// positional split) so a queue name containing ":" survives; the lazy name capture
// stops before an optional ":ck:" suffix.
+const DESCRIPTOR = /^\{org:([^}]+)\}:proj:([^:]+):env:([^:]+):queue:(.+?)(?::ck:.+)?$/;
+
+/**
+ * Parses a queue descriptor of the form
+ * `{org:ORG}:proj:PROJECT:env:ENV:queue:NAME[:ck:CK]` into its tenant ids and
+ * queue name. A `:ck:` suffix, when present, is dropped from the name.
+ *
+ * @param q - Raw descriptor string.
+ * @returns The parsed parts, or null when `q` does not match the descriptor shape.
+ */
+export function descriptorFromQueue(q: string): {
+  organization_id: string;
+  project_id: string;
+  environment_id: string;
+  queue_name: string;
+} | null {
+  const match = DESCRIPTOR.exec(q);
+  if (!match) return null;
+  // Groups 1-4 are mandatory in DESCRIPTOR, so they are defined whenever it matches.
+  return {
+    organization_id: match[1]!,
+    project_id: match[2]!,
+    environment_id: match[3]!,
+    queue_name: match[4]!,
+  };
+}
+
+/** Queue name returned by QueueNameLimiter for names beyond the per-environment cap. */
+export const OVERFLOW_QUEUE_NAME = "__overflow__";
+
+/**
+ * Bounds per-environment queue_name cardinality (queue_name is user-controlled and is a
+ * GROUP BY key in the aggregated table). Names beyond the cap map to OVERFLOW_QUEUE_NAME.
+ * Per-process and reset on restart, so the cap is approximate: a protective bound, not a quota.
+ */
+export class QueueNameLimiter {
+  // environment id -> queue names admitted so far. A Map iterates in insertion order,
+  // so its first key is the environment that was added earliest.
+  private readonly byEnv = new Map<string, Set<string>>();
+
+  /**
+   * @param maxPerEnv - Distinct queue names admitted per environment; a value `<= 0`
+   *   disables limiting entirely.
+   * @param maxEnvs - Environments tracked at once. When a new environment arrives at
+   *   capacity, the earliest-inserted one is evicted and its admitted names are forgotten.
+   */
+  constructor(
+    private readonly maxPerEnv: number,
+    private readonly maxEnvs = 10_000
+  ) {}
+
+  /**
+   * Returns `queueName` when it is already admitted for `environmentId` or there is
+   * still room under the cap (admitting it); otherwise returns OVERFLOW_QUEUE_NAME.
+   */
+  limit(environmentId: string, queueName: string): string {
+    if (this.maxPerEnv <= 0) return queueName;
+    let names = this.byEnv.get(environmentId);
+    if (!names) {
+      if (this.byEnv.size >= this.maxEnvs) {
+        const oldest = this.byEnv.keys().next().value;
+        if (oldest !== undefined) this.byEnv.delete(oldest);
+      }
+      names = new Set<string>();
+      this.byEnv.set(environmentId, names);
+    }
+    if (names.has(queueName)) return queueName;
+    if (names.size >= this.maxPerEnv) return OVERFLOW_QUEUE_NAME;
+    names.add(queueName);
+    return queueName;
+  }
+}
+
+/**
+ * Parses a stream field as a finite number.
+ *
+ * @returns undefined for a missing field, a blank or whitespace-only string, or any
+ *   value that does not convert to a finite number.
+ */
+function num(value: string | undefined): number | undefined {
+  // Number("") and Number("   ") are 0, so a blank field has to be rejected up front
+  // or it would be recorded as a genuine zero reading.
+  if (value == null || value.trim() === "") return undefined;
+  const n = Number(value);
+  return Number.isFinite(n) ? n : undefined;
+}
+
+/**
+ * Maps one stream entry to a raw queue-metrics row.
+ *
+ * @param entry - Stream entry; `fields.op` selects the row kind and `fields.q` carries
+ *   the queue descriptor.
+ * @param limiter - Optional cardinality limiter applied to the parsed queue name.
+ * @returns The row, or null when `op` is missing or not in OPS, `q` is missing, or `q`
+ *   does not parse as a descriptor.
+ */
+export function mapEntryToRow(
+  entry: StreamEntry,
+  limiter?: QueueNameLimiter
+): QueueMetricsRawV1Input | null {
+  const f = entry.fields;
+  const op = f.op;
+  if (!op || !OPS.has(op) || !f.q) return null;
+  const descriptor = descriptorFromQueue(f.q);
+  if (!descriptor || !descriptor.queue_name) return null;
+  if (limiter) {
+    descriptor.queue_name = limiter.limit(descriptor.environment_id, descriptor.queue_name);
+  }
+
+  // Fall back to the current time when entryTimeMs yields no value for the id.
+  const eventMs = entryTimeMs(entry.id) ?? Date.now();
+  const row: QueueMetricsRawV1Input = {
+    ...descriptor,
+    // toISOString() is UTC; cut at seconds and swap "T" for a space: "YYYY-MM-DD HH:MM:SS".
+    event_time: new Date(eventMs).toISOString().slice(0, 19).replace("T", " "),
+    op: op as QueueMetricsRawV1Input["op"],
+  };
+
+  if (op === "gauge") {
+    row.queued = num(f.ql);
+    row.running = num(f.cc);
+    row.queue_limit = num(f.lim);
+    row.env_queued = num(f.eql);
+    row.env_running = num(f.ec);
+    row.env_limit = num(f.elim);
+    row.throttled = num(f.thr);
+  } else {
+    // Counter op: the monotonic odometer reading + its ordering key (and wait on started).
+    row.cumulative = num(f.cum);
+    row.order_key = entryOrderKey(entry.id);
+    if (op === "started" && f.wait != null) row.wait_ms = num(f.wait);
+  }
+  return row;
+}
diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts
index 4d9e263d6be..85986933290 100644
--- a/apps/webapp/app/v3/runEngine.server.ts
+++ b/apps/webapp/app/v3/runEngine.server.ts
@@ -7,6 +7,7 @@ import { logger } from "~/services/logger.server";
 import { defaultMachine, getCurrentPlan } from "~/services/platform.v3.server";
 import { singleton } from "~/utils/singleton";
 import { allMachines } from "./machinePresets.server";
+import { getQueueMetricsEmitter } from "./queueMetrics.server";
 import { runEnginePendingVersionLookup } from "./runEnginePendingVersionLookup.server";
 import { pickRunOpsStoreForCompletion } from "./runOpsMigration/crossSeamGuard.server";
 import { runEngineControlPlaneResolver } from "./runOpsMigration/runEngineControlPlaneResolver.server";
@@ -83,6 +84,7 @@ function createRunEngine() {
       tracer,
     },
     shardCount: env.RUN_ENGINE_RUN_QUEUE_SHARD_COUNT,
+    queueMetrics: env.QUEUE_METRICS_EMIT_ENABLED === "1" ? 
getQueueMetricsEmitter() : undefined, processWorkerQueueDebounceMs: env.RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS, dequeueBlockingTimeoutSeconds: env.RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS, masterQueueConsumersIntervalMs: env.RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS, diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 643093624b4..90dc92447f7 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -17,6 +17,7 @@ "typecheck": "cross-env NODE_OPTIONS=\"--max-old-space-size=8192\" tsc --noEmit -p ./tsconfig.check.json", "db:seed": "tsx seed.ts", "db:seed:ai-spans": "tsx seed-ai-spans.mts", + "db:seed:queue-metrics": "tsx seed-queue-metrics.mts", "upload:sourcemaps": "bash ./upload-sourcemaps.sh", "test": "vitest --no-file-parallelism", "eval:dev": "evalite watch" @@ -57,6 +58,7 @@ "@internal/dashboard-agent": "workspace:*", "@internal/dashboard-agent-db": "workspace:*", "@internal/llm-model-catalog": "workspace:*", + "@internal/metrics-pipeline": "workspace:*", "@internal/redis": "workspace:*", "@internal/run-engine": "workspace:*", "@internal/run-ops-database": "workspace:*", diff --git a/apps/webapp/test/queueMetricsMapping.test.ts b/apps/webapp/test/queueMetricsMapping.test.ts new file mode 100644 index 00000000000..44a5d6bf7c5 --- /dev/null +++ b/apps/webapp/test/queueMetricsMapping.test.ts @@ -0,0 +1,125 @@ +import { describe, expect, it } from "vitest"; +import { + descriptorFromQueue, + mapEntryToRow, + OVERFLOW_QUEUE_NAME, + QueueNameLimiter, +} from "~/v3/queueMetricsMapping"; + +describe("descriptorFromQueue", () => { + it("parses a plain descriptor", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/my-task")).toEqual({ + organization_id: "o1", + project_id: "p1", + environment_id: "e1", + queue_name: "task/my-task", + }); + }); + + it("strips a concurrency-key suffix", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:task/t:ck:tenant-3")).toEqual( + 
expect.objectContaining({ queue_name: "task/t" }) + ); + }); + + it("keeps colons inside the queue name", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:my:odd:queue")).toEqual( + expect.objectContaining({ queue_name: "my:odd:queue" }) + ); + }); + + it("keeps colons in the name while stripping a real ck suffix", () => { + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1:queue:a:b:ck:t9")).toEqual( + expect.objectContaining({ queue_name: "a:b" }) + ); + }); + + it("rejects malformed descriptors", () => { + expect(descriptorFromQueue("not-a-descriptor")).toBeNull(); + expect(descriptorFromQueue("{org:o1}:proj:p1:env:e1")).toBeNull(); + expect(descriptorFromQueue("")).toBeNull(); + }); +}); + +describe("QueueNameLimiter", () => { + it("passes names through under the cap and overflows past it, per env", () => { + const limiter = new QueueNameLimiter(2); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env1", "b")).toBe("b"); + expect(limiter.limit("env1", "c")).toBe(OVERFLOW_QUEUE_NAME); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env2", "c")).toBe("c"); + }); + + it("is unlimited when the cap is 0", () => { + const limiter = new QueueNameLimiter(0); + for (let i = 0; i < 100; i++) { + expect(limiter.limit("env1", `q${i}`)).toBe(`q${i}`); + } + }); + + it("evicts the oldest env when the env map is full", () => { + const limiter = new QueueNameLimiter(1, 2); + expect(limiter.limit("env1", "a")).toBe("a"); + expect(limiter.limit("env2", "a")).toBe("a"); + expect(limiter.limit("env3", "a")).toBe("a"); + expect(limiter.limit("env1", "b")).toBe("b"); + }); +}); + +describe("mapEntryToRow", () => { + const q = "{org:o1}:proj:p1:env:e1:queue:task/t"; + + it("maps a gauge entry with numeric fields", () => { + const row = mapEntryToRow({ + id: "1700000000000-0", + fields: { op: "gauge", q, ql: "5", cc: "2", lim: "10", eql: "7", ec: "3", elim: "20", thr: "1" }, + }); + expect(row).toEqual( + 
expect.objectContaining({ + op: "gauge", + organization_id: "o1", + queue_name: "task/t", + queued: 5, + running: 2, + queue_limit: 10, + env_queued: 7, + env_running: 3, + env_limit: 20, + throttled: 1, + }) + ); + expect(row!.event_time).toBe("2023-11-14 22:13:20"); + }); + + it("maps started with wait_ms + cumulative and drops unknown ops", () => { + const started = mapEntryToRow({ + id: "1700000000000-0", + fields: { op: "started", q, wait: "48", cum: "512" }, + }); + expect(started).toEqual( + expect.objectContaining({ + op: "started", + wait_ms: 48, + cumulative: 512, + order_key: 1700000000000 * 100000, + }) + ); + expect(mapEntryToRow({ id: "1-0", fields: { op: "ack", q, cum: "9" } })).toEqual( + expect.objectContaining({ op: "ack", cumulative: 9 }) + ); + expect(mapEntryToRow({ id: "1-0", fields: { op: "bogus", q } })).toBeNull(); + expect(mapEntryToRow({ id: "1-0", fields: { op: "ack" } })).toBeNull(); + }); + + it("applies the queue-name limiter", () => { + const limiter = new QueueNameLimiter(1); + const first = mapEntryToRow({ id: "1-0", fields: { op: "ack", q } }, limiter); + expect(first!.queue_name).toBe("task/t"); + const second = mapEntryToRow( + { id: "1-1", fields: { op: "ack", q: "{org:o1}:proj:p1:env:e1:queue:task/other" } }, + limiter + ); + expect(second!.queue_name).toBe(OVERFLOW_QUEUE_NAME); + }); +}); From 305e9a6d7d79b5108063fc9b0ed22755a344292e Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:30:59 +0100 Subject: [PATCH 05/37] feat(tsql): opt-in gap-fill for time-bucketed series --- .../clickhouse/src/client/tsql.ts | 6 + internal-packages/tsql/src/index.ts | 7 + .../tsql/src/query/printer.test.ts | 141 +++++++++++++ internal-packages/tsql/src/query/printer.ts | 191 +++++++++++++++++- .../tsql/src/query/printer_context.ts | 16 +- internal-packages/tsql/src/query/schema.ts | 6 + 6 files changed, 362 insertions(+), 5 deletions(-) diff --git a/internal-packages/clickhouse/src/client/tsql.ts 
b/internal-packages/clickhouse/src/client/tsql.ts index c712820812f..ddf1d059b97 100644 --- a/internal-packages/clickhouse/src/client/tsql.ts +++ b/internal-packages/clickhouse/src/client/tsql.ts @@ -108,6 +108,11 @@ export interface ExecuteTSQLOptions { * based on the span of the time range. */ timeRange?: TimeRange; + /** + * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query + * (counters zero-fill, gauges carry forward). Off by default. + */ + fillGaps?: boolean; } /** @@ -192,6 +197,7 @@ export async function executeTSQL( fieldMappings: options.fieldMappings, whereClauseFallback: options.whereClauseFallback, timeRange: options.timeRange, + fillGaps: options.fillGaps, }); generatedSql = sql; diff --git a/internal-packages/tsql/src/index.ts b/internal-packages/tsql/src/index.ts index 1d8759c108c..6941dde6079 100644 --- a/internal-packages/tsql/src/index.ts +++ b/internal-packages/tsql/src/index.ts @@ -541,6 +541,12 @@ export interface CompileTSQLOptions { * ``` */ timeRange?: TimeRange; + /** + * Opt-in: emit rows for empty time buckets in a top-level time-bucketed query. + * Counters zero-fill, gauges (columns with `fillMode: "carry"`) carry forward. + * Off by default; output is unchanged when not set. + */ + fillGaps?: boolean; } /** @@ -599,6 +605,7 @@ export function compileTSQL(query: string, options: CompileTSQLOptions): PrintRe fieldMappings: options.fieldMappings, enforcedWhereClause, timeRange: options.timeRange, + fillGaps: options.fillGaps, }); // 6. 
Print the AST to ClickHouse SQL (enforced conditions applied at printer level) diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index 0efa0d34fc4..f8f32edeac5 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -3831,3 +3831,144 @@ describe("timeBucket()", () => { }); }); }); + +// ============================================================ +// fillGaps Tests +// ============================================================ + +describe("timeBucket() fillGaps", () => { + // Schema with a gauge column (fillMode: "carry"), a counter, and a groupable dim. + const metricsSchema: TableSchema = { + name: "metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") }, + queue_name: { name: "queue_name", ...column("String") }, + max_running: { name: "max_running", ...column("UInt64"), fillMode: "carry" }, + enqueued: { name: "enqueued", ...column("UInt64"), fillMode: "zero" }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + // 7-day range -> 6 HOUR buckets (same as the timeBucket() block). 
+ const sevenDayRange = { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }; + + function ctx(fillGaps: boolean): PrinterContext { + return createPrinterContext({ + schema: createSchemaRegistry([metricsSchema]), + enforcedWhereClause: { + organization_id: { op: "eq", value: "org_test123" }, + project_id: { op: "eq", value: "proj_test456" }, + environment_id: { op: "eq", value: "env_test789" }, + }, + timeRange: sevenDayRange, + fillGaps, + }); + } + + function run(query: string, fillGaps: boolean) { + const context = ctx(fillGaps); + const result = printToClickHouse(parseTSQLSelect(query), context); + return { ...result, warnings: context.warnings }; + } + + it("emits no WITH FILL when fillGaps is off (unchanged)", () => { + const query = + "SELECT timeBucket(), max(max_running), count() FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql } = run(query, false); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("single-series gauge + counter: WITH FILL plus INTERPOLATE for the gauge only", () => { + const query = + "SELECT timeBucket(), max(max_running) AS max_running, count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql, params } = run(query, true); + + // STEP matches the 6 HOUR bucket interval, FROM/TO snapped + parameterized. + expect(sql).toContain("WITH FILL FROM toStartOfInterval({"); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).toMatch(/TO toStartOfInterval\(\{[^}]+: DateTime64\(6\)\}, INTERVAL 6 HOUR\)/); + + // Gauge carried forward; counter omitted (defaults to 0). + expect(sql).toContain("INTERPOLATE (max_running AS max_running)"); + expect(sql).not.toContain("runs AS runs"); + + // FROM/TO bounds are real parameters carrying the time range. 
+ const dateParams = Object.values(params).filter((v) => v instanceof Date); + expect(dateParams).toContainEqual(sevenDayRange.from); + expect(dateParams).toContainEqual(sevenDayRange.to); + }); + + it("single-series counter only: WITH FILL but no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket"; + const { sql } = run(query, true); + expect(sql).toContain("WITH FILL FROM toStartOfInterval({"); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("grouped counter only: group dim first, then WITH FILL, no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), queue_name, count() AS runs FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket"; + const { sql } = run(query, true); + expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).not.toContain("INTERPOLATE"); + }); + + it("grouped + carry gauge: per-group LOCF via window functions, no INTERPOLATE", () => { + const query = + "SELECT timeBucket(), queue_name, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, queue_name ORDER BY timeBucket"; + const { sql, warnings } = run(query, true); + + // Inner query densifies per group (dims first, then the bucket WITH FILL) + sentinel. + expect(sql).toMatch(/ORDER BY queue_name, timebucket ASC WITH FILL/); + expect(sql).toContain("STEP INTERVAL 6 HOUR"); + expect(sql).toContain("1 AS __tsql_present"); + + // Block id increments at each real row, partitioned by the group dim. + expect(sql).toContain( + "sum(__tsql_present) OVER (PARTITION BY queue_name ORDER BY timebucket ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS __tsql_block" + ); + + // Gauge carried within each (group, block); never INTERPOLATE (which bleeds across groups). 
+ expect(sql).toContain( + "max(if(__tsql_present = 1, max_running, NULL)) OVER (PARTITION BY queue_name, __tsql_block) AS max_running" + ); + expect(sql).not.toContain("INTERPOLATE"); + + // Final result re-ordered by the user's ORDER BY, and not skipped. + expect(sql).toMatch(/\)\s*ORDER BY timebucket ASC$/); + expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(false); + }); + + it("grouped + carry gauge with a non-plain group dim: fill is skipped", () => { + const query = + "SELECT timeBucket(), upper(queue_name) AS q, max(max_running) AS max_running FROM metrics GROUP BY timeBucket, upper(queue_name) ORDER BY timeBucket"; + const { sql, warnings } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("__tsql_block"); + expect(warnings.some((w) => w.code === "fill_skipped_grouped_gauge")).toBe(true); + }); + + it("user ORDER BY not led by timeBucket: fill is skipped", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY runs DESC"; + const { sql } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + }); +}); diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 82d97f5491b..f65e10dc80b 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -459,6 +459,25 @@ export class ClickHousePrinter { this.inProjectionContext = false; } + // Opt-in gap-fill: emit rows for empty time buckets via WITH FILL / INTERPOLATE. + // No-op unless enabled, top-level, and the query is fill-eligible. 
+    let interpolateClause: string | null = null;
+    let groupedFillWrap: ((inner: string) => string) | null = null;
+    if (this.context.fillGaps && isTopLevelQuery) {
+      const fill = this.buildGapFill(node, orderBy, groupBy);
+      if (fill) {
+        orderBy = fill.orderBy;
+        if (fill.kind === "inline") {
+          interpolateClause = fill.interpolate;
+        } else {
+          // Grouped per-group LOCF: add the `present` sentinel to this (now inner) query
+          // and wrap the rendered SQL in the block-id + carry window layers below.
+          columns.push(fill.presentColumn);
+          groupedFillWrap = fill.wrap;
+        }
+      }
+    }
+
     // Process ARRAY JOIN
     let arrayJoin = "";
     if (node.array_join_op) {
@@ -487,6 +506,8 @@ export class ClickHousePrinter {
       having ? `HAVING${space}${having}` : null,
       windowClause ? `WINDOW${space}${windowClause}` : null,
       orderBy && orderBy.length > 0 ? `ORDER BY${space}${orderBy.join(comma)}` : null,
+      // INTERPOLATE must follow the full ORDER BY (including WITH FILL)
+      interpolateClause,
     ];
 
     // Process LIMIT
@@ -549,6 +570,11 @@ export class ClickHousePrinter {
       response = this.pretty ? `(${response.trim()})` : `(${response})`;
     }
 
+    // Grouped per-group gap fill wraps this query in the block-id + carry window layers.
+    if (groupedFillWrap) {
+      response = groupedFillWrap(response);
+    }
+
     // Restore saved contexts (for nested queries)
     this.selectAliases = savedAliases;
     this.queryHasGroupBy = savedQueryHasGroupBy;
@@ -559,6 +585,188 @@ export class ClickHousePrinter {
     return response;
   }
 
+  /**
+   * Build the gap-fill transformation for a top-level time-bucketed query.
+   * Returns null when the query is not fill-eligible (correct-by-construction:
+   * emit nothing extra rather than risk wrong values).
+   *
+   * Eligibility: a time range and a time-constraint table are available, exactly
+   * one timeBucket() column is in SELECT, and ORDER BY is led by that column in
+   * ascending order. A DESC lead term is skipped because the FROM < TO bounds and
+   * positive STEP emitted here are only valid for an ascending WITH FILL.
+   *
+   * Strategies:
+   * - No group dims: WITH FILL on the bucket; carry (gauge) columns are LOCF'd via
+   *   INTERPOLATE, counters are left out so they take WITH FILL's default.
+   * - Group dims, no carry columns: ORDER BY the dims, then the bucket WITH FILL.
+   * - Group dims + carry columns: INTERPOLATE would bleed across groups, so the
+   *   rendered query is wrapped in window layers that carry within each group.
+   *   Needs every GROUP BY dim to be a plain column; otherwise fill is skipped
+   *   with a `fill_skipped_grouped_gauge` warning.
+   *
+   * @param node - The SELECT being printed.
+   * @param orderBy - Printed ORDER BY terms (SQL strings such as `timebucket ASC`).
+   * @param groupBy - Printed GROUP BY terms (SQL strings).
+   * @returns `kind: "inline"` with the rewritten ORDER BY and an optional INTERPOLATE
+   *   clause; `kind: "wrap"` with the inner ORDER BY, the sentinel column to add and
+   *   a function that wraps the rendered SQL; or null to leave the query unchanged.
+   */
+  private buildGapFill(
+    node: SelectQuery,
+    orderBy: string[] | null,
+    groupBy: string[] | null
+  ):
+    | { kind: "inline"; orderBy: string[]; interpolate: string | null }
+    | { kind: "wrap"; orderBy: string[]; presentColumn: string; wrap: (inner: string) => string }
+    | null {
+    if (!orderBy || orderBy.length === 0 || !node.select || node.select.length === 0) {
+      return null;
+    }
+
+    const timeRange = this.context.timeRange;
+    if (!timeRange) {
+      return null;
+    }
+
+    // Need a time-constraint table to derive the bucket column + interval.
+    const tableWithConstraint = this.findTimeConstraintTable();
+    if (!tableWithConstraint) {
+      return null;
+    }
+    const { tableSchema, clickhouseColumnName } = tableWithConstraint;
+    const interval = calculateTimeBucketInterval(
+      timeRange.from,
+      timeRange.to,
+      tableSchema.timeBucketThresholds
+    );
+    const bucketSql = `toStartOfInterval(${escapeClickHouseIdentifier(clickhouseColumnName)}, INTERVAL ${interval.value} ${interval.unit})`;
+
+    // Find exactly one timeBucket() column in SELECT and its output alias.
+    let bucketAlias: string | null = null;
+    let bucketCount = 0;
+    for (const col of node.select) {
+      const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col;
+      if (
+        (inner as Call).expression_type === "call" &&
+        (inner as Call).name.toLowerCase() === "timebucket"
+      ) {
+        bucketCount++;
+        bucketAlias =
+          (col as Alias).expression_type === "alias" ? (col as Alias).alias : "timebucket";
+      }
+    }
+    if (bucketCount !== 1 || !bucketAlias) {
+      return null;
+    }
+
+    // ORDER BY must be led by the timeBucket column (alias or full expression).
+    // Don't fight a user ordering like `ORDER BY count DESC`.
+    const leadTerm = orderBy[0];
+    const leadExpr = leadTerm.replace(/\s+(ASC|DESC)\s*$/i, "").trim();
+    const matchesBucket = (expr: string): boolean =>
+      expr.toLowerCase() === bucketAlias!.toLowerCase() || expr === bucketSql;
+    if (!matchesBucket(leadExpr)) {
+      return null;
+    }
+
+    // The WITH FILL emitted below has ascending bounds (FROM < TO) and a positive STEP,
+    // which ClickHouse rejects on a descending sort key, and the grouped wrap's running
+    // block id assumes ascending bucket order. Skip the fill rather than emit bad SQL.
+    if (/\s+DESC\s*$/i.test(leadTerm)) {
+      return null;
+    }
+
+    // Group dims = GROUP BY expressions that are NOT the timeBucket column.
+    const groupDims = (groupBy ?? []).filter((g) => !matchesBucket(g.trim()));
+
+    // Classify each SELECT output column. Carry (gauge) columns survive through
+    // aliases + value-preserving aggregates (see analyzeSelectColumn). A bare column
+    // that isn't the bucket is a GROUP BY dimension; everything else is a counter or
+    // derived value that zero-fills.
+    const carryAliases: string[] = [];
+    const dimNames: string[] = [];
+    const orderedOutputs: Array<{ name: string; carry: boolean }> = [];
+    for (const col of node.select) {
+      const { outputName, sourceColumn } = this.analyzeSelectColumn(col);
+      if (!outputName) continue;
+      const carry = sourceColumn?.fillMode === "carry";
+      orderedOutputs.push({ name: outputName, carry });
+      if (carry) carryAliases.push(outputName);
+      const inner = (col as Alias).expression_type === "alias" ? (col as Alias).expr : col;
+      if (!matchesBucket(outputName) && (inner as Field).expression_type === "field") {
+        dimNames.push(outputName);
+      }
+    }
+
+    // Snap FROM/TO to the bucket grid and parameterize the bounds.
+    const fromBound = this.context.addValue(timeRange.from);
+    const toBound = this.context.addValue(timeRange.to);
+    const withFill =
+      `WITH FILL FROM toStartOfInterval(${fromBound}, INTERVAL ${interval.value} ${interval.unit})` +
+      ` TO toStartOfInterval(${toBound}, INTERVAL ${interval.value} ${interval.unit})` +
+      ` STEP INTERVAL ${interval.value} ${interval.unit}`;
+
+    const esc = escapeClickHouseIdentifier;
+
+    // Single series: WITH FILL on the bucket + INTERPOLATE the carry columns (LOCF);
+    // counters omitted from INTERPOLATE so they zero-fill.
+    if (groupDims.length === 0) {
+      const newOrderBy = [...orderBy];
+      newOrderBy[0] = `${leadTerm} ${withFill}`;
+      const interpolate =
+        carryAliases.length > 0
+          ? `INTERPOLATE (${carryAliases.map((a) => `${esc(a)} AS ${esc(a)}`).join(", ")})`
+          : null;
+      return { kind: "inline", orderBy: newOrderBy, interpolate };
+    }
+
+    // Grouped, counters only: per-group zero-fill via WITH FILL ordered by the dims.
+    if (carryAliases.length === 0) {
+      return {
+        kind: "inline",
+        orderBy: [...groupDims, `${leadTerm} ${withFill}`],
+        interpolate: null,
+      };
+    }
+
+    // Grouped + gauge: per-group LOCF. INTERPOLATE bleeds across groups, so densify per
+    // group (WITH FILL + a `present` sentinel that is 0 on filled rows), assign a block id
+    // that increments at each real row, then carry the block's real value via window max.
+    // Only safe when every GROUP BY dim is a plain column we can PARTITION BY.
+    if (dimNames.length !== groupDims.length) {
+      this.context.addWarning(
+        "fill_skipped_grouped_gauge",
+        "fillGaps was skipped: per-group gap fill needs every GROUP BY dimension to be a plain column."
+      );
+      return null;
+    }
+
+    const userOrderBy = [...orderBy];
+    const presentCol = "__tsql_present";
+    const blockCol = "__tsql_block";
+    const partitionDims = dimNames.map(esc).join(", ");
+    const blockExpr =
+      `sum(${esc(presentCol)}) OVER (PARTITION BY ${partitionDims} ORDER BY ${esc(bucketAlias)}` +
+      ` ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ${esc(blockCol)}`;
+    const finalColumns = orderedOutputs.map(({ name, carry }) =>
+      carry
+        ? `max(if(${esc(presentCol)} = 1, ${esc(name)}, NULL)) OVER (PARTITION BY ${partitionDims}, ${esc(
+            blockCol
+          )}) AS ${esc(name)}`
+        : esc(name)
+    );
+    const finalOrderBy = userOrderBy.length > 0 ? ` ORDER BY ${userOrderBy.join(", ")}` : "";
+    const wrap = (inner: string): string =>
+      `SELECT ${finalColumns.join(", ")} FROM (SELECT *, ${blockExpr} FROM (${inner.trim()}))${finalOrderBy}`;
+
+    return {
+      kind: "wrap",
+      orderBy: [...dimNames.map(esc), `${leadTerm} ${withFill}`],
+      presentColumn: `1 AS ${esc(presentCol)}`,
+      wrap,
+    };
+  }
+
   /**
    * Extract column aliases from a SELECT expression.
    * Handles explicit aliases (AS name) and implicit names from aggregations/functions.
@@ -1014,11 +1222,12 @@ export class ClickHousePrinter {
         if ((firstArg as Field).expression_type === "field") {
           const field = firstArg as Field;
           const columnInfo = this.resolveFieldToColumn(field.chain);
-          // Only propagate customRenderType, not the full column schema
-          if (columnInfo.column?.customRenderType) {
+          // Propagate customRenderType and fillMode (gauge-ness), not the full column schema
+          if (columnInfo.column?.customRenderType || columnInfo.column?.fillMode) {
             sourceColumn = {
               type: inferredType,
               customRenderType: columnInfo.column.customRenderType,
+              fillMode: columnInfo.column.fillMode,
             };
           }
         }
diff --git a/internal-packages/tsql/src/query/printer_context.ts b/internal-packages/tsql/src/query/printer_context.ts
index d0fb41b5327..a964e2e04af 100644
--- a/internal-packages/tsql/src/query/printer_context.ts
+++ b/internal-packages/tsql/src/query/printer_context.ts
@@ -125,6 +125,9 @@ export class PrinterContext {
    */
   readonly timeRange?: TimeRange;
 
+  /** When true, time-bucketed queries emit rows for empty buckets (opt-in). 
*/ + readonly fillGaps?: boolean; + constructor( /** Schema registry containing allowed tables and columns */ public readonly schema: SchemaRegistry, @@ -138,13 +141,16 @@ export class PrinterContext { */ enforcedWhereClause: Record = {}, /** Time range for timeBucket() interval calculation */ - timeRange?: TimeRange + timeRange?: TimeRange, + /** Opt-in gap-fill for time-bucketed queries */ + fillGaps?: boolean ) { // Initialize with default settings this.settings = { ...DEFAULT_QUERY_SETTINGS, ...settings }; this.fieldMappings = fieldMappings; this.enforcedWhereClause = enforcedWhereClause; this.timeRange = timeRange; + this.fillGaps = fillGaps; } /** @@ -225,7 +231,8 @@ export class PrinterContext { this.settings, this.fieldMappings, this.enforcedWhereClause, - this.timeRange + this.timeRange, + this.fillGaps ); // Share the same values map so parameters are unified child.values = this.values; @@ -277,6 +284,8 @@ export interface PrinterContextOptions { * When provided, `timeBucket()` uses this to determine the appropriate bucket size. */ timeRange?: TimeRange; + /** When true, time-bucketed queries emit rows for empty buckets (opt-in). */ + fillGaps?: boolean; } /** @@ -288,6 +297,7 @@ export function createPrinterContext(options: PrinterContextOptions): PrinterCon options.settings, options.fieldMappings, options.enforcedWhereClause, - options.timeRange + options.timeRange, + options.fillGaps ); } diff --git a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts index 9a1e2d2ddfe..68007c8e62e 100644 --- a/internal-packages/tsql/src/query/schema.ts +++ b/internal-packages/tsql/src/query/schema.ts @@ -122,6 +122,12 @@ export interface ColumnSchema { * ``` */ customRenderType?: string; + /** + * Gap-fill behavior when the opt-in `fillGaps` feature emits rows for empty + * time buckets: `"carry"` = gauge (LOCF via INTERPOLATE), `"zero"` (default) + * = counter (missing buckets get 0). 
+ */ + fillMode?: "zero" | "carry"; /** * Example value for documentation purposes. * From 982be508eaded3ead2c86af09b0ea61062d0d2db Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:31:12 +0100 Subject: [PATCH 06/37] feat(webapp): Queues dashboard and per-org metrics UI flag --- .../components/primitives/UsageSparkline.tsx | 5 +- .../presenters/v3/BuiltInDashboards.server.ts | 177 +++- .../v3/MetricDashboardPresenter.server.ts | 3 + .../v3/QueueMetricsPresenter.server.ts | 148 +++ .../route.tsx | 8 + .../route.tsx | 984 ++++++++++++++++-- .../route.tsx | 495 +++++++++ apps/webapp/app/routes/resources.metric.tsx | 4 + apps/webapp/app/utils/pathBuilder.ts | 9 + .../app/v3/canAccessQueueMetricsUi.server.ts | 26 + apps/webapp/app/v3/featureFlags.ts | 4 + apps/webapp/app/v3/querySchemas.ts | 163 +++ apps/webapp/seed-queue-metrics.mts | 494 +++++++++ 13 files changed, 2404 insertions(+), 116 deletions(-) create mode 100644 apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts create mode 100644 apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx create mode 100644 apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts create mode 100644 apps/webapp/seed-queue-metrics.mts diff --git a/apps/webapp/app/components/primitives/UsageSparkline.tsx b/apps/webapp/app/components/primitives/UsageSparkline.tsx index 2ffc1936a1d..6ac7aea6a8e 100644 --- a/apps/webapp/app/components/primitives/UsageSparkline.tsx +++ b/apps/webapp/app/components/primitives/UsageSparkline.tsx @@ -27,6 +27,8 @@ export type UsageSparklineProps = { color?: string; /** Unit shown in the tooltip (e.g. calls, tokens). */ unitLabel?: UnitLabel; + /** Trailing scalar shown after the chart. Defaults to the sum of buckets (override for gauges, e.g. peak). */ + total?: number; /** Format the trailing total. Defaults to `toLocaleString`. */ formatTotal?: (total: number) => string; /** Class for the trailing total label. 
*/ @@ -44,6 +46,7 @@ export function UsageSparkline({ bucketIntervalMs, color = "#3B82F6", unitLabel = { singular: "call", plural: "calls" }, + total: totalOverride, formatTotal, totalClassName = "text-blue-400", }: UsageSparklineProps) { @@ -51,7 +54,7 @@ export function UsageSparkline({ return ; } - const total = data.reduce((a, b) => a + b, 0); + const total = totalOverride ?? data.reduce((a, b) => a + b, 0); const max = Math.max(...data); // Map each bucket to a dated point so the tooltip can show the window it diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts index 971fc9a3033..273e76054f9 100644 --- a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts +++ b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts @@ -550,7 +550,182 @@ const llmDashboard: BuiltInDashboard = { }, }; -const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard]; +const queuesDashboard: BuiltInDashboard = { + key: "queues", + title: "Queues", + filters: ["queues"], + layout: { + version: "1", + layout: [ + { i: "env-used", x: 0, y: 0, w: 3, h: 4 }, + { i: "env-limit", x: 3, y: 0, w: 3, h: 4 }, + { i: "env-avail", x: 6, y: 0, w: 3, h: 4 }, + { i: "env-sat", x: 9, y: 0, w: 3, h: 4 }, + { i: "sat-time", x: 0, y: 4, w: 6, h: 9 }, + { i: "used-limit", x: 6, y: 4, w: 6, h: 9 }, + { i: "t-pressure", x: 0, y: 13, w: 12, h: 2, minH: 2, maxH: 2 }, + { i: "pressure", x: 0, y: 15, w: 12, h: 11 }, + { i: "t-trends", x: 0, y: 26, w: 12, h: 2, minH: 2, maxH: 2 }, + { i: "running-q", x: 0, y: 28, w: 6, h: 9 }, + { i: "queued-q", x: 6, y: 28, w: 6, h: 9 }, + { i: "throttled-q", x: 0, y: 37, w: 6, h: 9 }, + { i: "throughput", x: 6, y: 37, w: 6, h: 9 }, + { i: "wait-pct", x: 0, y: 46, w: 12, h: 9 }, + ], + widgets: { + "env-used": { + title: "Concurrency in use", + query: `SELECT argMax(max_env_running, bucket_start) AS in_use\nFROM queue_metrics`, + display: { type: "bignumber", column: 
"in_use", aggregation: "max", abbreviate: false }, + }, + "env-limit": { + title: "Environment limit", + query: `SELECT argMax(max_env_limit, bucket_start) AS env_limit\nFROM queue_metrics`, + display: { type: "bignumber", column: "env_limit", aggregation: "max", abbreviate: false }, + }, + "env-avail": { + title: "Available slots", + query: `SELECT argMax(max_env_limit, bucket_start) - argMax(max_env_running, bucket_start) AS available\nFROM queue_metrics`, + display: { type: "bignumber", column: "available", aggregation: "max", abbreviate: false }, + }, + "env-sat": { + title: "Env saturation", + query: `SELECT round(argMax(max_env_running, bucket_start) * 100.0 / nullIf(argMax(max_env_limit, bucket_start), 0), 1) AS saturation\nFROM queue_metrics`, + display: { + type: "bignumber", + column: "saturation", + aggregation: "max", + abbreviate: false, + suffix: "%", + }, + }, + "sat-time": { + title: "Environment saturation over time", + query: `SELECT timeBucket() AS t,\n round(max(max_env_running) * 100.0 / nullIf(max(max_env_limit), 0), 1) AS saturation\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["saturation"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "used-limit": { + title: "Concurrency used vs limit", + query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + // Single-series gauge: carry the last known used/limit across idle buckets instead of dropping to 0. 
+ fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["used", "limit"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "t-pressure": { title: "Queue pressure", query: "", display: { type: "title" } }, + pressure: { + title: "Queue pressure", + query: `SELECT queue,\n argMax(max_running, bucket_start) AS running,\n argMax(max_queued, bucket_start) AS queued,\n argMax(max_limit, bucket_start) AS limit,\n running + queued AS demand,\n max(max_queued) AS peak_queued,\n sum(throttled_count) AS throttled,\n multiIf(running >= limit AND queued > 0, 'queue-limited', queued > 0, 'backlogged', 'healthy') AS status\nFROM queue_metrics\nGROUP BY queue\nORDER BY peak_queued DESC`, + display: { type: "table", prettyFormatting: true, sorting: [{ id: "peak_queued", desc: true }] }, + }, + "t-trends": { title: "Per-queue trends", query: "", display: { type: "title" } }, + "running-q": { + title: "Running by queue", + query: `SELECT timeBucket() AS t, queue, max(max_running) AS running\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped gauge: carry each queue's running across idle buckets (per-group LOCF). + fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["running"], + groupByColumn: "queue", + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "queued-q": { + title: "Queue depth (backlog) by queue", + query: `SELECT timeBucket() AS t, queue, max(max_queued) AS queued\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped gauge: carry each queue's backlog across idle buckets (per-group LOCF). 
+ fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["queued"], + groupByColumn: "queue", + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + "throttled-q": { + title: "Throttled buckets by queue", + query: `SELECT timeBucket() AS t, queue, sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t, queue\nORDER BY t`, + // Grouped counter: per-group zero-fill so idle buckets read 0, not a gap. + fillGaps: true, + display: { + type: "chart", + chartType: "bar", + xAxisColumn: "t", + yAxisColumns: ["throttled"], + groupByColumn: "queue", + stacked: true, + sortByColumn: null, + sortDirection: "asc", + aggregation: "sum", + }, + }, + throughput: { + title: "Enqueued vs started", + query: `SELECT timeBucket() AS t,\n deltaSumTimestampMerge(enqueue_delta) AS enqueued,\n deltaSumTimestampMerge(started_delta) AS started\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + // Single-series counters: zero-fill idle buckets so the line returns to 0 rather than interpolating across gaps. 
+ fillGaps: true, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["enqueued", "started"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "sum", + }, + }, + "wait-pct": { + title: "Scheduling delay p50/p95/p99 (ms)", + query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[3]) AS p99\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + display: { + type: "chart", + chartType: "line", + xAxisColumn: "t", + yAxisColumns: ["p50", "p95", "p99"], + groupByColumn: null, + stacked: false, + sortByColumn: null, + sortDirection: "asc", + aggregation: "max", + }, + }, + }, + }, +}; + +const builtInDashboards: BuiltInDashboard[] = [overviewDashboard, llmDashboard, queuesDashboard]; export function builtInDashboardList(): BuiltInDashboard[] { return builtInDashboards; diff --git a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts index df43864b53a..0b84e971b2f 100644 --- a/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/MetricDashboardPresenter.server.ts @@ -37,6 +37,9 @@ export const Widget = z.object({ title: z.string(), query: z.string().default(""), display: QueryWidgetConfig, + // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters). + // Top-level rather than in `display` because display config is client-only and never reaches the query POST. 
/**
 * Look-back windows supported by the queue metrics UI.
 * Keys are the labels accepted in the `period` search param; values are the window length in seconds.
 */
export const QUEUE_METRICS_WINDOWS = {
  "1h": 3600,
  "6h": 21600,
  "24h": 86400,
} as const;

export type QueueMetricsWindow = keyof typeof QUEUE_METRICS_WINDOWS;

/**
 * Type guard for values that name one of the supported windows.
 *
 * The check is deliberately an own-property test. The `in` operator also walks the prototype
 * chain, so inherited names such as "toString" or "constructor" would be accepted and then
 * index `QUEUE_METRICS_WINDOWS` to a function instead of a number of seconds. The value comes
 * from a URL search param, so it must be treated as untrusted.
 *
 * @param value - Any value, typically the raw `period` search param.
 * @returns `true` only for the exact keys of `QUEUE_METRICS_WINDOWS`.
 */
export function isQueueMetricsWindow(value: unknown): value is QueueMetricsWindow {
  return (
    typeof value === "string" &&
    Object.prototype.hasOwnProperty.call(QUEUE_METRICS_WINDOWS, value)
  );
}

/** Metrics shown for a single queue row in the queues list. */
export type QueueListMetric = {
  /** Median wait in milliseconds, or `null` when the stored value is not a finite number. */
  p50WaitMs: number | null;
  /** 95th percentile wait in milliseconds, or `null` when the stored value is not a finite number. */
  p95WaitMs: number | null;
  peakQueued: number;
  /** Equal-width buckets, oldest first, carry-forward filled across idle gaps. */
  depthSparkline: number[];
};

/** Result of a queue list metrics lookup: the bucket grid plus one entry per queue that has data. */
export type QueueListMetrics = {
  window: QueueMetricsWindow;
  /** Start of the first sparkline bucket, in epoch milliseconds. */
  bucketStartMs: number;
  /** Width of each sparkline bucket, in milliseconds. */
  bucketIntervalMs: number;
  /** Keyed by queue name as stored in ClickHouse. */
  byQueue: Map<string, QueueListMetric>;
};

/** Target number of sparkline buckets per window; the bucket width is never narrower than 60 seconds. */
const SPARKLINE_POINTS = 48;

/**
 * Formats a date as `YYYY-MM-DD HH:MM:SS` in UTC.
 * Built from `toISOString()`, so sub-second precision is dropped and an invalid date throws.
 */
function formatClickhouseDateTime(date: Date): string {
  return date.toISOString().slice(0, 19).replace("T", " ");
}

/** Returns the value unchanged when it is a finite number; maps NaN and ±Infinity to `null`. */
function finiteOrNull(value: number): number | null {
  return Number.isFinite(value) ? value : null;
}
+ */ + public async getQueueListMetrics({ + environment, + queueNames, + window, + }: { + environment: AuthenticatedEnvironment; + queueNames: string[]; + window: QueueMetricsWindow; + }): Promise { + const windowSeconds = QUEUE_METRICS_WINDOWS[window]; + const bucketSeconds = Math.max(60, Math.round(windowSeconds / SPARKLINE_POINTS)); + const numBuckets = Math.ceil(windowSeconds / bucketSeconds); + const nowSeconds = Math.floor(Date.now() / 1000); + const gridStartSeconds = Math.floor((nowSeconds - windowSeconds) / bucketSeconds) * bucketSeconds; + const bucketStartMs = gridStartSeconds * 1000; + const bucketIntervalMs = bucketSeconds * 1000; + + const empty: QueueListMetrics = { + window, + bucketStartMs, + bucketIntervalMs, + byQueue: new Map(), + }; + + if (queueNames.length === 0) { + return empty; + } + + try { + const clickhouse = await clickhouseFactory.getClickhouseForOrganization( + environment.organizationId, + "query" + ); + + const ids = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + queueNames, + startTime: formatClickhouseDateTime(new Date(bucketStartMs)), + }; + + const [summaryResult, sparklineResult] = await Promise.all([ + clickhouse.queueMetrics.listSummary(ids), + clickhouse.queueMetrics.depthSparklines({ ...ids, bucketSeconds }), + ]); + + const [summaryError, summaryRows] = summaryResult; + const [sparklineError, sparklineRows] = sparklineResult; + + if (summaryError || sparklineError) { + logger.warn("QueueMetricsPresenter: clickhouse query failed", { + summaryError: summaryError?.message, + sparklineError: sparklineError?.message, + }); + return empty; + } + + // Bucket -> depth per queue, mapped onto the aligned grid and forward-filled. + const depthsByQueue = new Map>(); + for (const row of sparklineRows ?? 
[]) { + const bucketMs = Date.parse(row.bucket.replace(" ", "T") + "Z"); + if (Number.isNaN(bucketMs)) continue; + const index = Math.round((bucketMs - bucketStartMs) / bucketIntervalMs); + if (index < 0 || index >= numBuckets) continue; + let byIndex = depthsByQueue.get(row.queue_name); + if (!byIndex) { + byIndex = new Map(); + depthsByQueue.set(row.queue_name, byIndex); + } + byIndex.set(index, row.depth); + } + + const byQueue = new Map(); + for (const row of summaryRows ?? []) { + const byIndex = depthsByQueue.get(row.queue_name); + const sparkline: number[] = new Array(numBuckets); + let last = 0; + for (let i = 0; i < numBuckets; i++) { + const value = byIndex?.get(i); + if (value !== undefined) last = value; + sparkline[i] = last; + } + byQueue.set(row.queue_name, { + p50WaitMs: finiteOrNull(row.p50_wait_ms), + p95WaitMs: finiteOrNull(row.p95_wait_ms), + peakQueued: row.peak_queued, + depthSparkline: sparkline, + }); + } + + return { window, bucketStartMs, bucketIntervalMs, byQueue }; + } catch (error) { + logger.warn("QueueMetricsPresenter: failed to load queue metrics", { + error: error instanceof Error ? 
error.message : String(error), + }); + return empty; + } + } + +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx index 5fa237cee6e..b5abf8b0909 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx @@ -38,6 +38,7 @@ import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstan import { requireUser } from "~/services/session.server"; import { cn } from "~/utils/cn"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; import { QueryScopeSchema } from "~/v3/querySchemas"; import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; import { MetricWidget } from "../resources.metric"; @@ -50,6 +51,12 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const user = await requireUser(request); const { projectParam, organizationSlug, envParam, dashboardKey } = ParamSchema.parse(params); + // The built-in "queues" dashboard is part of the metrics UI (unlinked, but reachable by + // URL), so gate it per-org like the rest of the Queue Metrics view. + if (dashboardKey === "queues" && !(await canAccessQueueMetricsUi({ userId: user.id, organizationSlug }))) { + throw new Response(undefined, { status: 404, statusText: "Not found" }); + } + const project = await findProjectBySlug(organizationSlug, projectParam, user.id); if (!project) { throw new Response(undefined, { @@ -376,6 +383,7 @@ export function MetricDashboard({ promptSlugs={prompts.length > 0 ? prompts : undefined} operations={operations.length > 0 ? 
operations : undefined} providers={providers.length > 0 ? providers : undefined} + fillGaps={widget.fillGaps} config={widget.display} organizationId={organization.id} projectId={project.id} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 877b1235a97..8e5161f2278 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -7,11 +7,11 @@ import { RectangleStackIcon, } from "@heroicons/react/20/solid"; import { DialogClose } from "@radix-ui/react-dialog"; -import { Form, useNavigation, type MetaFunction } from "@remix-run/react"; +import { Form, Link, useNavigation, useSearchParams, type MetaFunction } from "@remix-run/react"; import { type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/server-runtime"; import type { QueueItem } from "@trigger.dev/core/v3/schemas"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { useEffect, useState } from "react"; +import { type ReactNode, useCallback, useEffect, useRef, useState } from "react"; import { typedjson, useTypedLoaderData } from "remix-typedjson"; import { z } from "zod"; import { ConcurrencyIcon } from "~/assets/icons/ConcurrencyIcon"; @@ -21,7 +21,6 @@ import { AdminDebugTooltip } from "~/components/admin/debugTooltip"; import { QueuesHasNoTasks } from "~/components/BlankStatePanels"; import { environmentFullTitle } from "~/components/environments/EnvironmentLabel"; import { PageBody, PageContainer } from "~/components/layout/AppLayout"; -import { BigNumber } from "~/components/metrics/BigNumber"; import { Badge } from "~/components/primitives/Badge"; import { Button, LinkButton, type ButtonVariant } from 
"~/components/primitives/Buttons"; import { Callout } from "~/components/primitives/Callout"; @@ -55,6 +54,8 @@ import { import { QueueName } from "~/components/runs/v3/QueueName"; import { env } from "~/env.server"; import { useAutoRevalidate } from "~/hooks/useAutoRevalidate"; +import { useInterval } from "~/hooks/useInterval"; +import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; @@ -64,6 +65,13 @@ import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { getUserById } from "~/models/user.server"; import { EnvironmentQueuePresenter } from "~/presenters/v3/EnvironmentQueuePresenter.server"; import { QueueListPresenter } from "~/presenters/v3/QueueListPresenter.server"; +import { + QueueMetricsPresenter, + isQueueMetricsWindow, + type QueueMetricsWindow, +} from "~/presenters/v3/QueueMetricsPresenter.server"; +import { UsageSparkline } from "~/components/primitives/UsageSparkline"; +import { Area, AreaChart, ResponsiveContainer } from "recharts"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; import { ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT } from "~/utils/environmentPauseSource"; @@ -72,16 +80,20 @@ import { docsPath, EnvironmentParamSchema, v3BillingPath, + v3QueuePath, v3RunsPath, } from "~/utils/pathBuilder"; import { concurrencySystem } from "~/v3/services/concurrencySystemInstance.server"; import { PauseEnvironmentService } from "~/v3/services/pauseEnvironment.server"; import { PauseQueueService } from "~/v3/services/pauseQueue.server"; import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; +import { BigNumber } from "~/components/metrics/BigNumber"; +import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; const SearchParamsSchema = z.object({ query: 
z.string().optional(), page: z.coerce.number().min(1).default(1), + period: z.string().optional(), }); export const meta: MetaFunction = () => { @@ -97,7 +109,10 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params); const url = new URL(request.url); - const { page, query } = SearchParamsSchema.parse(Object.fromEntries(url.searchParams)); + const { page, query, period: rawPeriod } = SearchParamsSchema.parse( + Object.fromEntries(url.searchParams) + ); + const period: QueueMetricsWindow = isQueueMetricsWindow(rawPeriod) ? rawPeriod : "24h"; const project = await findProjectBySlug(organizationSlug, projectParam, userId); if (!project) { @@ -115,6 +130,10 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); } + // Per-org gate for the metrics UI. When off, this org gets the classic Queues page and + // no metrics query fires. + const queueMetricsUiEnabled = await canAccessQueueMetricsUi({ userId, organizationSlug }); + try { const queueListPresenter = new QueueListPresenter(); const queues = await queueListPresenter.call({ @@ -127,10 +146,40 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const autoReloadPollIntervalMs = env.QUEUES_AUTORELOAD_POLL_INTERVAL_MS; + // Per-queue list metrics (Delay p95 + backlog sparkline columns) are SSR'd with the table. + // The environment header tiles are fetched client-side per card (see QueueEnvMetricTile) so a + // slow ClickHouse query never blocks the queues list from rendering. + let metrics: { + window: QueueMetricsWindow; + bucketStartMs: number; + bucketIntervalMs: number; + byQueue: Record; + } | null = null; + + if (queueMetricsUiEnabled && queues.success) { + const presenter = new QueueMetricsPresenter(); + const queueNames = queues.queues.map((q) => (q.type === "task" ? `task/${q.name}` : q.name)); + const queueMetrics = + queueNames.length > 0 + ? 
await presenter.getQueueListMetrics({ environment, queueNames, window: period }) + : null; + if (queueMetrics) { + metrics = { + window: queueMetrics.window, + bucketStartMs: queueMetrics.bucketStartMs, + bucketIntervalMs: queueMetrics.bucketIntervalMs, + byQueue: Object.fromEntries(queueMetrics.byQueue), + }; + } + } + return typedjson({ ...queues, environment: await environmentQueuePresenter.call(environment), autoReloadPollIntervalMs, + metrics, + period, + queueMetricsUiEnabled, }); } catch (error) { console.error(error); @@ -299,6 +348,13 @@ export const action = async ({ request, params }: ActionFunctionArgs) => { }; export default function Page() { + // Per-org flag decides which whole page renders. Off => the classic Queues page, + // byte-for-byte the pre-metrics UI. Each branch is its own component (own hooks). + const { queueMetricsUiEnabled } = useTypedLoaderData(); + return queueMetricsUiEnabled ? : ; +} + +function QueuesWithMetricsView() { const { environment, queues, @@ -308,8 +364,12 @@ export default function Page() { totalQueues, hasFilters, autoReloadPollIntervalMs, + metrics, + period, } = useTypedLoaderData(); + const metricsByQueue = metrics?.byQueue ?? {}; + const organization = useOrganization(); const project = useProject(); const env = useEnvironment(); @@ -317,22 +377,37 @@ export default function Page() { useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); - const limitStatus = - environment.running === environment.concurrencyLimit * environment.burstFactor - ? "limit" - : environment.running > environment.concurrencyLimit - ? "burst" - : "within"; - - const limitClassName = - limitStatus === "burst" ? "text-warning" : limitStatus === "limit" ? "text-error" : undefined; - return ( + {plan ? ( + plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? ( + + Increase limit + + ) : ( + + Increase limit + + ) + ) : null} + {environment.runsEnabled && + env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? 
( + + ) : null}
-
- paused : undefined} - animate - accessory={ -
- {environment.runsEnabled && - env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( - - ) : null} - -
- } - valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"} - compactThreshold={1000000} - /> - - Including {environment.running - environment.concurrencyLimit} burst runs{" "} - - - ) : limitStatus === "limit" ? ( - "At concurrency limit" - ) : undefined - } - accessory={ - - } - compactThreshold={1000000} - /> - 1 ? ( - - Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "} - - - ) : undefined - } - accessory={ - plan ? ( - plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? ( - - Increase limit - - ) : ( - - Increase limit - - ) - ) : null - } - /> +
+ {QUEUE_HEADER_TILES.map((tile) => ( + + ))}
{success ? (
- +
+ + +
Limited by + Health + + Delay p95 + + Backlog Pause/resume @@ -518,11 +511,19 @@ export default function Page() { const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${ queue.name }`; + const queueMetric = metricsByQueue[queueFilterableName]; return ( - + + + {queue.concurrency?.overriddenAt ? ( + + + + + {queueMetric && queueMetric.p95WaitMs !== null ? ( + = 60_000 + ? "text-warning" + : "text-text-bright" + )} + > + {formatWaitMs(queueMetric.p95WaitMs)} + + ) : ( + + )} + + + v.toLocaleString()} + /> + - +
{hasFilters @@ -1059,6 +1095,726 @@ export function QueueFilters() { return ; } +const QUEUE_METRICS_PERIODS: { value: QueueMetricsWindow; label: string }[] = [ + { value: "1h", label: "1h" }, + { value: "6h", label: "6h" }, + { value: "24h", label: "24h" }, +]; + +function QueuePeriodSelect({ period }: { period: QueueMetricsWindow }) { + const [searchParams] = useSearchParams(); + const hrefFor = (value: QueueMetricsWindow) => { + const next = new URLSearchParams(searchParams); + next.set("period", value); + next.delete("page"); + return `?${next.toString()}`; + }; + return ( +
+ Metrics + {QUEUE_METRICS_PERIODS.map(({ value, label }) => ( + + {label} + + ))} +
/** One row returned by a tile query, keyed by column alias. */
type MetricTileRow = Record<string, number | string | null>;

/** Response body of the tile fetch: either the result rows or an error message. */
type MetricTileResponse =
  | { success: true; data: { rows: MetricTileRow[] } }
  | { success: false; error: string };

/** Static description of one header tile: what to query and how to turn rows into a display. */
type QueueHeaderTile = {
  id: string;
  label: string;
  color: string;
  query: string;
  /** Maps the query rows to the chart series and the headline value shown beside the label. */
  derive: (rows: MetricTileRow[]) => {
    sparkline: number[];
    value: ReactNode;
    valueClassName?: string;
  };
};

/** Coerces a raw cell to a number; anything that does not parse to a finite number becomes 0. */
function tileNumber(value: number | string | null): number {
  const parsed = typeof value === "number" ? value : Number(value);
  if (!Number.isFinite(parsed)) {
    return 0;
  }
  return parsed;
}

/** Largest value in the series, floored at 0 (also the result for an empty series). */
function tilePeak(series: number[]): number {
  let peak = 0;
  for (const point of series) {
    peak = Math.max(peak, point);
  }
  return peak;
}

// Each header tile runs its own TRQL query from the browser (resources.metric) with fillGaps on,
// the same way the metrics dashboard widgets do: gauges (saturation inputs, backlog) carry their
// last value forward, while counters and p95 are zero-filled.
const QUEUE_HEADER_TILES: QueueHeaderTile[] = [
  {
    id: "saturation",
    label: "Env saturation",
    color: "#6366F1",
    query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`,
    derive: (rows) => {
      // Percentage of the environment limit in use per bucket; 0 when no limit is known.
      const sparkline = rows.map((row) => {
        const envLimit = tileNumber(row.env_limit);
        if (envLimit > 0) {
          return Math.round((tileNumber(row.used) / envLimit) * 100);
        }
        return 0;
      });
      return { sparkline, value: `${tilePeak(sparkline)}% peak` };
    },
  },
  {
    id: "backlog",
    label: "Backlog",
    color: "#A78BFA",
    query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM queue_metrics\nGROUP BY t\nORDER BY t`,
    derive: (rows) => {
      const sparkline = rows.map((row) => tileNumber(row.queued));
      return { sparkline, value: `${tilePeak(sparkline).toLocaleString()} peak` };
    },
  },
  {
    id: "p95",
    label: "Scheduling delay p95",
    color: "#F59E0B",
    query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95\nFROM queue_metrics\nGROUP BY t\nORDER BY t`,
    derive: (rows) => {
      const sparkline = rows.map((row) => tileNumber(row.p95));
      const worstMs = tilePeak(sparkline);
      // Headline is the worst bucket; a minute or more is highlighted as a warning.
      const value = worstMs > 0 ? formatWaitMs(worstMs) : "–";
      const valueClassName = worstMs >= 60_000 ? "text-warning" : undefined;
      return { sparkline, value, valueClassName };
    },
  },
  {
    id: "throttled",
    label: "Throttled",
    color: "#F59E0B",
    query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t\nORDER BY t`,
    derive: (rows) => {
      const sparkline = rows.map((row) => tileNumber(row.throttled));
      // Counter: the headline is the sum across the window, not the peak.
      let total = 0;
      for (const count of sparkline) {
        total += count;
      }
      return {
        sparkline,
        value: total.toLocaleString(),
        valueClassName: total > 0 ? "text-warning" : undefined,
      };
    },
  },
];
+ ) : failed ? ( +
Unable to load metrics
+ ) : ( + + )} + + ); +} + +function HeaderTile({ + label, + value, + valueClassName, + className, + children, +}: { + label: ReactNode; + value?: ReactNode; + valueClassName?: string; + className?: string; + children: ReactNode; +}) { + return ( +
+
+ {label} + {value !== undefined ? ( + + {value} + + ) : null} +
+ {children} +
+ ); +} + +function MiniChart({ data, color }: { data: number[]; color: string }) { + if (!data || data.length === 0 || data.every((v) => v === 0)) { + return
No activity
; + } + const chartData = data.map((v, i) => ({ i, v })); + return ( +
+ + + + + +
/**
 * Formats a wait duration for display, using the largest unit that keeps the number under
 * its rollover point: whole milliseconds, then seconds, minutes and hours with one decimal.
 *
 * The unit is chosen from the rounded value rather than the raw one, so inputs just below a
 * boundary roll over to the next unit ("1.0s", "1.0m", "1.0h") instead of rendering as
 * "1000ms", "60.0s" or "60.0m".
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration, or "–" when `ms` is NaN or ±Infinity.
 */
function formatWaitMs(ms: number): string {
  // Same placeholder the p95 header tile shows when there is nothing to report.
  if (!Number.isFinite(ms)) return "–";

  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;

  // Compare the already-rounded text so the displayed number never reaches the next unit.
  const seconds = (ms / 1000).toFixed(1);
  if (Number(seconds) < 60) return `${seconds}s`;

  const minutes = (ms / 60_000).toFixed(1);
  if (Number(minutes) < 60) return `${minutes}m`;

  return `${(ms / 3_600_000).toFixed(1)}h`;
}
+
+ paused : undefined} + animate + accessory={ +
+ {environment.runsEnabled && + env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( + + ) : null} + +
+ } + valueClassName={env.paused ? "text-warning tabular-nums" : "tabular-nums"} + compactThreshold={1000000} + /> + + Including {environment.running - environment.concurrencyLimit} burst runs{" "} + + + ) : limitStatus === "limit" ? ( + "At concurrency limit" + ) : undefined + } + accessory={ + + } + compactThreshold={1000000} + /> + 1 ? ( + + Burst limit {environment.burstFactor * environment.concurrencyLimit}{" "} + + + ) : undefined + } + accessory={ + plan ? ( + plan?.v3Subscription?.plan?.limits.concurrentRuns.canExceed ? ( + + Increase limit + + ) : ( + + Increase limit + + ) + ) : null + } + /> +
+ + {success ? ( +
+
+ + +
+ + + + Name + Queued + Running + Limit + +
+ Environment + + This queue is limited by your environment's concurrency limit of{" "} + {environment.concurrencyLimit}. + +
+
+ User + + This queue is limited by a concurrency limit set in your code. + +
+
+ Override + + This queue's concurrency limit has been manually overridden from the + dashboard or API. + +
+ + } + > + Limited by +
+ + Pause/resume + +
+
+ + {queues.length > 0 ? ( + queues.map((queue) => { + const limit = queue.concurrencyLimit ?? environment.concurrencyLimit; + const isAtConcurrencyLimit = queue.running >= limit; + const isAtQueueLimit = + environment.queueSizeLimit !== null && + queue.queued >= environment.queueSizeLimit; + const queueFilterableName = `${queue.type === "task" ? "task/" : ""}${ + queue.name + }`; + return ( + + + + + {queue.concurrency?.overriddenAt ? ( + + Concurrency limit overridden + + } + content="This queue's concurrency limit has been manually overridden from the dashboard or API." + className="max-w-xs" + disableHoverableContent + /> + ) : null} + {queue.paused ? ( + + Paused + + ) : null} + {isAtQueueLimit ? ( + + At queue limit + + ) : null} + {isAtConcurrencyLimit ? ( + + At concurrency limit + + ) : null} + + + + {queue.queued} + + 0 && "text-text-bright", + isAtConcurrencyLimit && "text-warning" + )} + > + {queue.running} + + + {limit} + + + {queue.concurrency?.overriddenAt ? ( + Override + ) : queue.concurrencyLimit ? ( + "User" + ) : ( + "Environment" + )} + + + } + hiddenButtons={ + !queue.paused && + } + popoverContent={ + <> + {queue.paused ? ( + + ) : ( + + )} + + + + + + + } + /> + + ); + }) + ) : ( + + +
+ + {hasFilters + ? "No queues found matching your filters" + : "No queues found"} + +
+
+
+ )} +
+
+
+ ) : ( +
+ {totalQueues === 0 ? ( +
+ +
+ ) : code === "engine-version" ? ( + + ) : ( + Something went wrong + )} +
+ )} +
+
+
+ ); +} + function BurstFactorTooltip({ environment, }: { diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx new file mode 100644 index 00000000000..59982d9bc66 --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx @@ -0,0 +1,495 @@ +import { useSearchParams, type MetaFunction } from "@remix-run/react"; +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; +import { + CartesianGrid, + Line, + LineChart, + ReferenceLine, + ResponsiveContainer, + Tooltip, + type TooltipProps, + XAxis, + YAxis, +} from "recharts"; +import { typedjson, useTypedLoaderData } from "remix-typedjson"; +import { z } from "zod"; +import { PageBody, PageContainer } from "~/components/layout/AppLayout"; +import { LinkButton } from "~/components/primitives/Buttons"; +import { Header2 } from "~/components/primitives/Headers"; +import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider"; +import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { useInterval } from "~/hooks/useInterval"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { + isQueueMetricsWindow, + type QueueMetricsWindow, +} from "~/presenters/v3/QueueMetricsPresenter.server"; +import { QueueRetrievePresenter } from "~/presenters/v3/QueueRetrievePresenter.server"; +import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; +import { requireUserId } from "~/services/session.server"; +import { cn } from "~/utils/cn"; +import { 
EnvironmentParamSchema } from "~/utils/pathBuilder"; + +export const meta: MetaFunction = () => [{ title: `Queue metrics | Trigger.dev` }]; + +const ParamsSchema = EnvironmentParamSchema.extend({ queueParam: z.string() }); + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const userId = await requireUserId(request); + const { organizationSlug, projectParam, envParam, queueParam } = ParamsSchema.parse(params); + + // This whole page is part of the metrics UI; gate it per-org (the list already hides + // the only link to it, this is defense in depth). + if (!(await canAccessQueueMetricsUi({ userId, organizationSlug }))) { + throw new Response(undefined, { status: 404, statusText: "Not found" }); + } + + const url = new URL(request.url); + const rawPeriod = url.searchParams.get("period") ?? undefined; + const period: QueueMetricsWindow = isQueueMetricsWindow(rawPeriod) ? rawPeriod : "24h"; + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) throw new Response(undefined, { status: 404, statusText: "Project not found" }); + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) throw new Response(undefined, { status: 404, statusText: "Environment not found" }); + + const retrieve = await new QueueRetrievePresenter().call({ environment, queueInput: queueParam }); + if (!retrieve.success) { + throw new Response(undefined, { status: 404, statusText: "Queue not found" }); + } + + const queue = retrieve.queue; + const fullName = queue.type === "task" ? `task/${queue.name}` : queue.name; + + // Charts + CH-derived stats are fetched client-side per card (see QueueDetailChartCard / + // useQueueMetric) so the drill-down renders instantly. The loader only returns the live + // "now" counts + identifiers the client fetches need. 
+ return typedjson({ + queue, + fullName, + period, + backPath: url.pathname.replace(/\/[^/]+$/, ""), + ids: { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + }, + }); +}; + +const COLORS = { + running: "#6366F1", + limit: "#4D525B", + queued: "#A78BFA", + p50: "#22D3EE", + p95: "#F59E0B", + p99: "#EF4444", + throttled: "#F59E0B", +}; + +type Ids = { organizationId: string; projectId: string; environmentId: string }; + +export default function Page() { + const { queue, fullName, period, backPath, ids } = useTypedLoaderData(); + + return ( + + + + + +
+
+ + +
+ + + + + +
+
+
+ ); +} + +type MetricRow = Record; +type MetricResponse = + | { success: true; data: { rows: MetricRow[] } } + | { success: false; error: string }; + +/** + * Client-fetch a queue-scoped TRQL query from the metric resource route, mirroring the + * dashboard widgets: own loading state, 60s + on-focus refresh, abort on change/unmount. + */ +function useQueueMetric( + query: string, + opts: { ids: Ids; period: string; queueName: string; fillGaps?: boolean } +) { + const [rows, setRows] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [failed, setFailed] = useState(false); + const abortRef = useRef(null); + const { ids, period, queueName, fillGaps } = opts; + + const load = useCallback(() => { + abortRef.current?.abort(); + const controller = new AbortController(); + abortRef.current = controller; + setIsLoading(true); + fetch("/resources/metric", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query, + scope: "environment", + period, + from: null, + to: null, + fillGaps: !!fillGaps, + organizationId: ids.organizationId, + projectId: ids.projectId, + environmentId: ids.environmentId, + queues: [queueName], + }), + signal: controller.signal, + }) + .then((res) => res.json() as Promise) + .then((data) => { + if (controller.signal.aborted) return; + if (data.success) { + setRows(data.data.rows); + setFailed(false); + } else { + setFailed(true); + } + setIsLoading(false); + }) + .catch((error) => { + if (error instanceof DOMException && error.name === "AbortError") return; + if (!controller.signal.aborted) { + setFailed(true); + setIsLoading(false); + } + }); + }, [query, period, queueName, fillGaps, ids.organizationId, ids.projectId, ids.environmentId]); + + useEffect(() => { + load(); + return () => abortRef.current?.abort(); + }, [load]); + + useInterval({ interval: 60_000, onLoad: false, onFocus: true, callback: load }); + + return { rows: rows ?? 
[], showLoading: isLoading && !rows, failed }; +} + +function toNumber(value: number | string | null | undefined): number { + const n = typeof value === "number" ? value : Number(value); + return Number.isFinite(n) ? n : 0; +} + +function clickhouseTimeToMs(value: unknown): number { + const s = String(value).replace(" ", "T"); + return Date.parse(s.endsWith("Z") ? s : `${s}Z`); +} + +type SeriesConfig = { key: string; label: string; color: string; dashed?: boolean }; + +function QueueDetailChartCard({ + title, + query, + series, + ids, + period, + queueName, + valueFormat, + fillGaps, +}: { + title: string; + query: string; + series: SeriesConfig[]; + ids: Ids; + period: string; + queueName: string; + valueFormat?: (value: number) => string; + fillGaps?: boolean; +}) { + const { rows, showLoading, failed } = useQueueMetric(query, { ids, period, queueName, fillGaps }); + + const points = useMemo(() => { + return rows + .map((r) => { + const point: Record = { ts: clickhouseTimeToMs(r.t) }; + for (const s of series) point[s.key] = toNumber(r[s.key]); + return point; + }) + .filter((p) => Number.isFinite(p.ts)); + }, [rows, series]); + + const bucketIntervalMs = points.length >= 2 ? points[1].ts - points[0].ts : 0; + const formatX = useMemo(() => { + const sameDay = bucketIntervalMs > 0 && bucketIntervalMs < 6 * 3600_000; + return (value: number) => + new Date(value).toLocaleString("en-US", { + month: sameDay ? undefined : "short", + day: sameDay ? undefined : "numeric", + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + }, [bucketIntervalMs]); + + const hasData = points.some((p) => series.some((s) => p[s.key] > 0)); + + return ( +
+ {title} + + {showLoading ? ( +
+ ) : failed ? ( +
+ Unable to load metrics +
+ ) : hasData ? ( +
+ + + + + valueFormat(v) : undefined} + domain={[0, (dataMax: number) => Math.max(1, Math.ceil(dataMax * 1.15))]} + /> + } + allowEscapeViewBox={{ x: true, y: true }} + wrapperStyle={{ zIndex: 1000 }} + animationDuration={0} + /> + {series.map((s) => ( + + ))} + + + +
+ ) : ( +
+ No activity in this window +
+ )} +
+ ); +} + +function QueueChartTooltip({ + active, + payload, + label, + series, + formatX, + valueFormat, +}: TooltipProps & { + series: SeriesConfig[]; + formatX: (value: number) => string; + valueFormat?: (value: number) => string; +}) { + if (!active || !payload || payload.length === 0) return null; + return ( +
+
{formatX(Number(label))}
+ {series.map((s) => { + const entry = payload.find((p) => p.dataKey === s.key); + const value = entry?.value; + return ( +
+ + {s.label} + + {value === null || value === undefined + ? "–" + : valueFormat + ? valueFormat(Number(value)) + : Number(value).toLocaleString()} + +
+ ); + })} +
+ ); +} + +function QueueStats({ + queue, + ids, + period, + queueName, +}: { + queue: { running: number; queued: number }; + ids: Ids; + period: string; + queueName: string; +}) { + // One scalar query feeds the CH-derived stats; the "now" counts come from the loader (live). + const { rows, showLoading } = useQueueMetric( + `SELECT max(max_limit) AS lim, max(max_queued) AS peak_queued, deltaSumTimestampMerge(started_delta) AS started,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS worst_p95\nFROM queue_metrics`, + { ids, period, queueName } + ); + const row = rows[0]; + const worstP95 = row ? toNumber(row.worst_p95) : 0; + + return ( +
+ + + + + + 0 ? formatWaitMs(worstP95) : "–"} + loading={showLoading} + className={worstP95 >= 60_000 ? "text-warning" : undefined} + /> +
+ ); +} + +const PERIODS: QueueMetricsWindow[] = ["1h", "6h", "24h"]; + +function QueuePeriodSelect({ period }: { period: QueueMetricsWindow }) { + const [searchParams] = useSearchParams(); + const hrefFor = (value: QueueMetricsWindow) => { + const next = new URLSearchParams(searchParams); + next.set("period", value); + return `?${next.toString()}`; + }; + return ( +
+ Period + {PERIODS.map((value) => ( + + {value} + + ))} +
+ ); +} + +function Stat({ + label, + value, + className, + loading, +}: { + label: string; + value: string; + className?: string; + loading?: boolean; +}) { + return ( +
+
{label}
+ {loading ? ( +
+ ) : ( +
{value}
+ )} +
+ ); +} + +function formatWaitMs(ms: number): string { + if (ms < 1000) return `${Math.round(ms)}ms`; + if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`; + if (ms < 3_600_000) return `${(ms / 60_000).toFixed(1)}m`; + return `${(ms / 3_600_000).toFixed(1)}h`; +} diff --git a/apps/webapp/app/routes/resources.metric.tsx b/apps/webapp/app/routes/resources.metric.tsx index d456ba1ce1b..5bf0ed693ad 100644 --- a/apps/webapp/app/routes/resources.metric.tsx +++ b/apps/webapp/app/routes/resources.metric.tsx @@ -50,6 +50,8 @@ const MetricWidgetQuery = z.object({ operations: z.array(z.string()).optional(), providers: z.array(z.string()).optional(), tags: z.array(z.string()).optional(), + // Opt into server-side gap fill (carry-forward for gauges, zero-fill for counters). + fillGaps: z.boolean().optional(), }); export const action = async ({ request }: ActionFunctionArgs) => { @@ -85,6 +87,7 @@ export const action = async ({ request }: ActionFunctionArgs) => { operations, providers, tags: _tags, + fillGaps, } = submission.data; // Check they should be able to access it @@ -122,6 +125,7 @@ export const action = async ({ request }: ActionFunctionArgs) => { promptVersions, operations, providers, + fillGaps, // Set higher concurrency if many widgets are on screen at once customOrgConcurrencyLimit: env.METRIC_WIDGET_DEFAULT_ORG_CONCURRENCY_LIMIT, }); diff --git a/apps/webapp/app/utils/pathBuilder.ts b/apps/webapp/app/utils/pathBuilder.ts index 187bc50b549..edd65f8bde4 100644 --- a/apps/webapp/app/utils/pathBuilder.ts +++ b/apps/webapp/app/utils/pathBuilder.ts @@ -522,6 +522,15 @@ export function v3QueuesPath( return `${v3EnvironmentPath(organization, project, environment)}/queues`; } +export function v3QueuePath( + organization: OrgForPath, + project: ProjectForPath, + environment: EnvironmentForPath, + queue: { friendlyId: string } +) { + return `${v3QueuesPath(organization, project, environment)}/${queue.friendlyId}`; +} + export function v3WaitpointTokensPath( organization: 
OrgForPath, project: ProjectForPath, diff --git a/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts new file mode 100644 index 00000000000..0e3c142b272 --- /dev/null +++ b/apps/webapp/app/v3/canAccessQueueMetricsUi.server.ts @@ -0,0 +1,26 @@ +import { prisma } from "~/db.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; + +// Per-org gate for the Queue Metrics dashboard UI. Org override wins over the global +// FeatureFlag table value, which wins over the off-by-default. Ingestion/emission is a +// separate global flag; this only decides whether an org sees the metrics view. +export async function canAccessQueueMetricsUi(options: { + userId: string; + organizationSlug: string; +}): Promise { + const org = await prisma.organization.findFirst({ + where: { + slug: options.organizationSlug, + members: { some: { userId: options.userId } }, + }, + select: { featureFlags: true }, + }); + + const flag = makeFlag(); + return flag({ + key: FEATURE_FLAG.queueMetricsUiEnabled, + defaultValue: false, + overrides: (org?.featureFlags as Record) ?? {}, + }); +} diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 4617179eda1..c03519a49d2 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -19,6 +19,7 @@ export const FEATURE_FLAG = { computeMigrationRequireTemplate: "computeMigrationRequireTemplate", devBranchesEnabled: "devBranchesEnabled", runOpsMintKsuid: "runOpsMintKsuid", + queueMetricsUiEnabled: "queueMetricsUiEnabled", } as const; export const FeatureFlagCatalog = { @@ -54,6 +55,9 @@ export const FeatureFlagCatalog = { // Per-org KSUID mint cutover. Defaults to "cuid"; only honored when // RUN_OPS_MINT_KSUID_ENABLED is on AND isSplitEnabled() is true. 
[FEATURE_FLAG.runOpsMintKsuid]: z.enum(["cuid", "ksuid"]), + // Per-org access to the Queue Metrics dashboard UI (view only; emission is global and + // separate). Off unless enabled for the org. + [FEATURE_FLAG.queueMetricsUiEnabled]: z.coerce.boolean(), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 4784ad75629..5764edc1b9b 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -613,6 +613,168 @@ export const metricsSchema: TableSchema = { ] satisfies BucketThreshold[], }; +/** + * Schema definition for the queue_metrics table (trigger_dev.queue_metrics_v1). + * Pre-aggregated into 10-second buckets. Counter columns re-aggregate with sum(), + * gauges with max(), and wait_quantiles with quantilesMerge() — never FINAL. + */ +export const queueMetricsSchema: TableSchema = { + name: "queue_metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + description: "Per-queue depth, concurrency, throttling, and scheduling-delay metrics", + timeConstraint: "bucket_start", + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + columns: { + environment: { + name: "environment", + clickhouseName: "environment_id", + ...column("String", { description: "The environment slug", example: "prod" }), + fieldMapping: "environment", + customRenderType: "environment", + }, + project: { + name: "project", + clickhouseName: "project_id", + ...column("String", { + description: "The project reference, they always start with `proj_`.", + example: "proj_howcnaxbfxdmwmxazktx", + }), + fieldMapping: "project", + customRenderType: "project", + }, + queue: { + name: "queue", + clickhouseName: "queue_name", + ...column("LowCardinality(String)", { + description: "The queue name", + example: "my-queue", + coreColumn: true, + }), + }, + bucket_start: { + name: "bucket_start", + 
...column("DateTime", { + description: "The start of the 10-second aggregation bucket", + example: "2024-01-15 09:30:00", + coreColumn: true, + }), + }, + // Cumulative-counter delta states. Read with deltaSumTimestampMerge() (loss-tolerant, + // reset-safe), never sum(); opaque like wait_quantiles. + enqueue_delta: { + name: "enqueue_delta", + ...column("String", { + description: + "Runs enqueued (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta).", + }), + groupable: false, + sortable: false, + filterable: false, + }, + started_delta: { + name: "started_delta", + ...column("String", { + description: + "Runs dequeued/started (throughput). Read with deltaSumTimestampMerge(started_delta).", + coreColumn: true, + }), + groupable: false, + sortable: false, + filterable: false, + }, + ack_delta: { + name: "ack_delta", + ...column("String", { + description: "Runs acked (completed). Read with deltaSumTimestampMerge(ack_delta).", + }), + groupable: false, + sortable: false, + filterable: false, + }, + nack_delta: { + name: "nack_delta", + ...column("String", { + description: "Runs nacked. Read with deltaSumTimestampMerge(nack_delta).", + }), + groupable: false, + sortable: false, + filterable: false, + }, + dlq_delta: { + name: "dlq_delta", + ...column("String", { + description: "Runs dead-lettered. Read with deltaSumTimestampMerge(dlq_delta).", + }), + groupable: false, + sortable: false, + filterable: false, + }, + throttled_count: { + name: "throttled_count", + ...column("UInt64", { + description: "Gauge emissions where running>=limit and queued>0. Aggregate with sum().", + coreColumn: true, + }), + }, + max_queued: { + name: "max_queued", + ...column("UInt32", { description: "Peak queue depth in the bucket. Aggregate with max().", coreColumn: true, fillMode: "carry" }), + }, + max_running: { + name: "max_running", + ...column("UInt32", { description: "Peak running (concurrency) in the bucket. 
Aggregate with max().", coreColumn: true, fillMode: "carry" }), + }, + max_limit: { + name: "max_limit", + ...column("UInt32", { description: "The queue concurrency limit. Aggregate with max().", coreColumn: true, fillMode: "carry" }), + }, + max_env_queued: { + name: "max_env_queued", + ...column("UInt32", { description: "Peak environment-wide queued in the bucket. Aggregate with max().", fillMode: "carry" }), + }, + max_env_running: { + name: "max_env_running", + ...column("UInt32", { description: "Peak environment-wide running in the bucket. Aggregate with max().", fillMode: "carry" }), + }, + max_env_limit: { + name: "max_env_limit", + ...column("UInt32", { description: "The environment concurrency limit. Aggregate with max().", fillMode: "carry" }), + }, + wait_ms_sum: { + name: "wait_ms_sum", + ...column("UInt64", { description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count." }), + }, + wait_ms_count: { + name: "wait_ms_count", + ...column("UInt64", { description: "Count of scheduling-delay samples. Aggregate with sum()." }), + }, + wait_quantiles: { + name: "wait_quantiles", + ...column("String", { + description: + "Scheduling-delay (dequeue minus eligible-at) quantile state. 
Read with quantilesMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].", + }), + groupable: false, + sortable: false, + filterable: false, + }, + }, + timeBucketThresholds: [ + { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } }, + { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, + { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, + { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, + { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } }, + { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } }, + { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, + { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, + ] satisfies BucketThreshold[], +}; + /** * All available schemas for the query editor */ @@ -980,6 +1142,7 @@ export const querySchemas: TableSchema[] = [ metricsSchema, llmMetricsSchema, llmModelsSchema, + queueMetricsSchema, ]; /** diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts new file mode 100644 index 00000000000..a5b2f86b444 --- /dev/null +++ b/apps/webapp/seed-queue-metrics.mts @@ -0,0 +1,494 @@ +import { prisma } from "./app/db.server"; +import { createOrganization } from "./app/models/organization.server"; +import { createProject } from "./app/models/project.server"; +import { ClickHouse } from "@internal/clickhouse"; +import type { QueueMetricsRawV1Input } from "@internal/clickhouse"; +import { generateFriendlyId } from "./app/v3/friendlyIdentifiers"; + +// Queue metrics simulator: writes realistic raw rows into a synthetic tenant's +// queue_metrics_raw_v1 and lets the MV build queue_metrics_v1 (the same path the real +// consumer uses), so the dashboard can be built without the run engine. See TRI-10407. 
const ORG_TITLE = "Queue Metrics Dev";
const PROJECT_NAME = "queue-metrics-demo";

/** Source of uniform pseudo-random numbers in [0, 1). */
type Rng = () => number;

/** Behaviour of one simulated queue. `bucket` is the 0-based index of the bucket being simulated. */
type QueueProfile = {
  name: string;
  limit: (bucket: number) => number;
  arrivals: (bucket: number, rng: Rng) => number; // expected new runs enqueued this bucket
  waitBaseMs: number;
  sparse?: boolean; // emit no rows when the queue is fully idle (tests carry-forward gaps)
};
type Scenario = { description: string; envLimit: (bucket: number) => number; queues: QueueProfile[] };

/** Factory for a scenario; receives the size of the simulated window so profiles can be phase-based. */
type ScenarioBuilder = (totalBuckets: number, bucketSec: number) => Scenario;

// ---------------------------------------------------------------------------
// CLI args
// ---------------------------------------------------------------------------

/**
 * Parse `--key value` / `--key` style arguments into a flat string map.
 *
 * A flag followed by a token that does not itself start with `--` takes that token as its
 * value; otherwise the flag is recorded as the string "true". Tokens that are neither a flag
 * nor consumed as a flag's value are ignored.
 */
function parseArgs(argv: string[]): Record<string, string> {
  const flags: Record<string, string> = {};
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (!token.startsWith("--")) continue;

    const key = token.slice(2);
    const next = argv[i + 1];
    if (next && !next.startsWith("--")) {
      flags[key] = next;
      i++; // the value token has been consumed
    } else {
      flags[key] = "true";
    }
  }
  return flags;
}

/** Seconds represented by each supported duration suffix. */
const SECONDS_PER_UNIT = { s: 1, m: 60, h: 3600, d: 86400 } as const;
type DurationUnit = keyof typeof SECONDS_PER_UNIT;

/**
 * Convert a duration such as "90", "30s", "15m", "2h" or "1d" into seconds.
 * A missing suffix means seconds.
 *
 * @throws Error when the input is not `<digits>` optionally followed by s, m, h or d.
 */
function parseDuration(s: string): number {
  const match = /^(\d+)\s*(s|m|h|d)?$/.exec(s);
  if (!match) throw new Error(`bad duration: ${s}`);
  // The regex only lets s|m|h|d through, so the narrowing assertion cannot lie.
  const unit = (match[2] ?? "s") as DurationUnit;
  return Number(match[1]) * SECONDS_PER_UNIT[unit];
}

// ---------------------------------------------------------------------------
// Deterministic RNG + distributions
// ---------------------------------------------------------------------------

/** Seeded generator: the same seed always yields the same sequence of values in [0, 1). */
function mulberry32(seed: number): Rng {
  let a = seed >>> 0;
  return () => {
    a |= 0;
    a = (a + 0x6d2b79f5) | 0;
    let t = Math.imul(a ^ (a >>> 15), 1 | a);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/** One standard-normal sample (Box-Muller); redraws until both uniforms are non-zero. */
function standardNormal(rng: Rng): number {
  let u = 0;
  let v = 0;
  while (u === 0) u = rng();
  while (v === 0) v = rng();
  return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
}

/** Log-normal sample with the given median (clamped to at least 1) and log-space sigma. */
function lognormal(medianMs: number, sigma: number, rng: Rng): number {
  return Math.exp(Math.log(Math.max(medianMs, 1)) + sigma * standardNormal(rng));
}

/**
 * Poisson-distributed count with mean `lambda`; returns 0 for `lambda <= 0`.
 * For `lambda > 30` a rounded normal approximation (clamped at 0) is used instead of the
 * multiplication loop.
 */
function poisson(lambda: number, rng: Rng): number {
  if (lambda <= 0) return 0;
  if (lambda > 30) return Math.max(0, Math.round(lambda + standardNormal(rng) * Math.sqrt(lambda)));
  const L = Math.exp(-lambda);
  let k = 0;
  let p = 1;
  do {
    k++;
    p *= rng();
  } while (p > L);
  return k - 1;
}

/** Format a Date as "YYYY-MM-DD HH:MM:SS", taken from its UTC ISO representation. */
function formatChDateTime(date: Date): string {
  return date.toISOString().slice(0, 19).replace("T", " ");
}

// ---------------------------------------------------------------------------
// Scenarios
// ---------------------------------------------------------------------------

const steady = (): QueueProfile[] => [
  { name: "emails", limit: () => 20, arrivals: (_b, r) => poisson(12, r), waitBaseMs: 40 },
  { name: "webhooks", limit: () => 15, arrivals: (_b, r) => poisson(9, r), waitBaseMs: 40 },
  { name: "reports", limit: () => 10, arrivals: (_b, r) => poisson(5, r), waitBaseMs: 60 },
];

// periodic bursts every ~30 buckets
const bursty = (name: string, limit: number, base: number): QueueProfile => ({
  name,
  limit: () => limit,
  arrivals: (b, r) => poisson(b % 30 < 4 ? base * 5 : base, r),
  waitBaseMs: 50,
});

const scenarios: Record<string, ScenarioBuilder> = {
  steady: () => ({ description: "all queues below capacity, no throttling", envLimit: () => 60, queues: steady() }),

  burst: () => ({
    description: "periodic arrival bursts -> backlog + wait spikes + throttling",
    envLimit: () => 60,
    queues: [bursty("ingest", 20, 6), bursty("transform", 20, 7)],
  }),

  // Tela case: sum of per-queue limits far exceeds the env limit, so queues compete.
  "over-allocated-env": () => ({
    description: "Sum(queue limits)=120 >> env limit=40; env saturates, queues env-limited",
    envLimit: () => 40,
    queues: Array.from({ length: 6 }, (_v, i) => ({
      name: `worker-${i + 1}`,
      limit: () => 20,
      arrivals: (_b: number, r: Rng) => poisson(14, r),
      waitBaseMs: 50,
    })),
  }),

  "single-queue-starves-others": () => ({
    description: "one greedy queue consumes most of a small env limit, starving the rest",
    envLimit: () => 30,
    queues: [
      { name: "greedy", limit: () => 40, arrivals: (_b, r) => poisson(45, r), waitBaseMs: 60 },
      { name: "polite-1", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
      { name: "polite-2", limit: () => 10, arrivals: (_b, r) => poisson(6, r), waitBaseMs: 50 },
    ],
  }),

  "throttled-backlog": () => ({
    description: "arrival rate persistently above the queue limit -> permanent backlog + throttling",
    envLimit: () => 50,
    queues: [{ name: "overloaded", limit: () => 10, arrivals: (_b, r) => poisson(16, r), waitBaseMs: 80 }],
  }),

  "idle-sparse": () => ({
    description: "sparse arrivals with many empty buckets (carry-forward gaps)",
    envLimit: () => 50,
    queues: Array.from({ length: 4 }, (_v, i) => ({
      name: `sparse-${i + 1}`,
      limit: () => 5,
      arrivals: (_b: number, r: Rng) => (r() < 0.12 ? poisson(3, r) : 0),
      waitBaseMs: 30,
      sparse: true,
    })),
  }),

  "spike-then-drain": (totalBuckets) => ({
    description: "heavy arrivals for the first third, then zero; backlog builds then drains",
    envLimit: () => 60,
    queues: [
      {
        name: "batch-job",
        limit: () => 15,
        arrivals: (b, r) => (b < totalBuckets / 3 ? poisson(30, r) : 0),
        waitBaseMs: 70,
      },
    ],
  }),

  // Default: one env with a variety of queue behaviours + occasional env saturation.
  mixed: (totalBuckets) => ({
    description: "variety of queue profiles in one env, with occasional env saturation",
    envLimit: (b) => (b % 40 < 12 ? 45 : 70), // dips low periodically to flip env saturation
    queues: [
      { name: "emails", limit: () => 20, arrivals: (_b, r) => poisson(12, r), waitBaseMs: 40 },
      bursty("webhooks", 20, 6),
      { name: "reports", limit: () => 10, arrivals: (_b, r) => poisson(8, r), waitBaseMs: 80 },
      {
        name: "cleanup",
        limit: () => 5,
        arrivals: (_b, r) => (r() < 0.12 ? poisson(3, r) : 0),
        waitBaseMs: 30,
        sparse: true,
      },
      {
        name: "nightly-batch",
        limit: () => 15,
        arrivals: (b, r) => (b < totalBuckets / 5 ? poisson(18, r) : 0),
        waitBaseMs: 70,
      },
    ],
  }),
};

// ---------------------------------------------------------------------------
// Simulation
// ---------------------------------------------------------------------------

/** Tenant identifiers spread onto every raw row. */
type Ids = { organization_id: string; project_id: string; environment_id: string };
const WAIT_SIGMA = 0.6; // log-space sigma of the simulated scheduling delay
const NACK_RATE = 0.02; // fraction of started runs that end in a nack
const DLQ_RATE = 0.004; // fraction of started runs that end dead-lettered

// Advance one bucket of the simulation for every queue, returning the raw rows to insert.
// `backlog` is mutated in place so state carries across buckets (and into live mode).
/** Per-queue figures computed in the demand pass of `simulateBucket`. */
type QueueDemand = {
  arrivals: number; // runs enqueued in this bucket (capped at 500)
  prior: number; // backlog carried from earlier buckets, before this bucket's arrivals
  want: number; // concurrency the queue would use if only its own limit applied
};

/**
 * Simulate a single bucket for every queue in `scenario`.
 *
 * @param scenario  Queue profiles plus the environment limit function.
 * @param bucket    0-based bucket index handed to the profile functions.
 * @param bucketSec Bucket length in seconds; scales the backlog-driven part of the wait time.
 * @param eventTime Timestamp string stamped onto every emitted row.
 * @param ids       Tenant identifiers spread onto every emitted row.
 * @param backlog   Unserved runs per queue, indexed like `scenario.queues`; updated in place.
 * @param rng       Random source. Draws happen in a fixed order (arrivals for each queue first,
 *                  then per started run: wait time, then outcome), so a seed reproduces a run.
 * @returns For each non-skipped queue: one gauge row, one enqueue row per arrival, and a
 *          started row followed by an ack/nack/dlq row per run started.
 */
function simulateBucket(
  scenario: Scenario,
  bucket: number,
  bucketSec: number,
  eventTime: string,
  ids: Ids,
  backlog: number[],
  rng: Rng
): QueueMetricsRawV1Input[] {
  const envLimit = scenario.envLimit(bucket);
  const n = scenario.queues.length;

  // Pass 1: arrivals join the backlog; each queue asks for as much concurrency as its limit allows.
  const limit = new Array<number>(n);
  const demand = new Array<QueueDemand>(n);
  for (let q = 0; q < n; q++) {
    limit[q] = scenario.queues[q].limit(bucket);
    const arrivals = Math.min(500, scenario.queues[q].arrivals(bucket, rng));
    const prior = backlog[q];
    backlog[q] += arrivals; // recorded as enqueue rows in pass 3
    demand[q] = { arrivals, prior, want: Math.min(limit[q], backlog[q]) };
  }

  // Env cap: if the queues collectively want more concurrency than the env allows, scale down.
  const sumWant = demand.reduce((sum, d) => sum + d.want, 0);
  const scale = sumWant > envLimit && sumWant > 0 ? envLimit / sumWant : 1;

  // Pass 2: per-queue running/queued plus the environment-wide totals reported on every gauge.
  const running = new Array<number>(n);
  const queued = new Array<number>(n);
  let envRunning = 0;
  let envQueued = 0;
  for (let q = 0; q < n; q++) {
    running[q] = Math.floor(demand[q].want * scale);
    queued[q] = backlog[q] - running[q];
    envRunning += running[q];
    envQueued += queued[q];
  }

  // Pass 3: emit rows and carry the unserved remainder forward.
  const rows: QueueMetricsRawV1Input[] = [];
  for (let q = 0; q < n; q++) {
    const profile = scenario.queues[q];
    const started = running[q];
    const { arrivals, prior } = demand[q]; // `prior` = depth a starting run actually queued behind
    backlog[q] = queued[q];

    if (profile.sparse && arrivals === 0 && started === 0 && prior === 0) {
      continue; // fully idle: leave a gap so carry-forward is exercised
    }

    const gauge: QueueMetricsRawV1Input = {
      ...ids,
      queue_name: profile.name,
      event_time: eventTime,
      op: "gauge",
      running: running[q],
      queued: queued[q],
      queue_limit: limit[q],
      env_running: envRunning,
      env_queued: envQueued,
      env_limit: envLimit,
      throttled: running[q] >= limit[q] && queued[q] > 0 ? 1 : 0,
    };
    rows.push(gauge);

    for (let a = 0; a < arrivals; a++) {
      rows.push({ ...ids, queue_name: profile.name, event_time: eventTime, op: "enqueue" });
    }

    // Median wait grows with how many buckets' worth of work was already queued ahead.
    const medianWait = profile.waitBaseMs + (prior / Math.max(limit[q], 1)) * bucketSec * 1000;
    for (let s = 0; s < started; s++) {
      rows.push({
        ...ids,
        queue_name: profile.name,
        event_time: eventTime,
        op: "started",
        wait_ms: Math.round(lognormal(medianWait, WAIT_SIGMA, rng)),
      });
      // Every started run resolves within the same bucket as exactly one of dlq / nack / ack.
      const roll = rng();
      const op = roll < DLQ_RATE ? "dlq" : roll < DLQ_RATE + NACK_RATE ? "nack" : "ack";
      rows.push({ ...ids, queue_name: profile.name, event_time: eventTime, op });
    }
  }
  return rows;
}

// ---------------------------------------------------------------------------
// ClickHouse
// ---------------------------------------------------------------------------

/**
 * Build the ClickHouse client from CLICKHOUSE_URL (falling back to EVENTS_CLICKHOUSE_URL).
 *
 * Exits the process with code 1 when neither variable is set, or when the URL contains
 * ".clickhouse.cloud" or "prod" (case-insensitive), so the simulator is never pointed at a
 * hosted instance by accident. The `secure` query parameter is removed before connecting.
 */
function clickhouse(): ClickHouse {
  const clickhouseUrl = process.env.CLICKHOUSE_URL ?? process.env.EVENTS_CLICKHOUSE_URL;
  if (!clickhouseUrl) {
    console.error("CLICKHOUSE_URL not set");
    process.exit(1);
  }
  if (/\.clickhouse\.cloud|prod/i.test(clickhouseUrl)) {
    console.error(`Refusing to run against a non-local ClickHouse: ${clickhouseUrl}`);
    process.exit(1);
  }
  const url = new URL(clickhouseUrl);
  url.searchParams.delete("secure");
  return new ClickHouse({ url: url.toString(), name: "queue-metrics-simulator" });
}

/**
 * Insert `rows` in slices of 25,000, one insert at a time.
 *
 * Each slice gets the deduplication token `${nonce}:${offset}`. The first failed insert logs
 * the error message and exits the process with code 1.
 */
async function insertBatched(ch: ClickHouse, rows: QueueMetricsRawV1Input[], nonce: string) {
  const BATCH = 25_000;
  for (let offset = 0; offset < rows.length; offset += BATCH) {
    const slice = rows.slice(offset, offset + BATCH);
    const [error] = await ch.queueMetrics.insertRaw(slice, {
      params: { clickhouse_settings: { insert_deduplication_token: `${nonce}:${offset}` } },
    });
    if (error) {
      console.error("insert failed:", error.message);
      process.exit(1);
    }
  }
}

/**
 * Delete every row for `environmentId` from the raw and aggregated queue-metrics tables.
 *
 * NOTE(review): `environmentId` is interpolated straight into the statement. The only caller
 * visible here passes an id loaded from Postgres; do not pass user-supplied text.
 */
async function resetEnv(ch: ClickHouse, environmentId: string) {
  // The wrapper does not expose a raw command API, so reach through to the underlying client.
  type RawCommandClient = { command: (args: { query: string }) => Promise<unknown> };
  const raw = (ch.writer as unknown as { client: RawCommandClient }).client;
  for (const table of ["queue_metrics_raw_v1", "queue_metrics_v1"]) {
    await raw.command({
      query: `DELETE FROM trigger_dev.${table} WHERE environment_id = '${environmentId}'`,
    });
  }
  console.log(`Reset queue metrics for environment ${environmentId}`);
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

// Make the synthetic project a V2 engine project with a current dev worker + a Postgres
// TaskQueue per simulated queue, so the /queues list renders the V2 table (it pages from
// Postgres and gates on engine version; ClickHouse only holds the metrics).
+async function ensureTaskQueues(scenario: Scenario, projectId: string, runtimeEnvironmentId: string) { + await prisma.project.update({ where: { id: projectId }, data: { engine: "V2" } }); + + await prisma.backgroundWorker.upsert({ + where: { + projectId_runtimeEnvironmentId_version: { + projectId, + runtimeEnvironmentId, + version: "queue-metrics-sim", + }, + }, + update: {}, + create: { + friendlyId: generateFriendlyId("worker"), + engine: "V2", + contentHash: "queue-metrics-sim", + sdkVersion: "4.0.0", + cliVersion: "4.0.0", + projectId, + runtimeEnvironmentId, + version: "queue-metrics-sim", + metadata: {}, + }, + }); + + for (const profile of scenario.queues) { + const concurrencyLimit = profile.limit(0); + await prisma.taskQueue.upsert({ + where: { runtimeEnvironmentId_name: { runtimeEnvironmentId, name: profile.name } }, + create: { + friendlyId: generateFriendlyId("queue"), + version: "V2", + name: profile.name, + orderableName: profile.name, + concurrencyLimit, + runtimeEnvironmentId, + projectId, + type: "NAMED", + }, + update: { concurrencyLimit }, + }); + } + console.log(`Ensured ${scenario.queues.length} task queues in Postgres.`); +} + +async function main() { + const flags = parseArgs(process.argv.slice(2)); + const scenarioName = flags.scenario ?? "mixed"; + const build = scenarios[scenarioName]; + if (!build) { + console.error(`Unknown scenario "${scenarioName}". Options: ${Object.keys(scenarios).join(", ")}`); + process.exit(1); + } + const bucketSec = Number(flags.bucket ?? 10); + const windowSec = parseDuration(flags.window ?? "2h"); + const totalBuckets = Math.floor(windowSec / bucketSec); + const seed = Number(flags.seed ?? 1); + const live = flags.live === "true"; + + const user = await prisma.user.findUnique({ where: { email: "local@trigger.dev" } }); + if (!user) { + console.error("User local@trigger.dev not found. 
Run `pnpm run db:seed` first."); + process.exit(1); + } + + let org = await prisma.organization.findFirst({ + where: { title: ORG_TITLE, members: { some: { userId: user.id } } }, + }); + if (!org) org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" }); + + let project = await prisma.project.findFirst({ where: { name: PROJECT_NAME, organizationId: org.id } }); + if (!project) { + project = await createProject({ organizationSlug: org.slug, name: PROJECT_NAME, userId: user.id, version: "v3" }); + } + + const runtimeEnv = await prisma.runtimeEnvironment.findFirst({ + where: { projectId: project.id, type: "DEVELOPMENT" }, + }); + if (!runtimeEnv) { + console.error("No DEVELOPMENT environment found for project."); + process.exit(1); + } + + const ids: Ids = { organization_id: org.id, project_id: project.id, environment_id: runtimeEnv.id }; + const ch = clickhouse(); + const nonce = `qmsim-${Date.now()}-${seed}`; + + if (flags.reset === "true" || flags["reset-only"] === "true") { + await resetEnv(ch, runtimeEnv.id); + if (flags["reset-only"] === "true") { + await ch.close(); + process.exit(0); + } + } + + const scenario = build(totalBuckets, bucketSec); + await ensureTaskQueues(scenario, project.id, runtimeEnv.id); + const rng = mulberry32(seed); + const backlog = new Array(scenario.queues.length).fill(0); + + console.log(`Scenario "${scenarioName}": ${scenario.description}`); + console.log(`Backfilling ${totalBuckets} x ${bucketSec}s buckets (${flags.window ?? "2h"}) for ${scenario.queues.length} queues...`); + + // Backfill: buckets from (now - window) up to now, aligned to the bucket grid. 
+ const nowBucket = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; + const startBucket = nowBucket - totalBuckets * bucketSec; + const rows: QueueMetricsRawV1Input[] = []; + for (let b = 0; b < totalBuckets; b++) { + const eventTime = formatChDateTime(new Date((startBucket + b * bucketSec) * 1000)); + rows.push(...simulateBucket(scenario, b, bucketSec, eventTime, ids, backlog, rng)); + } + await insertBatched(ch, rows, nonce); + console.log(`Inserted ${rows.length} raw rows.`); + + // Merge the AggregatingMergeTree partials so argMax "current value" widgets read cleanly. + // The real pipeline relies on background merges; the simulator forces it for a tidy demo. + const raw = (ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } }) + .client; + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_v1 FINAL` }); + + const origin = process.env.APP_ORIGIN ?? "http://localhost:3030"; + console.log(`\nQueues dashboard: ${origin}/orgs/${org.slug}/projects/${project.slug}/env/dev/dashboards/queues`); + + if (live) { + console.log(`\nLive mode: appending one bucket every ${bucketSec}s (Ctrl-C to stop)...`); + let b = totalBuckets; + // eslint-disable-next-line no-constant-condition + while (true) { + await new Promise((r) => setTimeout(r, bucketSec * 1000)); + const eventTime = formatChDateTime(new Date(Math.floor(Date.now() / 1000 / bucketSec) * bucketSec * 1000)); + const liveRows = simulateBucket(scenario, b, bucketSec, eventTime, ids, backlog, rng); + await insertBatched(ch, liveRows, `${nonce}:live:${b}`); + console.log(`bucket ${b}: ${liveRows.length} rows @ ${eventTime}`); + b++; + } + } + + await ch.close(); + process.exit(0); +} + +main().catch((e) => { + console.error(e); + process.exit(1); +}); From 0143080b287567c95bc65e76ec3550d6e1ad9458 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 2 Jul 2026 23:33:36 +0100 Subject: [PATCH 07/37] chore(webapp): add server-changes note for queue metrics --- 
.server-changes/queue-metrics-dashboard.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/queue-metrics-dashboard.md diff --git a/.server-changes/queue-metrics-dashboard.md b/.server-changes/queue-metrics-dashboard.md new file mode 100644 index 00000000000..37baffc7aaa --- /dev/null +++ b/.server-changes/queue-metrics-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Queue metrics and health on the Queues page: per-queue depth, throughput, concurrency, throttling, and scheduling-delay charts, plus a per-queue detail view. Off by default; enabled per organization. From d193dc3142ac7f3ed015cce4dd880aeb9d672a47 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 09:27:27 +0100 Subject: [PATCH 08/37] chore: apply oxfmt formatting --- .../presenters/v3/BuiltInDashboards.server.ts | 6 +- .../v3/QueueMetricsPresenter.server.ts | 4 +- .../route.tsx | 5 +- .../route.tsx | 29 +- .../route.tsx | 24 +- .../webapp/app/routes/admin.queue-metrics.tsx | 6 +- apps/webapp/app/v3/querySchemas.ts | 41 ++- apps/webapp/app/v3/queueMetrics.server.ts | 6 +- apps/webapp/seed-queue-metrics.mts | 71 +++- apps/webapp/test/queueMetricsMapping.test.ts | 12 +- .../metrics-pipeline/src/cachedValue.ts | 5 +- .../metrics-pipeline/src/consumer.test.ts | 324 +++++++++++------- .../metrics-pipeline/src/consumer.ts | 8 +- .../metrics-pipeline/src/idempotency.ts | 4 +- .../run-engine/src/run-queue/metrics.test.ts | 165 +++++---- 15 files changed, 448 insertions(+), 262 deletions(-) diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts index 273e76054f9..48bc26f6438 100644 --- a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts +++ b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts @@ -635,7 +635,11 @@ const queuesDashboard: BuiltInDashboard = { pressure: { title: "Queue pressure", query: `SELECT queue,\n argMax(max_running, bucket_start) AS 
running,\n argMax(max_queued, bucket_start) AS queued,\n argMax(max_limit, bucket_start) AS limit,\n running + queued AS demand,\n max(max_queued) AS peak_queued,\n sum(throttled_count) AS throttled,\n multiIf(running >= limit AND queued > 0, 'queue-limited', queued > 0, 'backlogged', 'healthy') AS status\nFROM queue_metrics\nGROUP BY queue\nORDER BY peak_queued DESC`, - display: { type: "table", prettyFormatting: true, sorting: [{ id: "peak_queued", desc: true }] }, + display: { + type: "table", + prettyFormatting: true, + sorting: [{ id: "peak_queued", desc: true }], + }, }, "t-trends": { title: "Per-queue trends", query: "", display: { type: "title" } }, "running-q": { diff --git a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts index 42c65452abf..3545c8598f4 100644 --- a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts @@ -58,7 +58,8 @@ export class QueueMetricsPresenter { const bucketSeconds = Math.max(60, Math.round(windowSeconds / SPARKLINE_POINTS)); const numBuckets = Math.ceil(windowSeconds / bucketSeconds); const nowSeconds = Math.floor(Date.now() / 1000); - const gridStartSeconds = Math.floor((nowSeconds - windowSeconds) / bucketSeconds) * bucketSeconds; + const gridStartSeconds = + Math.floor((nowSeconds - windowSeconds) / bucketSeconds) * bucketSeconds; const bucketStartMs = gridStartSeconds * 1000; const bucketIntervalMs = bucketSeconds * 1000; @@ -144,5 +145,4 @@ export class QueueMetricsPresenter { return empty; } } - } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx index b5abf8b0909..d529fdf0d22 100644 --- 
a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.dashboards.$dashboardKey/route.tsx @@ -53,7 +53,10 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { // The built-in "queues" dashboard is part of the metrics UI (unlinked, but reachable by // URL), so gate it per-org like the rest of the Queue Metrics view. - if (dashboardKey === "queues" && !(await canAccessQueueMetricsUi({ userId: user.id, organizationSlug }))) { + if ( + dashboardKey === "queues" && + !(await canAccessQueueMetricsUi({ userId: user.id, organizationSlug })) + ) { throw new Response(undefined, { status: 404, statusText: "Not found" }); } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 8e5161f2278..517d29e9393 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -109,9 +109,11 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params); const url = new URL(request.url); - const { page, query, period: rawPeriod } = SearchParamsSchema.parse( - Object.fromEntries(url.searchParams) - ); + const { + page, + query, + period: rawPeriod, + } = SearchParamsSchema.parse(Object.fromEntries(url.searchParams)); const period: QueueMetricsWindow = isQueueMetricsWindow(rawPeriod) ? 
rawPeriod : "24h"; const project = await findProjectBySlug(organizationSlug, projectParam, userId); @@ -153,7 +155,10 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { window: QueueMetricsWindow; bucketStartMs: number; bucketIntervalMs: number; - byQueue: Record; + byQueue: Record< + string, + import("~/presenters/v3/QueueMetricsPresenter.server").QueueListMetric + >; } | null = null; if (queueMetricsUiEnabled && queues.success) { @@ -404,8 +409,7 @@ function QueuesWithMetricsView() { ) ) : null} - {environment.runsEnabled && - env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( + {environment.runsEnabled && env.pauseSource !== ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT ? ( ) : null} ) : failed ? ( -
Unable to load metrics
+
+ Unable to load metrics +
) : ( )} @@ -1407,7 +1419,6 @@ function formatWaitMs(ms: number): string { return `${(ms / 3_600_000).toFixed(1)}h`; } - // Classic Queues page, restored verbatim from before the Queue Metrics feature. Rendered // when queueMetricsUiEnabled is off so a gated org sees exactly the pre-metrics UI. function ClassicQueuesView() { diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx index 59982d9bc66..d24129806f6 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx @@ -55,7 +55,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { if (!project) throw new Response(undefined, { status: 404, statusText: "Project not found" }); const environment = await findEnvironmentBySlug(project.id, envParam, userId); - if (!environment) throw new Response(undefined, { status: 404, statusText: "Environment not found" }); + if (!environment) + throw new Response(undefined, { status: 404, statusText: "Environment not found" }); const retrieve = await new QueueRetrievePresenter().call({ environment, queueInput: queueParam }); if (!retrieve.success) { @@ -330,7 +331,9 @@ function QueueDetailChartCard({ /> } + content={ + + } allowEscapeViewBox={{ x: true, y: true }} wrapperStyle={{ zIndex: 1000 }} animationDuration={0} @@ -382,7 +385,10 @@ function QueueChartTooltip({ const value = entry?.value; return (
- + {s.label} {value === null || value === undefined @@ -421,13 +427,21 @@ function QueueStats({
- + - + 0 ? formatWaitMs(worstP95) : "–"} diff --git a/apps/webapp/app/routes/admin.queue-metrics.tsx b/apps/webapp/app/routes/admin.queue-metrics.tsx index 883a3bdb2d3..df4bbe8c001 100644 --- a/apps/webapp/app/routes/admin.queue-metrics.tsx +++ b/apps/webapp/app/routes/admin.queue-metrics.tsx @@ -141,9 +141,9 @@ export default function AdminQueueMetricsRoute() {
Depth = entries buffered in the shard stream; Lag = entries not yet delivered to the - consumer group (rising = consumer falling behind; "unknown" = entries were trimmed - past the group, i.e. data was lost); Pending = unacked entries. Gauges and counters - share one stream family on the metrics Redis. + consumer group (rising = consumer falling behind; "unknown" = entries were trimmed past + the group, i.e. data was lost); Pending = unacked entries. Gauges and counters share one + stream family on the metrics Redis. {lagUnknownCount > 0 && ( diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 5764edc1b9b..2938d254cf8 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -722,35 +722,60 @@ export const queueMetricsSchema: TableSchema = { }, max_queued: { name: "max_queued", - ...column("UInt32", { description: "Peak queue depth in the bucket. Aggregate with max().", coreColumn: true, fillMode: "carry" }), + ...column("UInt32", { + description: "Peak queue depth in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), }, max_running: { name: "max_running", - ...column("UInt32", { description: "Peak running (concurrency) in the bucket. Aggregate with max().", coreColumn: true, fillMode: "carry" }), + ...column("UInt32", { + description: "Peak running (concurrency) in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), }, max_limit: { name: "max_limit", - ...column("UInt32", { description: "The queue concurrency limit. Aggregate with max().", coreColumn: true, fillMode: "carry" }), + ...column("UInt32", { + description: "The queue concurrency limit. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), }, max_env_queued: { name: "max_env_queued", - ...column("UInt32", { description: "Peak environment-wide queued in the bucket. 
Aggregate with max().", fillMode: "carry" }), + ...column("UInt32", { + description: "Peak environment-wide queued in the bucket. Aggregate with max().", + fillMode: "carry", + }), }, max_env_running: { name: "max_env_running", - ...column("UInt32", { description: "Peak environment-wide running in the bucket. Aggregate with max().", fillMode: "carry" }), + ...column("UInt32", { + description: "Peak environment-wide running in the bucket. Aggregate with max().", + fillMode: "carry", + }), }, max_env_limit: { name: "max_env_limit", - ...column("UInt32", { description: "The environment concurrency limit. Aggregate with max().", fillMode: "carry" }), + ...column("UInt32", { + description: "The environment concurrency limit. Aggregate with max().", + fillMode: "carry", + }), }, wait_ms_sum: { name: "wait_ms_sum", - ...column("UInt64", { description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count." }), + ...column("UInt64", { + description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.", + }), }, wait_ms_count: { name: "wait_ms_count", - ...column("UInt64", { description: "Count of scheduling-delay samples. Aggregate with sum()." }), + ...column("UInt64", { + description: "Count of scheduling-delay samples. 
Aggregate with sum().", + }), }, wait_quantiles: { name: "wait_quantiles", diff --git a/apps/webapp/app/v3/queueMetrics.server.ts b/apps/webapp/app/v3/queueMetrics.server.ts index 971b0aabd83..68749ceda4b 100644 --- a/apps/webapp/app/v3/queueMetrics.server.ts +++ b/apps/webapp/app/v3/queueMetrics.server.ts @@ -127,11 +127,7 @@ export type LabeledShardState = ShardState & { stream: "queue_metrics" }; export async function probeQueueMetricsStreams(): Promise { const def = metricsDefinition(); - const states = await probeShardStates( - metricsAdminRedis(), - allStreamKeys(def), - def.consumerGroup - ); + const states = await probeShardStates(metricsAdminRedis(), allStreamKeys(def), def.consumerGroup); return states.map((s) => ({ ...s, stream: "queue_metrics" as const })); } diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts index a5b2f86b444..afb02d4bb69 100644 --- a/apps/webapp/seed-queue-metrics.mts +++ b/apps/webapp/seed-queue-metrics.mts @@ -20,7 +20,11 @@ type QueueProfile = { waitBaseMs: number; sparse?: boolean; // emit no rows when the queue is fully idle (tests carry-forward gaps) }; -type Scenario = { description: string; envLimit: (bucket: number) => number; queues: QueueProfile[] }; +type Scenario = { + description: string; + envLimit: (bucket: number) => number; + queues: QueueProfile[]; +}; // --------------------------------------------------------------------------- // CLI args @@ -113,7 +117,11 @@ const bursty = (name: string, limit: number, base: number): QueueProfile => ({ }); const scenarios: Record Scenario> = { - steady: () => ({ description: "all queues below capacity, no throttling", envLimit: () => 60, queues: steady() }), + steady: () => ({ + description: "all queues below capacity, no throttling", + envLimit: () => 60, + queues: steady(), + }), burst: () => ({ description: "periodic arrival bursts -> backlog + wait spikes + throttling", @@ -144,9 +152,12 @@ const scenarios: Record Sce }), 
"throttled-backlog": () => ({ - description: "arrival rate persistently above the queue limit -> permanent backlog + throttling", + description: + "arrival rate persistently above the queue limit -> permanent backlog + throttling", envLimit: () => 50, - queues: [{ name: "overloaded", limit: () => 10, arrivals: (_b, r) => poisson(16, r), waitBaseMs: 80 }], + queues: [ + { name: "overloaded", limit: () => 10, arrivals: (_b, r) => poisson(16, r), waitBaseMs: 80 }, + ], }), "idle-sparse": () => ({ @@ -330,8 +341,9 @@ async function insertBatched(ch: ClickHouse, rows: QueueMetricsRawV1Input[], non } async function resetEnv(ch: ClickHouse, environmentId: string) { - const raw = (ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } }) - .client; + const raw = ( + ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } } + ).client; for (const table of ["queue_metrics_raw_v1", "queue_metrics_v1"]) { await raw.command({ query: `DELETE FROM trigger_dev.${table} WHERE environment_id = '${environmentId}'`, @@ -347,7 +359,11 @@ async function resetEnv(ch: ClickHouse, environmentId: string) { // Make the synthetic project a V2 engine project with a current dev worker + a Postgres // TaskQueue per simulated queue, so the /queues list renders the V2 table (it pages from // Postgres and gates on engine version; ClickHouse only holds the metrics). -async function ensureTaskQueues(scenario: Scenario, projectId: string, runtimeEnvironmentId: string) { +async function ensureTaskQueues( + scenario: Scenario, + projectId: string, + runtimeEnvironmentId: string +) { await prisma.project.update({ where: { id: projectId }, data: { engine: "V2" } }); await prisma.backgroundWorker.upsert({ @@ -397,7 +413,9 @@ async function main() { const scenarioName = flags.scenario ?? "mixed"; const build = scenarios[scenarioName]; if (!build) { - console.error(`Unknown scenario "${scenarioName}". 
Options: ${Object.keys(scenarios).join(", ")}`); + console.error( + `Unknown scenario "${scenarioName}". Options: ${Object.keys(scenarios).join(", ")}` + ); process.exit(1); } const bucketSec = Number(flags.bucket ?? 10); @@ -415,11 +433,19 @@ async function main() { let org = await prisma.organization.findFirst({ where: { title: ORG_TITLE, members: { some: { userId: user.id } } }, }); - if (!org) org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" }); + if (!org) + org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" }); - let project = await prisma.project.findFirst({ where: { name: PROJECT_NAME, organizationId: org.id } }); + let project = await prisma.project.findFirst({ + where: { name: PROJECT_NAME, organizationId: org.id }, + }); if (!project) { - project = await createProject({ organizationSlug: org.slug, name: PROJECT_NAME, userId: user.id, version: "v3" }); + project = await createProject({ + organizationSlug: org.slug, + name: PROJECT_NAME, + userId: user.id, + version: "v3", + }); } const runtimeEnv = await prisma.runtimeEnvironment.findFirst({ @@ -430,7 +456,11 @@ async function main() { process.exit(1); } - const ids: Ids = { organization_id: org.id, project_id: project.id, environment_id: runtimeEnv.id }; + const ids: Ids = { + organization_id: org.id, + project_id: project.id, + environment_id: runtimeEnv.id, + }; const ch = clickhouse(); const nonce = `qmsim-${Date.now()}-${seed}`; @@ -448,7 +478,9 @@ async function main() { const backlog = new Array(scenario.queues.length).fill(0); console.log(`Scenario "${scenarioName}": ${scenario.description}`); - console.log(`Backfilling ${totalBuckets} x ${bucketSec}s buckets (${flags.window ?? "2h"}) for ${scenario.queues.length} queues...`); + console.log( + `Backfilling ${totalBuckets} x ${bucketSec}s buckets (${flags.window ?? 
"2h"}) for ${scenario.queues.length} queues...` + ); // Backfill: buckets from (now - window) up to now, aligned to the bucket grid. const nowBucket = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; @@ -463,12 +495,15 @@ async function main() { // Merge the AggregatingMergeTree partials so argMax "current value" widgets read cleanly. // The real pipeline relies on background merges; the simulator forces it for a tidy demo. - const raw = (ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } }) - .client; + const raw = ( + ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } } + ).client; await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_v1 FINAL` }); const origin = process.env.APP_ORIGIN ?? "http://localhost:3030"; - console.log(`\nQueues dashboard: ${origin}/orgs/${org.slug}/projects/${project.slug}/env/dev/dashboards/queues`); + console.log( + `\nQueues dashboard: ${origin}/orgs/${org.slug}/projects/${project.slug}/env/dev/dashboards/queues` + ); if (live) { console.log(`\nLive mode: appending one bucket every ${bucketSec}s (Ctrl-C to stop)...`); @@ -476,7 +511,9 @@ async function main() { // eslint-disable-next-line no-constant-condition while (true) { await new Promise((r) => setTimeout(r, bucketSec * 1000)); - const eventTime = formatChDateTime(new Date(Math.floor(Date.now() / 1000 / bucketSec) * bucketSec * 1000)); + const eventTime = formatChDateTime( + new Date(Math.floor(Date.now() / 1000 / bucketSec) * bucketSec * 1000) + ); const liveRows = simulateBucket(scenario, b, bucketSec, eventTime, ids, backlog, rng); await insertBatched(ch, liveRows, `${nonce}:live:${b}`); console.log(`bucket ${b}: ${liveRows.length} rows @ ${eventTime}`); diff --git a/apps/webapp/test/queueMetricsMapping.test.ts b/apps/webapp/test/queueMetricsMapping.test.ts index 44a5d6bf7c5..9de27044cb1 100644 --- a/apps/webapp/test/queueMetricsMapping.test.ts +++ b/apps/webapp/test/queueMetricsMapping.test.ts @@ 
-73,7 +73,17 @@ describe("mapEntryToRow", () => { it("maps a gauge entry with numeric fields", () => { const row = mapEntryToRow({ id: "1700000000000-0", - fields: { op: "gauge", q, ql: "5", cc: "2", lim: "10", eql: "7", ec: "3", elim: "20", thr: "1" }, + fields: { + op: "gauge", + q, + ql: "5", + cc: "2", + lim: "10", + eql: "7", + ec: "3", + elim: "20", + thr: "1", + }, }); expect(row).toEqual( expect.objectContaining({ diff --git a/internal-packages/metrics-pipeline/src/cachedValue.ts b/internal-packages/metrics-pipeline/src/cachedValue.ts index 6183d62c072..3c31f39c2cc 100644 --- a/internal-packages/metrics-pipeline/src/cachedValue.ts +++ b/internal-packages/metrics-pipeline/src/cachedValue.ts @@ -28,7 +28,10 @@ export class CachedRedisValue { this.logger = options.logger ?? new Logger(options.loggerName ?? "CachedRedisValue", "warn"); this.redis = createRedisClient( { ...options.redis, keyPrefix: undefined }, - { onError: (error) => this.logger.error("cached value redis error", { error, key: options.key }) } + { + onError: (error) => + this.logger.error("cached value redis error", { error, key: options.key }), + } ); this.key = options.key; this.parse = options.parse; diff --git a/internal-packages/metrics-pipeline/src/consumer.test.ts b/internal-packages/metrics-pipeline/src/consumer.test.ts index ff4406ed449..fcb36176937 100644 --- a/internal-packages/metrics-pipeline/src/consumer.test.ts +++ b/internal-packages/metrics-pipeline/src/consumer.test.ts @@ -20,42 +20,54 @@ function definitionFor(suffix: string, shardCount = 2): MetricDefinition { return { name: `qm_${Date.now()}_${suffix}`, shardCount, consumerGroup: "cg", maxLen: 1000 }; } -redisTest("emitter -> consumer round trip maps rows, dedups, and acks", async ({ redisOptions }) => { - const definition = definitionFor("rt"); - const emitter = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true } }); - const inserted: Array<{ rows: Array>; dedupToken: string }> = []; - 
- const consumer = new MetricsStreamConsumer>({ - redis: redisOptions, - definition, - consumerName: "c1", - mapEntry: (e) => ({ id: e.id, ...e.fields }), - insert: async (rows, { dedupToken }) => { - inserted.push({ rows, dedupToken }); - }, - blockMs: 200, - }); - - await consumer.start(); - emitter.emit("queueA", { op: "enqueue", q: "queueA" }); - emitter.emit("queueB", { op: "started", q: "queueB", wait: 42 }); - - await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); - await consumer.stop(); - - const rows = inserted.flatMap((i) => i.rows); - expect(rows).toContainEqual(expect.objectContaining({ op: "enqueue", q: "queueA" })); - expect(rows).toContainEqual(expect.objectContaining({ op: "started", q: "queueB", wait: "42" })); - expect(inserted[0]!.dedupToken).toMatch(/^[0-9a-f]{40}$/); +redisTest( + "emitter -> consumer round trip maps rows, dedups, and acks", + async ({ redisOptions }) => { + const definition = definitionFor("rt"); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + }); + const inserted: Array<{ rows: Array>; dedupToken: string }> = []; + + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows, { dedupToken }) => { + inserted.push({ rows, dedupToken }); + }, + blockMs: 200, + }); + + await consumer.start(); + emitter.emit("queueA", { op: "enqueue", q: "queueA" }); + emitter.emit("queueB", { op: "started", q: "queueB", wait: 42 }); + + await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); + await consumer.stop(); - const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); - for (const key of consumer.streamKeys()) { - const pending = (await admin.xpending(key, definition.consumerGroup)) as [number, ...unknown[]]; - expect(pending[0]).toBe(0); + const rows = inserted.flatMap((i) => i.rows); + 
expect(rows).toContainEqual(expect.objectContaining({ op: "enqueue", q: "queueA" })); + expect(rows).toContainEqual( + expect.objectContaining({ op: "started", q: "queueB", wait: "42" }) + ); + expect(inserted[0]!.dedupToken).toMatch(/^[0-9a-f]{40}$/); + + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + for (const key of consumer.streamKeys()) { + const pending = (await admin.xpending(key, definition.consumerGroup)) as [ + number, + ...unknown[], + ]; + expect(pending[0]).toBe(0); + } + await admin.quit(); + await emitter.close(); } - await admin.quit(); - await emitter.close(); -}); +); redisTest("emit is a no-op when the flag is disabled", async ({ redisOptions }) => { const definition = definitionFor("off"); @@ -83,7 +95,16 @@ redisTest("reclaims stale pending entries from a dead consumer", async ({ redisO await admin.xgroup("CREATE", key, definition.consumerGroup, "$", "MKSTREAM"); await admin.xadd(key, "*", "op", "ack", "q", "qZ"); await admin.xadd(key, "*", "op", "nack", "q", "qZ"); - await admin.xreadgroup("GROUP", definition.consumerGroup, "zombie", "COUNT", 10, "STREAMS", key, ">"); + await admin.xreadgroup( + "GROUP", + definition.consumerGroup, + "zombie", + "COUNT", + 10, + "STREAMS", + key, + ">" + ); const inserted: Array> = []; const consumer = new MetricsStreamConsumer>({ @@ -108,104 +129,117 @@ redisTest("reclaims stale pending entries from a dead consumer", async ({ redisO await admin.quit(); }); -redisTest("per-stream batches: one insert + distinct dedup token per shard stream", async ({ redisOptions }) => { - const definition = definitionFor("pershard", 2); - const emitter = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true } }); - // Two shard keys that land on different shards. 
- const a = "shardkey-a"; - let b = "shardkey-b0"; - for (let i = 1; shardFor(b, 2) === shardFor(a, 2); i++) b = `shardkey-b${i}`; - - const inserted: Array<{ rows: Array>; dedupToken: string }> = []; - const consumer = new MetricsStreamConsumer>({ - redis: redisOptions, - definition, - consumerName: "c1", - mapEntry: (e) => ({ id: e.id, ...e.fields }), - insert: async (rows, { dedupToken }) => { - inserted.push({ rows, dedupToken }); - }, - blockMs: 200, - }); - - await consumer.start(); - emitter.emit(a, { op: "enqueue", q: a }); - emitter.emit(b, { op: "enqueue", q: b }); - await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); - await consumer.stop(); - await emitter.close(); - - // Each shard's batch is its own dedup block with its own (stream-scoped) token. - const batchesWithRows = inserted.filter((i) => i.rows.length > 0); - expect(batchesWithRows.length).toBe(2); - expect(new Set(batchesWithRows.map((i) => i.dedupToken)).size).toBe(2); -}); - -redisTest("probe reports lag as null (not 0) when Redis cannot compute it", async ({ redisOptions }) => { - const definition = definitionFor("nillag", 1); - const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); - const key = streamKey(definition, 0); +redisTest( + "per-stream batches: one insert + distinct dedup token per shard stream", + async ({ redisOptions }) => { + const definition = definitionFor("pershard", 2); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + }); + // Two shard keys that land on different shards. 
+ const a = "shardkey-a"; + let b = "shardkey-b0"; + for (let i = 1; shardFor(b, 2) === shardFor(a, 2); i++) b = `shardkey-b${i}`; + + const inserted: Array<{ rows: Array>; dedupToken: string }> = []; + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async (rows, { dedupToken }) => { + inserted.push({ rows, dedupToken }); + }, + blockMs: 200, + }); + + await consumer.start(); + emitter.emit(a, { op: "enqueue", q: a }); + emitter.emit(b, { op: "enqueue", q: b }); + await waitFor(() => inserted.flatMap((i) => i.rows).length >= 2); + await consumer.stop(); + await emitter.close(); - await admin.xgroup("CREATE", key, definition.consumerGroup, "0", "MKSTREAM"); - const ids: string[] = []; - for (let i = 0; i < 5; i++) { - ids.push((await admin.xadd(key, "*", "op", "enqueue", "q", "qT")) as string); + // Each shard's batch is its own dedup block with its own (stream-scoped) token. + const batchesWithRows = inserted.filter((i) => i.rows.length > 0); + expect(batchesWithRows.length).toBe(2); + expect(new Set(batchesWithRows.map((i) => i.dedupToken)).size).toBe(2); } - // SETID to an arbitrary id makes the group's entries-read unknown => lag is nil - // (severe trimming can do the same in prod); the probe must NOT report that as 0. 
- await admin.xgroup("SETID", key, definition.consumerGroup, ids[2]!); - - const consumer = new MetricsStreamConsumer>({ - redis: redisOptions, - definition, - consumerName: "c1", - mapEntry: (e) => ({ id: e.id, ...e.fields }), - insert: async () => {}, - }); - try { - const states = await consumer.streamState(); - expect(states[0]!.lag).toBeNull(); - } finally { - await consumer.stop(); +); + +redisTest( + "probe reports lag as null (not 0) when Redis cannot compute it", + async ({ redisOptions }) => { + const definition = definitionFor("nillag", 1); + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, 0); + + await admin.xgroup("CREATE", key, definition.consumerGroup, "0", "MKSTREAM"); + const ids: string[] = []; + for (let i = 0; i < 5; i++) { + ids.push((await admin.xadd(key, "*", "op", "enqueue", "q", "qT")) as string); + } + // SETID to an arbitrary id makes the group's entries-read unknown => lag is nil + // (severe trimming can do the same in prod); the probe must NOT report that as 0. 
+ await admin.xgroup("SETID", key, definition.consumerGroup, ids[2]!); + + const consumer = new MetricsStreamConsumer>({ + redis: redisOptions, + definition, + consumerName: "c1", + mapEntry: (e) => ({ id: e.id, ...e.fields }), + insert: async () => {}, + }); + try { + const states = await consumer.streamState(); + expect(states[0]!.lag).toBeNull(); + } finally { + await consumer.stop(); + await admin.quit(); + } + } +); + +redisTest( + "emitGauge XADDs an op=gauge snapshot onto the shared metrics stream", + async ({ redisOptions }) => { + const definition = definitionFor("gauge", 2); + const emitter = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + }); + + emitter.emitGauge("q1", { + op: "gauge", + q: "q1", + ql: 5, + cc: 2, + lim: 10, + eql: 3, + ec: 1, + elim: 20, + thr: 0, + }); + + const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); + const key = streamKey(definition, shardFor("q1", 2)); + // Plain XADD (no odometer, no cum=0 seed) => exactly one entry, unlike counter emit(). + await waitFor2(async () => (await admin.xlen(key)) === 1); + const raw = (await admin.xrange(key, "-", "+")) as Array<[string, string[]]>; + const flat = raw[0]![1]; + const fields: Record = {}; + for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] 
= flat[i + 1]!; + expect(fields.op).toBe("gauge"); + expect(fields.q).toBe("q1"); + expect(fields.ql).toBe("5"); + expect(fields.thr).toBe("0"); await admin.quit(); + await emitter.close(); } -}); - -redisTest("emitGauge XADDs an op=gauge snapshot onto the shared metrics stream", async ({ redisOptions }) => { - const definition = definitionFor("gauge", 2); - const emitter = new MetricsStreamEmitter({ - redis: redisOptions, - definition, - flag: { enabled: () => true }, - }); - - emitter.emitGauge("q1", { - op: "gauge", - q: "q1", - ql: 5, - cc: 2, - lim: 10, - eql: 3, - ec: 1, - elim: 20, - thr: 0, - }); - - const admin = createRedisClient({ ...redisOptions, keyPrefix: undefined }); - const key = streamKey(definition, shardFor("q1", 2)); - // Plain XADD (no odometer, no cum=0 seed) => exactly one entry, unlike counter emit(). - await waitFor2(async () => (await admin.xlen(key)) === 1); - const raw = (await admin.xrange(key, "-", "+")) as Array<[string, string[]]>; - const flat = raw[0]![1]; - const fields: Record = {}; - for (let i = 0; i + 1 < flat.length; i += 2) fields[flat[i]!] 
= flat[i + 1]!; - expect(fields.op).toBe("gauge"); - expect(fields.q).toBe("q1"); - expect(fields.ql).toBe("5"); - expect(fields.thr).toBe("0"); - await admin.quit(); - await emitter.close(); -}); +); async function waitFor2(cond: () => Promise, timeoutMs = 5000): Promise { const start = Date.now(); @@ -217,9 +251,24 @@ async function waitFor2(cond: () => Promise, timeoutMs = 5000): Promise redisTest("sampledSync gates on both the flag and the sample rate", async ({ redisOptions }) => { const definition = definitionFor("sample"); - const off = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true }, gaugeSampleRate: 0 }); - const on = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => true }, gaugeSampleRate: 1 }); - const disabled = new MetricsStreamEmitter({ redis: redisOptions, definition, flag: { enabled: () => false }, gaugeSampleRate: 1 }); + const off = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + gaugeSampleRate: 0, + }); + const on = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => true }, + gaugeSampleRate: 1, + }); + const disabled = new MetricsStreamEmitter({ + redis: redisOptions, + definition, + flag: { enabled: () => false }, + gaugeSampleRate: 1, + }); expect(off.sampledSync()).toBe(false); // rate 0 => never sampled in expect(on.sampledSync()).toBe(true); // rate 1 + enabled => always @@ -274,7 +323,16 @@ redisTest("streamState reports depth, lag, and pending per shard", async ({ redi await admin.xadd(key, "*", "op", "enqueue", "q", "qX"); await admin.xadd(key, "*", "op", "ack", "q", "qX"); // Read one entry as some consumer and leave it unacked -> 1 pending, 1 still undelivered. 
- await admin.xreadgroup("GROUP", definition.consumerGroup, "reader", "COUNT", 1, "STREAMS", key, ">"); + await admin.xreadgroup( + "GROUP", + definition.consumerGroup, + "reader", + "COUNT", + 1, + "STREAMS", + key, + ">" + ); const consumer = new MetricsStreamConsumer>({ redis: redisOptions, diff --git a/internal-packages/metrics-pipeline/src/consumer.ts b/internal-packages/metrics-pipeline/src/consumer.ts index c088f56f14e..5afe419a92b 100644 --- a/internal-packages/metrics-pipeline/src/consumer.ts +++ b/internal-packages/metrics-pipeline/src/consumer.ts @@ -133,7 +133,8 @@ export class MetricsStreamConsumer { valueType: ValueType.INT, }); const lagUnknownGauge = this.meter.createObservableGauge("queue_metrics.consumer.lag_unknown", { - description: "1 when Redis cannot compute group lag (entries trimmed => data loss); alert on this", + description: + "1 when Redis cannot compute group lag (entries trimmed => data loss); alert on this", valueType: ValueType.INT, }); this.observables = [depthGauge, lagGauge, pendingGauge, lagUnknownGauge]; @@ -161,10 +162,7 @@ export class MetricsStreamConsumer { this.running = false; this.meter.removeBatchObservableCallback(this.batchCallback, this.observables); await this.loopPromise?.catch(() => {}); - await Promise.all([ - this.redis.quit().catch(() => {}), - this.probeRedis.quit().catch(() => {}), - ]); + await Promise.all([this.redis.quit().catch(() => {}), this.probeRedis.quit().catch(() => {})]); } private async ensureGroups(): Promise { diff --git a/internal-packages/metrics-pipeline/src/idempotency.ts b/internal-packages/metrics-pipeline/src/idempotency.ts index b9f5ead0a5e..60cbd661f53 100644 --- a/internal-packages/metrics-pipeline/src/idempotency.ts +++ b/internal-packages/metrics-pipeline/src/idempotency.ts @@ -5,5 +5,7 @@ import { createHash } from "node:crypto"; // `scope` (the stream key) disambiguates id sets that could collide across streams. 
export function dedupTokenFromEntryIds(ids: string[], scope = ""): string { const sorted = [...ids].sort(); - return createHash("sha1").update(`${scope}|${sorted.join(",")}`).digest("hex"); + return createHash("sha1") + .update(`${scope}|${sorted.join(",")}`) + .digest("hex"); } diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts index 44c692b577c..27a8ca5ba8c 100644 --- a/internal-packages/run-engine/src/run-queue/metrics.test.ts +++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts @@ -25,10 +25,13 @@ const authenticatedEnvDev = { organization: { id: "o1234" }, }; -async function readAllEntries(redisOptions: { - host: string; - port: number; -}, definition: MetricDefinition) { +async function readAllEntries( + redisOptions: { + host: string; + port: number; + }, + definition: MetricDefinition +) { const client = createRedisClient({ ...redisOptions, keyPrefix: undefined }); const entries: Array<{ id: string; fields: Record }> = []; for (const key of allStreamKeys(definition)) { @@ -108,7 +111,11 @@ describe("RunQueue queue-metrics emission", () => { }; try { - await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); await setTimeout(1000); const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); expect(dequeued?.messageId).toBe(message.runId); @@ -144,75 +151,83 @@ describe("RunQueue queue-metrics emission", () => { } }); - redisTest("emits a fast-path gauge reusing the admission-check locals", async ({ redisContainer }) => { - const redis = { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }; - const definition: MetricDefinition = { - name: `qm_fp_${Date.now()}`, - shardCount: 2, - consumerGroup: "cg", - maxLen: 1000, - }; - 
const emitter = new MetricsStreamEmitter({ redis, definition, flag: { enabled: () => true } }); - const queue = new RunQueue({ - name: "rq", - tracer: trace.getTracer("rq"), - defaultEnvConcurrency: 25, - logger: new Logger("RunQueue", "error"), - keys: new RunQueueFullKeyProducer(), - queueSelectionStrategy: new FairQueueSelectionStrategy({ + redisTest( + "emits a fast-path gauge reusing the admission-check locals", + async ({ redisContainer }) => { + const redis = { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }; + const definition: MetricDefinition = { + name: `qm_fp_${Date.now()}`, + shardCount: 2, + consumerGroup: "cg", + maxLen: 1000, + }; + const emitter = new MetricsStreamEmitter({ redis, + definition, + flag: { enabled: () => true }, + }); + const queue = new RunQueue({ + name: "rq", + tracer: trace.getTracer("rq"), + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "error"), keys: new RunQueueFullKeyProducer(), - }), - redis, - queueMetrics: emitter, - }); + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis, + keys: new RunQueueFullKeyProducer(), + }), + redis, + queueMetrics: emitter, + }); - const message: InputPayload = { - runId: "r-fastpath", - taskIdentifier: "task/my-task", - orgId: "o1234", - projectId: "p1234", - environmentId: authenticatedEnvDev.id, - environmentType: "DEVELOPMENT", - queue: "task/my-task", - timestamp: Date.now(), - attempt: 0, - }; + const message: InputPayload = { + runId: "r-fastpath", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: authenticatedEnvDev.id, + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, + }; - try { - // enableFastPath + empty queue + zero concurrency => the Lua takes the fast path, - // so the gauge runs the reuse snippet (queueCurrent/envCurrent/queueLimit/envLimit). 
- await queue.enqueueMessage({ - env: authenticatedEnvDev, - message, - workerQueue: authenticatedEnvDev.id, - enableFastPath: true, - }); - const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); - expect(dequeued?.messageId).toBe(message.runId); + try { + // enableFastPath + empty queue + zero concurrency => the Lua takes the fast path, + // so the gauge runs the reuse snippet (queueCurrent/envCurrent/queueLimit/envLimit). + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + enableFastPath: true, + }); + const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); + expect(dequeued?.messageId).toBe(message.runId); - const entries = await waitForEntries( - redis, - definition, - (es) => es.some((e) => e.fields.op === "gauge") && es.some((e) => e.fields.op === "enqueue") - ); - const gauge = entries.find((e) => e.fields.op === "gauge"); - assertGauge(gauge); - for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { - expect(gauge!.fields[f]).toBeDefined(); + const entries = await waitForEntries( + redis, + definition, + (es) => + es.some((e) => e.fields.op === "gauge") && es.some((e) => e.fields.op === "enqueue") + ); + const gauge = entries.find((e) => e.fields.op === "gauge"); + assertGauge(gauge); + for (const f of ["ql", "cc", "lim", "eql", "ec", "elim", "thr"]) { + expect(gauge!.fields[f]).toBeDefined(); + } + // Fast path was taken => capacity was available => not throttled. + expect(gauge!.fields.thr).toBe("0"); + expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); + } finally { + await queue.quit(); + await emitter.close(); } - // Fast path was taken => capacity was available => not throttled. 
- expect(gauge!.fields.thr).toBe("0"); - expect(entries.some((e) => e.fields.op === "enqueue")).toBe(true); - } finally { - await queue.quit(); - await emitter.close(); } - }); + ); redisTest("emits an aggregate gauge for CK queues at dequeue", async ({ redisContainer }) => { const redis = { @@ -256,13 +271,19 @@ describe("RunQueue queue-metrics emission", () => { }; try { - await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); await setTimeout(1000); const dequeued = await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); expect(dequeued?.messageId).toBe(message.runId); const entries = await waitForEntries(redis, definition, (es) => - es.some((e) => e.fields.op === "gauge" && e.fields.q.includes(":ck:") && e.fields.thr === "0") + es.some( + (e) => e.fields.op === "gauge" && e.fields.q.includes(":ck:") && e.fields.thr === "0" + ) ); const gauges = entries.filter((e) => e.fields.op === "gauge"); expect(gauges.length).toBeGreaterThan(0); @@ -323,7 +344,11 @@ describe("RunQueue queue-metrics emission", () => { }; try { - await queue.enqueueMessage({ env: authenticatedEnvDev, message, workerQueue: authenticatedEnvDev.id }); + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message, + workerQueue: authenticatedEnvDev.id, + }); await setTimeout(1000); await queue.dequeueMessageFromWorkerQueue("c1", authenticatedEnvDev.id); From bcb017dc89f8b19bbc95f81a0300bf28d9a42ddc Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 09:30:41 +0100 Subject: [PATCH 09/37] chore: use import type for type-only imports --- .../route.tsx | 6 ++---- internal-packages/clickhouse/src/queueMetrics.ts | 2 +- internal-packages/metrics-pipeline/src/flag.ts | 2 +- internal-packages/run-engine/src/run-queue/metrics.test.ts | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git 
a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 517d29e9393..ca92d1303c1 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -68,6 +68,7 @@ import { QueueListPresenter } from "~/presenters/v3/QueueListPresenter.server"; import { QueueMetricsPresenter, isQueueMetricsWindow, + type QueueListMetric, type QueueMetricsWindow, } from "~/presenters/v3/QueueMetricsPresenter.server"; import { UsageSparkline } from "~/components/primitives/UsageSparkline"; @@ -155,10 +156,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { window: QueueMetricsWindow; bucketStartMs: number; bucketIntervalMs: number; - byQueue: Record< - string, - import("~/presenters/v3/QueueMetricsPresenter.server").QueueListMetric - >; + byQueue: Record<string, QueueListMetric>; } | null = null; if (queueMetricsUiEnabled && queues.success) { diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts index 81870c96ba5..43d9e995e9b 100644 --- a/internal-packages/clickhouse/src/queueMetrics.ts +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { ClickhouseReader, ClickhouseWriter } from "./client/types.js"; +import type { ClickhouseReader, ClickhouseWriter } from "./client/types.js"; export const QueueMetricsRawV1Input = z.object({ organization_id: z.string(), diff --git a/internal-packages/metrics-pipeline/src/flag.ts b/internal-packages/metrics-pipeline/src/flag.ts index 6573e55789f..5931e088939 100644 --- a/internal-packages/metrics-pipeline/src/flag.ts +++ b/internal-packages/metrics-pipeline/src/flag.ts @@ -1,5 +1,5 @@ import type { RedisOptions } from
"@internal/redis"; -import { Logger } from "@trigger.dev/core/logger"; +import type { Logger } from "@trigger.dev/core/logger"; import { CachedRedisValue } from "./cachedValue.js"; export type CachedRedisFlagOptions = { diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts index 27a8ca5ba8c..1c4c94c4f4e 100644 --- a/internal-packages/run-engine/src/run-queue/metrics.test.ts +++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts @@ -14,7 +14,7 @@ import { expect } from "vitest"; import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js"; import { RunQueue } from "./index.js"; import { RunQueueFullKeyProducer } from "./keyProducer.js"; -import { InputPayload } from "./types.js"; +import type { InputPayload } from "./types.js"; const authenticatedEnvDev = { id: "e1234", From 946a24ded25ee3ad764abeb15c5ebfb0d861477b Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 09:33:16 +0100 Subject: [PATCH 10/37] fix(tsql): avoid polynomial backtracking in ORDER BY direction strip --- internal-packages/tsql/src/query/printer.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index f65e10dc80b..74a4a9fdcbe 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -647,7 +647,9 @@ export class ClickHousePrinter { // ORDER BY must be led by the timeBucket column (alias or full expression). // Don't fight a user ordering like `ORDER BY count DESC`. const leadTerm = orderBy[0]; - const leadExpr = leadTerm.replace(/\s+(ASC|DESC)\s*$/i, "").trim(); + // Trim first so the direction match is anchored without a trailing `\s*$`, which + // combined with the leading `\s+` would backtrack polynomially on all-whitespace input. 
+ const leadExpr = leadTerm.trim().replace(/\s+(?:ASC|DESC)$/i, ""); const matchesBucket = (expr: string): boolean => expr.toLowerCase() === bucketAlias!.toLowerCase() || expr === bucketSql; if (!matchesBucket(leadExpr)) { From 06bb47643bc8354a2076ddb39cdd3b9bbf3c5bf5 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:06:49 +0100 Subject: [PATCH 11/37] fix(tsql): strip ORDER BY direction without a backtracking regex --- internal-packages/tsql/src/query/printer.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 74a4a9fdcbe..9b9886ddffa 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -647,9 +647,16 @@ export class ClickHousePrinter { // ORDER BY must be led by the timeBucket column (alias or full expression). // Don't fight a user ordering like `ORDER BY count DESC`. const leadTerm = orderBy[0]; - // Trim first so the direction match is anchored without a trailing `\s*$`, which - // combined with the leading `\s+` would backtrack polynomially on all-whitespace input. - const leadExpr = leadTerm.trim().replace(/\s+(?:ASC|DESC)$/i, ""); + // Strip a trailing ASC/DESC direction without a regex: an unanchored `\s+` before the + // keyword backtracks polynomially across start positions on whitespace runs (CodeQL + // js/polynomial-redos). endsWith + slice is linear. + const trimmedLead = leadTerm.trim(); + const upperLead = trimmedLead.toUpperCase(); + const leadExpr = upperLead.endsWith(" ASC") + ? trimmedLead.slice(0, -4).trimEnd() + : upperLead.endsWith(" DESC") + ? 
trimmedLead.slice(0, -5).trimEnd() + : trimmedLead; const matchesBucket = (expr: string): boolean => expr.toLowerCase() === bucketAlias!.toLowerCase() || expr === bucketSql; if (!matchesBucket(leadExpr)) { From e848045b4461b112f3276c57c790b5be5aeb5d78 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:06:49 +0100 Subject: [PATCH 12/37] fix(clickhouse): remove semicolons from queue metrics migration comments --- .../clickhouse/schema/035_create_queue_metrics_v1.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql index 28e076fb1ae..0d0ed5f91ed 100644 --- a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1 environment_id String CODEC(ZSTD(1)), queue_name String CODEC(ZSTD(1)), event_time DateTime CODEC(Delta(4), ZSTD(1)), - order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e5+seq); deltaSumTimestamp ordering key + order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e5+seq), deltaSumTimestamp ordering key op LowCardinality(String), -- gauge | enqueue | started | ack | nack | dlq running UInt32 DEFAULT 0, queued UInt32 DEFAULT 0, @@ -23,7 +23,7 @@ CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1 env_limit UInt32 DEFAULT 0, throttled UInt8 DEFAULT 0, -- 1 on a gauge emission with running>=limit AND queued>0 wait_ms UInt32 DEFAULT 0, -- set on op='started' (scheduling delay) - cumulative UInt64 DEFAULT 0 -- monotonic per-(queue,op) odometer on a counter op; diffed at read time + cumulative UInt64 DEFAULT 0 -- monotonic per-(queue,op) odometer on a counter op, diffed at read time ) ENGINE = MergeTree() PARTITION BY toDate(event_time) @@ -40,7 +40,7 @@ CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_v1 
queue_name String CODEC(ZSTD(1)), bucket_start DateTime CODEC(Delta(4), ZSTD(1)), - -- Cumulative-counter deltas: each op maintains a monotonic odometer; deltaSumTimestamp + -- Cumulative-counter deltas: each op maintains a monotonic odometer, and deltaSumTimestamp -- sums positive consecutive deltas (ignoring resets) ordered by event_time, so a lost -- reading self-heals (the next surviving reading restates the total). Read with -- deltaSumTimestampMerge(), never sum(). From c7befb37ade5a69adaa66bf87abb38bd651c899c Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:06:49 +0100 Subject: [PATCH 13/37] test(clickhouse): rewrite queue metrics test for cumulative counters --- .../clickhouse/src/queueMetrics.test.ts | 69 ++++++++++++++----- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts index ef18d48288e..68d30198bc8 100644 --- a/internal-packages/clickhouse/src/queueMetrics.test.ts +++ b/internal-packages/clickhouse/src/queueMetrics.test.ts @@ -19,6 +19,30 @@ function base(op: QueueMetricsRawV1Input["op"], queue: string): QueueMetricsRawV }; } +// Cumulative counters: each op keeps a monotonic per-(queue,op) odometer, so a counter row +// carries the running total in `cumulative`. deltaSumTimestamp reconstructs the increase +// (last - first) from a seeded cum=0 baseline; order_key orders readings within an op. +let orderKey = 0; +function counter( + op: QueueMetricsRawV1Input["op"], + queue: string, + total: number, + waits?: number[] +): QueueMetricsRawV1Input[] { + const rows: QueueMetricsRawV1Input[] = [ + { ...base(op, queue), cumulative: 0, order_key: orderKey++ }, + ]; + for (let cum = 1; cum <= total; cum++) { + rows.push({ + ...base(op, queue), + cumulative: cum, + order_key: orderKey++, + ...(waits ? 
{ wait_ms: waits[cum - 1] } : {}), + }); + } + return rows; +} + const aggregatedRow = z.object({ enqueue_count: z.coerce.number(), started_count: z.coerce.number(), @@ -44,11 +68,11 @@ function readAggregated(ch: ClickHouse) { return ch.reader.query({ name: "read-queue-metrics-aggregated", query: `SELECT - sum(enqueue_count) AS enqueue_count, - sum(started_count) AS started_count, - sum(ack_count) AS ack_count, - sum(nack_count) AS nack_count, - sum(dlq_count) AS dlq_count, + deltaSumTimestampMerge(enqueue_delta) AS enqueue_count, + deltaSumTimestampMerge(started_delta) AS started_count, + deltaSumTimestampMerge(ack_delta) AS ack_count, + deltaSumTimestampMerge(nack_delta) AS nack_count, + deltaSumTimestampMerge(dlq_delta) AS dlq_count, sum(throttled_count) AS throttled_count, max(max_running) AS max_running, max(max_queued) AS max_queued, @@ -82,14 +106,11 @@ describe("queue_metrics_v1", () => { const queue = "queue-a"; const rows: QueueMetricsRawV1Input[] = [ - ...Array.from({ length: 3 }, () => base("enqueue", queue)), - ...[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000].map((wait_ms) => ({ - ...base("started", queue), - wait_ms, - })), - ...Array.from({ length: 2 }, () => base("ack", queue)), - base("nack", queue), - base("dlq", queue), + ...counter("enqueue", queue, 3), + ...counter("started", queue, 10, [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]), + ...counter("ack", queue, 2), + ...counter("nack", queue, 1), + ...counter("dlq", queue, 1), { ...base("gauge", queue), running: 8, @@ -155,12 +176,24 @@ describe("queue_metrics_v1", () => { const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); const queue = "queue-b"; - const block = (waits: number[]) => - waits.map((wait_ms) => ({ ...base("started", queue), wait_ms })); - - const [e1] = await ch.queueMetrics.insertRaw(block([100, 200, 300, 400, 500]), SYNC); + // Cumulative odometer continues across the two insert blocks (baseline 0, then 1..10); + // 
deltaSumTimestamp state and quantile state merge across the parts into one bucket. + const startedRow = (cum: number, wait_ms?: number): QueueMetricsRawV1Input => ({ + ...base("started", queue), + cumulative: cum, + order_key: orderKey++, + ...(wait_ms !== undefined ? { wait_ms } : {}), + }); + + const [e1] = await ch.queueMetrics.insertRaw( + [startedRow(0), ...[100, 200, 300, 400, 500].map((w, i) => startedRow(i + 1, w))], + SYNC + ); expect(e1).toBeNull(); - const [e2] = await ch.queueMetrics.insertRaw(block([600, 700, 800, 900, 1000]), SYNC); + const [e2] = await ch.queueMetrics.insertRaw( + [600, 700, 800, 900, 1000].map((w, i) => startedRow(i + 6, w)), + SYNC + ); expect(e2).toBeNull(); const [queryError, result] = await readAggregated(ch)({ queueName: queue }); From 8cd41b73ba5d946e5cb8444441754cad1ffa1983 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:34:32 +0100 Subject: [PATCH 14/37] fix(metrics-pipeline): use BigInt order keys and namespaced odometer counters entryOrderKey returns a string built with BigInt math so ordering stays correct at real epoch magnitudes. Odometer keys are namespaced by definition name. The consumer reports null lag for a missing consumer group instead of 0, and empty gauge values parse as NaN rather than 0. 
--- .../metrics-pipeline/src/cachedValue.ts | 3 ++- internal-packages/metrics-pipeline/src/consumer.ts | 9 +++++---- internal-packages/metrics-pipeline/src/emitter.ts | 2 +- .../metrics-pipeline/src/pipeline.test.ts | 13 ++++++++++++- internal-packages/metrics-pipeline/src/types.ts | 9 +++++---- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/internal-packages/metrics-pipeline/src/cachedValue.ts b/internal-packages/metrics-pipeline/src/cachedValue.ts index 3c31f39c2cc..7f7bbb07903 100644 --- a/internal-packages/metrics-pipeline/src/cachedValue.ts +++ b/internal-packages/metrics-pipeline/src/cachedValue.ts @@ -100,7 +100,8 @@ export class CachedRedisNumber { redis: options.redis, key: options.key, parse: (raw) => { - const n = raw == null ? Number.NaN : Number(raw); + // Number("") is 0 (not NaN), so treat blank/whitespace as missing => fallback. + const n = raw == null || raw.trim() === "" ? Number.NaN : Number(raw); return Number.isFinite(n) ? clamp(n) : fallback; }, defaultValue: fallback, diff --git a/internal-packages/metrics-pipeline/src/consumer.ts b/internal-packages/metrics-pipeline/src/consumer.ts index 5afe419a92b..4e48080a661 100644 --- a/internal-packages/metrics-pipeline/src/consumer.ts +++ b/internal-packages/metrics-pipeline/src/consumer.ts @@ -114,7 +114,7 @@ export class MetricsStreamConsumer { description: "Failed inserts (batch left pending for retry)", valueType: ValueType.INT, }); - this.insertDuration = this.meter.createHistogram("queue_metrics.consumer.insert_duration_ms", { + this.insertDuration = this.meter.createHistogram("queue_metrics.consumer.insert_duration", { description: "Sink insert latency", unit: "ms", valueType: ValueType.INT, @@ -302,9 +302,10 @@ export async function probeShardStates( for (let shard = 0; shard < keys.length; shard++) { const key = keys[shard]!; const depth = Number(await redis.xlen(key)) || 0; - // lag defaults to null (unknown) and only becomes a number when Redis reports one: - // a nil lag 
means entries were trimmed past the group's read position (data loss). - let lag: number | null = 0; + // lag defaults to null (unknown) and only becomes a number when the group is found and + // Redis reports one: a nil lag (or a missing group on an existing stream) means we can't + // compute it, e.g. entries were trimmed past the group's read position (data loss). + let lag: number | null = null; let pending = 0; try { const groups = (await redis.call("XINFO", "GROUPS", key)) as unknown[]; diff --git a/internal-packages/metrics-pipeline/src/emitter.ts b/internal-packages/metrics-pipeline/src/emitter.ts index 7bab52176b6..6574aa70fe0 100644 --- a/internal-packages/metrics-pipeline/src/emitter.ts +++ b/internal-packages/metrics-pipeline/src/emitter.ts @@ -136,7 +136,7 @@ export class MetricsStreamEmitter { if (!this.flag.enabled()) return; const op = String(fields.op ?? "unknown"); const q = String(fields.q ?? ""); - const odometerKey = `queue_metrics_cum:${op}:${q}`; + const odometerKey = `${this.def.name}_cum:${op}:${q}`; const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount)); const extra: string[] = []; for (const [field, value] of Object.entries(fields)) { diff --git a/internal-packages/metrics-pipeline/src/pipeline.test.ts b/internal-packages/metrics-pipeline/src/pipeline.test.ts index 195d0102ff6..e88419a2ebf 100644 --- a/internal-packages/metrics-pipeline/src/pipeline.test.ts +++ b/internal-packages/metrics-pipeline/src/pipeline.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import { createMetricsGaugeComputeLua } from "./lua.js"; import { dedupTokenFromEntryIds } from "./idempotency.js"; import { fnv1a32, shardFor } from "./hash.js"; -import { allStreamKeys, entryTimeMs, streamKey } from "./types.js"; +import { allStreamKeys, entryOrderKey, entryTimeMs, streamKey } from "./types.js"; describe("shardFor", () => { it("is deterministic and in range", () => { @@ -33,6 +33,17 @@ describe("stream keys", () => { 
expect(entryTimeMs("1717000000000-5")).toBe(1717000000000); expect(entryTimeMs("nope")).toBeNull(); }); + + it("entryOrderKey stays exact and strictly monotonic at real epoch magnitudes", () => { + const ms = 1783000000000; // ~2026: ms*1e5 is past JS safe-integer range, so a number key + const k = (seq: number) => BigInt(entryOrderKey(`${ms}-${seq}`)); + // adjacent seq within one ms must not collapse to the same key (the float bug) + expect(k(0)).toBe(BigInt(ms) * 100000n); + expect(k(1) - k(0)).toBe(1n); + expect(k(2) - k(1)).toBe(1n); + // a later ms always outranks any seq of an earlier ms + expect(BigInt(entryOrderKey(`${ms + 1}-0`))).toBeGreaterThan(k(99999)); + }); }); describe("createMetricsGaugeComputeLua", () => { diff --git a/internal-packages/metrics-pipeline/src/types.ts b/internal-packages/metrics-pipeline/src/types.ts index c5336efa5eb..b5de4e2b579 100644 --- a/internal-packages/metrics-pipeline/src/types.ts +++ b/internal-packages/metrics-pipeline/src/types.ts @@ -32,9 +32,10 @@ export function entryTimeMs(id: string): number | null { return Number.isFinite(ms) ? ms : null; } -// Strictly-monotonic-per-stream ordering key from a stream id (`<ms>-<seq>`): ms*1e5+seq. -// Used to order cumulative readings for deltaSumTimestamp so within-ms ties don't misorder. -export function entryOrderKey(id: string): number { +// Ordering key from a stream id (`<ms>-<seq>`) = ms*1e5+seq, for deltaSumTimestamp. BigInt + +// string because ms*1e5 exceeds JS safe-integer range at real epoch magnitudes (a number would +// collapse nearby seq values); the ClickHouse order_key column is UInt64 and takes the string. 
+export function entryOrderKey(id: string): string { const [ms, seq] = id.split("-"); - return (Number(ms) || 0) * 100000 + (Number(seq) || 0); + return (BigInt(Number(ms) || 0) * 100000n + BigInt(Number(seq) || 0)).toString(); } From 70f520d42842843c78550dbaee548886502da04f Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:34:44 +0100 Subject: [PATCH 15/37] fix(clickhouse): filter zero waits from quantile view and accept string order keys The wait-time quantile materialized view now excludes wait_ms = 0 rows so it matches the count aggregation. order_key accepts a string or a number. Migration comments no longer contain semicolons that split the migration into invalid statements. --- .../clickhouse/schema/035_create_queue_metrics_v1.sql | 2 +- internal-packages/clickhouse/src/queueMetrics.ts | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql index 0d0ed5f91ed..9f7f2aab269 100644 --- a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -88,7 +88,7 @@ SELECT max(env_limit) AS max_env_limit, sumIf(wait_ms, op = 'started') AS wait_ms_sum, countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count, - quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started') AS wait_quantiles + quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0) AS wait_quantiles FROM trigger_dev.queue_metrics_raw_v1 GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts index 43d9e995e9b..0850360b301 100644 --- a/internal-packages/clickhouse/src/queueMetrics.ts +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -7,7 +7,9 @@ export const QueueMetricsRawV1Input = 
z.object({ environment_id: z.string(), queue_name: z.string(), event_time: z.string(), - order_key: z.number().optional(), + // Exact UInt64 ordering key; a string preserves precision past JS safe-integer range + // (see entryOrderKey). A plain number is still accepted for small test values. + order_key: z.union([z.string(), z.number()]).optional(), op: z.enum(["gauge", "enqueue", "started", "ack", "nack", "dlq"]), running: z.number().optional(), queued: z.number().optional(), From 5e191439ac573ec5af5db58925715e38948b7f87 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:34:52 +0100 Subject: [PATCH 16/37] fix(webapp): fail open on queue metrics and honor sparkline total override The queues list tolerates a metrics query failure by rendering without metrics and logging a warning. UsageSparkline renders its total override even when every bucket is zero. The queue detail page returns 404 and its loader skips the metrics query when the feature flag is off. The seed script validates bucket size and only writes ClickHouse against a local host. 
--- .../components/primitives/UsageSparkline.tsx | 3 +- apps/webapp/app/env.server.ts | 3 +- .../route.tsx | 35 ++++++++++++------- .../route.tsx | 2 +- apps/webapp/seed-queue-metrics.mts | 20 ++++++++--- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/apps/webapp/app/components/primitives/UsageSparkline.tsx b/apps/webapp/app/components/primitives/UsageSparkline.tsx index 6ac7aea6a8e..7c4bbd5d262 100644 --- a/apps/webapp/app/components/primitives/UsageSparkline.tsx +++ b/apps/webapp/app/components/primitives/UsageSparkline.tsx @@ -50,7 +50,8 @@ export function UsageSparkline({ formatTotal, totalClassName = "text-blue-400", }: UsageSparklineProps) { - if (!data || data.every((v) => v === 0)) { + const hasTotalOverride = totalOverride !== undefined; + if (!data || data.length === 0 || (data.every((v) => v === 0) && !hasTotalOverride)) { return ; } diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index e7591e177b4..b153d8293bb 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -888,7 +888,6 @@ const EnvironmentSchema = z QUEUE_METRICS_EMIT_ENABLED: z.string().default("0"), QUEUE_METRICS_CONSUMER_ENABLED: z.string().default("0"), QUEUE_METRICS_STREAM_SHARD_COUNT: z.coerce.number().int().default(4), - QUEUE_METRICS_STREAM_MAXLEN: z.coerce.number().int().default(2_000_000), QUEUE_METRICS_CONSUMER_BATCH_SIZE: z.coerce.number().int().default(1000), // Counter stream (exact counts, loss-intolerant). Unset host => the run-queue Redis; // set it to a dedicated instance so counter backlog never competes with the run queue. @@ -896,7 +895,7 @@ const EnvironmentSchema = z QUEUE_METRICS_REDIS_PORT: z.coerce.number().optional(), QUEUE_METRICS_REDIS_USERNAME: z.string().optional(), QUEUE_METRICS_REDIS_PASSWORD: z.string().optional(), - QUEUE_METRICS_REDIS_TLS_DISABLED: z.string().default("false"), + QUEUE_METRICS_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? 
"false"), QUEUE_METRICS_COUNTER_STREAM_MAXLEN: z.coerce.number().int().default(8_000_000), // TTL (seconds) on the per-(queue,op) cumulative odometer key, refreshed on every write. // Idle-past-TTL queues purge and self-heal (restart from 1) on return; default 7 days. diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index ca92d1303c1..0daad15fa2b 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -73,6 +73,7 @@ import { } from "~/presenters/v3/QueueMetricsPresenter.server"; import { UsageSparkline } from "~/components/primitives/UsageSparkline"; import { Area, AreaChart, ResponsiveContainer } from "recharts"; +import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; import { ENVIRONMENT_PAUSE_SOURCE_BILLING_LIMIT } from "~/utils/environmentPauseSource"; @@ -160,19 +161,27 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { } | null = null; if (queueMetricsUiEnabled && queues.success) { - const presenter = new QueueMetricsPresenter(); - const queueNames = queues.queues.map((q) => (q.type === "task" ? `task/${q.name}` : q.name)); - const queueMetrics = - queueNames.length > 0 - ? await presenter.getQueueListMetrics({ environment, queueNames, window: period }) - : null; - if (queueMetrics) { - metrics = { - window: queueMetrics.window, - bucketStartMs: queueMetrics.bucketStartMs, - bucketIntervalMs: queueMetrics.bucketIntervalMs, - byQueue: Object.fromEntries(queueMetrics.byQueue), - }; + // Metrics are additive observability; a ClickHouse hiccup must not take down queue + // management. 
Fail open to metrics: null instead of bubbling to the page-level 400. + try { + const presenter = new QueueMetricsPresenter(); + const queueNames = queues.queues.map((q) => + q.type === "task" ? `task/${q.name}` : q.name + ); + const queueMetrics = + queueNames.length > 0 + ? await presenter.getQueueListMetrics({ environment, queueNames, window: period }) + : null; + if (queueMetrics) { + metrics = { + window: queueMetrics.window, + bucketStartMs: queueMetrics.bucketStartMs, + bucketIntervalMs: queueMetrics.bucketIntervalMs, + byQueue: Object.fromEntries(queueMetrics.byQueue), + }; + } + } catch (error) { + logger.warn("Queue list metrics unavailable, rendering without them", { error }); } } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx index d24129806f6..3b5ea5f07c1 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx @@ -443,7 +443,7 @@ function QueueStats({ loading={showLoading} /> 0 ? formatWaitMs(worstP95) : "–"} loading={showLoading} className={worstP95 >= 60_000 ? "text-warning" : undefined} diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts index afb02d4bb69..2af9ea661d7 100644 --- a/apps/webapp/seed-queue-metrics.mts +++ b/apps/webapp/seed-queue-metrics.mts @@ -282,7 +282,7 @@ function simulateBucket( env_running: envRunning, env_queued: envQueued, env_limit: envLimit, - throttled: running[q] >= limit[q] && queued[q] > 0 ? 1 : 0, + throttled: queued[q] > 0 && (running[q] >= limit[q] || scale < 1) ? 
1 : 0, }; rows.push(gauge); @@ -317,11 +317,13 @@ function clickhouse(): ClickHouse { console.error("CLICKHOUSE_URL not set"); process.exit(1); } - if (/\.clickhouse\.cloud|prod/i.test(clickhouseUrl)) { - console.error(`Refusing to run against a non-local ClickHouse: ${clickhouseUrl}`); + const url = new URL(clickhouseUrl); + // Allowlist local hosts only (this script TRUNCATEs), and never echo the URL (it carries creds). + const localHosts = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]); + if (!localHosts.has(url.hostname)) { + console.error(`Refusing to run against a non-local ClickHouse host: ${url.hostname}`); process.exit(1); } - const url = new URL(clickhouseUrl); url.searchParams.delete("secure"); return new ClickHouse({ url: url.toString(), name: "queue-metrics-simulator" }); } @@ -419,8 +421,18 @@ async function main() { process.exit(1); } const bucketSec = Number(flags.bucket ?? 10); + if (!Number.isFinite(bucketSec) || bucketSec <= 0) { + console.error(`--bucket must be a positive number of seconds, got: ${flags.bucket}`); + process.exit(1); + } const windowSec = parseDuration(flags.window ?? "2h"); const totalBuckets = Math.floor(windowSec / bucketSec); + if (!Number.isFinite(totalBuckets) || totalBuckets <= 0) { + console.error( + `--window must be longer than --bucket (got ${windowSec}s window, ${bucketSec}s bucket)` + ); + process.exit(1); + } const seed = Number(flags.seed ?? 
1); const live = flags.live === "true"; From 4b465b7b0f435e2893b1d9bc3d5df239ce540466 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:34:59 +0100 Subject: [PATCH 17/37] test(run-engine): import describe from vitest in run-queue metrics test --- internal-packages/run-engine/src/run-queue/metrics.test.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal-packages/run-engine/src/run-queue/metrics.test.ts b/internal-packages/run-engine/src/run-queue/metrics.test.ts index 1c4c94c4f4e..a530ed44a31 100644 --- a/internal-packages/run-engine/src/run-queue/metrics.test.ts +++ b/internal-packages/run-engine/src/run-queue/metrics.test.ts @@ -8,9 +8,8 @@ import { } from "@internal/metrics-pipeline"; import { Logger } from "@trigger.dev/core/logger"; import { Decimal } from "@trigger.dev/database"; -import { describe } from "node:test"; import { setTimeout } from "node:timers/promises"; -import { expect } from "vitest"; +import { describe, expect } from "vitest"; import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js"; import { RunQueue } from "./index.js"; import { RunQueueFullKeyProducer } from "./keyProducer.js"; From 9ce3bd2f3e9f9d9da0dd3f74152745310ad3d693 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 10:39:31 +0100 Subject: [PATCH 18/37] fix(tsql): skip gap-fill on descending bucket order A bucket-led ORDER BY DESC combined with fillGaps emitted an ascending WITH FILL (positive step, ascending bounds), which produces invalid or empty fills. Skip the gap-fill rewrite for descending orders and let the plain descending query stand. Adds a DESC fillGaps test. 
--- internal-packages/tsql/src/query/printer.test.ts | 10 ++++++++++ internal-packages/tsql/src/query/printer.ts | 10 +++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index f8f32edeac5..831e73f88c9 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -3971,4 +3971,14 @@ describe("timeBucket() fillGaps", () => { expect(sql).not.toContain("WITH FILL"); expect(sql).not.toContain("INTERPOLATE"); }); + + it("bucket-led ORDER BY DESC: fill is skipped (ascending fill would be invalid)", () => { + const query = + "SELECT timeBucket(), count() AS runs FROM metrics GROUP BY timeBucket ORDER BY timeBucket DESC"; + const { sql } = run(query, true); + expect(sql).not.toContain("WITH FILL"); + expect(sql).not.toContain("INTERPOLATE"); + // The plain descending order still stands. + expect(sql).toContain("ORDER BY timebucket DESC"); + }); }); diff --git a/internal-packages/tsql/src/query/printer.ts b/internal-packages/tsql/src/query/printer.ts index 9b9886ddffa..73cb821dfaf 100644 --- a/internal-packages/tsql/src/query/printer.ts +++ b/internal-packages/tsql/src/query/printer.ts @@ -652,9 +652,10 @@ export class ClickHousePrinter { // js/polynomial-redos). endsWith + slice is linear. const trimmedLead = leadTerm.trim(); const upperLead = trimmedLead.toUpperCase(); + const isDescending = upperLead.endsWith(" DESC"); const leadExpr = upperLead.endsWith(" ASC") ? trimmedLead.slice(0, -4).trimEnd() - : upperLead.endsWith(" DESC") + : isDescending ? trimmedLead.slice(0, -5).trimEnd() : trimmedLead; const matchesBucket = (expr: string): boolean => @@ -662,6 +663,13 @@ export class ClickHousePrinter { if (!matchesBucket(leadExpr)) { return null; } + // WITH FILL is emitted with ascending bounds and a positive STEP, which is + // only valid for an ascending bucket order. 
A descending order would need + // swapped bounds and a negative step (newer ClickHouse only), so skip the + // gap-fill rewrite and let the plain descending ORDER BY stand. + if (isDescending) { + return null; + } // Group dims = GROUP BY expressions that are NOT the timeBucket column. const groupDims = (groupBy ?? []).filter((g) => !matchesBucket(g.trim())); From a33912b5a80e60517d197ede9b0ec4d3e8d513df Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 3 Jul 2026 11:01:34 +0100 Subject: [PATCH 19/37] fix(metrics-pipeline): widen order_key packing factor to 1e6 Packs the stream sequence with a 1e6 factor (was 1e5) so up to 1M entries per millisecond per shard fit before a seq could spill into the next millisecond's range, far above what a single Redis stream can produce. ms*1e6 stays within UInt64. Also fixes the webapp mapping test that still expected a numeric order_key after the switch to a BigInt-derived string. --- apps/webapp/test/queueMetricsMapping.test.ts | 2 +- .../clickhouse/schema/035_create_queue_metrics_v1.sql | 2 +- internal-packages/metrics-pipeline/src/pipeline.test.ts | 8 ++++---- internal-packages/metrics-pipeline/src/types.ts | 7 ++++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/apps/webapp/test/queueMetricsMapping.test.ts b/apps/webapp/test/queueMetricsMapping.test.ts index 9de27044cb1..affb4881398 100644 --- a/apps/webapp/test/queueMetricsMapping.test.ts +++ b/apps/webapp/test/queueMetricsMapping.test.ts @@ -112,7 +112,7 @@ describe("mapEntryToRow", () => { op: "started", wait_ms: 48, cumulative: 512, - order_key: 1700000000000 * 100000, + order_key: (1700000000000n * 1000000n).toString(), }) ); expect(mapEntryToRow({ id: "1-0", fields: { op: "ack", q, cum: "9" } })).toEqual( diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql index 9f7f2aab269..ebbe70391b6 100644 --- 
a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_raw_v1 environment_id String CODEC(ZSTD(1)), queue_name String CODEC(ZSTD(1)), event_time DateTime CODEC(Delta(4), ZSTD(1)), - order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e5+seq), deltaSumTimestamp ordering key + order_key UInt64 DEFAULT 0, -- stream-id composite (ms*1e6+seq), deltaSumTimestamp ordering key op LowCardinality(String), -- gauge | enqueue | started | ack | nack | dlq running UInt32 DEFAULT 0, queued UInt32 DEFAULT 0, diff --git a/internal-packages/metrics-pipeline/src/pipeline.test.ts b/internal-packages/metrics-pipeline/src/pipeline.test.ts index e88419a2ebf..173f9cbbcd8 100644 --- a/internal-packages/metrics-pipeline/src/pipeline.test.ts +++ b/internal-packages/metrics-pipeline/src/pipeline.test.ts @@ -35,14 +35,14 @@ describe("stream keys", () => { }); it("entryOrderKey stays exact and strictly monotonic at real epoch magnitudes", () => { - const ms = 1783000000000; // ~2026: ms*1e5 is past JS safe-integer range, so a number key + const ms = 1783000000000; // ~2026: ms*1e6 is past JS safe-integer range, so a number key const k = (seq: number) => BigInt(entryOrderKey(`${ms}-${seq}`)); // adjacent seq within one ms must not collapse to the same key (the float bug) - expect(k(0)).toBe(BigInt(ms) * 100000n); + expect(k(0)).toBe(BigInt(ms) * 1000000n); expect(k(1) - k(0)).toBe(1n); expect(k(2) - k(1)).toBe(1n); - // a later ms always outranks any seq of an earlier ms - expect(BigInt(entryOrderKey(`${ms + 1}-0`))).toBeGreaterThan(k(99999)); + // a later ms always outranks any seq of an earlier ms (up to the 1M/ms factor) + expect(BigInt(entryOrderKey(`${ms + 1}-0`))).toBeGreaterThan(k(999999)); }); }); diff --git a/internal-packages/metrics-pipeline/src/types.ts b/internal-packages/metrics-pipeline/src/types.ts index 
b5de4e2b579..d9e9e43f554 100644 --- a/internal-packages/metrics-pipeline/src/types.ts +++ b/internal-packages/metrics-pipeline/src/types.ts @@ -32,10 +32,11 @@ export function entryTimeMs(id: string): number | null { return Number.isFinite(ms) ? ms : null; } -// Ordering key from a stream id (`<ms>-<seq>`) = ms*1e5+seq, for deltaSumTimestamp. BigInt + -// string because ms*1e5 exceeds JS safe-integer range at real epoch magnitudes (a number would +// Ordering key from a stream id (`<ms>-<seq>`) = ms*1e6+seq, for deltaSumTimestamp. BigInt + +// string because ms*1e6 exceeds JS safe-integer range at real epoch magnitudes (a number would // collapse nearby seq values); the ClickHouse order_key column is UInt64 and takes the string. +// The 1e6 factor (1M entries/ms/shard, far above any single Redis stream) stays within UInt64. export function entryOrderKey(id: string): string { const [ms, seq] = id.split("-"); - return (BigInt(Number(ms) || 0) * 100000n + BigInt(Number(seq) || 0)).toString(); + return (BigInt(Number(ms) || 0) * 1000000n + BigInt(Number(seq) || 0)).toString(); } From 957731689145e0d6540023308971af89ebdc72f5 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 03:34:52 +0100 Subject: [PATCH 20/37] feat(webapp,clickhouse): standard time filter for queue metrics pages The queues list and queue detail pages now use the shared TimeFilter (any preset period or a custom date range) and everything on the page follows it: header tiles, per queue metric columns, charts, and stats. The custom period buttons, hand rolled chart cards, and duplicated metric fetch loops are replaced by the ChartCard and Chart primitives, UsageSparkline, and a shared useMetricResourceQuery hook. The ClickHouse list queries take an explicit end bound so fixed ranges query only their window. 
--- .../app/hooks/useMetricResourceQuery.ts | 109 ++++++ .../v3/QueueMetricsPresenter.server.ts | 34 +- .../route.tsx | 290 +++++--------- .../route.tsx | 355 +++++------------- .../clickhouse/src/queueMetrics.ts | 3 + 5 files changed, 323 insertions(+), 468 deletions(-) create mode 100644 apps/webapp/app/hooks/useMetricResourceQuery.ts diff --git a/apps/webapp/app/hooks/useMetricResourceQuery.ts b/apps/webapp/app/hooks/useMetricResourceQuery.ts new file mode 100644 index 00000000000..8cb8faec507 --- /dev/null +++ b/apps/webapp/app/hooks/useMetricResourceQuery.ts @@ -0,0 +1,109 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { useInterval } from "./useInterval"; + +export type MetricResourceRow = Record; + +type MetricResourceResponse = + | { success: true; data: { rows: MetricResourceRow[] } } + | { success: false; error: string }; + +export type MetricResourceTimeRange = { + period: string | null; + from: string | null; + to: string | null; +}; + +export type MetricResourceQueryOptions = { + organizationId: string; + projectId: string; + environmentId: string; + timeRange: MetricResourceTimeRange; + defaultPeriod: string; + queues?: string[]; + fillGaps?: boolean; + refreshIntervalMs?: number; +}; + +/** + * Client-fetch a TRQL query from the metric resource route (like the dashboard + * widgets): own loading state, interval + on-focus refresh, abort on change/unmount. + */ +export function useMetricResourceQuery(query: string, opts: MetricResourceQueryOptions) { + const [rows, setRows] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [failed, setFailed] = useState(false); + const abortRef = useRef(null); + + const { + organizationId, + projectId, + environmentId, + defaultPeriod, + fillGaps, + refreshIntervalMs = 60_000, + } = opts; + const { period, from, to } = opts.timeRange; + const queuesKey = opts.queues && opts.queues.length > 0 ? 
opts.queues.join(",") : undefined; + + const load = useCallback(() => { + abortRef.current?.abort(); + const controller = new AbortController(); + abortRef.current = controller; + setIsLoading(true); + fetch("/resources/metric", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query, + scope: "environment", + period: period ?? (from || to ? null : defaultPeriod), + from, + to, + fillGaps: !!fillGaps, + organizationId, + projectId, + environmentId, + ...(queuesKey !== undefined ? { queues: queuesKey.split(",") } : {}), + }), + signal: controller.signal, + }) + .then((res) => res.json() as Promise) + .then((data) => { + if (controller.signal.aborted) return; + if (data.success) { + setRows(data.data.rows); + setFailed(false); + } else { + setFailed(true); + } + setIsLoading(false); + }) + .catch((error) => { + if (error instanceof DOMException && error.name === "AbortError") return; + if (!controller.signal.aborted) { + setFailed(true); + setIsLoading(false); + } + }); + }, [ + query, + period, + from, + to, + defaultPeriod, + fillGaps, + organizationId, + projectId, + environmentId, + queuesKey, + ]); + + useEffect(() => { + load(); + return () => abortRef.current?.abort(); + }, [load]); + + useInterval({ interval: refreshIntervalMs, onLoad: false, onFocus: true, callback: load }); + + return { rows: rows ?? 
[], isLoading, showLoading: isLoading && !rows, failed }; +} diff --git a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts index 3545c8598f4..b683ef41798 100644 --- a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts @@ -2,18 +2,6 @@ import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { logger } from "~/services/logger.server"; -export const QUEUE_METRICS_WINDOWS = { - "1h": 3600, - "6h": 21600, - "24h": 86400, -} as const; - -export type QueueMetricsWindow = keyof typeof QUEUE_METRICS_WINDOWS; - -export function isQueueMetricsWindow(value: unknown): value is QueueMetricsWindow { - return typeof value === "string" && value in QUEUE_METRICS_WINDOWS; -} - export type QueueListMetric = { p50WaitMs: number | null; p95WaitMs: number | null; @@ -23,7 +11,6 @@ export type QueueListMetric = { }; export type QueueListMetrics = { - window: QueueMetricsWindow; bucketStartMs: number; bucketIntervalMs: number; byQueue: Map; @@ -41,30 +28,30 @@ function finiteOrNull(value: number): number | null { export class QueueMetricsPresenter { /** - * Recent per-queue metrics for a fixed set of queues (the visible list page), + * Per-queue metrics over a time range for a fixed set of queues (the visible list page), * scoped to one ClickHouse query window so cost is independent of total queue count. * Degrades to an empty map if ClickHouse is unavailable so the live list still renders. 
*/ public async getQueueListMetrics({ environment, queueNames, - window, + from, + to, }: { environment: AuthenticatedEnvironment; queueNames: string[]; - window: QueueMetricsWindow; + from: Date; + to: Date; }): Promise { - const windowSeconds = QUEUE_METRICS_WINDOWS[window]; - const bucketSeconds = Math.max(60, Math.round(windowSeconds / SPARKLINE_POINTS)); - const numBuckets = Math.ceil(windowSeconds / bucketSeconds); - const nowSeconds = Math.floor(Date.now() / 1000); + const rangeSeconds = Math.max(60, Math.round((to.getTime() - from.getTime()) / 1000)); + const bucketSeconds = Math.max(60, Math.round(rangeSeconds / SPARKLINE_POINTS)); + const numBuckets = Math.max(1, Math.ceil(rangeSeconds / bucketSeconds)); const gridStartSeconds = - Math.floor((nowSeconds - windowSeconds) / bucketSeconds) * bucketSeconds; + Math.floor(Math.floor(from.getTime() / 1000) / bucketSeconds) * bucketSeconds; const bucketStartMs = gridStartSeconds * 1000; const bucketIntervalMs = bucketSeconds * 1000; const empty: QueueListMetrics = { - window, bucketStartMs, bucketIntervalMs, byQueue: new Map(), @@ -86,6 +73,7 @@ export class QueueMetricsPresenter { environmentId: environment.id, queueNames, startTime: formatClickhouseDateTime(new Date(bucketStartMs)), + endTime: formatClickhouseDateTime(to), }; const [summaryResult, sparklineResult] = await Promise.all([ @@ -137,7 +125,7 @@ export class QueueMetricsPresenter { }); } - return { window, bucketStartMs, bucketIntervalMs, byQueue }; + return { bucketStartMs, bucketIntervalMs, byQueue }; } catch (error) { logger.warn("QueueMetricsPresenter: failed to load queue metrics", { error: error instanceof Error ? 
error.message : String(error), diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 0daad15fa2b..59b5072741d 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -7,11 +7,11 @@ import { RectangleStackIcon, } from "@heroicons/react/20/solid"; import { DialogClose } from "@radix-ui/react-dialog"; -import { Form, Link, useNavigation, useSearchParams, type MetaFunction } from "@remix-run/react"; +import { Form, Link, useNavigation, type MetaFunction } from "@remix-run/react"; import { type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/server-runtime"; import type { QueueItem } from "@trigger.dev/core/v3/schemas"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { type ReactNode, useCallback, useEffect, useRef, useState } from "react"; +import { type ReactNode, useEffect, useState } from "react"; import { typedjson, useTypedLoaderData } from "remix-typedjson"; import { z } from "zod"; import { ConcurrencyIcon } from "~/assets/icons/ConcurrencyIcon"; @@ -54,7 +54,6 @@ import { import { QueueName } from "~/components/runs/v3/QueueName"; import { env } from "~/env.server"; import { useAutoRevalidate } from "~/hooks/useAutoRevalidate"; -import { useInterval } from "~/hooks/useInterval"; import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; @@ -67,12 +66,16 @@ import { EnvironmentQueuePresenter } from "~/presenters/v3/EnvironmentQueuePrese import { QueueListPresenter } from "~/presenters/v3/QueueListPresenter.server"; import { 
QueueMetricsPresenter, - isQueueMetricsWindow, type QueueListMetric, - type QueueMetricsWindow, } from "~/presenters/v3/QueueMetricsPresenter.server"; +import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; +import { useSearchParams } from "~/hooks/useSearchParam"; +import { parseFiniteInt } from "~/utils/searchParams"; import { UsageSparkline } from "~/components/primitives/UsageSparkline"; -import { Area, AreaChart, ResponsiveContainer } from "recharts"; +import { + useMetricResourceQuery, + type MetricResourceTimeRange, +} from "~/hooks/useMetricResourceQuery"; import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; @@ -96,8 +99,12 @@ const SearchParamsSchema = z.object({ query: z.string().optional(), page: z.coerce.number().min(1).default(1), period: z.string().optional(), + from: z.string().optional(), + to: z.string().optional(), }); +const QUEUE_METRICS_DEFAULT_PERIOD = "1d"; + export const meta: MetaFunction = () => { return [ { @@ -111,12 +118,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params); const url = new URL(request.url); - const { - page, - query, - period: rawPeriod, - } = SearchParamsSchema.parse(Object.fromEntries(url.searchParams)); - const period: QueueMetricsWindow = isQueueMetricsWindow(rawPeriod) ? rawPeriod : "24h"; + const { page, query, period, from, to } = SearchParamsSchema.parse( + Object.fromEntries(url.searchParams) + ); const project = await findProjectBySlug(organizationSlug, projectParam, userId); if (!project) { @@ -154,7 +158,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { // The environment header tiles are fetched client-side per card (see QueueEnvMetricTile) so a // slow ClickHouse query never blocks the queues list from rendering. 
let metrics: { - window: QueueMetricsWindow; bucketStartMs: number; bucketIntervalMs: number; byQueue: Record; @@ -168,13 +171,23 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const queueNames = queues.queues.map((q) => q.type === "task" ? `task/${q.name}` : q.name ); + const timeRange = timeFilterFromTo({ + period, + from: parseFiniteInt(from), + to: parseFiniteInt(to), + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + }); const queueMetrics = queueNames.length > 0 - ? await presenter.getQueueListMetrics({ environment, queueNames, window: period }) + ? await presenter.getQueueListMetrics({ + environment, + queueNames, + from: timeRange.from, + to: timeRange.to, + }) : null; if (queueMetrics) { metrics = { - window: queueMetrics.window, bucketStartMs: queueMetrics.bucketStartMs, bucketIntervalMs: queueMetrics.bucketIntervalMs, byQueue: Object.fromEntries(queueMetrics.byQueue), @@ -190,7 +203,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { environment: await environmentQueuePresenter.call(environment), autoReloadPollIntervalMs, metrics, - period, queueMetricsUiEnabled, }); } catch (error) { @@ -377,7 +389,6 @@ function QueuesWithMetricsView() { hasFilters, autoReloadPollIntervalMs, metrics, - period, } = useTypedLoaderData(); const metricsByQueue = metrics?.byQueue ?? {}; @@ -386,6 +397,15 @@ function QueuesWithMetricsView() { const project = useProject(); const env = useEnvironment(); const plan = useCurrentPlan(); + const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number; + + // The header tiles fetch client-side with the same period/from/to the TimeFilter writes. + const { value } = useSearchParams(); + const timeRange = { + period: value("period") ?? null, + from: value("from") ?? null, + to: value("to") ?? null, + }; useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); @@ -432,16 +452,23 @@ function QueuesWithMetricsView() {
{QUEUE_HEADER_TILES.map((tile) => ( - + ))}
{success ? (
-
+
- +
; } -const QUEUE_METRICS_PERIODS: { value: QueueMetricsWindow; label: string }[] = [ - { value: "1h", label: "1h" }, - { value: "6h", label: "6h" }, - { value: "24h", label: "24h" }, -]; - -function QueuePeriodSelect({ period }: { period: QueueMetricsWindow }) { - const [searchParams] = useSearchParams(); - const hrefFor = (value: QueueMetricsWindow) => { - const next = new URLSearchParams(searchParams); - next.set("period", value); - next.delete("page"); - return `?${next.toString()}`; - }; - return ( -
- Metrics - {QUEUE_METRICS_PERIODS.map(({ value, label }) => ( - - {label} - - ))} -
- ); -} - type MetricTileRow = Record; -type MetricTileResponse = - | { success: true; data: { rows: MetricTileRow[] } } - | { success: false; error: string }; - type QueueHeaderTile = { id: string; label: string; color: string; query: string; + unitLabel: { singular: string; plural: string }; derive: (rows: MetricTileRow[]) => { sparkline: number[]; - value: ReactNode; - valueClassName?: string; + total: number; + formatTotal?: (total: number) => string; + totalClassName?: string; }; }; @@ -1160,6 +1154,11 @@ function tileNumber(value: number | string | null): number { return Number.isFinite(n) ? n : 0; } +function tileTimeToMs(value: number | string | null): number { + const s = String(value).replace(" ", "T"); + return Date.parse(s.endsWith("Z") ? s : `${s}Z`); +} + // Header tiles fetch their own TRQL query client-side (resources.metric) with fillGaps, mirroring the // metrics dashboard widgets: the gauges (saturation inputs, backlog) carry, counters/p95 zero-fill. const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ @@ -1168,13 +1167,14 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Env saturation", color: "#6366F1", query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + unitLabel: { singular: "%", plural: "%" }, derive: (rows) => { const sparkline = rows.map((r) => { const limit = tileNumber(r.env_limit); return limit > 0 ? 
Math.round((tileNumber(r.used) / limit) * 100) : 0; }); const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); - return { sparkline, value: `${peak}% peak` }; + return { sparkline, total: peak, formatTotal: (v) => `${v}% peak` }; }, }, { @@ -1182,10 +1182,11 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Backlog", color: "#A78BFA", query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + unitLabel: { singular: "queued", plural: "queued" }, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.queued)); const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); - return { sparkline, value: `${peak.toLocaleString()} peak` }; + return { sparkline, total: peak, formatTotal: (v) => `${v.toLocaleString()} peak` }; }, }, { @@ -1193,13 +1194,15 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Scheduling delay p95", color: "#F59E0B", query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + unitLabel: { singular: "ms", plural: "ms" }, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.p95)); const worst = sparkline.reduce((max, v) => Math.max(max, v), 0); return { sparkline, - value: worst > 0 ? formatWaitMs(worst) : "–", - valueClassName: worst >= 60_000 ? "text-warning" : undefined, + total: worst, + formatTotal: (v) => (v > 0 ? formatWaitMs(v) : "–"), + totalClassName: worst >= 60_000 ? 
"text-warning" : undefined, }; }, }, @@ -1208,171 +1211,78 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Throttled", color: "#F59E0B", query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + unitLabel: { singular: "throttled bucket", plural: "throttled buckets" }, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.throttled)); const total = sparkline.reduce((sum, v) => sum + v, 0); return { sparkline, - value: total.toLocaleString(), - valueClassName: total > 0 ? "text-warning" : undefined, + total, + totalClassName: total > 0 ? "text-warning" : undefined, }; }, }, ]; +type TileTimeRange = MetricResourceTimeRange; + function QueueEnvMetricTile({ tile, - period, + timeRange, }: { tile: QueueHeaderTile; - period: QueueMetricsWindow; + timeRange: TileTimeRange; }) { const organization = useOrganization(); const project = useProject(); const environment = useEnvironment(); - const [response, setResponse] = useState(null); - const [isLoading, setIsLoading] = useState(true); - const abortRef = useRef(null); - - const orgId = organization.id; - const projectId = project.id; - const environmentId = environment.id; - const { query } = tile; - - const load = useCallback(() => { - abortRef.current?.abort(); - const controller = new AbortController(); - abortRef.current = controller; - setIsLoading(true); - fetch("/resources/metric", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - query, - scope: "environment", - period, - from: null, - to: null, - fillGaps: true, - organizationId: orgId, - projectId, - environmentId, - }), - signal: controller.signal, - }) - .then((res) => res.json() as Promise) - .then((data) => { - if (!controller.signal.aborted) { - setResponse(data); - setIsLoading(false); - } - }) - .catch((error) => { - if (error instanceof DOMException && error.name === "AbortError") return; - if (!controller.signal.aborted) { - 
setResponse({ success: false, error: "Network error" }); - setIsLoading(false); - } - }); - }, [query, period, orgId, projectId, environmentId]); - - useEffect(() => { - load(); - return () => abortRef.current?.abort(); - }, [load]); - useInterval({ interval: 60_000, onLoad: false, onFocus: true, callback: load }); + const { rows, isLoading, showLoading, failed } = useMetricResourceQuery(tile.query, { + organizationId: organization.id, + projectId: project.id, + environmentId: environment.id, + timeRange, + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + fillGaps: true, + }); - const rows = response?.success ? response.data.rows : []; - const hasData = rows.length > 0; - const showLoading = isLoading && !hasData; - const failed = response !== null && !response.success; - const { sparkline, value, valueClassName } = tile.derive(rows); + const { sparkline, total, formatTotal, totalClassName } = tile.derive(rows); + const bucketStartMs = rows.length > 0 ? tileTimeToMs(rows[0].t) : undefined; + const bucketIntervalMs = + rows.length >= 2 ? tileTimeToMs(rows[1].t) - tileTimeToMs(rows[0].t) : undefined; return ( - - ) : failed ? undefined : ( - value - ) - } - valueClassName={valueClassName} - > + {showLoading ? ( -
+
) : failed ? ( -
- Unable to load metrics -
+
Unable to load metrics
) : ( - + )} ); } -function HeaderTile({ - label, - value, - valueClassName, - className, - children, -}: { - label: ReactNode; - value?: ReactNode; - valueClassName?: string; - className?: string; - children: ReactNode; -}) { +function HeaderTile({ label, children }: { label: ReactNode; children: ReactNode }) { return ( -
-
- {label} - {value !== undefined ? ( - - {value} - - ) : null} -
+
+ {label} {children}
); } -function MiniChart({ data, color }: { data: number[]; color: string }) { - if (!data || data.length === 0 || data.every((v) => v === 0)) { - return
No activity
; - } - const chartData = data.map((v, i) => ({ i, v })); - return ( -
- - - - - -
- ); -} - function QueueHealthBadge({ paused, running, @@ -1400,7 +1310,7 @@ function QueueHealthBadge({ } if (queued > 0) { return ( - + Backlogged ); diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx index 3b5ea5f07c1..8b765bd8f63 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues_.$queueParam/route.tsx @@ -1,33 +1,27 @@ -import { useSearchParams, type MetaFunction } from "@remix-run/react"; +import { type MetaFunction } from "@remix-run/react"; import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { useCallback, useEffect, useMemo, useRef, useState } from "react"; -import { - CartesianGrid, - Line, - LineChart, - ReferenceLine, - ResponsiveContainer, - Tooltip, - type TooltipProps, - XAxis, - YAxis, -} from "recharts"; +import { useMemo } from "react"; import { typedjson, useTypedLoaderData } from "remix-typedjson"; import { z } from "zod"; import { PageBody, PageContainer } from "~/components/layout/AppLayout"; -import { LinkButton } from "~/components/primitives/Buttons"; -import { Header2 } from "~/components/primitives/Headers"; -import { LoadingBarDivider } from "~/components/primitives/LoadingBarDivider"; import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; -import { Paragraph } from "~/components/primitives/Paragraph"; -import { useInterval } from "~/hooks/useInterval"; +import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis"; +import { + Chart, + type ChartConfig, + type ChartState, +} from "~/components/primitives/charts/ChartCompound"; +import { ChartCard } from "~/components/primitives/charts/ChartCard"; 
+import { + useMetricResourceQuery, + type MetricResourceTimeRange, +} from "~/hooks/useMetricResourceQuery"; import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; -import { - isQueueMetricsWindow, - type QueueMetricsWindow, -} from "~/presenters/v3/QueueMetricsPresenter.server"; import { QueueRetrievePresenter } from "~/presenters/v3/QueueRetrievePresenter.server"; +import { TimeFilter } from "~/components/runs/v3/SharedFilters"; +import { useSearchParams } from "~/hooks/useSearchParam"; +import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; @@ -48,8 +42,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { } const url = new URL(request.url); - const rawPeriod = url.searchParams.get("period") ?? undefined; - const period: QueueMetricsWindow = isQueueMetricsWindow(rawPeriod) ? 
rawPeriod : "24h"; const project = await findProjectBySlug(organizationSlug, projectParam, userId); if (!project) throw new Response(undefined, { status: 404, statusText: "Project not found" }); @@ -72,7 +64,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { return typedjson({ queue, fullName, - period, backPath: url.pathname.replace(/\/[^/]+$/, ""), ids: { organizationId: environment.organizationId, @@ -94,8 +85,21 @@ const COLORS = { type Ids = { organizationId: string; projectId: string; environmentId: string }; +type TimeRangeParams = MetricResourceTimeRange; + +const QUEUE_METRICS_DEFAULT_PERIOD = "1d"; + export default function Page() { - const { queue, fullName, period, backPath, ids } = useTypedLoaderData(); + const { queue, fullName, backPath, ids } = useTypedLoaderData(); + const plan = useCurrentPlan(); + const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number; + + const { value } = useSearchParams(); + const timeRange: TimeRangeParams = { + period: value("period") ?? null, + from: value("from") ?? null, + to: value("to") ?? null, + }; return ( @@ -104,14 +108,21 @@ export default function Page() {
-
+
- +
@@ -140,7 +151,7 @@ export default function Page() { query={`SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[3]) AS p99\nFROM queue_metrics\nGROUP BY t\nORDER BY t`} fillGaps ids={ids} - period={period} + timeRange={timeRange} queueName={fullName} valueFormat={formatWaitMs} series={[ @@ -154,7 +165,7 @@ export default function Page() { query={`SELECT timeBucket() AS t, sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t\nORDER BY t`} fillGaps ids={ids} - period={period} + timeRange={timeRange} queueName={fullName} series={[{ key: "throttled", label: "Throttled", color: COLORS.throttled }]} /> @@ -164,75 +175,17 @@ export default function Page() { ); } -type MetricRow = Record; -type MetricResponse = - | { success: true; data: { rows: MetricRow[] } } - | { success: false; error: string }; - -/** - * Client-fetch a queue-scoped TRQL query from the metric resource route, mirroring the - * dashboard widgets: own loading state, 60s + on-focus refresh, abort on change/unmount. 
- */ function useQueueMetric( query: string, - opts: { ids: Ids; period: string; queueName: string; fillGaps?: boolean } + opts: { ids: Ids; timeRange: TimeRangeParams; queueName: string; fillGaps?: boolean } ) { - const [rows, setRows] = useState(null); - const [isLoading, setIsLoading] = useState(true); - const [failed, setFailed] = useState(false); - const abortRef = useRef(null); - const { ids, period, queueName, fillGaps } = opts; - - const load = useCallback(() => { - abortRef.current?.abort(); - const controller = new AbortController(); - abortRef.current = controller; - setIsLoading(true); - fetch("/resources/metric", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - query, - scope: "environment", - period, - from: null, - to: null, - fillGaps: !!fillGaps, - organizationId: ids.organizationId, - projectId: ids.projectId, - environmentId: ids.environmentId, - queues: [queueName], - }), - signal: controller.signal, - }) - .then((res) => res.json() as Promise) - .then((data) => { - if (controller.signal.aborted) return; - if (data.success) { - setRows(data.data.rows); - setFailed(false); - } else { - setFailed(true); - } - setIsLoading(false); - }) - .catch((error) => { - if (error instanceof DOMException && error.name === "AbortError") return; - if (!controller.signal.aborted) { - setFailed(true); - setIsLoading(false); - } - }); - }, [query, period, queueName, fillGaps, ids.organizationId, ids.projectId, ids.environmentId]); - - useEffect(() => { - load(); - return () => abortRef.current?.abort(); - }, [load]); - - useInterval({ interval: 60_000, onLoad: false, onFocus: true, callback: load }); - - return { rows: rows ?? 
[], showLoading: isLoading && !rows, failed }; + return useMetricResourceQuery(query, { + ...opts.ids, + timeRange: opts.timeRange, + defaultPeriod: QUEUE_METRICS_DEFAULT_PERIOD, + queues: [opts.queueName], + fillGaps: opts.fillGaps, + }); } function toNumber(value: number | string | null | undefined): number { @@ -245,14 +198,14 @@ function clickhouseTimeToMs(value: unknown): number { return Date.parse(s.endsWith("Z") ? s : `${s}Z`); } -type SeriesConfig = { key: string; label: string; color: string; dashed?: boolean }; +type SeriesConfig = { key: string; label: string; color: string }; function QueueDetailChartCard({ title, query, series, ids, - period, + timeRange, queueName, valueFormat, fillGaps, @@ -261,145 +214,63 @@ function QueueDetailChartCard({ query: string; series: SeriesConfig[]; ids: Ids; - period: string; + timeRange: TimeRangeParams; queueName: string; valueFormat?: (value: number) => string; fillGaps?: boolean; }) { - const { rows, showLoading, failed } = useQueueMetric(query, { ids, period, queueName, fillGaps }); + const { rows, showLoading, failed } = useQueueMetric(query, { + ids, + timeRange, + queueName, + fillGaps, + }); - const points = useMemo(() => { + const data = useMemo(() => { return rows .map((r) => { - const point: Record = { ts: clickhouseTimeToMs(r.t) }; + const point: { bucket: number } & Record = { + bucket: clickhouseTimeToMs(r.t), + }; for (const s of series) point[s.key] = toNumber(r[s.key]); return point; }) - .filter((p) => Number.isFinite(p.ts)); + .filter((p) => Number.isFinite(p.bucket)); }, [rows, series]); - const bucketIntervalMs = points.length >= 2 ? points[1].ts - points[0].ts : 0; - const formatX = useMemo(() => { - const sameDay = bucketIntervalMs > 0 && bucketIntervalMs < 6 * 3600_000; - return (value: number) => - new Date(value).toLocaleString("en-US", { - month: sameDay ? undefined : "short", - day: sameDay ? 
undefined : "numeric", - hour: "2-digit", - minute: "2-digit", - hour12: false, - }); - }, [bucketIntervalMs]); - - const hasData = points.some((p) => series.some((s) => p[s.key] > 0)); + const chartConfig = useMemo(() => { + const cfg: ChartConfig = {}; + for (const s of series) cfg[s.key] = { label: s.label, color: s.color }; + return cfg; + }, [series]); - return ( -
- {title} - - {showLoading ? ( -
- ) : failed ? ( -
- Unable to load metrics -
- ) : hasData ? ( -
- - - - - valueFormat(v) : undefined} - domain={[0, (dataMax: number) => Math.max(1, Math.ceil(dataMax * 1.15))]} - /> - - } - allowEscapeViewBox={{ x: true, y: true }} - wrapperStyle={{ zIndex: 1000 }} - animationDuration={0} - /> - {series.map((s) => ( - - ))} - - - -
- ) : ( -
- No activity in this window -
- )} -
+ const { tickFormatter, tooltipLabelFormatter } = useMemo( + () => buildActivityTimeAxis(data), + [data] ); -} -function QueueChartTooltip({ - active, - payload, - label, - series, - formatX, - valueFormat, -}: TooltipProps & { - series: SeriesConfig[]; - formatX: (value: number) => string; - valueFormat?: (value: number) => string; -}) { - if (!active || !payload || payload.length === 0) return null; + const state: ChartState = showLoading ? "loading" : failed ? "invalid" : undefined; + return ( -
-
{formatX(Number(label))}
- {series.map((s) => { - const entry = payload.find((p) => p.dataKey === s.key); - const value = entry?.value; - return ( -
- - {s.label} - - {value === null || value === undefined - ? "–" - : valueFormat - ? valueFormat(Number(value)) - : Number(value).toLocaleString()} - -
- ); - })} +
+ + s.key)} + state={state} + fillContainer + > + valueFormat(v) } : undefined} + tooltipLabelFormatter={tooltipLabelFormatter} + tooltipValueFormatter={valueFormat} + /> + +
); } @@ -407,18 +278,18 @@ function QueueChartTooltip({ function QueueStats({ queue, ids, - period, + timeRange, queueName, }: { queue: { running: number; queued: number }; ids: Ids; - period: string; + timeRange: TimeRangeParams; queueName: string; }) { // One scalar query feeds the CH-derived stats; the "now" counts come from the loader (live). const { rows, showLoading } = useQueueMetric( `SELECT max(max_limit) AS lim, max(max_queued) AS peak_queued, deltaSumTimestampMerge(started_delta) AS started,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS worst_p95\nFROM queue_metrics`, - { ids, period, queueName } + { ids, timeRange, queueName } ); const row = rows[0]; const worstP95 = row ? toNumber(row.worst_p95) : 0; @@ -452,32 +323,6 @@ function QueueStats({ ); } -const PERIODS: QueueMetricsWindow[] = ["1h", "6h", "24h"]; - -function QueuePeriodSelect({ period }: { period: QueueMetricsWindow }) { - const [searchParams] = useSearchParams(); - const hrefFor = (value: QueueMetricsWindow) => { - const next = new URLSearchParams(searchParams); - next.set("period", value); - return `?${next.toString()}`; - }; - return ( -
- Period - {PERIODS.map((value) => ( - - {value} - - ))} -
- ); -} - function Stat({ label, value, @@ -490,7 +335,7 @@ function Stat({ loading?: boolean; }) { return ( -
+
{label}
{loading ? (
diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts index 0850360b301..8f64bb9b2f5 100644 --- a/internal-packages/clickhouse/src/queueMetrics.ts +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -39,6 +39,7 @@ const QueueMetricsListParams = z.object({ environmentId: z.string(), queueNames: z.array(z.string()), startTime: z.string(), + endTime: z.string(), }); const QueueMetricsSummaryRow = z.object({ @@ -65,6 +66,7 @@ export function getQueueListMetricsSummary(reader: ClickhouseReader) { AND environment_id = {environmentId: String} AND queue_name IN {queueNames: Array(String)} AND bucket_start >= {startTime: DateTime} + AND bucket_start < {endTime: DateTime} GROUP BY queue_name`, params: QueueMetricsListParams, schema: QueueMetricsSummaryRow, @@ -95,6 +97,7 @@ export function getQueueDepthSparklines(reader: ClickhouseReader) { AND environment_id = {environmentId: String} AND queue_name IN {queueNames: Array(String)} AND bucket_start >= {startTime: DateTime} + AND bucket_start < {endTime: DateTime} GROUP BY queue_name, bucket ORDER BY bucket`, params: QueueDepthSparklineParams, From fa40e59b3cc0e09c2cdd93e9b27f3fdf22e944b8 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 03:35:05 +0100 Subject: [PATCH 21/37] fix(tsql): register the deltaSumTimestampMerge aggregate Queries using deltaSumTimestampMerge failed with an unknown function error, which broke the queue detail stats and the started counts on the built in Queues dashboard. 
--- internal-packages/tsql/src/query/functions.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal-packages/tsql/src/query/functions.ts b/internal-packages/tsql/src/query/functions.ts index 2f2b9278454..4b2ac0cb553 100644 --- a/internal-packages/tsql/src/query/functions.ts +++ b/internal-packages/tsql/src/query/functions.ts @@ -650,6 +650,12 @@ export const TSQL_AGGREGATIONS: Record = { countMerge: { clickhouseName: "countMerge", minArgs: 1, maxArgs: 1, aggregate: true }, minMerge: { clickhouseName: "minMerge", minArgs: 1, maxArgs: 1, aggregate: true }, maxMerge: { clickhouseName: "maxMerge", minArgs: 1, maxArgs: 1, aggregate: true }, + deltaSumTimestampMerge: { + clickhouseName: "deltaSumTimestampMerge", + minArgs: 1, + maxArgs: 1, + aggregate: true, + }, // Statistical functions simpleLinearRegression: { From 79ca47f41b1a1f51408b74fec01f2dc053e1c572 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 03:35:05 +0100 Subject: [PATCH 22/37] chore(webapp): use shared primitives on the admin queue metrics page --- .../webapp/app/routes/admin.queue-metrics.tsx | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/apps/webapp/app/routes/admin.queue-metrics.tsx b/apps/webapp/app/routes/admin.queue-metrics.tsx index df4bbe8c001..6deaedce66e 100644 --- a/apps/webapp/app/routes/admin.queue-metrics.tsx +++ b/apps/webapp/app/routes/admin.queue-metrics.tsx @@ -6,7 +6,16 @@ import { z } from "zod"; import { Button } from "~/components/primitives/Buttons"; import { Callout } from "~/components/primitives/Callout"; import { Header1, Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; import { Paragraph } from "~/components/primitives/Paragraph"; +import { + Table, + TableBody, + TableCell, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; import { dashboardAction, dashboardLoader } from 
"~/services/routeBuilders/dashboardBuilder"; import { probeQueueMetricsStreams, @@ -110,14 +119,14 @@ export default function AdminQueueMetricsRoute() { Gauge sample rate 0–1 (queue_metrics:gauge_sample_rate); default{" "} {controls.sampleRateDefault} - setSampleRate(e.target.value)} - className="w-32 rounded border border-grid-bright bg-charcoal-750 px-2 py-1 text-text-bright" + className="w-32" />
{error && {error}} @@ -130,7 +139,7 @@ export default function AdminQueueMetricsRoute() {
- Stream health{totalLag > 0 ? ` — lag ${totalLag}` : ""} + Stream health{totalLag > 0 ? ` (lag ${totalLag})` : ""}
From ab0284a8666f535141636dfd879288cc1326a952 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 09:19:09 +0100 Subject: [PATCH 23/37] feat(webapp): house style hero charts on the queues list The queues list header tiles now render the same line chart, grid, and tooltip as the rest of the metrics charts instead of a row sparkline, with the headline value in the tile header. The env saturation tile draws the environment concurrency limit and burst limit as labeled reference lines. Chart tooltips keep a gap between the series label and the value, and the shared line chart gains showDots and referenceLines options. --- .../components/primitives/charts/Chart.tsx | 2 +- .../primitives/charts/ChartLine.tsx | 53 +++++++- .../route.tsx | 126 ++++++++++++++---- 3 files changed, 154 insertions(+), 27 deletions(-) diff --git a/apps/webapp/app/components/primitives/charts/Chart.tsx b/apps/webapp/app/components/primitives/charts/Chart.tsx index 57a2692e677..8894c2da34d 100644 --- a/apps/webapp/app/components/primitives/charts/Chart.tsx +++ b/apps/webapp/app/components/primitives/charts/Chart.tsx @@ -216,7 +216,7 @@ const ChartTooltipContent = React.forwardRef< )}
diff --git a/apps/webapp/app/components/primitives/charts/ChartLine.tsx b/apps/webapp/app/components/primitives/charts/ChartLine.tsx index 1edd5a2357e..5d5fb95ecce 100644 --- a/apps/webapp/app/components/primitives/charts/ChartLine.tsx +++ b/apps/webapp/app/components/primitives/charts/ChartLine.tsx @@ -4,6 +4,7 @@ import { CartesianGrid, Line, LineChart, + ReferenceLine, XAxis, YAxis, type XAxisProps, @@ -48,12 +49,38 @@ export type ChartLineRendererProps = { tooltipLabelFormatter?: (label: string, payload: any[]) => string; /** Optional formatter for numeric tooltip values (e.g. bytes, duration) */ tooltipValueFormatter?: (value: number) => string; + /** Draw a dot at each data point. Defaults to true; turn off for dense/compact charts. */ + showDots?: boolean; + /** Horizontal reference lines (e.g. limits); the y-domain extends to include them. */ + referenceLines?: Array<{ y: number; label?: string; color?: string }>; /** Width injected by ResponsiveContainer */ width?: number; /** Height injected by ResponsiveContainer */ height?: number; }; +/** Reference-line label: right-aligned just below the line (recharts injects viewBox). */ +function ReferenceLineLabel({ + viewBox, + value, +}: { + viewBox?: { x: number; y: number; width: number }; + value: string; +}) { + if (!viewBox) return null; + return ( + + {value} + + ); +} + /** * Line chart renderer for the compound component system. * Must be used within a Chart.Root. 
@@ -73,6 +100,8 @@ export function ChartLineRenderer({ stacked = false, tooltipLabelFormatter, tooltipValueFormatter, + showDots = true, + referenceLines, width, height, }: ChartLineRendererProps) { @@ -176,6 +205,17 @@ export function ChartLineRenderer({ labelFormatter={tooltipLabelFormatter} /> {/* Note: Legend is now rendered by ChartRoot outside the chart container */} + {referenceLines?.map((line) => ( + : undefined} + /> + ))} {visibleSeries.map((key) => ( {/* Note: Legend is now rendered by ChartRoot outside the chart container */} + {referenceLines?.map((line) => ( + : undefined} + /> + ))} {visibleSeries.map((key) => ( diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 59b5072741d..65131fef430 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -11,7 +11,7 @@ import { Form, Link, useNavigation, type MetaFunction } from "@remix-run/react"; import { type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/server-runtime"; import type { QueueItem } from "@trigger.dev/core/v3/schemas"; import type { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { type ReactNode, useEffect, useState } from "react"; +import { type ReactNode, useEffect, useMemo, useState } from "react"; import { typedjson, useTypedLoaderData } from "remix-typedjson"; import { z } from "zod"; import { ConcurrencyIcon } from "~/assets/icons/ConcurrencyIcon"; @@ -72,6 +72,8 @@ import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters import { useSearchParams } from "~/hooks/useSearchParam"; import { parseFiniteInt } from "~/utils/searchParams"; import { UsageSparkline } from 
"~/components/primitives/UsageSparkline"; +import { buildActivityTimeAxis } from "~/components/primitives/charts/activityTimeAxis"; +import { Chart, type ChartConfig } from "~/components/primitives/charts/ChartCompound"; import { useMetricResourceQuery, type MetricResourceTimeRange, @@ -452,7 +454,28 @@ function QueuesWithMetricsView() {
{QUEUE_HEADER_TILES.map((tile) => ( - + 1 + ? [ + { + y: Math.round(environment.burstFactor * 100), + label: `Burst ${Math.round( + environment.concurrencyLimit * environment.burstFactor + )}`, + }, + ] + : []), + ] + : undefined + } + /> ))}
@@ -1140,7 +1163,8 @@ type QueueHeaderTile = { label: string; color: string; query: string; - unitLabel: { singular: string; plural: string }; + /** Formats a single bucket's value in the chart tooltip. */ + formatValue?: (value: number) => string; derive: (rows: MetricTileRow[]) => { sparkline: number[]; total: number; @@ -1167,7 +1191,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Env saturation", color: "#6366F1", query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, - unitLabel: { singular: "%", plural: "%" }, + formatValue: (v) => `${v}%`, derive: (rows) => { const sparkline = rows.map((r) => { const limit = tileNumber(r.env_limit); @@ -1182,7 +1206,6 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Backlog", color: "#A78BFA", query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, - unitLabel: { singular: "queued", plural: "queued" }, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.queued)); const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); @@ -1194,7 +1217,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Scheduling delay p95", color: "#F59E0B", query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, - unitLabel: { singular: "ms", plural: "ms" }, + formatValue: formatWaitMs, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.p95)); const worst = sparkline.reduce((max, v) => Math.max(max, v), 0); @@ -1211,7 +1234,6 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ label: "Throttled", color: "#F59E0B", query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, - unitLabel: { singular: "throttled bucket", plural: "throttled buckets" }, derive: (rows) => { const sparkline = rows.map((r) 
=> tileNumber(r.throttled)); const total = sparkline.reduce((sum, v) => sum + v, 0); @@ -1229,9 +1251,11 @@ type TileTimeRange = MetricResourceTimeRange; function QueueEnvMetricTile({ tile, timeRange, + referenceLines, }: { tile: QueueHeaderTile; timeRange: TileTimeRange; + referenceLines?: Array<{ y: number; label?: string }>; }) { const organization = useOrganization(); const project = useProject(); @@ -1247,37 +1271,89 @@ function QueueEnvMetricTile({ }); const { sparkline, total, formatTotal, totalClassName } = tile.derive(rows); - const bucketStartMs = rows.length > 0 ? tileTimeToMs(rows[0].t) : undefined; - const bucketIntervalMs = - rows.length >= 2 ? tileTimeToMs(rows[1].t) - tileTimeToMs(rows[0].t) : undefined; + + // Same point shape the full-size charts use so the shared axis/tooltip helpers apply. + const data = rows + .map((r, i) => ({ bucket: tileTimeToMs(r.t), [tile.id]: sparkline[i] ?? 0 })) + .filter((p) => Number.isFinite(p.bucket)); + + const chartConfig = useMemo( + () => ({ [tile.id]: { label: tile.label, color: tile.color } }), + [tile.id, tile.label, tile.color] + ); + + const { tooltipLabelFormatter } = useMemo(() => buildActivityTimeAxis(data), [data]); + const hasData = data.length > 0 && sparkline.some((v) => v > 0); return ( - + + ) : failed ? undefined : formatTotal ? ( + formatTotal(total) + ) : ( + total.toLocaleString() + ) + } + valueClassName={totalClassName} + > {showLoading ? ( -
+
) : failed ? ( -
Unable to load metrics
+
+ Unable to load metrics +
+ ) : hasData ? ( +
+ + + +
) : ( - +
No activity
)} ); } -function HeaderTile({ label, children }: { label: ReactNode; children: ReactNode }) { +function HeaderTile({ + label, + value, + valueClassName, + children, +}: { + label: ReactNode; + value?: ReactNode; + valueClassName?: string; + children: ReactNode; +}) { return (
- {label} +
+ {label} + {value !== undefined ? ( + + {value} + + ) : null} +
{children}
); From ec4d0324f8cc7c979590136a10946da4e00a0ae7 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 14:21:08 +0100 Subject: [PATCH 24/37] feat(clickhouse): queue activity ranking queries --- internal-packages/clickhouse/src/index.ts | 4 ++ .../clickhouse/src/queueMetrics.ts | 62 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 52fc3507def..293ad14ad6e 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -36,6 +36,8 @@ import { insertQueueMetricsRaw, getQueueListMetricsSummary, getQueueDepthSparklines, + getQueueRankingPage, + getQueueRankingCount, } from "./queueMetrics.js"; import { getSessionTagsQueryBuilder, @@ -271,6 +273,8 @@ export class ClickHouse { insertRaw: insertQueueMetricsRaw(this.writer), listSummary: getQueueListMetricsSummary(this.reader), depthSparklines: getQueueDepthSparklines(this.reader), + rankingPage: getQueueRankingPage(this.reader), + rankingCount: getQueueRankingCount(this.reader), }; } diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts index 8f64bb9b2f5..3efab11e793 100644 --- a/internal-packages/clickhouse/src/queueMetrics.ts +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -105,4 +105,66 @@ export function getQueueDepthSparklines(reader: ClickhouseReader) { }); } +const QueueRankingParams = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + startTime: z.string(), + /** 1 = rank by peak backlog only; 0 = backlog + running ("busiest"). 
*/ + byQueuedOnly: z.number(), + nameContains: z.string(), + limit: z.number(), + offset: z.number(), +}); + +const QueueRankingRow = z.object({ + queue_name: z.string(), +}); + +const RANKING_WHERE = `organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND bucket_start >= {startTime: DateTime} + AND queue_name != '__overflow__' + AND ({nameContains: String} = '' OR positionCaseInsensitive(queue_name, {nameContains: String}) > 0)`; + +/** Queue names ranked by recent activity, for relevance-ordered list pages. */ +export function getQueueRankingPage(reader: ClickhouseReader) { + return reader.query({ + name: "getQueueRankingPage", + query: `SELECT queue_name + FROM trigger_dev.queue_metrics_v1 + WHERE ${RANKING_WHERE} + GROUP BY queue_name + ORDER BY + if({byQueuedOnly: UInt8} = 1, max(max_queued), max(max_queued) + max(max_running)) DESC, + queue_name ASC + LIMIT {limit: UInt32} OFFSET {offset: UInt32}`, + params: QueueRankingParams, + schema: QueueRankingRow, + }); +} + +const QueueRankingCountParams = QueueRankingParams.omit({ + byQueuedOnly: true, + limit: true, + offset: true, +}); + +const QueueRankingCountRow = z.object({ + ranked: z.coerce.number(), +}); + +/** How many queues have activity in the ranking window (the ranked head of the list). 
*/ +export function getQueueRankingCount(reader: ClickhouseReader) { + return reader.query({ + name: "getQueueRankingCount", + query: `SELECT uniqExact(queue_name) AS ranked + FROM trigger_dev.queue_metrics_v1 + WHERE ${RANKING_WHERE}`, + params: QueueRankingCountParams, + schema: QueueRankingCountRow, + }); +} + // (per-queue detail series is now fetched via TRQL + fillGaps from the metric resource route) From fbaa4be5dcbb21dab6013784dc98c30ae2664102 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 14:21:08 +0100 Subject: [PATCH 25/37] feat(webapp): queue allocation view and relevance-ordered queue list Adds an Allocation tab to the Queues page (behind the queue metrics UI flag): overview cards, a burst-aware capacity bar showing each queue allocation and its live usage in a distinct color, an inline-editable limits table with per-queue locks, load-weighted auto-balance, and a review dialog that bulk-applies limits as overrides through the existing concurrency system. The queue list now defaults to Busiest ordering (with Backlog and Name options). ClickHouse ranks queues by activity over the last 15 minutes and returns just the requested page of names, so the cost per page is one small aggregate regardless of environment size; idle queues follow in name order and any failure falls back to name ordering. The classic page keeps plain name order. 
--- .../v3/QueueAllocationPresenter.server.ts | 94 +++ .../v3/QueueListPresenter.server.ts | 173 ++++- .../AllocationView.tsx | 606 ++++++++++++++++++ .../route.tsx | 145 ++++- 4 files changed, 1013 insertions(+), 5 deletions(-) create mode 100644 apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts create mode 100644 apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx diff --git a/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts new file mode 100644 index 00000000000..c7a8166b6a3 --- /dev/null +++ b/apps/webapp/app/presenters/v3/QueueAllocationPresenter.server.ts @@ -0,0 +1,94 @@ +import { TaskQueueType, type Prisma } from "@trigger.dev/database"; +import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { engine } from "~/v3/runEngine.server"; +import { BasePresenter } from "./basePresenter.server"; + +const MAX_ALLOCATION_QUEUES = 500; + +export type QueueAllocationItem = { + id: string; + name: string; + type: "task" | "custom"; + running: number; + queued: number; + paused: boolean; + /** Explicit per-queue limit; null means the queue floats up to the env limit. */ + limit: number | null; + overridden: boolean; +}; + +export type QueueAllocation = { + queues: QueueAllocationItem[]; + totalQueues: number; + truncated: boolean; + /** Sum of explicit limits, each clamped to the env limit. */ + allocated: number; + unlimitedCount: number; +}; + +/** Every queue in the environment (capped) with live counts, for the allocation view. 
*/ +export class QueueAllocationPresenter extends BasePresenter { + public async call({ + environment, + }: { + environment: AuthenticatedEnvironment; + }): Promise { + const where: Prisma.TaskQueueWhereInput = { + runtimeEnvironmentId: environment.id, + version: "V2", + }; + + const [totalQueues, queues] = await Promise.all([ + this._replica.taskQueue.count({ where }), + this._replica.taskQueue.findMany({ + where, + select: { + friendlyId: true, + name: true, + type: true, + paused: true, + concurrencyLimit: true, + concurrencyLimitOverriddenAt: true, + }, + orderBy: { orderableName: "asc" }, + take: MAX_ALLOCATION_QUEUES, + }), + ]); + + const names = queues.map((q) => q.name); + const [queuedByQueue, runningByQueue] = await Promise.all([ + engine.lengthOfQueues(environment, names), + engine.currentConcurrencyOfQueues(environment, names), + ]); + + const envLimit = environment.maximumConcurrencyLimit; + let allocated = 0; + let unlimitedCount = 0; + + const items: QueueAllocationItem[] = queues.map((queue) => { + if (queue.concurrencyLimit === null) { + unlimitedCount++; + } else { + allocated += Math.min(queue.concurrencyLimit, envLimit); + } + return { + id: queue.friendlyId, + name: queue.name.replace(/^task\//, ""), + type: queue.type === TaskQueueType.VIRTUAL ? ("task" as const) : ("custom" as const), + running: runningByQueue[queue.name] ?? 0, + queued: queuedByQueue[queue.name] ?? 
0, + paused: queue.paused, + limit: queue.concurrencyLimit, + overridden: queue.concurrencyLimitOverriddenAt !== null, + }; + }); + + return { + queues: items, + totalQueues, + truncated: totalQueues > queues.length, + allocated, + unlimitedCount, + }; + } +} diff --git a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts index 024a1342b0a..747eb119568 100644 --- a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts @@ -3,6 +3,8 @@ import type { Prisma } from "@trigger.dev/database"; import { TaskQueueType } from "@trigger.dev/database"; import { type PrismaClientOrTransaction } from "~/db.server"; import { type AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { logger } from "~/services/logger.server"; import { determineEngineVersion } from "~/v3/engineVersion.server"; import { engine } from "~/v3/runEngine.server"; import { BasePresenter } from "./basePresenter.server"; @@ -13,6 +15,12 @@ type QueueListEngine = Pick = { task: TaskQueueType.VIRTUAL, custom: TaskQueueType.NAMED, @@ -30,6 +38,38 @@ const queueListSelect = { paused: true, } satisfies Prisma.TaskQueueSelect; +type QueueListRow = Prisma.TaskQueueGetPayload<{ select: typeof queueListSelect }>; + +type QueueListItem = ReturnType; + +type QueueListPagination = + | { mode: "filtered"; currentPage: number; hasMore: boolean } + | { mode: "unfiltered"; currentPage: number; totalPages: number; count: number }; + +// The `?: undefined` markers keep every key reachable across the union, so consumers +// can destructure before narrowing on `success`. 
+export type QueueListResult = + | { + success: false; + code: string; + totalQueues: number; + hasFilters: boolean; + queues?: undefined; + pagination?: undefined; + } + | { + success: true; + queues: QueueListItem[]; + pagination: QueueListPagination; + totalQueues?: number; + hasFilters: boolean; + code?: undefined; + }; + +function formatClickhouseDateTime(date: Date): string { + return date.toISOString().slice(0, 19).replace("T", " "); +} + function buildQueueListWhere( environmentId: string, query: string | undefined, @@ -70,13 +110,15 @@ export class QueueListPresenter extends BasePresenter { query, page, type, + sort = "name", }: { environment: AuthenticatedEnvironment; query?: string; page: number; perPage?: number; type?: "task" | "custom"; - }) { + sort?: QueueListSort; + }): Promise { const hasFilters = Boolean(query?.trim()) || type !== undefined; const engineVersion = await determineEngineVersion({ environment }); @@ -110,6 +152,18 @@ export class QueueListPresenter extends BasePresenter { }; } + if (sort !== "name") { + // Ranking is additive: any failure or unsupported input falls back to name order. + try { + const ranked = await this.getRankedQueues(environment, query, page, type, sort); + if (ranked) { + return ranked; + } + } catch (error) { + logger.warn("Queue ranking unavailable, falling back to name order", { error }); + } + } + if (hasFilters) { const { queues, hasMore } = await this.getFilteredQueues(environment, query, page, type); @@ -143,6 +197,123 @@ export class QueueListPresenter extends BasePresenter { }; } + /** + * ClickHouse ranks queues by recent activity and returns the requested page of names; + * queues with no recent metrics follow in name order. Null when ranking does not apply. 
+ */ + private async getRankedQueues( + environment: AuthenticatedEnvironment, + query: string | undefined, + page: number, + type: "task" | "custom" | undefined, + sort: Exclude + ) { + if (type !== undefined) { + return null; + } + + const clickhouse = await clickhouseFactory.getClickhouseForOrganization( + environment.organizationId, + "query" + ); + + const rankingArgs = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + startTime: formatClickhouseDateTime( + new Date(Date.now() - QUEUE_RANKING_WINDOW_MINUTES * 60 * 1000) + ), + nameContains: query?.trim() ?? "", + }; + + const [countError, countRows] = await clickhouse.queueMetrics.rankingCount(rankingArgs); + if (countError) { + throw countError; + } + const ranked = countRows?.[0]?.ranked ?? 0; + if (ranked > MAX_RANKED_QUEUES) { + return null; + } + + const where = buildQueueListWhere(environment.id, query, type); + const totalQueues = await this._replica.taskQueue.count({ where }); + const offset = (page - 1) * this.perPage; + + let rankedPageQueues: QueueListRow[] = []; + if (offset < ranked) { + const [pageError, pageRows] = await clickhouse.queueMetrics.rankingPage({ + ...rankingArgs, + byQueuedOnly: sort === "queued" ? 1 : 0, + limit: this.perPage, + offset, + }); + if (pageError) { + throw pageError; + } + const rankedNames = (pageRows ?? []).map((row) => row.queue_name); + rankedPageQueues = await this.findQueuesByNames(where, rankedNames); + } + + // Tail of the page: name-ordered queues that have no recent metrics. Slot math uses the + // ClickHouse counts so pages never overlap, even if some ranked names no longer exist. 
+ const rankedSlots = Math.min(Math.max(ranked - offset, 0), this.perPage); + const tailNeeded = this.perPage - rankedSlots; + let tailQueues: QueueListRow[] = []; + if (tailNeeded > 0) { + let excludedNames: string[] = []; + if (ranked > 0) { + const [allError, allRows] = await clickhouse.queueMetrics.rankingPage({ + ...rankingArgs, + byQueuedOnly: 0, + limit: MAX_RANKED_QUEUES, + offset: 0, + }); + if (allError) { + throw allError; + } + excludedNames = (allRows ?? []).map((row) => row.queue_name); + } + tailQueues = await this._replica.taskQueue.findMany({ + where: { ...where, name: { notIn: excludedNames } }, + select: queueListSelect, + orderBy: { + orderableName: "asc", + }, + skip: Math.max(0, offset - ranked), + take: tailNeeded, + }); + } + + return { + success: true as const, + queues: await this.enrichQueues(environment, [...rankedPageQueues, ...tailQueues]), + pagination: { + mode: "unfiltered" as const, + currentPage: page, + totalPages: Math.max(1, Math.ceil(totalQueues / this.perPage)), + count: totalQueues, + }, + totalQueues, + hasFilters: Boolean(query?.trim()) || type !== undefined, + }; + } + + private async findQueuesByNames( + where: Prisma.TaskQueueWhereInput, + names: string[] + ): Promise { + if (names.length === 0) { + return []; + } + const queues = await this._replica.taskQueue.findMany({ + where: { ...where, name: { in: names } }, + select: queueListSelect, + }); + const byName = new Map(queues.map((queue) => [queue.name, queue])); + return names.flatMap((name) => byName.get(name) ?? 
[]); + } + private async getFilteredQueues( environment: AuthenticatedEnvironment, query: string | undefined, diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx new file mode 100644 index 00000000000..7dcb0b9cab0 --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx @@ -0,0 +1,606 @@ +import { LockClosedIcon, LockOpenIcon, ScaleIcon } from "@heroicons/react/20/solid"; +import { Form, useNavigation } from "@remix-run/react"; +import { type ReactNode, useEffect, useMemo, useState } from "react"; +import { BigNumber } from "~/components/metrics/BigNumber"; +import { Badge } from "~/components/primitives/Badge"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "~/components/primitives/Dialog"; +import { Input } from "~/components/primitives/Input"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { + Table, + TableBody, + TableCell, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; +import { SimpleTooltip } from "~/components/primitives/Tooltip"; +import { getSeriesColor } from "~/components/code/chartColors"; +import { QueueName } from "~/components/runs/v3/QueueName"; +import { type Environment } from "~/presenters/v3/EnvironmentQueuePresenter.server"; +import { + type QueueAllocation, + type QueueAllocationItem, +} from "~/presenters/v3/QueueAllocationPresenter.server"; +import { cn } from "~/utils/cn"; + +type Drafts = Record; + +/** + * Distribute the env budget across unlocked queues, weighted by current load + * (running + queued), largest-remainder rounding, min 1 when the budget allows. 
+ * Locked queues keep their current value and are subtracted from the budget first. + */ +export function computeAutoBalance( + queues: QueueAllocationItem[], + envLimit: number, + locked: Set, + draftLimit: (queue: QueueAllocationItem) => number | null +): Drafts { + const unlocked = queues.filter((q) => !locked.has(q.id)); + if (unlocked.length === 0) return {}; + + const lockedSum = queues + .filter((q) => locked.has(q.id)) + .reduce((sum, q) => sum + (draftLimit(q) ?? 0), 0); + const budget = Math.max(0, envLimit - lockedSum); + + const weights = unlocked.map((q) => q.running + q.queued); + const totalWeight = weights.reduce((a, b) => a + b, 0); + const raw = unlocked.map((_, i) => + totalWeight > 0 ? (budget * weights[i]) / totalWeight : budget / unlocked.length + ); + + const shares = raw.map(Math.floor); + let remainder = budget - shares.reduce((a, b) => a + b, 0); + const byFraction = raw + .map((value, i) => ({ i, fraction: value - Math.floor(value) })) + .sort((a, b) => b.fraction - a.fraction); + for (const { i } of byFraction) { + if (remainder <= 0) break; + shares[i]++; + remainder--; + } + + // Every unlocked queue gets at least 1 when the budget can afford it. 
+ if (budget >= unlocked.length) { + for (let i = 0; i < shares.length; i++) { + while (shares[i] < 1) { + let donor = -1; + for (let j = 0; j < shares.length; j++) { + if (j !== i && shares[j] > 1 && (donor === -1 || shares[j] > shares[donor])) donor = j; + } + if (donor === -1) break; + shares[donor]--; + shares[i]++; + } + } + } + + const result: Drafts = {}; + unlocked.forEach((q, i) => { + result[q.id] = Math.min(Math.max(shares[i], 0), envLimit); + }); + return result; +} + +export function AllocationView({ + allocation, + environment, +}: { + allocation: QueueAllocation; + environment: Environment; +}) { + const [drafts, setDrafts] = useState({}); + const [locked, setLocked] = useState>(new Set()); + const [reviewOpen, setReviewOpen] = useState(false); + const navigation = useNavigation(); + const isSubmitting = navigation.state !== "idle"; + + const envLimit = environment.concurrencyLimit; + const burstLimit = Math.round(envLimit * environment.burstFactor); + + useEffect(() => { + if (navigation.state === "loading" || navigation.state === "idle") { + setReviewOpen(false); + } + }, [navigation.state]); + + // After an apply revalidates the loader, drop drafts that now match the saved limits. + useEffect(() => { + setDrafts((prev) => { + const next = { ...prev }; + for (const queue of allocation.queues) { + if (next[queue.id] !== undefined && next[queue.id] === queue.limit) { + delete next[queue.id]; + } + } + return next; + }); + }, [allocation]); + + const draftLimit = (queue: QueueAllocationItem): number | null => drafts[queue.id] ?? queue.limit; + + const draftAllocated = allocation.queues.reduce((sum, queue) => { + const limit = draftLimit(queue); + return limit === null ? 
sum : sum + Math.min(limit, envLimit); + }, 0); + + const changes = allocation.queues.filter( + (queue) => drafts[queue.id] !== undefined && drafts[queue.id] !== queue.limit + ); + + const unlimitedCount = allocation.queues.filter((queue) => draftLimit(queue) === null).length; + const allocationPct = envLimit > 0 ? Math.round((draftAllocated / envLimit) * 100) : 0; + const overAllocated = draftAllocated > envLimit; + + const setDraft = (queue: QueueAllocationItem, value: string) => { + setDrafts((prev) => { + const next = { ...prev }; + if (value.trim() === "") { + delete next[queue.id]; + return next; + } + const parsed = parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed < 0) return prev; + if (parsed === queue.limit) { + delete next[queue.id]; + } else { + next[queue.id] = parsed; + } + return next; + }); + }; + + const toggleLock = (id: string) => { + setLocked((prev) => { + const next = new Set(prev); + if (next.has(id)) next.delete(id); + else next.add(id); + return next; + }); + }; + + const autoBalance = () => { + const balanced = computeAutoBalance(allocation.queues, envLimit, locked, draftLimit); + setDrafts((prev) => { + const next = { ...prev }; + for (const queue of allocation.queues) { + const value = balanced[queue.id]; + if (value === undefined) continue; + if (value === queue.limit) delete next[queue.id]; + else next[queue.id] = value; + } + return next; + }); + }; + + const changesPayload = useMemo( + () => + JSON.stringify(changes.map((queue) => ({ friendlyId: queue.id, limit: drafts[queue.id] }))), + [changes, drafts] + ); + + const colorByQueue = useMemo(() => { + const map = new Map(); + allocation.queues.forEach((queue, i) => map.set(queue.id, getSeriesColor(i))); + return map; + }, [allocation.queues]); + const colorFor = (id: string) => colorByQueue.get(id) ?? "#878C99"; + + // Busiest first: the queues you'd rebalance are the ones under load. Colors stay + // keyed to the loader order so they don't shift as counts change. 
+ const tableQueues = useMemo( + () => [...allocation.queues].sort((a, b) => b.running + b.queued - (a.running + a.queued)), + [allocation.queues] + ); + + return ( +
+
+ 1 ? `bursts up to ${burstLimit}` : undefined} + suffixClassName="text-text-dimmed" + /> + + 0 + ? `${unlimitedCount} without a limit (can use up to ${envLimit})` + : "all have limits" + } + suffixClassName="text-text-dimmed" + /> +
+ + + + {overAllocated && ( + + The queue limits add up to more than the environment limit, so queues will compete for + concurrency when the environment saturates. Reduce limits (or use Auto-balance) to + guarantee each queue its allocation. + + )} + + {allocation.truncated && ( + + Showing the first {allocation.queues.length} of {allocation.totalQueues} queues. + Allocation totals only include the queues shown. + + )} + +
+ + +
+ + + + + + Apply queue limits +
+ + + + Queue + Current + New + + + + {changes.map((queue) => ( + + + + + {queue.limit ?? "–"} + {drafts[queue.id]} + + ))} + +
+
+ + Limits apply immediately and are set as overrides, so they survive deploys until + removed. + +
+ + + +
+
+
+
+ + + + + Name + Running + Queued + + Limit + + + Lock + + + + + {tableQueues.map((queue) => { + const isLocked = locked.has(queue.id); + const changed = drafts[queue.id] !== undefined && drafts[queue.id] !== queue.limit; + return ( + + + + + + {queue.paused && ( + + Paused + + )} + {queue.overridden && ( + + Override + + )} + + + {queue.running} + {queue.queued} + + + {changed && ( + + {queue.limit ?? "–"} → {drafts[queue.id]} + + )} + setDraft(queue, e.target.value)} + disabled={isLocked || isSubmitting} + className="w-24" + variant="small" + /> + + + + toggleLock(queue.id)} + aria-label={isLocked ? "Unlock queue" : "Lock queue"} + /> + } + content={ + isLocked + ? "Locked: auto-balance keeps this queue's limit" + : "Unlocked: auto-balance can change this queue's limit" + } + /> + + + ); + })} + +
+
+ ); +} + +const MAX_BAR_SEGMENTS = 24; + +function AllocationBar({ + queues, + draftLimit, + envLimit, + burstLimit, + draftAllocated, + colorFor, +}: { + queues: QueueAllocationItem[]; + draftLimit: (queue: QueueAllocationItem) => number | null; + envLimit: number; + burstLimit: number; + draftAllocated: number; + colorFor: (id: string) => string; +}) { + const limited = queues + .map((queue) => ({ queue, limit: draftLimit(queue) })) + .filter( + (entry): entry is { queue: QueueAllocationItem; limit: number } => + typeof entry.limit === "number" && entry.limit > 0 + ) + .sort((a, b) => b.limit - a.limit); + + const top = limited.slice(0, MAX_BAR_SEGMENTS); + const rest = limited.slice(MAX_BAR_SEGMENTS); + const restTotal = rest.reduce((sum, entry) => sum + entry.limit, 0); + const restRunning = rest.reduce( + (sum, entry) => sum + Math.min(entry.queue.running, entry.limit), + 0 + ); + + const hasBurst = burstLimit > envLimit; + // The axis runs to the burst ceiling: allocations are guaranteed up to the env + // limit, and everything between the limit and burst is shared overflow headroom. + const scale = Math.max(draftAllocated, envLimit, burstLimit); + if (scale === 0) return null; + + const free = Math.max(0, envLimit - draftAllocated); + const limitMarkerPct = (envLimit / scale) * 100; + const burstZoneWidthPct = ((Math.min(burstLimit, scale) - envLimit) / scale) * 100; + + return ( +
+
+
+ {hasBurst && ( + + } + content={`Shared burst headroom: beyond the environment limit, queues can burst up to ${burstLimit} combined`} + disableHoverableContent + /> + )} +
+ {top.map((entry) => ( + + } + /> + ))} + {restTotal > 0 && ( + + )} +
+
+
+
+
+ + {draftAllocated} allocated + {free > 0 ? ` · ${free} unallocated` : ""} + + {hasBurst ? ( + <> + + Environment limit {envLimit} + + Burst {burstLimit} + + ) : ( + Environment limit {envLimit} + )} +
+
+ ); +} + +function QueueSegmentTooltip({ + queue, + limit, + envLimit, + color, +}: { + queue: QueueAllocationItem; + limit: number; + envLimit: number; + color: string; +}) { + const utilizationPct = limit > 0 ? Math.round((queue.running / limit) * 100) : 0; + const sharePct = envLimit > 0 ? Math.round((limit / envLimit) * 100) : 0; + return ( +
+ + + + {queue.paused && ( + + Paused + + )} + +
+ Running + + {queue.running} of {limit} ({utilizationPct}%) + + Queued + {queue.queued} + Allocation + + {sharePct}% of the environment limit + +
+
+ ); +} + +/** One queue's slice of the capacity bar: dim fill = allocation, solid fill = current usage. */ +function BarSegment({ + color, + widthPct, + usagePct, + tooltip, +}: { + color: string; + widthPct: number; + usagePct: number; + tooltip: ReactNode; +}) { + return ( + + {usagePct > 0 && ( +
+ )} +
+ } + content={tooltip} + disableHoverableContent + /> + ); +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 65131fef430..66b6c9962d4 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -68,6 +68,9 @@ import { QueueMetricsPresenter, type QueueListMetric, } from "~/presenters/v3/QueueMetricsPresenter.server"; +import * as Ariakit from "@ariakit/react"; +import { AppliedFilter } from "~/components/primitives/AppliedFilter"; +import { SelectItem, SelectPopover, SelectProvider } from "~/components/primitives/Select"; import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; import { useSearchParams } from "~/hooks/useSearchParam"; import { parseFiniteInt } from "~/utils/searchParams"; @@ -96,6 +99,9 @@ import { PauseQueueService } from "~/v3/services/pauseQueue.server"; import { useCurrentPlan } from "../_app.orgs.$organizationSlug/route"; import { BigNumber } from "~/components/metrics/BigNumber"; import { canAccessQueueMetricsUi } from "~/v3/canAccessQueueMetricsUi.server"; +import { QueueAllocationPresenter } from "~/presenters/v3/QueueAllocationPresenter.server"; +import { TabButton, TabContainer } from "~/components/primitives/Tabs"; +import { AllocationView } from "./AllocationView"; const SearchParamsSchema = z.object({ query: z.string().optional(), @@ -103,8 +109,15 @@ const SearchParamsSchema = z.object({ period: z.string().optional(), from: z.string().optional(), to: z.string().optional(), + view: z.string().optional(), + sort: z.enum(["busiest", "queued", "name"]).optional(), }); +const AllocationChangesSchema = z + .array(z.object({ friendlyId: z.string(), limit: 
z.number().int().min(0) })) + .min(1) + .max(200); + const QUEUE_METRICS_DEFAULT_PERIOD = "1d"; export const meta: MetaFunction = () => { @@ -120,7 +133,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const { organizationSlug, projectParam, envParam } = EnvironmentParamSchema.parse(params); const url = new URL(request.url); - const { page, query, period, from, to } = SearchParamsSchema.parse( + const { page, query, period, from, to, view, sort } = SearchParamsSchema.parse( Object.fromEntries(url.searchParams) ); @@ -150,6 +163,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { environment, query, page, + // Relevance ordering rides the metrics pipeline, so it is part of the gated UI. + sort: queueMetricsUiEnabled ? (sort ?? "busiest") : "name", }); const environmentQueuePresenter = new EnvironmentQueuePresenter(); @@ -165,7 +180,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { byQueue: Record; } | null = null; - if (queueMetricsUiEnabled && queues.success) { + const allocationView = queueMetricsUiEnabled && view === "allocation"; + + if (queueMetricsUiEnabled && queues.success && !allocationView) { // Metrics are additive observability; a ClickHouse hiccup must not take down queue // management. Fail open to metrics: null instead of bubbling to the page-level 400. try { @@ -200,11 +217,17 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { } } + const allocation = + allocationView && queues.success + ? 
await new QueueAllocationPresenter().call({ environment }) + : null; + return typedjson({ ...queues, environment: await environmentQueuePresenter.call(environment), autoReloadPollIntervalMs, metrics, + allocation, queueMetricsUiEnabled, }); } catch (error) { @@ -368,6 +391,48 @@ export const action = async ({ request, params }: ActionFunctionArgs) => { return redirectWithSuccessMessage(redirectPath, request, "Queue concurrency limit reset"); } + case "allocation-apply": { + if (!(await canAccessQueueMetricsUi({ userId, organizationSlug }))) { + return redirectWithErrorMessage(redirectPath, request, "Not available"); + } + + let changes; + try { + changes = AllocationChangesSchema.parse(JSON.parse(String(formData.get("changes")))); + } catch { + return redirectWithErrorMessage(redirectPath, request, "Invalid changes"); + } + + const user = await getUserById(userId); + if (!user) { + return redirectWithErrorMessage(redirectPath, request, "User not found"); + } + + let failed = 0; + for (const change of changes) { + const result = await concurrencySystem.queues.overrideQueueConcurrencyLimit( + environment, + change.friendlyId, + change.limit, + user + ); + if (!result.isOk()) failed++; + } + + if (failed > 0) { + return redirectWithErrorMessage( + redirectPath, + request, + `Failed to update ${failed} of ${changes.length} queue limits` + ); + } + + return redirectWithSuccessMessage( + redirectPath, + request, + `Updated ${changes.length} queue limit${changes.length === 1 ? "" : "s"}` + ); + } default: return redirectWithErrorMessage(redirectPath, request, "Something went wrong"); } @@ -391,6 +456,7 @@ function QueuesWithMetricsView() { hasFilters, autoReloadPollIntervalMs, metrics, + allocation, } = useTypedLoaderData(); const metricsByQueue = metrics?.byQueue ?? 
{}; @@ -402,12 +468,13 @@ function QueuesWithMetricsView() { const maxPeriodDays = plan?.v3Subscription?.plan?.limits?.queryPeriodDays?.number; // The header tiles fetch client-side with the same period/from/to the TimeFilter writes. - const { value } = useSearchParams(); + const { value, replace } = useSearchParams(); const timeRange = { period: value("period") ?? null, from: value("from") ?? null, to: value("to") ?? null, }; + const view = value("view") === "allocation" ? ("allocation" as const) : ("queues" as const); useAutoRevalidate({ interval: autoReloadPollIntervalMs, onFocus: true }); @@ -451,7 +518,7 @@ function QueuesWithMetricsView() { -
+
{QUEUE_HEADER_TILES.map((tile) => ( {success ? ( + + replace({ view: undefined })} + > + Queues + + replace({ view: "allocation", page: undefined })} + > + Allocation + + + ) : ( +
+ )} + + {success && view === "allocation" ? ( + allocation ? ( + + ) : ( +
+ +
+ ) + ) : success ? (
+ ; } +const QUEUE_SORT_OPTIONS = [ + { value: "busiest", label: "Busiest" }, + { value: "queued", label: "Backlog" }, + { value: "name", label: "Name" }, +] as const; + +type QueueSortValue = (typeof QUEUE_SORT_OPTIONS)[number]["value"]; + +function QueueSortFilter() { + const { value, replace } = useSearchParams(); + const sort: QueueSortValue = (value("sort") as QueueSortValue) ?? "busiest"; + const label = QUEUE_SORT_OPTIONS.find((option) => option.value === sort)?.label ?? "Busiest"; + + return ( + + replace({ sort: next === "busiest" ? undefined : (next as string), page: undefined }) + } + > + }> + + + + {QUEUE_SORT_OPTIONS.map((option) => ( + + {option.label} + + ))} + + + ); +} + type MetricTileRow = Record; type QueueHeaderTile = { From 3b4722cb44ea5891f36c53325165093f67f05102 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 20:54:48 +0100 Subject: [PATCH 26/37] fix(tsql): inject time fallbacks into FROM subqueries The fallback WHERE injection only targeted the top-level SELECT, so a query shaped as an outer aggregation over a FROM subquery failed to compile: the time column only exists inside the subquery. Descend into the subquery so the fallback lands next to the table reference. 
--- internal-packages/tsql/src/index.test.ts | 20 ++++++++++++++++++++ internal-packages/tsql/src/index.ts | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/internal-packages/tsql/src/index.test.ts b/internal-packages/tsql/src/index.test.ts index f9aca2f236d..ce358e6ac08 100644 --- a/internal-packages/tsql/src/index.test.ts +++ b/internal-packages/tsql/src/index.test.ts @@ -231,6 +231,26 @@ describe("injectFallbackConditions", () => { expect(modified.where.expression_type).toBe("and"); } }); + + it("should inject into a FROM subquery, where the fallback column's table lives", () => { + const ast = parseTSQLSelect( + "SELECT t, sum(total) AS total FROM (SELECT time AS t, status, count(*) AS total FROM task_runs GROUP BY t, status) GROUP BY t" + ); + const fallbacks: Record = { + time: { op: "gte", value: "2024-01-01" }, + }; + + const modified = injectFallbackConditions(ast, fallbacks); + expect(modified.expression_type).toBe("select_query"); + if (modified.expression_type === "select_query") { + expect(modified.where).toBeUndefined(); + const inner = modified.select_from?.table; + expect(inner?.expression_type).toBe("select_query"); + if (inner?.expression_type === "select_query") { + expect(isColumnReferencedInExpression(inner.where, "time")).toBe(true); + } + } + }); }); describe("compileTSQL with whereClauseFallback", () => { diff --git a/internal-packages/tsql/src/index.ts b/internal-packages/tsql/src/index.ts index 6941dde6079..1ebd1a60a5d 100644 --- a/internal-packages/tsql/src/index.ts +++ b/internal-packages/tsql/src/index.ts @@ -429,6 +429,24 @@ export function injectFallbackConditions( // Handle SelectQuery const selectQuery = ast as SelectQuery; + + // When the FROM is a subquery, the fallback columns belong to the inner query's + // table, not this level; descend so e.g. a time fallback lands next to the table ref. 
+ const fromTable = selectQuery.select_from?.table; + if ( + fromTable && + (fromTable.expression_type === "select_query" || + fromTable.expression_type === "select_set_query") + ) { + return { + ...selectQuery, + select_from: { + ...selectQuery.select_from!, + table: injectFallbackConditions(fromTable, fallbacks) as SelectQuery | SelectSetQuery, + }, + }; + } + const existingWhere = selectQuery.where; // Collect fallback expressions for columns not already in WHERE From d60696c2ea89aca2014e667566952f8c31ab6c93 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 20:55:00 +0100 Subject: [PATCH 27/37] feat(clickhouse,tsql): queue metrics rollups and single-scan ranking Adds two rollups fed from the raw landing table: a per-queue 5-minute tier and an environment-level 1-minute tier (gauges plus TDigest wait quantiles). Ranking now reads the 5m tier and returns the page and the ranked total in one windowed query instead of two scans. The 5m materialized view reads raw rather than cascading off the 10s table: deltaSumTimestamp states hold a single first/last segment, so merging states in an MV's hash-ordered GROUP BY double-counts bridging spans. For the same reason the env tier carries no counter columns, and env-wide counter totals must group by queue before summing. 
--- .../schema/035_create_queue_metrics_v1.sql | 106 ++++++++++++++++ internal-packages/clickhouse/src/index.ts | 6 +- .../clickhouse/src/queueMetrics.test.ts | 118 ++++++++++++++++++ .../clickhouse/src/queueMetrics.ts | 67 ++++++++-- internal-packages/tsql/src/query/functions.ts | 7 ++ .../tsql/src/query/printer.test.ts | 65 ++++++++++ internal-packages/tsql/src/query/schema.ts | 10 ++ 7 files changed, 364 insertions(+), 15 deletions(-) diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql index ebbe70391b6..8053de2abf4 100644 --- a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -92,7 +92,113 @@ SELECT FROM trigger_dev.queue_metrics_raw_v1 GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; +-- (4) Env-level 1m rollup (no queue dimension) for header tiles/saturation charts. +-- No counter deltas on purpose: cross-queue deltaSumTimestamp state merges mix unrelated +-- odometers (env totals must GROUP BY queue then sum). TDigest because an env-level +-- reservoir absorbs every sample in the environment. 
+CREATE TABLE IF NOT EXISTS trigger_dev.env_metrics_1m_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + throttled_count SimpleAggregateFunction(sum, UInt64), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantilesTDigest(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1; + +-- (5) MV: raw -> env rollup. +CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.env_metrics_1m_mv_v1 +TO trigger_dev.env_metrics_1m_v1 AS +SELECT + organization_id, project_id, environment_id, + toStartOfInterval(event_time, INTERVAL 1 MINUTE) AS bucket_start, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + sum(throttled) AS throttled_count, + sumIf(wait_ms, op = 'started') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count, + quantilesTDigestStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0) AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, bucket_start; + +-- (6) Per-queue 5m rollup, exact column mirror of queue_metrics_v1, for ranking and +-- env-wide GROUP BY queue reads at long ranges. 
+CREATE TABLE IF NOT EXISTS trigger_dev.queue_metrics_5m_v1 +( + organization_id LowCardinality(String), + project_id LowCardinality(String), + environment_id String CODEC(ZSTD(1)), + queue_name String CODEC(ZSTD(1)), + bucket_start DateTime CODEC(Delta(4), ZSTD(1)), + + enqueue_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + started_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + ack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + nack_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + dlq_delta AggregateFunction(deltaSumTimestamp, UInt64, UInt64), + throttled_count SimpleAggregateFunction(sum, UInt64), + + max_queued SimpleAggregateFunction(max, UInt32), + max_running SimpleAggregateFunction(max, UInt32), + max_limit SimpleAggregateFunction(max, UInt32), + max_env_queued SimpleAggregateFunction(max, UInt32), + max_env_running SimpleAggregateFunction(max, UInt32), + max_env_limit SimpleAggregateFunction(max, UInt32), + + wait_ms_sum SimpleAggregateFunction(sum, UInt64), + wait_ms_count SimpleAggregateFunction(sum, UInt64), + wait_quantiles AggregateFunction(quantiles(0.5, 0.9, 0.95, 0.99), UInt32) +) +ENGINE = AggregatingMergeTree() +PARTITION BY toDate(bucket_start) +ORDER BY (organization_id, project_id, environment_id, queue_name, bucket_start) +TTL bucket_start + INTERVAL 30 DAY +SETTINGS ttl_only_drop_parts = 1; + +-- (7) MV: raw -> 5m rollup. MUST read raw, never cascade off queue_metrics_v1 with +-- -MergeState: MV GROUP BY merges states in hash order, and out-of-time-order +-- deltaSumTimestamp merges double-count bridging spans (verified 3x inflation). 
+CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.queue_metrics_5m_mv_v1 +TO trigger_dev.queue_metrics_5m_v1 AS +SELECT + organization_id, project_id, environment_id, queue_name, + toStartOfInterval(event_time, INTERVAL 5 MINUTE) AS bucket_start, + deltaSumTimestampStateIf(cumulative, order_key, op = 'enqueue') AS enqueue_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'started') AS started_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'ack') AS ack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'nack') AS nack_delta, + deltaSumTimestampStateIf(cumulative, order_key, op = 'dlq') AS dlq_delta, + sum(throttled) AS throttled_count, + max(queued) AS max_queued, + max(running) AS max_running, + max(queue_limit) AS max_limit, + max(env_queued) AS max_env_queued, + max(env_running) AS max_env_running, + max(env_limit) AS max_env_limit, + sumIf(wait_ms, op = 'started') AS wait_ms_sum, + countIf(op = 'started' AND wait_ms > 0) AS wait_ms_count, + quantilesStateIf(0.5, 0.9, 0.95, 0.99)(wait_ms, op = 'started' AND wait_ms > 0) AS wait_quantiles +FROM trigger_dev.queue_metrics_raw_v1 +GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; + -- +goose Down +DROP VIEW IF EXISTS trigger_dev.queue_metrics_5m_mv_v1; +DROP TABLE IF EXISTS trigger_dev.queue_metrics_5m_v1; +DROP VIEW IF EXISTS trigger_dev.env_metrics_1m_mv_v1; +DROP TABLE IF EXISTS trigger_dev.env_metrics_1m_v1; DROP VIEW IF EXISTS trigger_dev.queue_metrics_mv_v1; DROP TABLE IF EXISTS trigger_dev.queue_metrics_v1; DROP TABLE IF EXISTS trigger_dev.queue_metrics_raw_v1; diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 293ad14ad6e..97c2209b1cb 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -36,7 +36,8 @@ import { insertQueueMetricsRaw, getQueueListMetricsSummary, getQueueDepthSparklines, - getQueueRankingPage, + getQueueRanking, + 
getQueueRankingNames, getQueueRankingCount, } from "./queueMetrics.js"; import { @@ -273,7 +274,8 @@ export class ClickHouse { insertRaw: insertQueueMetricsRaw(this.writer), listSummary: getQueueListMetricsSummary(this.reader), depthSparklines: getQueueDepthSparklines(this.reader), - rankingPage: getQueueRankingPage(this.reader), + ranking: getQueueRanking(this.reader), + rankingNames: getQueueRankingNames(this.reader), rankingCount: getQueueRankingCount(this.reader), }; } diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts index 68d30198bc8..968cd7308a0 100644 --- a/internal-packages/clickhouse/src/queueMetrics.test.ts +++ b/internal-packages/clickhouse/src/queueMetrics.test.ts @@ -213,4 +213,122 @@ describe("queue_metrics_v1", () => { await ch.close(); } ); + + clickhouseTest( + "5m and env rollups agree with the 10s tier, and cross-queue totals sum per queue", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + + // Own org so the env-level read (no queue filter) stays isolated from other tests. 
+ const rollOrg = "org_qm_roll"; + const rows: QueueMetricsRawV1Input[] = [ + ...counter("started", "roll-a", 7, [100, 150, 200, 250, 300, 350, 400]), + ...counter("started", "roll-b", 3, [500, 600, 700]), + { ...base("gauge", "roll-a"), running: 4, queued: 9, env_running: 30, env_limit: 50 }, + { ...base("gauge", "roll-b"), running: 2, queued: 1, env_running: 45, env_limit: 50 }, + ].map((row) => ({ ...row, organization_id: rollOrg })); + const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC); + expect(insertError).toBeNull(); + + const perQueue = (table: string) => + ch.reader.query({ + name: "per-queue-both-tiers", + query: `SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM ${table} + WHERE queue_name IN ('roll-a', 'roll-b') + GROUP BY queue_name ORDER BY queue_name`, + schema: z.object({ queue_name: z.string(), started: z.coerce.number() }), + })({}); + const [e10, rows10] = await perQueue("trigger_dev.queue_metrics_v1"); + const [e5m, rows5m] = await perQueue("trigger_dev.queue_metrics_5m_v1"); + expect(e10).toBeNull(); + expect(e5m).toBeNull(); + expect(rows10).toEqual([ + { queue_name: "roll-a", started: 7 }, + { queue_name: "roll-b", started: 3 }, + ]); + expect(rows5m).toEqual(rows10); + + // Env-wide totals: sum of per-queue merges (a single merge across queues would mix + // odometers and double-count). 
+ const [envTotalError, envTotal] = await ch.reader.query({ + name: "env-total-per-queue-sum", + query: `SELECT sum(started) AS started FROM ( + SELECT queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM trigger_dev.queue_metrics_5m_v1 + WHERE queue_name IN ('roll-a', 'roll-b') + GROUP BY queue_name + )`, + schema: z.object({ started: z.coerce.number() }), + })({}); + expect(envTotalError).toBeNull(); + expect(envTotal![0]!.started).toBe(10); + + const [envError, envRows] = await ch.reader.query({ + name: "env-rollup-read", + query: `SELECT + max(max_env_running) AS max_env_running, + max(max_env_limit) AS max_env_limit, + round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS wait_p99 + FROM trigger_dev.env_metrics_1m_v1 + WHERE organization_id = {org: String}`, + schema: z.object({ + max_env_running: z.coerce.number(), + max_env_limit: z.coerce.number(), + wait_p99: z.coerce.number(), + }), + params: z.object({ org: z.string() }), + })({ org: rollOrg }); + expect(envError).toBeNull(); + expect(envRows![0]!.max_env_running).toBe(45); + expect(envRows![0]!.max_env_limit).toBe(50); + expect(envRows![0]!.wait_p99).toBeGreaterThanOrEqual(600); + expect(envRows![0]!.wait_p99).toBeLessThanOrEqual(1000); + + await ch.close(); + } + ); + + clickhouseTest( + "merged ranking returns the page and the windowed total in one query", + async ({ clickhouseContainer }) => { + const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); + + const gauge = (queue: string, queued: number, running: number): QueueMetricsRawV1Input => ({ + ...base("gauge", queue), + queued, + running, + }); + const [insertError] = await ch.queueMetrics.insertRaw( + [gauge("rank-low", 1, 0), gauge("rank-high", 50, 3), gauge("rank-mid", 10, 2)], + SYNC + ); + expect(insertError).toBeNull(); + + const args = { + organizationId: ORG, + projectId: PROJECT, + environmentId: ENV, + startTime: "2026-06-30 11:50:00", + nameContains: "rank-", + 
byQueuedOnly: 0, + }; + const [pageError, page] = await ch.queueMetrics.ranking({ ...args, limit: 2, offset: 0 }); + expect(pageError).toBeNull(); + expect(page).toEqual([ + { queue_name: "rank-high", ranked_total: 3 }, + { queue_name: "rank-mid", ranked_total: 3 }, + ]); + + const [countError, count] = await ch.queueMetrics.rankingCount(args); + expect(countError).toBeNull(); + expect(count![0]!.ranked).toBe(3); + + const [namesError, names] = await ch.queueMetrics.rankingNames({ ...args, limit: 10 }); + expect(namesError).toBeNull(); + expect(names!.map((r) => r.queue_name)).toEqual(["rank-high", "rank-mid", "rank-low"]); + + await ch.close(); + } + ); }); diff --git a/internal-packages/clickhouse/src/queueMetrics.ts b/internal-packages/clickhouse/src/queueMetrics.ts index 3efab11e793..b982df1c941 100644 --- a/internal-packages/clickhouse/src/queueMetrics.ts +++ b/internal-packages/clickhouse/src/queueMetrics.ts @@ -50,6 +50,12 @@ const QueueMetricsSummaryRow = z.object({ started_count: z.coerce.number(), }); +// Callers align window bounds to the bucket grid so repeated loads share cache entries. +const QUEUE_METRICS_CACHE_SETTINGS = { + use_query_cache: 1, + query_cache_ttl: 30, +} as const; + /** Per-queue rollups over a window, for a fixed set of queues (the visible page). 
*/ export function getQueueListMetricsSummary(reader: ClickhouseReader) { return reader.query({ @@ -70,6 +76,7 @@ export function getQueueListMetricsSummary(reader: ClickhouseReader) { GROUP BY queue_name`, params: QueueMetricsListParams, schema: QueueMetricsSummaryRow, + settings: QUEUE_METRICS_CACHE_SETTINGS, }); } @@ -102,6 +109,7 @@ export function getQueueDepthSparklines(reader: ClickhouseReader) { ORDER BY bucket`, params: QueueDepthSparklineParams, schema: QueueDepthSparklineRow, + settings: QUEUE_METRICS_CACHE_SETTINGS, }); } @@ -119,8 +127,11 @@ const QueueRankingParams = z.object({ const QueueRankingRow = z.object({ queue_name: z.string(), + ranked_total: z.coerce.number(), }); +// Ranking reads the 5m rollup: a 15-minute window there costs ~30x fewer rows than the +// 10s table. const RANKING_WHERE = `organization_id = {organizationId: String} AND project_id = {projectId: String} AND environment_id = {environmentId: String} @@ -128,20 +139,49 @@ const RANKING_WHERE = `organization_id = {organizationId: String} AND queue_name != '__overflow__' AND ({nameContains: String} = '' OR positionCaseInsensitive(queue_name, {nameContains: String}) > 0)`; -/** Queue names ranked by recent activity, for relevance-ordered list pages. */ -export function getQueueRankingPage(reader: ClickhouseReader) { +/** + * One page of queue names ranked by recent activity, with the total ranked count on + * every row (window function), so page + count cost a single scan. 
+ */ +export function getQueueRanking(reader: ClickhouseReader) { return reader.query({ - name: "getQueueRankingPage", - query: `SELECT queue_name - FROM trigger_dev.queue_metrics_v1 - WHERE ${RANKING_WHERE} - GROUP BY queue_name - ORDER BY - if({byQueuedOnly: UInt8} = 1, max(max_queued), max(max_queued) + max(max_running)) DESC, - queue_name ASC + name: "getQueueRanking", + query: `SELECT queue_name, count() OVER () AS ranked_total + FROM ( + SELECT queue_name + FROM trigger_dev.queue_metrics_5m_v1 + WHERE ${RANKING_WHERE} + GROUP BY queue_name + ORDER BY + if({byQueuedOnly: UInt8} = 1, max(max_queued), max(max_queued) + max(max_running)) DESC, + queue_name ASC + ) LIMIT {limit: UInt32} OFFSET {offset: UInt32}`, params: QueueRankingParams, schema: QueueRankingRow, + settings: QUEUE_METRICS_CACHE_SETTINGS, + }); +} + +const QueueRankingNamesParams = QueueRankingParams.omit({ byQueuedOnly: true, offset: true }); + +const QueueRankingNameRow = z.object({ + queue_name: z.string(), +}); + +/** All ranked queue names (activity order), used to exclude them from the alphabetical tail. */ +export function getQueueRankingNames(reader: ClickhouseReader) { + return reader.query({ + name: "getQueueRankingNames", + query: `SELECT queue_name + FROM trigger_dev.queue_metrics_5m_v1 + WHERE ${RANKING_WHERE} + GROUP BY queue_name + ORDER BY max(max_queued) + max(max_running) DESC, queue_name ASC + LIMIT {limit: UInt32}`, + params: QueueRankingNamesParams, + schema: QueueRankingNameRow, + settings: QUEUE_METRICS_CACHE_SETTINGS, }); } @@ -155,15 +195,16 @@ const QueueRankingCountRow = z.object({ ranked: z.coerce.number(), }); -/** How many queues have activity in the ranking window (the ranked head of the list). */ +/** Ranked-queue count alone, for pages past the ranked head (approximate uniq is fine). 
*/ export function getQueueRankingCount(reader: ClickhouseReader) { return reader.query({ name: "getQueueRankingCount", - query: `SELECT uniqExact(queue_name) AS ranked - FROM trigger_dev.queue_metrics_v1 + query: `SELECT uniq(queue_name) AS ranked + FROM trigger_dev.queue_metrics_5m_v1 WHERE ${RANKING_WHERE}`, params: QueueRankingCountParams, schema: QueueRankingCountRow, + settings: QUEUE_METRICS_CACHE_SETTINGS, }); } diff --git a/internal-packages/tsql/src/query/functions.ts b/internal-packages/tsql/src/query/functions.ts index 4b2ac0cb553..a6dadf0f609 100644 --- a/internal-packages/tsql/src/query/functions.ts +++ b/internal-packages/tsql/src/query/functions.ts @@ -645,6 +645,13 @@ export const TSQL_AGGREGATIONS: Record = { maxParams: 1, aggregate: true, }, + quantilesTDigestMerge: { + clickhouseName: "quantilesTDigestMerge", + minArgs: 1, + maxArgs: 1, + minParams: 1, + aggregate: true, + }, sumMerge: { clickhouseName: "sumMerge", minArgs: 1, maxArgs: 1, aggregate: true }, avgMerge: { clickhouseName: "avgMerge", minArgs: 1, maxArgs: 1, aggregate: true }, countMerge: { clickhouseName: "countMerge", minArgs: 1, maxArgs: 1, aggregate: true }, diff --git a/internal-packages/tsql/src/query/printer.test.ts b/internal-packages/tsql/src/query/printer.test.ts index 831e73f88c9..8ae1164b3bd 100644 --- a/internal-packages/tsql/src/query/printer.test.ts +++ b/internal-packages/tsql/src/query/printer.test.ts @@ -3982,3 +3982,68 @@ describe("timeBucket() fillGaps", () => { expect(sql).toContain("ORDER BY timebucket DESC"); }); }); + +describe("cross-queue counter totals via subquery (env-wide throughput shape)", () => { + // deltaSumTimestamp states must merge per queue, then sum outside; this is the + // supported shape for env-wide totals. 
+ const metricsSchema: TableSchema = { + name: "metrics", + clickhouseName: "trigger_dev.queue_metrics_v1", + timeConstraint: "bucket_at", + columns: { + bucket_at: { name: "bucket_at", clickhouseName: "created_at", ...column("DateTime64") }, + queue_name: { name: "queue_name", ...column("String") }, + started_delta: { + name: "started_delta", + ...column("String"), + groupable: false, + sortable: false, + filterable: false, + }, + organization_id: { name: "organization_id", ...column("String") }, + project_id: { name: "project_id", ...column("String") }, + environment_id: { name: "environment_id", ...column("String") }, + }, + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + }; + + function runSubquery(query: string) { + const context = createPrinterContext({ + schema: createSchemaRegistry([metricsSchema]), + enforcedWhereClause: { + organization_id: { op: "eq", value: "org_test123" }, + }, + timeRange: { + from: new Date("2024-01-01T00:00:00Z"), + to: new Date("2024-01-08T00:00:00Z"), + }, + }); + const result = printToClickHouse(parseTSQLSelect(query), context); + return { ...result, warnings: context.warnings }; + } + + it("compiles per-queue merge + outer sum, with tenant scoping inside the subquery", () => { + const { sql, params } = runSubquery(` + SELECT t, sum(started) AS started + FROM ( + SELECT timeBucket() AS t, queue_name, deltaSumTimestampMerge(started_delta) AS started + FROM metrics + GROUP BY t, queue_name + ) + GROUP BY t + ORDER BY t + `); + + expect(sql).toContain("deltaSumTimestampMerge(started_delta)"); + expect(sql).toContain("toStartOfInterval(created_at, INTERVAL 6 HOUR)"); + const subqueryStart = sql.indexOf("FROM ("); + const tenantFilter = sql.indexOf("organization_id"); + expect(subqueryStart).toBeGreaterThan(-1); + expect(tenantFilter).toBeGreaterThan(subqueryStart); + expect(Object.values(params)).toContain("org_test123"); + }); +}); diff --git 
a/internal-packages/tsql/src/query/schema.ts b/internal-packages/tsql/src/query/schema.ts index 68007c8e62e..5445f1a5d0e 100644 --- a/internal-packages/tsql/src/query/schema.ts +++ b/internal-packages/tsql/src/query/schema.ts @@ -415,6 +415,16 @@ export interface TableSchema { * is needed to get correct results. Not needed for plain MergeTree tables. */ useFinal?: boolean; + /** + * Coarser physical rollups with an identical logical schema, substituted by callers + * (not the printer) when the timeBucket() interval is at least minIntervalSeconds. + */ + rollups?: Array<{ minIntervalSeconds: number; clickhouseName: string }>; + /** + * Opt into the ClickHouse query cache; callers align time bounds to alignSeconds + * so repeated auto-refresh queries share cache entries. + */ + queryCache?: { ttlSeconds: number; alignSeconds: number }; } /** From c28b6cfbce44948561af2d9dcfdfc12384494853 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 20:55:13 +0100 Subject: [PATCH 28/37] feat(webapp): serve queue metrics reads from rollups and fix env totals The built-in queues dashboard's enqueued vs started chart merged counter states across queues, which mixes unrelated cumulative counters and returns wrong totals; it now merges per queue and sums outside. Env header tiles and saturation charts read the environment rollup, so their cost no longer scales with queue count, and coarse-bucket ranges are served from the 5m rollup automatically. Queue list ranking runs as one query, time bounds are aligned to the bucket grid, and repeated auto-refresh reads share ClickHouse query-cache entries. 
--- .../presenters/v3/BuiltInDashboards.server.ts | 20 +-- .../v3/QueueListPresenter.server.ts | 49 ++++--- .../v3/QueueMetricsPresenter.server.ts | 5 +- .../route.tsx | 8 +- .../app/services/queryService.server.ts | 74 +++++++++- apps/webapp/app/v3/querySchemas.ts | 130 +++++++++++++++++- 6 files changed, 240 insertions(+), 46 deletions(-) diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts index 48bc26f6438..65d5bd0b6ff 100644 --- a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts +++ b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts @@ -575,22 +575,22 @@ const queuesDashboard: BuiltInDashboard = { widgets: { "env-used": { title: "Concurrency in use", - query: `SELECT argMax(max_env_running, bucket_start) AS in_use\nFROM queue_metrics`, + query: `SELECT argMax(max_env_running, bucket_start) AS in_use\nFROM env_metrics`, display: { type: "bignumber", column: "in_use", aggregation: "max", abbreviate: false }, }, "env-limit": { title: "Environment limit", - query: `SELECT argMax(max_env_limit, bucket_start) AS env_limit\nFROM queue_metrics`, + query: `SELECT argMax(max_env_limit, bucket_start) AS env_limit\nFROM env_metrics`, display: { type: "bignumber", column: "env_limit", aggregation: "max", abbreviate: false }, }, "env-avail": { title: "Available slots", - query: `SELECT argMax(max_env_limit, bucket_start) - argMax(max_env_running, bucket_start) AS available\nFROM queue_metrics`, + query: `SELECT argMax(max_env_limit, bucket_start) - argMax(max_env_running, bucket_start) AS available\nFROM env_metrics`, display: { type: "bignumber", column: "available", aggregation: "max", abbreviate: false }, }, "env-sat": { title: "Env saturation", - query: `SELECT round(argMax(max_env_running, bucket_start) * 100.0 / nullIf(argMax(max_env_limit, bucket_start), 0), 1) AS saturation\nFROM queue_metrics`, + query: `SELECT round(argMax(max_env_running, bucket_start) * 100.0 / 
nullIf(argMax(max_env_limit, bucket_start), 0), 1) AS saturation\nFROM env_metrics`, display: { type: "bignumber", column: "saturation", @@ -601,7 +601,7 @@ const queuesDashboard: BuiltInDashboard = { }, "sat-time": { title: "Environment saturation over time", - query: `SELECT timeBucket() AS t,\n round(max(max_env_running) * 100.0 / nullIf(max(max_env_limit), 0), 1) AS saturation\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n round(max(max_env_running) * 100.0 / nullIf(max(max_env_limit), 0), 1) AS saturation\nFROM env_metrics\nGROUP BY t\nORDER BY t`, display: { type: "chart", chartType: "line", @@ -616,7 +616,7 @@ const queuesDashboard: BuiltInDashboard = { }, "used-limit": { title: "Concurrency used vs limit", - query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`, // Single-series gauge: carry the last known used/limit across idle buckets instead of dropping to 0. fillGaps: true, display: { @@ -695,9 +695,9 @@ const queuesDashboard: BuiltInDashboard = { }, throughput: { title: "Enqueued vs started", - query: `SELECT timeBucket() AS t,\n deltaSumTimestampMerge(enqueue_delta) AS enqueued,\n deltaSumTimestampMerge(started_delta) AS started\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, - // Single-series counters: zero-fill idle buckets so the line returns to 0 rather than interpolating across gaps. - fillGaps: true, + // Counter states merge per queue, then sum outside: a single merge across queues + // mixes unrelated odometers and returns wrong totals. 
+ query: `SELECT t, sum(enq) AS enqueued, sum(st) AS started\nFROM (\n SELECT timeBucket() AS t, queue,\n deltaSumTimestampMerge(enqueue_delta) AS enq,\n deltaSumTimestampMerge(started_delta) AS st\n FROM queue_metrics\n GROUP BY t, queue\n)\nGROUP BY t\nORDER BY t`, display: { type: "chart", chartType: "line", @@ -712,7 +712,7 @@ const queuesDashboard: BuiltInDashboard = { }, "wait-pct": { title: "Scheduling delay p50/p95/p99 (ms)", - query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[3]) AS p99\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.95, 0.99)(wait_quantiles)[1]) AS p50,\n round(quantilesTDigestMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95,\n round(quantilesTDigestMerge(0.5, 0.95, 0.99)(wait_quantiles)[3]) AS p99\nFROM env_metrics\nGROUP BY t\nORDER BY t`, display: { type: "chart", chartType: "line", diff --git a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts index 747eb119568..4a1641a1be7 100644 --- a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts @@ -217,40 +217,49 @@ export class QueueListPresenter extends BasePresenter { "query" ); + // The window start is aligned to the minute so repeated page loads produce identical + // query text and can share ClickHouse query-cache entries. 
+ const windowStartMs = + Math.floor((Date.now() - QUEUE_RANKING_WINDOW_MINUTES * 60 * 1000) / 60_000) * 60_000; const rankingArgs = { organizationId: environment.organizationId, projectId: environment.projectId, environmentId: environment.id, - startTime: formatClickhouseDateTime( - new Date(Date.now() - QUEUE_RANKING_WINDOW_MINUTES * 60 * 1000) - ), + startTime: formatClickhouseDateTime(new Date(windowStartMs)), nameContains: query?.trim() ?? "", }; - const [countError, countRows] = await clickhouse.queueMetrics.rankingCount(rankingArgs); - if (countError) { - throw countError; + const offset = (page - 1) * this.perPage; + + // One scan returns the page and the total ranked count (window function). + const [pageError, pageRows] = await clickhouse.queueMetrics.ranking({ + ...rankingArgs, + byQueuedOnly: sort === "queued" ? 1 : 0, + limit: this.perPage, + offset, + }); + if (pageError) { + throw pageError; + } + + let ranked = pageRows?.[0]?.ranked_total ?? 0; + if (ranked === 0 && offset > 0) { + // Empty page past the ranked head: fetch the count alone for the tail slot math. + const [countError, countRows] = await clickhouse.queueMetrics.rankingCount(rankingArgs); + if (countError) { + throw countError; + } + ranked = countRows?.[0]?.ranked ?? 0; } - const ranked = countRows?.[0]?.ranked ?? 0; if (ranked > MAX_RANKED_QUEUES) { return null; } const where = buildQueueListWhere(environment.id, query, type); const totalQueues = await this._replica.taskQueue.count({ where }); - const offset = (page - 1) * this.perPage; let rankedPageQueues: QueueListRow[] = []; - if (offset < ranked) { - const [pageError, pageRows] = await clickhouse.queueMetrics.rankingPage({ - ...rankingArgs, - byQueuedOnly: sort === "queued" ? 1 : 0, - limit: this.perPage, - offset, - }); - if (pageError) { - throw pageError; - } + if ((pageRows?.length ?? 0) > 0) { const rankedNames = (pageRows ?? 
[]).map((row) => row.queue_name); rankedPageQueues = await this.findQueuesByNames(where, rankedNames); } @@ -263,11 +272,9 @@ export class QueueListPresenter extends BasePresenter { if (tailNeeded > 0) { let excludedNames: string[] = []; if (ranked > 0) { - const [allError, allRows] = await clickhouse.queueMetrics.rankingPage({ + const [allError, allRows] = await clickhouse.queueMetrics.rankingNames({ ...rankingArgs, - byQueuedOnly: 0, limit: MAX_RANKED_QUEUES, - offset: 0, }); if (allError) { throw allError; diff --git a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts index b683ef41798..a36c402dda7 100644 --- a/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueMetricsPresenter.server.ts @@ -67,13 +67,16 @@ export class QueueMetricsPresenter { "query" ); + // End bound snaps up to the bucket grid so repeated loads within a bucket produce + // identical params and share ClickHouse query-cache entries. 
+ const endMs = Math.ceil(to.getTime() / bucketIntervalMs) * bucketIntervalMs; const ids = { organizationId: environment.organizationId, projectId: environment.projectId, environmentId: environment.id, queueNames, startTime: formatClickhouseDateTime(new Date(bucketStartMs)), - endTime: formatClickhouseDateTime(to), + endTime: formatClickhouseDateTime(new Date(endMs)), }; const [summaryResult, sparklineResult] = await Promise.all([ diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx index 66b6c9962d4..1f9611947ba 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/route.tsx @@ -1327,7 +1327,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ id: "saturation", label: "Env saturation", color: "#6366F1", - query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n max(max_env_running) AS used,\n max(max_env_limit) AS env_limit\nFROM env_metrics\nGROUP BY t\nORDER BY t`, formatValue: (v) => `${v}%`, derive: (rows) => { const sparkline = rows.map((r) => { @@ -1342,7 +1342,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ id: "backlog", label: "Backlog", color: "#A78BFA", - query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n max(max_env_queued) AS queued\nFROM env_metrics\nGROUP BY t\nORDER BY t`, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.queued)); const peak = sparkline.reduce((max, v) => Math.max(max, v), 0); @@ -1353,7 +1353,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ id: "p95", 
label: "Scheduling delay p95", color: "#F59E0B", - query: `SELECT timeBucket() AS t,\n round(quantilesMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n round(quantilesTDigestMerge(0.5, 0.95, 0.99)(wait_quantiles)[2]) AS p95\nFROM env_metrics\nGROUP BY t\nORDER BY t`, formatValue: formatWaitMs, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.p95)); @@ -1370,7 +1370,7 @@ const QUEUE_HEADER_TILES: QueueHeaderTile[] = [ id: "throttled", label: "Throttled", color: "#F59E0B", - query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM queue_metrics\nGROUP BY t\nORDER BY t`, + query: `SELECT timeBucket() AS t,\n sum(throttled_count) AS throttled\nFROM env_metrics\nGROUP BY t\nORDER BY t`, derive: (rows) => { const sparkline = rows.map((r) => tileNumber(r.throttled)); const total = sparkline.reduce((sum, v) => sum + v, 0); diff --git a/apps/webapp/app/services/queryService.server.ts b/apps/webapp/app/services/queryService.server.ts index 57b877ed876..70af40ec89f 100644 --- a/apps/webapp/app/services/queryService.server.ts +++ b/apps/webapp/app/services/queryService.server.ts @@ -7,7 +7,12 @@ import { type TSQLQueryResult, } from "@internal/clickhouse"; import type { CustomerQuerySource } from "@trigger.dev/database"; -import type { TableSchema, WhereClauseCondition } from "@internal/tsql"; +import { + calculateTimeBucketInterval, + type TableSchema, + type TimeBucketInterval, + type WhereClauseCondition, +} from "@internal/tsql"; import { z } from "zod"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; @@ -110,6 +115,41 @@ export type ExecuteQueryResult = } | { success: false; error: Error }; +const INTERVAL_UNIT_SECONDS: Record = { + SECOND: 1, + MINUTE: 60, + HOUR: 3_600, + DAY: 86_400, + WEEK: 604_800, + MONTH: 2_592_000, +}; + +function floorToSeconds(date: Date, alignSeconds: number): Date { + const ms = alignSeconds * 1000; 
+ return new Date(Math.floor(date.getTime() / ms) * ms); +} + +/** + * Swap a table for one of its rollups when the query's bucket interval is at least the + * rollup's granularity. The rollup has identical logical columns, so only the physical + * table (and therefore rows read) changes. + */ +function resolveRollup(schema: TableSchema, timeRange: { from: Date; to: Date }): TableSchema { + if (!schema.rollups || schema.rollups.length === 0) { + return schema; + } + const interval = calculateTimeBucketInterval( + timeRange.from, + timeRange.to, + schema.timeBucketThresholds + ); + const intervalSeconds = interval.value * INTERVAL_UNIT_SECONDS[interval.unit]; + const best = [...schema.rollups] + .sort((a, b) => b.minIntervalSeconds - a.minIntervalSeconds) + .find((r) => r.minIntervalSeconds <= intervalSeconds); + return best ? { ...schema, clickhouseName: best.clickhouseName } : schema; +} + export async function getDefaultPeriod(organizationId: string): Promise { const idealDefaultPeriodDays = 7; const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30); @@ -183,6 +223,14 @@ export async function executeQuery( defaultPeriod, }); + // Align the time bounds so repeated auto-refresh queries produce identical query + // params and can share ClickHouse query-cache entries (params are part of the key). + const alignSeconds = matchedSchema?.queryCache?.alignSeconds; + if (alignSeconds) { + if (timeFilter.from) timeFilter.from = floorToSeconds(timeFilter.from, alignSeconds); + if (timeFilter.to) timeFilter.to = floorToSeconds(timeFilter.to, alignSeconds); + } + // Calculate the effective "from" date the user is requesting (for period clipping check) // This is null only when the user specifies just a "to" date (rare case) let requestedFromDate: Date | null = null; @@ -192,6 +240,9 @@ export async function executeQuery( // Period specified (or default) - calculate from now const periodMs = parse(timeFilter.period ?? defaultPeriod) ?? 
7 * 24 * 60 * 60 * 1000; requestedFromDate = new Date(Date.now() - periodMs); + if (alignSeconds) { + requestedFromDate = floorToSeconds(requestedFromDate, alignSeconds); + } } // Build the fallback WHERE condition based on what the user specified @@ -207,7 +258,10 @@ export async function executeQuery( } const maxQueryPeriod = await getLimit(organizationId, "queryPeriodDays", 30); - const maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000); + let maxQueryPeriodDate = new Date(Date.now() - maxQueryPeriod * 24 * 60 * 60 * 1000); + if (alignSeconds) { + maxQueryPeriodDate = floorToSeconds(maxQueryPeriodDate, alignSeconds); + } // Check if the requested time period exceeds the plan limit const periodClipped = requestedFromDate !== null && requestedFromDate < maxQueryPeriodDate; @@ -255,6 +309,10 @@ export async function executeQuery( to: to ?? undefined, defaultPeriod, }); + if (alignSeconds) { + timeRange.from = floorToSeconds(timeRange.from, alignSeconds); + timeRange.to = floorToSeconds(timeRange.to, alignSeconds); + } try { // Build field mappings for project_ref → project_id and environment_id → slug translation @@ -277,10 +335,19 @@ export async function executeQuery( organizationId, "query" ); + // Serve coarse-bucket queries from the table's rollup when one qualifies. + const effectiveSchemas = matchedSchema?.rollups + ? querySchemas.map((s) => (s === matchedSchema ? resolveRollup(s, timeRange) : s)) + : querySchemas; + + const queryCacheSettings: ClickHouseSettings = matchedSchema?.queryCache + ? 
{ use_query_cache: 1, query_cache_ttl: matchedSchema.queryCache.ttlSeconds } + : {}; + const result = await executeTSQL(queryClickhouse.reader, { ...baseOptions, schema: z.record(z.any()), - tableSchema: querySchemas, + tableSchema: effectiveSchemas, transformValues: true, enforcedWhereClause, fieldMappings, @@ -290,6 +357,7 @@ export async function executeQuery( timeRange, clickhouseSettings: { ...getDefaultClickhouseSettings(), + ...queryCacheSettings, ...baseOptions.clickhouseSettings, // Allow caller overrides if needed }, querySettings: { diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index 2938d254cf8..acdaa503d59 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -664,12 +664,13 @@ export const queueMetricsSchema: TableSchema = { }), }, // Cumulative-counter delta states. Read with deltaSumTimestampMerge() (loss-tolerant, - // reset-safe), never sum(); opaque like wait_quantiles. + // reset-safe), never sum(); opaque like wait_quantiles. Merging across queues is + // invalid (mixes unrelated odometers): totals must GROUP BY queue, then sum outside. enqueue_delta: { name: "enqueue_delta", ...column("String", { description: - "Runs enqueued (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta).", + "Runs enqueued (cumulative-counter delta). Read with deltaSumTimestampMerge(enqueue_delta) grouped by queue. For totals across queues, sum the per-queue results in an outer query, never merge across queues.", }), groupable: false, sortable: false, @@ -679,7 +680,7 @@ export const queueMetricsSchema: TableSchema = { name: "started_delta", ...column("String", { description: - "Runs dequeued/started (throughput). Read with deltaSumTimestampMerge(started_delta).", + "Runs dequeued/started (throughput). Read with deltaSumTimestampMerge(started_delta) grouped by queue. 
For totals across queues, sum the per-queue results in an outer query, never merge across queues.", coreColumn: true, }), groupable: false, @@ -689,7 +690,8 @@ export const queueMetricsSchema: TableSchema = { ack_delta: { name: "ack_delta", ...column("String", { - description: "Runs acked (completed). Read with deltaSumTimestampMerge(ack_delta).", + description: + "Runs acked (completed). Read with deltaSumTimestampMerge(ack_delta) grouped by queue; sum per-queue results for totals.", }), groupable: false, sortable: false, @@ -698,7 +700,8 @@ export const queueMetricsSchema: TableSchema = { nack_delta: { name: "nack_delta", ...column("String", { - description: "Runs nacked. Read with deltaSumTimestampMerge(nack_delta).", + description: + "Runs nacked. Read with deltaSumTimestampMerge(nack_delta) grouped by queue; sum per-queue results for totals.", }), groupable: false, sortable: false, @@ -707,7 +710,8 @@ export const queueMetricsSchema: TableSchema = { dlq_delta: { name: "dlq_delta", ...column("String", { - description: "Runs dead-lettered. Read with deltaSumTimestampMerge(dlq_delta).", + description: + "Runs dead-lettered. Read with deltaSumTimestampMerge(dlq_delta) grouped by queue; sum per-queue results for totals.", }), groupable: false, sortable: false, @@ -798,11 +802,122 @@ export const queueMetricsSchema: TableSchema = { { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, ] satisfies BucketThreshold[], + // Ranges whose bucket interval is >= 5 minutes read the 5m rollup instead (same + // logical columns, ~30x fewer rows). + rollups: [{ minIntervalSeconds: 300, clickhouseName: "trigger_dev.queue_metrics_5m_v1" }], + queryCache: { ttlSeconds: 30, alignSeconds: 30 }, }; /** - * All available schemas for the query editor + * Schema definition for the env_metrics table (trigger_dev.env_metrics_1m_v1). 
+ * Environment-level rollup of queue_metrics into 1-minute buckets with the queue + * dimension dropped, so header tiles and saturation charts cost the same regardless + * of how many queues the environment has. */ +export const envMetricsSchema: TableSchema = { + name: "env_metrics", + clickhouseName: "trigger_dev.env_metrics_1m_v1", + description: + "Environment-level concurrency, saturation, throttling, and scheduling-delay metrics (1-minute buckets)", + timeConstraint: "bucket_start", + tenantColumns: { + organizationId: "organization_id", + projectId: "project_id", + environmentId: "environment_id", + }, + columns: { + environment: { + name: "environment", + clickhouseName: "environment_id", + ...column("String", { description: "The environment slug", example: "prod" }), + fieldMapping: "environment", + customRenderType: "environment", + }, + project: { + name: "project", + clickhouseName: "project_id", + ...column("String", { + description: "The project reference, they always start with `proj_`.", + example: "proj_howcnaxbfxdmwmxazktx", + }), + fieldMapping: "project", + customRenderType: "project", + }, + bucket_start: { + name: "bucket_start", + ...column("DateTime", { + description: "The start of the 1-minute aggregation bucket", + example: "2024-01-15 09:30:00", + coreColumn: true, + }), + }, + max_env_queued: { + name: "max_env_queued", + ...column("UInt32", { + description: "Peak environment-wide queued in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_env_running: { + name: "max_env_running", + ...column("UInt32", { + description: "Peak environment-wide running in the bucket. Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + max_env_limit: { + name: "max_env_limit", + ...column("UInt32", { + description: "The environment concurrency limit. 
Aggregate with max().", + coreColumn: true, + fillMode: "carry", + }), + }, + throttled_count: { + name: "throttled_count", + ...column("UInt64", { + description: + "Gauge emissions where a queue was at its limit with work queued. Aggregate with sum().", + coreColumn: true, + }), + }, + wait_ms_sum: { + name: "wait_ms_sum", + ...column("UInt64", { + description: "Sum of scheduling delays (ms). Mean = wait_ms_sum/wait_ms_count.", + }), + }, + wait_ms_count: { + name: "wait_ms_count", + ...column("UInt64", { + description: "Count of scheduling-delay samples. Aggregate with sum().", + }), + }, + wait_quantiles: { + name: "wait_quantiles", + ...column("String", { + description: + "Scheduling-delay quantile state (TDigest). Read with quantilesTDigestMerge(0.5,0.9,0.95,0.99)(wait_quantiles)[n].", + }), + groupable: false, + sortable: false, + filterable: false, + }, + }, + timeBucketThresholds: [ + { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, + { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, + { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, + { maxRangeSeconds: 30 * 24 * 60 * 60, interval: { value: 1, unit: "HOUR" } }, + { maxRangeSeconds: 90 * 24 * 60 * 60, interval: { value: 6, unit: "HOUR" } }, + { maxRangeSeconds: 180 * 24 * 60 * 60, interval: { value: 1, unit: "DAY" } }, + { maxRangeSeconds: 365 * 24 * 60 * 60, interval: { value: 1, unit: "WEEK" } }, + ] satisfies BucketThreshold[], + queryCache: { ttlSeconds: 30, alignSeconds: 30 }, +}; + /** * Schema definition for the llm_metrics table (trigger_dev.llm_metrics_v1) */ @@ -1168,6 +1283,7 @@ export const querySchemas: TableSchema[] = [ llmMetricsSchema, llmModelsSchema, queueMetricsSchema, + envMetricsSchema, ]; /** From df0bbe1afee972ed90ca580cd9470db49d5a9fd7 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 21:53:10 +0100 Subject: [PATCH 29/37] feat(clickhouse,webapp): keep 10-second resolution on the env 
metrics rollup The env rollup's win comes from dropping the queue dimension, not from coarser buckets: row count is queue-independent (~8640/day/env), so full 10-second granularity stays cheap at any range. Env header tiles and saturation charts now resolve short-range detail exactly like the per-queue charts, and the current-value tiles read the latest 10-second bucket instead of a minute-wide one. --- apps/webapp/app/v3/querySchemas.ts | 16 +++++++++------- .../schema/035_create_queue_metrics_v1.sql | 15 ++++++++------- .../clickhouse/src/queueMetrics.test.ts | 16 ++++++++++++++-- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/apps/webapp/app/v3/querySchemas.ts b/apps/webapp/app/v3/querySchemas.ts index acdaa503d59..4153be016b5 100644 --- a/apps/webapp/app/v3/querySchemas.ts +++ b/apps/webapp/app/v3/querySchemas.ts @@ -809,16 +809,17 @@ export const queueMetricsSchema: TableSchema = { }; /** - * Schema definition for the env_metrics table (trigger_dev.env_metrics_1m_v1). - * Environment-level rollup of queue_metrics into 1-minute buckets with the queue - * dimension dropped, so header tiles and saturation charts cost the same regardless - * of how many queues the environment has. + * Schema definition for the env_metrics table (trigger_dev.env_metrics_v1). + * Environment-level rollup of queue_metrics with the queue dimension dropped, so + * header tiles and saturation charts cost the same regardless of how many queues + * the environment has. Keeps the full 10-second granularity: row count is + * queue-independent, so even 30-day ranges stay small. 
*/ export const envMetricsSchema: TableSchema = { name: "env_metrics", - clickhouseName: "trigger_dev.env_metrics_1m_v1", + clickhouseName: "trigger_dev.env_metrics_v1", description: - "Environment-level concurrency, saturation, throttling, and scheduling-delay metrics (1-minute buckets)", + "Environment-level concurrency, saturation, throttling, and scheduling-delay metrics (10-second buckets)", timeConstraint: "bucket_start", tenantColumns: { organizationId: "organization_id", @@ -846,7 +847,7 @@ export const envMetricsSchema: TableSchema = { bucket_start: { name: "bucket_start", ...column("DateTime", { - description: "The start of the 1-minute aggregation bucket", + description: "The start of the 10-second aggregation bucket", example: "2024-01-15 09:30:00", coreColumn: true, }), @@ -907,6 +908,7 @@ export const envMetricsSchema: TableSchema = { }, }, timeBucketThresholds: [ + { maxRangeSeconds: 3 * 60 * 60, interval: { value: 10, unit: "SECOND" } }, { maxRangeSeconds: 12 * 60 * 60, interval: { value: 1, unit: "MINUTE" } }, { maxRangeSeconds: 2 * 24 * 60 * 60, interval: { value: 5, unit: "MINUTE" } }, { maxRangeSeconds: 7 * 24 * 60 * 60, interval: { value: 15, unit: "MINUTE" } }, diff --git a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql index 8053de2abf4..6308ff0e5d2 100644 --- a/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql +++ b/internal-packages/clickhouse/schema/035_create_queue_metrics_v1.sql @@ -92,11 +92,12 @@ SELECT FROM trigger_dev.queue_metrics_raw_v1 GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; --- (4) Env-level 1m rollup (no queue dimension) for header tiles/saturation charts. +-- (4) Env-level 10s rollup (no queue dimension) for header tiles/saturation charts. +-- Row count is queue-independent (~8640/day/env), so full granularity stays cheap at any range. 
-- No counter deltas on purpose: cross-queue deltaSumTimestamp state merges mix unrelated -- odometers (env totals must GROUP BY queue then sum). TDigest because an env-level -- reservoir absorbs every sample in the environment. -CREATE TABLE IF NOT EXISTS trigger_dev.env_metrics_1m_v1 +CREATE TABLE IF NOT EXISTS trigger_dev.env_metrics_v1 ( organization_id LowCardinality(String), project_id LowCardinality(String), @@ -119,11 +120,11 @@ TTL bucket_start + INTERVAL 30 DAY SETTINGS ttl_only_drop_parts = 1; -- (5) MV: raw -> env rollup. -CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.env_metrics_1m_mv_v1 -TO trigger_dev.env_metrics_1m_v1 AS +CREATE MATERIALIZED VIEW IF NOT EXISTS trigger_dev.env_metrics_mv_v1 +TO trigger_dev.env_metrics_v1 AS SELECT organization_id, project_id, environment_id, - toStartOfInterval(event_time, INTERVAL 1 MINUTE) AS bucket_start, + toStartOfInterval(event_time, INTERVAL 10 SECOND) AS bucket_start, max(env_queued) AS max_env_queued, max(env_running) AS max_env_running, max(env_limit) AS max_env_limit, @@ -197,8 +198,8 @@ GROUP BY organization_id, project_id, environment_id, queue_name, bucket_start; -- +goose Down DROP VIEW IF EXISTS trigger_dev.queue_metrics_5m_mv_v1; DROP TABLE IF EXISTS trigger_dev.queue_metrics_5m_v1; -DROP VIEW IF EXISTS trigger_dev.env_metrics_1m_mv_v1; -DROP TABLE IF EXISTS trigger_dev.env_metrics_1m_v1; +DROP VIEW IF EXISTS trigger_dev.env_metrics_mv_v1; +DROP TABLE IF EXISTS trigger_dev.env_metrics_v1; DROP VIEW IF EXISTS trigger_dev.queue_metrics_mv_v1; DROP TABLE IF EXISTS trigger_dev.queue_metrics_v1; DROP TABLE IF EXISTS trigger_dev.queue_metrics_raw_v1; diff --git a/internal-packages/clickhouse/src/queueMetrics.test.ts b/internal-packages/clickhouse/src/queueMetrics.test.ts index 968cd7308a0..77a1f4ea54e 100644 --- a/internal-packages/clickhouse/src/queueMetrics.test.ts +++ b/internal-packages/clickhouse/src/queueMetrics.test.ts @@ -215,7 +215,7 @@ describe("queue_metrics_v1", () => { ); clickhouseTest( 
- "5m and env rollups agree with the 10s tier, and cross-queue totals sum per queue", + "5m and env rollups agree with the 10s tier, and env buckets are 10s", async ({ clickhouseContainer }) => { const ch = new ClickHouse({ url: clickhouseContainer.getConnectionUrl(), name: "test" }); @@ -226,6 +226,14 @@ describe("queue_metrics_v1", () => { ...counter("started", "roll-b", 3, [500, 600, 700]), { ...base("gauge", "roll-a"), running: 4, queued: 9, env_running: 30, env_limit: 50 }, { ...base("gauge", "roll-b"), running: 2, queued: 1, env_running: 45, env_limit: 50 }, + { + ...base("gauge", "roll-a"), + event_time: "2026-06-30 12:00:15", + running: 1, + queued: 2, + env_running: 20, + env_limit: 50, + }, ].map((row) => ({ ...row, organization_id: rollOrg })); const [insertError] = await ch.queueMetrics.insertRaw(rows, SYNC); expect(insertError).toBeNull(); @@ -269,12 +277,14 @@ describe("queue_metrics_v1", () => { query: `SELECT max(max_env_running) AS max_env_running, max(max_env_limit) AS max_env_limit, + uniqExact(bucket_start) AS buckets, round(quantilesTDigestMerge(0.5, 0.9, 0.95, 0.99)(wait_quantiles)[4]) AS wait_p99 - FROM trigger_dev.env_metrics_1m_v1 + FROM trigger_dev.env_metrics_v1 WHERE organization_id = {org: String}`, schema: z.object({ max_env_running: z.coerce.number(), max_env_limit: z.coerce.number(), + buckets: z.coerce.number(), wait_p99: z.coerce.number(), }), params: z.object({ org: z.string() }), @@ -282,6 +292,8 @@ describe("queue_metrics_v1", () => { expect(envError).toBeNull(); expect(envRows![0]!.max_env_running).toBe(45); expect(envRows![0]!.max_env_limit).toBe(50); + // 12:00:05 and 12:00:15 land in separate 10s env buckets (12:00:00 and 12:00:10). 
+ expect(envRows![0]!.buckets).toBe(2); expect(envRows![0]!.wait_p99).toBeGreaterThanOrEqual(600); expect(envRows![0]!.wait_p99).toBeLessThanOrEqual(1000); From efdd64f59752e3e656e86cd333c17ca401ba4caf Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 21:57:49 +0100 Subject: [PATCH 30/37] fix(webapp): include rollup tables in the queue metrics simulator reset The simulator's --reset only cleared the raw and 10s tables, leaving stale rows in the 5m and env rollups. It also force-merges the rollups after seeding so current-value widgets read cleanly. --- apps/webapp/seed-queue-metrics.mts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts index 2af9ea661d7..3e512c4305c 100644 --- a/apps/webapp/seed-queue-metrics.mts +++ b/apps/webapp/seed-queue-metrics.mts @@ -346,7 +346,12 @@ async function resetEnv(ch: ClickHouse, environmentId: string) { const raw = ( ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } } ).client; - for (const table of ["queue_metrics_raw_v1", "queue_metrics_v1"]) { + for (const table of [ + "queue_metrics_raw_v1", + "queue_metrics_v1", + "queue_metrics_5m_v1", + "env_metrics_v1", + ]) { await raw.command({ query: `DELETE FROM trigger_dev.${table} WHERE environment_id = '${environmentId}'`, }); @@ -511,6 +516,8 @@ async function main() { ch.writer as unknown as { client: { command: (a: { query: string }) => Promise } } ).client; await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_v1 FINAL` }); + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.queue_metrics_5m_v1 FINAL` }); + await raw.command({ query: `OPTIMIZE TABLE trigger_dev.env_metrics_v1 FINAL` }); const origin = process.env.APP_ORIGIN ?? 
"http://localhost:3030"; console.log( From d162dcf536a65bee1b0dd9ca92067e32be7f4643 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 22:38:59 +0100 Subject: [PATCH 31/37] fix(webapp): update the queue metrics simulator for cumulative counters Counter events now emit per queue and op odometer readings with a seeded zero baseline, matching the production emitter, so throughput and started counts reconstruct from simulated data instead of reading zero. Scenario switches prune the previous scenario's queues, a --project flag seeds each scenario into its own project for side-by-side design review, and a new many-queues scenario covers pagination and relevance ranking with one runaway queue, a busy head, a bursty middle, and a sparse tail. Adds --help. --- apps/webapp/seed-queue-metrics.mts | 195 ++++++++++++++++++++++++++--- 1 file changed, 176 insertions(+), 19 deletions(-) diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts index 3e512c4305c..32da6b9b727 100644 --- a/apps/webapp/seed-queue-metrics.mts +++ b/apps/webapp/seed-queue-metrics.mts @@ -185,6 +185,39 @@ const scenarios: Record Sce ], }), + // Pagination + relevance-ranking design surface: one runaway queue, a busy-but-healthy + // head, a bursty middle, and a long sparse tail across 61 queues (the list pages at 25). 
+ "many-queues": () => ({ + description: + "61 queues: one runaway, busy head, bursty middle, long sparse tail (pagination + ranking)", + envLimit: () => 150, + queues: [ + { name: "imports", limit: () => 8, arrivals: (_b, r) => poisson(14, r), waitBaseMs: 80 }, + ...["checkout", "notifications", "emails"].map((name, i) => ({ + name, + limit: () => 15, + arrivals: (_b: number, r: Rng) => poisson(7 + i, r), + waitBaseMs: 60, + })), + ...Array.from({ length: 12 }, (_v, i) => + bursty(`service-${String(i + 1).padStart(2, "0")}`, 10, 2) + ), + ...Array.from({ length: 20 }, (_v, i) => ({ + name: `job-${String(i + 1).padStart(2, "0")}`, + limit: () => 5, + arrivals: (_b: number, r: Rng) => poisson(1, r), + waitBaseMs: 40, + })), + ...Array.from({ length: 25 }, (_v, i) => ({ + name: `tenant-${String(i + 1).padStart(2, "0")}`, + limit: () => 3, + arrivals: (_b: number, r: Rng) => (r() < 0.05 ? poisson(2, r) : 0), + waitBaseMs: 30, + sparse: true, + })), + ], + }), + // Default: one env with a variety of queue behaviours + occasional env saturation. mixed: (totalBuckets) => ({ description: "variety of queue profiles in one env, with occasional env saturation", @@ -219,15 +252,61 @@ const WAIT_SIGMA = 0.6; const NACK_RATE = 0.02; const DLQ_RATE = 0.004; +type CounterOp = "enqueue" | "started" | "ack" | "nack" | "dlq"; +// Per-(queue, op) odometers, mirroring the production emitter: cumulative readings with a +// cum=0 baseline on the first one, so deltaSumTimestamp captures the 0->1 delta. 
+type CounterState = Record[]; + +function counterRows( + counters: CounterState, + q: number, + ids: Ids, + queueName: string, + eventTime: string, + orderKey: () => number, + op: CounterOp, + wait_ms?: number +): QueueMetricsRawV1Input[] { + const rows: QueueMetricsRawV1Input[] = []; + if (counters[q][op] === 0) { + rows.push({ + ...ids, + queue_name: queueName, + event_time: eventTime, + op, + cumulative: 0, + order_key: orderKey(), + }); + } + counters[q][op] += 1; + rows.push({ + ...ids, + queue_name: queueName, + event_time: eventTime, + op, + cumulative: counters[q][op], + order_key: orderKey(), + ...(wait_ms !== undefined ? { wait_ms } : {}), + }); + return rows; +} + +function newCounterState(n: number): CounterState { + return Array.from({ length: n }, () => ({ enqueue: 0, started: 0, ack: 0, nack: 0, dlq: 0 })); +} + // Advance one bucket of the simulation for every queue, returning the raw rows to insert. -// `backlog` is mutated in place so state carries across buckets (and into live mode). +// `backlog` and `counters` are mutated in place so state carries across buckets (and into +// live mode). function simulateBucket( scenario: Scenario, bucket: number, bucketSec: number, eventTime: string, + bucketEpochSec: number, ids: Ids, backlog: number[], + counters: CounterState, rng: Rng ): QueueMetricsRawV1Input[] { const envLimit = scenario.envLimit(bucket); @@ -259,6 +338,11 @@ function simulateBucket( envQueued += queued[q]; } + // Order keys are time-based (like the production stream ids) so appended runs and live + // mode stay monotonic; the per-bucket sequence keeps them unique within a bucket. 
+ let bucketSeq = 0; + const orderKey = () => bucketEpochSec * 1_000_000 + bucketSeq++; + const rows: QueueMetricsRawV1Input[] = []; for (let q = 0; q < n; q++) { const profile = scenario.queues[q]; @@ -287,21 +371,26 @@ function simulateBucket( rows.push(gauge); for (let a = 0; a < arrivals; a++) { - rows.push({ ...ids, queue_name: profile.name, event_time: eventTime, op: "enqueue" }); + rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, "enqueue")); } const medianWait = profile.waitBaseMs + (prior / Math.max(limit[q], 1)) * bucketSec * 1000; for (let s = 0; s < started; s++) { - rows.push({ - ...ids, - queue_name: profile.name, - event_time: eventTime, - op: "started", - wait_ms: Math.round(lognormal(medianWait, WAIT_SIGMA, rng)), - }); + rows.push( + ...counterRows( + counters, + q, + ids, + profile.name, + eventTime, + orderKey, + "started", + Math.round(lognormal(medianWait, WAIT_SIGMA, rng)) + ) + ); const roll = rng(); - const op = roll < DLQ_RATE ? "dlq" : roll < DLQ_RATE + NACK_RATE ? "nack" : "ack"; - rows.push({ ...ids, queue_name: profile.name, event_time: eventTime, op }); + const op: CounterOp = roll < DLQ_RATE ? "dlq" : roll < DLQ_RATE + NACK_RATE ? "nack" : "ack"; + rows.push(...counterRows(counters, q, ids, profile.name, eventTime, orderKey, op)); } } return rows; @@ -412,11 +501,55 @@ async function ensureTaskQueues( update: { concurrencyLimit }, }); } - console.log(`Ensured ${scenario.queues.length} task queues in Postgres.`); + + // Drop queues left over from a previously seeded scenario so switching scenarios + // does not leave metric-less rows in the list. + const { count: pruned } = await prisma.taskQueue.deleteMany({ + where: { + runtimeEnvironmentId, + name: { notIn: scenario.queues.map((q) => q.name) }, + }, + }); + console.log( + `Ensured ${scenario.queues.length} task queues in Postgres${pruned > 0 ? 
`, pruned ${pruned} stale` : ""}.` + ); +} + +function printHelp() { + const lines = Object.entries(scenarios).map( + ([name, build]) => ` ${name.padEnd(28)}${build(720, 10).description}` + ); + console.log(`Queue metrics simulator: seeds a synthetic tenant with realistic queue metrics. + +Usage: pnpm --filter webapp run db:seed:queue-metrics -- [flags] + +Flags: + --scenario which scenario to seed (default: mixed) + --project project to seed into (default: ${PROJECT_NAME}); use one + project per scenario to browse them side by side + --window how much history to backfill, e.g. 30m, 6h, 1d (default: 2h) + --bucket seconds per simulated bucket (default: 10) + --seed RNG seed for reproducible data (default: 1) + --live after backfilling, keep appending one bucket per interval + --reset clear this environment's metrics before seeding + --reset-only clear and exit without seeding + --help this text + +Scenarios: +${lines.join("\n")} + +Example designer setup (one project per scenario): + pnpm --filter webapp run db:seed:queue-metrics -- --scenario mixed --reset + pnpm --filter webapp run db:seed:queue-metrics -- --scenario many-queues --project qm-many-queues --reset + pnpm --filter webapp run db:seed:queue-metrics -- --scenario throttled-backlog --project qm-throttled --reset`); } async function main() { const flags = parseArgs(process.argv.slice(2)); + if (flags.help === "true") { + printHelp(); + process.exit(0); + } const scenarioName = flags.scenario ?? "mixed"; const build = scenarios[scenarioName]; if (!build) { @@ -453,13 +586,14 @@ async function main() { if (!org) org = await createOrganization({ title: ORG_TITLE, userId: user.id, companySize: "1-10" }); + const projectName = flags.project ?? 
PROJECT_NAME; let project = await prisma.project.findFirst({ - where: { name: PROJECT_NAME, organizationId: org.id }, + where: { name: projectName, organizationId: org.id }, }); if (!project) { project = await createProject({ organizationSlug: org.slug, - name: PROJECT_NAME, + name: projectName, userId: user.id, version: "v3", }); @@ -502,10 +636,24 @@ async function main() { // Backfill: buckets from (now - window) up to now, aligned to the bucket grid. const nowBucket = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; const startBucket = nowBucket - totalBuckets * bucketSec; + const counters = newCounterState(scenario.queues.length); const rows: QueueMetricsRawV1Input[] = []; for (let b = 0; b < totalBuckets; b++) { - const eventTime = formatChDateTime(new Date((startBucket + b * bucketSec) * 1000)); - rows.push(...simulateBucket(scenario, b, bucketSec, eventTime, ids, backlog, rng)); + const bucketEpochSec = startBucket + b * bucketSec; + const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000)); + rows.push( + ...simulateBucket( + scenario, + b, + bucketSec, + eventTime, + bucketEpochSec, + ids, + backlog, + counters, + rng + ) + ); } await insertBatched(ch, rows, nonce); console.log(`Inserted ${rows.length} raw rows.`); @@ -530,10 +678,19 @@ async function main() { // eslint-disable-next-line no-constant-condition while (true) { await new Promise((r) => setTimeout(r, bucketSec * 1000)); - const eventTime = formatChDateTime( - new Date(Math.floor(Date.now() / 1000 / bucketSec) * bucketSec * 1000) + const bucketEpochSec = Math.floor(Date.now() / 1000 / bucketSec) * bucketSec; + const eventTime = formatChDateTime(new Date(bucketEpochSec * 1000)); + const liveRows = simulateBucket( + scenario, + b, + bucketSec, + eventTime, + bucketEpochSec, + ids, + backlog, + counters, + rng ); - const liveRows = simulateBucket(scenario, b, bucketSec, eventTime, ids, backlog, rng); await insertBatched(ch, liveRows, `${nonce}:live:${b}`); console.log(`bucket ${b}: 
${liveRows.length} rows @ ${eventTime}`); b++; From 3c67a0cf376480f245d8b1acb494ff2362fbdf31 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sat, 4 Jul 2026 22:47:35 +0100 Subject: [PATCH 32/37] feat(webapp): stage fake Redis usage from the queue metrics simulator A --usage flag stages plausible running counts in the local run-queue Redis for the seeded queues, so the list's Running column and the Allocation tab's usage bars have data without the run engine. Staged state is reconciled on every run: present with --usage, cleared without. Local Redis hosts only. --- apps/webapp/seed-queue-metrics.mts | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/apps/webapp/seed-queue-metrics.mts b/apps/webapp/seed-queue-metrics.mts index 32da6b9b727..0a04473f779 100644 --- a/apps/webapp/seed-queue-metrics.mts +++ b/apps/webapp/seed-queue-metrics.mts @@ -448,6 +448,51 @@ async function resetEnv(ch: ClickHouse, environmentId: string) { console.log(`Reset queue metrics for environment ${environmentId}`); } +// Fake running counts in the run-queue Redis (Running column + allocation usage bars). +// Reconciled every run: staged with --usage, cleared otherwise. +async function stageRedisUsage(scenario: Scenario, ids: Ids, seed: number, clear: boolean) { + const host = process.env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? process.env.REDIS_HOST ?? "localhost"; + const port = Number( + process.env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? process.env.REDIS_PORT ?? 
6379 + ); + const localHosts = new Set(["localhost", "127.0.0.1", "::1", "0.0.0.0"]); + if (!localHosts.has(host)) { + console.warn(`Skipping Redis usage staging on a non-local host: ${host}`); + return; + } + try { + const { createRedisClient } = await import("@internal/redis"); + const redis = createRedisClient({ host, port }); + const rng = mulberry32(seed + 1); + const base = `engine:runqueue:{org:${ids.organization_id}}:proj:${ids.project_id}:env:${ids.environment_id}:queue:`; + for (const [q, profile] of scenario.queues.entries()) { + const key = `${base}${profile.name}:currentDequeued`; + await redis.del(key); + if (clear) continue; + const limit = profile.limit(0); + // First queue rides at/over its limit, the rest at 30-90%, sparse mostly idle. + const count = profile.sparse + ? rng() < 0.3 + ? 1 + : 0 + : q === 0 + ? limit + Math.round(rng() * 2) + : Math.round(limit * (0.3 + 0.6 * rng())); + if (count > 0) { + await redis.sadd(key, ...Array.from({ length: count }, (_v, i) => `sim_run_${i}`)); + } + } + await redis.quit(); + console.log( + clear + ? "Cleared staged Redis usage." + : "Staged fake running counts in Redis (Running column + allocation usage bars)." + ); + } catch (error) { + console.warn("Redis usage staging skipped:", error instanceof Error ? error.message : error); + } +} + // --------------------------------------------------------------------------- // Main // --------------------------------------------------------------------------- @@ -530,6 +575,8 @@ Flags: --window how much history to backfill, e.g. 
30m, 6h, 1d (default: 2h) --bucket seconds per simulated bucket (default: 10) --seed RNG seed for reproducible data (default: 1) + --usage stage fake running counts in Redis so the Running column and + the Allocation tab's usage bars have data (cleared when omitted) --live after backfilling, keep appending one bucket per interval --reset clear this environment's metrics before seeding --reset-only clear and exit without seeding @@ -625,6 +672,7 @@ async function main() { const scenario = build(totalBuckets, bucketSec); await ensureTaskQueues(scenario, project.id, runtimeEnv.id); + await stageRedisUsage(scenario, ids, seed, flags.usage !== "true"); const rng = mulberry32(seed); const backlog = new Array(scenario.queues.length).fill(0); From bbd3eaaa2b9c06f405f55e8b72a751b0e3cb5f58 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sun, 5 Jul 2026 07:11:51 +0100 Subject: [PATCH 33/37] fix(webapp): keep the search filter on the ranked queue list's tail The tail query's exclusion list overwrote the search's name filter via object spread, so searching while sorted by activity showed unrelated queues past the ranked head. Combine the conditions with AND instead. --- apps/webapp/app/presenters/v3/QueueListPresenter.server.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts index 4a1641a1be7..751d4b0a602 100644 --- a/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/QueueListPresenter.server.ts @@ -281,8 +281,10 @@ export class QueueListPresenter extends BasePresenter { } excludedNames = (allRows ?? []).map((row) => row.queue_name); } + // AND keeps the search's name filter intact alongside the exclusion (a spread + // would overwrite one name condition with the other). 
tailQueues = await this._replica.taskQueue.findMany({ - where: { ...where, name: { notIn: excludedNames } }, + where: { AND: [where, { name: { notIn: excludedNames } }] }, select: queueListSelect, orderBy: { orderableName: "asc", @@ -314,7 +316,7 @@ export class QueueListPresenter extends BasePresenter { return []; } const queues = await this._replica.taskQueue.findMany({ - where: { ...where, name: { in: names } }, + where: { AND: [where, { name: { in: names } }] }, select: queueListSelect, }); const byName = new Map(queues.map((queue) => [queue.name, queue])); From 279db19096768c58ffa126c512f7a7aff5eb630a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sun, 5 Jul 2026 07:11:51 +0100 Subject: [PATCH 34/37] fix(metrics-pipeline): drop metric emits while the metrics Redis is not ready Without a readiness guard, every fire-and-forget emit during a metrics Redis outage queued a command in ioredis's in-memory offline queue until rejection. Metrics are loss-tolerant by design, so drop instead; waitUntilReady() lets embedders await the initial connect. --- .../metrics-pipeline/src/consumer.test.ts | 2 ++ internal-packages/metrics-pipeline/src/emitter.ts | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/internal-packages/metrics-pipeline/src/consumer.test.ts b/internal-packages/metrics-pipeline/src/consumer.test.ts index fcb36176937..672fa426999 100644 --- a/internal-packages/metrics-pipeline/src/consumer.test.ts +++ b/internal-packages/metrics-pipeline/src/consumer.test.ts @@ -212,6 +212,8 @@ redisTest( flag: { enabled: () => true }, }); + // Emits before the connection is ready are dropped by design (loss-tolerant). 
+ await emitter.waitUntilReady(); emitter.emitGauge("q1", { op: "gauge", q: "q1", diff --git a/internal-packages/metrics-pipeline/src/emitter.ts b/internal-packages/metrics-pipeline/src/emitter.ts index 6574aa70fe0..625cd3e39c9 100644 --- a/internal-packages/metrics-pipeline/src/emitter.ts +++ b/internal-packages/metrics-pipeline/src/emitter.ts @@ -114,6 +114,9 @@ export class MetricsStreamEmitter { // the caller. Shares the counter stream (one stream family on the metrics Redis). emitGauge(shardKey: string, fields: MetricFields): void { if (!this.flag.enabled()) return; + // Drop rather than queue while the metrics Redis is unreachable: ioredis would hold + // every command in its offline queue until rejection, and metrics are loss-tolerant. + if (this.redis.status !== "ready") return; const op = String(fields.op ?? "gauge"); const stream = streamKey(this.def, shardFor(shardKey, this.def.shardCount)); const args: string[] = []; @@ -134,6 +137,7 @@ export class MetricsStreamEmitter { // lost XADD self-heals (the next reading restates the total); the INCR is never sampled. emit(shardKey: string, fields: MetricFields): void { if (!this.flag.enabled()) return; + if (this.redis.status !== "ready") return; const op = String(fields.op ?? "unknown"); const q = String(fields.q ?? ""); const odometerKey = `${this.def.name}_cum:${op}:${q}`; @@ -161,6 +165,12 @@ export class MetricsStreamEmitter { }); } + // Resolves once the metrics Redis connection is ready (emits before that are dropped). 
+ waitUntilReady(): Promise { + if (this.redis.status === "ready") return Promise.resolve(); + return new Promise((resolve) => this.redis.once("ready", () => resolve())); + } + async close(): Promise { await this.redis.quit(); } From 23de76ba0a34429969a7526a5b263c937484aee7 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Sun, 5 Jul 2026 09:49:08 +0100 Subject: [PATCH 35/37] feat(webapp): remove auto-balance from the allocation view The allocation view keeps manual limit edits, the review dialog, and bulk apply. The one-shot auto-balance button is removed (and the row locks whose only purpose was protecting queues from it); a policy-driven approach can replace it if rebalancing returns. --- .../AllocationView.tsx | 122 +----------------- 1 file changed, 3 insertions(+), 119 deletions(-) diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx index 7dcb0b9cab0..8b13afbcb87 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.queues/AllocationView.tsx @@ -1,4 +1,3 @@ -import { LockClosedIcon, LockOpenIcon, ScaleIcon } from "@heroicons/react/20/solid"; import { Form, useNavigation } from "@remix-run/react"; import { type ReactNode, useEffect, useMemo, useState } from "react"; import { BigNumber } from "~/components/metrics/BigNumber"; @@ -28,64 +27,6 @@ import { cn } from "~/utils/cn"; type Drafts = Record; -/** - * Distribute the env budget across unlocked queues, weighted by current load - * (running + queued), largest-remainder rounding, min 1 when the budget allows. - * Locked queues keep their current value and are subtracted from the budget first. 
- */ -export function computeAutoBalance( - queues: QueueAllocationItem[], - envLimit: number, - locked: Set, - draftLimit: (queue: QueueAllocationItem) => number | null -): Drafts { - const unlocked = queues.filter((q) => !locked.has(q.id)); - if (unlocked.length === 0) return {}; - - const lockedSum = queues - .filter((q) => locked.has(q.id)) - .reduce((sum, q) => sum + (draftLimit(q) ?? 0), 0); - const budget = Math.max(0, envLimit - lockedSum); - - const weights = unlocked.map((q) => q.running + q.queued); - const totalWeight = weights.reduce((a, b) => a + b, 0); - const raw = unlocked.map((_, i) => - totalWeight > 0 ? (budget * weights[i]) / totalWeight : budget / unlocked.length - ); - - const shares = raw.map(Math.floor); - let remainder = budget - shares.reduce((a, b) => a + b, 0); - const byFraction = raw - .map((value, i) => ({ i, fraction: value - Math.floor(value) })) - .sort((a, b) => b.fraction - a.fraction); - for (const { i } of byFraction) { - if (remainder <= 0) break; - shares[i]++; - remainder--; - } - - // Every unlocked queue gets at least 1 when the budget can afford it. 
- if (budget >= unlocked.length) { - for (let i = 0; i < shares.length; i++) { - while (shares[i] < 1) { - let donor = -1; - for (let j = 0; j < shares.length; j++) { - if (j !== i && shares[j] > 1 && (donor === -1 || shares[j] > shares[donor])) donor = j; - } - if (donor === -1) break; - shares[donor]--; - shares[i]++; - } - } - } - - const result: Drafts = {}; - unlocked.forEach((q, i) => { - result[q.id] = Math.min(Math.max(shares[i], 0), envLimit); - }); - return result; -} - export function AllocationView({ allocation, environment, @@ -94,7 +35,6 @@ export function AllocationView({ environment: Environment; }) { const [drafts, setDrafts] = useState({}); - const [locked, setLocked] = useState>(new Set()); const [reviewOpen, setReviewOpen] = useState(false); const navigation = useNavigation(); const isSubmitting = navigation.state !== "idle"; @@ -154,29 +94,6 @@ export function AllocationView({ }); }; - const toggleLock = (id: string) => { - setLocked((prev) => { - const next = new Set(prev); - if (next.has(id)) next.delete(id); - else next.add(id); - return next; - }); - }; - - const autoBalance = () => { - const balanced = computeAutoBalance(allocation.queues, envLimit, locked, draftLimit); - setDrafts((prev) => { - const next = { ...prev }; - for (const queue of allocation.queues) { - const value = balanced[queue.id]; - if (value === undefined) continue; - if (value === queue.limit) delete next[queue.id]; - else next[queue.id] = value; - } - return next; - }); - }; - const changesPayload = useMemo( () => JSON.stringify(changes.map((queue) => ({ friendlyId: queue.id, limit: drafts[queue.id] }))), @@ -237,8 +154,8 @@ export function AllocationView({ {overAllocated && ( The queue limits add up to more than the environment limit, so queues will compete for - concurrency when the environment saturates. Reduce limits (or use Auto-balance) to - guarantee each queue its allocation. + concurrency when the environment saturates. 
Reduce limits to guarantee each queue its + allocation. )} @@ -250,16 +167,6 @@ export function AllocationView({ )}
-