diff --git a/packages/fleet/lib/supervisor.ts b/packages/fleet/lib/supervisor.ts index ead2044ffb9..a4e2b3d9fa2 100644 --- a/packages/fleet/lib/supervisor.ts +++ b/packages/fleet/lib/supervisor.ts @@ -1,3 +1,5 @@ +import tracer from 'dd-trace'; +import type { Span } from 'dd-trace'; import type { DatabaseClient } from './db/client.js'; import type { Knex } from 'knex'; import { logger } from './utils/logger.js'; @@ -24,6 +26,39 @@ type Operation = | { type: 'TERMINATE'; node: Node } | { type: 'REMOVE'; node: Node }; +const Operation = { + asSpanTags: (o: Operation): Record => { + switch (o.type) { + case 'CREATE': + return { + operation: o.type, + routingId: o.routingId, + deploymentId: o.deployment.id + }; + case 'FAIL': + return { + operation: o.type, + nodeId: o.node.id, + reason: o.reason + }; + case 'START': + return { + operation: o.type, + nodeId: o.node.id + }; + case 'OUTDATE': + case 'FINISHING': + case 'FINISHING_TIMEOUT': + case 'TERMINATE': + case 'REMOVE': + return { + operation: o.type, + nodeId: o.node.id + }; + } + } +}; + type SupervisorState = 'stopped' | 'running' | 'stopping'; export const STATE_TIMEOUT_MS = { @@ -82,7 +117,7 @@ export class Supervisor { } public async tick(): Promise> { - // TODO: trace + const span = tracer.startSpan('fleet.supervisor.tick'); try { this.tickCancelled = false; const plan = await this.plan(); @@ -93,7 +128,10 @@ export class Supervisor { return Err(plan.error); } } catch (err) { + span.setTag('error', err); return Err(new FleetError('supervisor_tick_failed', { cause: err })); + } finally { + span.finish(); } } @@ -270,39 +308,42 @@ export class Supervisor { } private async executePlan(plan: Operation[]): Promise { - if (plan.length > 0) { - logger.info('Executing plan:', plan); - } - for (const action of plan) { + for (const operation of plan) { if (this.tickCancelled) { return; } - const result = await this.execute(action); + const active = tracer.scope().active(); + const span = tracer.startSpan(`fleet.supervisor.operation.${operation.type.toLowerCase()}`, { + childOf: active as Span + }); + span.addTags(Operation.asSpanTags(operation)); + const result = await this.execute(operation); if (result.isErr()) { - // TODO: trace - logger.error('Failed to execute action:', result.error, result.error.cause); + span.setTag('error', result.error); + logger.error('Failed to execute operation:', result.error, result.error.cause); } + span.finish(); } } - private async execute(action: Operation): Promise> { - switch (action.type) { + private async execute(operation: Operation): Promise> { + switch (operation.type) { case 'CREATE': - return this.createNode(this.dbClient.db, action); + return this.createNode(this.dbClient.db, operation); case 'START': - return this.startNode(action); + return this.startNode(operation); case 'OUTDATE': - return this.outdateNode(action); + return this.outdateNode(operation); case 'FINISHING': - return this.finishingNode(action); + return this.finishingNode(operation); case 'TERMINATE': - return this.terminateNode(action); + return this.terminateNode(operation); case 'REMOVE': - return this.removeNode(action); + return this.removeNode(operation); case 'FINISHING_TIMEOUT': - return this.finishingTimeout(action); + return this.finishingTimeout(operation); case 'FAIL': - return this.failNode(action); + return this.failNode(operation); } } diff --git a/packages/jobs/lib/runner/runner.ts b/packages/jobs/lib/runner/runner.ts index e43b9cd8488..1b242413604 100644 --- a/packages/jobs/lib/runner/runner.ts +++ b/packages/jobs/lib/runner/runner.ts @@ -35,20 +35,24 @@ function getRunnerId(suffix: string): string { } export async function getRunner(teamId: number): Promise> { - // a runner per account in prod only - const runnerId = isProd ? getRunnerId(`${teamId}`) : getRunnerId('default'); - - const isFleetGloballyEnabled = await featureFlags.isEnabled('fleet', 'global', false); - const isFleetEnabledForTeam = await featureFlags.isEnabled('fleet', `${teamId}`, false); - const isFleetEnabled = isFleetGloballyEnabled || isFleetEnabledForTeam; - if (isFleetEnabled) { - const runner = await getOrStartRunner(runnerId).catch(() => getOrStartRunner(getRunnerId('default'))); + try { + // a runner per account in prod only + const runnerId = isProd ? getRunnerId(`${teamId}`) : getRunnerId('default'); + + const isFleetGloballyEnabled = await featureFlags.isEnabled('fleet', 'global', false); + const isFleetEnabledForTeam = await featureFlags.isEnabled('fleet', `${teamId}`, false); + const isFleetEnabled = isFleetGloballyEnabled || isFleetEnabledForTeam; + if (isFleetEnabled) { + const runner = await getOrStartRunner(runnerId).catch(() => getOrStartRunner(getRunnerId('default'))); + return Ok(runner); + } + + // fallback to default runner if account runner isn't ready yet + const runner = await getOrStartRunnerLegacy(runnerId).catch(() => getOrStartRunnerLegacy(getRunnerId('default'))); return Ok(runner); + } catch (err) { + return Err(new Error(`Failed to get runner for team ${teamId}`, { cause: err })); } - - // fallback to default runner if account runner isn't ready yet - const runner = await getOrStartRunnerLegacy(runnerId).catch(() => getOrStartRunnerLegacy(getRunnerId('default'))); - return Ok(runner); } export async function idle(nodeId: number): Promise> {