Skip to content

Commit

Permalink
fix: plugin grpc server retry method
Browse files Browse the repository at this point in the history
Signed-off-by: googs1025 <googs1025@gmail.com>
  • Loading branch information
googs1025 committed Dec 11, 2024
1 parent b6b81a6 commit 1ad3980
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions internal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,13 @@ func (plugin *nvidiaDevicePlugin) Serve() error {
go func() {
lastCrashTime := time.Now()
restartCount := 0
maxRestarts := 5
crashTimeoutSeconds := float64(3600) // 1 hour in seconds

for {
// quit if it has been restarted too often
// i.e. if server has crashed more than 5 times and it didn't last more than one hour each time
if restartCount > 5 {
if restartCount > maxRestarts {
// quit
klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource())
}
Expand All @@ -198,17 +200,29 @@ func (plugin *nvidiaDevicePlugin) Serve() error {
break
}

klog.Infof("GRPC server for '%s' crashed with error: %v", plugin.rm.Resource(), err)

timeSinceLastCrash := time.Since(lastCrashTime).Seconds()
lastCrashTime = time.Now()
if timeSinceLastCrash > 3600 {

klog.Errorf(
"GRPC server for '%s' crashed with error: %v. Retry attempt: %d, Time since last crash: %.2f seconds",
plugin.rm.Resource(),
err,
restartCount+1,
timeSinceLastCrash,
)

if timeSinceLastCrash > crashTimeoutSeconds {
// it has been more than one hour since the last crash; reset the count
// so that it reflects only the recent crash frequency
restartCount = 0
} else {
restartCount++
}

// add a small delay before restarting to prevent tight loops
retryDelay := 5 * time.Second
klog.Infof("Waiting for %v before attempting to restart GRPC server for '%s'", retryDelay, plugin.rm.Resource())
time.Sleep(retryDelay) // Wait for 5 seconds before attempting to restart
}
}()

Expand Down

0 comments on commit 1ad3980

Please sign in to comment.