Skip to content

Commit

Permalink
Add retry logic when an IMEX domain's channels can't be advertised
Browse files Browse the repository at this point in the history
This indicates a transient failure that will be resolved if fewer
clique ids are used in the future for the same domain.

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Oct 27, 2024
1 parent 6ea5c6f commit 20e8011
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions cmd/nvidia-dra-controller/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package main

import (
"context"
"errors"
"fmt"
"strings"
"sync"
Expand All @@ -41,8 +42,12 @@ const (
ImexDomainLabel = "nvidia.com/gpu.imex-domain"
ResourceSliceImexChannelLimit = 128
DriverImexChannelLimit = 2048
RetryTimeout = 1 * time.Minute
)

// transientError wraps an error to mark it as transient, i.e. the operation
// that produced it may succeed if it is retried later. Callers detect it with
// errors.As(err, &transientError{}).
type transientError struct{ error }

// Unwrap returns the wrapped error so that errors.Is and errors.As can see
// through a transientError to the underlying cause. Embedding the error
// interface only promotes Error(), so without this method the chain would
// stop at the transientError itself.
func (e transientError) Unwrap() error { return e.error }

// imexDomainOffsets represents the offset for assigning IMEX channels
// to ResourceSlices for each <imex-domain, cliqueid> combination.
// It is a two-level map that yields an int offset for a pair of string keys;
// presumably the outer key is the IMEX domain ID and the inner key is the
// clique ID, matching the argument order of its add method — TODO confirm.
type imexDomainOffsets map[string]map[string]int
Expand All @@ -51,6 +56,7 @@ type ImexManager struct {
driverName string
resourceSliceImexChannelLimit int
driverImexChannelLimit int
retryTimeout time.Duration
waitGroup sync.WaitGroup
clientset kubernetes.Interface
imexDomainOffsets imexDomainOffsets
Expand Down Expand Up @@ -95,6 +101,7 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
driverName: DriverName,
resourceSliceImexChannelLimit: ResourceSliceImexChannelLimit,
driverImexChannelLimit: DriverImexChannelLimit,
retryTimeout: RetryTimeout,
clientset: clientset,
owner: owner,
driverResources: driverResources,
Expand Down Expand Up @@ -133,14 +140,26 @@ func (m *ImexManager) manageResourceSlices(ctx context.Context) error {
klog.Infof("Adding channels for new IMEX domain: %v", addedDomain)
if err := m.addImexDomain(addedDomain); err != nil {
klog.Errorf("Error adding channels for IMEX domain %s: %v", addedDomain, err)
return
if errors.As(err, &transientError{}) {
klog.Infof("Retrying adding channels for IMEX domain %s after %v", addedDomain, m.retryTimeout)
go func() {
time.Sleep(m.retryTimeout)
addedDomainsCh <- addedDomain
}()
}
}
controller.Update(m.driverResources)
case removedDomain := <-removedDomainsCh:
klog.Infof("Removing channels for removed IMEX domain: %v", removedDomain)
if err := m.removeImexDomain(removedDomain); err != nil {
klog.Errorf("Error removing channels for IMEX domain %s: %v", removedDomain, err)
return
if errors.As(err, &transientError{}) {
klog.Infof("Retrying removing channels for IMEX domain %s after %v", removedDomain, m.retryTimeout)
go func() {
time.Sleep(m.retryTimeout)
removedDomainsCh <- removedDomain
}()
}
}
controller.Update(m.driverResources)
case <-ctx.Done():
Expand Down Expand Up @@ -175,7 +194,7 @@ func (m *ImexManager) addImexDomain(imexDomain string) error {
}
offset, err := m.imexDomainOffsets.add(imexDomainID, cliqueID, m.resourceSliceImexChannelLimit, m.driverImexChannelLimit)
if err != nil {
return fmt.Errorf("error setting offset for IMEX channels: %w", err)
return transientError{fmt.Errorf("error setting offset for IMEX channels: %w", err)}
}
m.driverResources = m.driverResources.DeepCopy()
m.driverResources.Pools[imexDomain] = generateImexChannelPool(imexDomain, offset, m.resourceSliceImexChannelLimit)
Expand Down

0 comments on commit 20e8011

Please sign in to comment.