Skip to content

Commit

Permalink
mimir.rules.kubernetes: Don't retry unrecoverable errors (#616)
Browse files Browse the repository at this point in the history
Change event processing to immediately stop applying changes to Mimir
in response to events that cause HTTP 4XX errors. These errors indicate
the request is malformed and will never succeed so there's no point
retrying it.

Fixes #610

Signed-off-by: Nick Pillitteri <nick.pillitteri@grafana.com>
  • Loading branch information
56quarters authored Apr 23, 2024
1 parent c91d76e commit 596fade
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ Main (unreleased)

- Fixed issue where text labels displayed outside of component node's boundary. (@hainenber)

- In `mimir.rules.kubernetes`, fix an issue where unrecoverable errors from the Mimir API were retried. (@56quarters)

### Other changes

- Update `alloy-mixin` to use more specific alert group names (for example,
Expand Down
8 changes: 4 additions & 4 deletions internal/component/mimir/rules/kubernetes/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
"github.com/go-kit/log"
"github.com/grafana/alloy/internal/alloy/logging/level"
"github.com/grafana/alloy/internal/component/common/kubernetes"
mimirClient "github.com/grafana/alloy/internal/mimir/client"
"github.com/grafana/alloy/internal/mimir/client"
"github.com/hashicorp/go-multierror"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1"
Expand Down Expand Up @@ -39,7 +39,7 @@ type eventProcessor struct {
stopChan chan struct{}
health healthReporter

mimirClient mimirClient.Interface
mimirClient client.Interface
namespaceLister coreListers.NamespaceLister
ruleLister promListers.PrometheusRuleLister
namespaceSelector labels.Selector
Expand Down Expand Up @@ -68,7 +68,7 @@ func (e *eventProcessor) run(ctx context.Context) {

if err != nil {
retries := e.queue.NumRequeues(evt)
if retries < 5 {
if retries < 5 && client.IsRecoverable(err) {
e.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc()
e.queue.AddRateLimited(evt)
level.Error(e.logger).Log(
Expand All @@ -80,7 +80,7 @@ func (e *eventProcessor) run(ctx context.Context) {
} else {
e.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc()
level.Error(e.logger).Log(
"msg", "failed to process event, max retries exceeded",
"msg", "failed to process event, unrecoverable error or max retries exceeded",
"retries", fmt.Sprintf("%d/5", retries),
"err", err,
)
Expand Down
14 changes: 9 additions & 5 deletions internal/mimir/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"net/url"
"strings"

log "github.com/go-kit/log"
"github.com/go-kit/log"
"github.com/grafana/alloy/internal/mimir/client/internal"
"github.com/grafana/alloy/internal/useragent"
"github.com/grafana/dskit/instrument"
Expand All @@ -22,10 +22,14 @@ import (
)

var (
ErrNoConfig = errors.New("no config exists for this user")
ErrResourceNotFound = errors.New("requested resource not found")
ErrUnrecoverable = errors.New("unrecoverable error response")
)

// IsRecoverable returns true for errors from API requests that can be retried, false otherwise.
func IsRecoverable(err error) bool {
return !errors.Is(err, ErrUnrecoverable)
}

// Config is used to configure a MimirClient.
type Config struct {
ID string
Expand Down Expand Up @@ -123,8 +127,8 @@ func checkResponse(r *http.Response) error {
errMsg = fmt.Sprintf("server returned HTTP status %s: %s", r.Status, msg)
}

if r.StatusCode == http.StatusNotFound {
return ErrResourceNotFound
if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests {
return fmt.Errorf("%w: %s", ErrUnrecoverable, errMsg)
}

return errors.New(errMsg)
Expand Down
82 changes: 82 additions & 0 deletions internal/mimir/client/client_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package client

import (
"io"
"net/http"
"net/url"
"strings"
"testing"

"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -99,3 +101,83 @@ func TestBuildURL(t *testing.T) {
})
}
}

func TestCheckResponseSuccess(t *testing.T) {
tc := []struct {
name string
body string
statusCode int
}{
{
name: "returns nil error for 200 response",
body: "200 message!",
statusCode: http.StatusOK,
},
{
name: "returns nil error for 204 response",
body: "",
statusCode: http.StatusNoContent,
},
}

for _, tt := range tc {
t.Run(tt.name, func(t *testing.T) {
response := &http.Response{
Status: http.StatusText(tt.statusCode),
StatusCode: tt.statusCode,
Body: io.NopCloser(strings.NewReader(tt.body)),
}

err := checkResponse(response)
require.NoError(t, err)
})
}
}

func TestCheckResponseErrors(t *testing.T) {
tc := []struct {
name string
body string
statusCode int
canRetry bool
}{
{
name: "returns correct error for 400 response",
body: "400 message!",
statusCode: http.StatusBadRequest,
canRetry: false,
},
{
name: "returns correct error for 404 response",
body: "404 message!",
statusCode: 404,
canRetry: false,
},
{
name: "returns correct error for 429 response",
body: "429 message!",
statusCode: http.StatusTooManyRequests,
canRetry: true,
},
{
name: "returns correct error for 500 response",
body: "500 message!",
statusCode: http.StatusInternalServerError,
canRetry: true,
},
}

for _, tt := range tc {
t.Run(tt.name, func(t *testing.T) {
response := &http.Response{
Status: http.StatusText(tt.statusCode),
StatusCode: tt.statusCode,
Body: io.NopCloser(strings.NewReader(tt.body)),
}

err := checkResponse(response)
require.Error(t, err)
require.Equal(t, tt.canRetry, IsRecoverable(err), "%+v is not expected recoverable/unrecoverable", err)
})
}
}

0 comments on commit 596fade

Please sign in to comment.