diff --git a/README.md b/README.md index e9b5159..87021db 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Pod-Reaper is configurable through environment variables. The pod-reaper specifi - `REQUIRE_ANNOTATION_VALUES` comma-separated list of metadata annotation values (of key-value pair) that pod-reaper should require - `DRY_RUN` log pod-reaper's actions but don't actually kill any pods - `MAX_PODS` kill a maximum number of pods on each run +- `POD_SORTING_STRATEGY` sorts pods before killing them (most useful when used with MAX_PODS) - `LOG_LEVEL` control verbosity level of log messages - `LOG_FORMAT` choose between several formats of logging @@ -115,6 +116,29 @@ Default value: unset (which will behave as if it were set to "0") Acceptable values are positive integers. Negative integers will evaluate to 0 and any other values will error. This can be useful to prevent too many pods being killed in one run. Logging messages will reflect that a pod was selected for reaping and that pod was not killed because too many pods were reaped already. +### `POD_SORTING_STRATEGY` + +Default value: unset (which will use the pod ordering returned without specification from the API server). +Accepted values: +- (unset) - use the default ordering from the API server +- `random` (case-sensitive) will randomly shuffle the list of pods before killing +- `oldest-first` (case-sensitive) will sort pods into oldest-first based on the pod's start time. (!! warning below). +- `youngest-first` (case-sensitive) will sort pods into youngest-first based on the pod's start time (!! warning below) +- `pod-deletion-cost` (case-sensitive) will sort pods based on the [pod deletion cost annotation](https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/#pod-deletion-cost). + +!! WARNINGS !! + +Pod start time is not always defined. In these cases, sorting strategies based on age put pods without start times at the +end of the list. 
From my experience, this usually happens during a race condition with the pod initially being scheduled, +but there may be other cases hidden away. + +Using pod-reaper against the kube-system namespace can have some surprising implications. For example, during testing I +found that the kube-scheduler was owned by a master node (not a replicaset/daemon-set) and appeared to effectively ignore +delete actions. The age returned from `kubectl` was reset, but the actual pod start time was unaffected. As a result of +this, I found a looping scenario where the kube scheduler was effectively always the oldest pod. + +In examples/pod-sorting-strategy.yml I mitigated this by excluding on the label `tier: control-plane` + ## Logging Pod reaper logs in JSON format using a logrus (https://github.com/sirupsen/logrus). diff --git a/examples/pod-sorting-strategy.yml b/examples/pod-sorting-strategy.yml new file mode 100644 index 0000000..112251b --- /dev/null +++ b/examples/pod-sorting-strategy.yml @@ -0,0 +1,85 @@ +# example configuration with permission for running pod-reaper against +# an entire cluster + +--- +# namespace for the reaper +apiVersion: v1 +kind: Namespace +metadata: + name: reaper + +--- +# service account for running pod-reaper +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pod-reaper-service-account + namespace: reaper + +--- +# minimal permissions required for running pod-reaper at cluster level +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-reaper-cluster-role +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["list", "delete"] + +--- +# binding the above cluster role (permissions) to the above service account +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: pod-reaper-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-reaper-cluster-role +subjects: +- kind: ServiceAccount + name: pod-reaper-service-account + namespace: 
reaper + +--- +# a basic pod-reaper deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pod-reaper + namespace: reaper # namespace matches above +spec: + replicas: 1 + selector: + matchLabels: + app: pod-reaper + template: + metadata: + labels: + app: pod-reaper + spec: + serviceAccount: pod-reaper-service-account # service account from above + containers: + - name: chaos + image: brianberzins/pod-reaper:alpha + resources: + limits: + cpu: 30m + memory: 30Mi + requests: + cpu: 20m + memory: 20Mi + env: + - name: EXCLUDE_LABEL_KEY + value: "tier" + - name: EXCLUDE_LABEL_VALUES + value: "control-plane" + - name: SCHEDULE + value: "@every 20s" + - name: CHAOS_CHANCE + value: "1" + - name: MAX_PODS + value: "1" + - name: POD_SORTING_STRATEGY + value: "oldest-first" diff --git a/reaper/options.go b/reaper/options.go index 25dceb6..f967925 100644 --- a/reaper/options.go +++ b/reaper/options.go @@ -1,8 +1,12 @@ package main import ( + "errors" "fmt" + v1 "k8s.io/api/core/v1" + "math/rand" "os" + "sort" "strconv" "strings" "time" @@ -26,6 +30,7 @@ const envRequireAnnotationKey = "REQUIRE_ANNOTATION_KEY" const envRequireAnnotationValues = "REQUIRE_ANNOTATION_VALUES" const envDryRun = "DRY_RUN" const envMaxPods = "MAX_PODS" +const envPodSortingStrategy = "POD_SORTING_STRATEGY" const envEvict = "EVICT" type options struct { @@ -38,6 +43,7 @@ type options struct { annotationRequirement *labels.Requirement dryRun bool maxPods int + podSortingStrategy func([]v1.Pod) rules rules.Rules evict bool } @@ -163,6 +169,72 @@ func maxPods() (int, error) { return v, nil } +func getPodDeletionCost(pod v1.Pod) int32 { + // https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/#pod-deletion-cost + costString, present := pod.ObjectMeta.Annotations["controller.kubernetes.io/pod-deletion-cost"] + if !present { + return 0 + } + // per k8s doc: invalid values should be rejected by the API server + cost, _ := strconv.ParseInt(costString, 10, 32) + return int32(cost) 
+} + +func defaultSort([]v1.Pod) {} + +func randomSort(pods []v1.Pod) { + rand.Shuffle(len(pods), func(i, j int) { pods[i], pods[j] = pods[j], pods[i] }) +} + +func oldestFirstSort(pods []v1.Pod) { + sort.Slice(pods, func(i, j int) bool { + if pods[i].Status.StartTime == nil { + return false + } + if pods[j].Status.StartTime == nil { + return true + } + return pods[i].Status.StartTime.Unix() < pods[j].Status.StartTime.Unix() + }) +} + +func youngestFirstSort(pods []v1.Pod) { + sort.Slice(pods, func(i, j int) bool { + if pods[i].Status.StartTime == nil { + return false + } + if pods[j].Status.StartTime == nil { + return true + } + return pods[j].Status.StartTime.Unix() < pods[i].Status.StartTime.Unix() + }) +} + +func podDeletionCostSort(pods []v1.Pod) { + sort.Slice(pods, func(i, j int) bool { + return getPodDeletionCost(pods[i]) < getPodDeletionCost(pods[j]) + }) +} + +func podSortingStrategy() (func([]v1.Pod), error) { + sortingStrategy, present := os.LookupEnv(envPodSortingStrategy) + if !present { + return defaultSort, nil + } + switch sortingStrategy { + case "random": + return randomSort, nil + case "oldest-first": + return oldestFirstSort, nil + case "youngest-first": + return youngestFirstSort, nil + case "pod-deletion-cost": + return podDeletionCostSort, nil + default: + return nil, errors.New("unknown pod sorting strategy") + } +} + func evict() (bool, error) { value, exists := os.LookupEnv(envEvict) if !exists { @@ -173,44 +245,37 @@ func evict() (bool, error) { func loadOptions() (options options, err error) { options.namespace = namespace() - options.gracePeriod, err = gracePeriod() - if err != nil { + if options.gracePeriod, err = gracePeriod(); err != nil { return options, err } options.schedule = schedule() - options.runDuration, err = runDuration() - if err != nil { + if options.runDuration, err = runDuration(); err != nil { return options, err } - options.labelExclusion, err = labelExclusion() - if err != nil { + if options.labelExclusion, err = 
labelExclusion(); err != nil { return options, err } - options.labelRequirement, err = labelRequirement() - if err != nil { + if options.labelRequirement, err = labelRequirement(); err != nil { return options, err } - options.annotationRequirement, err = annotationRequirement() - if err != nil { + if options.annotationRequirement, err = annotationRequirement(); err != nil { return options, err } - options.dryRun, err = dryRun() - if err != nil { + if options.dryRun, err = dryRun(); err != nil { return options, err } - options.maxPods, err = maxPods() - if err != nil { + if options.maxPods, err = maxPods(); err != nil { return options, err } - - options.evict, err = evict() - if err != nil { + if options.podSortingStrategy, err = podSortingStrategy(); err != nil { + return options, err + } + if options.evict, err = evict(); err != nil { return options, err } // rules - options.rules, err = rules.LoadRules() - if err != nil { + if options.rules, err = rules.LoadRules(); err != nil { return options, err } return options, nil diff --git a/reaper/options_test.go b/reaper/options_test.go index c9de430..02be2f3 100644 --- a/reaper/options_test.go +++ b/reaper/options_test.go @@ -1,6 +1,9 @@ package main import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "math/rand" "os" "testing" "time" @@ -16,6 +19,49 @@ func init() { logrus.SetOutput(ioutil.Discard) } +func epocPlus(duration time.Duration) *metav1.Time { + t := metav1.NewTime(time.Unix(0, 0).Add(duration)) + return &t +} +func testPodList() []v1.Pod { + return []v1.Pod{ + { + Status: v1.PodStatus{ + StartTime: epocPlus(2 * time.Minute), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "bearded-dragon", + Annotations: map[string]string{"example/key": "lizard", "controller.kubernetes.io/pod-deletion-cost": "invalid"}, + }, + }, + { + Status: v1.PodStatus{}, + ObjectMeta: metav1.ObjectMeta{ + Name: "nil-start-time", + Annotations: map[string]string{"example/key": "lizard", 
"controller.kubernetes.io/pod-deletion-cost": "-100"}, + }, + }, + { + Status: v1.PodStatus{ + StartTime: epocPlus(5 * time.Minute), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "expensive", + Annotations: map[string]string{"example/key": "not-lizard", "controller.kubernetes.io/pod-deletion-cost": "500"}, + }, + }, + { + Status: v1.PodStatus{ + StartTime: epocPlus(1 * time.Minute), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "corgi", + Annotations: map[string]string{"example/key": "not-lizard"}, + }, + }, + } +} + func TestOptions(t *testing.T) { t.Run("namespace", func(t *testing.T) { t.Run("default", func(t *testing.T) { @@ -250,6 +296,75 @@ func TestOptions(t *testing.T) { assert.Equal(t, 0, maxPods) }) }) + t.Run("pod-sorting", func(t *testing.T) { + t.Run("default", func(t *testing.T) { + os.Clearenv() + sorter, err := podSortingStrategy() + assert.NotNil(t, sorter) + assert.NoError(t, err) + subject := testPodList() + sorter(subject) + assert.Equal(t, testPodList(), subject) + }) + t.Run("invalid", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodSortingStrategy, "not a valid sorting strategy") + _, err := podSortingStrategy() + assert.Error(t, err) + }) + t.Run("random", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodSortingStrategy, "random") + sorter, err := podSortingStrategy() + assert.NotNil(t, sorter) + assert.NoError(t, err) + subject := testPodList() + rand.Seed(2) // magic seed to force switch + sorter(subject) + assert.NotEqual(t, testPodList(), subject) + assert.ElementsMatch(t, testPodList(), subject) + }) + t.Run("oldest-first", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodSortingStrategy, "oldest-first") + sorter, err := podSortingStrategy() + assert.NotNil(t, sorter) + assert.NoError(t, err) + subject := testPodList() + sorter(subject) + assert.Equal(t, "corgi", subject[0].ObjectMeta.Name) + assert.Equal(t, "bearded-dragon", subject[1].ObjectMeta.Name) + assert.Equal(t, "expensive", subject[2].ObjectMeta.Name) + 
assert.Equal(t, "nil-start-time", subject[3].ObjectMeta.Name) + assert.ElementsMatch(t, testPodList(), subject) + }) + t.Run("youngest-first", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodSortingStrategy, "youngest-first") + sorter, err := podSortingStrategy() + assert.NotNil(t, sorter) + assert.NoError(t, err) + subject := testPodList() + sorter(subject) + assert.Equal(t, "expensive", subject[0].ObjectMeta.Name) + assert.Equal(t, "bearded-dragon", subject[1].ObjectMeta.Name) + assert.Equal(t, "corgi", subject[2].ObjectMeta.Name) + assert.Equal(t, "nil-start-time", subject[3].ObjectMeta.Name) + assert.ElementsMatch(t, testPodList(), subject) + }) + t.Run("pod-deletion-cost", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodSortingStrategy, "pod-deletion-cost") + sorter, err := podSortingStrategy() + assert.NotNil(t, sorter) + assert.NoError(t, err) + subject := testPodList() + sorter(subject) + assert.Equal(t, "nil-start-time", subject[0].ObjectMeta.Name) + assert.Equal(t, "expensive", subject[3].ObjectMeta.Name) + assert.ElementsMatch(t, testPodList(), subject) + }) + }) } func TestOptionsLoad(t *testing.T) { diff --git a/reaper/reaper.go b/reaper/reaper.go index bdedb81..79ca6e4 100644 --- a/reaper/reaper.go +++ b/reaper/reaper.go @@ -60,7 +60,9 @@ func (reaper reaper) getPods() *v1.PodList { listOptions.LabelSelector = selector.String() } podList, err := pods.List(listOptions) if err != nil { logrus.WithError(err).Panic("unable to get pods from the cluster") panic(err) } + reaper.options.podSortingStrategy(podList.Items) + diff --git a/rules/rules.go b/rules/rules.go index 8307532..1887a88 100644 --- a/rules/rules.go +++ b/rules/rules.go @@ -10,11 +10,11 @@ import ( // Rule is an interface defining the two functions needed for pod reaper to use the rule. 
type Rule interface { - // load attempts to load the load and returns whether or the not the rule was loaded, a message that will be logged + // load attempts to load the rule and returns whether the rule was loaded, a message that will be logged // when the rule is loaded, and any error that may have occurred during the load. load() (bool, string, error) - // ShouldReap takes a pod and returns whether or not the pod should be reaped based on this rule and a message that + // ShouldReap takes a pod and returns whether the pod should be reaped based on this rule and a message that // will be logged when the pod is selected for reaping. ShouldReap(pod v1.Pod) (bool, string) } @@ -24,7 +24,7 @@ type Rules struct { LoadedRules []Rule } -// LoadRules load all of the rules based on their own implementations +// LoadRules loads all the rules based on their own implementations func LoadRules() (Rules, error) { // load all possible rules rules := []Rule{ @@ -45,14 +45,14 @@ func LoadRules() (Rules, error) { loadedRules = append(loadedRules, rule) } } - // return an err if no rules where loaded + // return an error if no rules were loaded if len(loadedRules) == 0 { return Rules{LoadedRules: loadedRules}, errors.New("no rules were loaded") } return Rules{LoadedRules: loadedRules}, nil } -// ShouldReap takes a pod and return whether or not the pod should be reaped based on this rule. +// ShouldReap takes a pod and returns whether the pod should be reaped based on this rule. // Also includes a message describing why the pod was flagged for reaping. func (rules Rules) ShouldReap(pod v1.Pod) (bool, []string) { var reasons []string