diff --git a/.github/ISSUE_TEMPLATE/NEW_RELEASE.md b/.github/ISSUE_TEMPLATE/NEW_RELEASE.md index 3f9c2c5214..6d47cba36f 100644 --- a/.github/ISSUE_TEMPLATE/NEW_RELEASE.md +++ b/.github/ISSUE_TEMPLATE/NEW_RELEASE.md @@ -14,13 +14,16 @@ Please do not remove items from the checklist At least two for minor or major releases. At least one for a patch release. - [ ] Verify that the changelog in this issue and the CHANGELOG folder is up-to-date - [ ] Use https://github.com/kubernetes/release/tree/master/cmd/release-notes to gather notes. - Example: `release-notes --org kubernetes-sigs --repo kueue --branch release-0.3 --start-sha 4a0ebe7a3c5f2775cdf5fc7d60c23225660f8702 --end-sha a51cf138afe65677f5f5c97f8f8b1bc4887f73d2` + Example: `release-notes --org kubernetes-sigs --repo kueue --branch release-0.3 --start-sha 4a0ebe7a3c5f2775cdf5fc7d60c23225660f8702 --end-sha a51cf138afe65677f5f5c97f8f8b1bc4887f73d2 --dependencies=false --required-author=""` - [ ] For major or minor releases (v$MAJ.$MIN.0), create a new release branch. - - [ ] an OWNER creates a vanilla release branch with + - [ ] An OWNER creates a vanilla release branch with `git branch release-$MAJ.$MIN main` - [ ] An OWNER pushes the new release branch with `git push release-$MAJ.$MIN` -- [ ] Update `README.md`, `CHANGELOG`, `charts/kueue/Chart.yaml` (`appVersion`) and `charts/kueue/values.yaml` (`controllerManager.manager.image.tag`) in the release branch: +- [ ] Update the release branch: + - [ ] Update `RELEASE_BRANCH` and `RELEASE_VERSION` in `Makefile` and run `make prepare-release-branch` + - [ ] Update the `CHANGELOG` + - [ ] Submit a pull request with the changes: - [ ] An OWNER [prepares a draft release](https://github.com/kubernetes-sigs/kueue/releases) - [ ] Write the change log into the draft release. - [ ] Run diff --git a/.github/workflows/odh-release.yml b/.github/workflows/odh-release.yml new file mode 100644 index 0000000000..fd303d217b --- /dev/null +++ b/.github/workflows/odh-release.yml @@ -0,0 +1,54 @@ +# This workflow will compile e2e tests and release them + +name: ODH Release +on: + workflow_dispatch: + inputs: + version: + description: 'Tag to be used for release, i.e.: v0.0.1' + required: true + push: + tags: + - '*' + +env: + KUEUE_RELEASE_VERSION: ${{ github.event.inputs.version || github.ref_name }} + +jobs: + release-odh: + runs-on: ubuntu-latest + + # Permission required to create a release + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + + - name: Set Go + uses: actions/setup-go@v5 + with: + go-version: v1.21 + + - name: Verify that release doesn't exist yet + shell: bash {0} + run: | + gh release view $KUEUE_RELEASE_VERSION + status=$? + if [[ $status -eq 0 ]]; then + echo "Release $KUEUE_RELEASE_VERSION already exists." + exit 1 + fi + env: + GITHUB_TOKEN: ${{ github.TOKEN }} + + - name: Compile tests + run: | + go test -c -o compiled-tests/e2e-singlecluster ./test/e2e/singlecluster/ + + - name: Creates a release in GitHub + run: | + gh release create $KUEUE_RELEASE_VERSION --target $GITHUB_SHA compiled-tests/* + env: + GITHUB_TOKEN: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} + shell: bash diff --git a/.github/workflows/openvex.yaml b/.github/workflows/openvex.yaml index 916faddce1..d4bd909b0c 100644 --- a/.github/workflows/openvex.yaml +++ b/.github/workflows/openvex.yaml @@ -24,9 +24,9 @@ jobs: name: Run vexctl with: product: pkg:golang/sigs.k8s.io/kueue@${{ env.TAG }} - file: /tmp/kueve.vex.json + file: /tmp/kueue.vex.json - name: Upload openvex data env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - gh release upload $TAG /tmp/kueve.vex.json + gh release upload $TAG /tmp/kueue.vex.json diff --git a/CHANGELOG/CHANGELOG-0.6.md b/CHANGELOG/CHANGELOG-0.6.md new file mode 100644 index 0000000000..9f5bdf7137 --- /dev/null +++ b/CHANGELOG/CHANGELOG-0.6.md @@ -0,0 +1,119 @@ +## v0.6.2 + +Changes since `v0.6.1`: + +### Bug or Regression + +- Avoid unnecessary preemptions when there are multiple candidates for preemption with the same admission timestamp (#1880, @alculquicondor) +- Fix Pods in Pod groups stuck with finalizers when deleted immediately after Succeeded (#1916, @alculquicondor) +- Fix preemption to reclaim quota that is blocked by an earlier pending Workload from another ClusterQueue in the same cohort. (#1868, @alculquicondor) +- Reduce number of Workload reconciliations due to wrong equality check. (#1917, @gabesaba) + +### Other (Cleanup or Flake) + +- Improve pod integration performance (#1953, @gabesaba) + +## v0.6.1 + +Changes Since `v0.6.0`: + +### Feature + +- Added MultiKueue worker connection monitoring and reconnect. (#1809, @trasc) +- The Failed pods in a pod-group are finalized once a replacement pods are created. (#1801, @trasc) + +### Bug or Regression + +- Exclude Pod labels, preemptionPolicy and container images when determining whether pods in a pod group have the same shape. (#1760, @alculquicondor) +- Fix incorrect quota management when lendingLimit enabled in preemption (#1826, @kerthcet, @B1F030) +- Fix the configuration for the number of reconcilers for the Pod integration. It was only reconciling one group at a time. (#1837, @alculquicondor) +- Kueue visibility API is no longer installed by default. Users can install it via helm or applying the visibility-api.yaml artifact. (#1764, @trasc) +- WaitForPodsReady: Fix a bug that the requeueState isn't reset. (#1843, @tenzen-y) + +### Other (Cleanup or Flake) + +- Avoid API calls for admission attempts when Workload already has condition Admitted=false (#1845, @alculquicondor) +- Skip requeueing of Workloads when there is a status update for a ClusterQueue, saving on API calls for Workloads that were already attempted for admission. (#1832, @alculquicondor) + +## v0.6.0 + +Changes since `v0.5.0`: + +### API Change + +- A `stopPolicy` field in the ClusterQueue allows to hold or drain a ClusterQueue (#1299, @trasc) +- Add a lendingLimit field in ClusterQueue's quotas, to allow restricting how much of the unused resources by the ClusterQueue can be borrowed by other ClusterQueues in the cohort. + In other words, this allows a quota equal to `nominal-lendingLimit` to be exclusively used by the ClusterQueue. (#1385, @B1F030) +- Add validation for clusterQueue: when cohort is empty, borrowingLimit must be nil. (#1525, @B1F030) +- Allow decrease reclaimable pods to 0 for suspended job (#1277, @yaroslava-serdiuk) +- MultiKueue: Add Path location type for cluster KubeConfigs. (#1640, @trasc) +- MultiKueue: Add garbage collection of deleted Workloads. (#1643, @trasc) +- MultiKueue: Multi cluster job dispatching for k8s Job. This doesn't include support for live status updates. (#1313, @trasc) +- Support for a mechanism to suspend a running Job without requeueing (#1252, @vicentefb) +- Support for preemption while borrowing (#1397, @mimowo) +- The leaderElection field in the Configuration API is now defaulted. + Leader election is now enabled by default. (#1598, @astefanutti) +- Visibility API: Add an endpoint that allows a user to fetch information about pending workloads and their position in LocalQueue. (#1365, @PBundyra) +- Visibility API: Introduce an on-demand API endpoint for fetching pending workloads in a ClusterQueue. (#1251, @PBundyra) +- Visibility API: extend the information returned for the pending workloads in a ClusterQueue, including the workload position in the queue. (#1362, @PBundyra) +- WaitForPodsReady: Add a config field to allow admins to configure the timestamp used when sorting workloads that were evicted due to their Pods not becoming ready on time. (#1542, @nstogner) +- WaitForPodsReady: Support a backoff re-queueing mechanism with configurable limit. (#1709, @tenzen-y) + +### Feature + +- Add Prebuilt Workload support for JobSets. (#1575, @trasc) +- Add events for transitions of the provisioning AdmissionCheck (#1271, @stuton) +- Add prebuilt workload support for batch/job. (#1358, @trasc) +- Add support for groups of plain Pods. (#1319, @achernevskii) +- Allow configuring featureGates on helm charts. (#1314, @B1F030) +- At log level 6, the usage of ClusterQueues and cohorts is included in logs. + + The status of the internal cache and queues is also logged on demand when a SIGUSR2 is sent to kueue, regardless of the log level. (#1528, @alculquicondor) +- Changing tolerations in an inadmissible job triggers an admission retry with the updated tolerations. (#1304, @stuton) +- Increase the default number of reconcilers for Pod and Workload objects to 5, each. (#1589, @alculquicondor) +- Jobs preserve their position in the queue if the number of pods change before being admitted (#1223, @yaroslava-serdiuk) +- Make the image build setting CGO_ENABLED configurable (#1391, @anishasthana) +- MultiKueue: Add live status updates for multikueue JobSets (#1668, @trasc) +- MultiKueue: Support for JobSets. (#1606, @trasc) +- Support RayCluster as a queue-able workload in Kueue (#1520, @vicentefb) +- Support for retry of provisioning request. + + When `ProvisioningACC` is enabled, and there are existing ProvisioningRequests, they are going to be recreated. + This may cause job evictions for some long-running jobs which were using the ProvisioningRequests. (#1351, @mimowo) +- The image gcr.io/k8s-staging-kueue/debug:main, along with the script ./hack/dump_cache.sh can be used to trigger a dump of the internal cache into the logs. (#1541, @alculquicondor) +- The priority sorting within the cohort could be disabled by setting the feature gate PrioritySortingWithinCohort to false (#1406, @yaroslava-serdiuk) +- Visibility API: Add HA support. (#1554, @astefanutti) + +### Bug or Regression + +- Add Missing RBAC on finalizer sub-resources for job integrations. (#1486, @astefanutti) +- Add Mutating WebhookConfigurations for the AdmissionCheck, RayJob, and JobSet to helm charts (#1567, @B1F030) +- Add Validating/Mutating WebhookConfigurations for the KubeflowJobs like PyTorchJob (#1460, @tenzen-y) +- Added event for QuotaReserved and fixed event for Admitted to trigger when admission checks complete (#1436, @trasc) +- Avoid finished Workloads from blocking quota after a Kueue restart (#1689, @trasc) +- Avoid recreating a Workload for a finished Job and finalize a job when the workload is declared finished. (#1383, @achernevskii) +- Do not (re)create ProvReq if the state of admission check is Ready (#1617, @mimowo) +- Fix Kueue crashing at the log level 6 when re-admitting workloads (#1644, @mimowo) +- Fix a bug in the pod integration that unexpected errors will occur when the pod isn't found (#1512, @achernevskii) +- Fix a bug that plain pods managed by kueue will remain in a terminating state, due to a finalizer (#1342, @tenzen-y) +- Fix client-go libraries bug that can not operate clusterScoped resources like ClusterQueue and ResourceFlavor. (#1294, @tenzen-y) +- Fix fungibility policy `Preempt` where it was not able to utilize the next flavor if preemption was not possible. (#1366, @alculquicondor) +- Fix handling of preemption within a cohort when there is no borrowingLimit. In that case, + during preemption, the permitted resources to borrow were calculated as if borrowingLimit=0, instead of unlimited. + + As a consequence, when using `reclaimWithinCohort`, it was possible that a workload, scheduled to ClusterQueue with no borrowingLimit, would preempt more workloads than needed, even though it could fit by borrowing. (#1561, @mimowo) +- Fix the synchronization of the admission check state based on recreated ProvisioningRequest (#1585, @mimowo) +- Fixed fungibility policy `whenCanPreempt: Preempt`. The admission should happen in the flavor for which preemptions were issued. (#1332, @alculquicondor) +- Kueue replicas are advertised as Ready only once the webhooks are functional. + + This allows users to wait with the first requests until the Kueue deployment is available, so that the + early requests don't fail. (#1676, @mimowo) +- Pending workload from StrictFIFO ClusterQueue doesn't block borrowing from other ClusterQueues (#1399, @yaroslava-serdiuk) +- Remove deleted pending workloads from the cache (#1679, @astefanutti) +- Remove finalizer from Workloads that are orphaned (have no owners). (#1523, @achernevskii) +- Trigger an eviction for an admitted Job after an admission check changed state to Rejected. (#1562, @trasc) +- Webhooks are served in non-leading replicas (#1509, @astefanutti) + +### Other (Cleanup or Flake) + +- Expose utilization functions to setup jobframework reconcilers and webhooks (#1630, @tenzen-y) \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index ef863e4494..8e16cd690c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ARG BASE_IMAGE # Build the manager binary FROM --platform=${BUILDPLATFORM} ${BUILDER_IMAGE} as builder -ARG CGO_ENABLED +ARG CGO_ENABLED=1 ARG TARGETARCH WORKDIR /workspace diff --git a/Dockerfile.rhoai b/Dockerfile.rhoai new file mode 100644 index 0000000000..cb2fcee4df --- /dev/null +++ b/Dockerfile.rhoai @@ -0,0 +1,28 @@ +# Build the manager binary +FROM registry.access.redhat.com/ubi9/go-toolset:1.21 as builder + +WORKDIR /workspace + +RUN git config --global --add safe.directory /workspace + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum +# cache deps before building and copying source so that we don't need to re-download as much +# and so that source changes don't invalidate our downloaded layer +RUN go mod download + +# Copy the go source +COPY . . + +# Build +RUN make build GO_BUILD_ENV='CGO_ENABLED=1 GOOS=linux' + +FROM registry.access.redhat.com/ubi9/ubi:latest + +WORKDIR / +USER root +COPY --from=builder /workspace/bin/manager /manager +USER 65532:65532 + +ENTRYPOINT ["/manager"] \ No newline at end of file diff --git a/Makefile b/Makefile index 3aa2c9464f..5ebe93fed6 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,11 @@ E2E_K8S_VERSIONS ?= 1.26.12 1.27.9 1.28.5 1.29.0 # Default will delete default kind cluster KIND_CLUSTER_NAME ?= kind +# Number of processes to use during integration tests to run specs within a +# suite in parallel. Suites still run sequentially. User may set this value to 1 +# to run without parallelism. +INTEGRATION_NPROCS ?= 4 + # Setting SHELL to bash allows bash commands to be executed by recipes. # This is a requirement for 'setup-envtest.sh' in the test target. # Options are set to exit when a recipe line exits non-zero or a piped command fails. @@ -81,6 +86,11 @@ version_pkg = sigs.k8s.io/kueue/pkg/version LD_FLAGS += -X '$(version_pkg).GitVersion=$(GIT_TAG)' LD_FLAGS += -X '$(version_pkg).GitCommit=$(shell git rev-parse HEAD)' +# Update these variables when preparing a new release or a release branch. +# Then run `make prepare-release-branch` +RELEASE_VERSION=v0.6.2 +RELEASE_BRANCH=release-0.6 + .PHONY: all all: generate fmt vet build @@ -162,7 +172,7 @@ test: gotestsum ## Run tests. .PHONY: test-integration test-integration: gomod-download envtest ginkgo mpi-operator-crd ray-operator-crd jobset-operator-crd kf-training-operator-crd cluster-autoscaler-crd ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \ - $(GINKGO) $(GINKGO_ARGS) --junit-report=junit.xml --output-dir=$(ARTIFACTS) -v $(INTEGRATION_TARGET) + $(GINKGO) $(GINKGO_ARGS) -procs=$(INTEGRATION_NPROCS) --junit-report=junit.xml --output-dir=$(ARTIFACTS) -v $(INTEGRATION_TARGET) CREATE_KIND_CLUSTER ?= true .PHONY: test-e2e @@ -192,8 +202,8 @@ ci-lint: golangci-lint $(GOLANGCI_LINT) run --timeout 15m0s .PHONY: verify -verify: gomod-verify vet ci-lint fmt-verify toc-verify manifests generate update-helm generate-apiref - git --no-pager diff --exit-code config/components apis charts/kueue/templates client-go site/content/en/docs/reference +verify: gomod-verify vet ci-lint fmt-verify toc-verify manifests generate update-helm generate-apiref prepare-release-branch + git --no-pager diff --exit-code config/components apis charts/kueue/templates client-go site/ ##@ Build @@ -243,7 +253,7 @@ ifndef ignore-not-found ignore-not-found = false endif -clean-manifests = (cd config/components/manager && $(KUSTOMIZE) edit set image controller=gcr.io/k8s-staging-kueue/kueue:main) +clean-manifests = (cd config/components/manager && $(KUSTOMIZE) edit set image controller=gcr.io/k8s-staging-kueue/kueue:$(RELEASE_BRANCH)) .PHONY: install install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. @@ -277,6 +287,7 @@ artifacts: kustomize yq helm $(KUSTOMIZE) build config/dev -o artifacts/manifests-dev.yaml $(KUSTOMIZE) build config/alpha-enabled -o artifacts/manifests-alpha-enabled.yaml $(KUSTOMIZE) build config/prometheus -o artifacts/prometheus.yaml + $(KUSTOMIZE) build config/visibility -o artifacts/visibility-api.yaml @$(call clean-manifests) # Update the image tag and policy $(YQ) e '.controllerManager.manager.image.repository = "$(IMAGE_REPO)" | .controllerManager.manager.image.tag = "$(GIT_TAG)" | .controllerManager.manager.image.pullPolicy = "IfNotPresent"' -i charts/kueue/values.yaml @@ -286,6 +297,12 @@ artifacts: kustomize yq helm # Revert the image changes $(YQ) e '.controllerManager.manager.image.repository = "$(STAGING_IMAGE_REGISTRY)/$(IMAGE_NAME)" | .controllerManager.manager.image.tag = "main" | .controllerManager.manager.image.pullPolicy = "Always"' -i charts/kueue/values.yaml +.PHONY: prepare-release-branch +prepare-release-branch: yq kustomize + sed -r 's/v[0-9]+\.[0-9]+\.[0-9]+/$(RELEASE_VERSION)/g' -i README.md -i site/config.toml + $(YQ) e '.appVersion = "$(RELEASE_VERSION)"' -i charts/kueue/Chart.yaml + @$(call clean-manifests) + ##@ Tools # Build an image that can be used with kubectl debug @@ -315,7 +332,7 @@ kustomize: ## Download kustomize locally if necessary. ENVTEST = $(PROJECT_DIR)/bin/setup-envtest .PHONY: envtest envtest: ## Download envtest-setup locally if necessary. - @GOBIN=$(PROJECT_DIR)/bin GO111MODULE=on $(GO_CMD) install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest + @GOBIN=$(PROJECT_DIR)/bin GO111MODULE=on $(GO_CMD) install sigs.k8s.io/controller-runtime/tools/setup-envtest@v0.0.0-20240320141353-395cfc7486e6 GINKGO = $(PROJECT_DIR)/bin/ginkgo .PHONY: ginkgo @@ -372,7 +389,7 @@ jobset-operator-crd: cp -f $(JOBSETROOT)/config/components/crd/bases/* $(PROJECT_DIR)/dep-crds/jobset-operator/ -CAROOT = $(shell $(GO_CMD) list -m -f "{{.Dir}}" k8s.io/autoscaler/cluster-autoscaler) +CAROOT = $(shell $(GO_CMD) list -m -f "{{.Dir}}" k8s.io/autoscaler/cluster-autoscaler/apis) .PHONY: cluster-autoscaler-crd cluster-autoscaler-crd: mkdir -p $(PROJECT_DIR)/dep-crds/cluster-autoscaler/ diff --git a/OWNERS b/OWNERS index c4f13a002b..8add8425da 100644 --- a/OWNERS +++ b/OWNERS @@ -1,17 +1,23 @@ # See the OWNERS docs at https://go.k8s.io/owners approvers: -- alculquicondor -- tenzen-y +- anishasthana +- astefanutti +- jbusche +- kpostoffice +- maxusmusti +- sutaakar +- tedhtchang reviewers: -- alculquicondor -- denkensk -- kerthcet -- mimowo -- tenzen-y -- trasc - -emeritus_approvers: -- ahg-g -- denkensk +- anishasthana +- astefanutti +- ChristianZaccaria +- dimakis +- Fiona-Waters +- jbusche +- kpostoffice +- maxusmusti +- MichaelClifford +- sutaakar +- tedhtchang diff --git a/README.md b/README.md index bd92508549..553bfe6626 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Read the [overview](https://kueue.sigs.k8s.io/docs/overview/) to learn more. To install the latest release of Kueue in your cluster, run the following command: ```shell -kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.5.3/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml ``` The controller runs in the `kueue-system` namespace. diff --git a/charts/kueue/Chart.yaml b/charts/kueue/Chart.yaml index 3b19e9071b..5604645330 100644 --- a/charts/kueue/Chart.yaml +++ b/charts/kueue/Chart.yaml @@ -18,4 +18,4 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.5.3" +appVersion: "v0.6.2" diff --git a/charts/kueue/templates/_helpers.tpl b/charts/kueue/templates/_helpers.tpl index d207db1faa..9b0a8a5606 100644 --- a/charts/kueue/templates/_helpers.tpl +++ b/charts/kueue/templates/_helpers.tpl @@ -74,3 +74,21 @@ FeatureGates - --feature-gates={{ $features | trimSuffix "," }} {{- end }} {{- end }} + +{{/* +IsFeatureGateEnabled - outputs true if the feature gate .Feature is enabled in the .List +Usage: + {{- if include "kueue.isFeatureGateEnabled" (dict "List" .Values.controllerManager.featureGates "Feature" "VisibilityOnDemand") }} +*/}} +{{- define "kueue.isFeatureGateEnabled" -}} +{{- $feature := .Feature }} +{{- $enabled := false }} +{{- range .List }} +{{- if (and (eq .name $feature) (eq .enabled true)) }} +{{- $enabled = true }} +{{- end }} +{{- end }} +{{- if $enabled }} +{{- $enabled -}} +{{- end }} +{{- end }} diff --git a/charts/kueue/templates/visibility/apiservice.yaml b/charts/kueue/templates/visibility/apiservice.yaml index 31cbdf453a..497cea57f5 100644 --- a/charts/kueue/templates/visibility/apiservice.yaml +++ b/charts/kueue/templates/visibility/apiservice.yaml @@ -1,3 +1,4 @@ +{{- if include "kueue.isFeatureGateEnabled" (dict "List" .Values.controllerManager.featureGates "Feature" "VisibilityOnDemand") }} apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: @@ -11,3 +12,4 @@ spec: namespace: '{{ .Release.Namespace }}' version: v1alpha1 versionPriority: 100 +{{- end }} diff --git a/charts/kueue/templates/visibility/role_binding.yaml b/charts/kueue/templates/visibility/role_binding.yaml index f67ff03225..7cdcc3e376 100644 --- a/charts/kueue/templates/visibility/role_binding.yaml +++ b/charts/kueue/templates/visibility/role_binding.yaml @@ -1,3 +1,4 @@ +{{- if include "kueue.isFeatureGateEnabled" (dict "List" .Values.controllerManager.featureGates "Feature" "VisibilityOnDemand") }} apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: @@ -13,3 +14,4 @@ subjects: - kind: ServiceAccount name: kueue-controller-manager namespace: '{{ .Release.Namespace }}' +{{- end }} diff --git a/charts/kueue/templates/visibility/service.yaml b/charts/kueue/templates/visibility/service.yaml index fd88e4f986..662de0024a 100644 --- a/charts/kueue/templates/visibility/service.yaml +++ b/charts/kueue/templates/visibility/service.yaml @@ -1,3 +1,4 @@ +{{- if include "kueue.isFeatureGateEnabled" (dict "List" .Values.controllerManager.featureGates "Feature" "VisibilityOnDemand") }} apiVersion: v1 kind: Service metadata: @@ -11,3 +12,4 @@ spec: targetPort: 8082 selector: control-plane: controller-manager +{{- end }} diff --git a/charts/kueue/templates/webhook/webhook.yaml b/charts/kueue/templates/webhook/webhook.yaml index 1de8e46241..413ebb38e4 100644 --- a/charts/kueue/templates/webhook/webhook.yaml +++ b/charts/kueue/templates/webhook/webhook.yaml @@ -287,10 +287,8 @@ webhooks: - v1beta1 operations: - CREATE - - UPDATE resources: - workloads - - workloads/status sideEffects: None --- apiVersion: admissionregistration.k8s.io/v1 diff --git a/charts/kueue/values.yaml b/charts/kueue/values.yaml index 0c1a90d8af..94725329f9 100644 --- a/charts/kueue/values.yaml +++ b/charts/kueue/values.yaml @@ -9,9 +9,9 @@ enablePrometheus: false enableCertManager: false # Customize controlerManager controllerManager: - #featureGates: - # - name: PartialAdmission - # enabled: true + #featureGates: + # - name: PartialAdmission + # enabled: true kubeRbacProxy: image: repository: gcr.io/kubebuilder/kube-rbac-proxy @@ -22,8 +22,6 @@ controllerManager: manager: image: repository: gcr.io/k8s-staging-kueue/kueue - # tag, if defined will use the given image tag, else Chart.AppVersion will be used - tag: main # This should be set to 'IfNotPresent' for released version pullPolicy: Always resources: diff --git a/cmd/kueue/main.go b/cmd/kueue/main.go index 56e5c59e36..7137d32bb0 100644 --- a/cmd/kueue/main.go +++ b/cmd/kueue/main.go @@ -35,7 +35,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/validation/field" utilfeature "k8s.io/apiserver/pkg/util/feature" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "k8s.io/client-go/discovery" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" diff --git a/config/alpha-enabled/kustomization.yaml b/config/alpha-enabled/kustomization.yaml index 21e4b163d9..ad4c4899e3 100644 --- a/config/alpha-enabled/kustomization.yaml +++ b/config/alpha-enabled/kustomization.yaml @@ -3,6 +3,7 @@ # Use default settings as a base. resources: - ../default +- ../visibility patches: # Modify feature gates to enable AllAlpha=true diff --git a/config/components/manager/controller_manager_config.yaml b/config/components/manager/controller_manager_config.yaml index 15adb60561..0c93b6957b 100644 --- a/config/components/manager/controller_manager_config.yaml +++ b/config/components/manager/controller_manager_config.yaml @@ -13,7 +13,7 @@ leaderElection: controller: groupKindConcurrency: Job.batch: 5 - Pod.: 5 + Pod: 5 Workload.kueue.x-k8s.io: 5 LocalQueue.kueue.x-k8s.io: 1 ClusterQueue.kueue.x-k8s.io: 1 @@ -22,8 +22,9 @@ clientConnection: qps: 50 burst: 100 #pprofBindAddress: :8082 -#waitForPodsReady: -# enable: true +waitForPodsReady: + enable: true + blockAdmission: false #manageJobsWithoutQueueName: true #internalCertManagement: # enable: false diff --git a/config/components/manager/kustomization.yaml b/config/components/manager/kustomization.yaml index b7b7487da3..8848b738b3 100644 --- a/config/components/manager/kustomization.yaml +++ b/config/components/manager/kustomization.yaml @@ -17,4 +17,4 @@ kind: Kustomization images: - name: controller newName: gcr.io/k8s-staging-kueue/kueue - newTag: main + newTag: release-0.6 diff --git a/config/components/manager/manager.yaml b/config/components/manager/manager.yaml index 6121c3672d..d15d0c2dc3 100644 --- a/config/components/manager/manager.yaml +++ b/config/components/manager/manager.yaml @@ -10,19 +10,12 @@ kind: Deployment metadata: name: controller-manager namespace: system - labels: - control-plane: controller-manager spec: - selector: - matchLabels: - control-plane: controller-manager replicas: 1 template: metadata: annotations: kubectl.kubernetes.io/default-container: manager - labels: - control-plane: controller-manager spec: securityContext: runAsNonRoot: true diff --git a/config/components/rbac/role.yaml b/config/components/rbac/role.yaml index c7ed22eb2d..f634072dfd 100644 --- a/config/components/rbac/role.yaml +++ b/config/components/rbac/role.yaml @@ -92,6 +92,14 @@ rules: - list - update - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch - apiGroups: - autoscaling.x-k8s.io resources: diff --git a/config/components/webhook/manifests.yaml b/config/components/webhook/manifests.yaml index 46378039a5..c7bed96edc 100644 --- a/config/components/webhook/manifests.yaml +++ b/config/components/webhook/manifests.yaml @@ -267,10 +267,8 @@ webhooks: - v1beta1 operations: - CREATE - - UPDATE resources: - workloads - - workloads/status sideEffects: None --- apiVersion: admissionregistration.k8s.io/v1 diff --git a/config/components/webhook/service.yaml b/config/components/webhook/service.yaml index c9f09d4b8c..acd6493f95 100644 --- a/config/components/webhook/service.yaml +++ b/config/components/webhook/service.yaml @@ -8,5 +8,3 @@ spec: - port: 443 protocol: TCP targetPort: 9443 - selector: - control-plane: controller-manager diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index f4afb3eade..29eebb3b76 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -17,7 +17,6 @@ resources: - ../components/rbac - ../components/manager - ../components/internalcert -- ../components/visibility # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # ../components/crd/kustomization.yaml - ../components/webhook diff --git a/config/rhoai/auth_proxy_service_patch.yaml b/config/rhoai/auth_proxy_service_patch.yaml new file mode 100644 index 0000000000..7836a7eeb7 --- /dev/null +++ b/config/rhoai/auth_proxy_service_patch.yaml @@ -0,0 +1,6 @@ +$patch: delete +apiVersion: v1 +kind: Service +metadata: + name: controller-manager-metrics-service + namespace: system diff --git a/config/rhoai/batch-user-rolebinding.yaml b/config/rhoai/batch-user-rolebinding.yaml new file mode 100644 index 0000000000..eb532afffc --- /dev/null +++ b/config/rhoai/batch-user-rolebinding.yaml @@ -0,0 +1,12 @@ +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: batch-user-rolebinding +subjects: + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: 'system:authenticated' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: batch-user-role diff --git a/config/rhoai/binding_admin_roles.yaml b/config/rhoai/binding_admin_roles.yaml new file mode 100644 index 0000000000..61be865881 --- /dev/null +++ b/config/rhoai/binding_admin_roles.yaml @@ -0,0 +1,18 @@ +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: admin-rolebinding +subjects: + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: rhods-admins + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: odh-admins + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: dedicated-admins +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kueue-batch-admin-role diff --git a/config/rhoai/clusterqueue_viewer_role_patch.yaml b/config/rhoai/clusterqueue_viewer_role_patch.yaml new file mode 100644 index 0000000000..bdb73407ca --- /dev/null +++ b/config/rhoai/clusterqueue_viewer_role_patch.yaml @@ -0,0 +1,7 @@ +# patch to add clusterqueue-viewer-role to batch-user +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: clusterqueue-viewer-role + labels: + rbac.kueue.x-k8s.io/batch-user: 'true' diff --git a/config/rhoai/kueue-metrics-service.yaml b/config/rhoai/kueue-metrics-service.yaml new file mode 100644 index 0000000000..1f0a18d17f --- /dev/null +++ b/config/rhoai/kueue-metrics-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: metrics-service +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: metrics + selector: + app.kubernetes.io/name: kueue + app.kubernetes.io/part-of: kueue +status: + loadBalancer: {} diff --git a/config/rhoai/kustomization.yaml b/config/rhoai/kustomization.yaml new file mode 100644 index 0000000000..fec8dff5fa --- /dev/null +++ b/config/rhoai/kustomization.yaml @@ -0,0 +1,57 @@ +# RHOAI configuration for Kueue. + +# Adds namespace to all resources. +namespace: opendatahub + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +# Note that it should also match with the prefix (text before '-') of the namespace +# field above. +namePrefix: kueue- + +configMapGenerator: +- name: rhoai-config + envs: + - params.env + +configurations: + - params.yaml + +vars: +- name: image + objref: + kind: ConfigMap + name: rhoai-config + apiVersion: v1 + fieldref: + fieldpath: data.odh-kueue-controller-image + +# Labels to add to all resources and selectors. +commonLabels: + app.kubernetes.io/name: kueue + app.kubernetes.io/component: controller + +resources: +- ../components/crd +- ../components/rbac +- ../components/manager +- ../components/internalcert +- ../components/webhook +- monitor.yaml +- binding_admin_roles.yaml +- webhook_network_policy.yaml +- batch-user-rolebinding.yaml +- kueue-metrics-service.yaml + +patches: +# Mount the controller config file for loading manager configurations +# through a ComponentConfig type +- path: manager_config_patch.yaml +- path: manager_webhook_patch.yaml +- path: manager_metrics_patch.yaml +- path: auth_proxy_service_patch.yaml +- path: mutating_webhook_patch.yaml +- path: validating_webhook_patch.yaml +- path: clusterqueue_viewer_role_patch.yaml +- path: remove_default_namespace.yaml diff --git a/config/rhoai/manager_config_patch.yaml b/config/rhoai/manager_config_patch.yaml new file mode 100644 index 0000000000..fe17c5a694 --- /dev/null +++ b/config/rhoai/manager_config_patch.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + image: $(image) + args: + - "--config=/controller_manager_config.yaml" + - "--zap-log-level=2" + volumeMounts: + - name: manager-config + mountPath: /controller_manager_config.yaml + subPath: controller_manager_config.yaml + volumes: + - name: manager-config + configMap: + name: manager-config diff --git a/config/rhoai/manager_metrics_patch.yaml b/config/rhoai/manager_metrics_patch.yaml new file mode 100644 index 0000000000..f4e3c86f1a --- /dev/null +++ b/config/rhoai/manager_metrics_patch.yaml @@ -0,0 +1,14 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + ports: + - containerPort: 8080 + protocol: TCP + name: metrics diff --git a/config/rhoai/manager_webhook_patch.yaml b/config/rhoai/manager_webhook_patch.yaml new file mode 100644 index 0000000000..738de350b7 --- /dev/null +++ b/config/rhoai/manager_webhook_patch.yaml @@ -0,0 +1,23 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert diff --git a/config/rhoai/monitor.yaml b/config/rhoai/monitor.yaml new file mode 100644 index 0000000000..e8fd0618d8 --- /dev/null +++ b/config/rhoai/monitor.yaml @@ -0,0 +1,13 @@ +# Prometheus Pod Monitor (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: controller-manager-metrics-monitor + namespace: system +spec: + selector: + matchLabels: + app.kubernetes.io/name: kueue + app.kubernetes.io/component: controller + podMetricsEndpoints: + - port: metrics diff --git a/config/rhoai/mutating_webhook_patch.yaml b/config/rhoai/mutating_webhook_patch.yaml new file mode 100644 index 0000000000..31a80969a3 --- /dev/null +++ b/config/rhoai/mutating_webhook_patch.yaml @@ -0,0 +1,7 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook-configuration +webhooks: + - $patch: delete + name: mpod.kb.io diff --git a/config/rhoai/params.env b/config/rhoai/params.env new file mode 100644 index 0000000000..ca431060e4 --- /dev/null +++ b/config/rhoai/params.env @@ -0,0 +1 @@ +odh-kueue-controller-image=registry.k8s.io/kueue/kueue:v0.6.2 diff --git a/config/rhoai/params.yaml b/config/rhoai/params.yaml new file mode 100644 index 0000000000..43509ff293 --- /dev/null +++ b/config/rhoai/params.yaml @@ -0,0 +1,3 @@ +varReference: + - path: spec/template/spec/containers[]/image + kind: Deployment diff --git a/config/rhoai/remove_default_namespace.yaml b/config/rhoai/remove_default_namespace.yaml new file mode 100644 index 0000000000..9a52c0573d --- /dev/null +++ b/config/rhoai/remove_default_namespace.yaml @@ -0,0 +1,5 @@ +$patch: delete +apiVersion: v1 +kind: Namespace +metadata: + name: system diff --git a/config/rhoai/validating_webhook_patch.yaml b/config/rhoai/validating_webhook_patch.yaml new file mode 100644 index 0000000000..711b05dc29 --- /dev/null +++ b/config/rhoai/validating_webhook_patch.yaml @@ -0,0 +1,7 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: validating-webhook-configuration +webhooks: + - $patch: delete + name: vpod.kb.io diff --git a/config/rhoai/webhook_network_policy.yaml b/config/rhoai/webhook_network_policy.yaml new file mode 100644 index 0000000000..08845704c2 --- /dev/null +++ b/config/rhoai/webhook_network_policy.yaml @@ -0,0 +1,14 @@ +kind: NetworkPolicy +apiVersion: networking.k8s.io/v1 +metadata: + name: webhook-server + namespace: system +spec: + podSelector: + matchLabels: {} + ingress: + - ports: + - protocol: TCP + port: 9443 + policyTypes: + - Ingress diff --git a/config/visibility/kustomization.yaml b/config/visibility/kustomization.yaml new file mode 100644 index 0000000000..8dbcc994d9 --- /dev/null +++ b/config/visibility/kustomization.yaml @@ -0,0 +1,7 @@ +# This overlay builds the visibility component to be used in combination +# with other overlays. + +namespace: kueue-system +namePrefix: kueue- +resources: +- ../components/visibility diff --git a/go.mod b/go.mod index ab96339d49..dfcc91b11d 100644 --- a/go.mod +++ b/go.mod @@ -8,19 +8,21 @@ require ( github.com/kubeflow/common v0.4.7 github.com/kubeflow/mpi-operator v0.4.0 github.com/kubeflow/training-operator v1.7.0 - github.com/onsi/ginkgo/v2 v2.15.0 + github.com/onsi/ginkgo/v2 v2.16.0 github.com/onsi/gomega v1.31.1 github.com/open-policy-agent/cert-controller v0.10.1 github.com/prometheus/client_golang v1.18.0 github.com/prometheus/client_model v0.5.0 github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 go.uber.org/zap v1.26.0 - k8s.io/api v0.29.1 - k8s.io/apimachinery v0.29.1 + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 + k8s.io/api v0.29.2 + k8s.io/apiextensions-apiserver v0.29.0 + k8s.io/apimachinery v0.29.2 k8s.io/apiserver v0.29.1 - k8s.io/autoscaler/cluster-autoscaler v0.0.0-20230925095857-cf8c507d2421 - k8s.io/client-go v0.29.1 - k8s.io/code-generator v0.29.1 + k8s.io/autoscaler/cluster-autoscaler/apis v0.0.0-20240325113845-0130d33747bb + k8s.io/client-go v0.29.2 + k8s.io/code-generator v0.29.2 k8s.io/component-base v0.29.1 k8s.io/component-helpers v0.29.1 k8s.io/klog/v2 v2.110.1 @@ -100,16 +102,15 @@ require ( go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.18.0 // indirect - golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect golang.org/x/mod v0.14.0 // indirect golang.org/x/net v0.20.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect - golang.org/x/sync v0.5.0 // indirect + golang.org/x/sync v0.6.0 // indirect golang.org/x/sys v0.16.0 // indirect golang.org/x/term v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect golang.org/x/time v0.3.0 // indirect - golang.org/x/tools v0.16.1 // indirect + golang.org/x/tools v0.17.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/appengine v1.6.8 // indirect google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect @@ -121,7 +122,6 @@ require ( gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.29.0 // indirect k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 // indirect k8s.io/kms v0.29.1 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 // indirect diff --git a/go.sum b/go.sum index 49931f4ec3..4b1c37dd2c 100644 --- a/go.sum +++ b/go.sum @@ -153,8 +153,8 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= -github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= +github.com/onsi/ginkgo/v2 v2.16.0 h1:7q1w9frJDzninhXxjZd+Y/x54XNjG/UlRLIYPZafsPM= +github.com/onsi/ginkgo/v2 v2.16.0/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= github.com/onsi/gomega v1.31.1 h1:KYppCUK+bUgAZwHOu7EXVBKyQA6ILvOESHkn/tgoqvo= github.com/onsi/gomega v1.31.1/go.mod h1:y40C95dwAD1Nz36SsEnxvfFe8FFfNxzI5eJ0EYGyAy0= github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4= @@ -275,8 +275,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -307,8 +307,8 @@ golang.org/x/tools v0.0.0-20200505023115-26f46d2f7ef8/go.mod h1:EkVYQZoAsY45+roY golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= -golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/tools v0.17.0 h1:FvmRgNOcs3kOa+T20R1uhfP9F6HgG2mfxDv1vrx1Htc= +golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -345,20 +345,20 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.29.1 h1:DAjwWX/9YT7NQD4INu49ROJuZAAAP/Ijki48GUPzxqw= -k8s.io/api v0.29.1/go.mod h1:7Kl10vBRUXhnQQI8YR/R327zXC8eJ7887/+Ybta+RoQ= +k8s.io/api v0.29.2 h1:hBC7B9+MU+ptchxEqTNW2DkUosJpp1P+Wn6YncZ474A= +k8s.io/api v0.29.2/go.mod h1:sdIaaKuU7P44aoyyLlikSLayT6Vb7bvJNCX105xZXY0= k8s.io/apiextensions-apiserver v0.29.0 h1:0VuspFG7Hj+SxyF/Z/2T0uFbI5gb5LRgEyUVE3Q4lV0= k8s.io/apiextensions-apiserver v0.29.0/go.mod h1:TKmpy3bTS0mr9pylH0nOt/QzQRrW7/h7yLdRForMZwc= -k8s.io/apimachinery v0.29.1 h1:KY4/E6km/wLBguvCZv8cKTeOwwOBqFNjwJIdMkMbbRc= -k8s.io/apimachinery v0.29.1/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/apimachinery v0.29.2 h1:EWGpfJ856oj11C52NRCHuU7rFDwxev48z+6DSlGNsV8= +k8s.io/apimachinery v0.29.2/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= k8s.io/apiserver v0.29.1 h1:e2wwHUfEmMsa8+cuft8MT56+16EONIEK8A/gpBSco+g= k8s.io/apiserver v0.29.1/go.mod h1:V0EpkTRrJymyVT3M49we8uh2RvXf7fWC5XLB0P3SwRw= -k8s.io/autoscaler/cluster-autoscaler v0.0.0-20230925095857-cf8c507d2421 h1:h8E8WHuSP3aArI7k9n/E4r98S5PkXQ+hiQSprylSy7Q= -k8s.io/autoscaler/cluster-autoscaler v0.0.0-20230925095857-cf8c507d2421/go.mod h1:CIDuB0kVY16I0kzpUJa+eYDbj9xbclVbs8aJtgL3WzA= -k8s.io/client-go v0.29.1 h1:19B/+2NGEwnFLzt0uB5kNJnfTsbV8w6TgQRz9l7ti7A= -k8s.io/client-go v0.29.1/go.mod h1:TDG/psL9hdet0TI9mGyHJSgRkW3H9JZk2dNEUS7bRks= -k8s.io/code-generator v0.29.1 h1:8ba8BdtSmAVHgAMpzThb/fuyQeTRtN7NtN7VjMcDLew= -k8s.io/code-generator v0.29.1/go.mod h1:FwFi3C9jCrmbPjekhaCYcYG1n07CYiW1+PAPCockaos= +k8s.io/autoscaler/cluster-autoscaler/apis v0.0.0-20240325113845-0130d33747bb h1:ycQ/tSpcJEUHHx0pv6MXdq4NcRflCvFX6SMwmKROiis= +k8s.io/autoscaler/cluster-autoscaler/apis v0.0.0-20240325113845-0130d33747bb/go.mod h1:LPhCVj3E5Lp9W6HGVlW664m/X+KN2firfF3wtBBji54= +k8s.io/client-go v0.29.2 h1:FEg85el1TeZp+/vYJM7hkDlSTFZ+c5nnK44DJ4FyoRg= +k8s.io/client-go v0.29.2/go.mod h1:knlvFZE58VpqbQpJNbCbctTVXcd35mMyAAwBdpt4jrA= +k8s.io/code-generator v0.29.2 h1:c9/iw2KnNpw2IRV+wwuG/Wns2TjPSgjWzbbjTevyiHI= +k8s.io/code-generator v0.29.2/go.mod h1:FwFi3C9jCrmbPjekhaCYcYG1n07CYiW1+PAPCockaos= k8s.io/component-base v0.29.1 h1:MUimqJPCRnnHsskTTjKD+IC1EHBbRCVyi37IoFBrkYw= k8s.io/component-base v0.29.1/go.mod h1:fP9GFjxYrLERq1GcWWZAE3bqbNcDKDytn2srWuHTtKc= k8s.io/component-helpers v0.29.1 h1:54MMEDu6xeJmMtAKztsPwu0kJKr4+jCUzaEIn2UXRoc= diff --git a/hack/create-multikueue-kubeconfig.sh b/hack/create-multikueue-kubeconfig.sh new file mode 100644 index 0000000000..8658f0c2c8 --- /dev/null +++ b/hack/create-multikueue-kubeconfig.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +# Copyright 2024 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +KUBECONFIG_OUT=${1:-kubeconfig} +MULTIKUEUE_SA=multikueue-sa +NAMESPACE=kueue-system + +echo "Creating a custom MultiKueue Role and Service Account" +kubectl apply -f - < ${KUBECONFIG_OUT} < ${ARTIFACTS}/worker1.kubeconfig - $KIND get kubeconfig --name $WORKER2_KIND_CLUSTER_NAME > ${ARTIFACTS}/worker2.kubeconfig - kubectl config use-context kind-${MANAGER_KIND_CLUSTER_NAME} - kubectl create namespace kueue-system - kubectl create secret generic multikueue1 -n kueue-system --from-file=kubeconfig=${ARTIFACTS}/worker1.kubeconfig - kubectl create secret generic multikueue2 -n kueue-system --from-file=kubeconfig=${ARTIFACTS}/worker2.kubeconfig fi } @@ -121,9 +115,22 @@ function kueue_deploy { cluster_kueue_deploy $WORKER2_KIND_CLUSTER_NAME } +function prepare_secrets { + kubectl config use-context kind-${WORKER1_KIND_CLUSTER_NAME} + source ${SOURCE_DIR}/create-multikueue-kubeconfig.sh ${ARTIFACTS}/worker1.kubeconfig + + kubectl config use-context kind-${WORKER2_KIND_CLUSTER_NAME} + source ${SOURCE_DIR}/create-multikueue-kubeconfig.sh ${ARTIFACTS}/worker2.kubeconfig + + kubectl config use-context kind-${MANAGER_KIND_CLUSTER_NAME} + kubectl create secret generic multikueue1 -n kueue-system --from-file=kubeconfig=${ARTIFACTS}/worker1.kubeconfig + kubectl create secret generic multikueue2 -n kueue-system --from-file=kubeconfig=${ARTIFACTS}/worker2.kubeconfig +} + trap cleanup EXIT startup kind_load kueue_deploy +prepare_secrets $GINKGO $GINKGO_ARGS --junit-report=junit.xml --output-dir=$ARTIFACTS -v ./test/e2e/multikueue/... diff --git a/hack/update-helm.sh b/hack/update-helm.sh index b726593c2f..d258fcbe21 100755 --- a/hack/update-helm.sh +++ b/hack/update-helm.sh @@ -269,4 +269,11 @@ for output_file in ${DEST_VISIBILITY_DIR}/*.yaml; do if [ "$(cat $output_file | $YQ '.subjects.[] | has("namespace")')" = "true" ]; then $YQ -N -i '.subjects.[].namespace = "{{ .Release.Namespace }}"' $output_file fi + + { + echo '{{- if include "kueue.isFeatureGateEnabled" (dict "List" .Values.controllerManager.featureGates "Feature" "VisibilityOnDemand") }}' + cat $output_file + echo "{{- end }}" + }> ${output_file}.tmp + mv ${output_file}.tmp ${output_file} done diff --git a/keps/976-plain-pods/README.md b/keps/976-plain-pods/README.md index 55f07bdb43..a12be4eae5 100644 --- a/keps/976-plain-pods/README.md +++ b/keps/976-plain-pods/README.md @@ -565,8 +565,9 @@ in-memory Workload, then it is considered unmatching and the Workload is evicted In the Pod-group reconciler: 1. If the Pod is not terminated and doesn't have a deletionTimestamp, create a Workload for the pod group if one does not exist. -2. Remove Pod finalizers if the Pod is terminated and the Workload is finished, has a deletion - timestamp or is finished. +2. Remove Pod finalizers if: + - The Pod is terminated and the Workload is finished or has a deletion timestamp. + - The Pod Failed and a valid replacement pod was created for it. 3. Build the in-memory Workload. If its podset counters are greater than the stored Workload, then evict the Workload. 4. For gated pods: @@ -574,16 +575,15 @@ In the Pod-group reconciler: 5. If the number of succeeded pods is equal to the admission count, mark the Workload as Finished and remove the finalizers from the Pods. -Note that we are only removing Pod finalizers once the Workload is finished. This is a simple way of -managing finalizers, but it might lead to too many Pods lingering in etcd for a long time after -terminated. - ### Retrying Failed Pods The Pod group will generally only be considered finished if all the Pods finish with a Succeeded phase. This allows the user to send replacement Pods when a Pod in the group fails or if the group is preempted. The replacement Pods can have any name, but they must point to the same pod group. +Once a replacement Pod is created, and Kueue has added it as an owner of the Workload, the +Failed pod will be finalized. If multiple Pods have Failed, a new Pod is assumed to replace +the Pod that failed first. To declare that a group is failed, a user can execute one of the following actions: 1. Issue a Delete for the Workload object. The controller would terminate all running Pods and @@ -708,4 +708,4 @@ While this would be a clean approach, this proposal is targetting users that don wrapping their Pods, and adding one would be a bigger effort than adding annotations. Such amount of effort could be similar to migrating from plain Pods to the Job API, which is already supported. -We could reconsider this based on user feedback. \ No newline at end of file +We could reconsider this based on user feedback. diff --git a/pkg/cache/clusterqueue.go b/pkg/cache/clusterqueue.go index 7fe289a09e..949338e7ff 100644 --- a/pkg/cache/clusterqueue.go +++ b/pkg/cache/clusterqueue.go @@ -453,6 +453,29 @@ func updateUsage(wi *workload.Info, flvUsage FlavorResourceQuantities, m int64) } } +func updateCohortUsage(wi *workload.Info, cq *ClusterQueue, m int64) { + for _, ps := range wi.TotalRequests { + for wlRes, wlResFlv := range ps.Flavors { + v, wlResExist := ps.Requests[wlRes] + flv, flvExist := cq.Cohort.Usage[wlResFlv] + if flvExist && wlResExist { + if _, exists := flv[wlRes]; exists { + after := cq.Usage[wlResFlv][wlRes] - cq.guaranteedQuota(wlResFlv, wlRes) + // rollback update cq.Usage + before := after - v*m + if before > 0 { + flv[wlRes] -= before + } + // simulate updating cq.Usage + if after > 0 { + flv[wlRes] += after + } + } + } + } + } +} + func (c *ClusterQueue) addLocalQueue(q *kueue.LocalQueue) error { qKey := queueKey(q) if _, ok := c.localQueues[qKey]; ok { diff --git a/pkg/cache/snapshot.go b/pkg/cache/snapshot.go index e96cbd4867..3a0cc60dde 100644 --- a/pkg/cache/snapshot.go +++ b/pkg/cache/snapshot.go @@ -41,18 +41,26 @@ func (s *Snapshot) RemoveWorkload(wl *workload.Info) { delete(cq.Workloads, workload.Key(wl.Obj)) updateUsage(wl, cq.Usage, -1) if cq.Cohort != nil { - updateUsage(wl, cq.Cohort.Usage, -1) + if features.Enabled(features.LendingLimit) { + updateCohortUsage(wl, cq, -1) + } else { + updateUsage(wl, cq.Cohort.Usage, -1) + } } } -// AddWorkload removes a workload from its corresponding ClusterQueue and +// AddWorkload adds a workload from its corresponding ClusterQueue and // updates resource usage. func (s *Snapshot) AddWorkload(wl *workload.Info) { cq := s.ClusterQueues[wl.ClusterQueue] cq.Workloads[workload.Key(wl.Obj)] = wl updateUsage(wl, cq.Usage, 1) if cq.Cohort != nil { - updateUsage(wl, cq.Cohort.Usage, 1) + if features.Enabled(features.LendingLimit) { + updateCohortUsage(wl, cq, 1) + } else { + updateUsage(wl, cq.Cohort.Usage, 1) + } } } diff --git a/pkg/cache/snapshot_test.go b/pkg/cache/snapshot_test.go index d54ae71dbb..abc6a8ab71 100644 --- a/pkg/cache/snapshot_test.go +++ b/pkg/cache/snapshot_test.go @@ -864,3 +864,446 @@ func TestSnapshotAddRemoveWorkload(t *testing.T) { }) } } + +func TestSnapshotAddRemoveWorkloadWithLendingLimit(t *testing.T) { + _ = features.SetEnable(features.LendingLimit, true) + flavors := []*kueue.ResourceFlavor{ + utiltesting.MakeResourceFlavor("default").Obj(), + } + clusterQueues := []*kueue.ClusterQueue{ + utiltesting.MakeClusterQueue("lend-a"). + Cohort("lend"). + ResourceGroup( + *utiltesting.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "10", "", "4").Obj(), + ). + Preemption(kueue.ClusterQueuePreemption{ + WithinClusterQueue: kueue.PreemptionPolicyLowerPriority, + ReclaimWithinCohort: kueue.PreemptionPolicyLowerPriority, + }). + Obj(), + utiltesting.MakeClusterQueue("lend-b"). + Cohort("lend"). + ResourceGroup( + *utiltesting.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "10", "", "6").Obj(), + ). + Preemption(kueue.ClusterQueuePreemption{ + WithinClusterQueue: kueue.PreemptionPolicyNever, + ReclaimWithinCohort: kueue.PreemptionPolicyAny, + }). + Obj(), + } + workloads := []kueue.Workload{ + *utiltesting.MakeWorkload("lend-a-1", ""). + Request(corev1.ResourceCPU, "1"). + ReserveQuota(utiltesting.MakeAdmission("lend-a").Assignment(corev1.ResourceCPU, "default", "1").Obj()). + Obj(), + *utiltesting.MakeWorkload("lend-a-2", ""). + Request(corev1.ResourceCPU, "9"). + ReserveQuota(utiltesting.MakeAdmission("lend-a").Assignment(corev1.ResourceCPU, "default", "9").Obj()). + Obj(), + *utiltesting.MakeWorkload("lend-a-3", ""). + Request(corev1.ResourceCPU, "6"). + ReserveQuota(utiltesting.MakeAdmission("lend-a").Assignment(corev1.ResourceCPU, "default", "6").Obj()). + Obj(), + *utiltesting.MakeWorkload("lend-b-1", ""). + Request(corev1.ResourceCPU, "4"). + ReserveQuota(utiltesting.MakeAdmission("lend-b").Assignment(corev1.ResourceCPU, "default", "4").Obj()). + Obj(), + } + + ctx := context.Background() + cl := utiltesting.NewClientBuilder().WithLists(&kueue.WorkloadList{Items: workloads}).Build() + + cqCache := New(cl) + for _, flv := range flavors { + cqCache.AddOrUpdateResourceFlavor(flv) + } + for _, cq := range clusterQueues { + if err := cqCache.AddClusterQueue(ctx, cq); err != nil { + t.Fatalf("Couldn't add ClusterQueue to cache: %v", err) + } + } + wlInfos := make(map[string]*workload.Info, len(workloads)) + for _, cq := range cqCache.clusterQueues { + for _, wl := range cq.Workloads { + wlInfos[workload.Key(wl.Obj)] = wl + } + } + initialSnapshot := cqCache.Snapshot() + initialCohortResources := initialSnapshot.ClusterQueues["lend-a"].Cohort.RequestableResources + cases := map[string]struct { + remove []string + add []string + want Snapshot + }{ + "remove all then add all": { + remove: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + add: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + want: initialSnapshot, + }, + "remove all": { + remove: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove workload, but still using quota over GuaranteedQuota": { + remove: []string{"/lend-a-2"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 1_000}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 7_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 4_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove wokload, using same quota as GuaranteedQuota": { + remove: []string{"/lend-a-1", "/lend-a-2"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 6_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 4_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove workload, using less quota than GuaranteedQuota": { + remove: []string{"/lend-a-2", "/lend-a-3"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 1_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 4_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove all then add workload, using less quota than GuaranteedQuota": { + remove: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + add: []string{"/lend-a-1"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 1_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove all then add workload, using same quota as GuaranteedQuota": { + remove: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + add: []string{"/lend-a-3"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 6_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + "remove all then add workload, using quota over GuaranteedQuota": { + remove: []string{"/lend-a-1", "/lend-a-2", "/lend-a-3", "/lend-b-1"}, + add: []string{"/lend-a-2"}, + want: func() Snapshot { + cohort := &Cohort{ + Name: "lend", + AllocatableResourceGeneration: 2, + RequestableResources: initialCohortResources, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 3_000}, + }, + } + return Snapshot{ + ClusterQueues: map[string]*ClusterQueue{ + "lend-a": { + Name: "lend-a", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-a"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 9_000}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 6_000, + }, + }, + }, + "lend-b": { + Name: "lend-b", + Cohort: cohort, + Workloads: make(map[string]*workload.Info), + ResourceGroups: cqCache.clusterQueues["lend-b"].ResourceGroups, + FlavorFungibility: defaultFlavorFungibility, + AllocatableResourceGeneration: 1, + Usage: FlavorResourceQuantities{ + "default": {corev1.ResourceCPU: 0}, + }, + GuaranteedQuota: FlavorResourceQuantities{ + "default": { + corev1.ResourceCPU: 4_000, + }, + }, + }, + }, + } + }(), + }, + } + cmpOpts := append(snapCmpOpts, + cmpopts.IgnoreFields(ClusterQueue{}, "NamespaceSelector", "Preemption", "Status"), + cmpopts.IgnoreFields(Snapshot{}, "ResourceFlavors"), + cmpopts.IgnoreTypes(&workload.Info{})) + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + snap := cqCache.Snapshot() + for _, name := range tc.remove { + snap.RemoveWorkload(wlInfos[name]) + } + for _, name := range tc.add { + snap.AddWorkload(wlInfos[name]) + } + if diff := cmp.Diff(tc.want, snap, cmpOpts...); diff != "" { + t.Errorf("Unexpected snapshot state after operations (-want,+got):\n%s", diff) + } + }) + } +} diff --git a/pkg/controller/admissionchecks/multikueue/jobset_adapter_test.go b/pkg/controller/admissionchecks/multikueue/jobset_adapter_test.go index 61ac7de838..3796a03287 100644 --- a/pkg/controller/admissionchecks/multikueue/jobset_adapter_test.go +++ b/pkg/controller/admissionchecks/multikueue/jobset_adapter_test.go @@ -331,7 +331,7 @@ func TestWlReconcileJobset(t *testing.T) { worker1Builder = worker1Builder.WithLists(&kueue.WorkloadList{Items: tc.worker1Workloads}, &jobset.JobSetList{Items: tc.worker1JobSets}) worker1Client := worker1Builder.Build() - w1remoteClient := newRemoteClient(managerClient, nil, defaultOrigin) + w1remoteClient := newRemoteClient(managerClient, nil, nil, defaultOrigin, "") w1remoteClient.client = worker1Client cRec.remoteClients["worker1"] = w1remoteClient diff --git a/pkg/controller/admissionchecks/multikueue/multikueuecluster.go b/pkg/controller/admissionchecks/multikueue/multikueuecluster.go index beb13f025f..2b957b5395 100644 --- a/pkg/controller/admissionchecks/multikueue/multikueuecluster.go +++ b/pkg/controller/admissionchecks/multikueue/multikueuecluster.go @@ -23,6 +23,7 @@ import ( "os" "strings" "sync" + "sync/atomic" "time" corev1 "k8s.io/api/core/v1" @@ -36,26 +37,50 @@ import ( "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" ) +const ( + eventChBufferSize = 10 + + // this set will provide waiting time between 0 to 5m20s + retryIncrement = 5 * time.Second + retryMaxSteps = 7 +) + +// retryAfter returns an exponentially increasing interval between +// 0 and 2^(retryMaxSteps-1) * retryIncrement +func retryAfter(failedAttempts uint) time.Duration { + if failedAttempts == 0 { + return 0 + } + return (1 << (min(failedAttempts, retryMaxSteps) - 1)) * retryIncrement +} + type clientWithWatchBuilder func(config []byte, options client.Options) (client.WithWatch, error) type remoteClient struct { - localClient client.Client - client client.WithWatch - wlUpdateCh chan<- event.GenericEvent - watchCancel map[string]func() - kubeconfig []byte - origin string + clusterName string + localClient client.Client + client client.WithWatch + wlUpdateCh chan<- event.GenericEvent + watchEndedCh chan<- event.GenericEvent + watchCancel func() + kubeconfig []byte + origin string + + forceReconnect atomic.Bool + failedConnAttempts uint // For unit testing only. There is now need of creating fully functional remote clients in the unit tests // and creating valid kubeconfig content is not trivial. @@ -63,12 +88,13 @@ type remoteClient struct { builderOverride clientWithWatchBuilder } -func newRemoteClient(localClient client.Client, wlUpdateCh chan<- event.GenericEvent, origin string) *remoteClient { +func newRemoteClient(localClient client.Client, wlUpdateCh, watchEndedCh chan<- event.GenericEvent, origin, clusterName string) *remoteClient { rc := &remoteClient{ - wlUpdateCh: wlUpdateCh, - localClient: localClient, - origin: origin, - watchCancel: map[string]func(){}, + clusterName: clusterName, + wlUpdateCh: wlUpdateCh, + watchEndedCh: watchEndedCh, + localClient: localClient, + origin: origin, } return rc } @@ -107,13 +133,19 @@ func (*workloadKueueWatcher) GetWorkloadKey(o runtime.Object) (types.NamespacedN } // setConfig - will try to recreate the k8s client and restart watching if the new config is different than -// the one currently used. -func (rc *remoteClient) setConfig(watchCtx context.Context, kubeconfig []byte) error { - if equality.Semantic.DeepEqual(kubeconfig, rc.kubeconfig) { - return nil +// the one currently used or a reconnect was requested. +// If the encountered error is not permanent the duration after which a retry should be done is returned. +func (rc *remoteClient) setConfig(watchCtx context.Context, kubeconfig []byte) (*time.Duration, error) { + configChanged := !equality.Semantic.DeepEqual(kubeconfig, rc.kubeconfig) + if !configChanged && !rc.forceReconnect.Load() { + return nil, nil } rc.StopWatchers() + if configChanged { + rc.kubeconfig = kubeconfig + rc.failedConnAttempts = 0 + } builder := newClientWithWatch if rc.builderOverride != nil { @@ -121,13 +153,16 @@ func (rc *remoteClient) setConfig(watchCtx context.Context, kubeconfig []byte) e } remoteClient, err := builder(kubeconfig, client.Options{Scheme: rc.localClient.Scheme()}) if err != nil { - return err + return nil, err } + rc.client = remoteClient + watchCtx, rc.watchCancel = context.WithCancel(watchCtx) err = rc.startWatcher(watchCtx, kueue.GroupVersion.WithKind("Workload").GroupKind().String(), &workloadKueueWatcher{}) if err != nil { - return err + rc.failedConnAttempts++ + return ptr.To(retryAfter(rc.failedConnAttempts)), err } // add a watch for all the adapters implementing multiKueueWatcher @@ -141,12 +176,14 @@ func (rc *remoteClient) setConfig(watchCtx context.Context, kubeconfig []byte) e // not being able to setup a watcher is not ideal but we can function with only the wl watcher. ctrl.LoggerFrom(watchCtx).V(2).Error(err, "Unable to start the watcher", "kind", kind) // however let's not accept this for now. - return err + rc.failedConnAttempts++ + return ptr.To(retryAfter(rc.failedConnAttempts)), err } } - rc.kubeconfig = kubeconfig - return nil + rc.forceReconnect.Store(false) + rc.failedConnAttempts = 0 + return nil, nil } func (rc *remoteClient) startWatcher(ctx context.Context, kind string, w multiKueueWatcher) error { @@ -158,7 +195,6 @@ func (rc *remoteClient) startWatcher(ctx context.Context, kind string, w multiKu go func() { log.V(2).Info("Starting watch") - defer log.V(2).Info("Watch ended") for r := range newWatcher.ResultChan() { wlKey, err := w.GetWorkloadKey(r.Object) if err != nil { @@ -167,16 +203,23 @@ func (rc *remoteClient) startWatcher(ctx context.Context, kind string, w multiKu rc.queueWorkloadEvent(ctx, wlKey) } } + log.V(2).Info("Watch ended", "ctxErr", ctx.Err()) + // If the context is not yet Done , queue a reconcile to attempt reconnection + if ctx.Err() == nil { + oldReconnect := rc.forceReconnect.Swap(true) + //reconnect if this is the first watch failing. + if !oldReconnect { + log.V(2).Info("Queue reconcile for reconnect", "cluster", rc.clusterName) + rc.queueWatchEndedEvent(ctx) + } + } }() - - rc.watchCancel[kind] = newWatcher.Stop return nil } func (rc *remoteClient) StopWatchers() { - for kind, stop := range rc.watchCancel { - stop() - delete(rc.watchCancel, kind) + if rc.watchCancel != nil { + rc.watchCancel() } } @@ -191,6 +234,15 @@ func (rc *remoteClient) queueWorkloadEvent(ctx context.Context, wlKey types.Name } } +func (rc *remoteClient) queueWatchEndedEvent(ctx context.Context) { + cluster := &kueuealpha.MultiKueueCluster{} + if err := rc.localClient.Get(ctx, types.NamespacedName{Name: rc.clusterName}, cluster); err == nil { + rc.watchEndedCh <- event.GenericEvent{Object: cluster} + } else { + ctrl.LoggerFrom(ctx).Error(err, "sending watch ended event") + } +} + // runGC - lists all the remote workloads having the same multikueue-origin and remove those who // no longer have a local correspondent (missing or awaiting deletion). If the remote workload // is owned by a job, also delete the job. @@ -264,6 +316,10 @@ type clustersReconciler struct { // and creating valid kubeconfig content is not trivial. // The full client creation and usage is validated in the integration and e2e tests. builderOverride clientWithWatchBuilder + + // watchEndedCh - an event chan used to request the reconciliation of the clusters for which the watch loop + // has ended (connection lost). + watchEndedCh chan event.GenericEvent } var _ manager.Runnable = (*clustersReconciler)(nil) @@ -284,13 +340,13 @@ func (c *clustersReconciler) stopAndRemoveCluster(clusterName string) { } } -func (c *clustersReconciler) setRemoteClientConfig(ctx context.Context, clusterName string, kubeconfig []byte, origin string) error { +func (c *clustersReconciler) setRemoteClientConfig(ctx context.Context, clusterName string, kubeconfig []byte, origin string) (*time.Duration, error) { c.lock.Lock() defer c.lock.Unlock() client, found := c.remoteClients[clusterName] if !found { - client = newRemoteClient(c.localClient, c.wlUpdateCh, origin) + client = newRemoteClient(c.localClient, c.wlUpdateCh, c.watchEndedCh, origin, clusterName) if c.builderOverride != nil { client.builderOverride = c.builderOverride } @@ -300,12 +356,11 @@ func (c *clustersReconciler) setRemoteClientConfig(ctx context.Context, clusterN clientLog := ctrl.LoggerFrom(c.rootContext).WithValues("clusterName", clusterName) clientCtx := ctrl.LoggerInto(c.rootContext, clientLog) - if err := client.setConfig(clientCtx, kubeconfig); err != nil { + if retryAfter, err := client.setConfig(clientCtx, kubeconfig); err != nil { ctrl.LoggerFrom(ctx).Error(err, "failed to set kubeConfig in the remote client") - delete(c.remoteClients, clusterName) - return err + return retryAfter, err } - return nil + return nil, nil } func (a *clustersReconciler) controllerFor(acName string) (*remoteClient, bool) { @@ -343,9 +398,13 @@ func (c *clustersReconciler) Reconcile(ctx context.Context, req reconcile.Reques return reconcile.Result{}, c.updateStatus(ctx, cluster, false, "BadConfig", err.Error()) } - if err := c.setRemoteClientConfig(ctx, cluster.Name, kubeConfig, c.origin); err != nil { - log.Error(err, "setting kubeconfig") - return reconcile.Result{}, c.updateStatus(ctx, cluster, false, "ClientConnectionFailed", err.Error()) + if retryAfter, err := c.setRemoteClientConfig(ctx, cluster.Name, kubeConfig, c.origin); err != nil { + log.Error(err, "setting kubeconfig", "retryAfter", retryAfter) + if err := c.updateStatus(ctx, cluster, false, "ClientConnectionFailed", err.Error()); err != nil { + return reconcile.Result{}, err + } else { + return reconcile.Result{RequeueAfter: ptr.Deref(retryAfter, 0)}, nil + } } return reconcile.Result{}, c.updateStatus(ctx, cluster, true, "Active", "Connected") } @@ -433,9 +492,10 @@ func newClustersReconciler(c client.Client, namespace string, gcInterval time.Du localClient: c, configNamespace: namespace, remoteClients: make(map[string]*remoteClient), - wlUpdateCh: make(chan event.GenericEvent, 10), + wlUpdateCh: make(chan event.GenericEvent, eventChBufferSize), gcInterval: gcInterval, origin: origin, + watchEndedCh: make(chan event.GenericEvent, eventChBufferSize), } } @@ -445,9 +505,18 @@ func (c *clustersReconciler) setupWithManager(mgr ctrl.Manager) error { return err } + syncHndl := handler.Funcs{ + GenericFunc: func(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) { + q.Add(reconcile.Request{NamespacedName: types.NamespacedName{ + Name: e.Object.GetName(), + }}) + }, + } + return ctrl.NewControllerManagedBy(mgr). For(&kueuealpha.MultiKueueCluster{}). Watches(&corev1.Secret{}, &secretHandler{client: c.localClient}). + WatchesRawSource(&source.Channel{Source: c.watchEndedCh}, syncHndl). Complete(c) } diff --git a/pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go b/pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go index 1d975baf6a..bf99f4dc55 100644 --- a/pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go +++ b/pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go @@ -17,8 +17,10 @@ limitations under the License. package multikueue import ( + "context" "errors" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -26,7 +28,9 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/watch" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/reconcile" kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1" @@ -38,6 +42,7 @@ import ( var ( errInvalidConfig = errors.New("invalid kubeconfig") + errCannotWatch = errors.New("client cannot watch") ) func fakeClientBuilder(kubeconfig []byte, options client.Options) (client.WithWatch, error) { @@ -45,26 +50,36 @@ func fakeClientBuilder(kubeconfig []byte, options client.Options) (client.WithWa return nil, errInvalidConfig } b, _ := getClientBuilder() + b = b.WithInterceptorFuncs(interceptor.Funcs{ + Watch: func(ctx context.Context, client client.WithWatch, obj client.ObjectList, opts ...client.ListOption) (watch.Interface, error) { + if string(kubeconfig) == "nowatch" { + return nil, errCannotWatch + } + return client.Watch(ctx, obj, opts...) + }, + }) return b.Build(), nil } -func newTestClient(config string) *remoteClient { +func newTestClient(config string, watchCancel func()) *remoteClient { b, _ := getClientBuilder() localClient := b.Build() ret := &remoteClient{ kubeconfig: []byte(config), localClient: localClient, + watchCancel: watchCancel, builderOverride: fakeClientBuilder, } - ret.watchCancel = map[string]func(){ - "test": func() { - ret.kubeconfig = []byte(string(ret.kubeconfig) + " canceled") - }, - } return ret } +func setReconnectState(rc *remoteClient, a uint) *remoteClient { + rc.failedConnAttempts = a + rc.forceReconnect.Store(true) + return rc +} + func makeTestSecret(name string, kubeconfig string) corev1.Secret { return corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ @@ -78,6 +93,9 @@ func makeTestSecret(name string, kubeconfig string) corev1.Secret { } func TestUpdateConfig(t *testing.T) { + cancelCalledCount := 0 + cancelCalled := func() { cancelCalledCount++ } + cases := map[string]struct { reconcileFor string remoteClients map[string]*remoteClient @@ -86,6 +104,8 @@ func TestUpdateConfig(t *testing.T) { wantRemoteClients map[string]*remoteClient wantClusters []kueuealpha.MultiKueueCluster + wantRequeueAfter time.Duration + wantCancelCalled int }{ "new valid client is added": { reconcileFor: "worker1", @@ -113,7 +133,7 @@ func TestUpdateConfig(t *testing.T) { makeTestSecret("worker1", "worker1 kubeconfig"), }, remoteClients: map[string]*remoteClient{ - "worker1": newTestClient("worker1 old kubeconfig"), + "worker1": newTestClient("worker1 old kubeconfig", cancelCalled), }, wantClusters: []kueuealpha.MultiKueueCluster{ *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionTrue, "Active", "Connected").Obj(), @@ -123,6 +143,7 @@ func TestUpdateConfig(t *testing.T) { kubeconfig: []byte("worker1 kubeconfig"), }, }, + wantCancelCalled: 1, }, "update client with valid path config": { reconcileFor: "worker1", @@ -130,7 +151,7 @@ func TestUpdateConfig(t *testing.T) { *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "testdata/worker1KubeConfig").Obj(), }, remoteClients: map[string]*remoteClient{ - "worker1": newTestClient("worker1 old kubeconfig"), + "worker1": newTestClient("worker1 old kubeconfig", cancelCalled), }, wantClusters: []kueuealpha.MultiKueueCluster{ *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "testdata/worker1KubeConfig").Active(metav1.ConditionTrue, "Active", "Connected").Obj(), @@ -140,6 +161,7 @@ func TestUpdateConfig(t *testing.T) { kubeconfig: []byte("worker1 kubeconfig"), }, }, + wantCancelCalled: 1, }, "update client with invalid secret config": { reconcileFor: "worker1", @@ -150,11 +172,15 @@ func TestUpdateConfig(t *testing.T) { makeTestSecret("worker1", "invalid"), }, remoteClients: map[string]*remoteClient{ - "worker1": newTestClient("worker1 old kubeconfig"), + "worker1": newTestClient("worker1 old kubeconfig", cancelCalled), + }, + wantRemoteClients: map[string]*remoteClient{ + "worker1": newTestClient("invalid", nil), }, wantClusters: []kueuealpha.MultiKueueCluster{ *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "invalid kubeconfig").Obj(), }, + wantCancelCalled: 1, }, "update client with invalid path config": { reconcileFor: "worker1", @@ -162,11 +188,12 @@ func TestUpdateConfig(t *testing.T) { *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "").Obj(), }, remoteClients: map[string]*remoteClient{ - "worker1": newTestClient("worker1 old kubeconfig"), + "worker1": newTestClient("worker1 old kubeconfig", cancelCalled), }, wantClusters: []kueuealpha.MultiKueueCluster{ *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "").Active(metav1.ConditionFalse, "BadConfig", "open : no such file or directory").Obj(), }, + wantCancelCalled: 1, }, "missing cluster is removed": { reconcileFor: "worker2", @@ -174,8 +201,8 @@ func TestUpdateConfig(t *testing.T) { *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(), }, remoteClients: map[string]*remoteClient{ - "worker1": newTestClient("worker1 kubeconfig"), - "worker2": newTestClient("worker2 kubeconfig"), + "worker1": newTestClient("worker1 kubeconfig", cancelCalled), + "worker2": newTestClient("worker2 kubeconfig", cancelCalled), }, wantClusters: []kueuealpha.MultiKueueCluster{ *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(), @@ -185,6 +212,97 @@ func TestUpdateConfig(t *testing.T) { kubeconfig: []byte("worker1 kubeconfig"), }, }, + wantCancelCalled: 1, + }, + "update client config, nowatch": { + reconcileFor: "worker1", + clusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(), + }, + secrets: []corev1.Secret{ + makeTestSecret("worker1", "nowatch"), + }, + remoteClients: map[string]*remoteClient{ + "worker1": newTestClient("worker1 old kubeconfig", cancelCalled), + }, + wantRemoteClients: map[string]*remoteClient{ + "worker1": setReconnectState(newTestClient("nowatch", nil), 1), + }, + wantClusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(), + }, + wantRequeueAfter: 5 * time.Second, + wantCancelCalled: 1, + }, + "update client config, nowatch 3rd try": { + reconcileFor: "worker1", + clusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(), + }, + secrets: []corev1.Secret{ + makeTestSecret("worker1", "nowatch"), + }, + remoteClients: map[string]*remoteClient{ + "worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 2), + }, + wantRemoteClients: map[string]*remoteClient{ + "worker1": setReconnectState(newTestClient("nowatch", nil), 3), + }, + wantClusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(), + }, + wantRequeueAfter: 20 * time.Second, + wantCancelCalled: 1, + }, + "failed attempts are set to 0 on successful connection": { + reconcileFor: "worker1", + clusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1"). + KubeConfig(kueuealpha.SecretLocationType, "worker1"). + Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch"). + Obj(), + }, + secrets: []corev1.Secret{ + makeTestSecret("worker1", "good config"), + }, + remoteClients: map[string]*remoteClient{ + "worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 5), + }, + wantRemoteClients: map[string]*remoteClient{ + "worker1": newTestClient("good config", nil), + }, + wantClusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1"). + KubeConfig(kueuealpha.SecretLocationType, "worker1"). + Active(metav1.ConditionTrue, "Active", "Connected"). + Obj(), + }, + wantCancelCalled: 1, + }, + "failed attempts are set to 0 on config change": { + reconcileFor: "worker1", + clusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1"). + KubeConfig(kueuealpha.SecretLocationType, "worker1"). + Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch"). + Obj(), + }, + secrets: []corev1.Secret{ + makeTestSecret("worker1", "invalid"), + }, + remoteClients: map[string]*remoteClient{ + "worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 5), + }, + wantRemoteClients: map[string]*remoteClient{ + "worker1": newTestClient("invalid", nil), + }, + wantClusters: []kueuealpha.MultiKueueCluster{ + *utiltesting.MakeMultiKueueCluster("worker1"). + KubeConfig(kueuealpha.SecretLocationType, "worker1"). + Active(metav1.ConditionFalse, "ClientConnectionFailed", "invalid kubeconfig"). + Obj(), + }, + wantCancelCalled: 1, }, } @@ -204,11 +322,20 @@ func TestUpdateConfig(t *testing.T) { } reconciler.builderOverride = fakeClientBuilder - _, gotErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: tc.reconcileFor}}) + cancelCalledCount = 0 + res, gotErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: tc.reconcileFor}}) if gotErr != nil { t.Errorf("unexpected reconcile error: %s", gotErr) } + if diff := cmp.Diff(tc.wantRequeueAfter, res.RequeueAfter); diff != "" { + t.Errorf("unexpected requeue after (-want/+got):\n%s", diff) + } + + if tc.wantCancelCalled != cancelCalledCount { + t.Errorf("unexpected watch cancel call count want: %d, got: %d", tc.wantCancelCalled, cancelCalledCount) + } + lst := &kueuealpha.MultiKueueClusterList{} gotErr = c.List(ctx, lst) if gotErr != nil { @@ -222,7 +349,12 @@ func TestUpdateConfig(t *testing.T) { } if diff := cmp.Diff(tc.wantRemoteClients, reconciler.remoteClients, cmpopts.EquateEmpty(), - cmp.Comparer(func(a, b remoteClient) bool { return string(a.kubeconfig) == string(b.kubeconfig) })); diff != "" { + cmp.Comparer(func(a, b *remoteClient) bool { + if a.failedConnAttempts != b.failedConnAttempts { + return false + } + return string(a.kubeconfig) == string(b.kubeconfig) + })); diff != "" { t.Errorf("unexpected controllers (-want/+got):\n%s", diff) } }) @@ -341,7 +473,7 @@ func TestRemoteClientGC(t *testing.T) { worker1Builder = worker1Builder.WithLists(&kueue.WorkloadList{Items: tc.workersWorkloads}, &batchv1.JobList{Items: tc.workersJobs}) worker1Client := worker1Builder.Build() - w1remoteClient := newRemoteClient(managerClient, nil, defaultOrigin) + w1remoteClient := newRemoteClient(managerClient, nil, nil, defaultOrigin, "") w1remoteClient.client = worker1Client w1remoteClient.runGC(ctx) diff --git a/pkg/controller/admissionchecks/multikueue/workload_test.go b/pkg/controller/admissionchecks/multikueue/workload_test.go index eb5e62f6e3..5edcbd994b 100644 --- a/pkg/controller/admissionchecks/multikueue/workload_test.go +++ b/pkg/controller/admissionchecks/multikueue/workload_test.go @@ -451,7 +451,7 @@ func TestWlReconcile(t *testing.T) { worker1Builder = worker1Builder.WithLists(&kueue.WorkloadList{Items: tc.worker1Workloads}, &batchv1.JobList{Items: tc.worker1Jobs}) worker1Client := worker1Builder.Build() - w1remoteClient := newRemoteClient(managerClient, nil, defaultOrigin) + w1remoteClient := newRemoteClient(managerClient, nil, nil, defaultOrigin, "") w1remoteClient.client = worker1Client cRec.remoteClients["worker1"] = w1remoteClient @@ -481,7 +481,7 @@ func TestWlReconcile(t *testing.T) { }) worker2Client = worker2Builder.Build() - w2remoteClient := newRemoteClient(managerClient, nil, defaultOrigin) + w2remoteClient := newRemoteClient(managerClient, nil, nil, defaultOrigin, "") w2remoteClient.client = worker2Client cRec.remoteClients["worker2"] = w2remoteClient diff --git a/pkg/controller/admissionchecks/provisioning/controller.go b/pkg/controller/admissionchecks/provisioning/controller.go index d55c72ebb9..9c3910ac6c 100644 --- a/pkg/controller/admissionchecks/provisioning/controller.go +++ b/pkg/controller/admissionchecks/provisioning/controller.go @@ -32,7 +32,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" @@ -478,9 +478,8 @@ func (c *Controller) syncCheckStates(ctx context.Context, wl *kueue.Workload, ch } prFailed := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Failed) - prAccepted := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Provisioned) - prAvailable := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.CapacityAvailable) - log.V(3).Info("Synchronizing admission check state based on provisioning request", "wl", klog.KObj(wl), "check", check, "prName", pr.Name, "failed", prFailed, "accepted", prAccepted, "available", prAvailable) + prProvisioned := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Provisioned) + log.V(3).Info("Synchronizing admission check state based on provisioning request", "wl", klog.KObj(wl), "check", check, "prName", pr.Name, "failed", prFailed, "accepted", prProvisioned) switch { case prFailed: @@ -497,7 +496,7 @@ func (c *Controller) syncCheckStates(ctx context.Context, wl *kueue.Workload, ch checkState.Message = apimeta.FindStatusCondition(pr.Status.Conditions, autoscaling.Failed).Message } } - case prAccepted || prAvailable: + case prProvisioned: if checkState.State != kueue.CheckStateReady { updated = true checkState.State = kueue.CheckStateReady diff --git a/pkg/controller/admissionchecks/provisioning/controller_test.go b/pkg/controller/admissionchecks/provisioning/controller_test.go index 9a8d66774b..1e14bb1a5b 100644 --- a/pkg/controller/admissionchecks/provisioning/controller_test.go +++ b/pkg/controller/admissionchecks/provisioning/controller_test.go @@ -27,7 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -479,37 +479,6 @@ func TestReconcile(t *testing.T) { Obj(), }, }, - "when capacity is available": { - workload: baseWorkload.DeepCopy(), - checks: []kueue.AdmissionCheck{*baseCheck.DeepCopy()}, - flavors: []kueue.ResourceFlavor{*baseFlavor1.DeepCopy(), *baseFlavor2.DeepCopy()}, - configs: []kueue.ProvisioningRequestConfig{*baseConfig.DeepCopy()}, - requests: []autoscaling.ProvisioningRequest{ - *requestWithCondition(baseRequest, autoscaling.CapacityAvailable, metav1.ConditionTrue), - }, - templates: []corev1.PodTemplate{*baseTemplate1.DeepCopy(), *baseTemplate2.DeepCopy()}, - wantWorkloads: map[string]*kueue.Workload{ - baseWorkload.Name: (&utiltesting.WorkloadWrapper{Workload: *baseWorkload.DeepCopy()}). - AdmissionChecks(kueue.AdmissionCheckState{ - Name: "check1", - State: kueue.CheckStateReady, - PodSetUpdates: []kueue.PodSetUpdate{ - { - Name: "ps1", - Annotations: map[string]string{"cluster-autoscaler.kubernetes.io/consume-provisioning-request": "wl-check1-1"}, - }, - { - Name: "ps2", - Annotations: map[string]string{"cluster-autoscaler.kubernetes.io/consume-provisioning-request": "wl-check1-1"}, - }, - }, - }, kueue.AdmissionCheckState{ - Name: "not-provisioning", - State: kueue.CheckStatePending, - }). - Obj(), - }, - }, "when request is provisioned": { workload: baseWorkload.DeepCopy(), checks: []kueue.AdmissionCheck{*baseCheck.DeepCopy()}, diff --git a/pkg/controller/admissionchecks/provisioning/indexer.go b/pkg/controller/admissionchecks/provisioning/indexer.go index cfbe1301da..91f48e02ca 100644 --- a/pkg/controller/admissionchecks/provisioning/indexer.go +++ b/pkg/controller/admissionchecks/provisioning/indexer.go @@ -21,7 +21,7 @@ import ( "fmt" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/manager" diff --git a/pkg/controller/admissionchecks/provisioning/indexer_test.go b/pkg/controller/admissionchecks/provisioning/indexer_test.go index 584cefe866..24e9ddaaf6 100644 --- a/pkg/controller/admissionchecks/provisioning/indexer_test.go +++ b/pkg/controller/admissionchecks/provisioning/indexer_test.go @@ -25,7 +25,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" diff --git a/pkg/controller/core/clusterqueue_controller.go b/pkg/controller/core/clusterqueue_controller.go index 8d15d75b0a..757eb2185b 100644 --- a/pkg/controller/core/clusterqueue_controller.go +++ b/pkg/controller/core/clusterqueue_controller.go @@ -302,11 +302,12 @@ func (r *ClusterQueueReconciler) Update(e event.UpdateEvent) bool { return true } defer r.notifyWatchers(oldCq, newCq) + specUpdated := !equality.Semantic.DeepEqual(oldCq.Spec, newCq.Spec) if err := r.cache.UpdateClusterQueue(newCq); err != nil { log.Error(err, "Failed to update clusterQueue in cache") } - if err := r.qManager.UpdateClusterQueue(context.Background(), newCq); err != nil { + if err := r.qManager.UpdateClusterQueue(context.Background(), newCq, specUpdated); err != nil { log.Error(err, "Failed to update clusterQueue in queue manager") } diff --git a/pkg/controller/core/workload_controller.go b/pkg/controller/core/workload_controller.go index d7c7bebcac..beb372c2f9 100644 --- a/pkg/controller/core/workload_controller.go +++ b/pkg/controller/core/workload_controller.go @@ -24,6 +24,7 @@ import ( "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" + "k8s.io/apimachinery/pkg/api/equality" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -148,6 +149,12 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c ctx = ctrl.LoggerInto(ctx, log) log.V(2).Info("Reconciling Workload") + // If a deactivated workload is re-activated, we need to reset the RequeueState. + if wl.Status.RequeueState != nil && ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(&wl) { + wl.Status.RequeueState = nil + return ctrl.Result{}, workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) + } + if len(wl.ObjectMeta.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() { return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, &wl) } @@ -210,25 +217,25 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } - if !r.queues.QueueForWorkloadExists(&wl) { + switch { + case !r.queues.QueueForWorkloadExists(&wl): log.V(3).Info("Workload is inadmissible because of missing LocalQueue", "localQueue", klog.KRef(wl.Namespace, wl.Spec.QueueName)) - workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("LocalQueue %s doesn't exist", wl.Spec.QueueName)) - err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) - return ctrl.Result{}, client.IgnoreNotFound(err) - } - - if !cqOk { + if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("LocalQueue %s doesn't exist", wl.Spec.QueueName)) { + err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) + return ctrl.Result{}, client.IgnoreNotFound(err) + } + case !cqOk: log.V(3).Info("Workload is inadmissible because of missing ClusterQueue", "clusterQueue", klog.KRef("", cqName)) - workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s doesn't exist", cqName)) - err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) - return ctrl.Result{}, client.IgnoreNotFound(err) - } - - if !r.cache.ClusterQueueActive(cqName) { + if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s doesn't exist", cqName)) { + err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) + return ctrl.Result{}, client.IgnoreNotFound(err) + } + case !r.cache.ClusterQueueActive(cqName): log.V(3).Info("Workload is inadmissible because ClusterQueue is inactive", "clusterQueue", klog.KRef("", cqName)) - workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is inactive", cqName)) - err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) - return ctrl.Result{}, client.IgnoreNotFound(err) + if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is inactive", cqName)) { + err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true) + return ctrl.Result{}, client.IgnoreNotFound(err) + } } return ctrl.Result{}, nil @@ -286,13 +293,13 @@ func (r *WorkloadReconciler) reconcileOnClusterQueueActiveState(ctx context.Cont if err != nil || !queue.DeletionTimestamp.IsZero() { log.V(3).Info("Workload is inadmissible because the ClusterQueue is terminating or missing", "clusterQueue", klog.KRef("", cqName)) - workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is terminating or missing", cqName)) + _ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is terminating or missing", cqName)) return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true) } if queueStopPolicy != kueue.None { log.V(3).Info("Workload is inadmissible because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName)) - workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is stopped", cqName)) + _ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is stopped", cqName)) return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true) } @@ -541,9 +548,38 @@ func (r *WorkloadReconciler) Update(e event.UpdateEvent) bool { log.Error(err, "Failed to delete workload from cache") } }) - if !r.queues.AddOrUpdateWorkload(wlCopy) { - log.V(2).Info("Queue for workload didn't exist; ignored for now") + var backoff time.Duration + if wlCopy.Status.RequeueState != nil && wlCopy.Status.RequeueState.RequeueAt != nil { + backoff = time.Until(wl.Status.RequeueState.RequeueAt.Time) + } + if backoff <= 0 { + if !r.queues.AddOrUpdateWorkload(wlCopy) { + log.V(2).Info("Queue for workload didn't exist; ignored for now") + } + } else { + log.V(3).Info("Workload to be requeued after backoff", "backoff", backoff, "requeueAt", wl.Status.RequeueState.RequeueAt.Time) + time.AfterFunc(backoff, func() { + updatedWl := kueue.Workload{} + err := r.client.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWl) + if err == nil && workloadStatus(&updatedWl) == pending { + if !r.queues.AddOrUpdateWorkload(wlCopy) { + log.V(2).Info("Queue for workload didn't exist; ignored for now") + } else { + log.V(3).Info("Workload requeued after backoff") + } + } + }) } + case prevStatus == admitted && status == admitted && !equality.Semantic.DeepEqual(oldWl.Status.ReclaimablePods, wl.Status.ReclaimablePods): + // trigger the move of associated inadmissibleWorkloads, if there are any. + r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() { + // Update the workload from cache while holding the queues lock + // to guarantee that requeued workloads are taken into account before + // the next scheduling cycle. + if err := r.cache.UpdateWorkload(oldWl, wlCopy); err != nil { + log.Error(err, "Failed to delete workload from cache") + } + }) default: // Workload update in the cache is handled here; however, some fields are immutable @@ -714,7 +750,7 @@ func (w *workloadCqHandler) Update(ctx context.Context, ev event.UpdateEvent, wq if !newCq.DeletionTimestamp.IsZero() || !slices.CmpNoOrder(oldCq.Spec.AdmissionChecks, newCq.Spec.AdmissionChecks) || - oldCq.Spec.StopPolicy != newCq.Spec.StopPolicy { + !ptr.Equal(oldCq.Spec.StopPolicy, newCq.Spec.StopPolicy) { w.queueReconcileForWorkloads(ctx, newCq.Name, wq) } } diff --git a/pkg/controller/jobframework/interface.go b/pkg/controller/jobframework/interface.go index fe7ca3d78d..816ae5d3de 100644 --- a/pkg/controller/jobframework/interface.go +++ b/pkg/controller/jobframework/interface.go @@ -111,8 +111,6 @@ type ComposableJob interface { FindMatchingWorkloads(ctx context.Context, c client.Client, r record.EventRecorder) (match *kueue.Workload, toDelete []*kueue.Workload, err error) // Stop implements the custom stop procedure for ComposableJob Stop(ctx context.Context, c client.Client, podSetsInfo []podset.PodSetInfo, stopReason StopReason, eventMsg string) ([]client.Object, error) - // Ensure all members of the ComposableJob are owning the workload - EnsureWorkloadOwnedByAllMembers(ctx context.Context, c client.Client, r record.EventRecorder, workload *kueue.Workload) error } func ParentWorkloadName(job GenericJob) string { diff --git a/pkg/controller/jobframework/reconciler.go b/pkg/controller/jobframework/reconciler.go index 221f22ba04..5370048a72 100644 --- a/pkg/controller/jobframework/reconciler.go +++ b/pkg/controller/jobframework/reconciler.go @@ -269,21 +269,9 @@ func (r *JobReconciler) ReconcileGenericJob(ctx context.Context, req ctrl.Reques return ctrl.Result{}, err } - // Ensure all members of the composable job own the workload - if wl != nil { - if cj, implements := job.(ComposableJob); implements { - if err := cj.EnsureWorkloadOwnedByAllMembers(ctx, r.client, r.record, wl); err != nil { - return ctrl.Result{}, err - } - } - } - if wl != nil && apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) { - // Finalize the job if it's finished - if _, finished := job.Finished(); finished { - if err := r.finalizeJob(ctx, job); err != nil { - return ctrl.Result{}, err - } + if err := r.finalizeJob(ctx, job); err != nil { + return ctrl.Result{}, err } r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload, @@ -380,8 +368,7 @@ func (r *JobReconciler) ReconcileGenericJob(ctx context.Context, req ctrl.Reques if workload.HasQuotaReservation(wl) { if !job.IsActive() { log.V(6).Info("The job is no longer active, clear the workloads admission") - workload.UnsetQuotaReservationWithCondition(wl, "Pending", evCond.Message) - _ = workload.SyncAdmittedCondition(wl) + _ = workload.UnsetQuotaReservationWithCondition(wl, "Pending", evCond.Message) err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true) if err != nil { return ctrl.Result{}, fmt.Errorf("clearing admission: %w", err) diff --git a/pkg/controller/jobframework/setup.go b/pkg/controller/jobframework/setup.go index 39541ad3c4..2109ee7da5 100644 --- a/pkg/controller/jobframework/setup.go +++ b/pkg/controller/jobframework/setup.go @@ -22,19 +22,38 @@ import ( "fmt" "os" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/tools/cache" + "github.com/go-logr/logr" + "golang.org/x/exp/slices" "k8s.io/apimachinery/pkg/api/meta" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + "k8s.io/apimachinery/pkg/watch" + retrywatch "k8s.io/client-go/tools/watch" + "sigs.k8s.io/kueue/pkg/controller/jobs/noop" ) +const ( + pytorchjobAPI = "pytorchjobs.kubeflow.org" + rayclusterAPI = "rayclusters.ray.io" +) + var ( errFailedMappingResource = errors.New("restMapper failed mapping resource") ) +// +kubebuilder:rbac:groups="apiextensions.k8s.io",resources=customresourcedefinitions,verbs=get;list;watch + // SetupControllers setups all controllers and webhooks for integrations. // When the platform developers implement a separate kueue-manager to manage the in-house custom jobs, // they can easily setup controllers and webhooks for the in-house custom jobs. @@ -62,24 +81,14 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error { if err != nil { return fmt.Errorf("%s: %w: %w", fwkNamePrefix, errFailedMappingResource, err) } - if _, err = mgr.GetRESTMapper().RESTMapping(gvk.GroupKind(), gvk.Version); err != nil { - if !meta.IsNoMatchError(err) { - return fmt.Errorf("%s: %w", fwkNamePrefix, err) - } - logger.Info("No matching API in the server for job framework, skipped setup of controller and webhook") + if !isAPIAvailable(context.TODO(), mgr, rayclusterAPI) { + logger.Info("API not available, waiting for it to become available... - Skipping setup of controller and webhook") + waitForAPI(context.TODO(), logger, mgr, rayclusterAPI, func() { + setupComponents(mgr, logger, gvk, fwkNamePrefix, cb, opts...) + }) } else { - if err = cb.NewReconciler( - mgr.GetClient(), - mgr.GetEventRecorderFor(fmt.Sprintf("%s-%s-controller", name, options.ManagerName)), - opts..., - ).SetupWithManager(mgr); err != nil { - return fmt.Errorf("%s: %w", fwkNamePrefix, err) - } - if err = cb.SetupWebhook(mgr, opts...); err != nil { - return fmt.Errorf("%s: unable to create webhook: %w", fwkNamePrefix, err) - } - logger.Info("Set up controller and webhook for job framework") - return nil + logger.Info("API is available, setting up components...") + setupComponents(mgr, logger, gvk, fwkNamePrefix, cb, opts...) } } if err := noop.SetupWebhook(mgr, cb.JobType); err != nil { @@ -89,6 +98,39 @@ func SetupControllers(mgr ctrl.Manager, log logr.Logger, opts ...Option) error { }) } +func setupComponents(mgr ctrl.Manager, log logr.Logger, gvk schema.GroupVersionKind, fwkNamePrefix string, cb IntegrationCallbacks, opts ...Option) { + // Attempt to get the REST mapping for the GVK + if _, err := mgr.GetRESTMapper().RESTMapping(gvk.GroupKind(), gvk.Version); err != nil { + if !meta.IsNoMatchError(err) { + log.Error(err, fmt.Sprintf("%s: unable to get REST mapping", fwkNamePrefix)) + return + } + log.Info("No matching API in the server for job framework, skipped setup of controller and webhook") + } else { + if err := setupControllerAndWebhook(mgr, gvk, fwkNamePrefix, cb, opts...); err != nil { + log.Error(err, "Failed to set up controller and webhook") + } else { + log.Info("Set up controller and webhook for job framework") + } + } +} + +func setupControllerAndWebhook(mgr ctrl.Manager, gvk schema.GroupVersionKind, fwkNamePrefix string, cb IntegrationCallbacks, opts ...Option) error { + if err := cb.NewReconciler( + mgr.GetClient(), + mgr.GetEventRecorderFor(fmt.Sprintf("%s-%s-controller", gvk.Kind, "managerName")), // Ensure managerName is defined or fetched + opts..., + ).SetupWithManager(mgr); err != nil { + return fmt.Errorf("%s: %w", fwkNamePrefix, err) + } + + if err := cb.SetupWebhook(mgr, opts...); err != nil { + return fmt.Errorf("%s: unable to create webhook: %w", fwkNamePrefix, err) + } + + return nil +} + // SetupIndexes setups the indexers for integrations. // When the platform developers implement a separate kueue-manager to manage the in-house custom jobs, // they can easily setup indexers for the in-house custom jobs. @@ -105,3 +147,73 @@ func SetupIndexes(ctx context.Context, indexer client.FieldIndexer, opts ...Opti return nil }) } + +func isAPIAvailable(ctx context.Context, mgr ctrl.Manager, apiName string) bool { + crdClient, err := apiextensionsclientset.NewForConfig(mgr.GetConfig()) + exitOnError(err, "unable to create CRD client") + + crdList, err := crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{}) + exitOnError(err, "unable to list CRDs") + + return slices.ContainsFunc(crdList.Items, func(crd apiextensionsv1.CustomResourceDefinition) bool { + return crd.Name == apiName + }) +} + +func waitForAPI(ctx context.Context, log logr.Logger, mgr ctrl.Manager, apiName string, action func()) { + crdClient, err := apiextensionsclientset.NewForConfig(mgr.GetConfig()) + exitOnError(err, "unable to create CRD client") + + crdList, err := crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{}) + exitOnError(err, "unable to list CRDs") + + // If API is already available, just invoke action + if slices.ContainsFunc(crdList.Items, func(crd apiextensionsv1.CustomResourceDefinition) bool { + return crd.Name == apiName + }) { + action() + return + } + + // Wait for the API to become available then invoke action + log.Info(fmt.Sprintf("API %v not available, setting up retry watcher", apiName)) + retryWatcher, err := retrywatch.NewRetryWatcher(crdList.ResourceVersion, &cache.ListWatch{ + ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { + return crdClient.ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{}) + }, + WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { + return crdClient.ApiextensionsV1().CustomResourceDefinitions().Watch(ctx, metav1.ListOptions{}) + }, + }) + exitOnError(err, "unable to create retry watcher") + + defer retryWatcher.Stop() + for { + select { + case <-ctx.Done(): + return + case event := <-retryWatcher.ResultChan(): + switch event.Type { + case watch.Error: + exitOnError(apierrors.FromObject(event.Object), fmt.Sprintf("error watching for API %v", apiName)) + + case watch.Added, watch.Modified: + if crd := event.Object.(*apiextensionsv1.CustomResourceDefinition); crd.Name == apiName && + slices.ContainsFunc(crd.Status.Conditions, func(condition apiextensionsv1.CustomResourceDefinitionCondition) bool { + return condition.Type == apiextensionsv1.Established && condition.Status == apiextensionsv1.ConditionTrue + }) { + log.Info(fmt.Sprintf("API %v installed, invoking deferred action", apiName)) + action() + return + } + } + } + } +} + +func exitOnError(err error, msg string) { + if err != nil { + fmt.Sprint(err, msg) + os.Exit(1) + } +} diff --git a/pkg/controller/jobs/pod/indexer.go b/pkg/controller/jobs/pod/indexer.go new file mode 100644 index 0000000000..24f1831942 --- /dev/null +++ b/pkg/controller/jobs/pod/indexer.go @@ -0,0 +1,38 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pod + +import ( + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + PodGroupNameCacheKey = "PodGroupNameCacheKey" +) + +func IndexPodGroupName(o client.Object) []string { + pod, ok := o.(*corev1.Pod) + if !ok { + return nil + } + + if labelValue, exists := pod.GetLabels()[GroupNameLabel]; exists { + return []string{labelValue} + } + return nil +} diff --git a/pkg/controller/jobs/pod/pod_controller.go b/pkg/controller/jobs/pod/pod_controller.go index 9cc4a0be44..7b8c8410fd 100644 --- a/pkg/controller/jobs/pod/pod_controller.go +++ b/pkg/controller/jobs/pod/pod_controller.go @@ -43,6 +43,7 @@ import ( "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" @@ -108,9 +109,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { + concurrency := mgr.GetControllerOptions().GroupKindConcurrency[gvk.GroupKind().String()] + ctrl.Log.V(3).Info("Setting up Pod reconciler", "concurrency", concurrency) return ctrl.NewControllerManagedBy(mgr). Watches(&corev1.Pod{}, &podEventHandler{cleanedUpPodsExpectations: r.expectationsStore}).Named("v1_pod"). Watches(&kueue.Workload{}, &workloadHandler{}). + WithOptions(controller.Options{ + MaxConcurrentReconciles: concurrency, + }). Complete(r) } @@ -470,7 +476,13 @@ func (p *Pod) Stop(ctx context.Context, c client.Client, _ []podset.PodSetInfo, } func SetupIndexes(ctx context.Context, indexer client.FieldIndexer) error { - return jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk) + if err := indexer.IndexField(ctx, &corev1.Pod{}, PodGroupNameCacheKey, IndexPodGroupName); err != nil { + return err + } + if err := jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk); err != nil { + return err + } + return nil } func CanSupportIntegration(opts ...jobframework.Option) (bool, error) { @@ -490,8 +502,8 @@ func (p *Pod) Finalize(ctx context.Context, c client.Client) error { if groupName == "" { podsInGroup.Items = append(podsInGroup.Items, *p.Object().(*corev1.Pod)) } else { - if err := c.List(ctx, &podsInGroup, client.MatchingLabels{ - GroupNameLabel: groupName, + if err := c.List(ctx, &podsInGroup, client.MatchingFields{ + PodGroupNameCacheKey: groupName, }, client.InNamespace(p.pod.Namespace)); err != nil { return err } @@ -554,9 +566,6 @@ func getRoleHash(p corev1.Pod) (string, error) { } shape := map[string]interface{}{ - "metadata": map[string]interface{}{ - "labels": omitKueueLabels(p.ObjectMeta.Labels), - }, "spec": map[string]interface{}{ "initContainers": containersShape(p.Spec.InitContainers), "containers": containersShape(p.Spec.Containers), @@ -565,7 +574,6 @@ func getRoleHash(p corev1.Pod) (string, error) { "tolerations": p.Spec.Tolerations, "runtimeClassName": p.Spec.RuntimeClassName, "priority": p.Spec.Priority, - "preemptionPolicy": p.Spec.PreemptionPolicy, "topologySpreadConstraints": p.Spec.TopologySpreadConstraints, "overhead": p.Spec.Overhead, "resourceClaims": p.Spec.ResourceClaims, @@ -609,8 +617,8 @@ func (p *Pod) Load(ctx context.Context, c client.Client, key *types.NamespacedNa // and update the expectations after we've retrieved active pods from the store. p.satisfiedExcessPods = p.excessPodExpectations.Satisfied(ctrl.LoggerFrom(ctx), *key) - if err := c.List(ctx, &p.list, client.MatchingLabels{ - GroupNameLabel: key.Name, + if err := c.List(ctx, &p.list, client.MatchingFields{ + PodGroupNameCacheKey: key.Name, }, client.InNamespace(key.Namespace)); err != nil { return false, err } @@ -627,9 +635,13 @@ func (p *Pod) Load(ctx context.Context, c client.Client, key *types.NamespacedNa } func (p *Pod) constructGroupPodSets() ([]kueue.PodSet, error) { + return constructGroupPodSets(p.list.Items) +} + +func constructGroupPodSets(pods []corev1.Pod) ([]kueue.PodSet, error) { var resultPodSets []kueue.PodSet - for _, podInGroup := range p.list.Items { + for _, podInGroup := range pods { if !isPodRunnableOrSucceeded(&podInGroup) { continue } @@ -708,14 +720,12 @@ func (p *Pod) validatePodGroupMetadata(r record.EventRecorder, activePods []core // runnableOrSucceededPods returns a slice of active pods in the group func (p *Pod) runnableOrSucceededPods() []corev1.Pod { - activePods := make([]corev1.Pod, 0, len(p.list.Items)) - for _, pod := range p.list.Items { - if isPodRunnableOrSucceeded(&pod) { - activePods = append(activePods, pod) - } - } + return utilslices.Pick(p.list.Items, isPodRunnableOrSucceeded) +} - return activePods +// notRunnableNorSucceededPods returns a slice of inactive pods in the group +func (p *Pod) notRunnableNorSucceededPods() []corev1.Pod { + return utilslices.Pick(p.list.Items, func(p *corev1.Pod) bool { return !isPodRunnableOrSucceeded(p) }) } // isPodRunnableOrSucceeded returns whether the Pod can eventually run, is Running or Succeeded. @@ -727,21 +737,55 @@ func isPodRunnableOrSucceeded(p *corev1.Pod) bool { return p.Status.Phase != corev1.PodFailed } -// cleanupExcessPods will delete and finalize pods created last if the number of -// activePods is greater than the totalCount value. -func (p *Pod) cleanupExcessPods(ctx context.Context, c client.Client, r record.EventRecorder, totalCount int, activePods []corev1.Pod) error { - log := ctrl.LoggerFrom(ctx) - - extraPodsCount := len(activePods) - totalCount - - if extraPodsCount <= 0 { - return nil +// lastActiveTime returns the last timestamp on which the pod was observed active: +// - the time the pod was declared Failed +// - the deletion time +func lastActiveTime(p *corev1.Pod) time.Time { + lastTransition := metav1.Now() + for _, c := range p.Status.Conditions { + if c.Type == corev1.ContainersReady { + if c.Status == corev1.ConditionFalse && c.Reason == string(corev1.PodFailed) { + lastTransition = c.LastTransitionTime + } + break + } } - // Do not clean up more pods until observing previous operations - if !p.satisfiedExcessPods { - return errPendingOps + deletionTime := ptr.Deref(p.DeletionTimestamp, metav1.Now()) + if lastTransition.Before(&deletionTime) { + return lastTransition.Time } + return deletionTime.Time +} + +// sortInactivePods sorts the provided pods slice based on: +// - finalizer state (pods with finalizers are first) +// - lastActiveTime (pods that were active last are first) +// - creation timestamp (newer pods are first) +func sortInactivePods(inactivePods []corev1.Pod) { + sort.Slice(inactivePods, func(i, j int) bool { + pi := &inactivePods[i] + pj := &inactivePods[j] + iFin := slices.Contains(pi.Finalizers, PodFinalizer) + jFin := slices.Contains(pj.Finalizers, PodFinalizer) + if iFin != jFin { + return iFin + } + + iLastActive := lastActiveTime(pi) + jLastActive := lastActiveTime(pj) + if iLastActive.Equal(jLastActive) { + return pi.CreationTimestamp.Before(&pj.CreationTimestamp) + } + return jLastActive.Before(iLastActive) + }) +} + +// sortActivePods sorts the provided pods slice based on: +// - finalizer state (pods with no finalizers are last) +// - gated state (pods that are still gated are last) +// - creation timestamp (newer pods are last) +func sortActivePods(activePods []corev1.Pod) { // Sort active pods by creation timestamp sort.Slice(activePods, func(i, j int) bool { pi := &activePods[i] @@ -758,16 +802,22 @@ func (p *Pod) cleanupExcessPods(ctx context.Context, c client.Client, r record.E if iGated != jGated { return !iGated } - return pi.ObjectMeta.CreationTimestamp.Before(&pj.ObjectMeta.CreationTimestamp) + return pi.CreationTimestamp.Before(&pj.CreationTimestamp) }) +} + +func (p *Pod) removeExcessPods(ctx context.Context, c client.Client, r record.EventRecorder, extraPods []corev1.Pod) error { + if len(extraPods) == 0 { + return nil + } + + log := ctrl.LoggerFrom(ctx) // Extract all the latest created extra pods - extraPods := activePods[len(activePods)-extraPodsCount:] extraPodsUIDs := utilslices.Map(extraPods, func(p *corev1.Pod) types.UID { return p.UID }) p.excessPodExpectations.ExpectUIDs(log, p.key, extraPodsUIDs) // Finalize and delete the active pods created last - err := parallelize.Until(ctx, len(extraPods), func(i int) error { pod := extraPods[i] if controllerutil.RemoveFinalizer(&pod, PodFinalizer) { @@ -778,7 +828,7 @@ func (p *Pod) cleanupExcessPods(ctx context.Context, c client.Client, r record.E return err } } - if pod.ObjectMeta.DeletionTimestamp.IsZero() { + if pod.DeletionTimestamp.IsZero() { log.V(3).Info("Deleting excess pod in group", "excessPod", klog.KObj(&pod)) if err := c.Delete(ctx, &pod); err != nil { // We won't observe this cleanup in the event handler. @@ -792,31 +842,42 @@ func (p *Pod) cleanupExcessPods(ctx context.Context, c client.Client, r record.E if err != nil { return err } + return nil +} - // Remove excess pods from the group list - newPodsInGroup := make([]corev1.Pod, 0, len(p.list.Items)-len(extraPods)) - for i := range p.list.Items { - found := false - for j := range extraPods { - if p.list.Items[i].Name == extraPods[j].Name && p.list.Items[i].Namespace == extraPods[j].Namespace { - found = true - break - } - } +func (p *Pod) finalizePods(ctx context.Context, c client.Client, extraPods []corev1.Pod) error { + if len(extraPods) == 0 { + return nil + } + + log := ctrl.LoggerFrom(ctx) + + // Extract all the latest created extra pods + extraPodsUIDs := utilslices.Map(extraPods, func(p *corev1.Pod) types.UID { return p.UID }) + p.excessPodExpectations.ExpectUIDs(log, p.key, extraPodsUIDs) - if !found { - newPodsInGroup = append(newPodsInGroup, p.list.Items[i]) + err := parallelize.Until(ctx, len(extraPods), func(i int) error { + pod := extraPods[i] + if controllerutil.RemoveFinalizer(&pod, PodFinalizer) { + log.V(3).Info("Finalizing pod in group", "Pod", klog.KObj(&pod)) + if err := c.Update(ctx, &pod); err != nil { + // We won't observe this cleanup in the event handler. + p.excessPodExpectations.ObservedUID(log, p.key, pod.UID) + return err + } + } else { + // We don't expect an event in this case. + p.excessPodExpectations.ObservedUID(log, p.key, pod.UID) } + return nil + }) + if err != nil { + return err } - p.list.Items = newPodsInGroup - return nil } -func (p *Pod) EnsureWorkloadOwnedByAllMembers(ctx context.Context, c client.Client, r record.EventRecorder, workload *kueue.Workload) error { - if !p.isGroup { - return nil - } +func (p *Pod) ensureWorkloadOwnedByAllMembers(ctx context.Context, c client.Client, r record.EventRecorder, workload *kueue.Workload) error { oldOwnersCnt := len(workload.GetOwnerReferences()) for _, pod := range p.list.Items { if err := controllerutil.SetOwnerReference(&pod, workload, c.Scheme()); err != nil { @@ -875,7 +936,7 @@ func (p *Pod) ConstructComposableWorkload(ctx context.Context, c client.Client, return wl, nil } - if err := p.finalizeNonRunnableNorSucceededPods(ctx, c); err != nil { + if err := p.finalizePods(ctx, c, p.notRunnableNorSucceededPods()); err != nil { return nil, err } @@ -897,9 +958,13 @@ func (p *Pod) ConstructComposableWorkload(ctx context.Context, c client.Client, } // Cleanup extra pods if there's any - err = p.cleanupExcessPods(ctx, c, r, groupTotalCount, activePods) - if err != nil { - return nil, err + if excessPodsCount := len(activePods) - groupTotalCount; excessPodsCount > 0 { + sortActivePods(activePods) + err = p.removeExcessPods(ctx, c, r, activePods[len(activePods)-excessPodsCount:]) + if err != nil { + return nil, err + } + p.list.Items = activePods[:len(activePods)-excessPodsCount] } // Construct workload for a pod group @@ -975,37 +1040,73 @@ func (p *Pod) FindMatchingWorkloads(ctx context.Context, c client.Client, r reco // Cleanup excess pods for each workload pod set (role) activePods := p.runnableOrSucceededPods() + inactivePods := p.notRunnableNorSucceededPods() + + var keptPods []corev1.Pod + var excessActivePods []corev1.Pod + var replacedInactivePods []corev1.Pod + for _, ps := range workload.Spec.PodSets { - // Find all the active pods of the role - var roleActivePods []corev1.Pod - for _, activePod := range activePods { - roleHash, err := getRoleHash(activePod) + // Find all the active and inactive pods of the role + var roleHashErrors []error + hasRoleFunc := func(p *corev1.Pod) bool { + hash, err := getRoleHash(*p) if err != nil { - return nil, nil, fmt.Errorf("failed to calculate pod role hash: %w", err) + roleHashErrors = append(roleHashErrors, err) + return false } + return hash == ps.Name + } + roleActivePods := utilslices.Pick(activePods, hasRoleFunc) + roleInactivePods := utilslices.Pick(inactivePods, hasRoleFunc) + if len(roleHashErrors) > 0 { + return nil, nil, fmt.Errorf("failed to calculate pod role hash: %w", errors.Join(roleHashErrors...)) + } - if ps.Name == roleHash { - roleActivePods = append(roleActivePods, activePod) - } + if excessCount := len(roleActivePods) - int(ps.Count); excessCount > 0 { + sortActivePods(roleActivePods) + excessActivePods = append(excessActivePods, roleActivePods[len(roleActivePods)-excessCount:]...) + keptPods = append(keptPods, roleActivePods[:len(roleActivePods)-excessCount]...) + } else { + keptPods = append(keptPods, roleActivePods...) } - // Cleanup excess pods of the role - err := p.cleanupExcessPods(ctx, c, r, int(ps.Count), roleActivePods) - if err != nil { - return nil, nil, err + if finalizeablePodsCount := min(len(roleInactivePods), len(roleInactivePods)+len(roleActivePods)-int(ps.Count)); finalizeablePodsCount > 0 { + sortInactivePods(roleInactivePods) + replacedInactivePods = append(replacedInactivePods, roleInactivePods[len(roleInactivePods)-finalizeablePodsCount:]...) + keptPods = append(keptPods, roleInactivePods[:len(roleInactivePods)-finalizeablePodsCount]...) + } else { + keptPods = append(keptPods, roleInactivePods...) } } - jobPodSets, err := p.constructGroupPodSets() + jobPodSets, err := constructGroupPodSets(keptPods) if err != nil { return nil, nil, err } - if p.equivalentToWorkload(workload, jobPodSets) { - return workload, []*kueue.Workload{}, nil - } else { + if len(keptPods) == 0 || !p.equivalentToWorkload(workload, jobPodSets) { return nil, []*kueue.Workload{workload}, nil } + + // Do not clean up more pods until observing previous operations + if !p.satisfiedExcessPods { + return nil, nil, errPendingOps + } + + p.list.Items = keptPods + if err := p.ensureWorkloadOwnedByAllMembers(ctx, c, r, workload); err != nil { + return nil, nil, err + } + + if err := p.removeExcessPods(ctx, c, r, excessActivePods); err != nil { + return nil, nil, err + } + + if err := p.finalizePods(ctx, c, replacedInactivePods); err != nil { + return nil, nil, err + } + return workload, []*kueue.Workload{}, nil } func (p *Pod) equivalentToWorkload(wl *kueue.Workload, jobPodSets []kueue.PodSet) bool { @@ -1072,20 +1173,6 @@ func (p *Pod) ReclaimablePods() ([]kueue.ReclaimablePod, error) { return result, nil } -func (p *Pod) finalizeNonRunnableNorSucceededPods(ctx context.Context, c client.Client) error { - for _, p := range p.list.Items { - if isPodRunnableOrSucceeded(&p) { - continue - } - if controllerutil.RemoveFinalizer(&p, PodFinalizer) { - if err := c.Update(ctx, &p); err != nil { - return err - } - } - } - return nil -} - func IsPodOwnerManagedByKueue(p *Pod) bool { if owner := metav1.GetControllerOf(&p.pod); owner != nil { return jobframework.IsOwnerManagedByKueue(owner) || (owner.Kind == "RayCluster" && strings.HasPrefix(owner.APIVersion, "ray.io/v1alpha1")) diff --git a/pkg/controller/jobs/pod/pod_controller_test.go b/pkg/controller/jobs/pod/pod_controller_test.go index 14205360d4..4a7a597a88 100644 --- a/pkg/controller/jobs/pod/pod_controller_test.go +++ b/pkg/controller/jobs/pod/pod_controller_test.go @@ -532,7 +532,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Obj(), @@ -596,7 +596,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Obj(), @@ -610,7 +610,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Obj(), @@ -635,7 +635,6 @@ func TestReconciler(t *testing.T) { KueueSchedulingGate(). Group("test-group"). GroupTotalCount("2"). - Annotation("kueue.x-k8s.io/role-hash", "60bc72d3"). Obj(), *basePodWrapper. Clone(). @@ -645,7 +644,6 @@ func TestReconciler(t *testing.T) { KueueSchedulingGate(). Group("test-group"). GroupTotalCount("2"). - Annotation("kueue.x-k8s.io/role-hash", "60bc72d3"). Obj(), }, wantPods: []corev1.Pod{ @@ -655,7 +653,6 @@ func TestReconciler(t *testing.T) { KueueFinalizer(). Group("test-group"). GroupTotalCount("2"). - Annotation("kueue.x-k8s.io/role-hash", "60bc72d3"). NodeSelector("kubernetes.io/arch", "arm64"). Obj(), *basePodWrapper. @@ -665,17 +662,16 @@ func TestReconciler(t *testing.T) { KueueFinalizer(). Group("test-group"). GroupTotalCount("2"). - Annotation("kueue.x-k8s.io/role-hash", "60bc72d3"). NodeSelector("kubernetes.io/arch", "arm64"). Obj(), }, workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). ReserveQuota( - utiltesting.MakeAdmission("cq", "60bc72d3"). + utiltesting.MakeAdmission("cq", "dc85db45"). Assignment(corev1.ResourceCPU, "unit-test-flavor", "2"). AssignmentPodCount(2). Obj(), @@ -685,11 +681,11 @@ func TestReconciler(t *testing.T) { }, wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). ReserveQuota( - utiltesting.MakeAdmission("cq", "60bc72d3"). + utiltesting.MakeAdmission("cq", "dc85db45"). Assignment(corev1.ResourceCPU, "unit-test-flavor", "2"). AssignmentPodCount(2). Obj(), @@ -754,7 +750,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -768,7 +764,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -777,7 +773,7 @@ func TestReconciler(t *testing.T) { OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). Admitted(true). - ReclaimablePods(kueue.ReclaimablePod{Name: "60bc72d3", Count: 1}). + ReclaimablePods(kueue.ReclaimablePod{Name: "dc85db45", Count: 1}). Obj(), }, workloadCmpOpts: defaultWorkloadCmpOpts, @@ -822,7 +818,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -836,7 +832,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -881,7 +877,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -894,7 +890,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -951,7 +947,7 @@ func TestReconciler(t *testing.T) { }, workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). Queue("test-queue"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). @@ -966,7 +962,7 @@ func TestReconciler(t *testing.T) { }, wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). Queue("test-queue"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). @@ -981,10 +977,11 @@ func TestReconciler(t *testing.T) { }, workloadCmpOpts: defaultWorkloadCmpOpts, }, - "workload is not deleted if one of the pods in the finished group is absent": { + "Pods are finalized even if one of the pods in the finished group is absent": { pods: []corev1.Pod{ *basePodWrapper. Clone(). + KueueFinalizer(). Label("kueue.x-k8s.io/managed", "true"). Group("test-group"). GroupTotalCount("2"). @@ -1004,7 +1001,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1023,7 +1020,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1121,7 +1118,7 @@ func TestReconciler(t *testing.T) { wantPods: []corev1.Pod{}, workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). ControllerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). ControllerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). Queue("test-queue"). @@ -1192,28 +1189,28 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). Queue("user-queue"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). - ReserveQuota(utiltesting.MakeAdmission("cq", "60bc72d3").AssignmentPodCount(1).Obj()). + ReserveQuota(utiltesting.MakeAdmission("cq", "dc85db45").AssignmentPodCount(1).Obj()). Admitted(true). Obj(), }, wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). Queue("user-queue"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). - ReserveQuota(utiltesting.MakeAdmission("cq", "60bc72d3").AssignmentPodCount(1).Obj()). + ReserveQuota(utiltesting.MakeAdmission("cq", "dc85db45").AssignmentPodCount(1).Obj()). Admitted(true). Obj(), }, @@ -1268,17 +1265,17 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). Queue("user-queue"). - ReserveQuota(utiltesting.MakeAdmission("cq", "60bc72d3").AssignmentPodCount(3).Obj()). + ReserveQuota(utiltesting.MakeAdmission("cq", "dc85db45").AssignmentPodCount(3).Obj()). Admitted(true). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod3", "test-uid"). - ReclaimablePods(kueue.ReclaimablePod{Name: "60bc72d3", Count: 1}). + ReclaimablePods(kueue.ReclaimablePod{Name: "dc85db45", Count: 1}). Obj(), }, wantPods: []corev1.Pod{ @@ -1294,7 +1291,6 @@ func TestReconciler(t *testing.T) { Clone(). Name("pod2"). Label("kueue.x-k8s.io/managed", "true"). - KueueFinalizer(). Group("test-group"). GroupTotalCount("3"). StatusPhase(corev1.PodFailed). @@ -1320,14 +1316,14 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). Queue("user-queue"). - ReserveQuota(utiltesting.MakeAdmission("cq", "60bc72d3").AssignmentPodCount(3).Obj()). + ReserveQuota(utiltesting.MakeAdmission("cq", "dc85db45").AssignmentPodCount(3).Obj()). ReclaimablePods(kueue.ReclaimablePod{ - Name: "60bc72d3", + Name: "dc85db45", Count: 1, }). Admitted(true). @@ -1335,7 +1331,7 @@ func TestReconciler(t *testing.T) { OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod3", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "replacement", "test-uid"). - ReclaimablePods(kueue.ReclaimablePod{Name: "60bc72d3", Count: 1}). + ReclaimablePods(kueue.ReclaimablePod{Name: "dc85db45", Count: 1}). Obj(), }, workloadCmpOpts: defaultWorkloadCmpOpts, @@ -1411,7 +1407,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1426,7 +1422,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1480,7 +1476,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1500,7 +1496,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1545,7 +1541,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1565,7 +1561,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1655,7 +1651,7 @@ func TestReconciler(t *testing.T) { }, wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). - PodSets(*utiltesting.MakePodSet("60bc72d3", 2).Request(corev1.ResourceCPU, "1").Obj()). + PodSets(*utiltesting.MakePodSet("dc85db45", 2).Request(corev1.ResourceCPU, "1").Obj()). Queue("test-queue"). Priority(0). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). @@ -1716,7 +1712,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1730,7 +1726,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1770,7 +1766,7 @@ func TestReconciler(t *testing.T) { *utiltesting.MakePodSet("absent-pod-role", 1). Request(corev1.ResourceCPU, "1"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1786,7 +1782,7 @@ func TestReconciler(t *testing.T) { *utiltesting.MakePodSet("absent-pod-role", 1). Request(corev1.ResourceCPU, "1"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -1882,7 +1878,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Request(corev1.ResourceCPU, "1"). Obj(), @@ -1980,7 +1976,7 @@ func TestReconciler(t *testing.T) { Label("kueue.x-k8s.io/managed", "true"). KueueFinalizer(). Group("test-group"). - Image("test-image-role2", nil). + Request(corev1.ResourceMemory, "1Gi"). GroupTotalCount("3"). StatusPhase(corev1.PodFailed). Obj(), @@ -2007,7 +2003,7 @@ func TestReconciler(t *testing.T) { Name("pod3"). Label("kueue.x-k8s.io/managed", "true"). Group("test-group"). - Image("test-image-role2", nil). + Request(corev1.ResourceMemory, "1Gi"). GroupTotalCount("3"). StatusPhase(corev1.PodFailed). Obj(), @@ -2015,10 +2011,11 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("4389b941", 1). + *utiltesting.MakePodSet("a119f908", 1). Request(corev1.ResourceCPU, "1"). + Request(corev1.ResourceMemory, "1Gi"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2033,10 +2030,11 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("4389b941", 1). + *utiltesting.MakePodSet("a119f908", 1). Request(corev1.ResourceCPU, "1"). + Request(corev1.ResourceMemory, "1Gi"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2091,7 +2089,7 @@ func TestReconciler(t *testing.T) { Label("kueue.x-k8s.io/managed", "true"). KueueFinalizer(). Group("test-group"). - Image("test-image-role2", nil). + Request(corev1.ResourceMemory, "1Gi"). GroupTotalCount("3"). StatusPhase(corev1.PodSucceeded). Obj(), @@ -2120,7 +2118,7 @@ func TestReconciler(t *testing.T) { Label("kueue.x-k8s.io/managed", "true"). KueueFinalizer(). Group("test-group"). - Image("test-image-role2", nil). + Request(corev1.ResourceMemory, "1Gi"). GroupTotalCount("3"). StatusPhase(corev1.PodSucceeded). Obj(), @@ -2128,10 +2126,11 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("007b32e9", 1). + *utiltesting.MakePodSet("4ebdd4a6", 1). Request(corev1.ResourceCPU, "1"). + Request(corev1.ResourceMemory, "1Gi"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2146,10 +2145,11 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("007b32e9", 1). + *utiltesting.MakePodSet("4ebdd4a6", 1). Request(corev1.ResourceCPU, "1"). + Request(corev1.ResourceMemory, "1Gi"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2159,7 +2159,7 @@ func TestReconciler(t *testing.T) { OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod2", "test-uid"). OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "pod3", "test-uid"). - ReclaimablePods(kueue.ReclaimablePod{Name: "007b32e9", Count: 1}). + ReclaimablePods(kueue.ReclaimablePod{Name: "4ebdd4a6", Count: 1}). Obj(), }, workloadCmpOpts: defaultWorkloadCmpOpts, @@ -2201,7 +2201,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Obj(), @@ -2284,7 +2284,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2297,7 +2297,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2356,7 +2356,7 @@ func TestReconciler(t *testing.T) { *utiltesting.MakePodSet("aaf269e6", 1). Request(corev1.ResourceCPU, "1"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2372,7 +2372,7 @@ func TestReconciler(t *testing.T) { *utiltesting.MakePodSet("aaf269e6", 1). Request(corev1.ResourceCPU, "1"). Obj(), - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2462,7 +2462,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2474,7 +2474,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2543,7 +2543,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2558,7 +2558,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 2). + *utiltesting.MakePodSet("dc85db45", 2). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2631,7 +2631,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns").Finalizers(kueue.ResourceInUseFinalizerName). PodSets( - *utiltesting.MakePodSet("60bc72d3", 1). + *utiltesting.MakePodSet("dc85db45", 1). Request(corev1.ResourceCPU, "1"). SchedulingGates(corev1.PodSchedulingGate{Name: "kueue.x-k8s.io/admission"}). Obj(), @@ -2760,7 +2760,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2774,7 +2774,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2866,7 +2866,7 @@ func TestReconciler(t *testing.T) { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2886,7 +2886,7 @@ func TestReconciler(t *testing.T) { wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("test-group", "ns"). PodSets( - *utiltesting.MakePodSet("60bc72d3", 3). + *utiltesting.MakePodSet("dc85db45", 3). Request(corev1.ResourceCPU, "1"). Obj(), ). @@ -2919,6 +2919,397 @@ func TestReconciler(t *testing.T) { }, }, }, + "the failed pods are finalized in order": { + pods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("active-pod"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("deleted"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + StatusPhase(corev1.PodFailed). + DeletionTimestamp(now.Add(-4 * time.Minute)). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-3 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error2"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-3 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement1"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement2"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + }, + wantPods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("active-pod"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + Group("test-group"). + GroupTotalCount("4"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error2"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-3 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement1"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement2"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("4"). + Obj(), + }, + workloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 4). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "active-pod", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "deleted", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error2", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + wantWorkloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 4). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "active-pod", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "deleted", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error2", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "replacement1", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "replacement2", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + workloadCmpOpts: defaultWorkloadCmpOpts, + wantEvents: []utiltesting.EventRecord{ + { + Key: types.NamespacedName{Name: "test-group", Namespace: "ns"}, + EventType: "Normal", + Reason: "OwnerReferencesAdded", + Message: "Added 2 owner reference(s)", + }, + }, + }, + "no failed pods are finalized while waiting for expectations": { + pods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("active-pod"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + }, + wantPods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("active-pod"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + }, + workloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 2). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "active-pod", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + wantWorkloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 2). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "active-pod", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + workloadCmpOpts: defaultWorkloadCmpOpts, + excessPodsExpectations: []keyUIDs{{ + key: types.NamespacedName{Name: "test-group", Namespace: "ns"}, + uids: []types.UID{"some-other-pod"}, + }}, + }, + "no unnecessary additional failed pods are finalized": { + pods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error-no-finalizer"). + Label("kueue.x-k8s.io/managed", "true"). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-3 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + }, + wantPods: []corev1.Pod{ + *basePodWrapper. + Clone(). + Name("finished-with-error"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-5 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("finished-with-error-no-finalizer"). + Label("kueue.x-k8s.io/managed", "true"). + Group("test-group"). + GroupTotalCount("2"). + StatusPhase(corev1.PodFailed). + StatusConditions(corev1.PodCondition{ + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodFailed), + LastTransitionTime: metav1.NewTime(now.Add(-3 * time.Minute)).Rfc3339Copy(), + }). + Obj(), + *basePodWrapper. + Clone(). + Name("replacement"). + Label("kueue.x-k8s.io/managed", "true"). + KueueFinalizer(). + Group("test-group"). + GroupTotalCount("2"). + Obj(), + }, + workloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 4). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error-no-finalizer", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + wantWorkloads: []kueue.Workload{ + *utiltesting.MakeWorkload("test-group", "ns"). + PodSets( + *utiltesting.MakePodSet("dc85db45", 4). + Request(corev1.ResourceCPU, "1"). + Obj(), + ). + Queue("user-queue"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "finished-with-error-no-finalizer", "test-uid"). + OwnerReference(corev1.SchemeGroupVersion.WithKind("Pod"), "replacement", "test-uid"). + ReserveQuota(utiltesting.MakeAdmission("cq").AssignmentPodCount(1).Obj()). + Admitted(true). + Obj(), + }, + workloadCmpOpts: defaultWorkloadCmpOpts, + wantEvents: []utiltesting.EventRecord{ + { + Key: types.NamespacedName{Name: "test-group", Namespace: "ns"}, + EventType: "Normal", + Reason: "OwnerReferencesAdded", + Message: "Added 1 owner reference(s)", + }, + }, + }, } for name, tc := range testCases { diff --git a/pkg/controller/jobs/pod/pod_webhook.go b/pkg/controller/jobs/pod/pod_webhook.go index 3a090680f3..b81ae06563 100644 --- a/pkg/controller/jobs/pod/pod_webhook.go +++ b/pkg/controller/jobs/pod/pod_webhook.go @@ -20,7 +20,6 @@ import ( "context" "errors" "fmt" - "strings" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/validation" @@ -105,21 +104,9 @@ func getPodOptions(integrationOpts map[string]any) (configapi.PodIntegrationOpti var _ webhook.CustomDefaulter = &PodWebhook{} -func omitKueueLabels(l map[string]string) map[string]string { - result := map[string]string{} - - for key, value := range l { - if !strings.HasPrefix(key, "kueue.x-k8s.io/") { - result[key] = value - } - } - return result -} - func containersShape(containers []corev1.Container) (result []map[string]interface{}) { for _, c := range containers { result = append(result, map[string]interface{}{ - "image": c.Image, "resources": map[string]interface{}{ "requests": c.Resources.Requests, }, diff --git a/pkg/controller/jobs/pod/pod_webhook_test.go b/pkg/controller/jobs/pod/pod_webhook_test.go index 2bd4f6759d..f2f8d696ba 100644 --- a/pkg/controller/jobs/pod/pod_webhook_test.go +++ b/pkg/controller/jobs/pod/pod_webhook_test.go @@ -233,7 +233,7 @@ func TestDefault(t *testing.T) { want: testingpod.MakePod("test-pod", defaultNamespace.Name). Queue("test-queue"). Group("test-group"). - RoleHash("3b4907a8"). + RoleHash("a9f06f3a"). Label("kueue.x-k8s.io/managed", "true"). KueueSchedulingGate(). KueueFinalizer(). @@ -250,7 +250,7 @@ func TestDefault(t *testing.T) { want: testingpod.MakePod("test-pod", defaultNamespace.Name). Queue("test-queue"). Group("test-group"). - RoleHash("3b4907a8"). + RoleHash("a9f06f3a"). Label("kueue.x-k8s.io/managed", "true"). KueueSchedulingGate(). KueueFinalizer(). diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 665e215c5d..74ded34f31 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -55,7 +55,7 @@ const ( var ( CQStatuses = []ClusterQueueStatus{CQStatusPending, CQStatusActive, CQStatusTerminating} - admissionAttemptsTotal = prometheus.NewCounterVec( + AdmissionAttemptsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: constants.KueueName, Name: "admission_attempts_total", @@ -177,7 +177,7 @@ For a ClusterQueue, the metric only reports a value of 1 for one of the statuses ) func AdmissionAttempt(result AdmissionResult, duration time.Duration) { - admissionAttemptsTotal.WithLabelValues(string(result)).Inc() + AdmissionAttemptsTotal.WithLabelValues(string(result)).Inc() admissionAttemptDuration.WithLabelValues(string(result)).Observe(duration.Seconds()) } @@ -290,7 +290,7 @@ func ClearClusterQueueResourceReservations(cqName, flavor, resource string) { func Register() { metrics.Registry.MustRegister( - admissionAttemptsTotal, + AdmissionAttemptsTotal, admissionAttemptDuration, PendingWorkloads, ReservingActiveWorkloads, diff --git a/pkg/queue/cluster_queue_impl.go b/pkg/queue/cluster_queue_impl.go index 84d2f0bfbd..cdde522513 100644 --- a/pkg/queue/cluster_queue_impl.go +++ b/pkg/queue/cluster_queue_impl.go @@ -116,6 +116,7 @@ func (c *clusterQueueBase) PushOrUpdate(wInfo *workload.Info) { // to potentially become admissible, unless the Eviction status changed // which can affect the workloads order in the queue. if equality.Semantic.DeepEqual(oldInfo.Obj.Spec, wInfo.Obj.Spec) && + equality.Semantic.DeepEqual(oldInfo.Obj.Status.ReclaimablePods, wInfo.Obj.Status.ReclaimablePods) && equality.Semantic.DeepEqual(apimeta.FindStatusCondition(oldInfo.Obj.Status.Conditions, kueue.WorkloadEvicted), apimeta.FindStatusCondition(wInfo.Obj.Status.Conditions, kueue.WorkloadEvicted)) { c.inadmissibleWorkloads[key] = wInfo diff --git a/pkg/queue/cluster_queue_impl_test.go b/pkg/queue/cluster_queue_impl_test.go index 5297b0fde3..bdaa3bf352 100644 --- a/pkg/queue/cluster_queue_impl_test.go +++ b/pkg/queue/cluster_queue_impl_test.go @@ -328,6 +328,18 @@ func TestClusterQueueImpl(t *testing.T) { inadmissibleWorkloadsToRequeue: []*workload.Info{workload.NewInfo(workloads[1]), workload.NewInfo(workloads[1])}, wantPending: 1, }, + "update reclaimable pods in inadmissible": { + inadmissibleWorkloadsToRequeue: []*workload.Info{ + workload.NewInfo(utiltesting.MakeWorkload("w", "").PodSets(*utiltesting.MakePodSet("main", 1).Request(corev1.ResourceCPU, "1").Obj()).Obj()), + }, + workloadsToUpdate: []*kueue.Workload{ + utiltesting.MakeWorkload("w", "").PodSets(*utiltesting.MakePodSet("main", 2).Request(corev1.ResourceCPU, "1").Obj()). + ReclaimablePods(kueue.ReclaimablePod{Name: "main", Count: 1}). + Obj(), + }, + wantActiveWorkloads: []string{"/w"}, + wantPending: 1, + }, } for name, test := range tests { diff --git a/pkg/queue/manager.go b/pkg/queue/manager.go index f7f0b33db7..14f6bb2581 100644 --- a/pkg/queue/manager.go +++ b/pkg/queue/manager.go @@ -141,7 +141,7 @@ func (m *Manager) AddClusterQueue(ctx context.Context, cq *kueue.ClusterQueue) e return nil } -func (m *Manager) UpdateClusterQueue(ctx context.Context, cq *kueue.ClusterQueue) error { +func (m *Manager) UpdateClusterQueue(ctx context.Context, cq *kueue.ClusterQueue, specUpdated bool) error { m.Lock() defer m.Unlock() cqImpl, ok := m.clusterQueues[cq.Name] @@ -162,7 +162,7 @@ func (m *Manager) UpdateClusterQueue(ctx context.Context, cq *kueue.ClusterQueue // TODO(#8): Selectively move workloads based on the exact event. // If any workload becomes admissible or the queue becomes active. - if m.queueAllInadmissibleWorkloadsInCohort(ctx, cqImpl) || (!oldActive && cqImpl.Active()) { + if (specUpdated && m.queueAllInadmissibleWorkloadsInCohort(ctx, cqImpl)) || (!oldActive && cqImpl.Active()) { m.reportPendingWorkloads(cq.Name, cqImpl) m.Broadcast() } diff --git a/pkg/queue/manager_test.go b/pkg/queue/manager_test.go index c42509b836..f170f2072c 100644 --- a/pkg/queue/manager_test.go +++ b/pkg/queue/manager_test.go @@ -166,7 +166,7 @@ func TestUpdateClusterQueue(t *testing.T) { // Put cq2 in the same cohort as cq1. clusterQueues[1].Spec.Cohort = clusterQueues[0].Spec.Cohort - if err := manager.UpdateClusterQueue(ctx, clusterQueues[1]); err != nil { + if err := manager.UpdateClusterQueue(ctx, clusterQueues[1], true); err != nil { t.Fatalf("Failed to update ClusterQueue: %v", err) } @@ -225,7 +225,7 @@ func TestClusterQueueToActive(t *testing.T) { t.Fatalf("Failed adding clusterQueue %v", err) } - if err := manager.UpdateClusterQueue(ctx, runningCq); err != nil { + if err := manager.UpdateClusterQueue(ctx, runningCq, false); err != nil { t.Fatalf("Failed to update ClusterQueue: %v", err) } diff --git a/pkg/scheduler/preemption/preemption.go b/pkg/scheduler/preemption/preemption.go index 71dbdccfb1..36cb2555d4 100644 --- a/pkg/scheduler/preemption/preemption.go +++ b/pkg/scheduler/preemption/preemption.go @@ -389,6 +389,7 @@ func workloadFits(wlReq cache.FlavorResourceQuantities, cq *cache.ClusterQueue, } // candidatesOrdering criteria: +// 0. Workloads already marked for preemption first. // 1. Workloads from other ClusterQueues in the cohort before the ones in the // same ClusterQueue as the preemptor. // 2. Workloads with lower priority first. @@ -397,6 +398,11 @@ func candidatesOrdering(candidates []*workload.Info, cq string, now time.Time) f return func(i, j int) bool { a := candidates[i] b := candidates[j] + aEvicted := meta.IsStatusConditionTrue(a.Obj.Status.Conditions, kueue.WorkloadEvicted) + bEvicted := meta.IsStatusConditionTrue(b.Obj.Status.Conditions, kueue.WorkloadEvicted) + if aEvicted != bEvicted { + return aEvicted + } aInCQ := a.ClusterQueue == cq bInCQ := b.ClusterQueue == cq if aInCQ != bInCQ { @@ -407,7 +413,13 @@ func candidatesOrdering(candidates []*workload.Info, cq string, now time.Time) f if pa != pb { return pa < pb } - return quotaReservationTime(b.Obj, now).Before(quotaReservationTime(a.Obj, now)) + timeA := quotaReservationTime(a.Obj, now) + timeB := quotaReservationTime(b.Obj, now) + if !timeA.Equal(timeB) { + return timeA.After(timeB) + } + // Arbitrary comparison for deterministic sorting. + return a.Obj.UID < b.Obj.UID } } diff --git a/pkg/scheduler/preemption/preemption_test.go b/pkg/scheduler/preemption/preemption_test.go index f4fc733231..bdd8ae3fd1 100644 --- a/pkg/scheduler/preemption/preemption_test.go +++ b/pkg/scheduler/preemption/preemption_test.go @@ -206,7 +206,6 @@ func TestPreemption(t *testing.T) { Cohort("cohort-lend"). ResourceGroup(*utiltesting.MakeFlavorQuotas("default"). Resource(corev1.ResourceCPU, "6", "", "4"). - Resource(corev1.ResourceMemory, "3Gi", "", "2Gi"). Obj(), ). Preemption(kueue.ClusterQueuePreemption{ @@ -217,13 +216,12 @@ func TestPreemption(t *testing.T) { utiltesting.MakeClusterQueue("lend2"). Cohort("cohort-lend"). ResourceGroup(*utiltesting.MakeFlavorQuotas("default"). - Resource(corev1.ResourceCPU, "6", "", "4"). - Resource(corev1.ResourceMemory, "3Gi", "", "2Gi"). + Resource(corev1.ResourceCPU, "6", "", "2"). Obj(), ). Preemption(kueue.ClusterQueuePreemption{ - WithinClusterQueue: kueue.PreemptionPolicyNever, - ReclaimWithinCohort: kueue.PreemptionPolicyAny, + WithinClusterQueue: kueue.PreemptionPolicyLowerPriority, + ReclaimWithinCohort: kueue.PreemptionPolicyLowerPriority, }). Obj(), } @@ -1043,6 +1041,27 @@ func TestPreemption(t *testing.T) { wantPreempted: sets.New("/lend1-low", "/lend2-low"), enableLendingLimit: true, }, + "cannot preempt from other ClusterQueues if exceeds requestable quota including lending limit": { + admitted: []kueue.Workload{ + *utiltesting.MakeWorkload("lend2-low", ""). + Priority(-1). + Request(corev1.ResourceCPU, "10"). + ReserveQuota(utiltesting.MakeAdmission("lend2").Assignment(corev1.ResourceCPU, "default", "10").Obj()). + Obj(), + }, + incoming: utiltesting.MakeWorkload("in", ""). + Request(corev1.ResourceCPU, "9"). + Obj(), + targetCQ: "lend1", + assignment: singlePodSetAssignment(flavorassigner.ResourceAssignment{ + corev1.ResourceCPU: &flavorassigner.FlavorAssignment{ + Name: "default", + Mode: flavorassigner.Preempt, + }, + }), + wantPreempted: nil, + enableLendingLimit: true, + }, } for name, tc := range cases { t.Run(name, func(t *testing.T) { @@ -1108,15 +1127,25 @@ func TestCandidatesOrdering(t *testing.T) { Obj()), workload.NewInfo(utiltesting.MakeWorkload("low", ""). ReserveQuota(utiltesting.MakeAdmission("self").Obj()). - Priority(10). Priority(-10). Obj()), workload.NewInfo(utiltesting.MakeWorkload("other", ""). ReserveQuota(utiltesting.MakeAdmission("other").Obj()). Priority(10). Obj()), - workload.NewInfo(utiltesting.MakeWorkload("old", ""). - ReserveQuota(utiltesting.MakeAdmission("self").Obj()). + workload.NewInfo(utiltesting.MakeWorkload("evicted", ""). + SetOrReplaceCondition(metav1.Condition{ + Type: kueue.WorkloadEvicted, + Status: metav1.ConditionTrue, + }). + Obj()), + workload.NewInfo(utiltesting.MakeWorkload("old-a", ""). + UID("old-a"). + ReserveQuotaAt(utiltesting.MakeAdmission("self").Obj(), now). + Obj()), + workload.NewInfo(utiltesting.MakeWorkload("old-b", ""). + UID("old-b"). + ReserveQuotaAt(utiltesting.MakeAdmission("self").Obj(), now). Obj()), workload.NewInfo(utiltesting.MakeWorkload("current", ""). ReserveQuota(utiltesting.MakeAdmission("self").Obj()). @@ -1132,7 +1161,7 @@ func TestCandidatesOrdering(t *testing.T) { for i, c := range candidates { gotNames[i] = workload.Key(c.Obj) } - wantCandidates := []string{"/other", "/low", "/current", "/old", "/high"} + wantCandidates := []string{"/evicted", "/other", "/low", "/current", "/old-a", "/old-b", "/high"} if diff := cmp.Diff(wantCandidates, gotNames); diff != "" { t.Errorf("Sorted with wrong order (-want,+got):\n%s", diff) } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 0433946328..752ddbca1a 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -202,19 +202,23 @@ func (s *Scheduler) schedule(ctx context.Context) { // head got admitted that should be scheduled in the cohort before the heads // of other clusterQueues. cycleCohortsUsage := cohortsUsage{} + cycleCohortsSkipPreemption := sets.New[string]() for i := range entries { e := &entries[i] - if e.assignment.RepresentativeMode() == flavorassigner.NoFit { + mode := e.assignment.RepresentativeMode() + if mode == flavorassigner.NoFit { continue } cq := snapshot.ClusterQueues[e.ClusterQueue] if cq.Cohort != nil { sum := cycleCohortsUsage.totalUsageForCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage) - // If the workload uses resources that were potentially assumed in this cycle and will no longer fit in the - // cohort. If a resource of a flavor is used only once or for the first time in the cycle the checks done by - // the flavorassigner are still valid. - if cycleCohortsUsage.hasCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage) && !cq.FitInCohort(sum) { + // Check whether there was an assignment in this cycle that could render the next assignments invalid: + // - If the workload no longer fits in the cohort. + // - If there was another assignment in the cohort, then the preemption calculation is no longer valid. + if cycleCohortsUsage.hasCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage) && + ((mode == flavorassigner.Fit && !cq.FitInCohort(sum)) || + (mode == flavorassigner.Preempt && cycleCohortsSkipPreemption.Has(cq.Cohort.Name))) { e.status = skipped e.inadmissibleMsg = "other workloads in the cohort were prioritized" // When the workload needs borrowing and there is another workload in cohort doesn't @@ -241,6 +245,9 @@ func (s *Scheduler) schedule(ctx context.Context) { e.inadmissibleMsg += fmt.Sprintf(". Pending the preemption of %d workload(s)", preempted) e.requeueReason = queue.RequeueReasonPendingPreemption } + if cq.Cohort != nil { + cycleCohortsSkipPreemption.Insert(cq.Cohort.Name) + } } else { log.V(2).Info("Workload requires preemption, but there are no candidate workloads allowed for preemption", "preemption", cq.Preemption) } @@ -262,6 +269,9 @@ func (s *Scheduler) schedule(ctx context.Context) { if err := s.admit(ctx, e, cq.AdmissionChecks); err != nil { e.inadmissibleMsg = fmt.Sprintf("Failed to admit workload: %v", err) } + if cq.Cohort != nil { + cycleCohortsSkipPreemption.Insert(cq.Cohort.Name) + } } // 6. Requeue the heads that were not scheduled. @@ -585,10 +595,11 @@ func (s *Scheduler) requeueAndUpdate(log logr.Logger, ctx context.Context, e ent log.V(2).Info("Workload re-queued", "workload", klog.KObj(e.Obj), "clusterQueue", klog.KRef("", e.ClusterQueue), "queue", klog.KRef(e.Obj.Namespace, e.Obj.Spec.QueueName), "requeueReason", e.requeueReason, "added", added) if e.status == notNominated || e.status == skipped { - workload.UnsetQuotaReservationWithCondition(e.Obj, "Pending", e.inadmissibleMsg) - err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, true) - if err != nil { - log.Error(err, "Could not update Workload status") + if workload.UnsetQuotaReservationWithCondition(e.Obj, "Pending", e.inadmissibleMsg) { + err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, true) + if err != nil { + log.Error(err, "Could not update Workload status") + } } s.recorder.Eventf(e.Obj, corev1.EventTypeNormal, "Pending", api.TruncateEventMessage(e.inadmissibleMsg)) } diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index fc30490347..f969bbf894 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -36,6 +36,7 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" config "sigs.k8s.io/kueue/apis/config/v1beta1" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" @@ -715,6 +716,60 @@ func TestSchedule(t *testing.T) { "eng-alpha/borrower": *utiltesting.MakeAdmission("eng-alpha").Assignment(corev1.ResourceCPU, "on-demand", "60").Obj(), }, }, + "multiple CQs need preemption": { + additionalClusterQueues: []kueue.ClusterQueue{ + *utiltesting.MakeClusterQueue("other-alpha"). + Cohort("other"). + ResourceGroup( + *utiltesting.MakeFlavorQuotas("on-demand"). + Resource(corev1.ResourceCPU, "50", "50").Obj(), + ). + Obj(), + *utiltesting.MakeClusterQueue("other-beta"). + Cohort("other"). + Preemption(kueue.ClusterQueuePreemption{ + ReclaimWithinCohort: kueue.PreemptionPolicyAny, + WithinClusterQueue: kueue.PreemptionPolicyLowerPriority, + }). + ResourceGroup( + *utiltesting.MakeFlavorQuotas("on-demand"). + Resource(corev1.ResourceCPU, "50", "10").Obj(), + ). + Obj(), + }, + additionalLocalQueues: []kueue.LocalQueue{ + *utiltesting.MakeLocalQueue("other", "eng-alpha").ClusterQueue("other-alpha").Obj(), + *utiltesting.MakeLocalQueue("other", "eng-beta").ClusterQueue("other-beta").Obj(), + }, + workloads: []kueue.Workload{ + *utiltesting.MakeWorkload("preemptor", "eng-beta"). + Priority(-1). + Queue("other"). + Request(corev1.ResourceCPU, "1"). + Obj(), + *utiltesting.MakeWorkload("pending", "eng-alpha"). + Priority(1). + Queue("other"). + Request(corev1.ResourceCPU, "1"). + Obj(), + *utiltesting.MakeWorkload("use-all", "eng-alpha"). + Request(corev1.ResourceCPU, "100"). + ReserveQuota(utiltesting.MakeAdmission("other-alpha").Assignment(corev1.ResourceCPU, "on-demand", "100").Obj()). + Obj(), + }, + wantLeft: map[string][]string{ + // Preemptor is not admitted in this cycle. + "other-beta": {"eng-beta/preemptor"}, + }, + wantInadmissibleLeft: map[string][]string{ + "other-alpha": {"eng-alpha/pending"}, + }, + wantPreempted: sets.New("eng-alpha/use-all"), + wantAssignments: map[string]kueue.Admission{ + // Removal from cache for the preempted workloads is deferred until we receive Workload updates + "eng-alpha/use-all": *utiltesting.MakeAdmission("other-alpha").Assignment(corev1.ResourceCPU, "on-demand", "100").Obj(), + }, + }, "cannot borrow resource not listed in clusterQueue": { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("new", "eng-alpha"). @@ -2004,11 +2059,12 @@ func TestRequeueAndUpdate(t *testing.T) { w1 := utiltesting.MakeWorkload("w1", "ns1").Queue(q1.Name).Obj() cases := []struct { - name string - e entry - wantWorkloads map[string][]string - wantInadmissible map[string][]string - wantStatus kueue.WorkloadStatus + name string + e entry + wantWorkloads map[string][]string + wantInadmissible map[string][]string + wantStatus kueue.WorkloadStatus + wantStatusUpdates int }{ { name: "workload didn't fit", @@ -2028,6 +2084,7 @@ func TestRequeueAndUpdate(t *testing.T) { wantInadmissible: map[string][]string{ "cq": {workload.Key(w1)}, }, + wantStatusUpdates: 1, }, { name: "assumed", @@ -2068,6 +2125,7 @@ func TestRequeueAndUpdate(t *testing.T) { wantWorkloads: map[string][]string{ "cq": {workload.Key(w1)}, }, + wantStatusUpdates: 1, }, } @@ -2076,9 +2134,14 @@ func TestRequeueAndUpdate(t *testing.T) { ctx, log := utiltesting.ContextWithLog(t) scheme := runtime.NewScheme() + updates := 0 objs := []client.Object{w1, q1, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "ns1"}}} - clientBuilder := utiltesting.NewClientBuilder().WithObjects(objs...).WithStatusSubresource(objs...) - cl := clientBuilder.Build() + cl := utiltesting.NewClientBuilder().WithInterceptorFuncs(interceptor.Funcs{ + SubResourcePatch: func(ctx context.Context, client client.Client, subResourceName string, obj client.Object, patch client.Patch, opts ...client.SubResourcePatchOption) error { + updates++ + return client.SubResource(subResourceName).Patch(ctx, obj, patch, opts...) + }, + }).WithObjects(objs...).WithStatusSubresource(objs...).Build() broadcaster := record.NewBroadcaster() recorder := broadcaster.NewRecorder(scheme, corev1.EventSource{Component: constants.AdmissionName}) cqCache := cache.New(cl) @@ -2121,6 +2184,11 @@ func TestRequeueAndUpdate(t *testing.T) { if diff := cmp.Diff(tc.wantStatus, updatedWl.Status, ignoreConditionTimestamps); diff != "" { t.Errorf("Unexpected status after updating (-want,+got):\n%s", diff) } + // Make sure a second call doesn't make unnecessary updates. + scheduler.requeueAndUpdate(log, ctx, tc.e) + if updates != tc.wantStatusUpdates { + t.Errorf("Observed %d status updates, want %d", updates, tc.wantStatusUpdates) + } }) } } diff --git a/pkg/util/slices/slices.go b/pkg/util/slices/slices.go index 2d3e3079c4..ae9ebbace5 100644 --- a/pkg/util/slices/slices.go +++ b/pkg/util/slices/slices.go @@ -82,3 +82,14 @@ func CmpNoOrder[E comparable, S ~[]E](a, b S) bool { } return true } + +// Pick creates a new slice containing only the elements for which keep return true. +func Pick[E any, S ~[]E](s S, keep func(*E) bool) S { + var ret S + for i := range s { + if keep(&s[i]) { + ret = append(ret, s[i]) + } + } + return ret +} diff --git a/pkg/util/slices/slices_test.go b/pkg/util/slices/slices_test.go index d3a81912f9..f45e35115d 100644 --- a/pkg/util/slices/slices_test.go +++ b/pkg/util/slices/slices_test.go @@ -102,3 +102,36 @@ func TestToCmpNoOrder(t *testing.T) { }) } } + +func TestPick(t *testing.T) { + cases := map[string]struct { + testSlice []int + want []int + }{ + "nil input": { + testSlice: nil, + want: nil, + }, + "empty input": { + testSlice: []int{}, + want: nil, + }, + "no match": { + testSlice: []int{1, 3, 5, 7, 9}, + want: nil, + }, + "match": { + testSlice: []int{1, 2, 3, 4, 5, 6, 7, 8, 9}, + want: []int{2, 4, 6, 8}, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + result := Pick(tc.testSlice, func(i *int) bool { return (*i)%2 == 0 }) + if diff := cmp.Diff(tc.want, result); diff != "" { + t.Errorf("Unexpected result (-want,+got):\n%s", diff) + } + }) + } +} diff --git a/pkg/util/testing/wrappers.go b/pkg/util/testing/wrappers.go index cd668927f2..1ac7d18398 100644 --- a/pkg/util/testing/wrappers.go +++ b/pkg/util/testing/wrappers.go @@ -82,6 +82,11 @@ func (w *WorkloadWrapper) Clone() *WorkloadWrapper { return &WorkloadWrapper{Workload: *w.DeepCopy()} } +func (w *WorkloadWrapper) UID(uid types.UID) *WorkloadWrapper { + w.Workload.UID = uid + return w +} + func (w *WorkloadWrapper) Finalizers(fin ...string) *WorkloadWrapper { w.ObjectMeta.Finalizers = fin return w @@ -116,11 +121,16 @@ func (w *WorkloadWrapper) Active(a bool) *WorkloadWrapper { // ReserveQuota sets workload admission and adds a "QuotaReserved" status condition func (w *WorkloadWrapper) ReserveQuota(a *kueue.Admission) *WorkloadWrapper { + return w.ReserveQuotaAt(a, time.Now()) +} + +// ReserveQuotaAt sets workload admission and adds a "QuotaReserved" status condition +func (w *WorkloadWrapper) ReserveQuotaAt(a *kueue.Admission, now time.Time) *WorkloadWrapper { w.Status.Admission = a w.Status.Conditions = []metav1.Condition{{ Type: kueue.WorkloadQuotaReserved, Status: metav1.ConditionTrue, - LastTransitionTime: metav1.Now(), + LastTransitionTime: metav1.NewTime(now), Reason: "AdmittedByTest", Message: fmt.Sprintf("Admitted by ClusterQueue %s", w.Status.Admission.ClusterQueue), }} diff --git a/pkg/util/testingjobs/pod/wrappers.go b/pkg/util/testingjobs/pod/wrappers.go index aca94e818f..b6564eb0a9 100644 --- a/pkg/util/testingjobs/pod/wrappers.go +++ b/pkg/util/testingjobs/pod/wrappers.go @@ -217,6 +217,13 @@ func (p *PodWrapper) CreationTimestamp(t time.Time) *PodWrapper { return p } +// DeletionTimestamp sets a creation timestamp for the pod object +func (p *PodWrapper) DeletionTimestamp(t time.Time) *PodWrapper { + timestamp := metav1.NewTime(t).Rfc3339Copy() + p.Pod.DeletionTimestamp = ×tamp + return p +} + // Delete sets a deletion timestamp for the pod object func (p *PodWrapper) Delete() *PodWrapper { t := metav1.NewTime(time.Now()) diff --git a/pkg/webhooks/workload_webhook.go b/pkg/webhooks/workload_webhook.go index d9f2edc3ec..86ee9a0e84 100644 --- a/pkg/webhooks/workload_webhook.go +++ b/pkg/webhooks/workload_webhook.go @@ -50,7 +50,7 @@ func setupWebhookForWorkload(mgr ctrl.Manager) error { Complete() } -// +kubebuilder:webhook:path=/mutate-kueue-x-k8s-io-v1beta1-workload,mutating=true,failurePolicy=fail,sideEffects=None,groups=kueue.x-k8s.io,resources=workloads;workloads/status,verbs=create;update,versions=v1beta1,name=mworkload.kb.io,admissionReviewVersions=v1 +// +kubebuilder:webhook:path=/mutate-kueue-x-k8s-io-v1beta1-workload,mutating=true,failurePolicy=fail,sideEffects=None,groups=kueue.x-k8s.io,resources=workloads,verbs=create,versions=v1beta1,name=mworkload.kb.io,admissionReviewVersions=v1 var _ webhook.CustomDefaulter = &WorkloadWebhook{} @@ -76,10 +76,6 @@ func (w *WorkloadWebhook) Default(ctx context.Context, obj runtime.Object) error } } - // If a deactivated workload is re-activated, we need to reset the RequeueState. - if ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(wl) && workload.HasRequeueState(wl) { - wl.Status.RequeueState = nil - } return nil } diff --git a/pkg/webhooks/workload_webhook_test.go b/pkg/webhooks/workload_webhook_test.go index 78051be403..8414a22286 100644 --- a/pkg/webhooks/workload_webhook_test.go +++ b/pkg/webhooks/workload_webhook_test.go @@ -78,22 +78,6 @@ func TestWorkloadWebhookDefault(t *testing.T) { }, }, }, - "re-activated workload with re-queue state is reset the re-queue state": { - wl: *testingutil.MakeWorkload(testWorkloadName, testWorkloadNamespace). - Condition(metav1.Condition{ - Type: kueue.WorkloadEvicted, - Status: metav1.ConditionTrue, - Reason: kueue.WorkloadEvictedByDeactivation, - }).RequeueState(ptr.To[int32](5), ptr.To(metav1.Now())). - Obj(), - wantWl: *testingutil.MakeWorkload(testWorkloadName, testWorkloadNamespace). - Condition(metav1.Condition{ - Type: kueue.WorkloadEvicted, - Status: metav1.ConditionTrue, - Reason: kueue.WorkloadEvictedByDeactivation, - }). - Obj(), - }, } for name, tc := range cases { t.Run(name, func(t *testing.T) { diff --git a/pkg/workload/admissionchecks.go b/pkg/workload/admissionchecks.go index b1d3ebcb3d..e831144681 100644 --- a/pkg/workload/admissionchecks.go +++ b/pkg/workload/admissionchecks.go @@ -58,8 +58,7 @@ func SyncAdmittedCondition(w *kueue.Workload) bool { newCondition.Message = "The workload has not all checks ready" } - apimeta.SetStatusCondition(&w.Status.Conditions, newCondition) - return true + return apimeta.SetStatusCondition(&w.Status.Conditions, newCondition) } // FindAdmissionCheck - returns a pointer to the check identified by checkName if found in checks. diff --git a/pkg/workload/workload.go b/pkg/workload/workload.go index 67c7fc9f37..e002b0226a 100644 --- a/pkg/workload/workload.go +++ b/pkg/workload/workload.go @@ -301,7 +301,10 @@ func UpdateStatus(ctx context.Context, return c.Status().Patch(ctx, newWl, client.Apply, client.FieldOwner(managerPrefix+"-"+condition.Type)) } -func UnsetQuotaReservationWithCondition(wl *kueue.Workload, reason, message string) { +// UnsetQuotaReservationWithCondition sets the QuotaReserved condition to false and clears +// the admission. +// Returns whether any change was done. +func UnsetQuotaReservationWithCondition(wl *kueue.Workload, reason, message string) bool { condition := metav1.Condition{ Type: kueue.WorkloadQuotaReserved, Status: metav1.ConditionFalse, @@ -309,11 +312,17 @@ func UnsetQuotaReservationWithCondition(wl *kueue.Workload, reason, message stri Reason: reason, Message: api.TruncateConditionMessage(message), } - apimeta.SetStatusCondition(&wl.Status.Conditions, condition) - wl.Status.Admission = nil + changed := apimeta.SetStatusCondition(&wl.Status.Conditions, condition) + if wl.Status.Admission != nil { + wl.Status.Admission = nil + changed = true + } // Reset the admitted condition if necessary. - _ = SyncAdmittedCondition(wl) + if SyncAdmittedCondition(wl) { + changed = true + } + return changed } // BaseSSAWorkload creates a new object based on the input workload that diff --git a/site/config.toml b/site/config.toml index 882e741af2..0675d4e167 100644 --- a/site/config.toml +++ b/site/config.toml @@ -92,7 +92,7 @@ ignoreFiles = [] # The major.minor version tag for the version of the docs represented in this # branch of the repository. Used in the "version-banner" partial to display a # version number for this doc set. - version = "v0.5.3" + version = "v0.6.2" # Flag used in the "version-banner" partial to decide whether to display a # banner on every page indicating that this is an archived version of the docs. diff --git a/test/e2e/config/kustomization.yaml b/test/e2e/config/kustomization.yaml index 56657fe34a..064f203d8b 100644 --- a/test/e2e/config/kustomization.yaml +++ b/test/e2e/config/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization resources: - ../../../config/dev +- ../../../config/visibility replicas: - name: kueue-controller-manager diff --git a/test/e2e/config_1_26/kustomization.yaml b/test/e2e/config_1_26/kustomization.yaml index 11d3f3eecb..70be6af201 100644 --- a/test/e2e/config_1_26/kustomization.yaml +++ b/test/e2e/config_1_26/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization resources: - ../../../config/dev +- ../../../config/visibility patches: - path: manager_e2e_patch.yaml diff --git a/test/e2e/multikueue/e2e_test.go b/test/e2e/multikueue/e2e_test.go index e287148bb3..6b49a084a1 100644 --- a/test/e2e/multikueue/e2e_test.go +++ b/test/e2e/multikueue/e2e_test.go @@ -17,6 +17,8 @@ limitations under the License. package mke2e import ( + "os/exec" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -340,4 +342,50 @@ var _ = ginkgo.Describe("MultiKueue", func() { }) }) }) + ginkgo.When("The connection to a worker cluster is unreliable", func() { + ginkgo.It("Should update the cluster status to reflect the connection state", func() { + ginkgo.By("Disconnecting worker1 container from the kind network", func() { + cmd := exec.Command("docker", "network", "disconnect", "kind", "kind-worker1-control-plane") + output, err := cmd.CombinedOutput() + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output) + }) + + worker1ClusterKey := client.ObjectKeyFromObject(workerCluster1) + + ginkgo.By("Waiting for the cluster do become inactive", func() { + readClient := &kueuealpha.MultiKueueCluster{} + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readClient)).To(gomega.Succeed()) + g.Expect(readClient.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo( + metav1.Condition{ + Type: kueuealpha.MultiKueueClusterActive, + Status: metav1.ConditionFalse, + Reason: "ClientConnectionFailed", + }, + cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime", "Message")))) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Reconnecting worker1 container to the kind network", func() { + cmd := exec.Command("docker", "network", "connect", "kind", "kind-worker1-control-plane") + output, err := cmd.CombinedOutput() + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output) + }) + + ginkgo.By("Waiting for the cluster do become active", func() { + readClient := &kueuealpha.MultiKueueCluster{} + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readClient)).To(gomega.Succeed()) + g.Expect(readClient.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo( + metav1.Condition{ + Type: kueuealpha.MultiKueueClusterActive, + Status: metav1.ConditionTrue, + Reason: "Active", + Message: "Connected", + }, + cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")))) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + }) + }) + }) }) diff --git a/test/e2e/singlecluster/e2e_test.go b/test/e2e/singlecluster/e2e_test.go index 98db10cae2..6f62f3aa2a 100644 --- a/test/e2e/singlecluster/e2e_test.go +++ b/test/e2e/singlecluster/e2e_test.go @@ -357,7 +357,7 @@ var _ = ginkgo.Describe("Kueue", func() { gomega.Expect(util.DeleteAllJobsInNamespace(ctx, k8sClient, ns)).Should(gomega.Succeed()) util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true) util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandRF, true) - gomega.Expect(k8sClient.Delete(ctx, check)).Should(gomega.Succeed()) + util.ExpectAdmissionCheckToBeDeleted(ctx, k8sClient, check, true) }) ginkgo.It("Should unsuspend a job only after all checks are cleared", func() { diff --git a/test/e2e/singlecluster/pod_test.go b/test/e2e/singlecluster/pod_test.go index e940e4e806..c86777fa66 100644 --- a/test/e2e/singlecluster/pod_test.go +++ b/test/e2e/singlecluster/pod_test.go @@ -217,7 +217,7 @@ var _ = ginkgo.Describe("Pod groups", func() { }, util.Timeout, util.Interval).Should(gomega.Equal(corev1.PodFailed)) }) - ginkgo.By("Replacement pod starts", func() { + ginkgo.By("Replacement pod starts, and the failed one is deleted", func() { // Use a pod template that can succeed fast. rep := group[2].DeepCopy() rep.Name = "replacement" @@ -226,6 +226,7 @@ var _ = ginkgo.Describe("Pod groups", func() { var p corev1.Pod g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(rep), &p)).To(gomega.Succeed()) g.Expect(p.Spec.SchedulingGates).To(gomega.BeEmpty()) + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(group[0]), &p)).To(testing.BeNotFoundError()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) }) @@ -404,7 +405,7 @@ var _ = ginkgo.Describe("Pod groups", func() { // For replacement pods use args that let it complete fast. rep.Name = "replacement-for-" + rep.Name rep.Spec.Containers[0].Args = []string{"1ms"} - gomega.Expect(k8sClient.Create(ctx, rep)).To(gomega.Succeed()) + g.Expect(k8sClient.Create(ctx, rep)).To(gomega.Succeed()) replacementPods[origKey] = client.ObjectKeyFromObject(rep) } } @@ -412,6 +413,15 @@ var _ = ginkgo.Describe("Pod groups", func() { return len(replacementPods) }, util.Timeout, util.Interval).Should(gomega.Equal(len(defaultPriorityGroup))) }) + ginkgo.By("Check that the preempted pods are deleted", func() { + gomega.Eventually(func(g gomega.Gomega) { + var p corev1.Pod + for _, origPod := range defaultPriorityGroup { + origKey := client.ObjectKeyFromObject(origPod) + g.Expect(k8sClient.Get(ctx, origKey, &p)).To(testing.BeNotFoundError()) + } + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + }) ginkgo.By("Verify the high-priority pods are scheduled", func() { gomega.Eventually(func(g gomega.Gomega) { diff --git a/test/integration/controller/admissionchecks/provisioning/provisioning_test.go b/test/integration/controller/admissionchecks/provisioning/provisioning_test.go index 65c820f8a0..18ebcdaae9 100644 --- a/test/integration/controller/admissionchecks/provisioning/provisioning_test.go +++ b/test/integration/controller/admissionchecks/provisioning/provisioning_test.go @@ -26,7 +26,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" diff --git a/test/integration/controller/admissionchecks/provisioning/suite_test.go b/test/integration/controller/admissionchecks/provisioning/suite_test.go index 817a376c65..8f13d61fc2 100644 --- a/test/integration/controller/admissionchecks/provisioning/suite_test.go +++ b/test/integration/controller/admissionchecks/provisioning/suite_test.go @@ -23,7 +23,7 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - autoscaling "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/apis/autoscaling.x-k8s.io/v1beta1" + autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" diff --git a/test/integration/controller/jobs/pod/pod_controller_test.go b/test/integration/controller/jobs/pod/pod_controller_test.go index 8a0f42a469..3a52ee545b 100644 --- a/test/integration/controller/jobs/pod/pod_controller_test.go +++ b/test/integration/controller/jobs/pod/pod_controller_test.go @@ -18,6 +18,7 @@ package pod import ( "fmt" + "strconv" "time" "github.com/google/go-cmp/cmp" @@ -29,6 +30,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -503,7 +505,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu pod2 := testingpod.MakePod("test-pod2", ns.Name). Group("test-group"). GroupTotalCount("2"). - Image("test-image", nil). + Request(corev1.ResourceCPU, "1"). Queue("test-queue"). Obj() pod1LookupKey := client.ObjectKeyFromObject(pod1) @@ -529,15 +531,27 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu gomega.Expect(createdWorkload.Spec.QueueName).To(gomega.Equal("test-queue"), "The Workload should have .spec.queueName set") ginkgo.By("checking that all pods in group are unsuspended when workload is admitted", func() { - admission := testing.MakeAdmission(clusterQueue.Name, "7c214403", "41b86f81"). - Assignment(corev1.ResourceCPU, "default", "1"). - AssignmentPodCount(2). - Obj() + admission := testing.MakeAdmission(clusterQueue.Name).PodSets( + kueue.PodSetAssignment{ + Name: "4b0469f7", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + kueue.PodSetAssignment{ + Name: "bf90803c", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + ).Obj() gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod1LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) - util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, nil) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed()) gomega.Expect(createdWorkload.Status.Conditions).Should(gomega.BeComparableTo( @@ -602,7 +616,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu gomega.Expect(createdWorkload.Spec.QueueName).To(gomega.Equal("test-queue"), "The Workload should have .spec.queueName set") originalWorkloadUID := createdWorkload.UID - admission := testing.MakeAdmission(clusterQueue.Name, "7c214403"). + admission := testing.MakeAdmission(clusterQueue.Name, "bf90803c"). Assignment(corev1.ResourceCPU, "default", "1"). AssignmentPodCount(createdWorkload.Spec.PodSets[0].Count). Obj() @@ -663,7 +677,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu GroupTotalCount("2"). Queue("test-queue"). Obj() - replacementPodLookupKey := client.ObjectKeyFromObject(pod2) + replacementPodLookupKey := client.ObjectKeyFromObject(replacementPod) ginkgo.By("creating the replacement pod and readmitting the workload will unsuspended the replacement", func() { gomega.Expect(k8sClient.Create(ctx, replacementPod)).Should(gomega.Succeed()) @@ -724,7 +738,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu gomega.Expect(createdWorkload.Spec.QueueName).To(gomega.Equal("test-queue"), "The Workload should have .spec.queueName set") ginkgo.By("checking that pod is unsuspended when workload is admitted") - admission := testing.MakeAdmission(clusterQueue.Name, "7c214403"). + admission := testing.MakeAdmission(clusterQueue.Name, "bf90803c"). Assignment(corev1.ResourceCPU, "default", "1"). AssignmentPodCount(createdWorkload.Spec.PodSets[0].Count). Obj() @@ -734,6 +748,9 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu podLookupKey := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, podLookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) + // cache the uid of the workload it should be the same until the execution ends otherwise the workload was recreated + wlUid := createdWorkload.UID + ginkgo.By("Failing the running pod") util.SetPodsPhase(ctx, k8sClient, corev1.PodFailed, pod) @@ -756,18 +773,28 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu gomega.Expect(k8sClient.Create(ctx, replacementPod)).Should(gomega.Succeed()) replacementPodLookupKey := client.ObjectKeyFromObject(replacementPod) - // Workload shouldn't be deleted after replacement pod has been created - gomega.Consistently(func() error { - return k8sClient.Get(ctx, wlLookupKey, createdWorkload) - }, util.ConsistentDuration, util.Interval).Should(gomega.Succeed()) + ginkgo.By("Failing the replacement", func() { + util.ExpectPodsFinalized(ctx, k8sClient, podLookupKey) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, replacementPodLookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) + util.SetPodsPhase(ctx, k8sClient, corev1.PodFailed, replacementPod) + }) - util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, replacementPodLookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) + ginkgo.By("Creating a second replacement pod in the group") + replacementPod2 := testingpod.MakePod("replacement-test-pod2", ns.Name). + Group("test-group"). + GroupTotalCount("1"). + Queue("test-queue"). + Obj() + gomega.Expect(k8sClient.Create(ctx, replacementPod2)).Should(gomega.Succeed()) + replacementPod2LookupKey := client.ObjectKeyFromObject(replacementPod) - util.SetPodsPhase(ctx, k8sClient, corev1.PodSucceeded, replacementPod) - util.ExpectPodsFinalized(ctx, k8sClient, replacementPodLookupKey) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, replacementPod2LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) + util.SetPodsPhase(ctx, k8sClient, corev1.PodSucceeded, replacementPod2) + util.ExpectPodsFinalized(ctx, k8sClient, replacementPodLookupKey, replacementPod2LookupKey) gomega.Eventually(func(g gomega.Gomega) []metav1.Condition { g.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed()) + g.Expect(createdWorkload.UID).To(gomega.Equal(wlUid)) return createdWorkload.Status.Conditions }, util.Timeout, util.Interval).Should(gomega.ContainElement( gomega.BeComparableTo( @@ -787,7 +814,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu pod2 := testingpod.MakePod("test-pod2", ns.Name). Group("test-group"). GroupTotalCount("2"). - Image("test-image", nil). + Request(corev1.ResourceCPU, "1"). Queue("test-queue"). Obj() pod1LookupKey := client.ObjectKeyFromObject(pod1) @@ -813,15 +840,27 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu createdPod := &corev1.Pod{} ginkgo.By("checking that all pods in group are unsuspended when workload is admitted", func() { - admission := testing.MakeAdmission(clusterQueue.Name, "7c214403", "41b86f81"). - Assignment(corev1.ResourceCPU, "default", "1"). - AssignmentPodCount(2). - Obj() + admission := testing.MakeAdmission(clusterQueue.Name).PodSets( + kueue.PodSetAssignment{ + Name: "4b0469f7", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + kueue.PodSetAssignment{ + Name: "bf90803c", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + ).Obj() gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod1LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) - util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, nil) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed()) gomega.Expect(createdWorkload.Status.Conditions).Should(gomega.BeComparableTo( @@ -861,7 +900,11 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu replacementPod2LookupKey := client.ObjectKeyFromObject(replacementPod2) ginkgo.By("checking that unretriable replacement pod is allowed to run", func() { - util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, replacementPod2LookupKey, nil) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, replacementPod2LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) + }) + + ginkgo.By("checking that the replaced pod is finalized", func() { + util.ExpectPodsFinalized(ctx, k8sClient, pod1LookupKey) }) ginkgo.By("checking that pod group is finalized when unretriable pod has failed", func() { @@ -880,7 +923,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu ), ), "Expected 'Finished' workload condition") - util.ExpectPodsFinalized(ctx, k8sClient, pod1LookupKey, pod2LookupKey, replacementPod2LookupKey) + util.ExpectPodsFinalized(ctx, k8sClient, pod2LookupKey, replacementPod2LookupKey) }) }) @@ -894,13 +937,13 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu pod2 := testingpod.MakePod("test-pod2", ns.Name). Group("test-group"). GroupTotalCount("2"). - Image("test-image", nil). + Request(corev1.ResourceCPU, "1"). Queue("test-queue"). Obj() excessBasePod := testingpod.MakePod("excess-pod", ns.Name). Group("test-group"). GroupTotalCount("2"). - Image("test-image", nil). + Request(corev1.ResourceCPU, "1"). Queue("test-queue") pod1LookupKey := client.ObjectKeyFromObject(pod1) @@ -940,15 +983,27 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) ginkgo.By("checking that all pods in group are unsuspended when workload is admitted", func() { - admission := testing.MakeAdmission(clusterQueue.Name, "7c214403", "41b86f81"). - Assignment(corev1.ResourceCPU, "default", "1"). - AssignmentPodCount(2). - Obj() + admission := testing.MakeAdmission(clusterQueue.Name).PodSets( + kueue.PodSetAssignment{ + Name: "4b0469f7", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + kueue.PodSetAssignment{ + Name: "bf90803c", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](1), + }, + ).Obj() gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod1LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) - util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, nil) + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, pod2LookupKey, map[string]string{"kubernetes.io/arch": "arm64"}) gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed()) gomega.Expect(createdWorkload.Status.Conditions).Should(gomega.BeComparableTo( @@ -970,17 +1025,77 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) }) + ginkgo.It("Should finalize all Succeeded Pods when deleted", func() { + ginkgo.By("Creating pods with queue name") + // Use a number of Pods big enough to cause conflicts when removing finalizers >50% of the time. + const podCount = 7 + pods := make([]*corev1.Pod, podCount) + for i := range pods { + pods[i] = testingpod.MakePod(fmt.Sprintf("test-pod-%d", i), ns.Name). + Group("test-group"). + GroupTotalCount(strconv.Itoa(podCount)). + Request(corev1.ResourceCPU, "1"). + Queue("test-queue"). + Obj() + gomega.Expect(k8sClient.Create(ctx, pods[i])).Should(gomega.Succeed()) + } + + ginkgo.By("checking that workload is created for the pod group") + wlLookupKey := types.NamespacedName{ + Namespace: pods[0].Namespace, + Name: "test-group", + } + createdWorkload := &kueue.Workload{} + gomega.Eventually(func() error { + return k8sClient.Get(ctx, wlLookupKey, createdWorkload) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + + ginkgo.By("Admitting workload", func() { + admission := testing.MakeAdmission(clusterQueue.Name).PodSets( + kueue.PodSetAssignment{ + Name: "4b0469f7", + Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ + corev1.ResourceCPU: "default", + }, + Count: ptr.To[int32](podCount), + }, + ).Obj() + gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) + util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) + + for i := range pods { + util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, client.ObjectKeyFromObject(pods[i]), map[string]string{"kubernetes.io/arch": "arm64"}) + } + }) + + ginkgo.By("Finishing and deleting Pods", func() { + util.SetPodsPhase(ctx, k8sClient, corev1.PodSucceeded, pods...) + for i := range pods { + gomega.Expect(k8sClient.Delete(ctx, pods[i])).To(gomega.Succeed()) + } + + gomega.Eventually(func(g gomega.Gomega) { + for i := range pods { + key := types.NamespacedName{Namespace: ns.Name, Name: fmt.Sprintf("test-pod-%d", i)} + g.Expect(k8sClient.Get(ctx, key, &corev1.Pod{})).To(testing.BeNotFoundError()) + } + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + }) + + }) + ginkgo.It("Should finalize workload if pods are absent", func() { ginkgo.By("Creating pods with queue name") pod1 := testingpod.MakePod("test-pod1", ns.Name). Group("test-group"). GroupTotalCount("2"). + Request(corev1.ResourceCPU, "1"). Queue("test-queue"). Obj() pod2 := testingpod.MakePod("test-pod2", ns.Name). Group("test-group"). GroupTotalCount("2"). - Image("test-image", nil). + Request(corev1.ResourceCPU, "2"). Queue("test-queue"). Obj() @@ -1128,22 +1243,20 @@ var _ = ginkgo.Describe("Pod controller interacting with scheduler", ginkgo.Orde role1Pod1 := basePod. Clone(). Name("role1-pod1"). - Image("role1-image", nil). Obj() role1Pod2 := basePod. Clone(). Name("role1-pod2"). - Image("role1-image", nil). Obj() role2Pod1 := basePod. Clone(). Name("role2-pod1"). - Image("role2-image", nil). + Request(corev1.ResourceCPU, "1.5"). Obj() role2Pod2 := basePod. Clone(). Name("role2-pod2"). - Image("role2-image", nil). + Request(corev1.ResourceCPU, "1.5"). Obj() ginkgo.By("creating the pods", func() { diff --git a/test/integration/controller/jobs/raycluster/raycluster_controller_test.go b/test/integration/controller/jobs/raycluster/raycluster_controller_test.go index ffc19e4d9d..ef3e063564 100644 --- a/test/integration/controller/jobs/raycluster/raycluster_controller_test.go +++ b/test/integration/controller/jobs/raycluster/raycluster_controller_test.go @@ -581,7 +581,7 @@ var _ = ginkgo.Describe("RayCluster Job controller interacting with scheduler", Should(gomega.Succeed()) return *createdJob2.Spec.Suspend }, util.Timeout, util.Interval).Should(gomega.BeTrue()) - util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0) + util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 1) util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1) ginkgo.By("deleting the job", func() { diff --git a/test/integration/scheduler/podsready/scheduler_test.go b/test/integration/scheduler/podsready/scheduler_test.go index e76ac9b6b9..523890354d 100644 --- a/test/integration/scheduler/podsready/scheduler_test.go +++ b/test/integration/scheduler/podsready/scheduler_test.go @@ -248,7 +248,7 @@ var _ = ginkgo.Describe("SchedulerWithWaitForPodsReady", func() { // To avoid flakiness, we don't verify if the workload has a QuotaReserved=false with pending reason here. }) - ginkgo.It("Should re-admit a timed out workload and deactivate a workload exceeded the re-queue count limit", func() { + ginkgo.It("Should re-admit a timed out workload and deactivate a workload exceeded the re-queue count limit. After that re-activating a workload", func() { ginkgo.By("create the 'prod' workload") prodWl := testing.MakeWorkload("prod", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj() gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed()) @@ -276,6 +276,29 @@ var _ = ginkgo.Describe("SchedulerWithWaitForPodsReady", func() { g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)) g.Expect(ptr.Deref(prodWl.Spec.Active, true)).Should(gomega.BeFalse()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) + + ginkgo.By("verify the re-activated inactive 'prod' workload re-queue state is reset") + // TODO: Once we move a logic to issue the Eviction with InactiveWorkload reason, we need to remove the below updates. + // REF: https://github.com/kubernetes-sigs/kueue/issues/1841 + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed()) + apimeta.SetStatusCondition(&prodWl.Status.Conditions, metav1.Condition{ + Type: kueue.WorkloadEvicted, + Status: metav1.ConditionTrue, + Reason: kueue.WorkloadEvictedByDeactivation, + Message: "evicted by Test", + }) + g.Expect(k8sClient.Status().Update(ctx, prodWl)).Should(gomega.Succeed()) + }, util.Timeout, util.Interval).Should(gomega.Succeed(), "Job reconciler should add an Evicted condition with InactiveWorkload to the Workload") + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed()) + prodWl.Spec.Active = ptr.To(true) + g.Expect(k8sClient.Update(ctx, prodWl)).Should(gomega.Succeed()) + }, util.Timeout, util.Interval).Should(gomega.Succeed(), "Reactivate inactive Workload") + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed()) + g.Expect(prodWl.Status.RequeueState).Should(gomega.BeNil()) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) }) ginkgo.It("Should unblock admission of new workloads in other ClusterQueues once the admitted workload exceeds timeout", func() { diff --git a/test/integration/scheduler/preemption_test.go b/test/integration/scheduler/preemption_test.go index e1e1f89d65..3d92ef0188 100644 --- a/test/integration/scheduler/preemption_test.go +++ b/test/integration/scheduler/preemption_test.go @@ -17,6 +17,7 @@ limitations under the License. package scheduler import ( + "fmt" "time" "github.com/onsi/ginkgo/v2" @@ -306,7 +307,7 @@ var _ = ginkgo.Describe("Preemption", func() { util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, betaCQ.Name, betaLowWl) }) - ginkgo.It("Should preempt all necessary workloads in concurrent scheduling", func() { + ginkgo.It("Should preempt all necessary workloads in concurrent scheduling with different priorities", func() { ginkgo.By("Creating workloads in beta-cq that borrow quota") betaMidWl := testing.MakeWorkload("beta-mid", ns.Name). @@ -345,7 +346,7 @@ var _ = ginkgo.Describe("Preemption", func() { util.FinishEvictionForWorkloads(ctx, k8sClient, betaMidWl) // one of alpha-mid and gamma-mid should be admitted - gomega.Eventually(func() []*kueue.Workload { return util.FilterAdmittedWorkloads(ctx, k8sClient, alphaMidWl, gammaMidWl) }, util.Interval*4, util.Interval).Should(gomega.HaveLen(1)) + util.ExpectWorkloadsToBeAdmittedCount(ctx, k8sClient, 1, alphaMidWl, gammaMidWl) // betaHighWl remains admitted util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, betaCQ.Name, betaHighWl) @@ -356,6 +357,131 @@ var _ = ginkgo.Describe("Preemption", func() { util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, alphaCQ.Name, alphaMidWl) util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, gammaCQ.Name, gammaMidWl) }) + + ginkgo.It("Should preempt all necessary workloads in concurrent scheduling with the same priority", func() { + var betaWls []*kueue.Workload + for i := 0; i < 3; i++ { + wl := testing.MakeWorkload(fmt.Sprintf("beta-%d", i), ns.Name). + Queue(betaQ.Name). + Request(corev1.ResourceCPU, "2"). + Obj() + gomega.Expect(k8sClient.Create(ctx, wl)).To(gomega.Succeed()) + betaWls = append(betaWls, wl) + } + util.ExpectWorkloadsToBeAdmitted(ctx, k8sClient, betaWls...) + + ginkgo.By("Creating preempting pods") + + alphaWl := testing.MakeWorkload("alpha", ns.Name). + Queue(alphaQ.Name). + Request(corev1.ResourceCPU, "2"). + Obj() + gomega.Expect(k8sClient.Create(ctx, alphaWl)).To(gomega.Succeed()) + + gammaWl := testing.MakeWorkload("gamma", ns.Name). + Queue(gammaQ.Name). + Request(corev1.ResourceCPU, "2"). + Obj() + gomega.Expect(k8sClient.Create(ctx, gammaWl)).To(gomega.Succeed()) + + var evictedWorkloads []*kueue.Workload + gomega.Eventually(func() int { + evictedWorkloads = util.FilterEvictedWorkloads(ctx, k8sClient, betaWls...) + return len(evictedWorkloads) + }, util.Timeout, util.Interval).Should(gomega.Equal(1), "Number of evicted workloads") + + ginkgo.By("Finishing eviction for first set of preempted workloads") + util.FinishEvictionForWorkloads(ctx, k8sClient, evictedWorkloads...) + util.ExpectWorkloadsToBeAdmittedCount(ctx, k8sClient, 1, alphaWl, gammaWl) + + gomega.Eventually(func() int { + evictedWorkloads = util.FilterEvictedWorkloads(ctx, k8sClient, betaWls...) + return len(evictedWorkloads) + }, util.Timeout, util.Interval).Should(gomega.Equal(2), "Number of evicted workloads") + + ginkgo.By("Finishing eviction for second set of preempted workloads") + util.FinishEvictionForWorkloads(ctx, k8sClient, evictedWorkloads...) + util.ExpectWorkloadsToBeAdmitted(ctx, k8sClient, alphaWl, gammaWl) + util.ExpectWorkloadsToBeAdmittedCount(ctx, k8sClient, 1, betaWls...) + }) + }) + + ginkgo.Context("In a cohort with StrictFIFO", func() { + var ( + alphaCQ, betaCQ *kueue.ClusterQueue + alphaLQ, betaLQ *kueue.LocalQueue + oneFlavor *kueue.ResourceFlavor + ) + + ginkgo.BeforeEach(func() { + oneFlavor = testing.MakeResourceFlavor("one").Obj() + gomega.Expect(k8sClient.Create(ctx, oneFlavor)).To(gomega.Succeed()) + + alphaCQ = testing.MakeClusterQueue("alpha-cq"). + Cohort("all"). + QueueingStrategy(kueue.StrictFIFO). + ResourceGroup(*testing.MakeFlavorQuotas("one").Resource(corev1.ResourceCPU, "2").Obj()). + Preemption(kueue.ClusterQueuePreemption{ + WithinClusterQueue: kueue.PreemptionPolicyLowerPriority, + ReclaimWithinCohort: kueue.PreemptionPolicyAny, + }). + Obj() + gomega.Expect(k8sClient.Create(ctx, alphaCQ)).To(gomega.Succeed()) + alphaLQ = testing.MakeLocalQueue("alpha-lq", ns.Name).ClusterQueue("alpha-cq").Obj() + gomega.Expect(k8sClient.Create(ctx, alphaLQ)).To(gomega.Succeed()) + betaCQ = testing.MakeClusterQueue("beta-cq"). + Cohort("all"). + QueueingStrategy(kueue.StrictFIFO). + ResourceGroup(*testing.MakeFlavorQuotas("one").Resource(corev1.ResourceCPU, "2").Obj()). + Preemption(kueue.ClusterQueuePreemption{ + WithinClusterQueue: kueue.PreemptionPolicyLowerPriority, + ReclaimWithinCohort: kueue.PreemptionPolicyAny, + }). + Obj() + gomega.Expect(k8sClient.Create(ctx, betaCQ)).To(gomega.Succeed()) + betaLQ = testing.MakeLocalQueue("beta-lq", ns.Name).ClusterQueue("beta-cq").Obj() + gomega.Expect(k8sClient.Create(ctx, betaLQ)).To(gomega.Succeed()) + }) + + ginkgo.AfterEach(func() { + gomega.Expect(util.DeleteWorkloadsInNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) + gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, alphaLQ)).To(gomega.Succeed()) + gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, alphaCQ)).To(gomega.Succeed()) + gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, betaLQ)).To(gomega.Succeed()) + gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, betaCQ)).To(gomega.Succeed()) + }) + + ginkgo.It("Should reclaim from cohort even if another CQ has pending workloads", func() { + useAllAlphaWl := testing.MakeWorkload("use-all", ns.Name). + Queue("alpha-lq"). + Priority(1). + Request(corev1.ResourceCPU, "4"). + Obj() + gomega.Expect(k8sClient.Create(ctx, useAllAlphaWl)).To(gomega.Succeed()) + util.ExpectWorkloadsToBeAdmitted(ctx, k8sClient, useAllAlphaWl) + + pendingAlphaWl := testing.MakeWorkload("pending", ns.Name). + Queue("alpha-lq"). + Priority(0). + Request(corev1.ResourceCPU, "1"). + Obj() + gomega.Expect(k8sClient.Create(ctx, pendingAlphaWl)).To(gomega.Succeed()) + util.ExpectWorkloadsToBePending(ctx, k8sClient, pendingAlphaWl) + + ginkgo.By("Creating a workload to reclaim quota") + + preemptorBetaWl := testing.MakeWorkload("preemptor", ns.Name). + Queue("beta-lq"). + Priority(-1). + Request(corev1.ResourceCPU, "1"). + Obj() + gomega.Expect(k8sClient.Create(ctx, preemptorBetaWl)).To(gomega.Succeed()) + util.ExpectWorkloadsToBePreempted(ctx, k8sClient, useAllAlphaWl) + util.FinishEvictionForWorkloads(ctx, k8sClient, useAllAlphaWl) + util.ExpectWorkloadsToBeAdmitted(ctx, k8sClient, preemptorBetaWl) + util.ExpectWorkloadsToBePending(ctx, k8sClient, useAllAlphaWl, pendingAlphaWl) + }) + }) ginkgo.Context("When most quota is in a shared ClusterQueue in a cohort", func() { @@ -550,7 +676,7 @@ var _ = ginkgo.Describe("Preemption", func() { }) }) - ginkgo.Context("Should be able to preempt when lending limit enabled", func() { + ginkgo.Context("When lending limit enabled", func() { var ( prodCQ *kueue.ClusterQueue devCQ *kueue.ClusterQueue diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index f6d2b00a7d..f139809fae 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -616,12 +616,14 @@ var _ = ginkgo.Describe("Scheduler", func() { util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true) }) ginkgo.It("Should re-enqueue by the update event of ClusterQueue", func() { + metrics.AdmissionAttemptsTotal.Reset() wl := testing.MakeWorkload("on-demand-wl", ns.Name).Queue(queue.Name).Request(corev1.ResourceCPU, "6").Obj() gomega.Expect(k8sClient.Create(ctx, wl)).Should(gomega.Succeed()) util.ExpectWorkloadsToBePending(ctx, k8sClient, wl) util.ExpectPendingWorkloadsMetric(cq, 0, 1) util.ExpectReservingActiveWorkloadsMetric(cq, 0) util.ExpectAdmittedWorkloadsTotalMetric(cq, 0) + util.ExpectAdmissionAttemptsMetric(1, 0) ginkgo.By("updating ClusterQueue") updatedCq := &kueue.ClusterQueue{} @@ -645,6 +647,7 @@ var _ = ginkgo.Describe("Scheduler", func() { util.ExpectPendingWorkloadsMetric(cq, 0, 0) util.ExpectReservingActiveWorkloadsMetric(cq, 1) util.ExpectAdmittedWorkloadsTotalMetric(cq, 1) + util.ExpectAdmissionAttemptsMetric(1, 1) }) }) diff --git a/test/integration/webhook/workload_test.go b/test/integration/webhook/workload_test.go index 8b97543b0f..bf49b6a365 100644 --- a/test/integration/webhook/workload_test.go +++ b/test/integration/webhook/workload_test.go @@ -82,37 +82,6 @@ var _ = ginkgo.Describe("Workload defaulting webhook", func() { gomega.Expect(created.Spec.PodSets[0].Name).Should(gomega.Equal(kueue.DefaultPodSetName)) }) - ginkgo.It("Should reset re-queue state", func() { - ginkgo.By("Creating a new inactive Workload") - workload := testing.MakeWorkload(workloadName, ns.Name). - Active(false). - Obj() - gomega.Expect(k8sClient.Create(ctx, workload)).Should(gomega.Succeed()) - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload)) - workload.Status = kueue.WorkloadStatus{ - Conditions: []metav1.Condition{{ - Type: kueue.WorkloadEvicted, - Reason: kueue.WorkloadEvictedByDeactivation, - Status: metav1.ConditionTrue, - LastTransitionTime: metav1.Now(), - }}, - RequeueState: &kueue.RequeueState{ - Count: ptr.To[int32](10), - RequeueAt: ptr.To(metav1.Now()), - }, - } - g.Expect(k8sClient.Status().Update(ctx, workload)).Should(gomega.Succeed()) - }, util.Timeout, util.Interval).Should(gomega.Succeed()) - ginkgo.By("Activate a Workload") - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload)) - workload.Spec.Active = ptr.To(true) - g.Expect(k8sClient.Update(ctx, workload)).Should(gomega.Succeed()) - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload)) - g.Expect(workload.Status.RequeueState).Should(gomega.BeNil(), "re-queue state should be reset") - }, util.Timeout, util.Interval).Should(gomega.Succeed()) - }) }) }) diff --git a/test/util/e2e.go b/test/util/e2e.go index af537c445c..d4549f750b 100644 --- a/test/util/e2e.go +++ b/test/util/e2e.go @@ -22,6 +22,22 @@ import ( visibilityv1alpha1 "sigs.k8s.io/kueue/client-go/clientset/versioned/typed/visibility/v1alpha1" ) +const ( + // The environment variable for namespace where Kueue is installed + namespaceEnvVar = "NAMESPACE" + + // The namespace where kueue is installed in opendatahub + odhNamespace = "opendatahub" + + // The namespace where kueue is installed in rhoai + rhoaiNamespace = "redhat-ods-applications" + + // The default namespace where kueue is installed + kueueNamespace = "kueue-system" + + undefinedNamespace = "undefined" +) + func CreateClientUsingCluster(kContext string) client.WithWatch { cfg, err := config.GetConfigWithContext(kContext) if err != nil { @@ -70,14 +86,14 @@ func CreateVisibilityClient(user string) visibilityv1alpha1.VisibilityV1alpha1In func WaitForKueueAvailability(ctx context.Context, k8sClient client.Client) { kcmKey := types.NamespacedName{ - Namespace: "kueue-system", + Namespace: GetNamespace(), Name: "kueue-controller-manager", } deployment := &appsv1.Deployment{} pods := corev1.PodList{} gomega.EventuallyWithOffset(1, func(g gomega.Gomega) error { g.Expect(k8sClient.Get(ctx, kcmKey, deployment)).To(gomega.Succeed()) - g.Expect(k8sClient.List(ctx, &pods, client.InNamespace("kueue-system"), client.MatchingLabels(deployment.Spec.Selector.MatchLabels))).To(gomega.Succeed()) + g.Expect(k8sClient.List(ctx, &pods, client.InNamespace(GetNamespace()), client.MatchingLabels(deployment.Spec.Selector.MatchLabels))).To(gomega.Succeed()) for _, pod := range pods.Items { for _, cs := range pod.Status.ContainerStatuses { if cs.RestartCount > 0 { @@ -95,3 +111,23 @@ func WaitForKueueAvailability(ctx context.Context, k8sClient client.Client) { }, StartUpTimeout, Interval).Should(gomega.Succeed()) } + +func GetNamespace() string { + namespace, ok := os.LookupEnv(namespaceEnvVar) + if !ok { + fmt.Printf("Expected environment variable %s is unset, please use this environment variable to specify in which namespace Kueue is installed", namespaceEnvVar) + os.Exit(1) + } + switch namespace { + case "opendatahub": + return odhNamespace + case "redhat-ods-applications": + return rhoaiNamespace + case "kueue-system": + return kueueNamespace + default: + fmt.Printf("Expected environment variable %s contains an incorrect value", namespaceEnvVar) + os.Exit(1) + return undefinedNamespace + } +} diff --git a/test/util/util.go b/test/util/util.go index 882d013bf8..315ce860fe 100644 --- a/test/util/util.go +++ b/test/util/util.go @@ -244,11 +244,21 @@ func ExpectWorkloadsToHaveQuotaReservation(ctx context.Context, k8sClient client } func FilterAdmittedWorkloads(ctx context.Context, k8sClient client.Client, wls ...*kueue.Workload) []*kueue.Workload { + return filterWorkloads(ctx, k8sClient, workload.HasQuotaReservation, wls...) +} + +func FilterEvictedWorkloads(ctx context.Context, k8sClient client.Client, wls ...*kueue.Workload) []*kueue.Workload { + return filterWorkloads(ctx, k8sClient, func(wl *kueue.Workload) bool { + return apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted) + }, wls...) +} + +func filterWorkloads(ctx context.Context, k8sClient client.Client, filter func(*kueue.Workload) bool, wls ...*kueue.Workload) []*kueue.Workload { ret := make([]*kueue.Workload, 0, len(wls)) var updatedWorkload kueue.Workload for _, wl := range wls { err := k8sClient.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWorkload) - if err == nil && workload.HasQuotaReservation(&updatedWorkload) { + if err == nil && filter(&updatedWorkload) { ret = append(ret, wl) } } @@ -274,17 +284,25 @@ func ExpectWorkloadsToBePending(ctx context.Context, k8sClient client.Client, wl } func ExpectWorkloadsToBeAdmitted(ctx context.Context, k8sClient client.Client, wls ...*kueue.Workload) { - gomega.EventuallyWithOffset(1, func() int { + expectWorkloadsToBeAdmittedCountWithOffset(ctx, 2, k8sClient, len(wls), wls...) +} + +func ExpectWorkloadsToBeAdmittedCount(ctx context.Context, k8sClient client.Client, count int, wls ...*kueue.Workload) { + expectWorkloadsToBeAdmittedCountWithOffset(ctx, 2, k8sClient, count, wls...) +} + +func expectWorkloadsToBeAdmittedCountWithOffset(ctx context.Context, offset int, k8sClient client.Client, count int, wls ...*kueue.Workload) { + gomega.EventuallyWithOffset(offset, func() int { admitted := 0 var updatedWorkload kueue.Workload for _, wl := range wls { - gomega.ExpectWithOffset(1, k8sClient.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWorkload)).To(gomega.Succeed()) + gomega.ExpectWithOffset(offset, k8sClient.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWorkload)).To(gomega.Succeed()) if apimeta.IsStatusConditionTrue(updatedWorkload.Status.Conditions, kueue.WorkloadAdmitted) { admitted++ } } return admitted - }, Timeout, Interval).Should(gomega.Equal(len(wls)), "Not enough workloads are admitted") + }, Timeout, Interval).Should(gomega.Equal(count), "Not enough workloads are admitted") } func ExpectWorkloadToFinish(ctx context.Context, k8sClient client.Client, wlKey client.ObjectKey) { @@ -372,6 +390,21 @@ func ExpectWorkloadToBeAdmittedAs(ctx context.Context, k8sClient client.Client, }, Timeout, Interval).Should(gomega.BeComparableTo(admission)) } +var attemptStatuses = []metrics.AdmissionResult{metrics.AdmissionResultInadmissible, metrics.AdmissionResultSuccess} + +func ExpectAdmissionAttemptsMetric(pending, admitted int) { + vals := []int{pending, admitted} + + for i, status := range attemptStatuses { + metric := metrics.AdmissionAttemptsTotal.WithLabelValues(string(status)) + gomega.EventuallyWithOffset(1, func() int { + v, err := testutil.GetCounterMetricValue(metric) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + return int(v) + }, Timeout, Interval).Should(gomega.Equal(vals[i]), "pending_workloads with status=%s", status) + } +} + var pendingStatuses = []string{metrics.PendingStatusActive, metrics.PendingStatusInadmissible} func ExpectPendingWorkloadsMetric(cq *kueue.ClusterQueue, active, inadmissible int) {