Skip to content
This repository has been archived by the owner on Sep 19, 2022. It is now read-only.

Commit

Permalink
Use podGroup instead of PDB in v1beta2 (#150)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnugeorge authored and k8s-ci-robot committed Mar 27, 2019
1 parent fd773c0 commit 261dd72
Show file tree
Hide file tree
Showing 40 changed files with 2,134 additions and 65 deletions.
17 changes: 15 additions & 2 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ required = [
name = "github.com/golang/protobuf"
version = "1.1.0"

[[constraint]]
name = "github.com/kubernetes-sigs/kube-batch"
version = "v0.3"

[[constraint]]
name = "github.com/kubeflow/tf-operator"
branch = "master"

[[constraint]]
name = "github.com/sirupsen/logrus"
version = "v1.0.4"
Expand Down
22 changes: 14 additions & 8 deletions cmd/pytorch-operator.v1beta1/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"os"
"time"

kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
crdclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
Expand Down Expand Up @@ -87,7 +88,7 @@ func Run(opt *options.ServerOption) error {
}

// Create clients.
kubeClientSet, leaderElectionClientSet, pytorchJobClientSet, err := createClientSets(kcfg)
kubeClientSet, leaderElectionClientSet, pytorchJobClientSet, kubeBatchClientSet, err := createClientSets(kcfg)
if err != nil {
return err
}
Expand All @@ -99,7 +100,7 @@ func Run(opt *options.ServerOption) error {
unstructuredInformer := controller.NewUnstructuredPyTorchJobInformer(kcfg, opt.Namespace)

// Create pytorch controller.
tc := controller.NewPyTorchController(unstructuredInformer, kubeClientSet, pytorchJobClientSet, kubeInformerFactory, pytorchJobInformerFactory, *opt)
tc := controller.NewPyTorchController(unstructuredInformer, kubeClientSet, kubeBatchClientSet, pytorchJobClientSet, kubeInformerFactory, pytorchJobInformerFactory, *opt)

// Start informer goroutines.
go kubeInformerFactory.Start(stopCh)
Expand Down Expand Up @@ -154,32 +155,37 @@ func Run(opt *options.ServerOption) error {
return nil
}

func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, jobclientset.Interface, error) {
func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, jobclientset.Interface, kubebatchclient.Interface, error) {

crdClient, err := crdclient.NewForConfig(config)

if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

checkCRDExists(crdClient, v1beta1.PytorchCRD)

kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "pytorch-operator"))
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election"))
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

jobClientSet, err := jobclientset.NewForConfig(config)
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

return kubeClientSet, leaderElectionClientSet, jobClientSet, nil
kubeBatchClientSet, err := kubebatchclient.NewForConfig(restclientset.AddUserAgent(config, "kube-batch"))
if err != nil {
return nil, nil, nil, nil, err
}

return kubeClientSet, leaderElectionClientSet, jobClientSet, kubeBatchClientSet, nil
}

func checkCRDExists(clientset crdclient.Interface, crdName string) {
Expand Down
20 changes: 13 additions & 7 deletions cmd/pytorch-operator.v1beta2/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"os"
"time"

kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -87,7 +88,7 @@ func Run(opt *options.ServerOption) error {
}

// Create clients.
kubeClientSet, leaderElectionClientSet, pytorchJobClientSet, err := createClientSets(kcfg)
kubeClientSet, leaderElectionClientSet, pytorchJobClientSet, kubeBatchClientSet, err := createClientSets(kcfg)
if err != nil {
return err
}
Expand All @@ -102,7 +103,7 @@ func Run(opt *options.ServerOption) error {
unstructuredInformer := controller.NewUnstructuredPyTorchJobInformer(kcfg, opt.Namespace)

// Create pytorch controller.
tc := controller.NewPyTorchController(unstructuredInformer, kubeClientSet, pytorchJobClientSet, kubeInformerFactory, pytorchJobInformerFactory, *opt)
tc := controller.NewPyTorchController(unstructuredInformer, kubeClientSet, kubeBatchClientSet, pytorchJobClientSet, kubeInformerFactory, pytorchJobInformerFactory, *opt)

// Start informer goroutines.
go kubeInformerFactory.Start(stopCh)
Expand Down Expand Up @@ -157,24 +158,29 @@ func Run(opt *options.ServerOption) error {
return nil
}

func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, jobclientset.Interface, error) {
func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, jobclientset.Interface, kubebatchclient.Interface, error) {

kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "pytorch-operator"))
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election"))
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

jobClientSet, err := jobclientset.NewForConfig(config)
if err != nil {
return nil, nil, nil, err
return nil, nil, nil, nil, err
}

return kubeClientSet, leaderElectionClientSet, jobClientSet, nil
kubeBatchClientSet, err := kubebatchclient.NewForConfig(restclientset.AddUserAgent(config, "kube-batch"))
if err != nil {
return nil, nil, nil, nil, err
}

return kubeClientSet, leaderElectionClientSet, jobClientSet, kubeBatchClientSet, nil
}

func checkCRDExists(clientset jobclientset.Interface, namespace string) bool {
Expand Down
39 changes: 39 additions & 0 deletions manifests/podgroup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
name: podgroups.scheduling.incubator.k8s.io
spec:
group: scheduling.incubator.k8s.io
names:
kind: PodGroup
plural: podgroups
scope: Namespaced
validation:
openAPIV3Schema:
properties:
apiVersion:
type: string
kind:
type: string
metadata:
type: object
spec:
properties:
minMember:
format: int32
type: integer
type: object
status:
properties:
succeeded:
format: int32
type: integer
failed:
format: int32
type: integer
running:
format: int32
type: integer
type: object
type: object
version: v1alpha1
20 changes: 13 additions & 7 deletions pkg/controller.v1beta1/pytorch/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"time"

kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -98,6 +99,7 @@ func NewPyTorchController(
// This variable is for unstructured informer.
jobInformer jobinformersv1beta1.PyTorchJobInformer,
kubeClientSet kubeclientset.Interface,
kubeBatchClientSet kubebatchclient.Interface,
jobClientSet jobclientset.Interface,
kubeInformerFactory kubeinformers.SharedInformerFactory,
// This field is not used now but we keep it since it will be used
Expand All @@ -116,7 +118,7 @@ func NewPyTorchController(
// Create base controller
log.Info("Creating Job controller")
jc := jobcontroller.NewJobController(pc, metav1.Duration{Duration: 15 * time.Second},
option.EnableGangScheduling, kubeClientSet, kubeInformerFactory, v1beta1.Plural)
option.EnableGangScheduling, kubeClientSet, kubeBatchClientSet, kubeInformerFactory, v1beta1.Plural)
pc.JobController = jc
// Set sync handler.
pc.syncHandler = pc.syncPyTorchJob
Expand Down Expand Up @@ -303,9 +305,9 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) {

if pc.Config.EnableGangScheduling {
minAvailableReplicas := getTotalReplicas(job)
_, err := pc.SyncPdb(job, minAvailableReplicas)
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
if err != nil {
logger.Warnf("Sync pdb %v: %v", job.Name, err)
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
}
}

Expand Down Expand Up @@ -364,12 +366,12 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta1.PyTorchJob) error
}

if pc.Config.EnableGangScheduling {
pc.Recorder.Event(job, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting pdb")
if err := pc.DeletePdb(job); err != nil {
pc.Recorder.Eventf(job, v1.EventTypeWarning, "FailedDeletePdb", "Error deleting: %v", err)
pc.Recorder.Event(job, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting PodGroup")
if err := pc.DeletePodGroup(job); err != nil {
pc.Recorder.Eventf(job, v1.EventTypeWarning, "FailedDeletePodGroup", "Error deleting: %v", err)
return err
} else {
pc.Recorder.Eventf(job, v1.EventTypeNormal, "SuccessfulDeletePdb", "Deleted pdb: %v", job.Name)
pc.Recorder.Eventf(job, v1.EventTypeNormal, "SuccessfulDeletePodGroup", "Deleted PodGroup: %v", job.Name)

}
}
Expand Down Expand Up @@ -468,6 +470,10 @@ func (pc *PyTorchController) GetReplicaIndexLabelKey() string {
return replicaIndexLabel
}

func (pc *PyTorchController) GetJobRoleKey() string {
return labelPyTorchJobRole
}

func (pc *PyTorchController) ControllerName() string {
return controllerName
}
37 changes: 33 additions & 4 deletions pkg/controller.v1beta1/pytorch/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"time"

"github.com/golang/protobuf/proto"
kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
"k8s.io/api/core/v1"
apiv1beta1 "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -48,6 +49,7 @@ var (
func newPyTorchController(
config *rest.Config,
kubeClientSet kubeclientset.Interface,
kubeBatchClientSet kubebatchclient.Interface,
jobClientSet jobclientset.Interface,
resyncPeriod controller.ResyncPeriodFunc,
option options.ServerOption,
Expand All @@ -60,7 +62,7 @@ func newPyTorchController(

jobInformer := NewUnstructuredPyTorchJobInformer(config, metav1.NamespaceAll)

ctr := NewPyTorchController(jobInformer, kubeClientSet, jobClientSet, kubeInformerFactory, jobInformerFactory, option)
ctr := NewPyTorchController(jobInformer, kubeClientSet, kubeBatchClientSet, jobClientSet, kubeInformerFactory, jobInformerFactory, option)
ctr.PodControl = &controller.FakePodControl{}
ctr.ServiceControl = &control.FakeServiceControl{}
return ctr, kubeInformerFactory, jobInformerFactory
Expand Down Expand Up @@ -227,6 +229,15 @@ func TestNormalPath(t *testing.T) {
},
},
)
// Prepare the kube-batch clientset and controller for the test.
kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{
Host: "",
ContentConfig: rest.ContentConfig{
GroupVersion: &v1.SchemeGroupVersion,
},
},
)

config := &rest.Config{
Host: "",
ContentConfig: rest.ContentConfig{
Expand All @@ -235,7 +246,7 @@ func TestNormalPath(t *testing.T) {
}
option := options.ServerOption{}
jobClientSet := jobclientset.NewForConfigOrDie(config)
ctr, kubeInformerFactory, _ := newPyTorchController(config, kubeClientSet, jobClientSet, controller.NoResyncPeriodFunc, option)
ctr, kubeInformerFactory, _ := newPyTorchController(config, kubeClientSet, kubeBatchClientSet, jobClientSet, controller.NoResyncPeriodFunc, option)
ctr.jobInformerSynced = testutil.AlwaysReady
ctr.PodInformerSynced = testutil.AlwaysReady
ctr.ServiceInformerSynced = testutil.AlwaysReady
Expand Down Expand Up @@ -347,14 +358,23 @@ func TestRun(t *testing.T) {
},
},
)
// Prepare the kube-batch clientset and controller for the test.
kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{
Host: "",
ContentConfig: rest.ContentConfig{
GroupVersion: &v1.SchemeGroupVersion,
},
},
)

config := &rest.Config{
Host: "",
ContentConfig: rest.ContentConfig{
GroupVersion: &v1beta1.SchemeGroupVersion,
},
}
jobClientSet := jobclientset.NewForConfigOrDie(config)
ctr, _, _ := newPyTorchController(config, kubeClientSet, jobClientSet, controller.NoResyncPeriodFunc, options.ServerOption{})
ctr, _, _ := newPyTorchController(config, kubeClientSet, kubeBatchClientSet, jobClientSet, controller.NoResyncPeriodFunc, options.ServerOption{})
ctr.jobInformerSynced = testutil.AlwaysReady
ctr.PodInformerSynced = testutil.AlwaysReady
ctr.ServiceInformerSynced = testutil.AlwaysReady
Expand All @@ -380,12 +400,21 @@ func TestSyncPdb(t *testing.T) {
GroupVersion: &v1beta1.SchemeGroupVersion,
},
}
// Prepare the kube-batch clientset and controller for the test.
kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{
Host: "",
ContentConfig: rest.ContentConfig{
GroupVersion: &v1.SchemeGroupVersion,
},
},
)

jobClientSet := jobclientset.NewForConfigOrDie(config)
kubeClientSet := fake.NewSimpleClientset()
option := options.ServerOption{
EnableGangScheduling: true,
}
ctr, _, _ := newPyTorchController(config, kubeClientSet, jobClientSet, controller.NoResyncPeriodFunc, option)
ctr, _, _ := newPyTorchController(config, kubeClientSet, kubeBatchClientSet, jobClientSet, controller.NoResyncPeriodFunc, option)

type testCase struct {
job *v1beta1.PyTorchJob
Expand Down
Loading

0 comments on commit 261dd72

Please sign in to comment.