diff --git a/.travis.yml b/.travis.yml index dbcb88bcf..182f32fcb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,6 @@ install: - gometalinter --install script: - - go build -o pytorch-operator github.com/kubeflow/pytorch-operator/cmd/pytorch-operator - go build -o pytorch-operator.v2 github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v2 - go build -o pytorch-operator.v1beta1 github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta1 - gometalinter --config=linter_config.json ./pkg/... diff --git a/build_image.sh b/build_image.sh index 64d218dc8..9e510d11d 100755 --- a/build_image.sh +++ b/build_image.sh @@ -20,10 +20,10 @@ echo "Create symlink to GOPATH" mkdir -p ${GOPATH}/src/github.com/kubeflow ln -s ${CONTEXT_DIR} ${GO_DIR} cd ${GO_DIR} -echo "Build pytorch operator v1alpha1 binary" -go build github.com/kubeflow/pytorch-operator/cmd/pytorch-operator echo "Build pytorch operator v1alpha2 binary" go build github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v2 +echo "Build pytorch operator v1beta1 binary" +go build github.com/kubeflow/pytorch-operator/cmd/pytorch-operator.v1beta1 echo "Building container in gcloud" gcloud container builds submit . --tag=${IMAGE}:${TAG} diff --git a/cmd/pytorch-operator/app/options/options.go b/cmd/pytorch-operator/app/options/options.go deleted file mode 100644 index 01762bc6c..000000000 --- a/cmd/pytorch-operator/app/options/options.go +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package options - -import ( - "flag" - "time" -) - -// ServerOption is the main context object for the controller manager. -type ServerOption struct { - ChaosLevel int - ControllerConfigFile string - PrintVersion bool - GCInterval time.Duration - JsonLogFormat bool -} - -// NewServerOption creates a new CMServer with a default config. -func NewServerOption() *ServerOption { - s := ServerOption{} - return &s -} - -// AddFlags adds flags for a specific CMServer to the specified FlagSet -func (s *ServerOption) AddFlags(fs *flag.FlagSet) { - // chaos level will be removed once we have a formal tool to inject failures. - fs.IntVar(&s.ChaosLevel, "chaos-level", -1, "DO NOT USE IN PRODUCTION - level of chaos injected into the PyTorchJob created by the operator.") - fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") - fs.DurationVar(&s.GCInterval, "gc-interval", 10*time.Minute, "GC interval") - fs.StringVar(&s.ControllerConfigFile, "controller-config-file", "", "Path to file containing the controller config.") - fs.BoolVar(&s.JsonLogFormat, "json-log-format", true, "Set true to use json style log format. Set false to use plaintext style log format") -} diff --git a/cmd/pytorch-operator/app/server.go b/cmd/pytorch-operator/app/server.go deleted file mode 100644 index 2b2c716b5..000000000 --- a/cmd/pytorch-operator/app/server.go +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package app - -import ( - "fmt" - "io/ioutil" - "os" - "time" - - "github.com/ghodss/yaml" - log "github.com/sirupsen/logrus" - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - election "k8s.io/client-go/tools/leaderelection" - "k8s.io/client-go/tools/leaderelection/resourcelock" - "k8s.io/client-go/tools/record" - - "github.com/kubeflow/pytorch-operator/cmd/pytorch-operator/app/options" - "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - torchjobclient "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme" - informers "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions" - "github.com/kubeflow/pytorch-operator/pkg/controller" - "github.com/kubeflow/pytorch-operator/pkg/util" - "github.com/kubeflow/pytorch-operator/version" - "github.com/kubeflow/tf-operator/pkg/util/k8sutil" -) - -var ( - leaseDuration = 15 * time.Second - renewDuration = 5 * time.Second - retryPeriod = 3 * time.Second -) - -func Run(opt *options.ServerOption) error { - - // Check if the -version flag was passed and, if so, print the version and exit. - if opt.PrintVersion { - version.PrintVersionAndExit() - } - - namespace := os.Getenv(util.EnvKubeflowNamespace) - if len(namespace) == 0 { - log.Infof("EnvKubeflowNamespace not set, use default namespace") - namespace = metav1.NamespaceDefault - } - - // To help debugging, immediately log version - log.Infof("%+v", version.Info()) - - config, err := k8sutil.GetClusterConfig() - if err != nil { - return err - } - - kubeClient, leaderElectionClient, pytorchJobClient, err := createClients(config) - if err != nil { - return err - } - - controllerConfig := readControllerConfig(opt.ControllerConfigFile) - - neverStop := make(chan struct{}) - defer close(neverStop) - - pytorchJobInformerFactory := informers.NewSharedInformerFactory(pytorchJobClient, time.Second*30) - controller, err := controller.New(kubeClient, pytorchJobClient, *controllerConfig, pytorchJobInformerFactory) - if err != nil { - return err - } - - go pytorchJobInformerFactory.Start(neverStop) - - run := func(stopCh <-chan struct{}) { - controller.Run(1, stopCh) - } - - id, err := os.Hostname() - if err != nil { - return fmt.Errorf("Failed to get hostname: %v", err) - } - - // Prepare event clients. - eventBroadcaster := record.NewBroadcaster() - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "pytorch-operator"}) - - rl := &resourcelock.EndpointsLock{ - EndpointsMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: "pytorch-operator", - }, - Client: leaderElectionClient.CoreV1(), - LockConfig: resourcelock.ResourceLockConfig{ - Identity: id, - EventRecorder: recorder, - }, - } - - election.RunOrDie(election.LeaderElectionConfig{ - Lock: rl, - LeaseDuration: leaseDuration, - RenewDeadline: renewDuration, - RetryPeriod: retryPeriod, - Callbacks: election.LeaderCallbacks{ - OnStartedLeading: run, - OnStoppedLeading: func() { - log.Fatalf("leader election lost") - }, - }, - }) - - return nil -} - -func readControllerConfig(controllerConfigFile string) *v1alpha1.ControllerConfig { - controllerConfig := &v1alpha1.ControllerConfig{} - if controllerConfigFile != "" { - log.Infof("Loading controller config from %v.", controllerConfigFile) - data, err := ioutil.ReadFile(controllerConfigFile) - if err != nil { - log.Fatalf("Could not read file: %v. Error: %v", controllerConfigFile, err) - return controllerConfig - } - err = yaml.Unmarshal(data, controllerConfig) - if err != nil { - log.Fatalf("Could not parse controller config; Error: %v\n", err) - } - log.Infof("ControllerConfig: %v", util.Pformat(controllerConfig)) - } else { - log.Info("No controller_config_file provided; using empty config.") - } - return controllerConfig -} - -func createClients(config *rest.Config) (clientset.Interface, clientset.Interface, torchjobclient.Interface, error) { - kubeClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "pytorchjob_operator")) - if err != nil { - return nil, nil, nil, err - } - - leaderElectionClient, err := clientset.NewForConfig(rest.AddUserAgent(config, "leader-election")) - if err != nil { - return nil, nil, nil, err - } - - pytorchJobClient, err := torchjobclient.NewForConfig(config) - if err != nil { - return nil, nil, nil, err - } - - return kubeClient, leaderElectionClient, pytorchJobClient, nil -} diff --git a/cmd/pytorch-operator/main.go b/cmd/pytorch-operator/main.go deleted file mode 100644 index e8fd6c6bd..000000000 --- a/cmd/pytorch-operator/main.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "flag" - - "github.com/kubeflow/pytorch-operator/cmd/pytorch-operator/app" - "github.com/kubeflow/pytorch-operator/cmd/pytorch-operator/app/options" - "github.com/onrik/logrus/filename" - log "github.com/sirupsen/logrus" -) - -func init() { - // Add filename as one of the fields of the structured log message - filenameHook := filename.NewHook() - filenameHook.Field = "filename" - log.AddHook(filenameHook) -} - -func main() { - s := options.NewServerOption() - s.AddFlags(flag.CommandLine) - - flag.Parse() - - if s.JsonLogFormat { - // Output logs in a json format so that it can be parsed by services like Stackdriver - log.SetFormatter(&log.JSONFormatter{}) - } - - if err := app.Run(s); err != nil { - log.Fatalf("%v\n", err) - } - -} diff --git a/cmd/pytorch-operator/pytorch-operator b/cmd/pytorch-operator/pytorch-operator deleted file mode 100755 index 77fdd3c9a..000000000 Binary files a/cmd/pytorch-operator/pytorch-operator and /dev/null differ diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index ac6407e3c..f1bc2cdfe 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -30,16 +30,9 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge # instead of the $GOPATH directly. For normal projects this can be dropped. ${CODEGEN_PKG}/generate-groups.sh "defaulter,deepcopy,client,informer,lister" \ github.com/kubeflow/pytorch-operator/pkg/client github.com/kubeflow/pytorch-operator/pkg/apis \ - pytorch:v1alpha1,v1alpha2,v1beta1 \ + pytorch:v1alpha2,v1beta1 \ --go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt -echo "Generating defaulters for pytorch v1alpha1" - ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1 \ - -O zz_generated.defaults \ - --go-header-file ./hack/../hack/boilerplate/boilerplate.go.txt \ - --output-package github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1 - - echo "Generating defaulters for pytorch v1alpha2" ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2 \ -O zz_generated.defaults \ diff --git a/linter_config.json b/linter_config.json index 5fe716dd8..a7abb7cbd 100644 --- a/linter_config.json +++ b/linter_config.json @@ -22,9 +22,7 @@ "Exclude": [ "redundant return statement", "comment or be unexported", - "comment on exported", - "pkg/apis/pytorch/v1alpha1/zz_generated.deepcopy.go", - "pkg/apis/pytorch/v1alpha1/zz_generated.defaults.go" + "comment on exported" ], "Deadline": "300s", "Skip": ["pkg/client"] diff --git a/pkg/apis/pytorch/helper/helpers.go b/pkg/apis/pytorch/helper/helpers.go deleted file mode 100644 index 551e5cce4..000000000 --- a/pkg/apis/pytorch/helper/helpers.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package helper - -import ( - "fmt" - - torchv1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "github.com/kubeflow/pytorch-operator/pkg/util" - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - groupVersionKind = schema.GroupVersionKind{ - Group: torchv1.GroupName, - Version: torchv1.GroupVersion, - Kind: torchv1.ResourceKind, - } -) - -// AsOwner make OwnerReference according to the parameter -func AsOwner(pytorchJob *torchv1.PyTorchJob) metav1.OwnerReference { - trueVar := true - // Both api.OwnerReference and metatypes.OwnerReference are combined into that. - return metav1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: pytorchJob.ObjectMeta.Name, - UID: pytorchJob.ObjectMeta.UID, - Controller: &trueVar, - BlockOwnerDeletion: &trueVar, - } -} - -// ConfigureAcceleratorsForPyTorchJobSpec adds any accelerator specific configuration to the pods. -func ConfigureAcceleratorsForPyTorchJobSpec(c *torchv1.PyTorchJobSpec, accelerators map[string]torchv1.AcceleratorConfig) error { - for _, r := range c.ReplicaSpecs { - if r.Template == nil { - return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r)) - } - for i, c := range r.Template.Spec.Containers { - if c.Name == torchv1.DefaultPyTorchContainer { - // Identify the accelerators attached to this container. - a := map[string]torchv1.AcceleratorConfig{} - - lists := []v1.ResourceList{c.Resources.Limits, c.Resources.Requests} - for _, resources := range lists { - for name, _ := range resources { - - if _, ok := accelerators[string(name)]; !ok { - continue - } - - // Add the expected mounts to the pods. - a[string(name)] = accelerators[string(name)] - } - } - - // Add accelerator information to the pod. - for _, config := range a { - for _, v := range config.Volumes { - r.Template.Spec.Volumes = append(r.Template.Spec.Volumes, - v1.Volume{ - Name: v.Name, - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: v.HostPath, - }, - }, - }) - c.VolumeMounts = append(c.VolumeMounts, v1.VolumeMount{ - Name: v.Name, - MountPath: v.MountPath, - }) - } - - for _, envVar := range config.EnvVars { - c.Env = append(c.Env, v1.EnvVar{ - Name: envVar.Name, - Value: envVar.Value, - }) - } - } - r.Template.Spec.Containers[i] = c - break - } - } - } - return nil -} - -// Cleanup cleans up user passed spec, e.g. defaulting, transforming fields. -// TODO: move this to admission controller -func Cleanup(c *torchv1.PyTorchJobSpec) { - // TODO(jlewi): Add logic to cleanup user provided spec; e.g. by filling in defaults. - // We should have default container images so user doesn't have to provide these. -} - -func CRDName() string { - return fmt.Sprintf("%s.%s", torchv1.CRDKindPlural, torchv1.CRDGroup) -} - -func scalingReason(from, to int) string { - return fmt.Sprintf("Current cluster size: %d, desired cluster size: %d", from, to) -} diff --git a/pkg/apis/pytorch/helper/helpers_test.go b/pkg/apis/pytorch/helper/helpers_test.go deleted file mode 100644 index d44877b39..000000000 --- a/pkg/apis/pytorch/helper/helpers_test.go +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package helper - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - torchv1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "github.com/kubeflow/pytorch-operator/pkg/util" - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" -) - -func TestAddAccelertor(t *testing.T) { - type testCase struct { - in *torchv1.PyTorchJobSpec - expected *torchv1.PyTorchJobSpec - config map[string]torchv1.AcceleratorConfig - } - - testCases := []testCase{ - // Case 1 checks that we look at requests. - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - expected: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - VolumeMounts: []v1.VolumeMount{ - { - Name: "cuda-lib", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - Volumes: []v1.Volume{ - { - Name: "cuda-lib", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: "/home/cuda", - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - config: map[string]torchv1.AcceleratorConfig{ - "nvidia-gpu": torchv1.AcceleratorConfig{ - Volumes: []torchv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - // Case 2 checks that we look at limit. - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Limits: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - expected: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Limits: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - VolumeMounts: []v1.VolumeMount{ - { - Name: "cuda-lib", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - Volumes: []v1.Volume{ - { - Name: "cuda-lib", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{ - Path: "/home/cuda", - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - config: map[string]torchv1.AcceleratorConfig{ - "nvidia-gpu": torchv1.AcceleratorConfig{ - Volumes: []torchv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - // Case 3 no GPUs - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - expected: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - }, - }, - }, - config: map[string]torchv1.AcceleratorConfig{ - "nvidia-gpu": torchv1.AcceleratorConfig{ - Volumes: []torchv1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - }, - } - - for _, c := range testCases { - if err := ConfigureAcceleratorsForPyTorchJobSpec(c.in, c.config); err != nil { - t.Errorf("ConfigureAccelerators error; %v", err) - } - if !reflect.DeepEqual(c.in, c.expected) { - t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) - } - } -} diff --git a/pkg/apis/pytorch/v1alpha1/defaults.go b/pkg/apis/pytorch/v1alpha1/defaults.go deleted file mode 100644 index 3c8f50e28..000000000 --- a/pkg/apis/pytorch/v1alpha1/defaults.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - "github.com/golang/protobuf/proto" - "k8s.io/apimachinery/pkg/runtime" -) - -func addDefaultingFuncs(scheme *runtime.Scheme) error { - return RegisterDefaults(scheme) -} - -// SetDefaults_PyTorchJob sets any unspecified values to defaults -func SetDefaults_PyTorchJob(obj *PyTorchJob) { - c := &obj.Spec - - if c.PyTorchImage == "" { - c.PyTorchImage = DefaultPyTorchImage - } - - // Check that each replica has a pytorch container. - for _, r := range c.ReplicaSpecs { - - if r.MasterPort == nil { - r.MasterPort = proto.Int32(MasterPort) - } - - if string(r.PyTorchReplicaType) == "" { - r.PyTorchReplicaType = MASTER - } - - if r.Replicas == nil { - r.Replicas = proto.Int32(Replicas) - } - } - if c.TerminationPolicy == nil { - c.TerminationPolicy = &TerminationPolicySpec{ - Master: &MasterSpec{ - ReplicaName: "MASTER", - ReplicaRank: 0, - }, - } - } - -} diff --git a/pkg/apis/pytorch/v1alpha1/defaults_test.go b/pkg/apis/pytorch/v1alpha1/defaults_test.go deleted file mode 100644 index cb9328c98..000000000 --- a/pkg/apis/pytorch/v1alpha1/defaults_test.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - "github.com/kubeflow/pytorch-operator/pkg/util" - "k8s.io/api/core/v1" -) - -func TestSetDefaults_PyTorchJob(t *testing.T) { - type testCase struct { - in *PyTorchJob - expected *PyTorchJob - } - - testCases := []testCase{ - { - in: &PyTorchJob{ - Spec: PyTorchJobSpec{ - ReplicaSpecs: []*PyTorchReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - }, - }, - expected: &PyTorchJob{ - Spec: PyTorchJobSpec{ - ReplicaSpecs: []*PyTorchReplicaSpec{ - { - Replicas: proto.Int32(1), - MasterPort: proto.Int32(23456), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: MASTER, - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - TerminationPolicy: &TerminationPolicySpec{ - Master: &MasterSpec{ - ReplicaName: "MASTER", - ReplicaRank: 0, - }, - }, - }, - }, - }, - { - in: &PyTorchJob{ - Spec: PyTorchJobSpec{ - ReplicaSpecs: []*PyTorchReplicaSpec{ - { - PyTorchReplicaType: WORKER, - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - }, - }, - expected: &PyTorchJob{ - Spec: PyTorchJobSpec{ - ReplicaSpecs: []*PyTorchReplicaSpec{ - { - Replicas: proto.Int32(1), - MasterPort: proto.Int32(23456), - PyTorchReplicaType: WORKER, - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - TerminationPolicy: &TerminationPolicySpec{ - Master: &MasterSpec{ - ReplicaName: "MASTER", - ReplicaRank: 0, - }, - }, - }, - }, - }, - } - - for _, c := range testCases { - SetDefaults_PyTorchJob(c.in) - if !reflect.DeepEqual(c.in, c.expected) { - t.Errorf("Want\n%v; Got\n %v", util.Pformat(c.expected), util.Pformat(c.in)) - } - } -} diff --git a/pkg/apis/pytorch/v1alpha1/doc.go b/pkg/apis/pytorch/v1alpha1/doc.go deleted file mode 100644 index 92db83ef1..000000000 --- a/pkg/apis/pytorch/v1alpha1/doc.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +k8s:deepcopy-gen=package,register -// +k8s:defaulter-gen=TypeMeta - -// Package v1alpha1 is the v1alpha1 version of the API. -// +groupName=kubeflow.org -package v1alpha1 diff --git a/pkg/apis/pytorch/v1alpha1/register.go b/pkg/apis/pytorch/v1alpha1/register.go deleted file mode 100644 index a1c4ba363..000000000 --- a/pkg/apis/pytorch/v1alpha1/register.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -var ( - SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) - AddToScheme = SchemeBuilder.AddToScheme -) - -const ( - // GroupName is the group name use in this package. - GroupName = "kubeflow.org" - // ResourceKind is the kind name. - ResourceKind = "PyTorchJob" - // GroupVersion is the version. - GroupVersion = "v1alpha1" -) - -// SchemeGroupVersion is the group version used to register these objects. -var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: CRDVersion} - -func init() { - // We only register manually written functions here. The registration of the - // generated functions takes place in the generated files. The separation - // makes the code compile even when the generated files are missing. - SchemeBuilder.Register(addDefaultingFuncs) -} - -// Resource takes an unqualified resource and returns a Group-qualified GroupResource. -func Resource(resource string) schema.GroupResource { - return SchemeGroupVersion.WithResource(resource).GroupResource() -} - -// addKnownTypes adds the set of types defined in this package to the supplied scheme. -func addKnownTypes(scheme *runtime.Scheme) error { - scheme.AddKnownTypes(SchemeGroupVersion, - &PyTorchJob{}, - &PyTorchJob{}, - ) - metav1.AddToGroupVersion(scheme, SchemeGroupVersion) - return nil -} diff --git a/pkg/apis/pytorch/v1alpha1/types.go b/pkg/apis/pytorch/v1alpha1/types.go deleted file mode 100644 index c211189e8..000000000 --- a/pkg/apis/pytorch/v1alpha1/types.go +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package v1alpha1 - -import ( - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -const ( - CRDKind = "pytorchjob" - CRDKindPlural = "pytorchjobs" - CRDGroup = "kubeflow.org" - CRDVersion = "v1alpha1" - // Value of the APP label that gets applied to a lot of entities. - AppLabel = "pytorch-job" - // Defaults for the Spec - MasterPort = 23456 - Replicas = 1 -) - -// +genclient -// +genclient:noStatus -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// +resource:path=pytorchjob - -// PyTorchJob describes pytorchjob info -type PyTorchJob struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` - Spec PyTorchJobSpec `json:"spec"` - Status PyTorchJobStatus `json:"status"` -} - -type PyTorchJobSpec struct { - // TODO(jlewi): Can we we get rid of this and use some value from Kubernetes or a random ide. - RuntimeId string - - // ReplicaSpecs specifies the PyTorch replicas to run. - ReplicaSpecs []*PyTorchReplicaSpec `json:"replicaSpecs"` - - // PyTorchImage defines the tensorflow docker image that should be used for default parameter server - PyTorchImage string `json:"pytorchImage,omitempty"` - - // TerminationPolicy specifies the condition that the pytorchjob should be considered finished. - TerminationPolicy *TerminationPolicySpec `json:"terminationPolicy,omitempty"` - - // SchedulerName specifies the name of scheduler which should handle the PyTorchJob - SchedulerName string `json:"schedulerName,omitempty"` -} - -type TerminationPolicySpec struct { - // Master policy waits for a particular process (which is the master) to exit. - Master *MasterSpec `json:"master,omitempty"` -} - -type MasterSpec struct { - ReplicaName string `json:"replicaName"` - ReplicaRank int `json:"replicaRank"` -} - -// PyTorchReplicaType determines how a set of PyTorch processes are handled. -type PyTorchReplicaType string - -const ( - MASTER PyTorchReplicaType = "MASTER" - WORKER PyTorchReplicaType = "WORKER" -) - -const ( - DefaultPyTorchContainer string = "pytorch" - DefaultPyTorchImage string = "pytorch/pytorch:v0.2" -) - -// TODO(jlewi): We probably want to add a name field. This would allow us to have more than 1 type of each worker. -// This might be useful if you wanted to have a separate set of workers to do eval. -type PyTorchReplicaSpec struct { - // Replicas is the number of desired replicas. - // This is a pointer to distinguish between explicit zero and unspecified. - // Defaults to 1. - // More info: http://kubernetes.io/docs/user-guide/replication-controller#what-is-a-replication-controller - // +optional - Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,1,opt,name=replicas"` - Template *v1.PodTemplateSpec `json:"template,omitempty" protobuf:"bytes,3,opt,name=template"` - // MasterPort is the port to use for PyTorch services. - MasterPort *int32 `json:"masterPort,omitempty" protobuf:"varint,1,opt,name=masterPort"` - PyTorchReplicaType `json:"replicaType"` -} - -type PyTorchJobPhase string - -const ( - PyTorchJobPhaseNone PyTorchJobPhase = "" - PyTorchJobPhaseCreating PyTorchJobPhase = "Creating" - PyTorchJobPhaseRunning PyTorchJobPhase = "Running" - PyTorchJobPhaseCleanUp PyTorchJobPhase = "CleanUp" - PyTorchJobPhaseFailed PyTorchJobPhase = "Failed" - PyTorchJobPhaseDone PyTorchJobPhase = "Done" -) - -type State string - -const ( - StateUnknown State = "Unknown" - StateRunning State = "Running" - StateSucceeded State = "Succeeded" - StateFailed State = "Failed" -) - -type PyTorchJobStatus struct { - // Phase is the PyTorchJob running phase - Phase PyTorchJobPhase `json:"phase"` - Reason string `json:"reason"` - - // State indicates the state of the job. - State State `json:"state"` - - // ReplicaStatuses specifies the status of each PyTorch replica. - ReplicaStatuses []*PyTorchReplicaStatus `json:"replicaStatuses"` -} - -type ReplicaState string - -const ( - ReplicaStateUnknown ReplicaState = "Unknown" - ReplicaStateRunning ReplicaState = "Running" - ReplicaStateFailed ReplicaState = "Failed" - ReplicaStateSucceeded ReplicaState = "Succeeded" -) - -type PyTorchReplicaStatus struct { - PyTorchReplicaType `json:"replica_type"` - - // State is the overall state of the replica - State ReplicaState `json:"state"` - - // ReplicasStates provides the number of replicas in each status. - ReplicasStates map[ReplicaState]int -} - -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object -// +resource:path=pytorchjobs - -// PyTorchJobList is a list of PyTorchJobs clusters. -type PyTorchJobList struct { - metav1.TypeMeta `json:",inline"` - // Standard list metadata - // More info: http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata - metav1.ListMeta `json:"metadata,omitempty"` - // Items is a list of PyTorchJobs - Items []PyTorchJob `json:"items"` -} - -type ControllerConfig struct { - // Accelerators is a map from the name of the accelerator to the config for that accelerator. - // This should match the value specified as a container limit. - // e.g. alpha.kubernetes.io/nvidia-gpu - Accelerators map[string]AcceleratorConfig - - // Path to the file containing the grpc server source - GrpcServerFilePath string -} - -// AcceleratorVolume represents a host path that must be mounted into -// each container that needs to use GPUs. -type AcceleratorVolume struct { - Name string - HostPath string - MountPath string -} - -type AcceleratorConfig struct { - Volumes []AcceleratorVolume - EnvVars []EnvironmentVariableConfig -} - -type EnvironmentVariableConfig struct { - Name string - Value string -} diff --git a/pkg/apis/pytorch/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pytorch/v1alpha1/zz_generated.deepcopy.go deleted file mode 100644 index fae8ee197..000000000 --- a/pkg/apis/pytorch/v1alpha1/zz_generated.deepcopy.go +++ /dev/null @@ -1,316 +0,0 @@ -// +build !ignore_autogenerated - -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by deepcopy-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - v1 "k8s.io/api/core/v1" - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AcceleratorConfig) DeepCopyInto(out *AcceleratorConfig) { - *out = *in - if in.Volumes != nil { - in, out := &in.Volumes, &out.Volumes - *out = make([]AcceleratorVolume, len(*in)) - copy(*out, *in) - } - if in.EnvVars != nil { - in, out := &in.EnvVars, &out.EnvVars - *out = make([]EnvironmentVariableConfig, len(*in)) - copy(*out, *in) - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorConfig. -func (in *AcceleratorConfig) DeepCopy() *AcceleratorConfig { - if in == nil { - return nil - } - out := new(AcceleratorConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AcceleratorVolume) DeepCopyInto(out *AcceleratorVolume) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcceleratorVolume. -func (in *AcceleratorVolume) DeepCopy() *AcceleratorVolume { - if in == nil { - return nil - } - out := new(AcceleratorVolume) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ControllerConfig) DeepCopyInto(out *ControllerConfig) { - *out = *in - if in.Accelerators != nil { - in, out := &in.Accelerators, &out.Accelerators - *out = make(map[string]AcceleratorConfig, len(*in)) - for key, val := range *in { - (*out)[key] = *val.DeepCopy() - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControllerConfig. -func (in *ControllerConfig) DeepCopy() *ControllerConfig { - if in == nil { - return nil - } - out := new(ControllerConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvironmentVariableConfig) DeepCopyInto(out *EnvironmentVariableConfig) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvironmentVariableConfig. -func (in *EnvironmentVariableConfig) DeepCopy() *EnvironmentVariableConfig { - if in == nil { - return nil - } - out := new(EnvironmentVariableConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MasterSpec) DeepCopyInto(out *MasterSpec) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MasterSpec. -func (in *MasterSpec) DeepCopy() *MasterSpec { - if in == nil { - return nil - } - out := new(MasterSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchJob) DeepCopyInto(out *PyTorchJob) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchJob. -func (in *PyTorchJob) DeepCopy() *PyTorchJob { - if in == nil { - return nil - } - out := new(PyTorchJob) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *PyTorchJob) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchJobList) DeepCopyInto(out *PyTorchJobList) { - *out = *in - out.TypeMeta = in.TypeMeta - out.ListMeta = in.ListMeta - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]PyTorchJob, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchJobList. -func (in *PyTorchJobList) DeepCopy() *PyTorchJobList { - if in == nil { - return nil - } - out := new(PyTorchJobList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *PyTorchJobList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchJobSpec) DeepCopyInto(out *PyTorchJobSpec) { - *out = *in - if in.ReplicaSpecs != nil { - in, out := &in.ReplicaSpecs, &out.ReplicaSpecs - *out = make([]*PyTorchReplicaSpec, len(*in)) - for i := range *in { - if (*in)[i] != nil { - in, out := &(*in)[i], &(*out)[i] - *out = new(PyTorchReplicaSpec) - (*in).DeepCopyInto(*out) - } - } - } - if in.TerminationPolicy != nil { - in, out := &in.TerminationPolicy, &out.TerminationPolicy - *out = new(TerminationPolicySpec) - (*in).DeepCopyInto(*out) - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchJobSpec. -func (in *PyTorchJobSpec) DeepCopy() *PyTorchJobSpec { - if in == nil { - return nil - } - out := new(PyTorchJobSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchJobStatus) DeepCopyInto(out *PyTorchJobStatus) { - *out = *in - if in.ReplicaStatuses != nil { - in, out := &in.ReplicaStatuses, &out.ReplicaStatuses - *out = make([]*PyTorchReplicaStatus, len(*in)) - for i := range *in { - if (*in)[i] != nil { - in, out := &(*in)[i], &(*out)[i] - *out = new(PyTorchReplicaStatus) - (*in).DeepCopyInto(*out) - } - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchJobStatus. -func (in *PyTorchJobStatus) DeepCopy() *PyTorchJobStatus { - if in == nil { - return nil - } - out := new(PyTorchJobStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchReplicaSpec) DeepCopyInto(out *PyTorchReplicaSpec) { - *out = *in - if in.Replicas != nil { - in, out := &in.Replicas, &out.Replicas - *out = new(int32) - **out = **in - } - if in.Template != nil { - in, out := &in.Template, &out.Template - *out = new(v1.PodTemplateSpec) - (*in).DeepCopyInto(*out) - } - if in.MasterPort != nil { - in, out := &in.MasterPort, &out.MasterPort - *out = new(int32) - **out = **in - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchReplicaSpec. -func (in *PyTorchReplicaSpec) DeepCopy() *PyTorchReplicaSpec { - if in == nil { - return nil - } - out := new(PyTorchReplicaSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PyTorchReplicaStatus) DeepCopyInto(out *PyTorchReplicaStatus) { - *out = *in - if in.ReplicasStates != nil { - in, out := &in.ReplicasStates, &out.ReplicasStates - *out = make(map[ReplicaState]int, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PyTorchReplicaStatus. -func (in *PyTorchReplicaStatus) DeepCopy() *PyTorchReplicaStatus { - if in == nil { - return nil - } - out := new(PyTorchReplicaStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TerminationPolicySpec) DeepCopyInto(out *TerminationPolicySpec) { - *out = *in - if in.Master != nil { - in, out := &in.Master, &out.Master - *out = new(MasterSpec) - **out = **in - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TerminationPolicySpec. -func (in *TerminationPolicySpec) DeepCopy() *TerminationPolicySpec { - if in == nil { - return nil - } - out := new(TerminationPolicySpec) - in.DeepCopyInto(out) - return out -} diff --git a/pkg/apis/pytorch/v1alpha1/zz_generated.defaults.go b/pkg/apis/pytorch/v1alpha1/zz_generated.defaults.go deleted file mode 100644 index 43352ddf2..000000000 --- a/pkg/apis/pytorch/v1alpha1/zz_generated.defaults.go +++ /dev/null @@ -1,43 +0,0 @@ -// +build !ignore_autogenerated - -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by defaulter-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// RegisterDefaults adds defaulters functions to the given scheme. -// Public to allow building arbitrary schemes. -// All generated defaulters are covering - they call all nested defaulters. -func RegisterDefaults(scheme *runtime.Scheme) error { - scheme.AddTypeDefaultingFunc(&PyTorchJob{}, func(obj interface{}) { SetObjectDefaults_PyTorchJob(obj.(*PyTorchJob)) }) - scheme.AddTypeDefaultingFunc(&PyTorchJobList{}, func(obj interface{}) { SetObjectDefaults_PyTorchJobList(obj.(*PyTorchJobList)) }) - return nil -} - -func SetObjectDefaults_PyTorchJob(in *PyTorchJob) { - SetDefaults_PyTorchJob(in) -} - -func SetObjectDefaults_PyTorchJobList(in *PyTorchJobList) { - for i := range in.Items { - a := &in.Items[i] - SetObjectDefaults_PyTorchJob(a) - } -} diff --git a/pkg/apis/pytorch/validation/validation.go b/pkg/apis/pytorch/validation/validation.go index 833461af3..75b3fd294 100644 --- a/pkg/apis/pytorch/validation/validation.go +++ b/pkg/apis/pytorch/validation/validation.go @@ -15,73 +15,14 @@ package validation import ( - "errors" "fmt" log "github.com/sirupsen/logrus" - torchv1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" torchv2 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2" torchv1beta1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1" - "github.com/kubeflow/pytorch-operator/pkg/util" ) -// ValidatePyTorchJobSpec checks that the PyTorchJobSpec is valid. -func ValidatePyTorchJobSpec(c *torchv1.PyTorchJobSpec) error { - if c.TerminationPolicy == nil || c.TerminationPolicy.Master == nil { - return fmt.Errorf("invalid termination policy: %v", c.TerminationPolicy) - } - - masterExists := false - - // Check that each replica has a TensorFlow container and a master. - for _, r := range c.ReplicaSpecs { - found := false - if r.Template == nil { - return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r)) - } - - if r.PyTorchReplicaType == torchv1.PyTorchReplicaType(c.TerminationPolicy.Master.ReplicaName) { - masterExists = true - } - - if r.MasterPort == nil { - return errors.New("PyTorchReplicaSpec.MasterPort can't be nil.") - } - - // Make sure the replica type is valid. - validReplicaTypes := []torchv1.PyTorchReplicaType{torchv1.MASTER, torchv1.WORKER} - - isValidReplicaType := false - for _, t := range validReplicaTypes { - if t == r.PyTorchReplicaType { - isValidReplicaType = true - break - } - } - - if !isValidReplicaType { - return fmt.Errorf("tfReplicaSpec.PyTorchReplicaType is %v but must be one of %v", r.PyTorchReplicaType, validReplicaTypes) - } - - for _, c := range r.Template.Spec.Containers { - if c.Name == torchv1.DefaultPyTorchContainer { - found = true - break - } - } - if !found { - return fmt.Errorf("Replica type %v is missing a container named %s", r.PyTorchReplicaType, torchv1.DefaultPyTorchContainer) - } - } - - if !masterExists { - return fmt.Errorf("Missing ReplicaSpec for master: %v", c.TerminationPolicy.Master.ReplicaName) - } - - return nil -} - func ValidateAlphaTwoPyTorchJobSpec(c *torchv2.PyTorchJobSpec) error { if c.PyTorchReplicaSpecs == nil { return fmt.Errorf("PyTorchJobSpec is not valid") diff --git a/pkg/apis/pytorch/validation/validation_test.go b/pkg/apis/pytorch/validation/validation_test.go index bbcc5ad7f..d7874915e 100644 --- a/pkg/apis/pytorch/validation/validation_test.go +++ b/pkg/apis/pytorch/validation/validation_test.go @@ -17,12 +17,10 @@ package validation import ( "testing" - torchv1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" torchv2 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2" torchv1beta1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1" common "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta1" - "github.com/gogo/protobuf/proto" "k8s.io/api/core/v1" ) @@ -205,92 +203,3 @@ func TestValidateAlphaTwoPyTorchJobSpec(t *testing.T) { } } } - -func TestValidate(t *testing.T) { - type testCase struct { - in *torchv1.PyTorchJobSpec - expectingError bool - } - - testCases := []testCase{ - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1.MASTER, - Replicas: proto.Int32(1), - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - }, - expectingError: false, - }, - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - Replicas: proto.Int32(1), - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - }, - expectingError: true, - }, - { - in: &torchv1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1.PyTorchReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1.WORKER, - Replicas: proto.Int32(1), - }, - }, - PyTorchImage: "pytorch/pytorch:v0.2", - TerminationPolicy: &torchv1.TerminationPolicySpec{ - Master: &torchv1.MasterSpec{ - ReplicaName: "WORKER", - ReplicaRank: 0, - }, - }, - }, - expectingError: false, - }, - } - - for _, c := range testCases { - job := &torchv1.PyTorchJob{ - Spec: *c.in, - } - torchv1.SetObjectDefaults_PyTorchJob(job) - if err := ValidatePyTorchJobSpec(&job.Spec); (err != nil) != c.expectingError { - t.Errorf("unexpected validation result: %v", err) - } - } -} diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go index b3a3f6139..b527d6cd8 100644 --- a/pkg/client/clientset/versioned/clientset.go +++ b/pkg/client/clientset/versioned/clientset.go @@ -18,7 +18,6 @@ package versioned import ( glog "github.com/golang/glog" - kubeflowv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" kubeflowv1beta1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1beta1" discovery "k8s.io/client-go/discovery" @@ -28,7 +27,6 @@ import ( type Interface interface { Discovery() discovery.DiscoveryInterface - KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface KubeflowV1beta1() kubeflowv1beta1.KubeflowV1beta1Interface // Deprecated: please explicitly pick a version if possible. @@ -39,16 +37,10 @@ type Interface interface { // version included in a Clientset. type Clientset struct { *discovery.DiscoveryClient - kubeflowV1alpha1 *kubeflowv1alpha1.KubeflowV1alpha1Client kubeflowV1alpha2 *kubeflowv1alpha2.KubeflowV1alpha2Client kubeflowV1beta1 *kubeflowv1beta1.KubeflowV1beta1Client } -// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client -func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return c.kubeflowV1alpha1 -} - // KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { return c.kubeflowV1alpha2 @@ -81,10 +73,6 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { } var cs Clientset var err error - cs.kubeflowV1alpha1, err = kubeflowv1alpha1.NewForConfig(&configShallowCopy) - if err != nil { - return nil, err - } cs.kubeflowV1alpha2, err = kubeflowv1alpha2.NewForConfig(&configShallowCopy) if err != nil { return nil, err @@ -106,7 +94,6 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset - cs.kubeflowV1alpha1 = kubeflowv1alpha1.NewForConfigOrDie(c) cs.kubeflowV1alpha2 = kubeflowv1alpha2.NewForConfigOrDie(c) cs.kubeflowV1beta1 = kubeflowv1beta1.NewForConfigOrDie(c) @@ -117,7 +104,6 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. func New(c rest.Interface) *Clientset { var cs Clientset - cs.kubeflowV1alpha1 = kubeflowv1alpha1.New(c) cs.kubeflowV1alpha2 = kubeflowv1alpha2.New(c) cs.kubeflowV1beta1 = kubeflowv1beta1.New(c) diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go index 454af2e6c..95e5c7385 100644 --- a/pkg/client/clientset/versioned/fake/clientset_generated.go +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -18,8 +18,6 @@ package fake import ( clientset "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned" - kubeflowv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" - fakekubeflowv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake" kubeflowv1alpha2 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2" fakekubeflowv1alpha2 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha2/fake" kubeflowv1beta1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1beta1" @@ -64,11 +62,6 @@ func (c *Clientset) Discovery() discovery.DiscoveryInterface { var _ clientset.Interface = &Clientset{} -// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client -func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { - return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} -} - // KubeflowV1alpha2 retrieves the KubeflowV1alpha2Client func (c *Clientset) KubeflowV1alpha2() kubeflowv1alpha2.KubeflowV1alpha2Interface { return &fakekubeflowv1alpha2.FakeKubeflowV1alpha2{Fake: &c.Fake} diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go index b742e0ac9..c0a541fb7 100644 --- a/pkg/client/clientset/versioned/fake/register.go +++ b/pkg/client/clientset/versioned/fake/register.go @@ -17,7 +17,6 @@ package fake import ( - kubeflowv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2" kubeflowv1beta1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -50,7 +49,6 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. func AddToScheme(scheme *runtime.Scheme) { - kubeflowv1alpha1.AddToScheme(scheme) kubeflowv1alpha2.AddToScheme(scheme) kubeflowv1beta1.AddToScheme(scheme) diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go index 2f4bb8fe9..c45005985 100644 --- a/pkg/client/clientset/versioned/scheme/register.go +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -17,7 +17,6 @@ package scheme import ( - kubeflowv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" kubeflowv1alpha2 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2" kubeflowv1beta1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -50,7 +49,6 @@ func init() { // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. func AddToScheme(scheme *runtime.Scheme) { - kubeflowv1alpha1.AddToScheme(scheme) kubeflowv1alpha2.AddToScheme(scheme) kubeflowv1beta1.AddToScheme(scheme) diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go deleted file mode 100644 index 7c6f02e53..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -// This package has the automatically generated typed clients. -package v1alpha1 diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go deleted file mode 100644 index c3f1566b3..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -// Package fake has the automatically generated clients. -package fake diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go deleted file mode 100644 index a48554f73..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -package fake - -import ( - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" - rest "k8s.io/client-go/rest" - testing "k8s.io/client-go/testing" -) - -type FakeKubeflowV1alpha1 struct { - *testing.Fake -} - -func (c *FakeKubeflowV1alpha1) PyTorchJobs(namespace string) v1alpha1.PyTorchJobInterface { - return &FakePyTorchJobs{c, namespace} -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *FakeKubeflowV1alpha1) RESTClient() rest.Interface { - var ret *rest.RESTClient - return ret -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_pytorchjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_pytorchjob.go deleted file mode 100644 index 6e19b21ad..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_pytorchjob.go +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -package fake - -import ( - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - labels "k8s.io/apimachinery/pkg/labels" - schema "k8s.io/apimachinery/pkg/runtime/schema" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - testing "k8s.io/client-go/testing" -) - -// FakePyTorchJobs implements PyTorchJobInterface -type FakePyTorchJobs struct { - Fake *FakeKubeflowV1alpha1 - ns string -} - -var pytorchjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha1", Resource: "pytorchjobs"} - -var pytorchjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha1", Kind: "PyTorchJob"} - -// Get takes name of the pyTorchJob, and returns the corresponding pyTorchJob object, and an error if there is any. -func (c *FakePyTorchJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.PyTorchJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewGetAction(pytorchjobsResource, c.ns, name), &v1alpha1.PyTorchJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.PyTorchJob), err -} - -// List takes label and field selectors, and returns the list of PyTorchJobs that match those selectors. -func (c *FakePyTorchJobs) List(opts v1.ListOptions) (result *v1alpha1.PyTorchJobList, err error) { - obj, err := c.Fake. - Invokes(testing.NewListAction(pytorchjobsResource, pytorchjobsKind, c.ns, opts), &v1alpha1.PyTorchJobList{}) - - if obj == nil { - return nil, err - } - - label, _, _ := testing.ExtractFromListOptions(opts) - if label == nil { - label = labels.Everything() - } - list := &v1alpha1.PyTorchJobList{} - for _, item := range obj.(*v1alpha1.PyTorchJobList).Items { - if label.Matches(labels.Set(item.Labels)) { - list.Items = append(list.Items, item) - } - } - return list, err -} - -// Watch returns a watch.Interface that watches the requested pyTorchJobs. -func (c *FakePyTorchJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - return c.Fake. - InvokesWatch(testing.NewWatchAction(pytorchjobsResource, c.ns, opts)) - -} - -// Create takes the representation of a pyTorchJob and creates it. Returns the server's representation of the pyTorchJob, and an error, if there is any. -func (c *FakePyTorchJobs) Create(pyTorchJob *v1alpha1.PyTorchJob) (result *v1alpha1.PyTorchJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewCreateAction(pytorchjobsResource, c.ns, pyTorchJob), &v1alpha1.PyTorchJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.PyTorchJob), err -} - -// Update takes the representation of a pyTorchJob and updates it. Returns the server's representation of the pyTorchJob, and an error, if there is any. -func (c *FakePyTorchJobs) Update(pyTorchJob *v1alpha1.PyTorchJob) (result *v1alpha1.PyTorchJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewUpdateAction(pytorchjobsResource, c.ns, pyTorchJob), &v1alpha1.PyTorchJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.PyTorchJob), err -} - -// Delete takes name of the pyTorchJob and deletes it. Returns an error if one occurs. -func (c *FakePyTorchJobs) Delete(name string, options *v1.DeleteOptions) error { - _, err := c.Fake. - Invokes(testing.NewDeleteAction(pytorchjobsResource, c.ns, name), &v1alpha1.PyTorchJob{}) - - return err -} - -// DeleteCollection deletes a collection of objects. -func (c *FakePyTorchJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - action := testing.NewDeleteCollectionAction(pytorchjobsResource, c.ns, listOptions) - - _, err := c.Fake.Invokes(action, &v1alpha1.PyTorchJobList{}) - return err -} - -// Patch applies the patch and returns the patched pyTorchJob. -func (c *FakePyTorchJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.PyTorchJob, err error) { - obj, err := c.Fake. - Invokes(testing.NewPatchSubresourceAction(pytorchjobsResource, c.ns, name, data, subresources...), &v1alpha1.PyTorchJob{}) - - if obj == nil { - return nil, err - } - return obj.(*v1alpha1.PyTorchJob), err -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go deleted file mode 100644 index 371b7a3e7..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -type PyTorchJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go deleted file mode 100644 index 21030ac1b..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme" - serializer "k8s.io/apimachinery/pkg/runtime/serializer" - rest "k8s.io/client-go/rest" -) - -type KubeflowV1alpha1Interface interface { - RESTClient() rest.Interface - PyTorchJobsGetter -} - -// KubeflowV1alpha1Client is used to interact with features provided by the kubeflow.org group. -type KubeflowV1alpha1Client struct { - restClient rest.Interface -} - -func (c *KubeflowV1alpha1Client) PyTorchJobs(namespace string) PyTorchJobInterface { - return newPyTorchJobs(c, namespace) -} - -// NewForConfig creates a new KubeflowV1alpha1Client for the given config. -func NewForConfig(c *rest.Config) (*KubeflowV1alpha1Client, error) { - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientFor(&config) - if err != nil { - return nil, err - } - return &KubeflowV1alpha1Client{client}, nil -} - -// NewForConfigOrDie creates a new KubeflowV1alpha1Client for the given config and -// panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha1Client { - client, err := NewForConfig(c) - if err != nil { - panic(err) - } - return client -} - -// New creates a new KubeflowV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *KubeflowV1alpha1Client { - return &KubeflowV1alpha1Client{c} -} - -func setConfigDefaults(config *rest.Config) error { - gv := v1alpha1.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *KubeflowV1alpha1Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/pytorchjob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/pytorchjob.go deleted file mode 100644 index a445ebd62..000000000 --- a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/pytorchjob.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - scheme "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - rest "k8s.io/client-go/rest" -) - -// PyTorchJobsGetter has a method to return a PyTorchJobInterface. -// A group's client should implement this interface. -type PyTorchJobsGetter interface { - PyTorchJobs(namespace string) PyTorchJobInterface -} - -// PyTorchJobInterface has methods to work with PyTorchJob resources. -type PyTorchJobInterface interface { - Create(*v1alpha1.PyTorchJob) (*v1alpha1.PyTorchJob, error) - Update(*v1alpha1.PyTorchJob) (*v1alpha1.PyTorchJob, error) - Delete(name string, options *v1.DeleteOptions) error - DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error - Get(name string, options v1.GetOptions) (*v1alpha1.PyTorchJob, error) - List(opts v1.ListOptions) (*v1alpha1.PyTorchJobList, error) - Watch(opts v1.ListOptions) (watch.Interface, error) - Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.PyTorchJob, err error) - PyTorchJobExpansion -} - -// pyTorchJobs implements PyTorchJobInterface -type pyTorchJobs struct { - client rest.Interface - ns string -} - -// newPyTorchJobs returns a PyTorchJobs -func newPyTorchJobs(c *KubeflowV1alpha1Client, namespace string) *pyTorchJobs { - return &pyTorchJobs{ - client: c.RESTClient(), - ns: namespace, - } -} - -// Get takes name of the pyTorchJob, and returns the corresponding pyTorchJob object, and an error if there is any. -func (c *pyTorchJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.PyTorchJob, err error) { - result = &v1alpha1.PyTorchJob{} - err = c.client.Get(). - Namespace(c.ns). - Resource("pytorchjobs"). - Name(name). - VersionedParams(&options, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// List takes label and field selectors, and returns the list of PyTorchJobs that match those selectors. -func (c *pyTorchJobs) List(opts v1.ListOptions) (result *v1alpha1.PyTorchJobList, err error) { - result = &v1alpha1.PyTorchJobList{} - err = c.client.Get(). - Namespace(c.ns). - Resource("pytorchjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Do(). - Into(result) - return -} - -// Watch returns a watch.Interface that watches the requested pyTorchJobs. -func (c *pyTorchJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { - opts.Watch = true - return c.client.Get(). - Namespace(c.ns). - Resource("pytorchjobs"). - VersionedParams(&opts, scheme.ParameterCodec). - Watch() -} - -// Create takes the representation of a pyTorchJob and creates it. Returns the server's representation of the pyTorchJob, and an error, if there is any. -func (c *pyTorchJobs) Create(pyTorchJob *v1alpha1.PyTorchJob) (result *v1alpha1.PyTorchJob, err error) { - result = &v1alpha1.PyTorchJob{} - err = c.client.Post(). - Namespace(c.ns). - Resource("pytorchjobs"). - Body(pyTorchJob). - Do(). - Into(result) - return -} - -// Update takes the representation of a pyTorchJob and updates it. Returns the server's representation of the pyTorchJob, and an error, if there is any. -func (c *pyTorchJobs) Update(pyTorchJob *v1alpha1.PyTorchJob) (result *v1alpha1.PyTorchJob, err error) { - result = &v1alpha1.PyTorchJob{} - err = c.client.Put(). - Namespace(c.ns). - Resource("pytorchjobs"). - Name(pyTorchJob.Name). - Body(pyTorchJob). - Do(). - Into(result) - return -} - -// Delete takes name of the pyTorchJob and deletes it. Returns an error if one occurs. -func (c *pyTorchJobs) Delete(name string, options *v1.DeleteOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("pytorchjobs"). - Name(name). - Body(options). - Do(). - Error() -} - -// DeleteCollection deletes a collection of objects. -func (c *pyTorchJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { - return c.client.Delete(). - Namespace(c.ns). - Resource("pytorchjobs"). - VersionedParams(&listOptions, scheme.ParameterCodec). - Body(options). - Do(). - Error() -} - -// Patch applies the patch and returns the patched pyTorchJob. -func (c *pyTorchJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.PyTorchJob, err error) { - result = &v1alpha1.PyTorchJob{} - err = c.client.Patch(pt). - Namespace(c.ns). - Resource("pytorchjobs"). - SubResource(subresources...). - Name(name). - Body(data). - Do(). - Into(result) - return -} diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go index 65861e914..72e3efe43 100644 --- a/pkg/client/informers/externalversions/generic.go +++ b/pkg/client/informers/externalversions/generic.go @@ -21,7 +21,6 @@ package externalversions import ( "fmt" - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" v1alpha2 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha2" v1beta1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1" schema "k8s.io/apimachinery/pkg/runtime/schema" @@ -54,11 +53,7 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=Kubeflow, Version=V1alpha1 - case v1alpha1.SchemeGroupVersion.WithResource("pytorchjobs"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha1().PyTorchJobs().Informer()}, nil - - // Group=Kubeflow, Version=V1alpha2 + // Group=Kubeflow, Version=V1alpha2 case v1alpha2.SchemeGroupVersion.WithResource("pytorchjobs"): return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha2().PyTorchJobs().Informer()}, nil diff --git a/pkg/client/informers/externalversions/kubeflow/interface.go b/pkg/client/informers/externalversions/kubeflow/interface.go index 6607e2246..74cc4c82e 100644 --- a/pkg/client/informers/externalversions/kubeflow/interface.go +++ b/pkg/client/informers/externalversions/kubeflow/interface.go @@ -20,15 +20,12 @@ package kubeflow import ( internalinterfaces "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/kubeflow/v1alpha1" v1alpha2 "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/kubeflow/v1alpha2" v1beta1 "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/kubeflow/v1beta1" ) // Interface provides access to each of this group's versions. type Interface interface { - // V1alpha1 provides access to shared informers for resources in V1alpha1. - V1alpha1() v1alpha1.Interface // V1alpha2 provides access to shared informers for resources in V1alpha2. V1alpha2() v1alpha2.Interface // V1beta1 provides access to shared informers for resources in V1beta1. @@ -44,11 +41,6 @@ func New(f internalinterfaces.SharedInformerFactory) Interface { return &group{f} } -// V1alpha1 returns a new v1alpha1.Interface. -func (g *group) V1alpha1() v1alpha1.Interface { - return v1alpha1.New(g.SharedInformerFactory) -} - // V1alpha2 returns a new v1alpha2.Interface. func (g *group) V1alpha2() v1alpha2.Interface { return v1alpha2.New(g.SharedInformerFactory) diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go deleted file mode 100644 index 45e56ec54..000000000 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by informer-gen. DO NOT EDIT. - -// This file was automatically generated by informer-gen - -package v1alpha1 - -import ( - internalinterfaces "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // PyTorchJobs returns a PyTorchJobInformer. - PyTorchJobs() PyTorchJobInformer -} - -type version struct { - internalinterfaces.SharedInformerFactory -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory) Interface { - return &version{f} -} - -// PyTorchJobs returns a PyTorchJobInformer. -func (v *version) PyTorchJobs() PyTorchJobInformer { - return &pyTorchJobInformer{factory: v.SharedInformerFactory} -} diff --git a/pkg/client/informers/externalversions/kubeflow/v1alpha1/pytorchjob.go b/pkg/client/informers/externalversions/kubeflow/v1alpha1/pytorchjob.go deleted file mode 100644 index c34c32cc9..000000000 --- a/pkg/client/informers/externalversions/kubeflow/v1alpha1/pytorchjob.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by informer-gen. DO NOT EDIT. - -// This file was automatically generated by informer-gen - -package v1alpha1 - -import ( - time "time" - - pytorchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - versioned "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned" - internalinterfaces "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/internalinterfaces" - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/client/listers/kubeflow/v1alpha1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - watch "k8s.io/apimachinery/pkg/watch" - cache "k8s.io/client-go/tools/cache" -) - -// PyTorchJobInformer provides access to a shared informer and lister for -// PyTorchJobs. -type PyTorchJobInformer interface { - Informer() cache.SharedIndexInformer - Lister() v1alpha1.PyTorchJobLister -} - -type pyTorchJobInformer struct { - factory internalinterfaces.SharedInformerFactory -} - -// NewPyTorchJobInformer constructs a new informer for PyTorchJob type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewPyTorchJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - return client.KubeflowV1alpha1().PyTorchJobs(namespace).List(options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - return client.KubeflowV1alpha1().PyTorchJobs(namespace).Watch(options) - }, - }, - &pytorchv1alpha1.PyTorchJob{}, - resyncPeriod, - indexers, - ) -} - -func defaultPyTorchJobInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewPyTorchJobInformer(client, v1.NamespaceAll, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) -} - -func (f *pyTorchJobInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&pytorchv1alpha1.PyTorchJob{}, defaultPyTorchJobInformer) -} - -func (f *pyTorchJobInformer) Lister() v1alpha1.PyTorchJobLister { - return v1alpha1.NewPyTorchJobLister(f.Informer().GetIndexer()) -} diff --git a/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go b/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go deleted file mode 100644 index fc577d4b9..000000000 --- a/pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by lister-gen. DO NOT EDIT. - -// This file was automatically generated by lister-gen - -package v1alpha1 - -// PyTorchJobListerExpansion allows custom methods to be added to -// PyTorchJobLister. -type PyTorchJobListerExpansion interface{} - -// PyTorchJobNamespaceListerExpansion allows custom methods to be added to -// PyTorchJobNamespaceLister. -type PyTorchJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/kubeflow/v1alpha1/pytorchjob.go b/pkg/client/listers/kubeflow/v1alpha1/pytorchjob.go deleted file mode 100644 index 48b6dae8b..000000000 --- a/pkg/client/listers/kubeflow/v1alpha1/pytorchjob.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Code generated by lister-gen. DO NOT EDIT. - -// This file was automatically generated by lister-gen - -package v1alpha1 - -import ( - v1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/tools/cache" -) - -// PyTorchJobLister helps list PyTorchJobs. -type PyTorchJobLister interface { - // List lists all PyTorchJobs in the indexer. - List(selector labels.Selector) (ret []*v1alpha1.PyTorchJob, err error) - // PyTorchJobs returns an object that can list and get PyTorchJobs. - PyTorchJobs(namespace string) PyTorchJobNamespaceLister - PyTorchJobListerExpansion -} - -// pyTorchJobLister implements the PyTorchJobLister interface. -type pyTorchJobLister struct { - indexer cache.Indexer -} - -// NewPyTorchJobLister returns a new PyTorchJobLister. -func NewPyTorchJobLister(indexer cache.Indexer) PyTorchJobLister { - return &pyTorchJobLister{indexer: indexer} -} - -// List lists all PyTorchJobs in the indexer. -func (s *pyTorchJobLister) List(selector labels.Selector) (ret []*v1alpha1.PyTorchJob, err error) { - err = cache.ListAll(s.indexer, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha1.PyTorchJob)) - }) - return ret, err -} - -// PyTorchJobs returns an object that can list and get PyTorchJobs. -func (s *pyTorchJobLister) PyTorchJobs(namespace string) PyTorchJobNamespaceLister { - return pyTorchJobNamespaceLister{indexer: s.indexer, namespace: namespace} -} - -// PyTorchJobNamespaceLister helps list and get PyTorchJobs. -type PyTorchJobNamespaceLister interface { - // List lists all PyTorchJobs in the indexer for a given namespace. - List(selector labels.Selector) (ret []*v1alpha1.PyTorchJob, err error) - // Get retrieves the PyTorchJob from the indexer for a given namespace and name. - Get(name string) (*v1alpha1.PyTorchJob, error) - PyTorchJobNamespaceListerExpansion -} - -// pyTorchJobNamespaceLister implements the PyTorchJobNamespaceLister -// interface. -type pyTorchJobNamespaceLister struct { - indexer cache.Indexer - namespace string -} - -// List lists all PyTorchJobs in the indexer for a given namespace. -func (s pyTorchJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.PyTorchJob, err error) { - err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { - ret = append(ret, m.(*v1alpha1.PyTorchJob)) - }) - return ret, err -} - -// Get retrieves the PyTorchJob from the indexer for a given namespace and name. -func (s pyTorchJobNamespaceLister) Get(name string) (*v1alpha1.PyTorchJob, error) { - obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) - if err != nil { - return nil, err - } - if !exists { - return nil, errors.NewNotFound(v1alpha1.Resource("pytorchjob"), name) - } - return obj.(*v1alpha1.PyTorchJob), nil -} diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go deleted file mode 100644 index 5fcc751e9..000000000 --- a/pkg/controller/controller.go +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TensorFlow job resource. -package controller - -import ( - "errors" - "fmt" - "time" - - log "github.com/sirupsen/logrus" - "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/workqueue" - - torchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - torchjobclient "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned" - kubeflowscheme "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme" - informers "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions" - listers "github.com/kubeflow/pytorch-operator/pkg/client/listers/kubeflow/v1alpha1" - "github.com/kubeflow/pytorch-operator/pkg/trainer" -) - -const ( - controllerName = "kubeflow" -) - -var ( - ErrVersionOutdated = errors.New("requested version is outdated in apiserver") - - // IndexerInformer uses a delta queue, therefore for deletes we have to use this - // key function but it should be just fine for non delete events. - keyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc - - // DefaultJobBackOff is the max backoff period, exported for the e2e test - DefaultJobBackOff = 10 * time.Second - // MaxJobBackOff is the max backoff period, exported for the e2e test - MaxJobBackOff = 360 * time.Second -) - -type Controller struct { - KubeClient kubernetes.Interface - PyTorchJobClient torchjobclient.Interface - - config torchv1alpha1.ControllerConfig - jobs map[string]*trainer.TrainingJob - - PyTorchJobLister listers.PyTorchJobLister - PyTorchJobSynced cache.InformerSynced - - // WorkQueue is a rate limited work queue. This is used to queue work to be - // processed instead of performing it as soon as a change happens. This - // means we can ensure we only process a fixed amount of resources at a - // time, and makes it easy to ensure we are never processing the same item - // simultaneously in two different workers. - WorkQueue workqueue.RateLimitingInterface - - // recorder is an event recorder for recording Event resources to the - // Kubernetes API. - recorder record.EventRecorder - - syncHandler func(jobKey string) (bool, error) -} - -func New(kubeClient kubernetes.Interface, tfJobClient torchjobclient.Interface, - config torchv1alpha1.ControllerConfig, tfJobInformerFactory informers.SharedInformerFactory) (*Controller, error) { - tfJobInformer := tfJobInformerFactory.Kubeflow().V1alpha1().PyTorchJobs() - - kubeflowscheme.AddToScheme(scheme.Scheme) - log.Debug("Creating event broadcaster") - eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartLogging(log.Infof) - eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: controllerName}) - - controller := &Controller{ - KubeClient: kubeClient, - PyTorchJobClient: tfJobClient, - WorkQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "PyTorchjobs"), - recorder: recorder, - // TODO(jlewi)): What to do about cluster.Cluster? - jobs: make(map[string]*trainer.TrainingJob), - config: config, - } - - log.Info("Setting up event handlers") - // Set up an event handler for when Foo resources change - tfJobInformer.Informer().AddEventHandler( - cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - switch t := obj.(type) { - case *torchv1alpha1.PyTorchJob: - log.Debugf("filter tfjob name: %v", t.Name) - return true - default: - return false - } - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: controller.enqueueController, - UpdateFunc: func(oldObj, newObj interface{}) { - controller.enqueueController(newObj) - }, - DeleteFunc: controller.enqueueController, - }, - }) - - controller.PyTorchJobLister = tfJobInformer.Lister() - controller.PyTorchJobSynced = tfJobInformer.Informer().HasSynced - controller.syncHandler = controller.syncPyTorchJob - - return controller, nil -} - -// Run will set up the event handlers for types we are interested in, as well -// as syncing informer caches and starting workers. It will block until stopCh -// is closed, at which point it will shutdown the workqueue and wait for -// workers to finish processing their current work items. -func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { - defer runtime.HandleCrash() - defer c.WorkQueue.ShutDown() - - // Start the informer factories to begin populating the informer caches - log.Info("Starting PyTorchJob controller") - - // Wait for the caches to be synced before starting workers - log.Info("Waiting for informer caches to sync") - if ok := cache.WaitForCacheSync(stopCh, c.PyTorchJobSynced); !ok { - return fmt.Errorf("failed to wait for caches to sync") - } - - log.Infof("Starting %v workers", threadiness) - // Launch workers to process PyTorchJob resources - for i := 0; i < threadiness; i++ { - go wait.Until(c.runWorker, time.Second, stopCh) - } - - log.Info("Started workers") - <-stopCh - log.Info("Shutting down workers") - - return nil -} - -// runWorker is a long-running function that will continually call the -// processNextWorkItem function in order to read and process a message on the -// workqueue. -func (c *Controller) runWorker() { - for c.processNextWorkItem() { - } -} - -// processNextWorkItem will read a single work item off the workqueue and -// attempt to process it, by calling the syncHandler. -func (c *Controller) processNextWorkItem() bool { - key, quit := c.WorkQueue.Get() - if quit { - return false - } - defer c.WorkQueue.Done(key) - - forget, err := c.syncHandler(key.(string)) - if err == nil { - if forget { - c.WorkQueue.Forget(key) - } - return true - } - - utilruntime.HandleError(fmt.Errorf("Error syncing job: %v", err)) - c.WorkQueue.AddRateLimited(key) - - return true -} - -// syncPyTorchJob will sync the job with the given. This function is not meant to be invoked -// concurrently with the same key. -// -// When a job is completely processed it will return true indicating that its ok to forget about this job since -// no more processing will occur for it. -func (c *Controller) syncPyTorchJob(key string) (bool, error) { - startTime := time.Now() - defer func() { - log.Debugf("Finished syncing job %q (%v)", key, time.Since(startTime)) - }() - - ns, name, err := cache.SplitMetaNamespaceKey(key) - if err != nil { - return false, err - } - if len(ns) == 0 || len(name) == 0 { - return false, fmt.Errorf("invalid job key %q: either namespace or name is missing", key) - } - - tfJob, err := c.PyTorchJobLister.PyTorchJobs(ns).Get(name) - - if err != nil { - if apierrors.IsNotFound(err) { - log.Debugf("Job has been deleted: %v", key) - return true, nil - } - return false, err - } - - // Create a new TrainingJob if there is no TrainingJob stored for it in the jobs map or if the UID's don't match. - // The UID's won't match in the event we deleted the job and then recreated the job with the same name. - if cJob, ok := c.jobs[key]; !ok || cJob.UID() != tfJob.UID { - nc, err := trainer.NewJob(c.KubeClient, c.PyTorchJobClient, c.recorder, tfJob, &c.config) - - if err != nil { - return false, err - } - c.jobs[key] = nc - } else { - // Replace the TFJob stored inside TrainingJob with the latest job. - // We need to do this to pull in the latest changes to the spec/status. - c.jobs[key].Update(tfJob) - } - - nc := c.jobs[key] - - if err := nc.Reconcile(&c.config); err != nil { - return false, err - } - - tfJob, err = c.PyTorchJobClient.KubeflowV1alpha1().PyTorchJobs(tfJob.ObjectMeta.Namespace).Get(tfJob.ObjectMeta.Name, metav1.GetOptions{}) - - if err != nil { - return false, err - } - - // TODO(jlewi): This logic will need to change when/if we get rid of phases and move to conditions. At that - // case we should forget about a job when the appropriate condition is reached. - if tfJob.Status.Phase == torchv1alpha1.PyTorchJobPhaseCleanUp { - return true, nil - } - return false, nil - -} - -// obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. -func (c *Controller) enqueueController(obj interface{}) { - key, err := keyFunc(obj) - if err != nil { - utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) - return - } - - c.WorkQueue.AddRateLimited(key) -} diff --git a/pkg/trainer/labels.go b/pkg/trainer/labels.go deleted file mode 100644 index 1e1a698f3..000000000 --- a/pkg/trainer/labels.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "fmt" - "strings" -) - -// KubernetesLabels represents a set of labels to apply to a Kubernetes resources. -type KubernetesLabels map[string]string - -// ToSelector converts the labels to a selector matching the labels. -func (l KubernetesLabels) ToSelector() (string, error) { - pieces := make([]string, 0, len(l)) - for k, v := range l { - pieces = append(pieces, fmt.Sprintf("%v=%v", k, v)) - } - - return strings.Join(pieces, ","), nil -} diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go deleted file mode 100644 index 1e9575510..000000000 --- a/pkg/trainer/replicas.go +++ /dev/null @@ -1,513 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "encoding/json" - "errors" - "fmt" - "strconv" - "strings" - - log "github.com/golang/glog" - "k8s.io/api/core/v1" - k8s_errors "k8s.io/apimachinery/pkg/api/errors" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8sErrors "k8s.io/apimachinery/pkg/util/errors" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/record" - - torchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "github.com/kubeflow/tf-operator/pkg/util/k8sutil" - // TOOO(jlewi): Rename to apiErrors - "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/helper" - "github.com/kubeflow/pytorch-operator/pkg/util" -) - -const ( - SuccessfulCreateReason = "SuccessfulCreate" - FailedCreateReason = "FailedCreate" -) - -// PyTorchReplicaSet is a set of PyTorch processes all acting as the same role (e.g. worker -type PyTorchReplicaSet struct { - ClientSet kubernetes.Interface - recorder record.EventRecorder - // Job is a pointer to the TrainingJob to which this replica belongs. - Job *TrainingJob - Spec torchv1alpha1.PyTorchReplicaSpec -} - -// PyTorchReplicas is an interface for managing a set of replicas. -type PyTorchReplicaSetInterface interface { - Create() error - Delete() error - GetStatus() (torchv1alpha1.PyTorchReplicaStatus, error) -} - -// PyTorchConfig is a struct representing the TensorFlow config. This struct is turned into an environment -// which is used by TensorFlow processes to configure themselves. -type PyTorchConfig struct { - Cluster ClusterSpec `json:"cluster"` - Task TaskSpec `json:"task"` - Environment string `json:"environment"` -} - -func NewPyTorchReplicaSet(clientSet kubernetes.Interface, recorder record.EventRecorder, tfReplicaSpec torchv1alpha1.PyTorchReplicaSpec, job *TrainingJob) (*PyTorchReplicaSet, error) { - if tfReplicaSpec.PyTorchReplicaType == torchv1alpha1.MASTER && *tfReplicaSpec.Replicas != 1 { - return nil, errors.New("The MASTER must have Replicas = 1") - } - - if tfReplicaSpec.MasterPort == nil { - return nil, errors.New("tfReplicaSpec.MasterPort can't be nil.") - } - - // Make sure the replica type is valid. - validReplicaTypes := []torchv1alpha1.PyTorchReplicaType{torchv1alpha1.MASTER, torchv1alpha1.WORKER} - - isValidReplicaType := false - for _, t := range validReplicaTypes { - if t == tfReplicaSpec.PyTorchReplicaType { - isValidReplicaType = true - break - } - } - - if !isValidReplicaType { - return nil, fmt.Errorf("tfReplicaSpec.PyTorchReplicaType is %v but must be one of %v", tfReplicaSpec.PyTorchReplicaType, validReplicaTypes) - } - - return &PyTorchReplicaSet{ - ClientSet: clientSet, - recorder: recorder, - Job: job, - Spec: tfReplicaSpec, - }, nil -} - -// Labels returns the labels for this replica set. -func (s *PyTorchReplicaSet) Labels() KubernetesLabels { - return KubernetesLabels(map[string]string{ - "kubeflow.org": "", - "job_type": string(s.Spec.PyTorchReplicaType), - // runtime_id is set by Job.setup, which is called after the PyTorchReplicaSet is created. - // this is why labels aren't a member variable. - "runtime_id": s.Job.job.Spec.RuntimeId, - "pytorch_job_name": s.Job.job.ObjectMeta.Name}) -} - -func (s *PyTorchReplicaSet) Create(config *torchv1alpha1.ControllerConfig, worldSize int32) error { - // Create services - err := s.SyncServices() - if err != nil { - return err - } - - // Create pods - return s.SyncPods(worldSize) -} - -// CreateServiceWithIndex will create a new service with specify index -func (s *PyTorchReplicaSet) CreateServiceWithIndex(index int32) (*v1.Service, error) { - taskLabels := s.Labels() - taskLabels["task_index"] = fmt.Sprintf("%v", index) - - // Create the service. - service := &v1.Service{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.genName(index), - Labels: taskLabels, - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: v1.ServiceSpec{ - Selector: taskLabels, - Ports: []v1.ServicePort{ - { - Name: "tf-port", - Port: *s.Spec.MasterPort, - }, - }, - }, - } - - log.Infof("Creating service: %v", service.ObjectMeta.Name) - return s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Create(service) -} - -// CreatePodWithIndex will create a new pod with specify index -func (s *PyTorchReplicaSet) CreatePodWithIndex(index int32, worldSize int32) (*v1.Pod, error) { - taskLabels := s.Labels() - taskLabels["task_index"] = fmt.Sprintf("%v", index) - - pod := &v1.Pod{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.genPodName(index), - Labels: taskLabels, - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: *s.Spec.Template.Spec.DeepCopy(), - } - - pod.Spec.SchedulerName = s.Job.SchedulerName() - - // Configure the PyTorch distributed environment variables - masterPort := strconv.Itoa(int(*s.Spec.MasterPort)) - masterAddr := fmt.Sprintf("%v-%v-%v-%v", fmt.Sprintf("%.40s", s.Job.job.ObjectMeta.Name), "master", s.Job.job.Spec.RuntimeId, 0) - if index == 0 { - masterAddr = "localhost" - } - rank := strconv.Itoa(int(index)) - tfConfig := PyTorchConfig{ - Cluster: s.Job.ClusterSpec(), - Task: TaskSpec{ - Type: strings.ToLower(string(s.Spec.PyTorchReplicaType)), - Index: int(index), - }, - // We need to set environment to cloud otherwise it will default to local which isn't what we want. - Environment: "cloud", - } - - tfConfigJson, err := json.Marshal(tfConfig) - if err != nil { - log.Errorf("Job: %v serializing tfConfig: %v return error; %v", s.Job.job.ObjectMeta.Name, util.Pformat(tfConfig), err) - return nil, err - } - - // TODO(jose5918) Do not need TF_CONFIG but leaving for POC - // Add TF_CONFIG environment variable. - for i, _ := range pod.Spec.Containers { - // We can't get c in the loop variable because that would be by value so our modifications - // wouldn't have any effect. - c := &pod.Spec.Containers[i] - if c.Name != torchv1alpha1.DefaultPyTorchContainer { - continue - } - if len(c.Env) == 0 { - c.Env = make([]v1.EnvVar, 0) - } - c.Env = append(c.Env, v1.EnvVar{ - Name: "TF_CONFIG", - Value: string(tfConfigJson), - }) - c.Env = append(c.Env, v1.EnvVar{ - Name: "MASTER_PORT", - Value: masterPort, - }) - c.Env = append(c.Env, v1.EnvVar{ - Name: "MASTER_ADDR", - Value: masterAddr, - }) - c.Env = append(c.Env, v1.EnvVar{ - Name: "WORLD_SIZE", - Value: strconv.Itoa(int(worldSize)), - }) - c.Env = append(c.Env, v1.EnvVar{ - Name: "RANK", - Value: rank, - }) - c.Env = append(c.Env, v1.EnvVar{ - Name: "PYTHONUNBUFFERED", - Value: "0", - }) - } - - log.Infof("Creating pod: %v", pod.ObjectMeta.Name) - return s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).Create(pod) -} - -// Delete deletes the replicas -func (s *PyTorchReplicaSet) Delete() error { - selector, err := s.Labels().ToSelector() - if err != nil { - return err - } - - failures := false - - options := meta_v1.ListOptions{ - LabelSelector: selector, - } - - log.V(1).Infof("Deleting Jobs namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) - err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) - - if err != nil { - log.Errorf("There was a problem deleting the jobs; %v", err) - failures = true - } - - // We need to delete the completed pods. - log.Infof("Deleting Pods namespace=%v selector=%v", s.Job.job.ObjectMeta.Namespace, selector) - err = s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).DeleteCollection(&meta_v1.DeleteOptions{}, options) - - if err != nil { - log.Errorf("There was a problem deleting the pods; %v", err) - failures = true - } - - // Services doesn't support DeleteCollection so we delete them individually. - // TODO(jlewi): We should check if this has changed with K8s 1.8 or other releases. - for index := int32(0); index < *s.Spec.Replicas; index++ { - log.V(1).Infof("Deleting Service %v:%v", s.Job.job.ObjectMeta.Namespace, s.genName((index))) - err = s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Delete(s.genName(index), &meta_v1.DeleteOptions{}) - - if err != nil { - log.Errorf("Error deleting service %v; %v", s.genName(index), err) - failures = true - } - } - - // If the ConfigMap for the default parameter server exists, we delete it - log.Infof("Get ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) - _, err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Get(s.defaultPSConfigMapName(), meta_v1.GetOptions{}) - if err != nil { - if !k8sutil.IsKubernetesResourceNotFoundError(err) { - log.Errorf("Error deleting ConfigMap %v; %v", s.defaultPSConfigMapName(), err) - failures = true - } - } else { - log.Infof("Delete ConfigMaps %v:%v", s.Job.job.ObjectMeta.Namespace, s.defaultPSConfigMapName()) - err = s.ClientSet.CoreV1().ConfigMaps(s.Job.job.ObjectMeta.Namespace).Delete(s.defaultPSConfigMapName(), &meta_v1.DeleteOptions{}) - if err != nil { - log.Errorf("There was a problem deleting the ConfigMaps; %v", err) - failures = true - } - } - - if failures { - return errors.New("Some of the replicas resources could not be deleted") - } - return nil -} - -// replicaStatusFromPodList returns a status from a list of pods for a job. -func replicaStatusFromPodList(l v1.PodList, name string) torchv1alpha1.ReplicaState { - var latest *v1.Pod - for _, i := range l.Items { - if latest == nil { - latest = &i - continue - } - if latest.Status.StartTime.Before(i.Status.StartTime) { - latest = &i - } - } - - if latest == nil { - return torchv1alpha1.ReplicaStateRunning - } - - var tfState v1.ContainerState - - for _, i := range latest.Status.ContainerStatuses { - if i.Name != name { - continue - } - - // We need to decide whether to use the current state or the previous termination state. - tfState = i.State - } - - if tfState.Running != nil || tfState.Waiting != nil { - return torchv1alpha1.ReplicaStateRunning - } - - if tfState.Terminated != nil { - if tfState.Terminated.ExitCode == 0 { - return torchv1alpha1.ReplicaStateSucceeded - } - return torchv1alpha1.ReplicaStateFailed - } - - return torchv1alpha1.ReplicaStateUnknown -} - -func (s *PyTorchReplicaSet) GetSingleReplicaStatus(index int32) torchv1alpha1.ReplicaState { - labels := s.Labels() - labels["task_index"] = fmt.Sprintf("%v", index) - selector, err := labels.ToSelector() - if err != nil { - log.Errorf("labels.ToSelector() error; %v", err) - return torchv1alpha1.ReplicaStateFailed - } - - // TODO(jlewi): Handle errors. We need to get the pod and looking at recent container exits. - l, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{ - // TODO(jlewi): Why isn't the label selector working? - LabelSelector: selector, - }) - - if err != nil { - // TODO(jlewi): Are there errors that should be treated as retryable errors? - return torchv1alpha1.ReplicaStateFailed - } - - status := replicaStatusFromPodList(*l, torchv1alpha1.DefaultPyTorchContainer) - return status -} - -// Status returns the status of the replica set. -func (s *PyTorchReplicaSet) GetStatus() (torchv1alpha1.PyTorchReplicaStatus, error) { - status := torchv1alpha1.PyTorchReplicaStatus{ - PyTorchReplicaType: s.Spec.PyTorchReplicaType, - State: torchv1alpha1.ReplicaStateUnknown, - ReplicasStates: make(map[torchv1alpha1.ReplicaState]int), - } - - increment := func(state torchv1alpha1.ReplicaState) { - v, ok := status.ReplicasStates[state] - if ok { - status.ReplicasStates[state] = v + 1 - } else { - status.ReplicasStates[state] = 1 - } - } - - for index := int32(0); index < *s.Spec.Replicas; index++ { - increment(s.GetSingleReplicaStatus(index)) - } - - // Determine the overall status for the replica set based on the status of the individual - // replicas. - // If any of the replicas failed mark the set as failed. - if _, ok := status.ReplicasStates[torchv1alpha1.ReplicaStateFailed]; ok { - status.State = torchv1alpha1.ReplicaStateFailed - return status, nil - } - - // If any replicas are RUNNING mark it as RUNNING. - if _, ok := status.ReplicasStates[torchv1alpha1.ReplicaStateRunning]; ok { - status.State = torchv1alpha1.ReplicaStateRunning - return status, nil - } - - // If all of the replicas succeeded consider it success. - if v, ok := status.ReplicasStates[torchv1alpha1.ReplicaStateSucceeded]; ok && int32(v) == *s.Spec.Replicas { - status.State = torchv1alpha1.ReplicaStateSucceeded - return status, nil - } - - return status, nil -} - -// SyncPods will try to check current pods for this PyTorchReplicaSet and try to make it as desired. -func (s *PyTorchReplicaSet) SyncPods(worldSize int32) error { - for index := int32(0); index < *s.Spec.Replicas; index++ { - - // Label to get all pods of this PyTorchReplicaType + index - labels := s.Labels() - labels["task_index"] = fmt.Sprintf("%v", index) - rank := index - if labels["job_type"] == "WORKER" { - rank = index + 1 - } - labels["task_index"] = fmt.Sprintf("%v", rank) - - labelSelector, err := labels.ToSelector() - if err != nil { - return err - } - - // Filter the unactive pods - fieldSelector := "status.phase!=" + string(v1.PodFailed) - //",deletionTimestamp!=nil" - - options := meta_v1.ListOptions{ - LabelSelector: labelSelector, - FieldSelector: fieldSelector, - } - // List to get pods - pl, err := s.ClientSet.CoreV1().Pods(s.Job.job.ObjectMeta.Namespace).List(options) - - if len(pl.Items) == 0 { - log.Infof("Pod not found, create new one.") - // Create the pod - createdPod, err := s.CreatePodWithIndex(rank, worldSize) - - // If the pod already exists do nothing. - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("Pod: %v already exists.", createdPod.ObjectMeta.Name) - continue - } - s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) - return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating pod %v returned error.", createdPod.ObjectMeta.Name), err}) - } - - s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created pod: %v", createdPod.Name) - continue - } - - if err != nil { - // TODO: handing this error - continue - } - } - - return nil -} - -// SyncServices will try to check current services for this PyTorchReplicaSet and try to make it as desired. -func (s *PyTorchReplicaSet) SyncServices() error { - for index := int32(0); index < *s.Spec.Replicas; index++ { - _, err := s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Get(s.genName(index), meta_v1.GetOptions{}) - if err != nil && k8s_errors.IsNotFound(err) { - log.Infof("Service: %v not found, create new one.", s.genName(index)) - // Create the service - createdService, err := s.CreateServiceWithIndex(index) - - // If the service already exists do nothing. - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("Service: %v already exists.", s.genName(index)) - continue - } - s.recorder.Eventf(s.Job.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err) - return k8sErrors.NewAggregate([]error{fmt.Errorf("Creating Service %v returned error.", createdService.ObjectMeta.Name), err}) - } - - s.recorder.Eventf(s.Job.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created Service: %v", createdService.Name) - continue - } - - if err != nil { - // TODO: handing this error - continue - } - } - - return nil -} - -func (s *PyTorchReplicaSet) genName(index int32) string { - // Truncate tfjob name to 40 characters - // The whole job name should be compliant with the DNS_LABEL spec, up to a max length of 63 characters - // Thus genName(40 chars)-replicaType(6 chars)-runtimeId(4 chars)-index(4 chars), also leaving some spaces - // See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture/identifiers.md - return fmt.Sprintf("%v-%v-%v-%v", fmt.Sprintf("%.40s", s.Job.job.ObjectMeta.Name), strings.ToLower(string(s.Spec.PyTorchReplicaType)), s.Job.job.Spec.RuntimeId, index) -} - -func (s *PyTorchReplicaSet) genPodName(index int32) string { - // Generate a new pod name with random string - return s.genName(index) + "-" + util.RandString(5) -} - -func (s *PyTorchReplicaSet) defaultPSConfigMapName() string { - return fmt.Sprintf("cm-ps-%v", s.Job.job.Spec.RuntimeId) -} diff --git a/pkg/trainer/replicas_test.go b/pkg/trainer/replicas_test.go deleted file mode 100644 index b2f1413bd..000000000 --- a/pkg/trainer/replicas_test.go +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "encoding/json" - "fmt" - "reflect" - "strings" - "testing" - "time" - - "github.com/golang/protobuf/proto" - "k8s.io/api/core/v1" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/record" - - torchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - pytorchJobFake "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/fake" - "github.com/kubeflow/pytorch-operator/pkg/util" -) - -var ( - groupVersionKind = schema.GroupVersionKind{ - Group: torchv1alpha1.GroupName, - Version: torchv1alpha1.GroupVersion, - Kind: torchv1alpha1.ResourceKind, - } -) - -func TestPyTorchReplicaSet(t *testing.T) { - clientSet := fake.NewSimpleClientset() - - testSchedulerName := "test-scheduler" - - jobSpec := &torchv1alpha1.PyTorchJob{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: "some-job", - UID: "some-uid", - }, - Spec: torchv1alpha1.PyTorchJobSpec{ - RuntimeId: "some-runtime", - ReplicaSpecs: []*torchv1alpha1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.WORKER, - }, - }, - SchedulerName: testSchedulerName, - }, - } - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &pytorchJobFake.Clientset{}, recorder, jobSpec) - - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - - replica, err := NewPyTorchReplicaSet(clientSet, recorder, *jobSpec.Spec.ReplicaSpecs[0], job) - - if err != nil { - t.Fatalf("NewPyTorchReplicaSet failed: %v", err) - } - worldSize := int32(2) - if err := replica.Create(&torchv1alpha1.ControllerConfig{}, worldSize); err != nil { - t.Fatalf("replica.Create() error; %v", err) - } - - trueVal := true - expectedOwnerReference := meta_v1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: "some-job", - UID: "some-uid", - Controller: &trueVal, - BlockOwnerDeletion: &trueVal, - } - - for index := 0; index < 2; index++ { - // Expected labels - expectedPodLabels := map[string]string{ - "kubeflow.org": "", - "task_index": fmt.Sprintf("%v", index+1), - "job_type": "WORKER", - "runtime_id": "some-runtime", - "pytorch_job_name": "some-job", - } - expectedServiceLabels := map[string]string{ - "kubeflow.org": "", - "task_index": fmt.Sprintf("%v", index), - "job_type": "WORKER", - "runtime_id": "some-runtime", - "pytorch_job_name": "some-job", - } - - // Check that a service was created. - sList, err := clientSet.CoreV1().Services(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List services error; %v", err) - } - - if len(sList.Items) != 2 { - t.Fatalf("Expected 2 services got %v", len(sList.Items)) - } - - s := sList.Items[index] - - if !reflect.DeepEqual(expectedServiceLabels, s.ObjectMeta.Labels) { - t.Fatalf("Service Labels; Got %v Want: %v", s.ObjectMeta.Labels, expectedServiceLabels) - } - - name := fmt.Sprintf("some-job-worker-some-runtime-%v", index) - if s.ObjectMeta.Name != name { - t.Fatalf("Job.ObjectMeta.Name = %v; want %v", s.ObjectMeta.Name, name) - } - - if len(s.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(s.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(s.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Service.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(s.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - // Check that a pod was created. - l, err := clientSet.CoreV1().Pods(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List pods error; %v", err) - } - - if len(l.Items) != 2 { - t.Fatalf("Expected 1 pod got %v", len(l.Items)) - } - - p := l.Items[index] - - if !reflect.DeepEqual(expectedPodLabels, p.ObjectMeta.Labels) { - t.Fatalf("Pod Labels; Got %v Want: %v", p.ObjectMeta.Labels, expectedPodLabels) - } - - if len(p.Spec.Containers) != 1 { - t.Fatalf("Expected 1 container got %v", len(p.Spec.Containers)) - } - - if len(p.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(p.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(p.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Pod.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(p.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - c := p.Spec.Containers[0] - if len(c.Env) != 6 { - t.Fatalf("Expected 6 environment variables got %v", len(c.Env)) - } - - if strings.Compare(p.Spec.SchedulerName, testSchedulerName) != 0 { - t.Fatalf("p.Spec.Template.Spec.SchedulerName; Got %v; want %v", p.Spec.SchedulerName, testSchedulerName) - } - - actualPyTorchConfig := &PyTorchConfig{} - if err := json.Unmarshal([]byte(c.Env[0].Value), actualPyTorchConfig); err != nil { - t.Fatalf("Could not unmarshal PyTorchConfig %v", err) - } - - expectedPyTorchConfig := &PyTorchConfig{ - Cluster: ClusterSpec{}, - Task: TaskSpec{ - Type: "worker", - Index: index + 1, - }, - Environment: "cloud", - } - - if !reflect.DeepEqual(expectedPyTorchConfig, actualPyTorchConfig) { - t.Fatalf("Got %v, Want %v", actualPyTorchConfig, expectedPyTorchConfig) - } - } - // Delete the job. - // N.B it doesn't look like the Fake clientset is sophisticated enough to delete jobs in response to a - // DeleteCollection request (deleting individual jobs does appear to work with the Fake). So if we were to list - // the jobs after calling Delete we'd still see the job. So we will rely on E2E tests to verify Delete works - // correctly. - if err := replica.Delete(); err != nil { - t.Fatalf("replica.Delete() error; %v", err) - } -} - -func TestPyTorchReplicaSetStatusFromPodList(t *testing.T) { - type TestCase struct { - PodList v1.PodList - Name string - Expected torchv1alpha1.ReplicaState - } - - cases := []TestCase{ - { - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: torchv1alpha1.ReplicaStateRunning, - }, - { - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 0, - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: torchv1alpha1.ReplicaStateSucceeded, - }, - { - // Multiple containers; make sure we match by name. - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "other", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 0, - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: torchv1alpha1.ReplicaStateSucceeded, - }, - { - // Container failed with permanent error and then got restarted. - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - LastTerminationState: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 100, - Message: "some reason", - }, - }, - }, - }, - }, - }, - }, - }, - Name: "master", - Expected: torchv1alpha1.ReplicaStateRunning, - }, - { - // Multiple Pods; check we get the most recent. - PodList: v1.PodList{ - Items: []v1.Pod{ - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Running: &v1.ContainerStateRunning{}, - }, - }, - }, - StartTime: &meta_v1.Time{ - Time: time.Date(2017, 0, 0, 0, 0, 0, 0, time.UTC), - }, - }, - }, - { - Status: v1.PodStatus{ - ContainerStatuses: []v1.ContainerStatus{ - { - Name: "master", - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 100, - Message: "some reason", - }, - }, - }, - }, - StartTime: &meta_v1.Time{ - Time: time.Date(2018, 0, 0, 0, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Name: "master", - Expected: torchv1alpha1.ReplicaStateFailed, - }, - } - - for _, c := range cases { - status := replicaStatusFromPodList(c.PodList, c.Name) - if status != c.Expected { - t.Errorf("replicaStatusFromPodList(%+v, %v)=%v ; want %v", c.PodList, c.Name, status, c.Expected) - } - } -} diff --git a/pkg/trainer/training.go b/pkg/trainer/training.go deleted file mode 100644 index 1356d839f..000000000 --- a/pkg/trainer/training.go +++ /dev/null @@ -1,408 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package trainer is to manage pytorch training jobs. -package trainer - -import ( - "fmt" - "reflect" - "strings" - - log "github.com/sirupsen/logrus" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/record" - - "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/helper" - torchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/validation" - pytorchclient "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/scheme" - "github.com/kubeflow/pytorch-operator/pkg/util" -) - -// TODO(jlewi): We should switch a New pattern and make trainingJob private so we can -// ensure correctness on creation. -type TrainingJob struct { - job *torchv1alpha1.PyTorchJob - - KubeCli kubernetes.Interface - - recorder record.EventRecorder - - Replicas []*PyTorchReplicaSet - - torchJobClient pytorchclient.Interface - - // in memory state of the job. - // status is the source of truth after job struct is materialized. Changes to the status to be persisted - // should be made here. - status torchv1alpha1.PyTorchJobStatus - - memberCounter int -} - -// TODO(jose5918): We don't really need the cluster spec for this operator but no harm in leaving it for POC -// ClusterSpec represents a cluster TensorFlow specification. -// https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster -// It is a map from job names to network addresses. -type ClusterSpec map[string][]string - -type TaskSpec struct { - Type string `json:"type"` - Index int `json:"index"` -} - -func initJob(kubeCli kubernetes.Interface, torchJobClient pytorchclient.Interface, recorder record.EventRecorder, job *torchv1alpha1.PyTorchJob) (*TrainingJob, error) { - j := &TrainingJob{ - KubeCli: kubeCli, - torchJobClient: torchJobClient, - recorder: recorder, - Replicas: make([]*PyTorchReplicaSet, 0), - job: job, - status: *job.Status.DeepCopy(), - } - - return j, nil -} - -func NewJob(kubeCli kubernetes.Interface, torchJobClient pytorchclient.Interface, recorder record.EventRecorder, job *torchv1alpha1.PyTorchJob, config *torchv1alpha1.ControllerConfig) (*TrainingJob, error) { - j, err := initJob(kubeCli, torchJobClient, recorder, job) - if err != nil { - return nil, err - } - - return j, nil -} - -func (j *TrainingJob) UID() types.UID { - return j.job.ObjectMeta.UID -} - -// Update replaces the PyTorchJob corresponding to TrainingJob with the provided job. -// This function is used when the Spec/Status of the job is modified outside the controller. -// For example, if the user issues a delete request. This will update the metadata on the object -// so we need to replace the spec. -func (j *TrainingJob) Update(newJob *torchv1alpha1.PyTorchJob) { - log.Infof("Updating job to %+v", *newJob) - j.job = newJob -} - -func (j *TrainingJob) ClusterSpec() ClusterSpec { - clusterSpec := make(ClusterSpec) - - for _, p := range j.Replicas { - replicaNames := make([]string, 0, *p.Spec.Replicas) - - for i := int32(0); i < *p.Spec.Replicas; i++ { - replicaNames = append(replicaNames, fmt.Sprintf("%v:%v", p.genName(i), *p.Spec.MasterPort)) - } - - clusterSpec[strings.ToLower(string(p.Spec.PyTorchReplicaType))] = replicaNames - } - - return clusterSpec -} - -// createResources creates all the replicas if requested -func (j *TrainingJob) createResources(config *torchv1alpha1.ControllerConfig) error { - // TODO(jose5918) Need to figure out where it is best to add worldSize logic - // Get PyTorch worldSize by adding replicas - worldSize := int32(0) - for _, r := range j.Replicas { - worldSize = worldSize + *r.Spec.Replicas - } - for _, r := range j.Replicas { - if err := r.Create(config, worldSize); err != nil { - return err - } - } - - return nil -} - -// deleteResources deletes the replicas it it was created -func (j *TrainingJob) deleteResources() error { - for _, r := range j.Replicas { - if err := r.Delete(); err != nil { - return err - } - } - - return nil -} - -func (j *TrainingJob) GetStatus() (torchv1alpha1.State, []*torchv1alpha1.PyTorchReplicaStatus, error) { - master := j.job.Spec.TerminationPolicy.Master - masterState := torchv1alpha1.ReplicaStateUnknown - - state := torchv1alpha1.StateUnknown - replicaStatuses := make([]*torchv1alpha1.PyTorchReplicaStatus, 0) - - // The state for each replica. - // TODO(jlewi): We will need to modify this code if we want to allow multiples of a given type of replica. - replicaSetStates := make(map[torchv1alpha1.PyTorchReplicaType]torchv1alpha1.ReplicaState) - - for _, r := range j.Replicas { - rStatus, err := r.GetStatus() - if err != nil { - log.Errorf("GetStatus() for %v returned error; %v", r.Spec.PyTorchReplicaType, err) - } - - replicaSetStates[r.Spec.PyTorchReplicaType] = rStatus.State - - replicaStatuses = append(replicaStatuses, &rStatus) - - if string(r.Spec.PyTorchReplicaType) == master.ReplicaName { - masterState = r.GetSingleReplicaStatus(int32(master.ReplicaRank)) - } - } - - if masterState == torchv1alpha1.ReplicaStateRunning { - state = torchv1alpha1.StateRunning - } else if masterState == torchv1alpha1.ReplicaStateFailed { - state = torchv1alpha1.StateFailed - } else if masterState == torchv1alpha1.ReplicaStateSucceeded { - state = torchv1alpha1.StateSucceeded - } - - return state, replicaStatuses, nil -} - -func (j *TrainingJob) masterName() string { - return fmt.Sprintf("master-%v-0", j.job.Spec.RuntimeId) -} - -// setup the training job. -func (j *TrainingJob) setup(config *torchv1alpha1.ControllerConfig) { - err := func() error { - // If the job has already started we shouldn't set it up again. - if j.status.Phase != torchv1alpha1.PyTorchJobPhaseNone { - log.Warningf("Job %v has already been setup.", j.name()) - return nil - } - - // Set defaults. - scheme.Scheme.Default(j.job) - - err := validation.ValidatePyTorchJobSpec(&j.job.Spec) - if err != nil { - return fmt.Errorf("invalid job spec: %v", err) - } - - if err := helper.ConfigureAcceleratorsForPyTorchJobSpec(&j.job.Spec, config.Accelerators); err != nil { - return fmt.Errorf("ConfigureAccelerators(...) error; %v", err) - } - - if j.job.Spec.RuntimeId == "" { - j.job.Spec.RuntimeId = util.RandString(4) - } - return nil - }() - - if err != nil { - j.status.Reason = err.Error() - j.status.Phase = torchv1alpha1.PyTorchJobPhaseFailed - j.status.State = torchv1alpha1.StateFailed - } else { - j.status.Phase = torchv1alpha1.PyTorchJobPhaseCreating - j.status.State = torchv1alpha1.StateRunning - } -} - -// setup Replicas. This creates in memory data structures corresponding to the replicas. -func (j *TrainingJob) setupReplicas() error { - - if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) { - j.Replicas = make([]*PyTorchReplicaSet, 0, len(j.job.Spec.ReplicaSpecs)) - for _, t := range j.job.Spec.ReplicaSpecs { - r, err := NewPyTorchReplicaSet(j.KubeCli, j.recorder, *t, j) - if err != nil { - return err - } - j.Replicas = append(j.Replicas, r) - } - } - - return nil -} - -func (j *TrainingJob) Delete() { - // TODO(jlewi): Delete is what should cause us to delete the Pods. - // we shouldn't delete the pods when the jobs finish because leaving the pods - // allows us to get the logs from the pods after the job finishes. - // - log.Infof("PyTorchJob %v deleted by the user", j.fullname()) - // TODO(jlewi): This logic is probably insufficient. - if j.job.Status.Phase != torchv1alpha1.PyTorchJobPhaseCleanUp { - j.status.Phase = torchv1alpha1.PyTorchJobPhaseCleanUp - } - - // TODO(jlewi): Does it make sense to explicitly delete the resources? Should - // we just rely on K8s garbage collection to delete the resources before - // deleting PyTorchJob? - if cErr := j.deleteResources(); cErr != nil { - log.Errorf("trainingJob.deleteResources() error; %v", cErr) - } -} - -// updateCRDStatus updates the job status based on TraingingJob.status. -func (j *TrainingJob) updateCRDStatus() error { - // If the status hasn't changed then there's no reason to update the CRD. - if reflect.DeepEqual(j.job.Status, j.status) { - return nil - } - - newJob := j.job - newJob.Status = j.status - newJob, err := j.torchJobClient.KubeflowV1alpha1().PyTorchJobs(j.job.ObjectMeta.Namespace).Update(newJob) - if err != nil { - return err - } - - j.job = newJob - - return nil -} - -// reconcile tries to get the job into the desired state. -func (j *TrainingJob) Reconcile(config *torchv1alpha1.ControllerConfig) error { - if j.job.ObjectMeta.DeletionTimestamp != nil { - log.Info("Deletion timestamp set; skipping reconcile") - // Job is in the process of being deleted so do nothing. - // We especially don't want to create new resources as that could block deletion. - return nil - } - if j.job.Status.Phase == torchv1alpha1.PyTorchJobPhaseNone { - // The job hasn't been setup. - j.setup(config) - if err := j.updateCRDStatus(); err != nil { - log.Warningf("failed to update CRD status: %v", err) - return err - } - } - - // setupreplicas initializes data structures inside TrainingJob representing the replicas. - // These are go-lang structures which aren't preserved in the APIServer. So we always need to call setupReplicas - // unlike setup which only needs to be called once during the lifecycle of the job. - if err := j.setupReplicas(); err != nil { - log.Errorf("failed to create replicas: %v", err) - j.status.Reason = fmt.Sprintf("Could not create in memory datastructures; %v", err) - if uErr := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, uErr) - } - return err - } - - // TODO(jlewi): Can we determine from the CRD status whether we should - // Create the resources or not? We need to ensure the resources exist so for - // now we always call Create. - if j.job.Status.Phase == torchv1alpha1.PyTorchJobPhaseCreating || j.job.Status.Phase == torchv1alpha1.PyTorchJobPhaseRunning { - // We call Create to make sure all the resources exist and are running. - if cErr := j.createResources(config); cErr != nil { - // TODO(jlewi): Should we eventually give up and mark the job as failed if we can't create the resources? - j.status.Reason = fmt.Sprintf("Could not create job resources; %v", cErr) - if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update status error: %v", j.job.ObjectMeta.Name, err) - return err - } - log.Errorf("trainingJobCreateReplicas() error; %v", cErr) - return cErr - } - - state, replicaStatuses, err := j.GetStatus() - - j.status.ReplicaStatuses = replicaStatuses - if err != nil { - log.Errorf("GetStatus() for job %v returned error: %v", j.job.ObjectMeta.Name, err) - return err - } - // TODO(jlewi): We should update the Phase if we detect the job is done. - if state == torchv1alpha1.StateFailed { - log.Errorf("Master failed Job: %v.", j.job.ObjectMeta.Name) - j.status.Phase = torchv1alpha1.PyTorchJobPhaseDone - j.status.State = torchv1alpha1.StateFailed - } else if state == torchv1alpha1.StateSucceeded { - log.Infof("Master succeeded Job: %v.", j.job.ObjectMeta.Name) - j.status.Phase = torchv1alpha1.PyTorchJobPhaseDone - j.status.State = torchv1alpha1.StateSucceeded - } else { - log.Infof("Job %v status=%v", j.job.ObjectMeta.Name, util.Pformat(j.status)) - } - } - // TODO(jose5918) Need to figure out where it is best to add worldSize logic - // Get PyTorch worldSize by adding replicas - worldSize := int32(0) - for _, r := range j.Replicas { - worldSize = worldSize + *r.Spec.Replicas - } - - // Only sync pods and services if we are running. - if j.status.Phase == torchv1alpha1.PyTorchJobPhaseCreating || j.status.Phase == torchv1alpha1.PyTorchJobPhaseRunning { - // sync pods - for _, rc := range j.Replicas { - err := rc.SyncPods(worldSize) - if err != nil { - log.Errorf("SyncPods error: %v", err) - } - } - - // sync services - for _, rc := range j.Replicas { - err := rc.SyncServices() - if err != nil { - log.Errorf("SyncServices error: %v", err) - } - } - } - - // If the phase changed we should update the CRD. - if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v, failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) - return err - } - - if j.job.Status.Phase == torchv1alpha1.PyTorchJobPhaseCleanUp { - if cErr := j.deleteResources(); cErr != nil { - log.Errorf("Job %v trainingJob.Delete() error; %v", j.job.ObjectMeta.Name, cErr) - } - // j.status.SetPhase(spec.PyTorchJobPhaseDone) - // Return from run because we want to stop reconciling the object. - return nil - } - - // updateCRDStatus will update the status of the CRD with c.Status if c.Status - // doesn't match c.Cluster.status. So you can change c.Status in order to propagate - // changes to the CRD status. - if err := j.updateCRDStatus(); err != nil { - log.Warningf("Job %v; failed to update CRD status error: %v", j.job.ObjectMeta.Name, err) - return err - } - - return nil -} - -func (j *TrainingJob) name() string { - return j.job.ObjectMeta.GetName() -} - -// fullname returns the namespace and name for the job. -func (j *TrainingJob) fullname() string { - return j.job.ObjectMeta.GetNamespace() + ":" + j.job.ObjectMeta.GetName() -} - -func (j *TrainingJob) SchedulerName() string { - return j.job.Spec.SchedulerName -} diff --git a/pkg/trainer/training_test.go b/pkg/trainer/training_test.go deleted file mode 100644 index 5b20476f1..000000000 --- a/pkg/trainer/training_test.go +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "reflect" - "testing" - - "github.com/gogo/protobuf/proto" - torchv1alpha1 "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1alpha1" - pytorchJobFake "github.com/kubeflow/pytorch-operator/pkg/client/clientset/versioned/fake" - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/record" -) - -func TestClusterSpec(t *testing.T) { - type TestCase struct { - Spec *torchv1alpha1.PyTorchJob - Expected map[string][]string - } - - cases := []TestCase{ - { - Spec: &torchv1alpha1.PyTorchJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myjob", - Namespace: "kubeflow", - }, - Spec: torchv1alpha1.PyTorchJobSpec{ - SchedulerName: "sched", - RuntimeId: "runtime", - ReplicaSpecs: []*torchv1alpha1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(1), - MasterPort: proto.Int32(42), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.MASTER, - }, - { - Replicas: proto.Int32(3), - MasterPort: proto.Int32(40), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.WORKER, - }, - }, - }, - }, - - Expected: map[string][]string{ - "master": []string{"myjob-master-runtime-0:42"}, - "worker": []string{"myjob-worker-runtime-0:40", "myjob-worker-runtime-1:40", "myjob-worker-runtime-2:40"}, - }, - }, - } - - for _, c := range cases { - - clientSet := fake.NewSimpleClientset() - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &pytorchJobFake.Clientset{}, recorder, c.Spec) - - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - - job.setup(&torchv1alpha1.ControllerConfig{}) - job.setupReplicas() - actual := job.ClusterSpec() - - for k, v := range c.Expected { - actualV, ok := actual[k] - if !ok { - t.Errorf("Actual cluster spec is missing key: %v", k) - continue - } - if !reflect.DeepEqual(actualV, v) { - t.Errorf("Key %v got %v want %v", k, actualV, v) - } - } - - name := job.name() - expectedName := "myjob" - if !reflect.DeepEqual(expectedName, name) { - t.Errorf("Got name %v want %v", name, expectedName) - } - - fullname := job.fullname() - expectedFullname := "kubeflow:myjob" - if !reflect.DeepEqual(expectedFullname, fullname) { - t.Errorf("Got fullname %v want %v", fullname, expectedFullname) - } - - scheduler := job.SchedulerName() - expectedScheduler := "sched" - if !reflect.DeepEqual(expectedScheduler, scheduler) { - t.Errorf("Got name %v want %v", scheduler, expectedScheduler) - } - - c.Spec.ObjectMeta.Namespace = "update" - job_updated, err := initJob(clientSet, &pytorchJobFake.Clientset{}, recorder, c.Spec) - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - job.Update(job_updated.job) - fullname = job.fullname() - expectedFullname = "update:myjob" - if !reflect.DeepEqual(expectedFullname, fullname) { - t.Errorf("Got fullname %v want %v", fullname, expectedFullname) - } - } -} - -func TestJobSetup(t *testing.T) { - // Verify the setup will fill in the RuntimeId. - clientSet := fake.NewSimpleClientset() - - type testCase struct { - jobSpec *torchv1alpha1.PyTorchJob - expectMounts int - expectPhase torchv1alpha1.PyTorchJobPhase - expectReason string - expectState torchv1alpha1.State - } - - testCases := []testCase{ - { - jobSpec: &torchv1alpha1.PyTorchJob{ - Spec: torchv1alpha1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1alpha1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(1), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.MASTER, - }, - }, - }, - }, - expectMounts: 0, - expectPhase: torchv1alpha1.PyTorchJobPhaseCreating, - expectState: torchv1alpha1.StateRunning, - }, - { - jobSpec: &torchv1alpha1.PyTorchJob{ - Spec: torchv1alpha1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1alpha1.PyTorchReplicaSpec{ - { - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.WORKER, - }, - }, - TerminationPolicy: &torchv1alpha1.TerminationPolicySpec{ - Master: &torchv1alpha1.MasterSpec{ - ReplicaName: string(torchv1alpha1.WORKER), - ReplicaRank: 0, - }, - }, - }, - }, - expectMounts: 1, - expectPhase: torchv1alpha1.PyTorchJobPhaseCreating, - expectState: torchv1alpha1.StateRunning, - }, - { - // The job should fail setup because the spec is invalid. - jobSpec: &torchv1alpha1.PyTorchJob{ - Spec: torchv1alpha1.PyTorchJobSpec{ - ReplicaSpecs: []*torchv1alpha1.PyTorchReplicaSpec{ - { - Replicas: proto.Int32(2), - MasterPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Resources: v1.ResourceRequirements{ - Requests: map[v1.ResourceName]resource.Quantity{ - "nvidia-gpu": resource.MustParse("1"), - }, - }, - }, - }, - }, - }, - PyTorchReplicaType: torchv1alpha1.WORKER, - }, - }, - }, - }, - expectMounts: 0, - expectPhase: torchv1alpha1.PyTorchJobPhaseFailed, - expectState: torchv1alpha1.StateFailed, - expectReason: "invalid job spec: Missing ReplicaSpec for master: MASTER", - }, - } - - config := &torchv1alpha1.ControllerConfig{ - Accelerators: map[string]torchv1alpha1.AcceleratorConfig{ - "nvidia-gpu": torchv1alpha1.AcceleratorConfig{ - Volumes: []torchv1alpha1.AcceleratorVolume{ - { - Name: "cuda-lib", - HostPath: "/home/cuda", - MountPath: "/usr/local/cuda", - }, - }, - }, - }, - } - - for _, c := range testCases { - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &pytorchJobFake.Clientset{}, recorder, c.jobSpec) - - job.setup(config) - - if err != nil { - t.Errorf("j.setup error: %v", err) - } - - if job.status.Phase != c.expectPhase { - t.Errorf("job.job.Status.Phase Want: %v Got:%v ", c.expectPhase, job.status.Phase) - } - - if job.status.Reason != c.expectReason { - t.Errorf("job.job.Status.Reason Want: %v Got:%v ", c.expectReason, job.status.Reason) - } - - if job.status.State != c.expectState { - t.Errorf("job.job.Status.State Want: %v Got:%v ", c.expectState, job.status.State) - } - - // Make sure the runtime id is set if the job didn't fail. - if c.expectState != torchv1alpha1.StateFailed && job.job.Spec.RuntimeId == "" { - t.Errorf("RuntimeId should not be empty after calling setup.") - } - - if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes) != c.expectMounts { - t.Errorf("Expect %v Volumes got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Volumes)) - } - - if len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts) != c.expectMounts { - t.Errorf("Expect %v VolumeMounts got %v", c.expectMounts, len(job.job.Spec.ReplicaSpecs[0].Template.Spec.Containers[0].VolumeMounts)) - } - } -}