From 849ae66f0caeca0dece6b2c6479bace3c8963c60 Mon Sep 17 00:00:00 2001 From: liuyu <> Date: Tue, 7 Jan 2025 19:49:30 +0800 Subject: [PATCH 1/2] fix: disable gpu before uninstall if it was not already disabled --- pkg/gpu/module.go | 1 + pkg/gpu/prepares.go | 27 +++++++++++++++++++++++++-- pkg/pipelines/gpu_uninstall.go | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pkg/gpu/module.go b/pkg/gpu/module.go index ffe6138a..3fa4c7f4 100644 --- a/pkg/gpu/module.go +++ b/pkg/gpu/module.go @@ -361,6 +361,7 @@ func (l *NodeUnlabelingModule) Init() { }, }, new(K8sNodeInstalled), + new(GpuDevicePluginInstalled), }, Action: new(RestartPlugin), Parallel: false, diff --git a/pkg/gpu/prepares.go b/pkg/gpu/prepares.go index 4c042a7a..48aa8412 100644 --- a/pkg/gpu/prepares.go +++ b/pkg/gpu/prepares.go @@ -76,7 +76,8 @@ type K8sNodeInstalled struct { func (p *K8sNodeInstalled) PreCheck(runtime connector.Runtime) (bool, error) { client, err := clientset.NewKubeClient() if err != nil { - return false, errors.Wrap(errors.WithStack(err), "kubeclient create error") + logger.Debug(errors.Wrap(errors.WithStack(err), "kubeclient create error")) + return false, nil } node, err := client.Kubernetes().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) @@ -84,7 +85,9 @@ func (p *K8sNodeInstalled) PreCheck(runtime connector.Runtime) (bool, error) { if apierrors.IsNotFound(err) { return false, nil } - return false, errors.Wrap(errors.WithStack(err), "list nodes error") + + logger.Debug(errors.Wrap(errors.WithStack(err), "list nodes error")) + return false, nil } if len(node.Items) == 0 { @@ -127,3 +130,23 @@ func (p *ContainerdInstalled) PreCheck(runtime connector.Runtime) (bool, error) logger.Info("containerd is not installed, ignore task") return false, nil } + +type GpuDevicePluginInstalled struct { + common.KubePrepare +} + +func (p *GpuDevicePluginInstalled) PreCheck(runtime connector.Runtime) (bool, error) { + client, err := clientset.NewKubeClient() + if err != nil 
{ + logger.Debug(errors.Wrap(errors.WithStack(err), "kubeclient create error")) + return false, nil + } + + plugins, err := client.Kubernetes().CoreV1().Pods("kube-system").List(context.Background(), metav1.ListOptions{LabelSelector: "name=nvidia-device-plugin-ds"}) + if err != nil { + logger.Debug(err) + return false, nil + } + + return len(plugins.Items) > 0, nil +} diff --git a/pkg/pipelines/gpu_uninstall.go b/pkg/pipelines/gpu_uninstall.go index b3f06a9d..55c91c83 100644 --- a/pkg/pipelines/gpu_uninstall.go +++ b/pkg/pipelines/gpu_uninstall.go @@ -21,6 +21,7 @@ func UninstallGpuDrivers() error { Name: "UninstallGpuDrivers", Runtime: runtime, Modules: []module.Module{ + &gpu.NodeUnlabelingModule{}, &gpu.UninstallCudaModule{}, &gpu.RestartContainerdModule{}, }, From 9cdac6a66e112bb571143a78da783e6bb8f68765 Mon Sep 17 00:00:00 2001 From: liuyu <> Date: Fri, 17 Jan 2025 21:00:52 +0800 Subject: [PATCH 2/2] fix: modify some commands to be compatible with running in the container --- pkg/bootstrap/os/tasks.go | 9 +- pkg/common/common.go | 3 + pkg/common/kube_runtime.go | 8 +- pkg/images/images.go | 192 ++++++++++++++++++------------------- pkg/images/load.go | 9 +- pkg/k3s/tasks.go | 74 +++++++------- 6 files changed, 154 insertions(+), 141 deletions(-) diff --git a/pkg/bootstrap/os/tasks.go b/pkg/bootstrap/os/tasks.go index c09515f8..a6d72907 100644 --- a/pkg/bootstrap/os/tasks.go +++ b/pkg/bootstrap/os/tasks.go @@ -232,9 +232,12 @@ func (n *NodeConfigureOS) Execute(runtime connector.Runtime) error { } } - _, err1 := runtime.GetRunner().SudoCmd(fmt.Sprintf("hostnamectl set-hostname %s && sed -i '/^127.0.1.1/s/.*/127.0.1.1 %s/g' /etc/hosts", host.GetName(), host.GetName()), false, false) - if err1 != nil { - return errors.Wrap(errors.WithStack(err1), "Failed to override hostname") + // when running in a docker container, /etc/hosts is a bind mount and cannot be replaced via the mv command + if !n.KubeConf.Arg.IsOlaresInContainer { + _, err1 := 
runtime.GetRunner().SudoCmd(fmt.Sprintf("hostnamectl set-hostname %s && sed -i '/^127.0.1.1/s/.*/127.0.1.1 %s/g' /etc/hosts", host.GetName(), host.GetName()), false, false) + if err1 != nil { + return errors.Wrap(errors.WithStack(err1), "Failed to override hostname") + } } if runtime.GetSystemInfo().IsWsl() { diff --git a/pkg/common/common.go b/pkg/common/common.go index ac045308..717729da 100755 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -296,6 +296,9 @@ const ( ENV_AUTO_ADD_FIREWALL_RULES = "AUTO_ADD_FIREWALL_RULES" ENV_TERMINUS_OS_DOMAINNAME = "TERMINUS_OS_DOMAINNAME" ENV_DEFAULT_WSL_DISTRO_LOCATION = "DEFAULT_WSL_DISTRO_LOCATION" // If set to 1, the default WSL distro storage will be used. + + ENV_CONTAINER = "container" + ENV_CONTAINER_MODE = "CONTAINER_MODE" // running in docker container ) // TerminusGlobalEnvs holds a group of general environment variables diff --git a/pkg/common/kube_runtime.go b/pkg/common/kube_runtime.go index 9fe4736f..98828a40 100755 --- a/pkg/common/kube_runtime.go +++ b/pkg/common/kube_runtime.go @@ -19,14 +19,15 @@ package common import ( "encoding/json" "fmt" - "github.com/pkg/errors" - "github.com/spf13/pflag" "net" "os" "path/filepath" "strconv" "strings" + "github.com/pkg/errors" + "github.com/spf13/pflag" + kubekeyapiv1alpha2 "bytetrade.io/web3os/installer/apis/kubekey/v1alpha2" kubekeyclientset "bytetrade.io/web3os/installer/clients/clientset/versioned" "bytetrade.io/web3os/installer/pkg/core/common" @@ -120,6 +121,8 @@ type Argument struct { HostIP string `json:"host_ip"` CudaVersion string `json:"cuda_version"` + + IsOlaresInContainer bool `json:"is_olares_in_container"` } type MasterHostConfig struct { @@ -241,6 +244,7 @@ func NewArgument() *Argument { } arg.IsCloudInstance, _ = strconv.ParseBool(os.Getenv(ENV_TERMINUS_IS_CLOUD_VERSION)) arg.PublicNetworkInfo.PubliclyAccessible, _ = strconv.ParseBool(os.Getenv(ENV_PUBLICLY_ACCESSIBLE)) + arg.IsOlaresInContainer = os.Getenv("CONTAINER_MODE") == "oic" return arg 
} diff --git a/pkg/images/images.go b/pkg/images/images.go index 2e9d5cf6..3b5c51b7 100644 --- a/pkg/images/images.go +++ b/pkg/images/images.go @@ -20,9 +20,7 @@ import ( "bufio" "fmt" "os" - "path/filepath" "strings" - "time" kubekeyapiv1alpha2 "bytetrade.io/web3os/installer/apis/kubekey/v1alpha2" "bytetrade.io/web3os/installer/pkg/common" @@ -130,101 +128,101 @@ func (images *Images) PullImages(runtime connector.Runtime, kubeConf *common.Kub return nil } -type LocalImage struct { - Filename string -} - -type LocalImages []LocalImage - -func (i LocalImages) LoadImages(runtime connector.Runtime, kubeConf *common.KubeConf) error { - loadCmd := "docker" - host := runtime.RemoteHost() - retry := func(f func() error, times int) (err error) { - for i := 0; i < times; i++ { - err = f() - if err == nil { - return nil - } - var dur = 5 + (i+1)*10 - logger.Warnf("load image %s failed, wait for %d seconds(%d times)", err, dur, i+1) - if (i + 1) < times { - time.Sleep(time.Duration(dur) * time.Second) - } - } - - return - } - - for _, image := range i { - switch { - case host.IsRole(common.Master): - // logger.Debugf("%s preloading image: %s", host.GetName(), image.Filename) - start := time.Now() - fileName := filepath.Base(image.Filename) - // fileName = strings.ReplaceAll(fileName, ".gz", "") - // fmt.Println(">>> ", fileName, HasSuffixI(image.Filename, ".tar.gz", ".tgz")) - if HasSuffixI(image.Filename, ".tar.gz", ".tgz") { - switch kubeConf.Cluster.Kubernetes.ContainerManager { - case "crio": - loadCmd = "ctr" // BUG - case "containerd": - loadCmd = "ctr -n k8s.io images import -" - case "isula": - loadCmd = "isula" - default: - loadCmd = "docker load" - } - - // continue if load image error - if err := retry(func() error { - logger.Infof("preloading image: %s", fileName) - if stdout, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("env PATH=$PATH gunzip -c %s | %s", image.Filename, loadCmd), false, false); err != nil { - return fmt.Errorf("%s", fileName) - } else { - 
logger.Infof("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) - // fmt.Printf("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) - } - return nil - }, 5); err != nil { - return fmt.Errorf("%s", fileName) - } - } else if HasSuffixI(image.Filename, ".tar") { - switch kubeConf.Cluster.Kubernetes.ContainerManager { - case "crio": - loadCmd = "ctr" // BUG - case "containerd": - loadCmd = "ctr -n k8s.io images import" - case "isula": - loadCmd = "isula" - default: - loadCmd = "docker load -i" - } - - if err := retry(func() error { - logger.Infof("preloading image: %s", fileName) - if stdout, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("env PATH=$PATH %s %s", loadCmd, image.Filename), false, false); err != nil { - return fmt.Errorf("%s", fileName) - } else { - logger.Infof("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) - // fmt.Printf("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) - } - - return nil - }, 5); err != nil { - return fmt.Errorf("%s", fileName) - } - } else { - logger.Warnf("invalid image file name %s, skip ...", image.Filename) - return nil - } - default: - continue - } - - } - return nil - -} +// type LocalImage struct { +// Filename string +// } + +// type LocalImages []LocalImage + +// func (i LocalImages) LoadImages(runtime connector.Runtime, kubeConf *common.KubeConf) error { +// loadCmd := "docker" +// host := runtime.RemoteHost() +// retry := func(f func() error, times int) (err error) { +// for i := 0; i < times; i++ { +// err = f() +// if err == nil { +// return nil +// } +// var dur = 5 + (i+1)*10 +// logger.Warnf("load image %s failed, wait for %d seconds(%d times)", err, dur, i+1) +// if (i + 1) < times { +// time.Sleep(time.Duration(dur) * time.Second) +// } +// } + +// return +// } + +// for _, image := range i { +// switch { +// case host.IsRole(common.Master): +// // logger.Debugf("%s preloading image: %s", host.GetName(), image.Filename) +// start := 
time.Now() +// fileName := filepath.Base(image.Filename) +// // fileName = strings.ReplaceAll(fileName, ".gz", "") +// // fmt.Println(">>> ", fileName, HasSuffixI(image.Filename, ".tar.gz", ".tgz")) +// if HasSuffixI(image.Filename, ".tar.gz", ".tgz") { +// switch kubeConf.Cluster.Kubernetes.ContainerManager { +// case "crio": +// loadCmd = "ctr" // BUG +// case "containerd": +// loadCmd = "ctr -n k8s.io images import -" +// case "isula": +// loadCmd = "isula" +// default: +// loadCmd = "docker load" +// } + +// // continue if load image error +// if err := retry(func() error { +// logger.Infof("preloading image: %s", fileName) +// if stdout, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("env PATH=$PATH gunzip -c %s | %s", image.Filename, loadCmd), false, false); err != nil { +// return fmt.Errorf("%s", fileName) +// } else { +// logger.Infof("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) +// // fmt.Printf("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) +// } +// return nil +// }, 5); err != nil { +// return fmt.Errorf("%s", fileName) +// } +// } else if HasSuffixI(image.Filename, ".tar") { +// switch kubeConf.Cluster.Kubernetes.ContainerManager { +// case "crio": +// loadCmd = "ctr" // BUG +// case "containerd": +// loadCmd = "ctr -n k8s.io images import" +// case "isula": +// loadCmd = "isula" +// default: +// loadCmd = "docker load -i" +// } + +// if err := retry(func() error { +// logger.Infof("preloading image: %s", fileName) +// if stdout, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("env PATH=$PATH %s %s", loadCmd, image.Filename), false, false); err != nil { +// return fmt.Errorf("%s", fileName) +// } else { +// logger.Infof("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) +// // fmt.Printf("%s in %s\n", formatLoadImageRes(stdout, fileName), time.Since(start)) +// } + +// return nil +// }, 5); err != nil { +// return fmt.Errorf("%s", fileName) +// } +// } else { +// 
logger.Warnf("invalid image file name %s, skip ...", image.Filename) +// return nil +// } +// default: +// continue +// } + +// } +// return nil + +// } func formatLoadImageRes(str string, fileName string) string { if strings.Contains(str, "(sha256:") { diff --git a/pkg/images/load.go b/pkg/images/load.go index 3ea299c5..17966cde 100644 --- a/pkg/images/load.go +++ b/pkg/images/load.go @@ -119,8 +119,13 @@ func (t *LoadImages) Execute(runtime connector.Runtime) (reserr error) { var loadCmd string var loadParm string - if runtime.GetSystemInfo().GetFsType() == "zfs" { - loadParm = "--snapshotter=zfs" + // unused + // if runtime.GetSystemInfo().GetFsType() == "zfs" { + // loadParm = "--snapshotter=zfs" + // } + + if t.KubeConf.Arg.IsOlaresInContainer { + loadParm = "--no-unpack" } if runtime.RemoteHost().GetOs() == common.Darwin { diff --git a/pkg/k3s/tasks.go b/pkg/k3s/tasks.go index 86558a5c..da6059d9 100644 --- a/pkg/k3s/tasks.go +++ b/pkg/k3s/tasks.go @@ -17,17 +17,17 @@ package k3s import ( - "bytetrade.io/web3os/installer/pkg/storage" - storagetpl "bytetrade.io/web3os/installer/pkg/storage/templates" "context" "encoding/base64" "fmt" - "os" "path" "path/filepath" "strings" "time" + "bytetrade.io/web3os/installer/pkg/storage" + storagetpl "bytetrade.io/web3os/installer/pkg/storage/templates" + "bytetrade.io/web3os/installer/pkg/container" "bytetrade.io/web3os/installer/pkg/manifest" "bytetrade.io/web3os/installer/pkg/registry" @@ -352,40 +352,40 @@ func (e *EnableK3sService) Execute(runtime connector.Runtime) error { return nil } -type PreloadImagesService struct { - common.KubeAction -} - -func (p *PreloadImagesService) Execute(runtime connector.Runtime) error { - if utils.IsExist(common.K3sImageDir) { - if err := util.CreateDir(common.K3sImageDir); err != nil { - logger.Errorf("create dir %s failed: %v", common.K3sImageDir, err) - return err - } - } - - fileInfos, err := os.ReadDir(common.K3sImageDir) - if err != nil { - logger.Errorf("Unable to read images in 
%s: %v", common.K3sImageDir, err) - return nil - } - - var loadingImages images.LocalImages - for _, fileInfo := range fileInfos { - if fileInfo.IsDir() { - continue - } - - filePath := filepath.Join(common.K3sImageDir, fileInfo.Name()) - - loadingImages = append(loadingImages, images.LocalImage{Filename: filePath}) - } - - if err := loadingImages.LoadImages(runtime, p.KubeConf); err != nil { - return errors.Wrap(errors.WithStack(err), "preload image failed") - } - return nil -} +// type PreloadImagesService struct { +// common.KubeAction +// } + +// func (p *PreloadImagesService) Execute(runtime connector.Runtime) error { +// if utils.IsExist(common.K3sImageDir) { +// if err := util.CreateDir(common.K3sImageDir); err != nil { +// logger.Errorf("create dir %s failed: %v", common.K3sImageDir, err) +// return err +// } +// } + +// fileInfos, err := os.ReadDir(common.K3sImageDir) +// if err != nil { +// logger.Errorf("Unable to read images in %s: %v", common.K3sImageDir, err) +// return nil +// } + +// var loadingImages images.LocalImages +// for _, fileInfo := range fileInfos { +// if fileInfo.IsDir() { +// continue +// } + +// filePath := filepath.Join(common.K3sImageDir, fileInfo.Name()) + +// loadingImages = append(loadingImages, images.LocalImage{Filename: filePath}) +// } + +// if err := loadingImages.LoadImages(runtime, p.KubeConf); err != nil { +// return errors.Wrap(errors.WithStack(err), "preload image failed") +// } +// return nil +// } type CopyK3sKubeConfig struct { common.KubeAction