Skip to content

Commit

Permalink
validate subcommand: various changes (#1088)
Browse files Browse the repository at this point in the history
Remove "connections" validation. This checked that various services were
TCP-reachable (and resolvable by kube-dns), from various pods. For
example, it checked that pgsql was reachable from frontend. This
particular validation wouldn't always be useful, e.g. when external
postgres clusters were used. It was also failing due to the removal of
`nc` from base images.

This upstream health-checking _could_ be replaced by doing this check in
the readiness probe handler. For example: frontend pods could check the
availability of a pgsql connection, and fail their readiness probe if
that is unavailable. This is a contentious topic however and should be
approached with caution: mean-time-to-recovery can actually
substantially increase under some configurations due to cascading
failing readiness probes. This commit conservatively aims to restore
utility of src-validate by removing the always-failing connections
check, and there are no immediate plans to add upstream readiness
probing.

This commit also changes src-validate to exit with non-zero status when
there are no pods and/or services. This white-box check does have some
utility, as a sanity check that we've deployed anything at all. The "no
PVCs" case is left as a warning, in case the user is using external DBs
everywhere, and might legitimately have no PVCs in their namespace.

This closes
https://linear.app/sourcegraph/issue/REL-42/src-validate-should-not-exit-with-zero-when-no-pods-are-available-at
and
https://linear.app/sourcegraph/issue/REL-40/fix-src-validate-now-that-nc-is-not-available,
assuming we agree with
https://linear.app/sourcegraph/issue/REL-40/fix-src-validate-now-that-nc-is-not-available#comment-7b40beee.

It will be followed-up with an implementation of
https://linear.app/sourcegraph/issue/REL-43/src-validate-should-perform-black-box-validations-of-core-flows.
  • Loading branch information
Craig Furman authored May 21, 2024
1 parent d16db0a commit 3ce717f
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 137 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ All notable changes to `src-cli` are documented in this file.

### Fixed

- validate kube: connections check removed.
- validate kube: exits non-zero when there are no pods or services in the target
namespace.

### Removed

## 5.4.0
Expand Down
141 changes: 4 additions & 137 deletions internal/validate/kube/kube.go
Original file line number Diff line number Diff line change
@@ -1,25 +1,21 @@
package kube

import (
"bytes"
"context"
"fmt"
"io"
"log"
"os"
"path/filepath"
"reflect"
"regexp"
"strings"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"

"k8s.io/client-go/tools/remotecommand"
"k8s.io/client-go/util/homedir"

"github.com/aws/aws-sdk-go-v2/service/ec2"
Expand All @@ -30,12 +26,6 @@ import (
"github.com/sourcegraph/sourcegraph/lib/errors"
)

var (
sourcegraphFrontend = regexp.MustCompile(`^sourcegraph-frontend-.*`)
sourcegraphRepoUpdater = regexp.MustCompile(`^repo-updater-.*`)
sourcegraphWorker = regexp.MustCompile(`^worker-.*`)
)

type Option = func(config *Config)

type Config struct {
Expand Down Expand Up @@ -95,7 +85,6 @@ func Validate(ctx context.Context, clientSet *kubernetes.Clientset, restConfig *
{Pods, "validating pods", "pods validated", "validating pods failed"},
{Services, "validating services", "services validated", "validating services failed"},
{PVCs, "validating pvcs", "pvcs validated", "validating pvcs failed"},
{Connections, "validating connections", "connections validated", "validating connections failed"},
}

if cfg.eks {
Expand Down Expand Up @@ -213,9 +202,9 @@ func Pods(ctx context.Context, config *Config) ([]validate.Result, error) {

if len(pods.Items) == 0 {
results = append(results, validate.Result{
Status: validate.Warning,
Status: validate.Failure,
Message: fmt.Sprintf(
"No pods exist on namespace '%s'. check namespace/cluster",
"no pods exist in namespace '%s'. check namespace/cluster",
config.namespace,
),
})
Expand Down Expand Up @@ -305,9 +294,9 @@ func Services(ctx context.Context, config *Config) ([]validate.Result, error) {

if len(services.Items) <= 1 {
results = append(results, validate.Result{
Status: validate.Warning,
Status: validate.Failure,
Message: fmt.Sprintf(
"unexpected number of services on namespace '%s'; check namespace/cluster",
"no services in namespace '%s'; check namespace/cluster",
config.namespace,
),
})
Expand Down Expand Up @@ -384,128 +373,6 @@ func validatePVC(pvc *corev1.PersistentVolumeClaim) []validate.Result {
return results
}

type connection struct {
src corev1.Pod
dest []dest
}

type dest struct {
addr string
port string
}

// Connections will validate that Sourcegraph services can reach each other over the network.
func Connections(ctx context.Context, config *Config) ([]validate.Result, error) {
var results []validate.Result
var connections []connection

pods, err := config.clientSet.CoreV1().Pods(config.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, err
}

if len(pods.Items) == 0 {
results = append(results, validate.Result{
Status: validate.Warning,
Message: fmt.Sprintf(
"cannot check connections: zero pods exist in namespace '%s'",
config.namespace,
),
})

return results, nil
}

// iterate through pods looking for specific pod name prefixes, then construct
// a relationship map between pods that should have connectivity with each other
for _, pod := range pods.Items {
switch name := pod.Name; {
case sourcegraphFrontend.MatchString(name): // pod is one of the sourcegraph front-end pods
connections = append(connections, connection{
src: pod,
dest: []dest{
{
addr: "pgsql",
port: "5432",
},
{
addr: "indexed-search",
port: "6070",
},
{
addr: "repo-updater",
port: "3182",
},
{
addr: "syntect-server",
port: "9238",
},
},
})
case sourcegraphWorker.MatchString(name): // pod is a worker pod
connections = append(connections, connection{
src: pod,
dest: []dest{
{
addr: "pgsql",
port: "5432",
},
},
})
case sourcegraphRepoUpdater.MatchString(name):
connections = append(connections, connection{
src: pod,
dest: []dest{
{
addr: "pgsql",
port: "5432",
},
},
})
}
}

// use network relationships constructed above to test network connection for each relationship
for _, c := range connections {
for _, d := range c.dest {
req := config.clientSet.CoreV1().RESTClient().Post().
Resource("pods").
Name(c.src.Name).
Namespace(c.src.Namespace).
SubResource("exec")

req.VersionedParams(&corev1.PodExecOptions{
Command: []string{"/usr/bin/nc", "-z", d.addr, d.port},
Stdin: false,
Stdout: true,
Stderr: true,
TTY: false,
}, scheme.ParameterCodec)

exec, err := remotecommand.NewSPDYExecutor(config.restConfig, "POST", req.URL())
if err != nil {
return nil, err
}

var stdout, stderr bytes.Buffer

err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{
Stdout: &stdout,
Stderr: &stderr,
})
if err != nil {
return nil, errors.Wrapf(err, "connecting to %s", c.src.Name)
}

if stderr.String() != "" {
results = append(results, validate.Result{Status: validate.Failure, Message: stderr.String()})
}
}
}

return results, nil
}

func CurrentContextSetTo(clusterService string) error {
currentContext, err := GetCurrentContext()
if err != nil {
Expand Down

0 comments on commit 3ce717f

Please sign in to comment.