From 73416fcece67ac00f083cd801754ee53fc0dc915 Mon Sep 17 00:00:00 2001 From: Yusuke Kadowaki Date: Mon, 29 Jan 2024 16:57:34 +0900 Subject: [PATCH] Add HPA for read replica (#2307) * Add hpa for read replica * Update schema * Add read replica flag to agent ngt to skip save index when it's read replica instance * Refactor * Add tests for Close * enable readreplica hpa on e2e * Add error handling for write operation to read replica in CreateIndex and saveIndex --- .github/helm/values/values-readreplica.yaml | 3 +- charts/vald-benchmark-operator/README.md | 186 ++++++++++ .../vald-helm-operator/crds/valdrelease.yaml | 190 +++++++++- .../vald-readreplica/templates/configmap.yaml | 47 +++ .../templates/deployment.yaml | 8 +- charts/vald-readreplica/templates/hpa.yaml | 67 ++++ charts/vald/README.md | 97 ++++- .../vald/templates/gateway/lb/configmap.yaml | 4 +- charts/vald/values.schema.json | 340 +++++++++++++++++- charts/vald/values.yaml | 20 +- internal/config/ngt.go | 3 + internal/errors/agent.go | 3 + pkg/agent/core/ngt/service/ngt.go | 22 +- pkg/agent/core/ngt/service/ngt_test.go | 205 +++++++++++ pkg/agent/core/ngt/service/option.go | 8 + pkg/agent/core/ngt/usecase/agentd.go | 1 + 16 files changed, 1182 insertions(+), 22 deletions(-) create mode 100644 charts/vald-benchmark-operator/README.md create mode 100644 charts/vald-readreplica/templates/configmap.yaml create mode 100644 charts/vald-readreplica/templates/hpa.yaml diff --git a/.github/helm/values/values-readreplica.yaml b/.github/helm/values/values-readreplica.yaml index 2e4b86b89f..6f392b8816 100644 --- a/.github/helm/values/values-readreplica.yaml +++ b/.github/helm/values/values-readreplica.yaml @@ -58,7 +58,8 @@ agent: readreplica: enabled: true snapshot_classname: "csi-hostpath-snapclass" - replica: 1 + hpa: + enabled: true discoverer: minReplicas: 1 diff --git a/charts/vald-benchmark-operator/README.md b/charts/vald-benchmark-operator/README.md new file mode 100644 index 0000000000..77a8b21770 --- 
/dev/null +++ b/charts/vald-benchmark-operator/README.md @@ -0,0 +1,186 @@ +# vald-benchmark-operator + +![Version: v1.7.5](https://img.shields.io/badge/Version-v1.7.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) + +A benchmark operator for benchmarking the Vald cluster. + +**Homepage:** + +## Maintainers + +| Name | Email | Url | +| -------- | -------------------- | --- | +| kpango | | | +| vankichi | | | +| kmrmt | | | + +## Source Code + +- + +## Values + +| Key | Type | Default | Description | +| ----------------------------------------------------------------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------- | +| affinity | object | `{}` | affinity | +| annotations | object | `{}` | deployment annotations | +| image.pullPolicy | string | `"Always"` | image pull policy | +| image.repository | string | `"vdaas/vald-benchmark-operator"` | job image repository | +| image.tag | string | `"v1.7.5"` | image tag for job docker image | +| job_image.pullPolicy | string | `"Always"` | | +| job_image.repository | string | `"vdaas/vald-benchmark-job"` | | +| job_image.tag | string | `"v1.7.5"` | | +| logging.format | string | `"raw"` | logging format. logging format must be `raw` or `json` | +| logging.level | string | `"debug"` | logging level. logging level must be `debug`, `info`, `warn`, `error` or `fatal`. | +| logging.logger | string | `"glg"` | logger name. currently logger must be `glg` or `zap`. 
| +| name | string | `"vald-benchmark-operator"` | name of the deployment | +| nodeSelector | object | `{}` | node labels for pod assignment | +| observability.enabled | bool | `false` | | +| observability.metrics.enable_cgo | bool | `true` | | +| observability.metrics.enable_goroutine | bool | `true` | | +| observability.metrics.enable_memory | bool | `true` | | +| observability.metrics.enable_version_info | bool | `true` | | +| observability.metrics.version_info_labels[0] | string | `"vald_version"` | | +| observability.metrics.version_info_labels[1] | string | `"server_name"` | | +| observability.metrics.version_info_labels[2] | string | `"git_commit"` | | +| observability.metrics.version_info_labels[3] | string | `"build_time"` | | +| observability.metrics.version_info_labels[4] | string | `"go_version"` | | +| observability.metrics.version_info_labels[5] | string | `"go_os"` | | +| observability.metrics.version_info_labels[6] | string | `"go_arch"` | | +| observability.metrics.version_info_labels[7] | string | `"ngt_version"` | | +| observability.otlp.attribute.namespace | string | `"_MY_POD_NAMESPACE_"` | | +| observability.otlp.attribute.node_name | string | `"_MY_NODE_NAME_"` | | +| observability.otlp.attribute.pod_name | string | `"_MY_POD_NAME_"` | | +| observability.otlp.attribute.service_name | string | `"vald-benchmark-operator"` | | +| observability.otlp.collector_endpoint | string | `""` | | +| observability.otlp.metrics_export_interval | string | `"1s"` | | +| observability.otlp.metrics_export_timeout | string | `"1m"` | | +| observability.otlp.trace_batch_timeout | string | `"1s"` | | +| observability.otlp.trace_export_timeout | string | `"1m"` | | +| observability.otlp.trace_max_export_batch_size | int | `1024` | | +| observability.otlp.trace_max_queue_size | int | `256` | | +| observability.trace.enabled | bool | `false` | | +| observability.trace.sampling_rate | int | `1` | | +| podAnnotations | object | `{}` | pod annotations | +| 
podSecurityContext | object | `{"fsGroup":65532,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":65532,"runAsNonRoot":true,"runAsUser":65532}` | security context for pod | +| rbac.create | bool | `true` | required roles and rolebindings will be created | +| rbac.name | string | `"vald-benchmark-operator"` | name of roles and rolebindings | +| replicas | int | `1` | the number of replica for deployment | +| resources | object | `{"limits":{"cpu":"300m","memory":"300Mi"},"requests":{"cpu":"200m","memory":"200Mi"}}` | kubernetes resources of pod | +| securityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsGroup":65532,"runAsNonRoot":true,"runAsUser":65532}` | security context for container | +| server_config.full_shutdown_duration | string | `"600s"` | | +| server_config.healths.liveness.enabled | bool | `true` | | +| server_config.healths.liveness.host | string | `"0.0.0.0"` | | +| server_config.healths.liveness.livenessProbe.failureThreshold | int | `2` | liveness probe failure threshold | +| server_config.healths.liveness.livenessProbe.httpGet.path | string | `"/liveness"` | liveness probe path | +| server_config.healths.liveness.livenessProbe.httpGet.port | string | `"liveness"` | liveness probe port | +| server_config.healths.liveness.livenessProbe.httpGet.scheme | string | `"HTTP"` | liveness probe scheme | +| server_config.healths.liveness.livenessProbe.initialDelaySeconds | int | `15` | liveness probe initial delay seconds | +| server_config.healths.liveness.livenessProbe.periodSeconds | int | `20` | liveness probe period seconds | +| server_config.healths.liveness.livenessProbe.successThreshold | int | `1` | liveness probe success threshold | +| server_config.healths.liveness.livenessProbe.timeoutSeconds | int | `5` | liveness probe timeout seconds | +| server_config.healths.liveness.port | int | `3000` | | +| 
server_config.healths.liveness.server.http.handler_timeout | string | `""` | | +| server_config.healths.liveness.server.http.idle_timeout | string | `""` | | +| server_config.healths.liveness.server.http.read_header_timeout | string | `""` | | +| server_config.healths.liveness.server.http.read_timeout | string | `""` | | +| server_config.healths.liveness.server.http.shutdown_duration | string | `"5s"` | | +| server_config.healths.liveness.server.http.write_timeout | string | `""` | | +| server_config.healths.liveness.server.mode | string | `""` | | +| server_config.healths.liveness.server.network | string | `"tcp"` | | +| server_config.healths.liveness.server.probe_wait_time | string | `"3s"` | | +| server_config.healths.liveness.server.socket_path | string | `""` | | +| server_config.healths.liveness.servicePort | int | `3000` | | +| server_config.healths.readiness.enabled | bool | `true` | | +| server_config.healths.readiness.host | string | `"0.0.0.0"` | | +| server_config.healths.readiness.port | int | `3001` | | +| server_config.healths.readiness.readinessProbe.failureThreshold | int | `2` | readiness probe failure threshold | +| server_config.healths.readiness.readinessProbe.httpGet.path | string | `"/readiness"` | readiness probe path | +| server_config.healths.readiness.readinessProbe.httpGet.port | string | `"readiness"` | readiness probe port | +| server_config.healths.readiness.readinessProbe.httpGet.scheme | string | `"HTTP"` | readiness probe scheme | +| server_config.healths.readiness.readinessProbe.initialDelaySeconds | int | `10` | readiness probe initial delay seconds | +| server_config.healths.readiness.readinessProbe.periodSeconds | int | `3` | readiness probe period seconds | +| server_config.healths.readiness.readinessProbe.successThreshold | int | `1` | readiness probe success threshold | +| server_config.healths.readiness.readinessProbe.timeoutSeconds | int | `2` | readiness probe timeout seconds | +| 
server_config.healths.readiness.server.http.handler_timeout | string | `""` | | +| server_config.healths.readiness.server.http.idle_timeout | string | `""` | | +| server_config.healths.readiness.server.http.read_header_timeout | string | `""` | | +| server_config.healths.readiness.server.http.read_timeout | string | `""` | | +| server_config.healths.readiness.server.http.shutdown_duration | string | `"0s"` | | +| server_config.healths.readiness.server.http.write_timeout | string | `""` | | +| server_config.healths.readiness.server.mode | string | `""` | | +| server_config.healths.readiness.server.network | string | `"tcp"` | | +| server_config.healths.readiness.server.probe_wait_time | string | `"3s"` | | +| server_config.healths.readiness.server.socket_path | string | `""` | | +| server_config.healths.readiness.servicePort | int | `3001` | | +| server_config.healths.startup.enabled | bool | `true` | enable startup probe. | +| server_config.healths.startup.startupProbe.failureThreshold | int | `30` | | +| server_config.healths.startup.startupProbe.httpGet.path | string | `"/liveness"` | | +| server_config.healths.startup.startupProbe.httpGet.port | string | `"liveness"` | | +| server_config.healths.startup.startupProbe.httpGet.scheme | string | `"HTTP"` | | +| server_config.healths.startup.startupProbe.initialDelaySeconds | int | `5` | | +| server_config.healths.startup.startupProbe.periodSeconds | int | `5` | | +| server_config.healths.startup.startupProbe.successThreshold | int | `1` | | +| server_config.healths.startup.startupProbe.timeoutSeconds | int | `2` | | +| server_config.metrics.pprof.enabled | bool | `false` | | +| server_config.metrics.pprof.host | string | `"0.0.0.0"` | | +| server_config.metrics.pprof.port | int | `6060` | | +| server_config.metrics.pprof.server.http.handler_timeout | string | `"5s"` | | +| server_config.metrics.pprof.server.http.idle_timeout | string | `"2s"` | | +| server_config.metrics.pprof.server.http.read_header_timeout | 
string | `"1s"` | | +| server_config.metrics.pprof.server.http.read_timeout | string | `"1s"` | | +| server_config.metrics.pprof.server.http.shutdown_duration | string | `"5s"` | | +| server_config.metrics.pprof.server.http.write_timeout | string | `"1m"` | | +| server_config.metrics.pprof.server.mode | string | `"REST"` | | +| server_config.metrics.pprof.server.network | string | `"tcp"` | | +| server_config.metrics.pprof.server.probe_wait_time | string | `"3s"` | | +| server_config.metrics.pprof.server.socket_path | string | `""` | | +| server_config.servers.grpc.enabled | bool | `true` | | +| server_config.servers.grpc.host | string | `"0.0.0.0"` | | +| server_config.servers.grpc.name | string | `"grpc"` | | +| server_config.servers.grpc.port | int | `8081` | | +| server_config.servers.grpc.server.grpc.bidirectional_stream_concurrency | int | `20` | | +| server_config.servers.grpc.server.grpc.connection_timeout | string | `""` | | +| server_config.servers.grpc.server.grpc.enable_reflection | bool | `true` | | +| server_config.servers.grpc.server.grpc.header_table_size | int | `0` | | +| server_config.servers.grpc.server.grpc.initial_conn_window_size | int | `0` | | +| server_config.servers.grpc.server.grpc.initial_window_size | int | `0` | | +| server_config.servers.grpc.server.grpc.interceptors | list | `[]` | | +| server_config.servers.grpc.server.grpc.keepalive.max_conn_age | string | `""` | gRPC server keep alive max connection age | +| server_config.servers.grpc.server.grpc.keepalive.max_conn_age_grace | string | `""` | gRPC server keep alive max connection age grace | +| server_config.servers.grpc.server.grpc.keepalive.max_conn_idle | string | `""` | gRPC server keep alive max connection idle | +| server_config.servers.grpc.server.grpc.keepalive.min_time | string | `"60s"` | gRPC server keep alive min_time | +| server_config.servers.grpc.server.grpc.keepalive.permit_without_stream | bool | `true` | gRPC server keep alive permit_without_stream | +| 
server_config.servers.grpc.server.grpc.keepalive.time | string | `"120s"` | gRPC server keep alive time | +| server_config.servers.grpc.server.grpc.keepalive.timeout | string | `"30s"` | gRPC server keep alive timeout | +| server_config.servers.grpc.server.grpc.max_header_list_size | int | `0` | | +| server_config.servers.grpc.server.grpc.max_receive_message_size | int | `0` | | +| server_config.servers.grpc.server.grpc.max_send_message_size | int | `0` | | +| server_config.servers.grpc.server.grpc.read_buffer_size | int | `0` | | +| server_config.servers.grpc.server.grpc.write_buffer_size | int | `0` | | +| server_config.servers.grpc.server.mode | string | `"GRPC"` | | +| server_config.servers.grpc.server.network | string | `"tcp"` | | +| server_config.servers.grpc.server.probe_wait_time | string | `"3s"` | | +| server_config.servers.grpc.server.restart | bool | `true` | | +| server_config.servers.grpc.server.socket_path | string | `""` | | +| server_config.servers.grpc.serviecPort | int | `8081` | | +| server_config.servers.rest.enabled | bool | `false` | | +| server_config.tls.ca | string | `"/path/to/ca"` | | +| server_config.tls.cert | string | `"/path/to/cert"` | | +| server_config.tls.enabled | bool | `false` | | +| server_config.tls.insecure_skip_verify | bool | `false` | enable/disable skip SSL certificate verification | +| server_config.tls.key | string | `"/path/to/key"` | | +| service.annotations | object | `{}` | service annotations | +| service.enabled | bool | `true` | service enabled | +| service.externalTrafficPolicy | string | `""` | external traffic policy (can be specified when service type is LoadBalancer or NodePort) : Cluster or Local | +| service.labels | object | `{}` | service labels | +| service.type | string | `"ClusterIP"` | service type: ClusterIP, LoadBalancer or NodePort | +| serviceAccount.create | bool | `true` | service account will be created | +| serviceAccount.name | string | `"vald-benchmark-operator"` | name of service 
account | +| time_zone | string | `""` | time_zone | +| tolerations | list | `[]` | tolerations | +| version | string | `"v0.0.0"` | version of benchmark-operator config | + +--- + +Autogenerated from chart metadata using [helm-docs v1.11.3](https://github.com/norwoodj/helm-docs/releases/v1.11.3) diff --git a/charts/vald-helm-operator/crds/valdrelease.yaml b/charts/vald-helm-operator/crds/valdrelease.yaml index 214ee2b95d..4a9345982c 100644 --- a/charts/vald-helm-operator/crds/valdrelease.yaml +++ b/charts/vald-helm-operator/crds/valdrelease.yaml @@ -392,12 +392,23 @@ spec: type: string enabled: type: boolean + hpa: + type: object + properties: + enabled: + type: boolean + targetCPUUtilizationPercentage: + type: integer label_key: type: string + maxReplicas: + type: integer + minimum: 1 + minReplicas: + type: integer + minimum: 1 name: type: string - replica: - type: integer service: type: object properties: @@ -5314,6 +5325,181 @@ spec: type: boolean duration: type: string + read_client: + type: object + properties: + addrs: + type: array + items: + type: string + backoff: + type: object + properties: + backoff_factor: + type: number + backoff_time_limit: + type: string + enable_error_log: + type: boolean + initial_duration: + type: string + jitter_limit: + type: string + maximum_duration: + type: string + retry_count: + type: integer + call_option: + type: object + x-kubernetes-preserve-unknown-fields: true + circuit_breaker: + type: object + properties: + closed_error_rate: + type: number + closed_refresh_timeout: + type: string + half_open_error_rate: + type: number + min_samples: + type: integer + open_timeout: + type: string + connection_pool: + type: object + properties: + enable_dns_resolver: + type: boolean + enable_rebalance: + type: boolean + old_conn_close_duration: + type: string + rebalance_duration: + type: string + size: + type: integer + dial_option: + type: object + properties: + backoff_base_delay: + type: string + backoff_jitter: + type: number 
+ backoff_max_delay: + type: string + backoff_multiplier: + type: number + enable_backoff: + type: boolean + initial_connection_window_size: + type: integer + initial_window_size: + type: integer + insecure: + type: boolean + interceptors: + type: array + items: + type: string + enum: + - TraceInterceptor + keepalive: + type: object + properties: + permit_without_stream: + type: boolean + time: + type: string + timeout: + type: string + max_msg_size: + type: integer + min_connection_timeout: + type: string + net: + type: object + properties: + dialer: + type: object + properties: + dual_stack_enabled: + type: boolean + keepalive: + type: string + timeout: + type: string + dns: + type: object + properties: + cache_enabled: + type: boolean + cache_expiration: + type: string + refresh_duration: + type: string + socket_option: + type: object + properties: + ip_recover_destination_addr: + type: boolean + ip_transparent: + type: boolean + reuse_addr: + type: boolean + reuse_port: + type: boolean + tcp_cork: + type: boolean + tcp_defer_accept: + type: boolean + tcp_fast_open: + type: boolean + tcp_no_delay: + type: boolean + tcp_quick_ack: + type: boolean + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: boolean + key: + type: string + read_buffer_size: + type: integer + timeout: + type: string + write_buffer_size: + type: integer + health_check_duration: + type: string + max_recv_msg_size: + type: integer + max_retry_rpc_buffer_size: + type: integer + max_send_msg_size: + type: integer + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: boolean + key: + type: string + wait_for_ready: + type: boolean index_replica: type: integer minimum: 1 diff --git a/charts/vald-readreplica/templates/configmap.yaml b/charts/vald-readreplica/templates/configmap.yaml new file mode 100644 index 0000000000..f928c96752 
--- /dev/null +++ b/charts/vald-readreplica/templates/configmap.yaml @@ -0,0 +1,47 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +{{- $agent := .Values.agent -}} +{{- $readreplica := .Values.agent.readreplica -}} +{{- if $agent.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ $readreplica.name }}-config + labels: + app.kubernetes.io/name: {{ include "vald.name" . }} + helm.sh/chart: {{ include "vald.chart" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/version: {{ .Chart.Version }} + app.kubernetes.io/component: agent +data: + config.yaml: | + --- + version: {{ $agent.version }} + time_zone: {{ default .Values.defaults.time_zone $agent.time_zone }} + logging: + {{- $logging := dict "Values" $agent.logging "default" .Values.defaults.logging }} + {{- include "vald.logging" $logging | nindent 6 }} + server_config: + {{- $servers := dict "Values" $agent.server_config "default" .Values.defaults.server_config }} + {{- include "vald.servers" $servers | nindent 6 }} + observability: + {{- $observability := dict "Values" $agent.observability "default" .Values.defaults.observability }} + {{- include "vald.observability" $observability | nindent 6 }} + ngt: + {{- toYaml $agent.ngt | nindent 6 }} + is_readreplica: true +{{- end }} diff --git a/charts/vald-readreplica/templates/deployment.yaml 
b/charts/vald-readreplica/templates/deployment.yaml index 5cb2a9a926..880eb8e5ff 100644 --- a/charts/vald-readreplica/templates/deployment.yaml +++ b/charts/vald-readreplica/templates/deployment.yaml @@ -45,7 +45,7 @@ metadata: {{- end }} spec: {{- if not $agent.hpa.enabled }} - replicas: {{ $readreplica.replica }} + replicas: {{ $readreplica.minReplicas }} {{- end }} revisionHistoryLimit: {{ $agent.revisionHistoryLimit }} selector: @@ -110,7 +110,7 @@ spec: {{- toYaml $agent.env | nindent 12 }} {{- end }} volumeMounts: - - name: {{ $agent.name }}-config + - name: {{ $readreplica.name }}-config mountPath: /etc/server/ {{- if not $agent.ngt.enable_in_memory_mode }} {{- if $agent.ngt.index_path }} @@ -136,10 +136,10 @@ spec: {{- end }} terminationGracePeriodSeconds: {{ $agent.terminationGracePeriodSeconds }} volumes: - - name: {{ $agent.name }}-config + - name: {{ $readreplica.name }}-config configMap: defaultMode: 420 - name: {{ $agent.name }}-config + name: {{ $readreplica.name }}-config - name: {{ $readreplica.volume_name }} persistentVolumeClaim: claimName: {{ $readreplica.name }}-pvc-{{ $id }} diff --git a/charts/vald-readreplica/templates/hpa.yaml b/charts/vald-readreplica/templates/hpa.yaml new file mode 100644 index 0000000000..d0c90c073d --- /dev/null +++ b/charts/vald-readreplica/templates/hpa.yaml @@ -0,0 +1,67 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +{{- $agent := .Values.agent -}} +{{- $readreplica := .Values.agent.readreplica -}} +{{- $defaults := .Values.defaults -}} +{{- $release := .Release -}} +{{- $chart := .Chart -}} +{{- $valdname := include "vald.name" . -}} +{{- $valdchart := include "vald.chart" . -}} +{{- $cap := .Capabilities -}} +{{- if and $readreplica.enabled $readreplica.hpa.enabled }} +{{ range $id := until (int $agent.minReplicas) }} +--- +{{- if ($cap.APIVersions.Has "autoscaling/v2") }} +apiVersion: autoscaling/v2 +{{- else if ($cap.APIVersions.Has "autoscaling/v1") }} +apiVersion: autoscaling/v1 +{{- else if ($cap.APIVersions.Has "autoscaling/v2beta2") }} +apiVersion: autoscaling/v2beta2 +{{- else }} +apiVersion: autoscaling/v2beta1 +{{- end }} +kind: HorizontalPodAutoscaler +metadata: + name: {{ $readreplica.name }}-{{ $id }} + labels: + app.kubernetes.io/name: {{ $valdname }} + helm.sh/chart: {{ $valdchart }} + app.kubernetes.io/managed-by: {{ $release.Service }} + app.kubernetes.io/instance: {{ $release.Name }} + app.kubernetes.io/version: {{ $chart.Version }} + app.kubernetes.io/component: {{ $readreplica.component_name }} + {{ $readreplica.label_key }}: "{{ $id }}" +spec: + maxReplicas: {{ $readreplica.maxReplicas }} + minReplicas: {{ $readreplica.minReplicas }} + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ $readreplica.name }}-{{ $id }} +{{- if or ($cap.APIVersions.Has "autoscaling/v2") ($cap.APIVersions.Has "autoscaling/v2beta2") ($cap.APIVersions.Has "autoscaling/v2beta1") }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ $readreplica.hpa.targetCPUUtilizationPercentage }} +{{ else }} + targetCPUUtilizationPercentage: {{ $readreplica.hpa.targetCPUUtilizationPercentage }} +{{- end }} +status: +{{- end }} +{{- end }} diff --git a/charts/vald/README.md b/charts/vald/README.md index 831aba25dc..a9bd03c32e 100644 --- a/charts/vald/README.md +++ b/charts/vald/README.md @@ -109,12 +109,15 @@ Run the 
following command to install the chart, | agent.podPriority.value | int | `1000000000` | agent pod PriorityClass value | | agent.podSecurityContext | object | `{"fsGroup":65532,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":65532,"runAsNonRoot":true,"runAsUser":65532}` | security context for pod | | agent.progressDeadlineSeconds | int | `600` | progress deadline seconds | -| agent.readreplica | object | `{"component_name":"agent-readreplica","enabled":false,"label_key":"vald-readreplica-id","name":"vald-agent-ngt-readreplica","replica":2,"service":{"annotations":{}},"snapshot_classname":"","volume_name":"vald-agent-ngt-readreplica-pvc"}` | readreplica deployment annotations | +| agent.readreplica | object | `{"component_name":"agent-readreplica","enabled":false,"hpa":{"enabled":false,"targetCPUUtilizationPercentage":80},"label_key":"vald-readreplica-id","maxReplicas":3,"minReplicas":1,"name":"vald-agent-ngt-readreplica","service":{"annotations":{}},"snapshot_classname":"","volume_name":"vald-agent-ngt-readreplica-pvc"}` | settings for agent read replica resources | | agent.readreplica.component_name | string | `"agent-readreplica"` | app.kubernetes.io/component name of agent readreplica | | agent.readreplica.enabled | bool | `false` | [This feature is WORK IN PROGRESS]enable agent readreplica | +| agent.readreplica.hpa.enabled | bool | `false` | HPA enabled | +| agent.readreplica.hpa.targetCPUUtilizationPercentage | int | `80` | HPA CPU utilization percentage | | agent.readreplica.label_key | string | `"vald-readreplica-id"` | label key to identify read replica resources | +| agent.readreplica.maxReplicas | int | `3` | maximum number of replicas. if HPA is disabled, this value will be ignored. | +| agent.readreplica.minReplicas | int | `1` | minimum number of replicas. 
if HPA is disabled, the replicas will be set to this value | | agent.readreplica.name | string | `"vald-agent-ngt-readreplica"` | name of agent readreplica | -| agent.readreplica.replica | int | `2` | replica number of read replica | | agent.readreplica.service | object | `{"annotations":{}}` | service settings for read replica service resources | | agent.readreplica.service.annotations | object | `{}` | readreplica deployment annotations | | agent.readreplica.snapshot_classname | string | `""` | snapshot class name for snapshotter used for read replica | @@ -663,6 +666,7 @@ Run the following command to install the chart, | gateway.lb.gateway_config.discoverer.agent_client_options | object | `{}` | gRPC client options for agents (overrides defaults.grpc.client) | | gateway.lb.gateway_config.discoverer.client | object | `{}` | gRPC client for discoverer (overrides defaults.grpc.client) | | gateway.lb.gateway_config.discoverer.duration | string | `"200ms"` | | +| gateway.lb.gateway_config.discoverer.read_client | object | `{}` | gRPC client for read replica (overrides defaults.grpc.client) | | gateway.lb.gateway_config.index_replica | int | `3` | number of index replica | | gateway.lb.gateway_config.multi_operation_concurrency | int | `20` | number of concurrency of multiXXX api's operation | | gateway.lb.gateway_config.node_name | string | `""` | node name | @@ -710,6 +714,95 @@ Run the following command to install the chart, | gateway.lb.version | string | `"v0.0.0"` | version of gateway config | | gateway.lb.volumeMounts | list | `[]` | volume mounts | | gateway.lb.volumes | list | `[]` | volumes | +| gateway.mirror.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution | list | `[]` | node affinity preferred scheduling terms | +| gateway.mirror.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms | list | `[]` | node affinity required node selectors | +| 
gateway.mirror.affinity.podAffinity.preferredDuringSchedulingIgnoredDuringExecution | list | `[]` | pod affinity preferred scheduling terms | +| gateway.mirror.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution | list | `[]` | pod affinity required scheduling terms | +| gateway.mirror.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution | list | `[{"podAffinityTerm":{"labelSelector":{"matchExpressions":[{"key":"app","operator":"In","values":["vald-mirror-gateway"]}]},"topologyKey":"kubernetes.io/hostname"},"weight":100}]` | pod anti-affinity preferred scheduling terms | +| gateway.mirror.affinity.podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution | list | `[]` | pod anti-affinity required scheduling terms | +| gateway.mirror.annotations | object | `{}` | deployment annotations | +| gateway.mirror.clusterRole.enabled | bool | `true` | creates clusterRole resource | +| gateway.mirror.clusterRole.name | string | `"gateway-mirror"` | name of clusterRole | +| gateway.mirror.clusterRoleBinding.enabled | bool | `true` | creates clusterRoleBinding resource | +| gateway.mirror.clusterRoleBinding.name | string | `"gateway-mirror"` | name of clusterRoleBinding | +| gateway.mirror.enabled | bool | `false` | gateway enabled | +| gateway.mirror.env | list | `[{"name":"MY_NODE_NAME","valueFrom":{"fieldRef":{"fieldPath":"spec.nodeName"}}},{"name":"MY_POD_NAME","valueFrom":{"fieldRef":{"fieldPath":"metadata.name"}}},{"name":"MY_POD_NAMESPACE","valueFrom":{"fieldRef":{"fieldPath":"metadata.namespace"}}}]` | environment variables | +| gateway.mirror.externalTrafficPolicy | string | `""` | external traffic policy (can be specified when service type is LoadBalancer or NodePort) : Cluster or Local | +| gateway.mirror.gateway_config.client | object | `{}` | gRPC client (overrides defaults.grpc.client) | +| gateway.mirror.gateway_config.colocation | string | `"dc1"` | colocation name | +| gateway.mirror.gateway_config.discovery_duration | 
string | `"1s"` | duration to discovery | +| gateway.mirror.gateway_config.gateway_addr | string | `""` | address for lb-gateway | +| gateway.mirror.gateway_config.group | string | `""` | mirror group name | +| gateway.mirror.gateway_config.namespace | string | `"_MY_POD_NAMESPACE_"` | namespace to discovery | +| gateway.mirror.gateway_config.net.dialer.dual_stack_enabled | bool | `false` | TCP dialer dual stack enabled | +| gateway.mirror.gateway_config.net.dialer.keepalive | string | `"10m"` | TCP dialer keep alive | +| gateway.mirror.gateway_config.net.dialer.timeout | string | `"30s"` | TCP dialer timeout | +| gateway.mirror.gateway_config.net.dns.cache_enabled | bool | `true` | TCP DNS cache enabled | +| gateway.mirror.gateway_config.net.dns.cache_expiration | string | `"24h"` | TCP DNS cache expiration | +| gateway.mirror.gateway_config.net.dns.refresh_duration | string | `"5m"` | TCP DNS cache refresh duration | +| gateway.mirror.gateway_config.net.socket_option.ip_recover_destination_addr | bool | `false` | server listen socket option for ip_recover_destination_addr functionality | +| gateway.mirror.gateway_config.net.socket_option.ip_transparent | bool | `false` | server listen socket option for ip_transparent functionality | +| gateway.mirror.gateway_config.net.socket_option.reuse_addr | bool | `true` | server listen socket option for reuse_addr functionality | +| gateway.mirror.gateway_config.net.socket_option.reuse_port | bool | `true` | server listen socket option for reuse_port functionality | +| gateway.mirror.gateway_config.net.socket_option.tcp_cork | bool | `false` | server listen socket option for tcp_cork functionality | +| gateway.mirror.gateway_config.net.socket_option.tcp_defer_accept | bool | `true` | server listen socket option for tcp_defer_accept functionality | +| gateway.mirror.gateway_config.net.socket_option.tcp_fast_open | bool | `true` | server listen socket option for tcp_fast_open functionality | +| 
gateway.mirror.gateway_config.net.socket_option.tcp_no_delay | bool | `true` | server listen socket option for tcp_no_delay functionality | +| gateway.mirror.gateway_config.net.socket_option.tcp_quick_ack | bool | `true` | server listen socket option for tcp_quick_ack functionality | +| gateway.mirror.gateway_config.net.tls.ca | string | `"/path/to/ca"` | TLS ca path | +| gateway.mirror.gateway_config.net.tls.cert | string | `"/path/to/cert"` | TLS cert path | +| gateway.mirror.gateway_config.net.tls.enabled | bool | `false` | TLS enabled | +| gateway.mirror.gateway_config.net.tls.insecure_skip_verify | bool | `false` | enable/disable skip SSL certificate verification | +| gateway.mirror.gateway_config.net.tls.key | string | `"/path/to/key"` | TLS key path | +| gateway.mirror.gateway_config.pod_name | string | `"_MY_POD_NAME_"` | self mirror gateway pod name | +| gateway.mirror.gateway_config.register_duration | string | `"1s"` | duration to register mirror-gateway. | +| gateway.mirror.gateway_config.self_mirror_addr | string | `""` | address for self mirror-gateway | +| gateway.mirror.hpa.enabled | bool | `true` | HPA enabled | +| gateway.mirror.hpa.targetCPUUtilizationPercentage | int | `80` | HPA CPU utilization percentage | +| gateway.mirror.image.pullPolicy | string | `"Always"` | image pull policy | +| gateway.mirror.image.repository | string | `"vdaas/vald-mirror-gateway"` | image repository | +| gateway.mirror.image.tag | string | `""` | image tag (overrides defaults.image.tag) | +| gateway.mirror.ingress.annotations | object | `{"nginx.ingress.kubernetes.io/grpc-backend":"true"}` | annotations for ingress | +| gateway.mirror.ingress.defaultBackend | object | `{"enabled":true}` | defaultBackend config | +| gateway.mirror.ingress.defaultBackend.enabled | bool | `true` | gateway ingress defaultBackend enabled | +| gateway.mirror.ingress.enabled | bool | `false` | gateway ingress enabled | +| gateway.mirror.ingress.host | string | 
`"mirror.gateway.vald.vdaas.org"` | ingress hostname | +| gateway.mirror.ingress.pathType | string | `"ImplementationSpecific"` | gateway ingress pathType | +| gateway.mirror.ingress.servicePort | string | `"grpc"` | service port to be exposed by ingress | +| gateway.mirror.initContainers | list | `[{"image":"busybox:stable","name":"wait-for-gateway-lb","sleepDuration":2,"target":"gateway-lb","type":"wait-for"}]` | init containers | +| gateway.mirror.internalTrafficPolicy | string | `""` | internal traffic policy (can be specified when service type is LoadBalancer or NodePort) : Cluster or Local | +| gateway.mirror.kind | string | `"Deployment"` | deployment kind: Deployment or DaemonSet | +| gateway.mirror.logging | object | `{}` | logging config (overrides defaults.logging) | +| gateway.mirror.maxReplicas | int | `9` | maximum number of replicas. if HPA is disabled, this value will be ignored. | +| gateway.mirror.maxUnavailable | string | `"50%"` | maximum number of unavailable replicas | +| gateway.mirror.minReplicas | int | `3` | minimum number of replicas. 
if HPA is disabled, the replicas will be set to this value | +| gateway.mirror.name | string | `"vald-mirror-gateway"` | name of gateway deployment | +| gateway.mirror.nodeName | string | `""` | node name | +| gateway.mirror.nodeSelector | object | `{}` | node selector | +| gateway.mirror.observability | object | `{"otlp":{"attribute":{"service_name":"vald-mirror-gateway"}}}` | observability config (overrides defaults.observability) | +| gateway.mirror.podAnnotations | object | `{}` | pod annotations | +| gateway.mirror.podPriority.enabled | bool | `true` | gateway pod PriorityClass enabled | +| gateway.mirror.podPriority.value | int | `1000000` | gateway pod PriorityClass value | +| gateway.mirror.podSecurityContext | object | `{"fsGroup":65532,"fsGroupChangePolicy":"OnRootMismatch","runAsGroup":65532,"runAsNonRoot":true,"runAsUser":65532}` | security context for pod | +| gateway.mirror.progressDeadlineSeconds | int | `600` | progress deadline seconds | +| gateway.mirror.resources | object | `{"limits":{"cpu":"2000m","memory":"700Mi"},"requests":{"cpu":"200m","memory":"150Mi"}}` | compute resources | +| gateway.mirror.revisionHistoryLimit | int | `2` | number of old history to retain to allow rollback | +| gateway.mirror.rollingUpdate.maxSurge | string | `"25%"` | max surge of rolling update | +| gateway.mirror.rollingUpdate.maxUnavailable | string | `"25%"` | max unavailable of rolling update | +| gateway.mirror.securityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"privileged":false,"readOnlyRootFilesystem":true,"runAsGroup":65532,"runAsNonRoot":true,"runAsUser":65532}` | security context for container | +| gateway.mirror.server_config | object | `{"healths":{"liveness":{},"readiness":{},"startup":{}},"metrics":{"pprof":{}},"servers":{"grpc":{},"rest":{}}}` | server config (overrides defaults.server_config) | +| gateway.mirror.service.annotations | object | `{}` | service annotations | +| gateway.mirror.service.labels | 
object | `{}` | service labels | +| gateway.mirror.serviceAccount.enabled | bool | `true` | creates service account | +| gateway.mirror.serviceAccount.name | string | `"gateway-mirror"` | name of service account | +| gateway.mirror.serviceType | string | `"ClusterIP"` | service type: ClusterIP, LoadBalancer or NodePort | +| gateway.mirror.terminationGracePeriodSeconds | int | `30` | duration in seconds pod needs to terminate gracefully | +| gateway.mirror.time_zone | string | `""` | Time zone | +| gateway.mirror.tolerations | list | `[]` | tolerations | +| gateway.mirror.topologySpreadConstraints | list | `[]` | topology spread constraints of gateway pods | +| gateway.mirror.version | string | `"v0.0.0"` | version of gateway config | +| gateway.mirror.volumeMounts | list | `[]` | volume mounts | +| gateway.mirror.volumes | list | `[]` | volumes | | manager.index.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution | list | `[]` | node affinity preferred scheduling terms | | manager.index.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms | list | `[]` | node affinity required node selectors | | manager.index.affinity.podAffinity.preferredDuringSchedulingIgnoredDuringExecution | list | `[]` | pod affinity preferred scheduling terms | diff --git a/charts/vald/templates/gateway/lb/configmap.yaml b/charts/vald/templates/gateway/lb/configmap.yaml index 7c840cb0c2..f26cd53ed1 100644 --- a/charts/vald/templates/gateway/lb/configmap.yaml +++ b/charts/vald/templates/gateway/lb/configmap.yaml @@ -51,7 +51,7 @@ data: agent_namespace: {{ $gateway.gateway_config.agent_namespace | quote }} node_name: {{ $gateway.gateway_config.node_name | quote }} index_replica: {{ $gateway.gateway_config.index_replica }} - read_replica_replicas: {{ $readreplica.replica }} + read_replica_replicas: {{ $readreplica.minReplicas }} discoverer: duration: {{ $gateway.gateway_config.discoverer.duration }} client: @@ -70,7 +70,7 @@ data: {{- if 
$readreplica.enabled }} read_replica_client: client: - {{- $discovererClient := $gateway.gateway_config.discoverer.client }} + {{- $discovererClient := $gateway.gateway_config.discoverer.read_client }} {{- $readReplicaPort := $agent.server_config.servers.grpc.port }} {{- $defaultReadReplicaPort := default .Values.defaults.server_config.servers.grpc.port $readReplicaPort }} {{- $readReplicaAddrs := list }} diff --git a/charts/vald/values.schema.json b/charts/vald/values.schema.json index 41cd419a73..551f638a77 100644 --- a/charts/vald/values.schema.json +++ b/charts/vald/values.schema.json @@ -475,18 +475,34 @@ "type": "boolean", "description": "[This feature is WORK IN PROGRESS]enable agent readreplica" }, + "hpa": { + "type": "object", + "properties": { + "enabled": { "type": "boolean", "description": "HPA enabled" }, + "targetCPUUtilizationPercentage": { + "type": "integer", + "description": "HPA CPU utilization percentage" + } + } + }, "label_key": { "type": "string", "description": "label key to identify read replica resources" }, + "maxReplicas": { + "type": "integer", + "description": "maximum number of replicas. if HPA is disabled, this value will be ignored.", + "minimum": 1 + }, + "minReplicas": { + "type": "integer", + "description": "minimum number of replicas. 
if HPA is disabled, the replicas will be set to this value", + "minimum": 1 + }, "name": { "type": "string", "description": "name of agent readreplica" }, - "replica": { - "type": "integer", - "description": "replica number of read replica" - }, "service": { "type": "object", "description": "service settings for read replica service resources", @@ -8819,7 +8835,321 @@ "wait_for_ready": { "type": "boolean" } } }, - "duration": { "type": "string" } + "duration": { "type": "string" }, + "read_client": { + "type": "object", + "properties": { + "addrs": { + "type": "array", + "description": "gRPC client addresses", + "items": { "type": "string" } + }, + "backoff": { + "type": "object", + "properties": { + "backoff_factor": { + "type": "number", + "description": "gRPC client backoff factor" + }, + "backoff_time_limit": { + "type": "string", + "description": "gRPC client backoff time limit" + }, + "enable_error_log": { + "type": "boolean", + "description": "gRPC client backoff log enabled" + }, + "initial_duration": { + "type": "string", + "description": "gRPC client backoff initial duration" + }, + "jitter_limit": { + "type": "string", + "description": "gRPC client backoff jitter limit" + }, + "maximum_duration": { + "type": "string", + "description": "gRPC client backoff maximum duration" + }, + "retry_count": { + "type": "integer", + "description": "gRPC client backoff retry count" + } + } + }, + "call_option": { "type": "object" }, + "circuit_breaker": { + "type": "object", + "properties": { + "closed_error_rate": { + "type": "number", + "description": "gRPC client circuitbreaker closed error rate" + }, + "closed_refresh_timeout": { + "type": "string", + "description": "gRPC client circuitbreaker closed refresh timeout" + }, + "half_open_error_rate": { + "type": "number", + "description": "gRPC client circuitbreaker half-open error rate" + }, + "min_samples": { + "type": "integer", + "description": "gRPC client circuitbreaker minimum sampling count" + }, + 
"open_timeout": { + "type": "string", + "description": "gRPC client circuitbreaker open timeout" + } + } + }, + "connection_pool": { + "type": "object", + "properties": { + "enable_dns_resolver": { + "type": "boolean", + "description": "enables gRPC client connection pool dns resolver, when enabled vald uses ip handshake exclude dns discovery which improves network performance" + }, + "enable_rebalance": { + "type": "boolean", + "description": "enables gRPC client connection pool rebalance" + }, + "old_conn_close_duration": { + "type": "string", + "description": "makes delay before gRPC client connection closing during connection pool rebalance" + }, + "rebalance_duration": { + "type": "string", + "description": "gRPC client connection pool rebalance duration" + }, + "size": { + "type": "integer", + "description": "gRPC client connection pool size" + } + } + }, + "dial_option": { + "type": "object", + "properties": { + "backoff_base_delay": { + "type": "string", + "description": "gRPC client dial option base backoff delay" + }, + "backoff_jitter": { + "type": "number", + "description": "gRPC client dial option base backoff delay" + }, + "backoff_max_delay": { + "type": "string", + "description": "gRPC client dial option max backoff delay" + }, + "backoff_multiplier": { + "type": "number", + "description": "gRPC client dial option base backoff delay" + }, + "enable_backoff": { + "type": "boolean", + "description": "gRPC client dial option backoff enabled" + }, + "initial_connection_window_size": { + "type": "integer", + "description": "gRPC client dial option initial connection window size" + }, + "initial_window_size": { + "type": "integer", + "description": "gRPC client dial option initial window size" + }, + "insecure": { + "type": "boolean", + "description": "gRPC client dial option insecure enabled" + }, + "interceptors": { + "type": "array", + "description": "gRPC client interceptors", + "items": { + "type": "string", + "enum": ["TraceInterceptor"] + } + }, + 
"keepalive": { + "type": "object", + "properties": { + "permit_without_stream": { + "type": "boolean", + "description": "gRPC client keep alive permit without stream" + }, + "time": { + "type": "string", + "description": "gRPC client keep alive time" + }, + "timeout": { + "type": "string", + "description": "gRPC client keep alive timeout" + } + } + }, + "max_msg_size": { + "type": "integer", + "description": "gRPC client dial option max message size" + }, + "min_connection_timeout": { + "type": "string", + "description": "gRPC client dial option minimum connection timeout" + }, + "net": { + "type": "object", + "properties": { + "dialer": { + "type": "object", + "properties": { + "dual_stack_enabled": { + "type": "boolean", + "description": "gRPC client TCP dialer dual stack enabled" + }, + "keepalive": { + "type": "string", + "description": "gRPC client TCP dialer keep alive" + }, + "timeout": { + "type": "string", + "description": "gRPC client TCP dialer timeout" + } + } + }, + "dns": { + "type": "object", + "properties": { + "cache_enabled": { + "type": "boolean", + "description": "gRPC client TCP DNS cache enabled" + }, + "cache_expiration": { + "type": "string", + "description": "gRPC client TCP DNS cache expiration" + }, + "refresh_duration": { + "type": "string", + "description": "gRPC client TCP DNS cache refresh duration" + } + } + }, + "socket_option": { + "type": "object", + "properties": { + "ip_recover_destination_addr": { + "type": "boolean", + "description": "server listen socket option for ip_recover_destination_addr functionality" + }, + "ip_transparent": { + "type": "boolean", + "description": "server listen socket option for ip_transparent functionality" + }, + "reuse_addr": { + "type": "boolean", + "description": "server listen socket option for reuse_addr functionality" + }, + "reuse_port": { + "type": "boolean", + "description": "server listen socket option for reuse_port functionality" + }, + "tcp_cork": { + "type": "boolean", + "description": 
"server listen socket option for tcp_cork functionality" + }, + "tcp_defer_accept": { + "type": "boolean", + "description": "server listen socket option for tcp_defer_accept functionality" + }, + "tcp_fast_open": { + "type": "boolean", + "description": "server listen socket option for tcp_fast_open functionality" + }, + "tcp_no_delay": { + "type": "boolean", + "description": "server listen socket option for tcp_no_delay functionality" + }, + "tcp_quick_ack": { + "type": "boolean", + "description": "server listen socket option for tcp_quick_ack functionality" + } + } + }, + "tls": { + "type": "object", + "properties": { + "ca": { + "type": "string", + "description": "TLS ca path" + }, + "cert": { + "type": "string", + "description": "TLS cert path" + }, + "enabled": { + "type": "boolean", + "description": "TLS enabled" + }, + "insecure_skip_verify": { + "type": "boolean", + "description": "enable/disable skip SSL certificate verification" + }, + "key": { + "type": "string", + "description": "TLS key path" + } + } + } + } + }, + "read_buffer_size": { + "type": "integer", + "description": "gRPC client dial option read buffer size" + }, + "timeout": { + "type": "string", + "description": "gRPC client dial option timeout" + }, + "write_buffer_size": { + "type": "integer", + "description": "gRPC client dial option write buffer size" + } + } + }, + "health_check_duration": { + "type": "string", + "description": "gRPC client health check duration" + }, + "max_recv_msg_size": { "type": "integer" }, + "max_retry_rpc_buffer_size": { "type": "integer" }, + "max_send_msg_size": { "type": "integer" }, + "tls": { + "type": "object", + "properties": { + "ca": { + "type": "string", + "description": "TLS ca path" + }, + "cert": { + "type": "string", + "description": "TLS cert path" + }, + "enabled": { + "type": "boolean", + "description": "TLS enabled" + }, + "insecure_skip_verify": { + "type": "boolean", + "description": "enable/disable skip SSL certificate verification" + }, + 
"key": { + "type": "string", + "description": "TLS key path" + } + } + }, + "wait_for_ready": { "type": "boolean" } + } + } } }, "index_replica": { diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index 312fc09df7..0c3cda4520 100644 --- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -1125,6 +1125,9 @@ gateway: # @schema {"name": "gateway.lb.gateway_config.discoverer.agent_client_options", "alias": "grpc.client"} # gateway.lb.gateway_config.discoverer.agent_client_options -- gRPC client options for agents (overrides defaults.grpc.client) agent_client_options: {} + # @schema {"name": "gateway.lb.gateway_config.discoverer.read_client", "alias": "grpc.client"} + # gateway.lb.gateway_config.discoverer.read_client -- gRPC client for discoverer (overrides defaults.grpc.client) + read_client: {} # @schema {"name": "gateway.filter", "type": "object"} filter: # @schema {"name": "gateway.filter.enabled", "type": "boolean"} @@ -2003,9 +2006,20 @@ agent: # @schema {"name": "agent.readreplica.snapshot_classname", "type": "string"} # agent.readreplica.snapshot_classname -- snapshot class name for snapshotter used for read replica snapshot_classname: "" - # @schema {"name": "agent.readreplica.replica", "type": "integer"} - # agent.readreplica.replica -- replica number of read replica - replica: 1 + # @schema {"name": "agent.readreplica.minReplicas", "type": "integer", "minimum": 1} + # agent.readreplica.minReplicas -- minimum number of replicas. + # if HPA is disabled, the replicas will be set to this value + minReplicas: 1 + # @schema {"name": "agent.readreplica.maxReplicas", "type": "integer", "minimum": 1} + # agent.readreplica.maxReplicas -- maximum number of replicas. + # if HPA is disabled, this value will be ignored. 
+ maxReplicas: 3 + # @schema {"name": "agent.readreplica.hpa", "alias": "hpa"} + hpa: + # agent.readreplica.hpa.enabled -- HPA enabled + enabled: false + # agent.readreplica.hpa.targetCPUUtilizationPercentage -- HPA CPU utilization percentage + targetCPUUtilizationPercentage: 80 # @schema {"name": "agent.readreplica.service", "type": "object"} # agent.readreplica.service -- service settings for read replica service resources service: diff --git a/internal/config/ngt.go b/internal/config/ngt.go index d277b3b2a6..6a9d8add9d 100644 --- a/internal/config/ngt.go +++ b/internal/config/ngt.go @@ -94,6 +94,9 @@ type NGT struct { // ErrorBufferLimit represents the maximum number of core ngt error buffer pool size limit ErrorBufferLimit uint64 `yaml:"error_buffer_limit" json:"error_buffer_limit,omitempty"` + + // IsReadReplica represents whether the ngt is a read replica or not + IsReadReplica bool `yaml:"is_readreplica" json:"is_readreplica"` } // KVSDB represent the ngt vector bidirectional kv store configuration diff --git a/internal/errors/agent.go b/internal/errors/agent.go index 3286a086ef..01d89ce9be 100644 --- a/internal/errors/agent.go +++ b/internal/errors/agent.go @@ -112,4 +112,7 @@ var ( // ErrAgentIndexDirectoryRecreationFailed represents an error that the index directory recreation failed during the process of broken index backup. ErrIndexDirectoryRecreationFailed = New("failed to recreate the index directory") + + // ErrWriteOperationToReadReplica represents an error that occurs when a write operation is made to a read replica.
+ ErrWriteOperationToReadReplica = New("write operation to read replica is not possible") ) diff --git a/pkg/agent/core/ngt/service/ngt.go b/pkg/agent/core/ngt/service/ngt.go index 6cdae59b60..004b7db253 100644 --- a/pkg/agent/core/ngt/service/ngt.go +++ b/pkg/agent/core/ngt/service/ngt.go @@ -129,7 +129,6 @@ type ngt struct { basePath string // index base directory for CoW brokenPath string // backup broken index path cowmu sync.Mutex // copy on write move lock - backupGen uint64 // number of backup generation poolSize uint32 // default pool size radius float32 // default radius @@ -140,6 +139,8 @@ type ngt struct { kvsdbConcurrency int // kvsdb concurrency historyLimit int // the maximum generation number of broken index backup + + isReadReplica bool } const ( @@ -1101,6 +1102,11 @@ func (n *ngt) CreateIndex(ctx context.Context, poolSize uint32) (err error) { span.End() } }() + + if n.isReadReplica { + return errors.ErrWriteOperationToReadReplica + } + ic := n.vq.IVQLen() + n.vq.DVQLen() if ic == 0 { return errors.ErrUncommittedIndexNotFound @@ -1270,6 +1276,11 @@ func (n *ngt) SaveIndex(ctx context.Context) (err error) { } func (n *ngt) saveIndex(ctx context.Context) (err error) { + // Skip it here in case this private function is called directly from someone + if n.isReadReplica { + return errors.ErrWriteOperationToReadReplica + } + nocie := atomic.LoadUint64(&n.nocie) if atomic.LoadUint64(&n.lastNocie) == nocie { return @@ -1656,8 +1667,14 @@ func (n *ngt) GetDimensionSize() int { } func (n *ngt) Close(ctx context.Context) (err error) { + defer n.core.Close() + err = n.kvs.Close() if len(n.path) != 0 { + if n.isReadReplica { + log.Info("skip create and save index operation on close because this is read replica") + return err + } cerr := n.CreateIndex(ctx, n.poolSize) if cerr != nil && !errors.Is(err, errors.ErrUncommittedIndexNotFound) && @@ -1681,8 +1698,7 @@ func (n *ngt) Close(ctx context.Context) (err error) { } } } - n.core.Close() - return + return err } 
func (n *ngt) BrokenIndexCount() uint64 { diff --git a/pkg/agent/core/ngt/service/ngt_test.go b/pkg/agent/core/ngt/service/ngt_test.go index 2aa7b21367..1029cd67a4 100644 --- a/pkg/agent/core/ngt/service/ngt_test.go +++ b/pkg/agent/core/ngt/service/ngt_test.go @@ -63,6 +63,7 @@ var defaultConfig = config.NGT{ Concurrency: 10, }, BrokenIndexHistoryLimit: 1, + ErrorBufferLimit: 100, } type index struct { @@ -921,6 +922,210 @@ func testFoundInBothIvqAndDvq(t *testing.T) { require.Equal(t, err.Error(), want.Error()) } +func Test_ngt_CreateIndex(t *testing.T) { + t.Parallel() + + type args struct { + cfg *config.NGT + opts []Option + } + type test struct { + name string + args args + want error + } + + setup := func(t *testing.T) string { + tmpDir := t.TempDir() + testIndexDir := testdata.GetTestdataPath(testdata.ValidIndex) + err := file.CopyDir(context.Background(), testIndexDir, tmpDir) + require.NoError(t, err) + + return tmpDir + } + + tests := []test{ + func() test { + tmpDir := setup(t) + return test{ + name: "CreateIndex returns ErrUncommittedIndexNotFound when there is nothing uncommitted", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(false), + }, + }, + want: errors.ErrUncommittedIndexNotFound, + } + }(), + func() test { + tmpDir := setup(t) + return test{ + name: "CreateIndex returns ErrWriteOperationToReadReplica when try to create index to read replica", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(true), + }, + }, + want: errors.ErrWriteOperationToReadReplica, + } + }(), + } + + for _, tc := range tests { + test := tc + t.Run(test.name, func(tt *testing.T) { + tt.Parallel() + defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) + + ngt, err := New(test.args.cfg, test.args.opts...) 
+ require.NoError(tt, err) + + err = ngt.CreateIndex(context.Background(), test.args.cfg.DefaultPoolSize) + require.Equal(tt, test.want, err) + }) + } +} + +func Test_ngt_SaveIndex(t *testing.T) { + t.Parallel() + + type args struct { + cfg *config.NGT + opts []Option + } + type test struct { + name string + args args + want error + } + + setup := func(t *testing.T) string { + tmpDir := t.TempDir() + testIndexDir := testdata.GetTestdataPath(testdata.ValidIndex) + err := file.CopyDir(context.Background(), testIndexDir, tmpDir) + require.NoError(t, err) + + return tmpDir + } + + tests := []test{ + func() test { + tmpDir := setup(t) + return test{ + name: "SaveIndex succeeds when there is nothing to save", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(false), + }, + }, + want: nil, + } + }(), + func() test { + tmpDir := setup(t) + return test{ + name: "SaveIndex returns ErrWriteOperationToReadReplica when try to save index to read replica", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(true), + }, + }, + want: errors.ErrWriteOperationToReadReplica, + } + }(), + } + + for _, tc := range tests { + test := tc + t.Run(test.name, func(tt *testing.T) { + tt.Parallel() + defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) + + ngt, err := New(test.args.cfg, test.args.opts...)
+ require.NoError(tt, err) + + err = ngt.SaveIndex(context.Background()) + require.Equal(tt, test.want, err) + }) + } +} + +func Test_ngt_Close(t *testing.T) { + t.Parallel() + + type args struct { + cfg *config.NGT + opts []Option + } + type test struct { + name string + args args + want error + } + + setup := func(t *testing.T) string { + tmpDir := t.TempDir() + testIndexDir := testdata.GetTestdataPath(testdata.ValidIndex) + err := file.CopyDir(context.Background(), testIndexDir, tmpDir) + require.NoError(t, err) + + return tmpDir + } + + tests := []test{ + func() test { + tmpDir := setup(t) + return test{ + name: "Close returns ErrUncommittedIndexNotFound when it is not a read replica and tries to create index when nothing is uncommitted", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(false), + }, + }, + want: errors.ErrUncommittedIndexNotFound, + } + }(), + func() test { + tmpDir := setup(t) + return test{ + name: "Close succeeds when it is a read replica because it skips the create and save index operations", + args: args{ + cfg: &defaultConfig, + opts: []Option{ + WithIndexPath(tmpDir), + WithIsReadReplica(true), + }, + }, + want: nil, + } + }(), + } + + for _, tc := range tests { + test := tc + t.Run(test.name, func(tt *testing.T) { + tt.Parallel() + defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) + + ngt, err := New(test.args.cfg, test.args.opts...)
+ require.NoError(tt, err) + + err = ngt.Close(context.Background()) + require.Equal(tt, test.want, err) + }) + } +} + func Test_ngt_InsertUpsert(t *testing.T) { if testing.Short() { t.Skip("The execution of this test takes a lot of time, so it is not performed during the short test\ttest: Test_ngt_InsertUpsert") diff --git a/pkg/agent/core/ngt/service/option.go b/pkg/agent/core/ngt/service/option.go index 17bcdaa98a..301c66c621 100644 --- a/pkg/agent/core/ngt/service/option.go +++ b/pkg/agent/core/ngt/service/option.go @@ -299,3 +299,11 @@ func WithCopyOnWrite(enabled bool) Option { return nil } } + +// WithIsReadReplica returns the functional option to set the read replica flag. +func WithIsReadReplica(isReadReplica bool) Option { + return func(n *ngt) error { + n.isReadReplica = isReadReplica + return nil + } +} diff --git a/pkg/agent/core/ngt/usecase/agentd.go b/pkg/agent/core/ngt/usecase/agentd.go index db97ea7f90..ff5a6ee0ab 100644 --- a/pkg/agent/core/ngt/usecase/agentd.go +++ b/pkg/agent/core/ngt/usecase/agentd.go @@ -66,6 +66,7 @@ func New(cfg *config.Data) (r runner.Runner, err error) { service.WithDefaultEpsilon(cfg.NGT.DefaultEpsilon), service.WithProactiveGC(cfg.NGT.EnableProactiveGC), service.WithCopyOnWrite(cfg.NGT.EnableCopyOnWrite), + service.WithIsReadReplica(cfg.NGT.IsReadReplica), ) if err != nil { return nil, err