diff --git a/VERSION b/VERSION
index 00f7928c..a4ea962e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.0.34
+v1.0.35
diff --git a/charts/federated-learning/fedbed/.gitignore b/charts/databases/parallax/.gitignore
similarity index 100%
rename from charts/federated-learning/fedbed/.gitignore
rename to charts/databases/parallax/.gitignore
diff --git a/charts/federated-learning/fedbed/.helmignore b/charts/databases/parallax/.helmignore
similarity index 84%
rename from charts/federated-learning/fedbed/.helmignore
rename to charts/databases/parallax/.helmignore
index 6b0f281e..8a487fdd 100644
--- a/charts/federated-learning/fedbed/.helmignore
+++ b/charts/databases/parallax/.helmignore
@@ -25,5 +25,5 @@
 # Docs
 *.md
 
-# Ignore any figures used in the README.md
-examples/README.assets
\ No newline at end of file
+# Tools
+build
\ No newline at end of file
diff --git a/charts/databases/parallax/Chart.yaml b/charts/databases/parallax/Chart.yaml
new file mode 100644
index 00000000..2a5f9e9d
--- /dev/null
+++ b/charts/databases/parallax/Chart.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: v2
+appVersion: "1.16.0"
+description: "Tebis is a rack-scale key/value store optimized for replication over RDMA"
+
+name: tebis
+type: application
+
+maintainers:
+  - name: Fotis Nikolaidis
+    email: nikolaidis.fotis@gmail.com
+    url: https://www.linkedin.com/in/fotis-nikolaidis-444a6634/
+
+version: 0.0.0
\ No newline at end of file
diff --git a/charts/databases/parallax/README.md b/charts/databases/parallax/README.md
new file mode 100644
index 00000000..ba13fa1f
--- /dev/null
+++ b/charts/databases/parallax/README.md
@@ -0,0 +1,171 @@
+# Tebis
+
+This document focuses on setting up a development environment for the
+distributed version of Tebis on a local machine.
+
+
+## Build Tebis Containers
+
+
+Build the Tebis initialization manager:
+
+```shell
+docker build . -t tebis-init -f init.Dockerfile
+```
+
+Build the Tebis nodes:
+
+```shell
+docker build -t icsforth/tebis-node . -f tebis.Dockerfile
+```
+
+
+## Set up Execution Environment
+
+Tebis uses RDMA for all network communication, which requires hardware support
+from the network interface. A software implementation (soft-RoCE) exists and can
+run on top of any network interface.
+
+
+### Install dependencies
+
+The `ibverbs-utils`, `rdma-core`, and `perftest` packages are required to enable soft-RoCE.
+These packages should be in most distributions' repositories.
+
+```
+apt install ibverbs-utils rdma-core perftest
+```
+
+
+### Enabling soft-RoCE
+
+Soft-RoCE is a software implementation of RoCE that allows using InfiniBand over any Ethernet adapter.
+
+
+RoCE requires `ethtool` to be installed and the `rdma_rxe` and `rdma_ucm` kernel modules to be loaded on your system.
+
+```
+sudo yum install ethtool
+sudo modprobe rdma_rxe rdma_ucm
+```
+
+Then add an RXE link on top of the Ethernet interface (e.g., eth0, eno1, ...); here enp1s0:
+
+```
+rdma link add rxe0 type rxe netdev enp1s0
+```
+
+Validate it:
+
+```
+>> rdma link
+link rxe0/1 state ACTIVE physical_state LINK_UP netdev eno1
+```
+
+
+#### Verify soft-RoCE is working
+To verify that soft-RoCE is working, we can run a simple RDMA Write throughput
+benchmark.
+
+First, open two shells, one to act as the server and one to act as the client.
+Then run the following commands:
+* On the server: `ib_write_bw`
+* On the client: `ib_write_bw eth_interface_ip`, where `eth_interface_ip` is
+the IP address of a soft-RoCE enabled Ethernet interface.
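+
+For a quick sanity check from a single shell, a minimal sketch (assuming the
+soft-RoCE interface carries the address 192.168.122.205, as in the example
+output below) is:
+
+```shell
+# Start the server in the background, give it a moment, then run the client.
+# The server exits after serving one test run.
+ib_write_bw &
+sleep 2
+ib_write_bw 192.168.122.205
+```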
+
+Example output:
+* Server process:
+```
+************************************
+* Waiting for client to connect... *
+************************************
+---------------------------------------------------------------------------------------
+RDMA_Write BW Test
+Dual-port : OFF          Device : rxe0
+Number of qps : 1        Transport type : IB
+Connection type : RC     Using SRQ : OFF
+CQ Moderation : 100
+Mtu : 1024[B]
+Link type : Ethernet
+GID index : 1
+Max inline data : 0[B]
+rdma_cm QPs : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+local address: LID 0000 QPN 0x0011 PSN 0x3341fd RKey 0x000204 VAddr 0x007f7e1b8fa000
+GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:122:205
+remote address: LID 0000 QPN 0x0012 PSN 0xbfbac5 RKey 0x000308 VAddr 0x007f70f5843000
+GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:122:205
+---------------------------------------------------------------------------------------
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]    MsgRate[Mpps]
+65536      5000           847.44             827.84                0.013245
+---------------------------------------------------------------------------------------
+```
+
+* Client process:
+```
+---------------------------------------------------------------------------------------
+RDMA_Write BW Test
+Dual-port : OFF          Device : rxe0
+Number of qps : 1        Transport type : IB
+Connection type : RC     Using SRQ : OFF
+TX depth : 128
+CQ Moderation : 100
+Mtu : 1024[B]
+Link type : Ethernet
+GID index : 1
+Max inline data : 0[B]
+rdma_cm QPs : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+local address: LID 0000 QPN 0x0012 PSN 0xbfbac5 RKey 0x000308 VAddr 0x007f70f5843000
+GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:122:205
+remote address: LID 0000 QPN 0x0011 PSN 0x3341fd RKey 0x000204 VAddr 0x007f7e1b8fa000
+GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:122:205
+---------------------------------------------------------------------------------------
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]    MsgRate[Mpps]
+65536      5000           847.44             827.84                0.013245
+---------------------------------------------------------------------------------------
+```
+
+
+Alternatively, you can verify connectivity with a simple ping-pong test:
+
+```shell
+server: ibv_rc_pingpong -d rxe0 -g 1
+client: ibv_rc_pingpong -d rxe0 -g 1 10.1.128.51
+```
+
+
+### Additional notes
+
+
+Run `rxe_cfg add ethN` to configure an RXE instance on Ethernet device `ethN`.
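+
+For example, assuming the same enp1s0 interface as above (`rxe_cfg` comes from
+the older soft-RoCE user-space tooling; the `rdma link add` command shown
+earlier achieves the same thing):
+
+```shell
+sudo rxe_cfg add enp1s0
+```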
+
+You should now have an rxe0 device:
+
+``` shell
+# rxe_cfg status
+
+Name    Link  Driver      Speed  NMTU  IPv4_addr        RDEV  RMTU
+enp1s0  yes   virtio_net         1500  192.168.122.211  rxe0  1024  (3)
+```
+
+
+See also:
+https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html-single/configuring_infiniband_and_rdma_networks/index
+https://support.mellanox.com/s/article/howto-configure-soft-roce
+https://github.com/ememos/GiantVM/issues/24
+
+
+RDMA needs to pin (lock) memory, so in `/etc/security/limits.conf` you must raise the locked-memory limits:
+
+```
+* soft memlock unlimited
+* hard memlock unlimited
+```
+
+https://bbs.archlinux.org/viewtopic.php?id=273059
+
+https://ask.cyberinfrastructure.org/t/access-to-dev-infiniband-from-user-space/854/2
+
+
+## Parameters
diff --git a/charts/databases/parallax/example.yml b/charts/databases/parallax/example.yml
new file mode 100644
index 00000000..1647ae07
--- /dev/null
+++ b/charts/databases/parallax/example.yml
@@ -0,0 +1,60 @@
+---
+apiVersion: frisbee.dev/v1alpha1
+kind: Scenario
+metadata:
+  name: parallax
+spec:
+  actions:
+    - action: Cluster
+      name: sequential-benchmarks
+      cluster:
+        templateRef: parallax.ycsb
+        schedule:
+          event:
+            state: '{{.NumPendingJobs}} == 0 && {{.NumRunningJobs}} == 0'
+        tolerate:
+          failedJobs: 35 # number of tests
+        inputs:
+          - { test: test_small }
+          - { test: test_medium }
+          - { test: test_large }
+          - { test: test_smalld }
+          - { test: test_mediumd }
+          - { test: test_larged }
+          - { test: test_index_node }
+          - { test: test_dirty_scans_sd_greater }
+          - { test: test_dirty_scans_small }
+          - { test: test_dirty_scans_medium }
+          - { test: test_dirty_scans_large }
+          - { test: test_dirty_scans_smalld }
+          - { test: test_dirty_scans_mediumd }
+          - { test: test_dirty_scans_larged }
+          - { test: test_dirty_scans_smallp }
+          - { test: test_dirty_scans_mediump }
+          - { test: test_dirty_scans_largep }
+          - { test: test_dirty_scans_smalldp }
+          - { test: test_dirty_scans_mediumdp }
+          - { test: test_dirty_scans_largedp }
+          - { test: test_options }
+          - { test: test_categories }
+          - { test: test_sanitizers }
+          - { test: test_gc }
+          - { test: test_medium }
+          - { test: test_mixes_99_small_1_medium }
+          - { test: test_mixes_45_small_50_medium_5_big }
+          - { test: simple_test_delete }
+          - { test: test_leaf_root_delete_get_scan }
+          - { test: test_region_allocations }
+          - { test: test_redo_undo_log }
+          - { test: test_optional_logging }
+          - { test: test_par_format }
+          - { test: test_par_put_serialized }
+          - { test: test_par_put_metadata }
+          - { test: tracer }
+
+    # Teardown
+    - action: Delete
+      name: teardown
+      depends: { success: [ sequential-benchmarks ] }
+      delete:
+        jobs: []
diff --git a/charts/databases/parallax/templates/benchmark.yml b/charts/databases/parallax/templates/benchmark.yml
new file mode 100644
index 00000000..24f3c401
--- /dev/null
+++ b/charts/databases/parallax/templates/benchmark.yml
@@ -0,0 +1,169 @@
+---
+apiVersion: frisbee.dev/v1alpha1
+kind: Template
+metadata:
+  name: tebis.cluster.zookeeper
+spec:
+  service:
+    # decorators:
+    #   telemetry: [ system.telemetry.agent ]
+    containers:
+      - name: main
+        image: zookeeper:3.5.9
+        ports:
+          - name: peerport
+            containerPort: 2888
+          - name: leaderport
+            containerPort: 3888
+          - name: clientport
+            containerPort: 2181
+        command:
+          - /bin/sh   # Run shell
+          - -c        # Read from string
+          - |         # Multi-line str
+            set -eum
+
+            echo "Initiate Zookeeper at 0.0.0.0:2181"
+            /docker-entrypoint.sh zkServer.sh start-foreground
+
+---
+apiVersion: frisbee.dev/v1alpha1
+kind: Template
+metadata:
+  name: tebis.cluster.bootstrap
+spec:
+  inputs:
+    parameters:
+      zookeeper: localhost
+ serverHost: "localhost" + serverPort: "6060" + service: + # decorators: + # telemetry: [ system.telemetry.agent] + containers: + - name: main + image: icsforth/tebis-init:latest + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eum + + echo "Create hosts file" + cat > hosts < regions < /dev/shm/app # Sidecar: use it for entering the cgroup + + device={{"{{.inputs.parameters.device}}"}} + zookeeper={{"{{.inputs.parameters.zookeeper}}"}} + cores="0,1" + + rdma=$(hostname -I) + + echo "RDMA " ${rdma} + + echo "Pre-allocate space for Tebis server" + fallocate --length 16G ${device} + + cd /build/tebis/build/kreon_server + + echo "Starting Tebis server" + echo ./kreon_server ${device} ${zookeeper}:2181 ${rdma} 65536 8 send_index "6060,${cores}" + ./kreon_server ${device} ${zookeeper}:2181 ${rdma} 65536 8 send_index "6060,${cores}" + + +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Template +metadata: + name: tebis.cluster.client +spec: + inputs: + parameters: + zookeeper: localhost + scenario: "./execution_plan.txt" + threads: "4" + regions: "1" + service: + # decorators: + # telemetry: [ system.telemetry.agent] + # annotations: + # k8s.v1.cni.cncf.io/networks: macvlan-rdma + containers: + - name: main + image: icsforth/tebis-node:latest + securityContext: + privileged: true + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eum + cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup + + echo "Starting Tebis client" + + cd /build/tebis/build/YCSB-CXX + + sleep infinity + + ./ycsb-async-kreon \ + -threads {{"{{.inputs.parameters.threads}}"}} \ + -w sd \ + -zookeeper {{"{{.inputs.parameters.zookeeper}}"}}:2181 \ + -dbnum 1 \ + -e {{"{{.inputs.parameters.scenario}}"}} \ + insertStart 0 + diff --git a/charts/databases/parallax/values.yaml b/charts/databases/parallax/values.yaml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/charts/databases/parallax/values.yaml @@ -0,0 +1 @@ +--- diff --git a/charts/federated-learning/fedbed/example.yaml b/charts/federated-learning/fedbed/example.yaml new file mode 100644 index 00000000..e69de29b diff --git a/charts/federated-learning/fedbed/templates/framework.yaml b/charts/federated-learning/fedbed/templates/client-with-dataset.yaml similarity index 97% rename from charts/federated-learning/fedbed/templates/framework.yaml rename to charts/federated-learning/fedbed/templates/client-with-dataset.yaml index b5c3cfd7..062f4562 100644 --- a/charts/federated-learning/fedbed/templates/framework.yaml +++ b/charts/federated-learning/fedbed/templates/client-with-dataset.yaml @@ -36,9 +36,9 @@ spec: - name: FL_DATASET value: "CIFAR100" - name: FL_TRAINING_SET_SIZE - value: "50_000" - - name: FL_TEST_SET_SIZE - value: "50_000" + value: "-1" +# - name: FL_TEST_SET_SIZE +# value: "-1" ports: - name: http containerPort: 8080 @@ -50,7 +50,6 @@ spec: set -eux cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup - python server.py sleep infinity diff --git a/charts/federated-learning/fedbed/templates/client.yaml b/charts/federated-learning/fedbed/templates/client.yaml new file mode 100644 index 00000000..062f4562 --- /dev/null +++ b/charts/federated-learning/fedbed/templates/client.yaml @@ -0,0 +1,118 @@ +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Template +metadata: + name: fedbed.server +spec: + inputs: + parameters: + min_fit_clients: 2 + min_available_clients: 2 + service: + decorators: + telemetry: + - 
system.telemetry.agent + + containers: + - name: main + image: icsforth/fedbed:latest + env: + - name: FL_STRATEGY + value: FedAvg + - name: FL_NUM_OF_ROUNDS + value: "5" + - name: FL_FRACTION_FIT + value: "0.1" + - name: FL_FRACTION_EVAL + value: "0.1" + - name: FL_MIN_EVAL_CLIENTS + value: {{"{{.inputs.parameters.min_available_clients}}" | quote}} + - name: FL_MIN_FIT_CLIENTS + value: {{"{{.inputs.parameters.min_fit_clients}}" | quote }} + - name: FL_MIN_AVAILABLE_CLIENTS + value: {{"{{.inputs.parameters.min_available_clients}}" | quote}} + - name: FL_EVAL_DATASET + value: "false" + - name: FL_DATASET + value: "CIFAR100" + - name: FL_TRAINING_SET_SIZE + value: "-1" +# - name: FL_TEST_SET_SIZE +# value: "-1" + ports: + - name: http + containerPort: 8080 + + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eux + cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup + + python server.py + + sleep infinity + + +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Template +metadata: + name: fedbed.client +spec: + inputs: + parameters: + fl_server: "server" + dataset: "fl.dataset.cifar10" + backend: "pytorch" + + total_nodes: 10 + node_id: 0 + service: + decorators: + telemetry: + - system.telemetry.agent + + volumes: + - name: dataset + persistentVolumeClaim: + claimName: {{"{{.inputs.parameters.dataset}}" | quote}} + + containers: + - name: main + image: icsforth/fedbed:latest + volumeMounts: + - name: dataset + mountPath: /data/{{"{{.inputs.parameters.backend}}"}}/cifar10 + env: + - name: FL_SERVER + value: {{"{{.inputs.parameters.fl_server}}" | quote }} + - name: FL_BACKEND + value: {{"{{.inputs.parameters.backend}}" | quote }} + - name: FL_NUM_OF_THREADS + value: "1" + + # Are these values really needed here? + - name: FL_NODES + value: {{"{{.inputs.parameters.total_nodes}}" | quote}} + - name: FL_NODE_ID + value: {{"{{.inputs.parameters.node_id}}" | quote}} + - name: FL_DATASET_DISTRIBUTION + value: "flat" + - name: FL_DATASET_DISTRIBUTION_PARAMETERS + value: "{}" + - name: FL_DATASET_RANDOM + value: "false" + + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eux + cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup + + # do it like that until I find a better way to go from fl.dataset.cifar10 to cifar10. 
+ FL_DATASET=$(basename {{"{{.inputs.parameters.dataset}}"}}) + + python client.py \ No newline at end of file diff --git a/charts/federated-learning/fedbed/templates/server.yaml b/charts/federated-learning/fedbed/templates/server.yaml new file mode 100644 index 00000000..062f4562 --- /dev/null +++ b/charts/federated-learning/fedbed/templates/server.yaml @@ -0,0 +1,118 @@ +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Template +metadata: + name: fedbed.server +spec: + inputs: + parameters: + min_fit_clients: 2 + min_available_clients: 2 + service: + decorators: + telemetry: + - system.telemetry.agent + + containers: + - name: main + image: icsforth/fedbed:latest + env: + - name: FL_STRATEGY + value: FedAvg + - name: FL_NUM_OF_ROUNDS + value: "5" + - name: FL_FRACTION_FIT + value: "0.1" + - name: FL_FRACTION_EVAL + value: "0.1" + - name: FL_MIN_EVAL_CLIENTS + value: {{"{{.inputs.parameters.min_available_clients}}" | quote}} + - name: FL_MIN_FIT_CLIENTS + value: {{"{{.inputs.parameters.min_fit_clients}}" | quote }} + - name: FL_MIN_AVAILABLE_CLIENTS + value: {{"{{.inputs.parameters.min_available_clients}}" | quote}} + - name: FL_EVAL_DATASET + value: "false" + - name: FL_DATASET + value: "CIFAR100" + - name: FL_TRAINING_SET_SIZE + value: "-1" +# - name: FL_TEST_SET_SIZE +# value: "-1" + ports: + - name: http + containerPort: 8080 + + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eux + cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup + + python server.py + + sleep infinity + + +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Template +metadata: + name: fedbed.client +spec: + inputs: + parameters: + fl_server: "server" + dataset: "fl.dataset.cifar10" + backend: "pytorch" + + total_nodes: 10 + node_id: 0 + service: + decorators: + telemetry: + - system.telemetry.agent + + volumes: + - name: dataset + persistentVolumeClaim: + claimName: {{"{{.inputs.parameters.dataset}}" | quote}} + + containers: + - name: main + image: icsforth/fedbed:latest + volumeMounts: + - name: dataset + mountPath: /data/{{"{{.inputs.parameters.backend}}"}}/cifar10 + env: + - name: FL_SERVER + value: {{"{{.inputs.parameters.fl_server}}" | quote }} + - name: FL_BACKEND + value: {{"{{.inputs.parameters.backend}}" | quote }} + - name: FL_NUM_OF_THREADS + value: "1" + + # Are these values really needed here? + - name: FL_NODES + value: {{"{{.inputs.parameters.total_nodes}}" | quote}} + - name: FL_NODE_ID + value: {{"{{.inputs.parameters.node_id}}" | quote}} + - name: FL_DATASET_DISTRIBUTION + value: "flat" + - name: FL_DATASET_DISTRIBUTION_PARAMETERS + value: "{}" + - name: FL_DATASET_RANDOM + value: "false" + + command: + - /bin/sh # Run shell + - -c # Read from string + - | # Multi-line str + set -eux + cut -d ' ' -f 4 /proc/self/stat > /dev/shm/app # Sidecar: use it for entering the cgroup + + # do it like that until I find a better way to go from fl.dataset.cifar10 to cifar10. 
+ FL_DATASET=$(basename {{"{{.inputs.parameters.dataset}}"}}) + + python client.py \ No newline at end of file diff --git a/charts/federated-learning/flower/.gitignore b/charts/federated-learning/flower/.gitignore deleted file mode 100644 index 127925ba..00000000 --- a/charts/federated-learning/flower/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# ignore local charts (used for testing) -charts -Chart.lock diff --git a/charts/federated-learning/flower/examples/advanced_pytorch.yaml b/charts/federated-learning/flower/example.yaml similarity index 93% rename from charts/federated-learning/flower/examples/advanced_pytorch.yaml rename to charts/federated-learning/flower/example.yaml index cb0293d3..b4b29124 100644 --- a/charts/federated-learning/flower/examples/advanced_pytorch.yaml +++ b/charts/federated-learning/flower/example.yaml @@ -2,7 +2,7 @@ apiVersion: frisbee.dev/v1alpha1 kind: Scenario metadata: - name: advanced-pytoch + name: advanced-pytorch spec: actions: # Run the default script diff --git a/charts/federated-learning/flower/examples/advanced_tensorflow.yaml b/charts/federated-learning/flower/examples/advanced_tensorflow.yaml deleted file mode 100644 index 8d871760..00000000 --- a/charts/federated-learning/flower/examples/advanced_tensorflow.yaml +++ /dev/null @@ -1,20 +0,0 @@ ---- -apiVersion: frisbee.dev/v1alpha1 -kind: Scenario -metadata: - name: advanced-tensorflow -spec: - actions: - # Run the default script - - action: Service - name: script - service: - templateRef: flower.advanced-tensorflow.standalone - - # Teardown - - action: Delete - name: teardown - depends: { success: [ script ] } - delete: - jobs: [ ] - diff --git a/deploy/cadvisor/Dockerfile b/deploy/cadvisor/Dockerfile new file mode 100644 index 00000000..e69de29b diff --git a/deploy/cadvisor/builder.sh b/deploy/cadvisor/builder.sh new file mode 100644 index 00000000..a62c8a16 --- /dev/null +++ b/deploy/cadvisor/builder.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +docker build . -t icsforth/prometheus -f prometheus.Dockerfile + +docker push icsforth/prometheus:latest \ No newline at end of file diff --git a/hack/builder/Dockerfile b/deploy/ci-environment/Dockerfile similarity index 100% rename from hack/builder/Dockerfile rename to deploy/ci-environment/Dockerfile diff --git a/hack/builder/builder.sh b/deploy/ci-environment/builder.sh similarity index 100% rename from hack/builder/builder.sh rename to deploy/ci-environment/builder.sh diff --git a/deploy/prometheus/Dockerfile b/deploy/prometheus/Dockerfile new file mode 100644 index 00000000..63fbf875 --- /dev/null +++ b/deploy/prometheus/Dockerfile @@ -0,0 +1,5 @@ +FROM prom/prometheus + +# Use envsubst to replace ${var} or $var according to the values of the current environment variables. 
+RUN wget https://github.com/a8m/envsubst/releases/download/v1.2.0/envsubst-Linux-x86_64 && \ + chmod +x envsubst-Linux-x86_64 diff --git a/deploy/prometheus/builder.sh b/deploy/prometheus/builder.sh new file mode 100644 index 00000000..e69de29b diff --git a/examples/federated_learning/fedbed/all_combined.yml b/examples/federated_learning/fedbed/all_combined.yml new file mode 100644 index 00000000..a1f72320 --- /dev/null +++ b/examples/federated_learning/fedbed/all_combined.yml @@ -0,0 +1,35 @@ +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Scenario +metadata: + name: resource-distribution +spec: + actions: + # Step 1: Create FedBed server + - action: Service + name: server + service: + templateRef: fedbed.server + inputs: + - { min_fit_clients: 3 } + + # Step 2: Create FedBed clients + - action: Cluster + name: clients + depends: { running: [ server ]} + cluster: + templateRef: fedbed.client + inputs: + - { fl_server: server, node_id: 0, total_nodes: 3, backend: "pytorch" } + - { fl_server: server, node_id: 1, total_nodes: 3, backend: "pytorch" } + - { fl_server: server, node_id: 2, total_nodes: 3, backend: "pytorch" } + resources: + total: { mem: 500Mi } + distribution: { name: uniform } + + # Teardown + - action: Delete + name: teardown + depends: { success: [ clients ] } + delete: + jobs: [ server ] diff --git a/charts/federated-learning/fedbed/examples/basic-setup.yml b/examples/federated_learning/fedbed/cached-dataset.yml similarity index 72% rename from charts/federated-learning/fedbed/examples/basic-setup.yml rename to examples/federated_learning/fedbed/cached-dataset.yml index 896a255c..997d5034 100644 --- a/charts/federated-learning/fedbed/examples/basic-setup.yml +++ b/examples/federated_learning/fedbed/cached-dataset.yml @@ -2,10 +2,10 @@ apiVersion: frisbee.dev/v1alpha1 kind: Scenario metadata: - name: baseline + name: cached-datasets spec: actions: - # Step 0: Prepare the dataset + # Step 0: Cache the downloaded dataset to the host (for multi-host setups, the host may change across executions) - action: Service name: cifar10-download service: @@ -19,7 +19,7 @@ spec: service: templateRef: fedbed.server inputs: - - { min_fit_clients: 2, min_available_clients: 3 } + - { min_fit_clients: 3 } # Step 2: Create FedBed clients - action: Cluster @@ -27,18 +27,14 @@ spec: depends: { running: [ server ], success: [ cifar10-download ] } cluster: templateRef: fedbed.client - defaultDistribution: { name: uniform } inputs: - - { fl_server: server, dataset: fl.datasets.cifar10, node_id: 0, total_nodes: 3 } + - { fl_server: server, dataset: fl.datasets.cifar10, node_id: 0, total_nodes: 3, } - { fl_server: server, dataset: fl.datasets.cifar10, node_id: 1, total_nodes: 3 } - { fl_server: server, dataset: fl.datasets.cifar10, node_id: 2, total_nodes: 3 } - resources: - total: { mem: 500Mi } - distribution: { name: default } # Teardown - action: Delete name: teardown - depends: { success: [ clients, server ] } + depends: { success: [ clients ] } delete: - jobs: [ ] + jobs: [server ] diff --git a/examples/federated_learning/fedbed/custom-backend.yml b/examples/federated_learning/fedbed/custom-backend.yml new file mode 100644 index 00000000..092867e0 --- /dev/null +++ b/examples/federated_learning/fedbed/custom-backend.yml @@ -0,0 +1,32 @@ +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Scenario +metadata: + name: custom-backend +spec: + actions: + # Step 1: Create FedBed server + - action: Service + name: server + service: + templateRef: fedbed.server + inputs: + - { min_fit_clients: 3 } + + # Step 
2: Create FedBed clients + - action: Cluster + name: clients + depends: { running: [ server ], success: [ cifar10-download ] } + cluster: + templateRef: fedbed.client + inputs: + - { fl_server: server, node_id: 0, total_nodes: 3, backend: "pytorch" } + - { fl_server: server, node_id: 1, total_nodes: 3, backend: "pytorch" } + - { fl_server: server, node_id: 2, total_nodes: 3, backend: "pytorch" } + + # Teardown + - action: Delete + name: teardown + depends: { success: [ clients ] } + delete: + jobs: [ server ] diff --git a/examples/federated_learning/fedbed/resource-distribution.yml b/examples/federated_learning/fedbed/resource-distribution.yml new file mode 100644 index 00000000..05f05854 --- /dev/null +++ b/examples/federated_learning/fedbed/resource-distribution.yml @@ -0,0 +1,43 @@ +--- +apiVersion: frisbee.dev/v1alpha1 +kind: Scenario +metadata: + name: custom-backend +spec: + actions: + # Step 0: Cache the downloaded dataset to the host (for multi-host setups, the host may change across executions) + - action: Service + name: cifar10-download + service: + templateRef: fl.datasets.cifar10.download + inputs: + - { cache: "/tmp/dataset" } + + # Step 1: Create FedBed server + - action: Service + name: server + service: + templateRef: fedbed.server + inputs: + - { min_fit_clients: 3 } + + # Step 2: Create FedBed clients + - action: Cluster + name: clients + depends: { running: [ server ], success: [ cifar10-download ] } + cluster: + templateRef: fedbed.client + inputs: + - { fl_server: server, node_id: 0, total_nodes: 3, backend: "tensorflow" } + - { fl_server: server, node_id: 1, total_nodes: 3, backend: "tensorflow" } + - { fl_server: server, node_id: 2, total_nodes: 3, backend: "tensorflow" } + resources: + total: { mem: 500Mi } + distribution: { name: uniform } + + # Teardown + - action: Delete + name: teardown + depends: { success: [ clients ] } + delete: + jobs: [ server ] diff --git a/examples/1.hello-world.yml b/examples/tutorial/1.hello-world.yml similarity index 100% rename from examples/1.hello-world.yml rename to examples/tutorial/1.hello-world.yml diff --git a/examples/10.resource-throttling.yml b/examples/tutorial/10.resource-throttling.yml similarity index 100% rename from examples/10.resource-throttling.yml rename to examples/tutorial/10.resource-throttling.yml diff --git a/examples/10b.resource-distribution.yml b/examples/tutorial/10b.resource-distribution.yml similarity index 100% rename from examples/10b.resource-distribution.yml rename to examples/tutorial/10b.resource-distribution.yml diff --git a/examples/11.advanced-placement.yml b/examples/tutorial/11.advanced-placement.yml similarity index 100% rename from examples/11.advanced-placement.yml rename to examples/tutorial/11.advanced-placement.yml diff --git a/examples/12.callables.yml b/examples/tutorial/12.callables.yml similarity index 100% rename from examples/12.callables.yml rename to examples/tutorial/12.callables.yml diff --git a/examples/13.assertions.yml b/examples/tutorial/13.assertions.yml similarity index 100% rename from examples/13.assertions.yml rename to examples/tutorial/13.assertions.yml diff --git a/examples/14.delete-job.yml b/examples/tutorial/14.delete-job.yml similarity index 100% rename from examples/14.delete-job.yml rename to examples/tutorial/14.delete-job.yml diff --git a/examples/15.performance-monitoring.yml b/examples/tutorial/15.performance-monitoring.yml similarity index 100% rename from examples/15.performance-monitoring.yml rename to examples/tutorial/15.performance-monitoring.yml 
diff --git a/examples/16.single-chaos.yml b/examples/tutorial/16.single-chaos.yml similarity index 100% rename from examples/16.single-chaos.yml rename to examples/tutorial/16.single-chaos.yml diff --git a/examples/17.multi-chaos.yml b/examples/tutorial/17.multi-chaos.yml similarity index 100% rename from examples/17.multi-chaos.yml rename to examples/tutorial/17.multi-chaos.yml diff --git a/examples/18.revoke-failures.yml b/examples/tutorial/18.revoke-failures.yml similarity index 100% rename from examples/18.revoke-failures.yml rename to examples/tutorial/18.revoke-failures.yml diff --git a/examples/19.sla-assertions.yml b/examples/tutorial/19.sla-assertions.yml similarity index 100% rename from examples/19.sla-assertions.yml rename to examples/tutorial/19.sla-assertions.yml diff --git a/examples/2.parameters.yml b/examples/tutorial/2.parameters.yml similarity index 100% rename from examples/2.parameters.yml rename to examples/tutorial/2.parameters.yml diff --git a/examples/20.testdata.yml b/examples/tutorial/20.testdata.yml similarity index 100% rename from examples/20.testdata.yml rename to examples/tutorial/20.testdata.yml diff --git a/examples/21.shared-storage.yml b/examples/tutorial/21.shared-storage.yml similarity index 100% rename from examples/21.shared-storage.yml rename to examples/tutorial/21.shared-storage.yml diff --git a/examples/22.diagnostics.yml b/examples/tutorial/22.diagnostics.yml similarity index 100% rename from examples/22.diagnostics.yml rename to examples/tutorial/22.diagnostics.yml diff --git a/examples/23.debugging.yml b/examples/tutorial/23.debugging.yml similarity index 100% rename from examples/23.debugging.yml rename to examples/tutorial/23.debugging.yml diff --git a/examples/3.execution-order.yml b/examples/tutorial/3.execution-order.yml similarity index 100% rename from examples/3.execution-order.yml rename to examples/tutorial/3.execution-order.yml diff --git a/examples/4.dependency-order.yml b/examples/tutorial/4.dependency-order.yml similarity index 100% rename from examples/4.dependency-order.yml rename to examples/tutorial/4.dependency-order.yml diff --git a/examples/5.clustering.yml b/examples/tutorial/5.clustering.yml similarity index 100% rename from examples/5.clustering.yml rename to examples/tutorial/5.clustering.yml diff --git a/examples/6.time-driven-scheduling.yml b/examples/tutorial/6.time-driven-scheduling.yml similarity index 100% rename from examples/6.time-driven-scheduling.yml rename to examples/tutorial/6.time-driven-scheduling.yml diff --git a/examples/6b.timeline-distribution.yml b/examples/tutorial/6b.timeline-distribution.yml similarity index 100% rename from examples/6b.timeline-distribution.yml rename to examples/tutorial/6b.timeline-distribution.yml diff --git a/examples/7.event-driven-scheduling.yml b/examples/tutorial/7.event-driven-scheduling.yml similarity index 100% rename from examples/7.event-driven-scheduling.yml rename to examples/tutorial/7.event-driven-scheduling.yml diff --git a/examples/8.conditional-scheduling.yml b/examples/tutorial/8.conditional-scheduling.yml similarity index 100% rename from examples/8.conditional-scheduling.yml rename to examples/tutorial/8.conditional-scheduling.yml diff --git a/examples/9.tolerate-failures.yml b/examples/tutorial/9.tolerate-failures.yml similarity index 100% rename from examples/9.tolerate-failures.yml rename to examples/tutorial/9.tolerate-failures.yml diff --git a/pkg/structure/structs.go b/pkg/structure/structs.go new file mode 100644 index 00000000..430d6a4f --- 
/dev/null
+++ b/pkg/structure/structs.go
@@ -0,0 +1,42 @@
+// Package structure provides helpers for normalizing the keys of decoded
+// structures to lowercase.
+package structure
+
+import (
+	"reflect"
+	"strings"
+)
+
+// structToLowercase flattens a struct into a map whose keys are the
+// lowercased field names. It returns nil if the input is not a struct.
+func structToLowercase(in interface{}) map[string]interface{} {
+	v := reflect.ValueOf(in)
+	if v.Kind() != reflect.Struct {
+		return nil
+	}
+
+	vType := v.Type()
+
+	result := make(map[string]interface{}, v.NumField())
+
+	for i := 0; i < v.NumField(); i++ {
+		name := vType.Field(i).Name
+		result[strings.ToLower(name)] = v.Field(i).Interface()
+	}
+
+	return result
+}
+
+// lower recursively lowercases the keys of maps, descending into nested maps
+// and slices. Scalar values are returned unchanged.
+func lower(f interface{}) interface{} {
+	switch f := f.(type) {
+	case []interface{}:
+		for i := range f {
+			f[i] = lower(f[i])
+		}
+		return f
+	case map[string]interface{}:
+		lf := make(map[string]interface{}, len(f))
+		for k, v := range f {
+			lf[strings.ToLower(k)] = lower(v)
+		}
+		return lf
+	default:
+		return f
+	}
+}
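
A minimal usage sketch for the helpers above (hypothetical, not part of the diff; it assumes it sits in the same `structure` package):

```go
package structure

// chartMeta is a hypothetical struct used only to illustrate the helpers.
type chartMeta struct {
	Name    string
	Version string
	Extra   map[string]interface{}
}

// exampleLowercase flattens a struct into a map with lowercase keys and then
// recursively lowercases the keys of any nested maps.
func exampleLowercase() map[string]interface{} {
	meta := chartMeta{
		Name:    "tebis",
		Version: "0.0.0",
		Extra:   map[string]interface{}{"AppVersion": "1.16.0"},
	}

	// => map[extra:map[appversion:1.16.0] name:tebis version:0.0.0]
	return lower(structToLowercase(meta)).(map[string]interface{})
}
```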