Skip to content

Commit

Permalink
KubeFATE support FATE v2.0.0 (#927)
Browse files Browse the repository at this point in the history
* docker compose eggroll support

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>

* support spark

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>

* Helm chart support FATE v2.0.0

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>

* helm-chart fix

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>

* update version tag

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>

---------

Signed-off-by: Chenlong Ma <chenlongm@vmware.com>
  • Loading branch information
owlet42 authored Jan 25, 2024
1 parent 2a8ce82 commit 61190b0
Show file tree
Hide file tree
Showing 48 changed files with 620 additions and 610 deletions.
18 changes: 10 additions & 8 deletions docker-deploy/.env
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,26 @@ SSH_PORT=22
# SSH_PORT: port of SSH, default 22


KubeFATE_Version=v2.0.0-beta
KubeFATE_Version=v2.0.0-release

# components version

FATEFlow_IMAGE="federatedai/fateflow"
FATEFlow_IMAGE_TAG="v2.0.0-beta"
FATEFlow_IMAGE_TAG="2.0.0-release"
FATEBoard_IMAGE="federatedai/fateboard"
FATEBoard_IMAGE_TAG="v2.0.0-beta"
FATEBoard_IMAGE_TAG="2.0.0-release"
MySQL_IMAGE="mysql"
MySQL_IMAGE_TAG="8.0.28"
Client_IMAGE="federatedai/client"
Client_IMAGE_TAG="v2.0.0-beta"
Client_IMAGE_TAG="2.0.0-release"

EGGRoll_IMAGE="federatedai/eggroll"
EGGRoll_IMAGE_TAG="v2.0.0-beta"
EGGRoll_IMAGE_TAG="2.0.0-release"
OSX_IMAGE="federatedai/osx"
OSX_IMAGE_TAG="2.0.0-release"

Nginx_IMAGE="federatedai/nginx"
Nginx_IMAGE_TAG="v2.0.0-beta"
Nginx_IMAGE_TAG="2.0.0-release"
RabbitMQ_IMAGE="federatedai/rabbitmq"
RabbitMQ_IMAGE_TAG="3.8.3-management"
Pulsar_IMAGE="federatedai/pulsar"
Expand All @@ -35,6 +37,6 @@ Hadoop_NameNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8"
Hadoop_DataNode_IMAGE="federatedai/hadoop-datanode"
Hadoop_DataNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8"
Spark_Master_IMAGE="federatedai/spark-master"
Spark_Master_IMAGE_TAG="v2.0.0-beta"
Spark_Master_IMAGE_TAG="2.0.0-release"
Spark_Worker_IMAGE="federatedai/spark-worker"
Spark_Worker_IMAGE_TAG="v2.0.0-beta"
Spark_Worker_IMAGE_TAG="2.0.0-release"
12 changes: 6 additions & 6 deletions docker-deploy/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,13 @@ The output is shown as follows. If the status of each component is `Up`, and the

```bash
NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS
confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp
confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp
confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp
confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp
confs-10000-client-1 federatedai/client:2.0.0-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp
confs-10000-clustermanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp
confs-10000-fateboard-1 federatedai/fateboard:2.0.0-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp
confs-10000-fateflow-1 federatedai/fateflow:2.0.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp
confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp
confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp
confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp
confs-10000-nodemanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp
confs-10000-osx-1 federatedai/osx:2.0.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp
```

### Verifying the deployment
Expand Down
12 changes: 6 additions & 6 deletions docker-deploy/README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,13 @@ docker compose ps

```bash
NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS
confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp
confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp
confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp
confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp
confs-10000-client-1 federatedai/client:2.0.0-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp
confs-10000-clustermanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp
confs-10000-fateboard-1 federatedai/fateboard:2.0.0-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp
confs-10000-fateflow-1 federatedai/fateflow:2.0.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp
confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp
confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp
confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp
confs-10000-nodemanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp
confs-10000-osx-1 federatedai/osx:2.0.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp
```

### 验证部署
Expand Down
2 changes: 1 addition & 1 deletion docker-deploy/docker_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ handleLocally() {

main() {

if [ "$1" = "" ] || [ "$" = "--help" ]; then
if [ "$1" = "" ] || [ "$1" = "--help" ]; then
ShowUsage
exit 1
elif [ "$1" = "--delete" ] || [ "$1" = "--del" ]; then
Expand Down
126 changes: 58 additions & 68 deletions docker-deploy/generate_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ function list_include_item {

function CheckConfig(){
# Check config start
computing_list="Eggroll Spark Spark_local"
computing_list="Eggroll Spark STANDALONE"
spark_federation_list="RabbitMQ Pulsar"
algorithm_list="Basic NN ALL"
device_list="CPU IPCL GPU"
Expand All @@ -67,8 +67,8 @@ function CheckConfig(){
fi

if [ $computing == "Eggroll" ]; then
if [ $federation != "Eggroll" ] || [ $storage != "Eggroll" ]; then
echo "[ERROR]: Please select the correct engine. When eggroll is selected as the computing engine, both Federation and storage must be eggroll engines!"
if [ $federation != "OSX" ] || [ $storage != "Eggroll" ]; then
echo "[ERROR]: Please select the correct engine. When eggroll is selected as the computing engine, both Federation and Storage must be osx/eggroll engines!"
exit 1
fi
fi
Expand All @@ -84,13 +84,13 @@ function CheckConfig(){
fi
fi

if [ "$computing" == "Spark_local" ]; then
if ! $(list_include_item "$spark_federation_list" "$federation"); then
echo "[ERROR]: If you choose the Spark_local computing engine, the federation component must be Pulsar or RabbitMQ!"
exit 1
fi
if [ "$storage" != "LocalFS" ]; then
echo "[ERROR]: If you choose the Spark computing engine, the storage component must be LocalFS!"
if [ "$computing" == "STANDALONE" ]; then
# if ! $(list_include_item "$spark_federation_list" "$federation"); then
# echo "[ERROR]: If you choose the STANDALONE computing engine, the federation component must be Pulsar or RabbitMQ!"
# exit 1
# fi
if [ "$storage" != "STANDALONE" ]; then
echo "[ERROR]: If you choose the Spark computing engine, the storage component must be STANDALONE!"
exit 1
fi
fi
Expand Down Expand Up @@ -180,6 +180,7 @@ GenerateConfig() {
#clustermanager & nodemanager
sed -i "s#<clustermanager.host>#${clustermanager_ip}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties
sed -i "s#<clustermanager.port>#${clustermanager_port}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties
sed -i "s#<nodemanager.host>#${nodemanager_ip}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties
sed -i "s#<nodemanager.port>#${nodemanager_port}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties
sed -i "s#<party.id>#${party_id}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties

Expand All @@ -206,7 +207,7 @@ GenerateConfig() {
if [ "$federation" == "RabbitMQ" ]; then
cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/
# delete Pulsar spec
sed -i '203,217d' confs-"$party_id"/docker-compose.yml
sed -i '203,218d' confs-"$party_id"/docker-compose.yml
elif [ "$federation" == "Pulsar" ]; then
cp -r training_template/backends/spark/pulsar confs-$party_id/confs/
# delete RabbitMQ spec
Expand All @@ -215,21 +216,21 @@ GenerateConfig() {
fi
fi

if [ "$computing" == "Spark_local" ]; then
if [ "$computing" == "STANDALONE" ]; then
# computing
cp -r training_template/backends/spark/nginx confs-$party_id/confs/
cp -r training_template/backends/spark/spark confs-$party_id/confs/
# cp -r training_template/backends/spark/nginx confs-$party_id/confs/
# cp -r training_template/backends/spark/spark confs-$party_id/confs/
# storage
if [ "$storage" == "LocalFS" ]; then
if [ "$storage" == "STANDALONE" ]; then
cp training_template/docker-compose-spark-slim.yml confs-$party_id/docker-compose.yml
# federation
if [ "$federation" == "RabbitMQ" ]; then
cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/
sed -i '149,163d' confs-$party_id/docker-compose.yml
elif [ "$federation" == "Pulsar" ]; then
cp -r training_template/backends/spark/pulsar confs-$party_id/confs/
sed -i '131,147d' confs-$party_id/docker-compose.yml
fi
# if [ "$federation" == "RabbitMQ" ]; then
# cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/
# sed -i '149,163d' confs-$party_id/docker-compose.yml
# elif [ "$federation" == "Pulsar" ]; then
# cp -r training_template/backends/spark/pulsar confs-$party_id/confs/
# sed -i '131,147d' confs-$party_id/docker-compose.yml
# fi
fi
fi

Expand All @@ -241,7 +242,7 @@ GenerateConfig() {
# Images choose
Suffix=""
# computing
if [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then
if [ "$computing" == "Spark" ]; then
Suffix=$Suffix""
fi
# algorithm
Expand All @@ -264,7 +265,7 @@ GenerateConfig() {
if [ "$computing" == "Eggroll" ]; then
sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml
sed -i "s#image: \"\${EGGRoll_IMAGE}:\${EGGRoll_IMAGE_TAG}\"#image: \"\${EGGRoll_IMAGE}${Suffix}:\${EGGRoll_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml
elif [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then
elif [ "$computing" == "Spark" ] ; then
sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}-spark${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml
sed -i "s#image: \"\${Spark_Worker_IMAGE}:\${Spark_Worker_IMAGE_TAG}\"#image: \"\${Spark_Worker_IMAGE}${Suffix}:\${Spark_Worker_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml
fi
Expand All @@ -273,12 +274,12 @@ GenerateConfig() {
if [ "$device" == "GPU" ]; then
line=0 # line refers to the line number of the fateflow `command` line in docker-compose.yml
if [ "$computing" == "Eggroll" ]; then
line=140
line=141
fi
if [ "$computing" == "Spark" ]; then
line=84
line=85
fi
if [ "$computing" == "Spark_local" ]; then
if [ "$computing" == "STANDALONE" ]; then
line=85
fi
sed -i "${line}i\\
Expand Down Expand Up @@ -345,39 +346,43 @@ GenerateConfig() {
echo mysql module of $party_id done!

# fate_flow
sed -i "s/party_id:/party_id: \"${party_id}\"/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/party_id: .*/party_id: \"${party_id}\"/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/name: <db_name>/name: '${db_name}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/user: <db_user>/user: '${db_user}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/passwd: <db_passwd>/passwd: '${db_password}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/host: <db_host>/host: '${db_ip}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/127.0.0.1:8000/${serving_ip}:8000/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml


if [[ "$computing" == "Spark" ]] || [[ "$computing" == "Spark_local" ]] ; then
sed -i "s/proxy_name: rollsite/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
if [[ "$computing" == "Spark" ]] ; then
sed -i "s/proxy_name: osx/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/computing: eggroll/computing: spark/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
fi
if [[ "$computing" == "STANDALONE" ]] ; then
# sed -i "s/proxy_name: osx/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/computing: eggroll/computing: standalone/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
fi
if [[ "$federation" == "Pulsar" ]]; then
sed -i "s/ federation: rollsite/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/ federation: osx/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
elif [[ "$federation" == "RabbitMQ" ]]; then
sed -i "s/ federation: rollsite/ federation: rabbitmq/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/ federation: osx/ federation: rabbitmq/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
fi

if [[ "$storage" == "HDFS" ]]; then
sed -i "s/ storage: eggroll/ storage: hdfs/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
elif [[ "$storage" == "LocalFS" ]]; then
sed -i "s/ storage: eggroll/ storage: localfs/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
elif [[ "$storage" == "STANDALONE" ]]; then
sed -i "s/ storage: eggroll/ storage: standalone/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
fi

if [[ "$computing" == "Spark_local" ]] ; then
sed -i "s#spark.master .*#spark.master local[*]#g" ./confs-$party_id/confs/spark/spark-defaults.conf
fi
# if [[ "$computing" == "STANDALONE" ]] ; then
# sed -i "s#spark.master .*#spark.master local[*]#g" ./confs-$party_id/confs/spark/spark-defaults.conf
# fi

# compute_core
sed -i "s/nodes: .*/nodes: 1/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml
sed -i "s/cores_per_node: .*/cores_per_node: $compute_core/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml

if [[ "$federation" == "Eggroll" ]]; then
if [[ "$computing" == "Eggroll" ]]; then
sed -i "s/eggroll.session.processors.per.node=.*/eggroll.session.processors.per.node=$compute_core/g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties
fi
if [[ "$computing" == "Spark"* ]]; then
Expand All @@ -386,29 +391,14 @@ GenerateConfig() {
echo fate_flow module of $party_id done!

# federation config
# eggroll
if [[ "$federation" == "Eggroll" ]]; then
cat >./confs-$party_id/confs/eggroll/conf/route_table.json <<EOF
# OSX
sed -i "s/self.party=9999/self.party=${party_id}/g" ./confs-$party_id/confs/osx/conf/broker.properties
if [[ "$federation" == "OSX" ]]; then
cat >./confs-$party_id/confs/osx/conf/route_table.json <<EOF
{
"route_table": {
"default": {
"default": [
{
$(if [ "$exchange_ip" != "" ]; then
echo "
\"ip\": \"${exchange_ip}\",
\"port\": 9371
"
else
echo "
\"ip\": \"${proxy_ip}\",
\"port\": \"${proxy_port}\"
"
fi)
}
]
},
$(for ((j = 0; j < ${#party_list[*]}; j++)); do
"route_table":
{
$(for ((j = 0; j < ${#party_list[*]}; j++)); do
if [ "${party_id}" == "${party_list[${j}]}" ]; then
continue
fi
Expand All @@ -422,19 +412,19 @@ $(for ((j = 0; j < ${#party_list[*]}; j++)); do
"
done)
"${party_id}": {
"default": [{
"ip": "${proxy_ip}",
"port": ${proxy_port}
}],
"fateflow": [{
"ip": "${fate_flow_ip}",
"port": ${fate_flow_grpc_port}
}]
}
},
"permission": {
"default_allow": true
}
},
"self_party":[
"${party_id}"
],
"permission":
{
"default_allow": true
}
}
EOF
fi
Expand Down
4 changes: 2 additions & 2 deletions docker-deploy/parties.conf
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ serving_ip_list=(192.168.1.1 192.168.1.2)
# Engines:
# Computing : Eggroll, Spark, STANDALONE
computing=Eggroll
# Federation: Eggroll(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local)
federation=Eggroll
# Federation: OSX(computing: Eggroll/Spark/STANDALONE), Pulsar/RabbitMQ(computing: Spark/STANDALONE)
federation=OSX
# Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), STANDALONE(computing: STANDALONE)
storage=Eggroll
# Algorithm: Basic, NN, ALL
Expand Down
Loading

0 comments on commit 61190b0

Please sign in to comment.