test(workflow): change env into flash2 and add rerun workflow #48

Merged: 5 commits, Feb 23, 2024
14 changes: 7 additions & 7 deletions .github/workflows/demo_in_readme.yaml
@@ -23,12 +23,12 @@ jobs:

- name: raw-chinese-data
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

- name: alpaca-data
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/data/tokenizer_alpaca.sh

train:
@@ -44,26 +44,26 @@ jobs:
- name: slurm-train
id: basic_train
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

- name: load_new_ckpt
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

- name: torchrun-train
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

@@ -79,7 +79,7 @@ jobs:

- name: convert-model-then-load
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/model/convert_to_hf.sh
cd ./hf_ckpt
7 changes: 4 additions & 3 deletions .github/workflows/e2e_test.yaml
@@ -7,12 +7,13 @@ on:
- "doc/**"
- "**.md"
env:
+ WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
SLURM_PARTITION: llm_s

jobs:
training_4GPU:
runs-on: [t_cluster]
- timeout-minutes: 10
+ timeout-minutes: 15
steps:
- name: mask env
run: |
Expand All @@ -22,8 +23,8 @@ jobs:

- name: training_4GPU
run: |
- source $evo_env
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+ source activate ${evo_env_torch21_flash2}
+ srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

training_8GPU_ISP:
runs-on: [t_cluster]
2 changes: 1 addition & 1 deletion .github/workflows/pr_before_merge.yaml
@@ -24,6 +24,6 @@ jobs:

- name: model_init_tests
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
4 changes: 2 additions & 2 deletions .github/workflows/pr_merged.yaml
@@ -24,7 +24,7 @@ jobs:

- name: acc_tests
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py

@@ -40,7 +40,7 @@ jobs:

- name: loss_tests
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py

21 changes: 21 additions & 0 deletions .github/workflows/rerun.yaml
@@ -0,0 +1,21 @@
name: check-status

on:
workflow_run:
workflows: [unit-tests,pr-merged,weekly-tests]
types: [completed]

jobs:
on-failure:
runs-on: ubuntu-latest
if: ${{ (github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'develop') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }}
steps:
- run: |
echo 'The triggering workflow failed'
sleep 600
curl -L \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ github.token }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs
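Note: the new rerun.yaml job waits 10 minutes after a failed run of unit-tests, pr-merged, or weekly-tests on main or develop, then calls GitHub's "re-run failed jobs" REST endpoint, and only while the failed run is still on its first or second attempt. The same endpoint can be exercised manually when debugging a stuck run; a minimal sketch, assuming the gh CLI is authenticated with a token that has Actions write permission (OWNER/REPO and the run id 1234567890 are placeholders):

    # POST to the rerun-failed-jobs endpoint, mirroring the curl call in rerun.yaml
    gh api --method POST \
      -H "Accept: application/vnd.github+json" \
      repos/OWNER/REPO/actions/runs/1234567890/rerun-failed-jobs

    # gh also wraps the same call in a convenience command
    gh run rerun 1234567890 --failed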
12 changes: 6 additions & 6 deletions .github/workflows/unit_tests.yaml
@@ -31,7 +31,7 @@ jobs:

- name: core_pipeline
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py

@@ -47,7 +47,7 @@ jobs:

- name: utils_storage_manager
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py

@@ -63,7 +63,7 @@ jobs:

- name: model_fused_precision
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py

@@ -79,7 +79,7 @@ jobs:

- name: data_batch_sample
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py

@@ -95,7 +95,7 @@ jobs:

- name: utils_timeout
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py

@@ -111,7 +111,7 @@ jobs:

- name: utils_model_checkpoint
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py

4 changes: 2 additions & 2 deletions .github/workflows/upload_to_pypi.yaml
@@ -34,7 +34,7 @@ jobs:

- name: build and upload package
run: |
+ source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
- source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
31 changes: 16 additions & 15 deletions .github/workflows/weekly_test.yaml
@@ -4,12 +4,13 @@ on:
schedule:
- cron: '56 18 * * 5'
env:
+ WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
SLURM_PARTITION: llm_s

jobs:
training_4GPU:
runs-on: [t_cluster]
- timeout-minutes: 10
+ timeout-minutes: 15
steps:
- name: mask env
run: |
@@ -21,8 +22,8 @@ jobs:

- name: training_4GPU
run: |
- source $evo_env
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+ source activate ${evo_env_torch21_flash2}
+ srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

training_8GPU_4DP2TP:
runs-on: [t_cluster]
@@ -38,7 +39,7 @@ jobs:

- name: training_8GPU_4DP2TP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py

@@ -56,7 +57,7 @@ jobs:

- name: training_8GPU_4DP2TPSP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py

@@ -74,7 +75,7 @@ jobs:

- name: training_8GPU_4DP2PP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py

@@ -92,7 +93,7 @@ jobs:

- name: training_8GPU_4DP2PP_InterleavedOverlap
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
@@ -111,7 +112,7 @@ jobs:

- name: training_16GPU_4DP2TP2PP_MTP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
@@ -130,7 +131,7 @@ jobs:

- name: training_16GPU_4DP2TP2PP_MSP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
@@ -149,7 +150,7 @@ jobs:

- name: training_16GPU_4DP2TP2PP_FSP
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
@@ -199,7 +200,7 @@ jobs:

- name: test_optimizer
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

unit_test_model:
@@ -216,17 +217,17 @@ jobs:

- name: test_embedding_accuracy
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py

- name: test_model_internlm_accuracy
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py

- name: test_norm_accuracy
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py

load_ckpt_then_assert_loss:
@@ -243,7 +244,7 @@ jobs:

- name: test_ckpt_loss
run: |
- source $evo_env
+ source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_load_ckpt_loss.py

notify_to_feishu:
2 changes: 1 addition & 1 deletion ci_scripts/model/convert_to_hf.sh
@@ -12,7 +12,7 @@ readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model"
readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json"
readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py"
exit_code=0
- expected_num=8
+ expected_num=9

source ./ci_scripts/common/basic_func.sh
