
Merge pull request #2 from h4iku/update-opennmt
Update OpenNMT-py scripts to work with v2
EhsanMashhadi authored Jan 8, 2022
2 parents f8fda9d + 554afff commit 61e6ff0
Showing 14 changed files with 123 additions and 58 deletions.
49 changes: 31 additions & 18 deletions README.md
@@ -3,38 +3,51 @@
 ### Paper
 You can find the paper here: https://arxiv.org/abs/2103.11626
 ### Data
-data folder contains multiple folders and files:
+The `data` folder contains multiple folders and files:
 
 - `repetition` folder contains MSR datasets WITH <buggy code, fixed code> duplicate pairs
 - `unique` folder contains MSR datasets WITHOUT <buggy code, fixed code> duplicate pairs
 - `sstubs(Large|Small).json` files contain the dataset in JSON format
 - `sstubs(Large|Small)-(train|test|val).json` files contain the dataset splits in JSON format
 - `split/(large|small)` folders contain the dataset in text format (what CodeBERT works with)
 
-### Running CodeBert Experiments
-1. Download the CodeBERT model
-   - git lfs install
-   - git clone https://huggingface.co/microsoft/codebert-base
-   - use current path as `pretrained_model` variable in script files
-2. Clone the repository
-   - git clone https://github.com/EhsanMashhadi/MSR2021-ProgramRepair.git
+### Running CodeBERT Experiments
+1. Clone the repository
+   - `git lfs install`
+   - `git clone https://github.com/EhsanMashhadi/MSR2021-ProgramRepair.git`
+2. Download the CodeBERT model
+   - `cd MSR2021-ProgramRepair`
+   - `git clone https://huggingface.co/microsoft/codebert-base`
+   - use the downloaded model's directory path as the `pretrained_model` variable in the script files
 3. Install dependencies
-   - pip install torch==1.4.0
-   - pip install transformers==2.5.0
+   - `pip install torch==1.4.0`
+   - `pip install transformers==2.5.0`
 4. Train the model with MSR data
-   - bash ./scripts/codebert/train.sh
+   - `bash ./scripts/codebert/train.sh`
 5. Evaluate the model
-   - bash ./scripts/codebert/test.sh
+   - `bash ./scripts/codebert/test.sh`
 
 ### Running Simple LSTM Experiments
-1. Install OpenNMT-py
-   - https://github.com/OpenNMT/OpenNMT-py
+1. Install [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py)
+   - `pip install OpenNMT-py==2.2.0`
 2. Preprocess the MSR data
-   - bash ./scripts/simple-lstm/preprocess.sh
+   - `bash ./scripts/simple-lstm/build_vocab.sh`
 3. Train the model
-   - bash ./scripts/simple-lstm/train.sh
+   - `bash ./scripts/simple-lstm/train.sh`
 4. Evaluate the model
-   - bash ./scripts/simple-lstm/test.sh
+   - `bash ./scripts/simple-lstm/test.sh`
 
+### Running Simple LSTM Experiments using the legacy version of OpenNMT-py
+**(This is the original version used to run the simple LSTM experiments in the paper.)**
+
+1. Install [OpenNMT-py legacy](https://github.com/OpenNMT/OpenNMT-py/tree/legacy)
+   - `pip install OpenNMT-py==1.2.0`
+2. Preprocess the MSR data
+   - `bash ./scripts/simple-lstm/legacy/preprocess.sh`
+3. Train the model
+   - `bash ./scripts/simple-lstm/legacy/train.sh`
+4. Evaluate the model
+   - `bash ./scripts/simple-lstm/legacy/test.sh`
+
 ### How to run all of the experiments?
-- You can change the `size` and `type` variables value in script files to run different experiments (large | small, unique | repetition)
+- You can change the `size` and `type` variable values in the script files to run different experiments (large | small, unique | repetition).
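The four experiment configurations differ only in the `size` and `type` variables set at the top of each script. A minimal driver (hypothetical, not part of this commit) that patches those two lines and runs every combination could look like this, assuming each script defines exactly one `size=` and one `type=` line, as the scripts in this commit do:

```bash
#!/bin/bash
# Run all four <type, size> combinations of one experiment script.
script=./scripts/codebert/train.sh
for type in unique repetition; do
    for size in small large; do
        sed -i "s/^size=.*/size=$size # Can be: small OR large/" "$script"
        sed -i "s/^type=.*/type=$type # Can be: repetition OR unique/" "$script"
        bash "$script"
    done
done
```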
13 changes: 6 additions & 7 deletions scripts/codebert/test.sh
@@ -4,16 +4,15 @@ batch_size=8
 beam_size=5
 source_length=510
 target_length=510
-size=small #Can be: small OR large
-type=unique #Can be: repetition OR unique
-data_dir=../data/$type/split/$size
-output_dir=saved_models/codebert/$type/$size
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_dir=./data/$type/split/$size
+output_dir=./saved_models/codebert/$type/$size
 validate_file=$data_dir/src-val.txt,$data_dir/tgt-val.txt
 test_file=$data_dir/src-test.txt,$data_dir/tgt-test.txt
 test_model=$output_dir/checkpoint-best-ppl/pytorch_model.bin
-pretrained_model=./code-bert #CodeBert model path downloaded from Huggingface
-pretrained_model=../../codebert-model/codebert-base
-CodeBERT=../../codebert
+pretrained_model=./codebert-base # CodeBERT model path downloaded from Hugging Face
+CodeBERT=./codebert
 
 python $CodeBERT/run.py \
     --do_test \
…
13 changes: 6 additions & 7 deletions scripts/codebert/train.sh
@@ -7,15 +7,14 @@ source_length=510
 target_length=510
 train_steps=50000
 eval_steps=1000
-size=large #Can be: small OR large
-type=unique #Can be: repetition OR unique
-data_dir=../data/$type/split/$size
-output_dir=saved_models/codebert/$type/$size
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_dir=./data/$type/split/$size
+output_dir=./saved_models/codebert/$type/$size
 train_file=$data_dir/src-train.txt,$data_dir/tgt-train.txt
 validate_file=$data_dir/src-val.txt,$data_dir/tgt-val.txt
-pretrained_model=./code-bert #CodeBert model path downloaded from Huggingface
-pretrained_model=../../codebert-model/codebert-base
-CodeBERT=../../codebert
+pretrained_model=./codebert-base # CodeBERT model path downloaded from Hugging Face
+CodeBERT=./codebert
 
 
 python $CodeBERT/run.py \
…
13 changes: 13 additions & 0 deletions scripts/simple-lstm/build_vocab.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_config=./scripts/simple-lstm/${type}_${size}_data.yaml
+output_dir=./saved_models/simple-lstm/$type/$size
+
+if [ ! -d $output_dir ]
+then
+    mkdir -p $output_dir
+fi
+
+onmt_build_vocab -config $data_config -src_seq_length 510 -tgt_seq_length 510 -src_vocab_size 64000 -tgt_vocab_size 64000 -share_vocab -n_sample -1 -save_data $output_dir/final -src_vocab $output_dir/final.vocab
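`onmt_build_vocab` reads the train/validation paths from the YAML config (the `*_data.yaml` files added in this commit) and, with `-share_vocab`, should write a single shared vocabulary to the path given by `-src_vocab`. A quick sanity check, assuming the v2 plain-text vocab format of one token and count per line:

```bash
# Hypothetical check after running build_vocab.sh with type=unique, size=small.
vocab=./saved_models/simple-lstm/unique/small/final.vocab
head -n 5 "$vocab"   # highest-frequency tokens first (assumed ordering)
wc -l "$vocab"       # should not exceed the 64000 set by -src_vocab_size
```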
13 changes: 13 additions & 0 deletions scripts/simple-lstm/legacy/preprocess.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_path=./data/$type/split/$size
+output_dir=./saved_models/simple-lstm-legacy/$type/$size
+
+if [ ! -d $output_dir ]
+then
+    mkdir -p $output_dir
+fi
+
+onmt_preprocess -train_src $data_path/src-train.txt -train_tgt $data_path/tgt-train.txt -valid_src $data_path/src-val.txt -valid_tgt $data_path/tgt-val.txt --src_seq_length 510 --tgt_seq_length 510 --src_vocab_size 64000 --tgt_vocab_size 64000 -dynamic_dict -share_vocab --save_data $output_dir/final
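Unlike the v2 flow, the legacy `onmt_preprocess` step serializes the numericalized data and vocabulary up front. With OpenNMT-py 1.x defaults this should leave `.pt` shards under the `final` prefix; the exact file names below are an assumption based on the 1.x sharding scheme:

```bash
# Expected artifacts after preprocessing (names assumed from OpenNMT-py 1.x).
ls ./saved_models/simple-lstm-legacy/unique/small/
# final.train.0.pt  final.valid.0.pt  final.vocab.pt
```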
8 changes: 8 additions & 0 deletions scripts/simple-lstm/legacy/test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_path=./data/$type/split/$size
+output_dir=./saved_models/simple-lstm-legacy/$type/$size
+
+onmt_translate -model $output_dir/final-model_step_20000.pt -src $data_path/src-test.txt -beam_size 5 -n_best 1 -output $output_dir/pred-test_beam5.txt -dynamic_dict
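Since `-n_best 1` writes exactly one hypothesis per input, a quick line-count check confirms the decode covered the whole test set:

```bash
# Both counts should match: one prediction line per source line.
wc -l ./data/unique/split/small/src-test.txt \
      ./saved_models/simple-lstm-legacy/unique/small/pred-test_beam5.txt
```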
8 changes: 8 additions & 0 deletions scripts/simple-lstm/legacy/train.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+output_dir=./saved_models/simple-lstm-legacy/$type/$size
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+onmt_train -data $output_dir/final -world_size 4 -gpu_ranks 0 1 2 3 -encoder_type brnn -enc_layers 2 -decoder_type rnn -dec_layers 2 -rnn_size 256 -global_attention general -batch_size 32 -word_vec_size 256 -bridge -copy_attn -reuse_copy_attn -train_steps 20000 -save_checkpoint_steps 5000 -valid_steps 1000 -save_model $output_dir/final-model
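The training command assumes four visible GPUs. A single-GPU sketch (an assumption about available hardware; every other hyperparameter is kept exactly as in the script) only changes the world size and ranks:

```bash
# Single-GPU variant of the legacy training run ($output_dir as in the script).
export CUDA_VISIBLE_DEVICES=0
onmt_train -data $output_dir/final -world_size 1 -gpu_ranks 0 \
    -encoder_type brnn -enc_layers 2 -decoder_type rnn -dec_layers 2 \
    -rnn_size 256 -global_attention general -batch_size 32 -word_vec_size 256 \
    -bridge -copy_attn -reuse_copy_attn -train_steps 20000 \
    -save_checkpoint_steps 5000 -valid_steps 1000 -save_model $output_dir/final-model
```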
14 changes: 0 additions & 14 deletions scripts/simple-lstm/preprocess.sh

This file was deleted.

7 changes: 7 additions & 0 deletions scripts/simple-lstm/repetition_large_data.yaml
@@ -0,0 +1,7 @@
+data:
+    corpus:
+        path_src: data/repetition/split/large/src-train.txt
+        path_tgt: data/repetition/split/large/tgt-train.txt
+    valid:
+        path_src: data/repetition/split/large/src-val.txt
+        path_tgt: data/repetition/split/large/tgt-val.txt
7 changes: 7 additions & 0 deletions scripts/simple-lstm/repetition_small_data.yaml
@@ -0,0 +1,7 @@
+data:
+    corpus:
+        path_src: data/repetition/split/small/src-train.txt
+        path_tgt: data/repetition/split/small/tgt-train.txt
+    valid:
+        path_src: data/repetition/split/small/src-val.txt
+        path_tgt: data/repetition/split/small/tgt-val.txt
11 changes: 5 additions & 6 deletions scripts/simple-lstm/test.sh
@@ -1,9 +1,8 @@
 #!/bin/bash
 
-OpenNMT_py=../../OpenNMT-py
-size=large #Can be: small OR large
-type=unique #Can be: repetition OR unique
-data_path=../../data/$type/split/$size
-output_dir=../../saved_models/simple-lstm/$type/$size
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_path=./data/$type/split/$size
+output_dir=./saved_models/simple-lstm/$type/$size
 
-python $OpenNMT_py/translate.py -model $output_dir/final-model_step_20000.pt -src $data_path/src-test.txt -beam_size 5 -n_best 1 -output $output_dir/pred-test_beam5.txt -dynamic_dict
+onmt_translate -model $output_dir/final-model_step_20000.pt -src $data_path/src-test.txt -beam_size 5 -n_best 1 -output $output_dir/pred-test_beam5.txt
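The script stops after writing predictions. A rough exact-match rate against the reference fixed code can be computed line by line; this is a sketch, not the paper's evaluation, and it assumes the tokenized lines contain no tab characters:

```bash
# Fraction of predictions identical to the reference fix (unique/small run).
paste ./saved_models/simple-lstm/unique/small/pred-test_beam5.txt \
      ./data/unique/split/small/tgt-test.txt \
    | awk -F'\t' '$1 == $2 { hits++ } END { if (NR) printf "exact match: %.4f\n", hits / NR }'
```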
11 changes: 5 additions & 6 deletions scripts/simple-lstm/train.sh
@@ -1,9 +1,8 @@
 #!/bin/bash
 
-OpenNMT_py=../../OpenNMT-py
-size=large #Can be: small OR large
-type=unique #Can be: repetition OR unique
-output_dir=../../saved_models/simple-lstm/$type/$size
+size=small # Can be: small OR large
+type=unique # Can be: repetition OR unique
+data_config=./scripts/simple-lstm/${type}_${size}_data.yaml
+output_dir=./saved_models/simple-lstm/$type/$size
 
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python $OpenNMT_py/train.py -data $output_dir/final -world_size 4 -gpu_ranks 0 1 2 3 -encoder_type brnn -enc_layers 2 -decoder_type rnn -dec_layers 2 -rnn_size 256 -global_attention general -batch_size 32 -word_vec_size 256 -bridge -copy_attn -reuse_copy_attn -train_steps 20000 -save_checkpoint_steps 5000 -valid_steps 1000 -save_model $output_dir/final-model
+onmt_train -config $data_config -share_vocab -src_vocab $output_dir/final.vocab -world_size 4 -gpu_ranks 0 1 2 3 -encoder_type brnn -enc_layers 2 -decoder_type rnn -dec_layers 2 -rnn_size 256 -global_attention general -batch_size 32 -word_vec_size 256 -bridge -copy_attn -reuse_copy_attn -train_steps 20000 -save_checkpoint_steps 5000 -valid_steps 1000 -save_model $output_dir/final-model
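`-src_vocab` here must point at the vocabulary that `build_vocab.sh` wrote, so the v2 pipeline is order-dependent. The assumed end-to-end sequence, matching steps 2-4 of the updated README:

```bash
bash ./scripts/simple-lstm/build_vocab.sh   # writes $output_dir/final.vocab
bash ./scripts/simple-lstm/train.sh         # trains, reading that vocab via -src_vocab
bash ./scripts/simple-lstm/test.sh          # decodes with the step-20000 checkpoint
```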
7 changes: 7 additions & 0 deletions scripts/simple-lstm/unique_large_data.yaml
@@ -0,0 +1,7 @@
+data:
+    corpus:
+        path_src: data/unique/split/large/src-train.txt
+        path_tgt: data/unique/split/large/tgt-train.txt
+    valid:
+        path_src: data/unique/split/large/src-val.txt
+        path_tgt: data/unique/split/large/tgt-val.txt
7 changes: 7 additions & 0 deletions scripts/simple-lstm/unique_small_data.yaml
@@ -0,0 +1,7 @@
+data:
+    corpus:
+        path_src: data/unique/split/small/src-train.txt
+        path_tgt: data/unique/split/small/tgt-train.txt
+    valid:
+        path_src: data/unique/split/small/src-val.txt
+        path_tgt: data/unique/split/small/tgt-val.txt
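The four `*_data.yaml` files are identical except for the `data/<type>/split/<size>` path prefix. A hypothetical helper (not in the repository) could regenerate all of them from one template instead of maintaining four copies:

```bash
#!/bin/bash
# Regenerate the four OpenNMT-py v2 data configs from a single template.
for type in unique repetition; do
    for size in small large; do
        cat > "scripts/simple-lstm/${type}_${size}_data.yaml" <<EOF
data:
    corpus:
        path_src: data/$type/split/$size/src-train.txt
        path_tgt: data/$type/split/$size/tgt-train.txt
    valid:
        path_src: data/$type/split/$size/src-val.txt
        path_tgt: data/$type/split/$size/tgt-val.txt
EOF
    done
done
```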
