forked from daqcri/DeepER
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRun_DLER.sh
executable file
·112 lines (97 loc) · 4.93 KB
/
Run_DLER.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
set -e

# Argument gate: six positional parameters are required. With fewer, print the
# usage banner (every spacer line is a single space character) and terminate
# with exit status 1.
if (( $# < 6 )); then
  printf '%s\n' \
    " " \
    "USAGE $0 <Dataset> <first-table-name> <second-table-name> <do-sample> <do-preprocess> <do-full-sampling-control>" \
    " " \
    "INFO This script must be run from DLER dir" \
    " " \
    "INFO Other Fine Grained Parameters like data splitting, neg/pos ratio, columns to include, etc. can be controlled inside the script" \
    " " \
    "INFO e.g.: bash Run_DLER.sh Abt-Buy Abt Buy no no no" \
    " "
  exit 1
fi
echo " "
echo "Other Fine Grained Parameters like data splitting, neg/pos ratio, columns to include, etc. can be controlled inside the script"
echo " "

# ---------------------------------------------------------------------------
# Positional arguments. The order follows the USAGE line:
#   <Dataset> <first-table-name> <second-table-name>
#   <do-sample> <do-preprocess> <do-full-sampling-control>
# ---------------------------------------------------------------------------
DATA_DIR="$PWD/data"
DATASET=$1
FIRST_TABLE=$2
SECOND_TABLE=$3
SAMPLE=$4        # "no": train/dev/test data has already been sampled
PREPROCESS=$5    # "no": preprocessing has been done, i.e. fetch_and_preprocess.sh has been run before and the DATASET_DLER folder is ready
FULL_CONTROL=$6  # "no": sample random negatives with the same ratio on all splits and include all columns;
                 # "yes": sample all three types of negatives with different ratios across splits and choose which columns to include

DATASET_DIR="$DATA_DIR/DataSets/$DATASET"             # where the three main inputs (table1, table2, matches) are stored
SPLIT_DATASET_DIR="$DATA_DIR/SplitDataSets/$DATASET"  # where data splits (train, dev, test) will be stored
DATASET_DLER="$DATA_DIR/dler/$DATASET"                # same role as SPLIT_DATASET_DIR; this is the folder handed to parsing/preprocessing and to training

# SIMPLE CONTROL SAMPLING
SEED=31
TRAIN_RATIO=0.25
DEV_RATIO=0.25
TEST_RAIO=0.5    # (sic) misspelled name kept as-is: the sampling commands further down read TEST_RAIO
NEG_TO_POS_RATIO=1

# FULL CONTROL SAMPLING
RATIO_NEG_TO_POS_TYPE_1=6  # e.g. 2 => two neg examples per pos example where the table-one record of a random pos is part of the tuple
RATIO_NEG_TO_POS_TYPE_2=6  # e.g. 2 => two neg examples per pos example where the table-two record of a random pos is part of the tuple
RATIO_NEG_TO_POS_TYPE_3=6  # e.g. 2 => two neg examples per pos example where neither record is part of any pos example
RATIO_OF_NEG_TO_INCLUDE_IN_TRAINING=0.025  # of all sampled negative training examples, the ratio to keep (class balancing)
COLUMNS_TO_INCLUDE_FROM_FIRST="1,2,3"   # only include these columns as part of the record; column 0 is assumed to be the id
COLUMNS_TO_INCLUDE_FROM_SECOND="1,2,4"  # only include these columns as part of the record; column 0 is assumed to be the id

# TRAINING PARAMETERS
NET_ARCH='avg'        # avg | lstm | bilstm
OPTIM_METHOD='adam'   # sgd | adam | adagrad
BATCH_SIZE=16
RNN_DIM=150           # rnn units
HIDDEN_LAYER_SIZE=50
LEARNING_RATE=0.01
REGULARIZATION=1e-3
END_TO_END_LEARNING=false  # updates embeddings for current training session
EMBEDDING_UPDATE_RATE=0.01
NOISE_LEVEL=0
NUM_EPOCHS=20
# Sampling stage. Runs only when SAMPLE is "yes"; otherwise it just reports
# that no sampling happened. It recreates the split and DLER output trees from
# scratch and then invokes one of the two PrepData executables:
#   FULL_CONTROL = "yes" -> PrepMagellan.exe (per-type negative ratios, column selection)
#   anything else        -> PrepData.exe     (single neg/pos ratio)
# Every path is quoted so a working directory containing whitespace is passed
# as a single argument.
if [ "$SAMPLE" = "yes" ]; then
  echo 'building required dir structure ..'

  # ${VAR:?} aborts instead of letting rm -rf run on an empty path.
  # mkdir -p also creates missing parents (e.g. data/SplitDataSets, data/dler).
  rm -rf -- "${SPLIT_DATASET_DIR:?}"
  mkdir -p -- "$SPLIT_DATASET_DIR"
  rm -rf -- "${DATASET_DLER:?}"
  mkdir -p -- "$DATASET_DLER"/{avg,lstm,bilstm}/debug

  if [ "$FULL_CONTROL" = "yes" ]; then
    echo 'full control sampling & preparing the data for parsing ..'
    "$PWD/PrepData/PrepMagellan.exe" \
      "$DATASET_DIR/${DATASET}_perfectMapping.csv" "$DATASET_DIR/$FIRST_TABLE.csv" "$DATASET_DIR/$SECOND_TABLE.csv" \
      "$SPLIT_DATASET_DIR/train.csv" "$SPLIT_DATASET_DIR/dev.csv" "$SPLIT_DATASET_DIR/test.csv" \
      "$DATASET_DLER/train.txt" "$DATASET_DLER/dev.txt" "$DATASET_DLER/test.txt" \
      "$TRAIN_RATIO" "$DEV_RATIO" "$TEST_RAIO" \
      "$RATIO_NEG_TO_POS_TYPE_1" "$RATIO_NEG_TO_POS_TYPE_2" "$RATIO_NEG_TO_POS_TYPE_3" \
      "$RATIO_OF_NEG_TO_INCLUDE_IN_TRAINING" \
      "$SEED" "$COLUMNS_TO_INCLUDE_FROM_FIRST" "$COLUMNS_TO_INCLUDE_FROM_SECOND"
    echo " "
  else
    echo 'simle sampling & preparing the data for parsing ..'
    "$PWD/PrepData/PrepData.exe" \
      "$DATASET_DIR/${DATASET}_perfectMapping.csv" "$DATASET_DIR/$FIRST_TABLE.csv" "$DATASET_DIR/$SECOND_TABLE.csv" \
      "$DATASET_DLER/train.txt" "$DATASET_DLER/dev.txt" "$DATASET_DLER/test.txt" \
      "$TRAIN_RATIO" "$DEV_RATIO" "$TEST_RAIO" "$NEG_TO_POS_RATIO" "$SEED"
    echo " "
  fi
else
  echo "No Sampling was done!"
fi
# Preprocessing stage: when PREPROCESS is "yes", hand the dataset working
# directory to fetch_and_preprocess.sh (looked up relative to the current
# directory). The path is quoted so it arrives as exactly one argument even
# when it contains whitespace.
if [ "$PREPROCESS" = "yes" ]; then
  echo 'preprocessing and parsing the data for model building and testing ..'
  sh fetch_and_preprocess.sh "$DATASET_DLER"
  echo " "
fi
# Training stage: launch relatedness/main.lua through `th` with the training
# parameters configured earlier in this script.
# --update_emb receives END_TO_END_LEARNING (the embedding-update switch) and
# --emb_learning_rate receives EMBEDDING_UPDATE_RATE; previously the rate was
# passed to both flags and END_TO_END_LEARNING was never used.
# NOTE(review): main.lua's option handling is not visible here; the wiring
# follows the flag names and the configuration comments.
echo 'building a model and test it ..'
th relatedness/main.lua \
  --model "$NET_ARCH" --data_sub_folder "$DATASET_DLER" \
  --optim_method "$OPTIM_METHOD" --batch_size "$BATCH_SIZE" \
  --dim "$RNN_DIM" --sim_nhidden "$HIDDEN_LAYER_SIZE" \
  --learning_rate "$LEARNING_RATE" --regularization "$REGULARIZATION" \
  --update_emb "$END_TO_END_LEARNING" --emb_learning_rate "$EMBEDDING_UPDATE_RATE" \
  --noise_level "$NOISE_LEVEL" --epochs "$NUM_EPOCHS"