File size: 13,194 Bytes
7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/bin/bash
# $1 is the name of the scripts folder
# pretrain.seed.0.yaml: main file, the pretrain model
# first select the best model for TL based on validation dataset in pretrain
if [ ! -f $1/pretrain.seed.0.summary ] || [ ! -s $1/pretrain.seed.0.summary ]; then
  Rscript visualize.train.process/plot.test.AUC.by.step.R $1/pretrain.seed.0.yaml > $1/pretrain.seed.0.summary
fi
number=$(cat $1/pretrain.seed.0.summary | grep 'val' | grep -oE '\([0-9]+\)' | sed 's/[(|)]//g')
logdir=$(cat $1/pretrain.seed.0.yaml | grep log_dir | sed 's/.*: //')
if [ -z $number ]; then
  best_model="null"
else
  best_model=$logdir"model.step."$number".pt"
fi
echo "Best model is: "$best_model
# origin hyper paramters
lr_warmup_steps=$(cat $1/pretrain.seed.0.yaml | grep lr_warmup_steps | sed 's/.*: //' | sed 's/ #.*//g')
num_save_batches=$(cat $1/pretrain.seed.0.yaml | grep num_save_batches | sed 's/.*: //' | sed 's/ #.*//g')
target_num_save_batches=400
num_epochs=$(cat $1/pretrain.seed.0.yaml | grep num_epochs | sed 's/.*: //' | sed 's/ #.*//g')
batch_size=$(cat $1/pretrain.seed.0.yaml | grep batch_size | sed 's/.*: //' | sed 's/ #.*//g')
lr=$(cat $1/pretrain.seed.0.yaml | grep lr: | sed 's/.*: //' | sed 's/ #.*//g')
half_lr=$(printf "%.1e" "$(echo "scale=10; $(printf "%f" "$lr")" | bc)")
five_lr=$(printf "%.1e" "$(echo "scale=10; $(printf "%f" "$lr") * 5" | bc)")
lr_min=$(cat $1/pretrain.seed.0.yaml | grep lr_min: | sed 's/.*: //' | sed 's/ #.*//g')
half_lr_min=$(echo "$lr_min" | awk '{ printf "%.1e", $1/10 }')
data_split=$(cat $1/pretrain.seed.0.yaml | grep data_split_fn | sed 's/.*: //' | sed 's/ #.*//g')
loss_fn=$(cat $1/pretrain.seed.0.yaml | grep ^loss_fn | sed 's/.*: //' | sed 's/ #.*//g')
drop_out=$(cat $1/pretrain.seed.0.yaml | grep drop_out | sed 's/.*: //' | sed 's/ #.*//g')
num_steps_update=$(cat $1/pretrain.seed.0.yaml | grep num_steps_update | sed 's/.*: //' | sed 's/ #.*//g')
ngpus=$(cat $1/pretrain.seed.0.yaml | grep ngpus | sed 's/.*: //' | sed 's/ #.*//g')
nworkers=$(cat $1/pretrain.seed.0.yaml | grep num_workers | sed 's/.*: //' | sed 's/ #.*//g')
target_nworkers=0
batch_size=$(cat $1/pretrain.seed.0.yaml | grep batch_size | sed 's/.*: //' | sed 's/ #.*//g')
echo "loss_fn was: "$loss_fn
changed_data=false
if grep -q "_by_anno" $1/pretrain.seed.0.yaml; then
    echo "modify data-file-train in original yaml"
    if [ ! -f $1/pretrain.seed.0.yaml.bak ]; then
        cp $1/pretrain.seed.0.yaml $1/pretrain.seed.0.yaml.bak
    fi
    sed -i 's|_by_anno|""|g' $1/pretrain.seed.0.yaml
    changed_data=true
fi
# prepare yaml files for all tasks
for gene in PTEN PTEN.bin CCR5 CXCR4 NUDT15 SNCA CYP2C9 GCK ASPA Stab $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt) fluorescence
do
  # use original yaml as template
  cp $1/pretrain.seed.0.yaml $1/$gene.yaml
  # ngpu should be 1
  sed -i "s|ngpus: "$ngpus"|ngpus: 1\nuse_lora: |g" $1/$gene.yaml
  # learning rate should be half
  sed -i "s|lr: "$lr"|lr: "$half_lr"|g" $1/$gene.yaml
  sed -i "s|lr_min: "$lr_min"|lr_min: "$half_lr_min"|g" $1/$gene.yaml
  # change data type
  sed -i "s|data_type: ClinVar|data_type: "$gene"|g" $1/$gene.yaml
  # change loss fn
  sed -i "s|loss_fn: "$loss_fn"|loss_fn: mse_loss|g" $1/$gene.yaml
  # change logdir
  sed -i "s|log_dir: "$logdir"|log_dir: "$logdir"TL."$gene".seed.0/|g" $1/$gene.yaml
  # change drop out rate
  sed -i "s|drop_out: "$drop_out"|drop_out: 0.1|g" $1/$gene.yaml
  # change num workers in dataloader
  sed -i "s|num_workers: "$nworkers"|num_workers: "$target_nworkers"|g" $1/$gene.yaml
  # change loaded msa
  if grep -q "loaded_msa" $1/pretrain.seed.0.yaml; then
  	sed -i "s|loaded_msa: false|loaded_msa: true|g" $1/$gene.yaml
  else
	echo "loaded_msa: true" >> $1/$gene.yaml
  fi
  # change loaded confidence
  if grep -q "loaded_confidence" $1/pretrain.seed.0.yaml; then
  	sed -i "s|loaded_confidence: false|loaded_confidence: true|g" $1/$gene.yaml
  else
	echo "loaded_confidence: true" >> $1/$gene.yaml
  fi
  if grep -q "loaded_esm" $1/pretrain.seed.0.yaml; then
	sed -i "s|loaded_esm: false|loaded_esm: true|g" $1/$gene.yaml
  else
	echo "loaded_esm: true" >> $1/$gene.yaml
  fi
done

# if original loss_fn is combined_loss or weighted_combined_loss, change loss back
if [ "$loss_fn" == "combined_loss" ] || [ "$loss_fn" == "weighted_combined_loss" ] || [ "$loss_fn" == "GP_loss" ]; then
  for gene in $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt)
  do
    sed -i "s|loss_fn: mse_loss|loss_fn: "$loss_fn"|g" $1/$gene.yaml
  done
fi

# for human genes, load_model based on the best model in pretrain
for gene in PTEN PTEN.bin CCR5 CXCR4 NUDT15 SNCA CYP2C9 GCK ASPA Stab $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt)
do
  # change load model
  orig_load_model=$(cat $1/pretrain.seed.0.yaml | grep ^load_model | sed 's/.*: //' | sed 's/ #.*//g')
  sed -i "s|load_model: "$orig_load_model"|load_model: "$best_model"|g" $1/$gene.yaml
  sed -i "s|partial_load_model: true|partial_load_model: false|g" $1/$gene.yaml
  # change num epochs to 2 times larger
  sed -i "s|num_epochs: "$num_epochs"|num_epochs: "$(($num_epochs))"|g" $1/$gene.yaml
  # warm up steps should be 20 times lower
  sed -i "s|lr_warmup_steps: "$lr_warmup_steps"|lr_warmup_steps: "$(($lr_warmup_steps/20))"|g" $1/$gene.yaml
  # num saved batches should be 20 times lower
  if [[ "PTEN PTEN.bin CCR5 CXCR4 NUDT15 SNCA CYP2C9 GCK ASPA DDX3X" == *"$gene"* ]]; then
      sed -i "s|num_save_batches: "$num_save_batches"|num_save_batches: "$(($target_num_save_batches))"|g" $1/$gene.yaml
  else
    if [[ ! "Stab" == *"$gene"* ]]; then
      sed -i "s|num_save_batches: "$num_save_batches"|num_save_batches: "$(($target_num_save_batches/80))"|g" $1/$gene.yaml
    fi
  fi
done

# for Human DMS tasks, data should be changed
for gene in PTEN PTEN.bin CCR5 CXCR4 NUDT15 SNCA CYP2C9 GCK ASPA Stab
do
  sed -i "s|pretrain/|"$gene"/|g" $1/$gene.yaml
  # add a new line that specifies "convert_to_onesite: true"
  echo "convert_to_onesite: true" >> $1/$gene.yaml
  # change num steps update
  sed -i "s|num_steps_update: "$num_steps_update"|num_steps_update: 1|g" $1/$gene.yaml
  # don't add contrastive loss because they are from same protein
  sed -i "s|contrastive_loss_fn: cosin_contrastive_loss|contrastive_loss_fn: null|g" $1/$gene.yaml
  # change output model to regression
  if [[ "PTEN.bin" == "$gene" ]]; then
      sed -i "s|loss_fn: mse_loss|loss_fn: binary_cross_entropy|g" $1/$gene.yaml
  else
      sed -i "s|BinaryClassification|Regression|g" $1/$gene.yaml
  fi
  # if Onesite is in the yaml file, means we need Weighted MSE loss
  if grep -q "Onesite" $1/$gene.yaml; then
    sed -i "s|loss_fn: mse_loss|loss_fn: mse_loss_weighted|g" $1/$gene.yaml
  fi
done

for gene in PTEN NUDT15 SNCA CYP2C9 GCK ASPA CXCR4 CCR5 
do
  # change num epochs to 2 times larger
  sed -i "s|num_epochs: "$num_epochs"|num_epochs: "$(($num_epochs*2))"|g" $1/$gene.yaml
done

for gene in PTEN PTEN.bin NUDT15 SNCA CYP2C9 GCK ASPA Stab 
do
  sed -i "s|output_dim: 1|output_dim: 2|g" $1/$gene.yaml
done

for gene in CCR5 CXCR4
do
  sed -i "s|output_dim: 1|output_dim: 3|g" $1/$gene.yaml
done

for gene in PTEN CCR5 CXCR4
do
  sed -i "s|num_epochs: "$num_epochs"|num_epochs: "$(($num_epochs))"|g" $1/$gene.yaml
done

for gene in $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt)
do
  sed -i "s|pretrain/|ICC.seed.0/"$gene"/|g" $1/$gene.yaml
  # sed -i "s|BinaryClassification|Tanh|g" $1/$gene.yaml
  sed -i "s|train_size: 0.95|train_size: 0.75|g" $1/$gene.yaml
  sed -i "s|val_size: 0.05|val_size: 0.25|g" $1/$gene.yaml
  sed -i "s|output_dim: 1|output_dim: 1|g" $1/$gene.yaml
  sed -i "s|num_steps_update: "$num_steps_update"|num_steps_update: 1|g" $1/$gene.yaml
  # change loss function to combined_loss
  sed -i "s|loss_fn: mse_loss|loss_fn: weighted_loss|g" $1/$gene.yaml
done

# for genes in gene.pfams.txt, use pre-split data
for gene in $(cat scripts/gene.pfams.txt)
do
  sed -i 's|data_split_fn: ""|data_split_fn: _by_anno|g' $1/$gene.yaml
done

# for non human data, change learning rates and data split fn
for gene in fluorescence
do
  sed -i "s|pretrain/|"$gene"/|g" $1/$gene.yaml
  # change to large learning rate
  sed -i "s|lr: "$half_lr"|lr: "$lr"|g" $1/$gene.yaml
  # # change to large batch size
  # sed -i "s|batch_size: 6|batch_size: 7|g" $1/$gene.yaml
  # data split fn
  sed -i 's|data_split_fn: ""|data_split_fn: _by_anno|g' $1/$gene.yaml
  # don't add contrastive loss because they are from same protein
  sed -i "s|contrastive_loss_fn: cosin_contrastive_loss|contrastive_loss_fn: null|g" $1/$gene.yaml
  # change output model to regression
  sed -i "s|BinaryClassification|Regression|g" $1/$gene.yaml
done

# change seed
for gene in PTEN PTEN.bin CCR5 CXCR4 NUDT15 SNCA CYP2C9 GCK ASPA Stab $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt) fluorescence
do
  # use original yaml as template
  mv $1/$gene.yaml $1/$gene.seed.0.yaml
  for seed in {1..4}
  do
    cp $1/$gene.seed.0.yaml $1/$gene.seed.$seed.yaml
    sed -i "s|seed: 0|seed: "$seed"|g" $1/$gene.seed.$seed.yaml
    sed -i "s|log_dir: "$logdir"TL."$gene".seed.0/|log_dir: "$logdir"TL."$gene".seed."$seed"/|g" $1/$gene.seed.$seed.yaml
  done
  # make a dir and move all yaml files into it
  mkdir -p $1/$gene
  mv $1/$gene.seed.*.yaml $1/$gene
done

mkdir $1/PTEN.replicates.rest/
for replicate in {1..8}
do
        cp $1/PTEN/PTEN.seed.0.yaml $1/PTEN.replicates.rest/PTEN.replicate.rest.$replicate.yaml
        sed -i "s|output_dim: 2|output_dim: 1|g" $1/PTEN.replicates.rest/PTEN.replicate.rest.$replicate.yaml
        sed -i "s|PTEN|PTEN.replicate.rest."$replicate"|g" $1/PTEN.replicates.rest/PTEN.replicate.rest.$replicate.yaml
done

bash scripts/DMS.subset.prepare.yaml.sh $1

# for all genes, prepare a large window version
need_large_window_list=$(cat scripts/gene.txt)" "$(cat scripts/gene.itan.txt)" "$(cat scripts/gene.pfams.txt)
added_large_window_list=""
for gene in $need_large_window_list
do
  added_large_window_list=$added_large_window_list" "$gene".large.window"
done
# do large window list
for gene in $need_large_window_list
do
  mkdir $1/$gene.large.window/
  cp $1/$gene/$gene.seed.0.yaml $1/$gene.large.window/$gene.large.window.seed.0.yaml
  sed -i "s|max_len: 251|max_len: 1251|g" $1/$gene.large.window/$gene.large.window.seed.0.yaml
  sed -i "s|log_dir: "$logdir"TL."$gene".seed.0/|log_dir: "$logdir"TL."$gene".large.window.seed.0/|g" $1/$gene.large.window/$gene.large.window.seed.0.yaml
done

# run IonChannel and ICC with five fold cross validation
for gene in $(cat scripts/gene.txt) $(cat scripts/gene.itan.txt) $(cat scripts/gene.pfams.txt) $added_large_window_list
do
  mkdir $1/$gene.5fold/
  cp $1/$gene/$gene.seed.0.yaml $1/$gene.5fold/$gene.fold.0.yaml
  for fold in {1..4}
  do
    cp $1/$gene.5fold/$gene.fold.0.yaml $1/$gene.5fold/$gene.fold.$fold.yaml
    sed -i "s|ICC.seed.0|ICC.seed."$fold"|g" $1/$gene.5fold/$gene.fold.$fold.yaml
    sed -i "s|TL."$gene".seed.0|TL."$gene".seed.0.fold."$fold"|g" $1/$gene.5fold/$gene.fold.$fold.yaml
  done
done

# run ICC genes with five fold cross validation in subsets
for gene in $(cat scripts/gene.txt)
do
  # use ratio of 1 2 4 6
  for subset in 1 2 4 6
  do
    mkdir $1/$gene.subset.$subset.5fold/
    cp -r $1/$gene.5fold/* $1/$gene.subset.$subset.5fold/
    for fold in {0..4}
    do
      # change num_save_batches to 2 if subset is 1 and 2
      if [[ $subset -lt 3 ]]; then
        sed -i "s|num_save_batches: "$(($target_num_save_batches/80))"|num_save_batches: "$(($target_num_save_batches/200))"|g" $1/$gene.subset.$subset.5fold/$gene.fold.$fold.yaml
        # warm up steps should be 200 times lower
        sed -i "s|lr_warmup_steps: "$(($lr_warmup_steps/20))"|lr_warmup_steps: "$(($lr_warmup_steps/400))"|g" $1/$gene.subset.$subset.5fold/$gene.fold.$fold.yaml
      else
        # warm up steps should be 80 times lower
        sed -i "s|lr_warmup_steps: "$(($lr_warmup_steps/20))"|lr_warmup_steps: "$(($lr_warmup_steps/200))"|g" $1/$gene.subset.$subset.5fold/$gene.fold.$fold.yaml
      fi
      # use the subset2 data, not the subset
      sed -i "s|"$gene"|"$gene".subset2."$subset"|g" $1/$gene.subset.$subset.5fold/$gene.fold.$fold.yaml
      mv $1/$gene.subset.$subset.5fold/$gene.fold.$fold.yaml $1/$gene.subset.$subset.5fold/$gene.subset.$subset.fold.$fold.yaml
    done
  done
done

# run DMS with five fold cross validation
for gene in PTEN PTEN.bin NUDT15 CCR5 CXCR4 SNCA CYP2C9 GCK ASPA Stab
do
  mkdir $1/$gene.5fold/
  cp $1/$gene/$gene.seed.0.yaml $1/$gene.5fold/$gene.fold.0.yaml
  for fold in {1..4}
  do
    cp $1/$gene.5fold/$gene.fold.0.yaml $1/$gene.5fold/$gene.fold.$fold.yaml
    sed -i "s|training|train.seed."$fold"|g" $1/$gene.5fold/$gene.fold.$fold.yaml
    sed -i "s|testing.csv|test.seed."$fold".csv|g" $1/$gene.5fold/$gene.fold.$fold.yaml
    sed -i "s|TL."$gene".seed.0|TL."$gene".seed.0.fold."$fold"|g" $1/$gene.5fold/$gene.fold.$fold.yaml
  done
done

echo $changed_data
if [ $changed_data = true ]; then
  echo "change data-file-train back to original yaml"
  mv $1/pretrain.seed.0.yaml.bak $1/pretrain.seed.0.yaml
fi