-
Notifications
You must be signed in to change notification settings - Fork 2
/
submit_train.sh
54 lines (41 loc) · 2.12 KB
/
submit_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/bin/bash
# SLURM submission script: fine-tunes the multitask classifier on one A100 GPU.
# Submit with: sbatch submit_train.sh  (slurm logs go to ./slurm_files/).
#SBATCH --job-name=train-cdcr-gpu
#SBATCH -t 02:30:00 # estimated time # TODO: adapt to your needs
#SBATCH -p grete:shared # the partition you are training on (i.e., which nodes), for nodes see sinfo -p grete:shared --format=%N,%G
#SBATCH -G A100:1 # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
#SBATCH --mem-per-gpu=30G # setting the right constraints for the splitted gpu partitions
#SBATCH --nodes=1 # total number of nodes
#SBATCH --ntasks=1 # total number of tasks
#SBATCH --cpus-per-task=4 # number cores per task
#SBATCH --mail-type=all # send mail when job begins and ends
#SBATCH [email protected] # TODO: change this to your mailaddress!
#SBATCH --output=./slurm_files/slurm-%x-%j.out # where to write output, %x give job name, %j names job id
#SBATCH --error=./slurm_files/slurm-%x-%j.err # where to write slurm error

# Pick up conda initialization etc. from the login shell config.
source ~/.bashrc

# Load the CUDA toolkit module on the compute node.
module load cuda

# Debug info: which Python is on PATH before the env is activated.
command -v python
echo "$PATH"
conda info --envs

# Activate the project environment. Abort the job if activation fails,
# otherwise training would silently run with the wrong interpreter.
# NOTE(review): 'source activate' is the legacy conda form; if the cluster's
# conda supports it, 'conda activate dnlp' is the modern equivalent.
source activate dnlp || { echo "ERROR: could not activate conda env 'dnlp'" >&2; exit 1; }

# Printing out some info.
echo "Submitting job with sbatch from directory: ${SLURM_SUBMIT_DIR}"
echo "Home directory: ${HOME}"
echo "Working directory: $PWD"
echo "Current node: ${SLURM_NODELIST}"

# Compute nodes have no direct internet access; route through the GWDG proxy
# so model/tokenizer downloads work.
export http_proxy=http://www-cache.gwdg.de:3128
export https_proxy=http://www-cache.gwdg.de:3128
echo "added http proxy"

# For debugging purposes: record interpreter, torch environment, and CUDA version.
python --version
python -m torch.utils.collect_env
nvcc -V

# Run the script:
#Pretrain setting
#python -u classifier.py --option pretrain --use_gpu
#finetune setting
#python -u classifier.py --option finetune --use_gpu --epochs 30 --batch_size 128 --lr 1e-5
#python -u multitask_classifier.py --option pretrain --lr 1e-3 --batch_size 64 --local_files_only --use_gpu
python -u multitask_classifier.py --option finetune --lr 1e-5 --batch_size 64 --local_files_only --use_gpu --epochs 13
#python -u predictModel.py --use_gpu
# Run the script with logger:
#python -u train_with_logger.py -l ~/${SLURM_JOB_NAME}_${SLURM_JOB_ID} -t True -p True -d True -s True -f True