forked from sanger-pathogens/SPN-Resistance_Sanger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SPN_pbpDB-Updater.sh
executable file
·210 lines (183 loc) · 6.96 KB
/
SPN_pbpDB-Updater.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env bash
while getopts :r:u:t: option
do
case $option in
r) refSeq_path=$OPTARG;;
u) update_PBP=$OPTARG;;
t) table_out=$OPTARG;;
esac
done
###Check if the PBP reference paths exist and are correct###
if [[ -z "$refSeq_path" ]]
then
echo "No PBP reference database path argument given."
exit 1
else
refSeq_p2=$(echo $refSeq_path | sed 's/\/$//g')
refSeq="${refSeq_p2}"
echo "refSeq: $refSeq"
fi
Ref_1A=$(ls $refSeq/*1A*.faa)
Ref_2B=$(ls $refSeq/*2B*.faa)
Ref_2X=$(ls $refSeq/*2X*.faa)
if [[ -e "$Ref_1A" && -e "$Ref_2X" && -e "$Ref_2B" ]]
then
echo "The PBP 1A reference database is: $Ref_1A"
echo "The PBP 2B reference database is: $Ref_2B"
echo "The PBP 2X reference database is: $Ref_2X"
else
echo "At least one of the PBP reference databases are not in the correct format or don't exist."
echo "Make sure they are in the $refSeq directory and have the correct naming format."
echo "More specifically, each PBP sequence must contain '1A', '2B' or '2X' and have the '.faa' extension."
exit 1
fi
if [[ -z "$update_PBP" ]]
then
echo "A file containing the new PBP sequence info hasn't been given."
exit 1
else
if [[ -e "$update_PBP" ]]
then
update_PBP="${update_PBP}"
else
echo "The file containing the new PBP sequence info doesn't exist."
exit 1
fi
fi
if [[ -z "$table_out" ]]
then
echo "The argument giving the filename of the TABLE formatted typing output hasn't been given."
exit 1
else
if [[ -e "$table_out" ]]
then
table_out="${table_out}"
else
echo "The argument giving the filename of the TABLE formatted typing output doesn't exist."
exit 1
fi
fi
eval refSeq=$refSeq
eval pbp1A=$pbp1A
eval pbp2X=$pbp2X
eval update_PBP=$update_PBP
eval table_out=$table_out
###Subroutines###
blastTyper () {
pbpName=$(echo "$2" | sed 's/.*bLactam_\(.*\)-.*\.faa/\1/g')
SampleName=$(echo "$1" | awk -F"/" '{print $(NF-1)}')
typedPath=$(dirname "$5")
pbpAlleleID=""
#echo "1: $1 | 2: $2 | 3: $3 | 4: $4 | 5: $5 | 6: $6"
pbpStop="no"
if [[ ! -s "$1" ]]
then
printf "The following extracted PBP sequence file is empty:\n"
printf "$1\n"
pbpAlleleID="ERROR"
pbpStop="yes"
fi
#isBlastDB=$(ls "$refSeq"/"$pbpName"_prot_blast_db*)
isBlastDB=$(ls "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB*)
if [[ "$pbpStop" == "no" ]]
then
if [[ -z "$isBlastDB" ]]
then
### Make blast database
#makeblastdb -in "$2" -dbtype prot -out "$refSeq"/"$pbpName"_prot_blast_db
#blastp -db "$refSeq"/"$pbpName"_prot_blast_db -query "$1" -outfmt 6 -out "$SampleName"_blast-out_"$pbpName".txt
makeblastdb -in "$2" -dbtype prot -out "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB
blastp -db "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB -query "$1" -outfmt 6 -out "$SampleName"_blast-out_"$pbpName".txt
else
#blastp -db "$refSeq"/"$pbpName"_prot_blast_db -query "$1" -outfmt 6 -out "$SampleName"_blast-out_"$pbpName".txt
blastp -db "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB -query "$1" -outfmt 6 -out "$SampleName"_blast-out_"$pbpName".txt
fi
cat "$SampleName"_blast-out_"$pbpName".txt | sort -r -n -k12,12 -k3,3 -k4,4 | sed -n 1p > prot_best_blast.txt
printf "\n"
cat prot_best_blast.txt
geneLen=$(cat "$1" | grep -v '>' | tr '\n' ' ' | sed 's/ //g' | wc -c)
blastIden=$(cat prot_best_blast.txt | awk -F"\t" '{print $3}')
blastLen=$(cat prot_best_blast.txt | awk -F"\t" '{print $4}')
blastName=$(cat prot_best_blast.txt | awk -F"\t" '{print $2}')
echo "Best blast identity is: $blastIden"
echo "Best blast length is: $blastLen"
echo "PBP gene length is: $geneLen"
if [[ $(echo "$blastIden == 100" | bc) -eq 1 && $(echo "$blastLen == $geneLen" | bc) -eq 1 ]]
then
pbpAlleleID=$(echo "$blastName" | sed 's/^\([0-9]\+\)||.*/\1/g')
echo "Found match in database. Sequence match is $pbpAlleleID."
else
echo "Didn't find match. Will add sequence to database."
###Modify header (including adding number ID)###
oldMaxID=$(cat "$2" | grep '>' | sed 's/>\([0-9]\+\)||.*/\1/g' | sort -n | tail -n1)
pbpAlleleID=$(echo "$oldMaxID + 1" | bc)
while read line
do
if [[ $line =~ ^\>.* ]]
then
echo $line | sed "s/>.*/>$pbpAlleleID||SPN_$pbpName/g" >> Single_newRef_Seq.faa
else
echo $line >> Single_newRef_Seq.faa
fi
done < "$1"
printf "New PBP Sequence\n"
cat Single_newRef_Seq.faa
###Output the new allele to Final_newRef_Seq.faa and add it to the database###
cat Single_newRef_Seq.faa >> "$2"
echo "Remaking blast database"
#rm "$refSeq"/"$pbpName"_prot_blast_db*
#makeblastdb -in "$2" -dbtype prot -out "$refSeq"/"$pbpName"_prot_blast_db
rm "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB*
makeblastdb -in "$2" -dbtype prot -out "$refSeq"/Blast_bLactam_"$pbpName"_prot_DB
rm Single_newRef_Seq.faa
fi
fi
###Add in the code to update the TABLE pbp output file with the pbp IDs for the new alleles###
while read line
do
lineName=$(echo "$line" | awk -F"\t" '{print $1}')
samplName=$(echo "$3" | sed 's/PBP_//g')
if [[ "$samplName" == "$lineName" ]]
then
if [[ "$4" == "1A" ]]
then
echo "New PBP 1A: $pbpAlleleID"
echo "$line" | awk -v var="$pbpAlleleID" -v OFS='\t' '{$12=var; print }' >> "$5"_PRE
elif [[ "$4" == "2B" ]]
then
echo "New PBP 2B: $pbpAlleleID"
echo "$line" | awk -v var="$pbpAlleleID" -v OFS='\t' '{$13=var; print }' >> "$5"_PRE
elif [[ "$4" == "2X" ]]
then
echo "New PBP 2X: $pbpAlleleID"
echo "$line" | awk -v var="$pbpAlleleID" -v OFS='\t' '{$14=var; print }' >> "$5"_PRE
fi
else
echo "$line" >> "$5"_PRE
fi
done < "$5"
rm "$5"
mv "$5"_PRE "$5"
###Remove Files###
rm prot_best_blast.txt
rm "$SampleName"_blast-out_"$pbpName".txt
}
###Start Doing Stuff###
cp "$table_out" "$table_out"_OLD
while read line
do
sampleID=$(echo "$line" | awk -F"\t" '{print $1}')
pbpPath=$(echo "$line" | awk -F"\t" '{print $2}')
pbpAllele=$(echo "$line" | awk -F"\t" '{print $3}')
#echo "sample: $sampleID | path: $pbpPath | allele: $pbpAllele"
if [[ "$pbpAllele" == "1A" ]]
then
blastTyper "$pbpPath" "$Ref_1A" "$sampleID" "$pbpAllele" "$table_out"
elif [[ "$pbpAllele" == "2B" ]]
then
blastTyper "$pbpPath" "$Ref_2B" "$sampleID" "$pbpAllele" "$table_out"
elif [[ "$pbpAllele" == "2X" ]]
then
blastTyper "$pbpPath" "$Ref_2X" "$sampleID" "$pbpAllele" "$table_out"
fi
done < "$update_PBP"