-
Notifications
You must be signed in to change notification settings - Fork 2
/
ParsePhageGenomes.py
150 lines (117 loc) · 5.21 KB
/
ParsePhageGenomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# parse accession numbers of genomes with phages into txt file
import csv
fname = "phage_results_putonti.tsv"
#parse the info from phage results to an easier to use file
print("Parsing genome accessions with phages to file phageAccessions.txt...")
with open(fname) as f_in, open('phageAccessions.txt', 'w') as f_out:
rd = csv.reader(f_in, delimiter="\t", quotechar='"')
headers = next(rd)
for row in rd:
f_out.write(row[0]+','+row[1]+','+row[2]+'\n')
f_in.close(); f_out.close();
phageResultsFileName = 'phageCompareResults.csv'
print("Running phage analysis... (output will be in " + phageResultsFileName + ")")
#read the phage results, putting the information into a dictionary for each accession
accessionDict = {}
for line in open('phageAccessions.txt'):
accessionInfo = line.split(',')
accessionDict[accessionInfo[0]] = []
accessionDict[accessionInfo[0]].append(accessionInfo[1].split(" ")[0])
accessionDict[accessionInfo[0]].append(accessionInfo[2].strip())
#key = accession, value = [genus, phage #], RM system type will be appended later
#initialize a list for accessions with BLAST info: if it doesn't have BLAST info, it won't be used in the analysis
accessionsWithBlastInfo = []
#read all lines of the CSV (blast info) file, skipping the first line
for line in open('RMBlastCSV.csv').readlines()[1:]:
#extract the accession number from the file name
blastInfo = line.split(',')
#get the E value for the accession's hit
#if it is over 0.01, it will not be used
evalue = float(blastInfo[4])
if evalue <= 0.01:
accession = blastInfo[0]
accession = accession[:accession.index('_ASM')]
accessionsWithBlastInfo.append(accession)
#extract the enzyme type that was found
enzType = blastInfo[3]
accessionDict[accession].append(enzType) #RM system type is now in the information for each accession (also contains genus and # for phage)
#open an output file for the phage results
phageResults = open(phageResultsFileName,'w')
#initialize two dictionaries: one to holds the number of 'yes' for type and one that holds all for type
typeYes = {}
typeAll = {}
#go through each accession that was ran in the BLAST analysis, counting the number of each
#RM system type, and the number of that type that have phages as well
for accession in accessionsWithBlastInfo:
rmtype = accessionDict[accession][2]
hasPhage = (int(accessionDict[accession][1]) > 0)
#add to total of RM System type
if rmtype not in typeAll:
typeAll[rmtype] = 1
else:
typeAll[rmtype] += 1
#add to number that has phages if applicable
if hasPhage:
if rmtype not in typeYes:
typeYes[rmtype] = 1
else:
typeYes[rmtype] += 1
#write headers for the type information to be written
phageResults.write('RM System Type,Total # of System Type,# with Phage(s),% with Phage(s)\n')
for rmType in typeAll.keys():
phageResults.write(rmType + ',') #write the type name
phageResults.write(str(typeAll[rmType]) + ',') #write the total number of that type
#write the number of 'yes' values for that type
try:
phageResults.write(str(typeYes[rmType]) + ',')
except KeyError: #if it has no 'yes' values, write 0
typeYes[rmType] = 0
phageResults.write(str(typeYes[rmType]) + ',')
#write the % of type that have phage
phageResults.write(str(float((typeYes[rmType]/typeAll[rmType]))*100) + '\n')
phageResults.write('\n')
#for each type (already written to the file), analyze which genera containing that type also have phages
#are certain genera more likely to have or not have phages?
for type in typeYes:
#initialize dictionaries to hold the number of 'yes' for each genera and the total for each genera
genusYes = {}
genusTotal = {}
#write the type and headers to the file
phageResults.write(type + '\n')
phageResults.write('Genus,Total # of Genus,# with Phage,% with Phage\n')
#go through the accessions
for acc in accessionsWithBlastInfo:
accInfo = accessionDict[acc]
#extract the RM type that was found from that accession
#and check if it matches the current type
rmtype = accInfo[2]
if rmtype == type:
#if it matches the current type, extract the genus and check if it has a phage
genus = accInfo[0]
hasPhage = (int(accInfo[1]) > 0)
#add the genus to the total
if genus not in genusTotal:
genusTotal[genus] = 1
else:
genusTotal[genus] += 1
#add the genus to the number with phages, if applicable
if hasPhage:
if genus not in genusYes:
genusYes[genus] = 1
else:
genusYes[genus] += 1
#for each genus under the current type, write its info to the file
for genus in genusTotal.keys():
#write the genus
phageResults.write(genus + ',')
#write the total number of that genus with the RM system type
phageResults.write(str(genusTotal[genus]) + ',')
#write the total number of that genus with phage
try:
phageResults.write(str(genusYes[genus]) + ',')
except KeyError:
genusYes[genus] = 0
phageResults.write(str(genusYes[genus]) + ',')
#write the % of genus with phage
phageResults.write(str(float((genusYes[genus]/genusTotal[genus]))*100) + '\n')
phageResults.write('\n')