-
Notifications
You must be signed in to change notification settings - Fork 0
/
iterative-assembler.py
executable file
·193 lines (162 loc) · 6.17 KB
/
iterative-assembler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
import os
import sys
import glob
import subprocess
from collections import Counter, defaultdict
from Bio.SeqIO.FastaIO import SimpleFastaParser
thisdir = os.path.dirname(os.path.realpath(__file__))
iterative_assembler_exe = os.path.join(thisdir, "iterative-assembler")
target, = sys.argv[1:]
iteration = max(int(fname.split("/")[-1].split(".")[0])
for fname in glob.glob("%s/*.contig.seq" % target))
K=40
END_LENGTH=100
with open("%s/%s.contig.seq" % (target, 0)) as inf:
seed = inf.read().strip()
with open("%s/%s.contig.seq" % (target, iteration)) as inf:
prev_contig = inf.read().strip()
prev_start = prev_contig[:K]
prev_end = prev_contig[-K:]
prev_start_count = 0
prev_end_count = 0
seqs = []
for fname in glob.glob("%s/*.%s.fasta" % (target, iteration)):
with open(fname) as inf:
for title, seq in SimpleFastaParser(inf):
seqs.append(seq)
if prev_start in seq:
prev_start_count += 1
if prev_end in seq:
prev_end_count += 1
def get_base(score_A, score_C, score_G, score_T):
total = score_A + score_C + score_G + score_T
if not total:
return None
top_base = 'A'
top_score = score_A
if score_C > top_score:
top_base = 'C'
top_score = score_C
if score_G > top_score:
top_base = 'G'
top_score = score_G
if score_T > top_score:
top_base = 'T'
top_score = score_T
# Very rough conversion from scores to the counts we were using before.
count = top_score / K / 2
# These rules could be a lot fancier. The basic principle is that we'd get
# the most accuracy by adding only the single most confident base each
# time, but if we did this we'd need a very large number of iterations.
# But if we're confident enough about the bases we're adding, though, we
# can add dozens in a single iteration, making it *much* faster.
#
# The main idea is that once enough reads have dropped off we should
# rescan.
if (count < 10 or
count < max(prev_start_count, prev_end_count) / 100):
sys.stderr.write(
"%s rejected (max(%s, %s)): %s / %s = %.2f\n" % (
target, prev_start_count, prev_end_count,
count, total, count/total))
return None
return top_base
# Each time through the loop we look at every read, and if it matches either
# the start or end of the contig we count the distribution of bases. We take
# the most common base, and keep going.
contig = prev_contig[END_LENGTH:-END_LENGTH]
if len(contig) < len(seed):
contig = seed
MISMATCH_PENALTY=.9
def score(seq, match_pos, direction):
# We want to give high scores for sequences that are a good match for our
# in-progress contig. This idea is that if we see:
#
# contig: ABCDEFGHIJK
# 1: FGHIJKL
# 2: FZHWXKM
# 3: YTUVSKN
# then L > M > N
if direction == 1 and False:
print(seq)
print("%s%s" % (
" " * (match_pos), contig))
print ("%s[%s]" % (
" " * (match_pos), " " * (K-2)))
elif direction == -1 and False:
print(contig)
print("%s%s" % (
" " * (len(contig) - K - match_pos), seq))
print ("%s[%s]" % (
" " * (len(contig) - K), " " * (K-2)))
# First, figure out how well seq matches contig.
# We know that seq[match_pos:match_pos+K] is
# if direction == 1: contig[:K]
# if direction == -1: contig[-K:]
assert(seq[match_pos:match_pos+K] ==
contig[:K] if direction == 1 else contig[-K:])
matches = K
mismatches = 0
# Now work away from the known match seeing how many other matches there
# are.
contig_index = K if direction == 1 else len(contig)-K-1
seq_index = match_pos + K if direction == 1 else match_pos - 1
while 0 <= contig_index < len(contig) and 0 <= seq_index < len(seq):
if False:
print("%s =? %s" % (contig[contig_index], seq[seq_index]))
if contig[contig_index] == seq[seq_index]:
matches += 1
else:
mismatches += 1
contig_index += direction
seq_index += direction
if False:
print(matches, mismatches, matches * MISMATCH_PENALTY**mismatches)
# How to go from #matches and #mismatches to a score? Nothing principled
# here, but the general idea is that more matches is good and more
# mismatches is bad. We don't want to go below zero, since even terrible
# evidence for a base is still not evidence against that base.
return matches * MISMATCH_PENALTY**mismatches
def run_subprocess(direction):
proc = subprocess.Popen([iterative_assembler_exe, direction, contig],
stdout=subprocess.PIPE,
stdin=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True)
outs, errs = proc.communicate(timeout=15, input="\n".join(seqs))
if proc.returncode != 0:
raise Exception("%s failed with exit code %s %r" % (
proc.returncode, (outs, errs)))
score_A, score_C, score_G, score_T = outs.strip().split()
return float(score_A), float(score_C), float(score_G), float(score_T)
need_next = True
need_prev = True
while need_next or need_prev:
print(contig)
start = contig[:K]
end = contig[-K:]
# We currently don't have any way of assembling around loops of >K, so if
# we get to a kmer we've seen before just bail.
if start in contig[1:]:
need_prev = False
if end in contig[:-1]:
need_next = False
if need_next:
next_base = get_base(*run_subprocess("n"))
if next_base:
contig = contig + next_base
else:
need_next = False
if need_prev:
prev_base = get_base(*run_subprocess("p"))
if prev_base:
contig = prev_base + contig
else:
need_prev = False
if len(contig) <= len(prev_contig):
out_prefix = "final_contig"
else:
out_prefix = "%s.contig" % (iteration + 1)
with open("%s/%s.seq" % (target, out_prefix), "w") as outf:
outf.write("%s\n" % contig)