-
Notifications
You must be signed in to change notification settings - Fork 0
/
iterative-assembler.sh
executable file
·58 lines (50 loc) · 1.6 KB
/
iterative-assembler.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/bash
# A greedy assembler that assembles a single contig from a large amount of data
# given a seed.
#
# 1. The contig starts as the seed.
# 2. Extract all reads that overlap with the current contig.
# 3. Extend the current contig by finding the most common value at each
# position.
#
# This is lazy in several ways, including that it doesn't attempt to handle
# read errors or variation. It just steamrolls ahead with the most common
# variant at every point.
#
# Since the main cost is the corpus scans, assemble multiple contigs in
# parallel.
# Usage: iterative-assembler.sh SEEDS_FILE [ACCESSION...]
#
# Setup: resolve the script directory and the seeds file to absolute paths
# before changing directory, then enter the ./iterative-assembly work
# directory (created on first run, reused on later runs).
if (( $# < 1 )); then
  printf 'Usage: %s SEEDS_FILE [ACCESSION...]\n' "${0##*/}" >&2
  exit 2
fi
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SEEDS=$(realpath -- "$1") || exit 1
if [[ ! -r "$SEEDS" ]]; then
  printf 'error: cannot read seeds file: %s\n' "$SEEDS" >&2
  exit 1
fi
shift
# All remaining arguments, joined with single spaces into one string; the
# scan step splits this string back into one accession per line.
ACCESSIONS="$*"
# -p keeps a re-run quiet when the work directory already exists.
mkdir -p iterative-assembly || exit 1
# Abort rather than scatter per-target directories into the caller's
# directory if the work directory cannot be entered.
cd iterative-assembly || exit 1
# Seed each target's working directory.
#
# Every line of the seeds file is read, after removing the first literal
# ".fasta", as three whitespace-separated fields: the target name, a second
# field this script does not use, and the seed sequence. A target that
# already has a directory is left untouched, so a re-run keeps whatever is
# already inside it.
#
# The "|| [[ -n ... ]]" clause keeps a final line that lacks a trailing
# newline from being dropped.
while read -r target _ seed || [[ -n "$target" ]]; do
  # Skip blank lines; there is no target to create.
  [[ -n "$target" ]] || continue
  if [[ ! -e "$target" ]]; then
    mkdir -- "$target"
    printf '%s\n' "$seed" > "$target/0.contig.seq"
  fi
done < <(sed -e 's/\.fasta//' "$SEEDS")
# Whitespace-separated list of target names: each seeds line truncated at
# its first literal ".fasta".
TARGETS=$(sed -e 's/\.fasta.*//' "$SEEDS")
# Main loop: alternate a scan over every accession with one assembly step
# per unfinished target, until every target has a final_contig.seq file.
while true; do
  echo "Scanning..."
  # Stream each accession from S3 through gunzip into the read extractor,
  # running up to 32 accessions at once.
  #
  # The accession, script directory and seeds path are handed to the child
  # shell as positional parameters ($1, $2, $3) instead of being spliced
  # into the command text, so spaces or shell metacharacters in any of them
  # cannot alter the command that runs.
  #
  # pipefail makes a failed download or decompression visible; it is
  # reported as a warning and the remaining accessions are still scanned.
  #
  # ACCESSIONS is a space-separated string, so it is split on purpose here.
  # shellcheck disable=SC2086
  printf '%s\n' $ACCESSIONS |
    xargs -P 32 -I {} bash -c '
      set -o pipefail
      aws s3 cp "s3://prjna729801/$1.arclean.fastq.gz" - |
        gunzip |
        "$2/extract-overlapping-reads.py" "$1" "$3" ||
        echo "warning: scan failed for accession $1" >&2
    ' _ {} "$SCRIPT_DIR" "$SEEDS"

  echo "Assembling..."
  should_stop=true
  # TARGETS is a whitespace-separated string, so it is split on purpose here.
  # shellcheck disable=SC2086
  for target in $TARGETS; do
    if [[ ! -e "$target/final_contig.seq" ]]; then
      should_stop=false
      # The clean-up step runs in the foreground and finishes before the
      # assembler for the same target is started in the background.
      "$SCRIPT_DIR/iterative-assembler-clean-up.py" "$target"
      "$SCRIPT_DIR/iterative-assembler.py" "$target" &
    fi
  done

  # We stop once every sequence has finished.
  # NOTE(review): the scan above still runs once more after the last target
  # finishes; confirm the extractor has no needed side effects before
  # moving this check ahead of the scan.
  if [[ "$should_stop" == true ]]; then
    break
  fi

  # Barrier: let every background assembler finish before the next scan.
  wait
done