Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try to automatically convert from CIGARs v1.4 to v1.3 format #147

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 35 additions & 35 deletions src/bin/sga-astat.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def computeAStat(arrivalRate, len, n):
arrivalRate = 0

def usage():
print 'usage: sga-astat.py in.bam'
print 'Compute Myers\' a-statistic for a set of contigs using the read alignments in in.bam'
print 'usage: sga-astat.py [OPTIONS] *.bam'
print 'Compute Myers\' a-statistic for a set of contigs using the read alignments from at least one SAM/BAM file'
print 'Options:'
print ' -m=INT only compute a-stat for contigs at least INT bases in length'
print ' -b=INT use the longest INT contigs to perform the initial estimate'
Expand Down Expand Up @@ -62,53 +62,53 @@ def usage():
usage()
sys.exit(1)

if len(args) == 0:
print 'Error: a BAM file must be provided\n'
if not args:
print 'Error: at least one SAM/BAM file must be provided\n'
usage()
sys.exit(2)

bamFilename = args[0]

sys.stderr.write('Reading alignments from ' + bamFilename + '\n')
contigData = list()

# Read the contig names and lengths from the bam
bamFile = pysam.Samfile(bamFilename, "rb")

for (name, length) in zip(bamFile.references, bamFile.lengths):
t = name, length, 0
contigData.append(ContigData(name, length))
#print 'Name: ' + name
#print 'Length: ' + str(len)

# Read the alignments and populate the counts
totalReads = 0
sumReadLength = 0

last_ref_idx = -1
last_pos = -1

for alignment in bamFile:
for bamFilename in args:
sys.stderr.write('Reading alignments from ' + bamFilename + '\n')

# Read the contig names and lengths from the bam
bamFile = pysam.Samfile(bamFilename, "rb")

for (name, length) in zip(bamFile.references, bamFile.lengths):
t = name, length, 0
contigData.append(ContigData(name, length))
#print 'Name: ' + name
#print 'Length: ' + str(len)

# Read the alignments and populate the counts
for alignment in bamFile:

if alignment.is_unmapped:
continue

if alignment.is_unmapped:
continue
ref_idx = alignment.rname
ref_name = bamFile.getrname(ref_idx)

ref_idx = alignment.rname
ref_name = bamFile.getrname(ref_idx)
pos = alignment.pos

pos = alignment.pos
if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates:
continue

if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates:
continue
# Update the count
cd = contigData[ref_idx]
cd.n += 1
totalReads += 1
sumReadLength += alignment.rlen
assert(cd.name == ref_name)
last_ref_idx = ref_idx
last_pos = pos

# Update the count
cd = contigData[ref_idx]
cd.n += 1
totalReads += 1
sumReadLength += alignment.rlen
assert(cd.name == ref_name)
last_ref_idx = ref_idx
last_pos = pos
bamFile.close()

avgReadLen = sumReadLength / totalReads
contigData.sort(key=lambda c : c.len, reverse=True)
Expand Down
23 changes: 20 additions & 3 deletions src/bin/sga-bam2de.pl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,24 @@
my $cmd;

# fix-mate info
$cmd = "abyss-fixmate -h $prefix.tmp.hist $bamFile | samtools view -Sb - > $prefix.diffcontigs.bam";
# use reformat.sh from BBmap bundle to convert from v1.4 CIGAR strings into v1.3, alternatively one could have used RTG-CORE (unlike RTG-TOOLS which is the stripped version lacking sammerge feature)
my $mypath = `which reformat.sh`;
my $cigar_convertor_cmd = "";
my $rtg_core_distdir = "";
if ($mypath ne '')
{
$cigar_convertor_cmd = "reformat.sh sam=1.3 in=stdin.sam out=stdout.sam |";
}
else
{
$mypath = `which rtg`;
if ($mypath ne '')
{
$rtg_core_distdir = `dirname $mypath`;
$cigar_convertor_cmd = "java -Djava.library.path=${rtg_core_distdir} -XX:ErrorFile=./hs_err_pid'$$'.log -Dreferences.dir=${rtg_core_distdir}/references -Dmodels.dir=${rtg_core_distdir}/models -Dtalkback=false -Xmx38g -jar ${rtg_core_distdir}/RTG.jar sammerge ${bamFile} --legacy-cigars --exclude-duplicates --exclude-unmapped --exclude-unplaced --output=- --no-gzip |";
}
}
$cmd = "samtools view -h -F 0x800 -F 0x200 -F 0x400 -F 0x100 $bamFile | $cigar_convertor_cmd abyss-fixmate -h $prefix.tmp.hist | samtools view -h -Sb -o $prefix.diffcontigs.bam";
runCmd($cmd);

# DistanceEst is very slow when the learned insert size distribution is very wide
Expand Down Expand Up @@ -88,7 +105,7 @@ sub runCmd
{
my($c) = @_;
print "$c\n";
system($c);
system($c) and warn "Error: ", chomp($c), " failed with: $!\n" and die;
}

# Check whether the programs are found and executable
Expand All @@ -97,7 +114,7 @@ sub checkDependency
while(my $program = shift) {
my $ret = system("/bin/bash -c \"hash $program\"");
if($ret != 0) {
print STDERR "Could not find program $program. Please install it or update your PATH.\n";
print STDERR "Could not find program $program: $!. Please install it or update your PATH.\n";
print STDERR "Return: $ret\n";
exit(1);
}
Expand Down