-
Notifications
You must be signed in to change notification settings - Fork 1
/
count_clades.py
executable file
·71 lines (60 loc) · 1.9 KB
/
count_clades.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
# input: kraken output
# output: tsv of taxid, assignments, hits
import re
import sys
from collections import Counter
parents = {} # child_taxid -> parent_taxid
with open("dashboard/nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
direct_assignments = Counter() # taxid -> direct assignments
direct_hits = Counter() # taxid -> direct hits
clade_assignments = Counter() # taxid -> clade assignments
clade_hits = Counter() # taxid -> clade hits
for lineno, line in enumerate(sys.stdin):
line = line.strip()
if not line:
continue
try:
_, _, name_and_taxid, _, encoded_hits = line.split("\t")
except ValueError:
raise Exception("Bad line #%d: %r" % (
lineno, line))
(taxid,) = re.findall("^.*[(]taxid ([0-9]+)[)]$", name_and_taxid)
taxid = int(taxid)
direct_assignments[taxid] += 1
while True:
clade_assignments[taxid] += 1
if taxid in [0, 1]:
break
taxid = parents[taxid]
direct_incremented = set()
clade_incremented = set()
for hit in re.findall("([0-9]+):", encoded_hits):
hit = int(hit)
if hit not in direct_incremented:
direct_hits[hit] += 1
direct_incremented.add(hit)
while hit not in clade_incremented:
clade_hits[hit] += 1
clade_incremented.add(hit)
if hit in [0, 1]:
break
hit = parents[hit]
for taxid in sorted(clade_hits):
print(
"%s\t%s\t%s\t%s\t%s"
% (
taxid,
direct_assignments[taxid],
direct_hits[taxid],
clade_assignments[taxid],
clade_hits[taxid],
)
)