-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval-buckets-sm.py
97 lines (68 loc) · 2.42 KB
/
eval-buckets-sm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import sys
import math
import statsmodels.api as sm
import dateutil.parser
def too_little_data(vals):
# Not worth running regression if we have too little data. Need to see
# at least five instances, and at least three times total.
return sum(vals) < 5 or vals.count(0) > len(vals)-3
def parse_metadata(wtp, metadata):
first_date = None
days = []
# file is sorted by days already
with open(metadata) as inf:
for line in inf:
fname, date, line_wtp, is_enriched = line.strip().split("\t")
# Metadata has both forward and reverse reads, but we just care
# about sample dates so ignore reverse reads
if not fname.endswith("_1.fastq.gz"): continue
# only looking at the specified wtp
if line_wtp != wtp: continue
# only considering unenriched data
if is_enriched != "0": continue
date = dateutil.parser.isoparse(date)
if first_date is None:
first_date = date
days.append([(date - first_date).days])
return days
def start(wtp, metadata):
days = parse_metadata(wtp, metadata)
for line in sys.stdin:
line = line.strip()
if not line: continue
bucket, *vals = line.split('\t')
vals = [int(x) for x in vals]
if too_little_data(vals):
continue
for i in range(10, len(vals)):
result = eval_bucket(vals[:i+1], days[:i+1])
if not result: continue
pvalue, coef, ci_025 = result
print("%.4e\t%.1f%%\t%.1f%%\t%s\t%s" % (
pvalue,
coef*100,
ci_025*100,
i,
bucket))
def eval_bucket(vals, days):
if too_little_data(vals):
return
model = sm.GLM(vals, sm.add_constant(days),
family=sm.families.Poisson())
try:
result = model.fit()
except ValueError:
print("ERROR\t%s" % bucket)
return
pvalue = result.pvalues[1]
if pvalue > 1e-5:
# Rough filtering to exclude most uninteresting output
return
coef = result.params[1]
ci_025, ci_975 = result.conf_int()[1]
# These are in log space; convert to percentage daily growth.
coef = math.exp(coef)-1
ci_025 = math.exp(ci_025)-1
return pvalue, coef, ci_025
if __name__ == "__main__":
start(*sys.argv[1:])