step10_fig1_TPR_FPR.py
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from scipy.stats import norm
from scipy.stats import gaussian_kde as smooth
from tqdm import tqdm
from STAR_outliers_polishing_library import approximate_quantiles
from STAR_outliers_polishing_library import adjust_median_values

# source title: Outlier identification for skewed and/or
# heavy-tailed unimodal multivariate distributions
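# Method overview (editor's summary of the code below): each simulated
# dataset x is mapped to deviation scores W = compute_w(x); a Tukey g-and-h
# distribution is fitted to W via estimate_tukey_params; and points whose W
# exceeds the fitted distribution's 99.3rd percentile are flagged as outliers.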
def estimate_tukey_params(W, bound):
    """
    Purpose
    -------
    To estimate Tukey g-and-h parameters for the distribution of deviation scores

    Parameters
    ----------
    W: the distribution of deviation scores
    bound: the highest percentile on which to fit the Tukey parameters;
           100 - bound is the corresponding lowest percentile.

    Returns
    -------
    A: Tukey location parameter
    B: Tukey scale parameter
    g: Tukey skew parameter
    h: Tukey tail heaviness parameter

    Note that the Tukey distribution only needs to fit the right side of
    the W distribution, so values too close to the median are effectively
    ignored. Very few values are that close in truly continuous data, but
    most real data is not truly continuous and has an overabundance of them.
    """
    # quantile-based location and scale estimates; fall back to approximate
    # quantiles when ties make the sample percentiles degenerate
    Q_vec = np.percentile(W, [10, 25, 50, 75, 90])
    W_main = W[np.logical_and(W <= Q_vec[4], W >= Q_vec[0])]
    if len(np.unique(Q_vec)) != 5 or len(np.unique(W_main)) < 30:
        Q_vec = approximate_quantiles(W, [10, 25, 50, 75, 90])

    A = Q_vec[2]
    IQR = Q_vec[3] - Q_vec[1]
    SK = (Q_vec[4] + Q_vec[0] - 2*Q_vec[2])/(Q_vec[4] - Q_vec[0])
    T = (Q_vec[4] - Q_vec[0])/(Q_vec[3] - Q_vec[1])
    # regression approximation (coefficients from the source paper) for the
    # constant that converts the IQR into the Tukey scale parameter B
    phi = 0.6817766 + 0.0534282*SK + 0.1794771*T - 0.0059595*(T**2)
    B = (0.7413*IQR)/phi

    # skew parameter g from the log ratio of the upper and lower half-spreads
    Q_vec2 = np.percentile(W, [100 - bound, 25, 50, 75, bound])
    if len(np.unique(Q_vec2)) != 5 or len(np.unique(W_main)) < 30:
        Q_vec2 = approximate_quantiles(W, [100 - bound, 25, 50, 75, bound])
    zv = norm.ppf(bound/100, 0, 1)
    UHS = Q_vec2[4] - Q_vec2[2]
    LHS = Q_vec2[2] - Q_vec2[0]
    g = (1/zv)*np.log(UHS/LHS)

    # tail heaviness parameter h from the standardized deviation scores
    y = (W - A)/B
    Q_vec3 = np.percentile(y, [100 - bound, 25, 50, 75, bound])
    if len(np.unique(Q_vec3)) != 5 or len(np.unique(W_main)) < 30:
        Q_vec3 = approximate_quantiles(y, [100 - bound, 25, 50, 75, bound])
    Q_ratio = (Q_vec3[4]*Q_vec3[0])/(Q_vec3[4] + Q_vec3[0])
    h = (2/(zv**2))*np.log(-g*Q_ratio)
    if np.isnan(h):
        h = 0
    return((A, B, g, h))
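
# A minimal hedged sketch (added for illustration; not part of the original
# pipeline): sampling from a fitted Tukey g-and-h distribution by pushing
# standard normal draws z through T(z) = A + B*(1/g)*(exp(g*z) - 1)*exp(h*z**2/2).
# The small epsilon guards against division by zero when g is near 0,
# matching the inline expression used in the main loop below.
def sample_tukey_gh(A, B, g, h, size, seed=None):
    """Draw `size` Monte Carlo samples from the Tukey g-and-h distribution
    with location A, scale B, skew g, and tail heaviness h."""
    rng = np.random.default_rng(seed)
    z = rng.normal(0, 1, size)
    return A + B*(1/(g + 1E-10))*(np.exp((g + 1E-10)*z) - 1)*np.exp(h*(z**2)/2)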
def compute_w(x):
    """
    Purpose
    -------
    To convert x values into a statistic that quantifies
    deviation from the median relative to skew and tail heaviness

    Parameters
    ----------
    x: a numeric numpy array; the original
       dataset from which outliers need to be removed

    Returns
    -------
    W: a numeric numpy array; the distribution
       of deviation scores for the x values
    """
    Q_vec = np.percentile(x, [10, 25, 50, 75, 90])
    x_main = x[np.logical_and(x <= Q_vec[4], x >= Q_vec[0])]
    if len(np.unique(Q_vec)) != 5 or len(np.unique(x_main)) < 30:
        Q_vec = approximate_quantiles(x, [10, 25, 50, 75, 90])
    x2 = adjust_median_values(x, Q_vec)

    # adjusted standardized offset (ASO): distance from the median, scaled
    # separately above and below by the corresponding half of the IQR
    c = 0.7413
    ASO_high = (x2 - Q_vec[2])/(2*c*(Q_vec[3] - Q_vec[2]))
    ASO_low = (Q_vec[2] - x2)/(2*c*(Q_vec[2] - Q_vec[1]))
    ASO = np.zeros(len(x2))
    ASO[x2 >= Q_vec[2]] = ASO_high[x2 >= Q_vec[2]]
    ASO[x2 < Q_vec[2]] = ASO_low[x2 < Q_vec[2]]

    # probit transform of the rescaled ASO values; the epsilons keep the
    # argument strictly inside (0, 1)
    W = norm.ppf((ASO + 1E-10)/(np.min(ASO) + np.max(ASO) + 2E-10))
    return(W)
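
# A hedged end-to-end sketch (hypothetical helper, not called below): the full
# outlier test chains compute_w and estimate_tukey_params, then compares each
# deviation score against a high percentile of Monte Carlo samples from the
# fitted distribution. W is large for points far from the median on either
# side of x, so this single right-tail cutoff flags outliers in both tails.
def flag_outliers(x, bound = 99, pcutoff = 0.993, n_mc = 200000):
    """Return a boolean mask marking the values of x whose deviation scores
    exceed the fitted Tukey g-and-h percentile cutoff."""
    W = compute_w(x)
    A, B, g, h = estimate_tukey_params(W, bound)
    fitted_TGH = sample_tukey_gh(A, B, g, h, n_mc)
    return W > np.percentile(fitted_TGH, pcutoff*100)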
outlier_indices = pd.read_csv("skewed_data_outlier_indices.txt", delimiter = "\t").to_numpy()
fake_data = pd.read_csv("skewed_data.txt", delimiter = "\t").to_numpy()
prefix = "original_method_skewed_data"
if not os.path.exists(prefix):
    os.mkdir(prefix)
if not os.path.exists(prefix + "_untransformed"):
    os.mkdir(prefix + "_untransformed")

for bound in [99]:
    dataset_names, TPR, FPR, correct_cutoffs, cutoffs = np.arange(len(fake_data.T)), [], [], [], []
    for i in tqdm(range(len(dataset_names))):
        x, ind, name = (fake_data.T)[i], (outlier_indices.T)[i], dataset_names[i]
        W, pcutoff = compute_w(x), 0.993

        # fit the Tukey g-and-h distribution to the deviation scores, then
        # set the outlier threshold at its 99.3rd percentile, estimated from
        # 200000 Monte Carlo samples
        A, B, g, h = estimate_tukey_params(W, bound)
        z = np.random.normal(0, 1, 200000)
        fitted_TGH = A + B*(1/(g + 1E-10))*(np.exp((g + 1E-10)*z) - 1)*np.exp(h*(z**2)/2)
        cutoff = np.percentile(fitted_TGH, pcutoff*100)
        ind_pred = W > cutoff

        # true and false positive rates (percent) against the known labels;
        # the "correct" cutoff is the 99.3rd percentile of the inlier scores
        TPR.append(np.sum(ind_pred[ind == True])/np.sum(ind == True)*100)
        FPR.append(np.sum(ind_pred[ind == False])/np.sum(ind == False)*100)
        correct_cutoffs.append(np.percentile(W[ind == False], 99.3))
        cutoffs.append(cutoff)
        if bound == 99:
            # kernel density bandwidth heuristic: widen the bandwidth as the
            # number of distinct central values shrinks
            q5, q95 = np.percentile(x, [5, 95])
            x_main = x[np.logical_and(x <= q95, x >= q5)]
            bw_coef = 0.075
            if len(np.unique(x_main)) < 1000:
                bw_coef = 0.3
            if len(np.unique(x_main)) < 100:
                bw_coef = 0.5
            if len(np.unique(x_main)) < 30:
                bw_coef = 0.7

            # plot the W histogram (outliers oversampled 15x for visibility),
            # the smoothed fitted Tukey density, and the predicted cutoff
            delta = (np.percentile(W, 99) - np.percentile(W, 1))/2
            xlims = [np.percentile(W, 1) - delta, np.percentile(W, 99) + delta]
            range0 = np.linspace(xlims[0] - delta, xlims[1] + delta, np.max([250, int(len(W)/300)]))
            smooth_TGH = smooth(fitted_TGH, bw_method = bw_coef)(range0)
            plt.hist([W[ind == False], np.concatenate([W[ind]]*15)], bins = 75, density = True,
                     stacked = True, color = ['blue', 'red'], label = ["data", "outliers (enlarged 15x for emphasis)"],
                     edgecolor = 'black')
            plt.plot(range0, smooth_TGH, "c-", linewidth = 4, label = "fitted tukey distribution")
            plt.plot([cutoff, cutoff], [0, 1], "k-", label = "predicted outlier threshold")
            plt.legend(loc = "upper left", prop = {'size': 11})
            plt.xlabel("ASO-probit transformed value", fontsize = 14.5)
            plt.ylabel("probability density", fontsize = 14.5)
            plt.xticks(fontsize = 12.5)
            plt.yticks(fontsize = 12.5)
            plt.title("dataset" + str(name) + "_bound" + str(bound))
            plt.savefig(prefix + "/dataset" + str(name) + "_bound" + str(bound) + ".png")
            plt.clf()

            # also save a histogram of the raw, untransformed data
            plt.hist(x, bins = 75, color = 'blue', density = True, edgecolor = 'black')
            plt.savefig(prefix + "_untransformed/dataset" + str(name) + "_bound" + str(bound) + ".png")
            plt.clf()
    # save one TPR/FPR summary table per bound
    df = pd.DataFrame(np.array([dataset_names, TPR, FPR, correct_cutoffs, cutoffs]).T)
    df.columns = ["simulated dataset", "TPR", "FPR", "correct cutoff", "cutoff"]
    df.to_csv("original_method_skewed_data_" + str(bound) + ".txt", sep = "\t", header = True, index = False)
#-----------------------------------------------------------------------------------------------------------------------------
# For STAR_outliers: removed values appear as NaN in the cleaned dataset,
# so NaN positions are the predicted outlier indices
#-----------------------------------------------------------------------------------------------------------------------------
true_outliers = pd.read_csv("skewed_data_outlier_indices.txt", delimiter = "\t").to_numpy().T
pred_outliers = pd.read_csv("skewed_data_cleaned_data.txt", delimiter = "\t").to_numpy().T
pred_outliers = np.isnan(pred_outliers)
dataset_names, TPR, FPR = np.arange(len(true_outliers)), [], []
for inds, inds_pred in zip(true_outliers, pred_outliers):
    # report percentages to match the table produced above
    TPR.append(np.sum(inds_pred[inds == True])/np.sum(inds == True)*100)
    FPR.append(np.sum(inds_pred[inds == False])/np.sum(inds == False)*100)
df = pd.DataFrame(np.array([dataset_names, TPR, FPR]).T)
df.columns = ["simulated dataset", "TPR", "FPR"]
df.to_csv("STAR_efficacy_skewed.txt", sep = "\t", header = True, index = False)