-
Notifications
You must be signed in to change notification settings - Fork 2
/
utilityfunctions.py
173 lines (152 loc) · 6.53 KB
/
utilityfunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import numpy as np
import scipy.linalg as sp_la
import pandas as pd
import math
def getSummaryStatistics(data):
    """Summarize each column of the data.

    Returns a DataFrame whose rows are, in order: max, min, mean,
    variance (population variance, numpy's default ddof=0).
    """
    summary_rows = np.vstack([
        data.max(axis=0),
        data.min(axis=0),
        data.mean(axis=0),
        data.var(axis=0),
    ])
    return pd.DataFrame(summary_rows)
def getShapeType(data):
    """Return a (shape, dtype) pair describing the array."""
    shape = data.shape
    kind = data.dtype
    return (shape, kind)
def minmaxGlobal(data):
    """Global max-min normalization.

    Every column is scaled by 1 / (global max - global min) computed over
    the WHOLE array.  Note this only rescales; it does not translate the
    data into [0, 1].

    Fixes:
    - The matrix-multiply expression was mangled by a web scraper into an
      email-style token; restored to ``scaleTransform @ data.T``.
    - Guard against a zero global range (constant data), matching the
      behavior of minmaxLocal: the data is returned unscaled.
    """
    scaleTransform = np.eye(data.shape[1], data.shape[1])
    span = data.max() - data.min()
    if span != 0:
        # Same scale factor on every diagonal entry (global normalization).
        for i in range(data.shape[1]):
            scaleTransform[i, i] = 1 / span
    return (scaleTransform @ data.T).T
def minmaxLocal(data):
    """Per-column max-min normalization.

    Each column is scaled by 1 / (column max - column min); columns with
    zero range are left unscaled.  Note this only rescales; it does not
    translate the data into [0, 1].

    Fix: the matrix-multiply expression was mangled by a web scraper into
    an email-style token; restored to ``scaleTransform @ data.T``.
    """
    scaleTransform = np.eye(data.shape[1], data.shape[1])
    for i in range(data.shape[1]):
        span = data[:, i].max() - data[:, i].min()
        if span != 0:
            scaleTransform[i, i] = 1 / span
    return (scaleTransform @ data.T).T
def zScore(data):
    """z-score each column: subtract the column mean, divide by the
    column standard deviation (population std, ddof=0).

    Implemented with homogeneous coordinates: a column of ones is
    appended so that the translation (mean subtraction) can be expressed
    as a matrix multiply; the homogeneous column is dropped at the end.
    Columns with zero std are only centered, not scaled.

    Fix: the matrix-multiply chain was mangled by a web scraper into an
    email-style token; restored to
    ``scaleTransform @ translateTransform @ homogenizedData.T``.
    """
    # Append homogeneous ones column.
    homogenizedData = np.append(data, np.array([np.ones(data.shape[0], dtype=int)]).T, axis=1)
    # Translation matrix: last column holds -mean of each variable.
    translateTransform = np.eye(homogenizedData.shape[1])
    for i in range(homogenizedData.shape[1]):
        translateTransform[i, homogenizedData.shape[1]-1] = -homogenizedData[:, i].mean()
    # Diagonal scale: 1/std per column, 1 where std is zero.
    diagonal = [1 / homogenizedData[:, i].std() if homogenizedData[:, i].std() != 0 else 1 for i in range(homogenizedData.shape[1])]
    scaleTransform = np.eye(homogenizedData.shape[1]) * diagonal
    data = (scaleTransform @ translateTransform @ homogenizedData.T).T
    # Drop the homogeneous column before returning.
    return data[:, :data.shape[1]-1]
def fitlstsq(data, independent, dependent):
    """Fit a linear regression with scipy's least-squares solver.

    independent: array of column indices used as predictors.
    dependent: single column index used as the target.
    Returns the coefficient vector (intercept first).
    """
    # Pull out the predictor columns.
    predictors = data[np.ix_(np.arange(data.shape[0]), independent)]
    # Prepend a ones column so the first coefficient is the intercept.
    design = np.hstack((np.array([np.ones(predictors.shape[0])]).T, predictors))
    target = data[:, dependent]
    # lstsq also returns residues, rank, and singular values; we only keep the fit.
    coefficients, _, _, _ = sp_la.lstsq(design, target)
    return coefficients
def fitnorm(data, independent, dependent):
    """Fit a linear regression via the normal equation (A'A)c = A'y.

    independent: array of column indices used as predictors.
    dependent: single column index used as the target.
    Returns the coefficient vector (intercept first).

    Fix: replaced the explicit matrix inverse ``inv(A'A) @ A' @ y`` with
    ``solve(A'A, A'y)`` — solving the linear system directly is both
    faster and numerically more stable than forming the inverse.
    """
    # These are our independent variable(s)
    x = data[np.ix_(np.arange(data.shape[0]), independent)]
    # We add a column of 1s for the intercept
    A = np.hstack((np.array([np.ones(x.shape[0])]).T, x))
    # This is the dependent variable
    y = data[:, dependent]
    # Solve the normal equation instead of inverting the Gram matrix.
    c = sp_la.solve(A.T @ A, A.T @ y)
    return c
def fitqr(data, independent, dependent):
    """Fit a linear regression via QR decomposition: A = QR, then solve
    R c = Q'y by back substitution.

    independent: array of column indices used as predictors.
    dependent: single column index used as the target.
    Returns the coefficient vector (intercept first).

    Fixes:
    - Use ``mode='economic'``: the default full QR returns a rectangular
      (m x n) R whenever there are more rows than predictors, which makes
      ``solve_triangular`` fail; the economic form gives the square
      (n x n) R the back substitution needs.
    - Removed the leftover debug prints of the matrix shapes.
    """
    # These are our independent variable(s)
    x = data[np.ix_(np.arange(data.shape[0]), independent)]
    # We add a column of 1s for the intercept
    A = np.hstack((np.array([np.ones(x.shape[0])]).T, x))
    # This is the dependent variable
    y = data[:, dependent]
    # Economic QR: Q is (m x n), R is square (n x n) upper triangular.
    Q, R = sp_la.qr(A, mode='economic')
    c = sp_la.solve_triangular(R, Q.T.dot(y))
    return c
def gradient_descent(data, independent, dependent, lr, epochs):
    """Fit a linear regression by gradient descent on the MSE.

    independent: array of column indices used as predictors.
    dependent: single column index used as the target.
    lr: learning rate; epochs: maximum number of update steps.
    Returns the coefficient vector (intercept first).  Coefficients are
    initialized from a standard normal, so results vary slightly run to
    run; it stops early once the MSE drops below 1e-5.

    Fixes:
    - ``default_rng`` was an unresolved name (only ``np`` is imported);
      use ``np.random.default_rng()``.
    - The MSE was computed as ``(1/n) * sum(y - yhat)**2`` — squaring
      the SUM of signed residuals, which can be ~0 while the fit is bad,
      triggering a bogus early exit.  Now ``(1/n) * sum((y - yhat)**2)``.
    - The update was hard-coded to exactly two coefficients (and the
      slope gradient mixed up x with the design matrix).  The gradient
      is now the standard vectorized form ``(-2/n) * A'(y - yhat)``,
      which handles any number of independent variables.
    """
    rng = np.random.default_rng()
    # set n, x and y for readability of the method
    n = data.shape[0]
    x = data[np.ix_(np.arange(data.shape[0]), independent)]
    y = data[:, dependent]
    # Design matrix with a leading ones column for the intercept.
    A = np.hstack((np.array([np.ones(x.shape[0])]).T, x))
    # One coefficient per design column (intercept + each independent).
    c = rng.standard_normal(A.shape[1])
    for i in range(epochs):
        yhat = A @ c
        residual = y - yhat
        # how are we doing on MSE?
        mse = (1 / n) * np.sum(residual ** 2)
        print(mse)
        if mse < 0.00001:
            return c
        # Vectorized gradient of the MSE w.r.t. all coefficients at once.
        gradient = (-2 / n) * (A.T @ residual)
        # update c
        c = c - lr * gradient
    return c
def msse(y, yhat):
    """Mean sum of squared errors: (1/n) * sum((y - yhat)^2).

    Returns 0 (with a warning printed) if the two vectors differ in
    length.

    Fix: the original evaluated ``yhat()`` — calling the array as a
    function, which raises TypeError for any numpy input.  It now uses
    ``yhat`` as a value.
    """
    if len(y) != len(yhat):
        print("Need y and yhat to be the same length!")
        return 0
    return (1 / len(y)) * (((y - yhat)**2).sum())
def rsquared(y, yhat):
    """Coefficient of determination R^2 = 1 - SS_res / SS_tot.

    Returns 0 (with a warning printed) if the two vectors differ in
    length.
    """
    if len(y) != len(yhat):
        print("Need y and yhat to be the same length!")
        return 0
    ss_res = ((y - yhat)**2).sum()
    ss_tot = ((y - y.mean())**2).sum()
    return 1 - ss_res / ss_tot
def predict(data, independent, c):
    """Apply fitted regression coefficients c to new data.

    independent: array of column indices used as predictors (must match
    the indices the model was fit with).  c has the intercept first.
    """
    # Predictor columns from the new data.
    features = data[np.ix_(np.arange(data.shape[0]), independent)]
    # Same design-matrix construction as the fitters: leading ones column.
    design = np.hstack((np.array([np.ones(features.shape[0])]).T, features))
    return np.dot(design, c)
# Separate the label column from the feature columns.
def split(data, ycol):
    """Return (X, y): every column except ycol, and column ycol."""
    labels = data[:, ycol]
    before = data[:, :ycol]
    after = data[:, ycol + 1:]
    features = np.hstack((before, after))
    return (features, labels)
# Translate the data so every column has mean zero.
def center(data):
    """Subtract the per-column mean from each row."""
    return data - data.mean(axis=0)
# preprocess: optional normalization followed by centering
def preprocess(data, minmax=False, local=False, zscore=False):
    """Normalize and center the data.

    minmax and zscore are mutually exclusive; asking for both prints a
    warning and only centers.  zscore returns z-scored data directly
    (it already centers).  minmax applies global (default) or per-column
    (local=True) max-min scaling before centering.

    Fix: replaced the non-idiomatic ``== True`` comparisons with plain
    truthiness tests; behavior is unchanged.
    """
    if minmax and zscore:
        print("Nope, won't do that!")
        return center(data)
    elif zscore:
        return zScore(data)
    elif minmax:
        if not local:
            data = minmaxGlobal(data)
        else:
            data = minmaxLocal(data)
    return center(data)
# This is most of the code from Day 19 in one function; it fits a PCA and
# prints shape diagnostics along the way.
def pca_with_plots(data):
    """PCA via eigendecomposition of the covariance matrix.

    Assumes data is already centered (the covariance is computed as
    data'data / (n-1) without subtracting the mean — confirm callers
    center first).  Returns (eigenvalues, eigenvectors), both ordered by
    descending eigenvalue; eigenvectors are the columns.

    Fixes:
    - Removed dead code: the proportional-variance list and its
      cumulative sum were computed but never printed, plotted, or
      returned.
    - Corrected the misleading ``# svd`` comment — this uses
      ``np.linalg.eig``, not an SVD.
    """
    # Sample covariance (n-1 denominator); assumes pre-centered data.
    covariance_matrix = (data.T @ data) / (data.shape[0] - 1)
    print("covariance matrix")
    print(covariance_matrix.shape)
    # Eigendecomposition of the covariance matrix.
    (evals, evectors) = np.linalg.eig(covariance_matrix)
    # Sort descending by eigenvalue; reorder eigenvector columns to match.
    evals_order = np.argsort(evals)[::-1]
    evals_sorted = evals[evals_order]
    evectors_sorted = evectors[:, evals_order]
    return evals_sorted, evectors_sorted