-
Notifications
You must be signed in to change notification settings - Fork 1
/
statistical_analysis.py
258 lines (214 loc) · 8.94 KB
/
statistical_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#Statistical Analysis of NBA Referees
#Amanda Li, Connor Turkatte, Fernando De Stefanis, Steven Wei
import pandas
import math
import sqlite3
import csv
top_refs = ['Mark Lindsay',
'Tony Brothers',
'Lauren Holtkamp',
'Steve Anderson',
'Kevin Scott',
'Bill Kennedy',
'J.T. Orr',
'Nick Buchert',
'James Capers',
'Derek Richardson']
top_10 = ['Jason Phillips',
'Monty McCutchen',
'Scott Foster',
'Josh Tiven',
'Kane Fitzgerald',
'Eric Lewis',
'Marc Davis',
'Ken Mauer',
'Tom Washington',
'Tony Brothers']
top_teams = ['Warriors', 'Spurs', 'Cavaliers', 'Raptors',
'Thunder', 'Clippers', 'Hawks', 'Celtics']
bottom_refs = ['Bennie Adams',
'Gary Zielinski',
'David Jones',
'Josh Tiven',
'Tony Brown',
'Derrick Stafford',
'Leroy Richardson',
'Matt Boland',
'Mike Callahan',
'Tre Maddox']
star_players = ['Stephen Curry', 'Kawhi Leonard', 'Kevin Durant',
'Russell Westbrook', 'Kyle Lowry', 'Paul George', 'Carmelo Anthony',
'Dwyane Wade', 'LeBron James','Anthony Davis', 'James Harden',
'Giannis Antetokounmpo', 'Kyrie Irving', 'Jimmy Butler', 'Marc Gasol',
'Klay Thompson', 'LaMarcus Aldridge', 'Pau Gasol', 'John Wall']
def generate_ranking(filename = 'ranking.csv', num = 300):
'''
returns a ranking of the best referees in the NBA by ratio of
incorrect calls over total calls. It eliminates referees with
fewer than num calls on our dataset, with a preset value of
300. At the preset value, this excludes 9 out of 70
referees in total.
Inputs:
A filename corresponding to a file
containing the list of all referees in the dataset,
with the total number of calls, incorrect calls, incorrect
non calls, total incorrect calls, and incorrect to correct call
ratio.
An integer corresponding to the minimum number of calls we
want to have for the cutoff
'''
ranking = pandas.read_csv(filename)
sorted_ranking = ranking[ranking['total']>num].sort('percentage', ascending = True)
return sorted_ranking
def variation_coefficient(filename = 'ranking.csv', number = 300):
'''
Returns the standard deviation, mean and variation_coefficient
for the incorrect calls, the error ratio for a ranking without
cutoff, and the error ratio for a ranking with cutoff.
Note that only the latter two are used in the paper
'''
ranking = pandas.read_csv(filename)
stdev_calls = math.sqrt(ranking['totali'].var())
mean_calls = ranking['totali'].mean()
calls_variation_coefficient = stdev_calls/mean_calls
stdev_error = math.sqrt(ranking['percentage'].var())
mean_error = ranking['percentage'].mean()
error_variation_coefficient = stdev_error/mean_error
sorted_ranking = generate_ranking(num = number)
stdev_error_sort = math.sqrt(sorted_ranking['percentage'].var())
mean_error_sort = sorted_ranking['percentage'].mean()
error_variation_coefficient_sort = stdev_error_sort/mean_error_sort
print('The stats for calls are', stdev_calls, mean_calls, calls_variation_coefficient)
print('The stats for unsorted errors are', stdev_error, mean_error, error_variation_coefficient)
print('The stats for sorted errors are', stdev_error_sort, mean_error_sort, error_variation_coefficient_sort)
def analysis(filename = 'proportion_shooting.csv'):
'''
Returns the mean and standard deviation for a certain Statistic
for the first 10 and last 10 elements of a given csv file.
Very often throughout the paper we try to get information for the
same statistic for the top and bottom 10 (players, referees, etc)
'''
shots = pandas.read_csv(filename)
mean_top = shots[0:10]['Ratio'].mean()
stdev_top = math.sqrt(shots[0:10]['Ratio'].var())
mean_acc = shots[10:]['Ratio'].mean()
stdev_acc = math.sqrt(shots[10:]['Ratio'].var())
return mean_top, stdev_top, mean_acc, stdev_acc
def proportion_shooting(filename = 'NBARefs.db'):
'''
This function computes the proportion of shooting fouls
over total fouls amongst the calls by a given referee in our
top 10 by accuracy and top 10 by number of calls. It
generates a csv file that is a suitable input for analysis()
'''
con = sqlite3.connect(filename)
cur = con.cursor()
rv = [('Name', 'Total Fouls', 'Shooting Fouls', 'Ratio')]
for element in top_10:
total = cur.execute('select count(*) from calls join referees\
on calls.game_code=referees.game_code where referee_name = ?', [element])
total_no = total.fetchall()[0][0]
shooting = cur.execute('select count(*) from calls join referees\
on calls.game_code=referees.game_code where call_type = ? and referee_name = ?',
['Foul: Shooting', element])
shooting_no = shooting.fetchall()[0][0]
rv.append((element, total_no, shooting_no, shooting_no/total_no))
for element in top_refs:
total = cur.execute('select count(*) from calls join referees\
on calls.game_code=referees.game_code where referee_name = ?', [element])
total_no = total.fetchall()[0][0]
shooting = cur.execute('select count(*) from calls join referees\
on calls.game_code=referees.game_code where call_type = ? and referee_name = ?',
['Foul: Shooting', element])
shooting_no = shooting.fetchall()[0][0]
rv.append((element, total_no, shooting_no, shooting_no/total_no))
cur.close()
del cur
con.close()
f = open("top_teams.csv","w")
out = csv.writer(f, delimiter=',',quoting=csv.QUOTE_ALL)
for tup in rv:
out.writerow(tup)
f.close()
return rv
def better_teams(filename = 'NBARefs.db'):
'''
This function returns the proportion of games by top teams
officiated by referees in the top and bottom 10 by accuracy
lists. It also generates a csv file that is a suitable input
for the analysis() function
'''
con = sqlite3.connect(filename)
cur = con.cursor()
rv = [('Name', 'Games with Top Teams', 'Total Games with Top Teams', 'Ratio')]
for ref in top_refs:
ref_counter = 0
game_counter = 0
for team in top_teams:
referee = cur.execute('select count(distinct game_name) from calls join referees\
on calls.game_code=referees.game_code where referee_name = ? \
and home_team = ? or away_team = ?', [ref, team, team])
ref_counter += referee.fetchall()[0][0]
total = cur.execute('select count(distinct game_name) from calls join referees\
on calls.game_code=referees.game_code where home_team = ? or away_team = ?',
[team, team])
game_counter += total.fetchall()[0][0]
rv.append((ref, ref_counter, game_counter, ref_counter/game_counter))
for ref in bottom_refs:
ref_counter = 0
game_counter = 0
for team in top_teams:
referee = cur.execute('select count(distinct game_name) from calls join referees\
on calls.game_code=referees.game_code where referee_name = ? \
and home_team = ? or away_team = ?', [ref, team, team])
ref_counter += referee.fetchall()[0][0]
total = cur.execute('select count(distinct game_name) from calls join referees\
on calls.game_code=referees.game_code where home_team = ? or away_team = ?',
[team, team])
game_counter += total.fetchall()[0][0]
rv.append((ref, ref_counter, game_counter, ref_counter/game_counter))
cur.close()
del cur
con.close()
f = open("top_teams.csv","w")
out = csv.writer(f, delimiter=',',quoting=csv.QUOTE_ALL)
for tup in rv:
out.writerow(tup)
f.close()
return rv
def better_players(filename = 'NBARefs.db'):
'''
Returns the error ratios for calls involving the
star players
'''
con = sqlite3.connect(filename)
cur = con.cursor()
rv = [('Name', 'Total Incorrect Calls and Non-Calls', 'Total Calls', 'Ratio')]
for player in star_players:
totali = cur.execute('select count(*) from calls where (committing_player = \
? or disadvantaged_player = ?) and (call_accuracy\
= ? or call_accuracy = ?)', [player, player, 'INC', 'IC']);
total_inc = totali.fetchall()[0][0]
total = cur.execute('select count(*) from calls where committing_player = ?\
or disadvantaged_player = ?', [player, player])
total_calls = total.fetchall()[0][0]
rv.append((player, total_inc, total_calls, total_inc/total_calls))
cur.close()
del cur
con.close()
f = open("star_players.csv","w")
out = csv.writer(f, delimiter=',',quoting=csv.QUOTE_ALL)
for tup in rv:
out.writerow(tup)
f.close()
return rv
def analysis2(filename = 'star_players.csv'):
'''
This function is identical to analysis, but it
returns the statistics for the entire file instead of
dividing it into two categories.
'''
shots = pandas.read_csv(filename)
mean= shots['Ratio'].mean()
stdev = math.sqrt(shots['Ratio'].var())
return mean, stdev