# preprocess_data.py (forked from wmcnicho/Neurips-Challenge-22)
import os
import json

import pandas as pd
import torch

def createDataset(filename: str):
    lessons_df = pd.read_csv(filename)

    # 1) Consider only rows of type "Checkin"
    # TODO: This is pretty aggressive cleaning; maybe other Types are useful
    checkin_df = lessons_df[lessons_df['Type'] == 'Checkin']

    # 2) Keep only the columns we need
    #    2: UserId, 5: IsCorrect, 9: ConstructId
    # .copy() avoids pandas' SettingWithCopyWarning on the assignment below
    simple_df = checkin_df.iloc[:, [2, 5, 9]].copy()  # [S, 3]

    # Substitute IsCorrect values of 0 with -1
    simple_df.loc[simple_df["IsCorrect"] == 0, "IsCorrect"] = -1  # [S, 3]

    # 3) Set the number of questions (Q) to the most questions answered by a
    #    single student, i.e. the count of the most frequent UserId.
    #    (value_counts() replaces the old scipy.stats.mode call, whose
    #    .count[0] indexing breaks on newer scipy versions.)
    num_of_questions = simple_df["UserId"].value_counts().max()  # Q
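
    # Hypothetical example of the Q computation above (made-up values): if the
    # UserId column were [7, 7, 7, 9, 9], value_counts() gives {7: 3, 9: 2},
    # so Q = 3.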
    # 4) Create dataset of shape [S, 2, Q]
    #    S: number of unique students
    #    2: features and label
    #    Q: most questions answered by a single student
    tot_construct_set = set()
    students = []
    for ctr, (user, user_info) in enumerate(simple_df.groupby('UserId')):
        if ctr % 100 == 0:
            print(ctr)
        # Lists of ConstructId and IsCorrect values
        # C: number of constructs answered by this user
        constructs = user_info["ConstructId"].values.tolist()  # [C]
        correct = user_info["IsCorrect"].values.tolist()  # [C]
        tot_construct_set.update(constructs)
        # Zero-pad so every student has exactly Q entries
        pad_needed = num_of_questions - len(constructs)  # P = Q - C
        constructs += [0] * pad_needed  # [Q]
        correct += [0] * pad_needed  # [Q]
        students.append(torch.tensor([constructs, correct]))  # [2, Q]
    # Stack once at the end instead of torch.cat-ing inside the loop,
    # which would be quadratic in the number of students
    result = torch.stack(students)  # [S, 2, Q]
    tot_construct_list = list(tot_construct_set)
    return result, tot_construct_list  # [S, 2, Q]
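
# Hypothetical illustration (made-up values, not from the real data): with
# three students whose longest Checkin sequence is Q = 4, createDataset
# returns a [3, 2, 4] tensor, e.g. for student 0:
#   result[0] == tensor([[101, 205, 317,   0],   # ConstructIds, zero-padded
#                        [  1,  -1,   1,   0]])  # IsCorrect (+1/-1), zero-padded
# tot_construct_list holds every unique ConstructId seen across all students.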

def main():
    use_main = 1  # 1 for the main data, anything else for the sample data
    if use_main == 1:
        data_path = 'data/Task_3_dataset/checkins_lessons_checkouts_training.csv'
        prefix_path = 'student_data'
    else:
        # The sample data is faster to process and better for debugging
        data_path = 'data/sample_data_lessons_small.csv'
        prefix_path = 'sample_student_data'

    student_data, tot_construct_list = createDataset(data_path)  # [S, 2, Q]
    print("num_of_constructs: ", len(tot_construct_list))

    transform_student_data = torch.transpose(student_data, 0, 2)  # [S, 2, Q] --> [Q, 2, S]
    torch.save(transform_student_data, os.path.join('serialized_torch', prefix_path + '_tensor.pt'))
    with open(os.path.join('serialized_torch', prefix_path + '_construct_list.json'), "w") as fp:
        json.dump(tot_construct_list, fp)

    features = transform_student_data[:, 0, :]  # [Q, S]
    labels = transform_student_data[:, 1, :]  # [Q, S]
    print('Features shape:', features.shape)
    print('Labels shape:', labels.shape)

if __name__ == "__main__":
    # loaded_file = torch.load('serialized_torch/student_data_tensor.pt')  # This is also an option
    main()
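
# Sketch of reading the serialized artifacts back in a downstream script
# (paths assume the main-data run above; swap in the 'sample_student_data'
# prefix for the sample run):
#
#   tensor = torch.load('serialized_torch/student_data_tensor.pt')  # [Q, 2, S]
#   with open('serialized_torch/student_data_construct_list.json') as fp:
#       construct_list = json.load(fp)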