forked from Aayush-Ankit/traNNsformers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
master_run_cnn_Prunemode2_0_pruneSlowdown.m
executable file
·211 lines (164 loc) · 7.66 KB
/
master_run_cnn_Prunemode2_0_pruneSlowdown.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
%% Train CNN network on MNIST dataset
% Driver script: trains a CNN on MNIST with pruning + clustering
% (traNNsformer flow, prunemode 2) and logs accuracy and hardware
% estimates to a per-run trace file.
% NOTE: 'clearvars' replaces the discouraged 'clear all', which also
% purges compiled functions/MEX from memory and slows repeated runs
% (MATLAB Code Analyzer flags 'clear all' in scripts).
clearvars; close all; clc
% sprintf template for the dataset .mat file; %s is the dataset name
dataset_path = '/home/min/a/tibrayev/RESEARCH/traNNsformer/datasets/%s_uint8.mat';
% sprintf template for the per-dataset output directory
dirspec = 'CNNoutputs/softwarePerspective_AccuracyVsTrainingEffortAnalysis/%s';
% Reference epoch budget for MNIST; used below to scale down the prune
% rate when training for more epochs than the reference run.
epochs_mnist = 40;
%% MNIST
data_name = 'mnist';
epochs = 60;
% Slow pruning proportionally so total pruning over 'epochs' epochs
% matches the reference 40-epoch schedule.
prune_slowdown = epochs_mnist / epochs;
cnn = struct();
dataset_pathid = sprintf(dataset_path, data_name);
mkdir (sprintf (dirspec, data_name));
% run prunemode = 0 - No pruning, just clean training of NN
% prunemode = 0;
% cnn = run_cnn_addPrune(data_name, dataset_pathid, cnn, epochs, prune_slowdown, prunemode);
% run prunemode = 1 - pruning only
% prunemode = 1;
% run prunemode = 2 - pruning and clustering
prunemode = 2;
% Global file handle used throughout for trace logging (for debugging)
global fid;
%% Load the trained CNN if already trained
%load 'cnn_99.14.mat'
%% Load paths
% path if running on windows
% addpath(genpath('U:/AA/AproxSNN-ControlledSparsity/Matlab/'));
% path if running on linux
addpath(genpath('CNN_withPrune/'));
%% Load data
% Legacy RNG seeding kept intentionally for reproducibility with the
% original runs: rand('state', ...) selects the legacy generator, and
% switching to rng(0) would change the random stream and therefore the
% trained weights.
rand('state', 0);
load (dataset_pathid);
% Images arrive transposed as (pixels x images); reshape to 28x28xN and
% normalize uint8 [0,255] to double [0,1]. Using [] lets MATLAB infer
% the image count instead of hard-coding 60000/10000, so subsampled
% dataset variants load without edits.
train_x = double(reshape(train_x', 28, 28, [])) / 255;
train_y = double(train_y');
test_x = double(reshape(test_x', 28, 28, [])) / 255;
test_y = double(test_y');
% Control number of testing images
% train_x = train_x(:,:,(1:10000));
% train_y = train_y(:,(1:10000));
%test_x = test_x(:,:,(1:10000));
%test_y = test_y(:,(1:10000));
%% Initialize net
% Type of layers:
% 'i' for input,
% 'c' for convolutional,
% 's' for scaling/pooling (subsampling),
% 'f' for fully connected
cnn.layers = {
struct('type', 'i') %input layer (28x28 MNIST images)
struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
struct('type', 's', 'scale', 2) %sub sampling layer
struct('type', 'c', 'outputmaps', 64, 'kernelsize', 5) %convolution layer
struct('type', 's', 'scale', 2) %subsampling layer
struct('type', 'f', 'size', 1200)
struct('type', 'f', 'size', 1200)
struct('type', 'f', 'size', size(test_y, 1)) %output layer: one unit per class row of test_y
};
% Allocate weights/maps for the layer list (project helper).
cnn = cnnsetup_addPrune(cnn, train_x, train_y);
% Set the activation function to be a ReLU
cnn.act_fun = @(inp)max(0, inp);
% Set the derivative to be the binary derivative of a ReLU
cnn.d_act_fun = @(forward_act)double(forward_act>0);
%% ReLU settings
% Set up learning constants
opts.alpha = 0.7; % learning rate
opts.momentum = 0.5; % momentum
opts.batchsize = 400; % batchsize
opts.numepochs = epochs; % epochs
opts.learn_bias = 0; % bias
opts.dropout = 0.0; % NOTE(review): cnn.dropoutFraction below is 0.5 - confirm in the trainer which of the two is actually read
cnn.first_layer_dropout = 0;
% NEW
cnn.scaling_learningRate = 1; % presumably a per-epoch learning-rate scale (1 = no decay) - confirm in cnntrain_addPrune_1
cnn.weightPenaltyL2 = 0; % L2 regularization disabled
cnn.nonSparsityPenalty = 0; % activation-sparsity penalty disabled
cnn.sparsityTarget = 0.05; % only relevant if nonSparsityPenalty > 0
cnn.inputZeroMaskedFraction = 0;
cnn.dropoutFraction = 0.5;
cnn.testing = 0;
%% TraNNsformer constants
% TRANNSFORMER
% Epoch when the clustering map is created (clustering needs to start
% from a somewhat pruned connectivity map).
cnn.clusterstartepoch = 0.2* opts.numepochs;
cnn.prunemode = prunemode;
% Per-layer prune rates; prune_slowdown is an external parameter that
% compensates for training longer than the reference 40-epoch schedule.
cnn.scaling_pruneRate = prune_slowdown * 0.001*[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5];
% Per-layer crossbar utilization thresholds (expressed as fractions).
cnn.utilth = 0.01*[70 70 70 70 70 70 70 70];
cnn.crossbarSize = 64;
cnn.tol = 0.05; % delta_unclustered synapses when pruning should be stopped
cnn.cluster_base_quality_max = 0.7;
cnn.cluster_base_quality_min = 0.3;
% BUG FIX: removed the dead assignment 'cnn.cluster_prune_start = 0;'
% that was immediately overwritten by the line below.
cnn.cluster_prune_start = 0.8* opts.numepochs;
cnn.scale_clusterpruneRate = [0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01];
% Sanity-check that every per-layer vector covers all cnn.n layers.
assert(size(cnn.scaling_pruneRate, 2) == cnn.n, 'Size of scaling_pruneRate variable does not match with number of layers')
assert(size(cnn.utilth, 2) == cnn.n, 'Size of utilization variable does not match with number of layers')
assert(size(cnn.scale_clusterpruneRate, 2) == cnn.n, 'Size of scale_clusterpruneRate variable does not match with number of layers')
%% Create record file
kernel1 = cnn.layers{2}.outputmaps;
kernel2 = cnn.layers{4}.outputmaps;
% Locate the first fully-connected layer and count the FC layers; the
% plotting/statistics code later iterates over exactly these layers.
cnn.FCcounter = 0;
cnn.firstFClayerIndex = 0;
for i = 1:numel(cnn.layers)
    % strcmp instead of == so the comparison stays valid even if a
    % multi-character type tag is ever introduced.
    if (strcmp(cnn.layers{i}.type, 'f'))
        if (cnn.FCcounter == 0)
            cnn.firstFClayerIndex = i;
        end
        cnn.FCcounter = cnn.FCcounter + 1;
    end
end
% CONSISTENCY FIX: reuse dirspec (defined at the top) rather than
% duplicating the hard-coded output directory string.
outdir = sprintf(dirspec, data_name);
fid = fopen([outdir sprintf('/trace_prunemode%d_numlayers%d_with%dFClayers_numfilters%dand%d_learningRate%2.2f_xbarutilmin%0.2f',...
    prunemode, cnn.n-1, cnn.FCcounter, kernel1, kernel2, opts.alpha, cnn.cluster_base_quality_min) '.txt'],'w');
% Record the dataset and the prune parameters at the top of the trace.
fprintf(fid, [dataset_pathid, '\n']);
fprintf(fid, ['************Prune parameters:************', '\n']);
fprintf(fid, ['Prune slowdown: ', num2str(prune_slowdown), '\n']);
if (prunemode == 2)
    fprintf(fid, ['Cluster start epoch: ', num2str(cnn.clusterstartepoch), '\n']);
    fprintf(fid, ['Cluster prune start epoch: ', num2str(cnn.cluster_prune_start), '\n']);
end
fprintf(fid, ['***************************************************', '\n']);
%% Initial Hardware estimations
% TRANNSFORMER
% Estimate hardware cost (number of mPEs) of the untransformed network
% (mode 0 = no pruning); the second return value is unused here.
[num_mpe, ~] = get_hardware_params_cnn(cnn, 0);
fprintf(fid, 'Number of mPEs needed before trannsformation: %d\n', num_mpe);
%% Train
% Train - takes about 199 seconds per epoch on my machine - (for 16,16 conv layers)
cnn = cnntrain_addPrune_1(cnn, train_x, train_y, opts, test_x, test_y);
%% Test
fprintf(fid, ['***************************************************', '\n']);
% Evaluate on both the training and the test set; er is the error
% fraction, so accuracy is (1-er)*100.
[er, train_bad] = cnntest_addPrune(cnn, train_x, train_y);
fprintf(fid,'TRAINING Accuracy: %2.2f%%.\n', (1-er)*100);
[er, bad] = cnntest_addPrune(cnn, test_x, test_y);
fprintf(fid,'TEST Accuracy: %2.2f%%.\n', (1-er)*100);
fprintf(fid, ['***************************************************', '\n']);
% Repeat the prune parameters at the end of the trace for convenience.
fprintf(fid, ['************Prune parameters:************', '\n']);
fprintf(fid, ['Prune slowdown: ', num2str(prune_slowdown), '\n']);
if (prunemode == 2)
fprintf(fid, ['Cluster start epoch: ', num2str(cnn.clusterstartepoch), '\n']);
fprintf(fid, ['Cluster prune start epoch: ', num2str(cnn.cluster_prune_start), '\n']);
end
fprintf(fid, ['***************************************************', '\n']);
%% Plot the cluster quality histograms after SCIC & pruned fractions
if (cnn.prunemode == 2)
if_hist = 1;
% find the updated clustering statistics & plot them (histograms)
fig = figure(1);
p = 1;
% One subplot per fully-connected layer, from the first FC layer
% through the output layer.
for i = (cnn.firstFClayerIndex) : (cnn.n)
% Percentage of surviving (nonzero) entries in this layer's map;
% the trace reports the complement as the pruned percentage.
prunestats = 100* sum(sum(cnn.map{i}))/(size(cnn.map{i},1) * size(cnn.map{i},2));
fprintf(fid, 'Pruned percentage of Layer %d: %2.2f%%.\n', i, 100-prunestats);
subplot(1,cnn.FCcounter,p)
p = p + 1;
% final connectivity matrix - logical OR of cmap and pmap
% NOTE(review): exact semantics of cmap/pmap are defined in the
% CNN_withPrune helpers - confirm there.
conn_matrix = logical(cnn.cmap{i}) | logical(cnn.pmap{i});
cnn_analyse_cluster(cnn.clusters{i}, conn_matrix, if_hist);
end
saveas(fig, sprintf('CNNoutputs/softwarePerspective_AccuracyVsTrainingEffortAnalysis/%s/hist_prunemode%d_numlayers%d_with%dFClayers_xbarutilmin%0.2f.png',...
data_name, prunemode, (cnn.n-1), cnn.FCcounter, cnn.cluster_base_quality_min))
end
%% Saving Output Data
% Persist the trained network and training options next to the trace file.
save ([sprintf('CNNoutputs/softwarePerspective_AccuracyVsTrainingEffortAnalysis/%s/trace_prunemode%d_numlayers%d_with%dFClayers_numfilters%dand%d_learningRate%2.2f_xbarutilmin%0.2f',...
data_name, prunemode, (cnn.n-1), cnn.FCcounter, kernel1, kernel2, opts.alpha, cnn.cluster_base_quality_min) '.mat'],'cnn','opts');
%% Extract the hardware results
% Post-transformation hardware estimates: total mPEs and the mPEs spent
% on synapses that remained unclustered.
[num_mpe, ~, num_mpe_unclustered] = get_hardware_params_cnn(cnn, prunemode);
fprintf(fid, 'Training effort in terms of number of epochs: %d\n', epochs);
fprintf(fid, 'Total Number of mPEs needed after transformation: %d\n', num_mpe);
fprintf(fid, 'Number of mPEs needed for unclustered synapses: %d\n', num_mpe_unclustered);
fclose(fid);