Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
###############################################################################################################################
###################### Map CIM-QNN inputs/weights/outputs from Python to SystemVerilog/Hardware ###############################
###############################################################################################################################
import sys,os
import h5py
import numpy as np
import tensorflow as tf
from keras.models import load_model
from ctypes import c_uint32, c_uint64
from config.config_cim_cnn_param import*
from layers.binary_ops import binarize as binarize
from utils.config_hardware_model import SramInfo_charge as SramInfo
from chip_files.create_C_header import create_C_header
from chip_files.create_fpga_files import create_fpga_files
#################################################
########## Local variables definition ###########
#################################################
# Image side length, as given by the configuration
H = dim
# Computing precision (bit-widths) of inputs, weights and outputs
# NOTE(review): R_OUT is tied to IAres rather than OAres - confirm this is intended
R_IN, R_W, R_OUT = IAres, Wres, IAres
# Precision of the BN offset (beta) and gain (gamma) settings
R_BETA, R_GAMMA = r_beta, r_gamma
# Network length: one entry of C_IN_VEC per layer
Nlayers = len(C_IN_VEC)
# Flag for test files generation
#   1: per-layer output files exist, they are stored and compared
#   0: they do not exist, storage and comparison are skipped
OUT_EN = 0
# CIMU structure describing the hardware configuration
sramInfo = SramInfo(arch, tech, typeT, VDD, BBN, BBP, IAres, Wres, OAres, r_gamma, r_beta, Nrows, [0, 0])
# Small constant added to the analog-BN standard deviation before dividing by it
epsilon = 1e-8
###################################################
########## Get files to map from config ###########
###################################################
# Format arguments shared by the output, weight and inference file templates
_trained_net_args = (dataset_name, network_struct, cim_type, arch, IAres, Wres, OAres,
                     r_gamma, r_beta, Niter, EN_SCALE, ANALOG_BN, EN_NOISE)
in_file = path_to_out + in_file_template.format(dataset_name, cim_type, arch, IAres)
out_file = path_to_out + out_file_template.format(*_trained_net_args)
w_file = path_to_out + w_file_template.format(*_trained_net_args)
inference_file = path_to_out + inference_file_template.format(*_trained_net_args)
#################################################
########## Get files to store outputs ###########
#################################################
# Format arguments shared by every chip-side template except the inputs one
_chip_args = (dataset_name, network_struct, cim_type, arch, IAres, Wres, OAres, EN_NOISE)
file_out_inputs = path_to_chip + chip_in_template.format(dataset_name, network_struct, cim_type, arch, IAres)
file_out_outputs = path_to_chip + chip_out_template.format(*_chip_args)
file_out_inference = path_to_chip + chip_inference_template.format(*_chip_args)
file_out_weights = path_to_chip + chip_w_template.format(*_chip_args)
file_out_gamma = path_to_chip + chip_gamma_template.format(*_chip_args)
file_out_beta = path_to_chip + chip_beta_template.format(*_chip_args)
file_out_weights_FP = path_to_chip + chip_w_FP_template.format(*_chip_args)
file_out_gamma_FP = path_to_chip + chip_gamma_FP_template.format(*_chip_args)
file_out_beta_FP = path_to_chip + chip_beta_FP_template.format(*_chip_args)
####################################################
########## Define files for FPGA storage ###########
####################################################
# Memory-initialisation files, all located under path_to_fpga
file_fpga_inputs = path_to_fpga + 'inputs.mif'
file_fpga_weights_cim = path_to_fpga + 'weights_cim.mif'
file_fpga_beta = path_to_fpga + 'beta_cim.mif'
file_fpga_weights_FP = path_to_fpga + 'weights_FP.mif'
file_fpga_inf_res = path_to_fpga + 'inf_results.mif'
file_fpga_outputs = path_to_fpga + 'outputs.mif'
##########################################
########## Post-process data #############
##########################################
# // Transform inputs sub-set into 32b words for SRAM encoding //
C_IN = C_IN_VEC[0]
with open(in_file, "r") as f:
    # Get inputs, one row per saved image
    inputs = np.reshape(np.genfromtxt(f, delimiter=" "), (Nimg_save, -1))

# int_img is consumed unconditionally when the input file is written, so an
# unknown operation type is reported here instead of failing later on an
# undefined name.
if OP_TYPE not in ("FC", "CONV-1D", "CONV-2D"):
    raise ValueError("Operation mode not supported: {}".format(OP_TYPE))

if OP_TYPE == "FC":
    # Zero-pad every image up to the input width of the first layer, then flatten
    inputs = np.pad(inputs, ((0, 0), (0, C_IN_VEC[0] - np.shape(inputs)[-1])), mode="constant")
    inputs = inputs.flatten()

if OP_TYPE == "FC" and C_IN * R_IN < 32:
    # The whole input vector fits in a single, partially filled 32b word.
    # Each value occupies its own R_IN-bit field, consistent with the
    # full-word packing below (identical to a 1-bit stride when R_IN == 1).
    # NOTE(review): the reshape to (C_IN) only succeeds when the flattened
    # data holds exactly C_IN values, i.e. a single saved image.
    img_temp = np.reshape(inputs, (C_IN))[np.newaxis, :]
    int_img = np.dot(img_temp, 2 ** (R_IN * np.arange(C_IN)))
else:
    # Group 32//R_IN consecutive values per word, first value in the low bits
    img_temp = np.reshape(inputs, (-1, 32 // R_IN))
    int_img = np.dot(img_temp, 2 ** np.arange(0, 32, R_IN))
# // Transform outputs sub-set into 32b words for SRAM encoding //
# Only executed when per-layer output files are available (OUT_EN set).
if OUT_EN:
    outputs_list = []
    outputs_list_test = []
    for i in range(Nlayers):
        C_OUT = C_OUT_VEC[i]
        # Get outputs (only ADC outputs)
        with open(out_file + "_layer_{}.txt".format(i), "r") as f:
            outputs = np.genfromtxt(f, delimiter=" ")
        # Store raw outputs for FP test
        outputs_list_test.append(np.int32(outputs))
        # Reshaping depending upon operation type
        if OP_TYPE == "FC":
            if R_OUT * C_OUT <= 32:
                # All channels fit in one word: channel k sits at bit offset k*R_OUT.
                # The stride is applied once (it used to be applied twice,
                # which is only harmless when R_OUT == 1).
                int_dout = np.array([np.dot(outputs, 2 ** np.arange(0, R_OUT * C_OUT, R_OUT))])
            else:
                int_dout = np.dot(np.reshape(outputs, (-1, 32 // R_OUT)), 2 ** np.arange(0, 32, R_OUT))
        elif OP_TYPE == "CONV-1D":
            int_dout = np.dot(np.reshape(outputs, (-1, 32 // R_OUT)), 2 ** (R_OUT * np.arange(32 // R_OUT)))
        elif OP_TYPE == "CONV-2D":
            # Feature-map side length of layer i, as used by the reshape below
            fmap_side = H - 2 * (i + 1)
            # Swap rows and columns
            data_out = np.reshape(outputs, (-1, fmap_side, fmap_side, C_OUT))
            data_out = np.swapaxes(data_out, 1, 2)
            data_out = data_out.reshape(-1)
            # Pad with zeros when necessary to fill the last 32b word. The
            # count is expressed in values (not bits) and derived from the
            # actual flattened length, so it stays valid for R_OUT > 1.
            Npads = (-data_out.size) % (32 // R_OUT)
            data_out = np.pad(data_out, (0, Npads), mode="constant")
            # Encode into 32b words
            int_dout = np.dot(np.reshape(data_out, (-1, 32 // R_OUT)), 2 ** (R_OUT * np.arange(32 // R_OUT)))
        else:
            # Without this, int_dout would be undefined (or stale from the
            # previous layer) when appended below.
            raise ValueError("Operation type not supported: {}".format(OP_TYPE))
        # Add result to outputs list
        outputs_list.append(int_dout.astype("uint64"))
    for i in range(Nl_fp):
        # Get outputs of the full-precision layers, stored after the CIM ones
        with open(out_file + "_layer_{}.txt".format(Nlayers + i), "r") as f:
            outputs = np.genfromtxt(f, delimiter=" ")
        # Transform into signed 32b fixed-point values
        outputs = np.int32(np.round(outputs * (2 ** 16 - 1) * (2 ** 15) / fS_beta_fp / fS_gamma_fp))
        # Append result
        outputs_list.append(outputs)
# // Read the inference results from inference_file (written out later on) //
with open(inference_file, "r") as inference_fh:
    inf_results = np.genfromtxt(inference_fh, delimiter=" ")
# // Get weights for each layer and quantize them //
# Per-layer results are accumulated in these lists, in the order the layers
# appear in the weights file.
weights_list, weights_FP_list = [], []
gamma_list, beta_list = [], []
gamma_FP_list, beta_FP_list = [], []
c_in_vec, c_out_vec = [], []
Nlayers_cim = 0
Nlayers_fp = 0
with h5py.File(w_file, "r") as f:
    # Walk through all groups; the layer type is recognised from the group name
    for key in list(f.keys()):
        # CIM-QNN layer
        if any(tag in key for tag in ('cim_charge_conv2d', 'cim_charge_dense')):
            layer_group = f[key][key]
            param_names = list(layer_group.keys())
            w_data = layer_group[param_names[0]][()]
            # Binarize weights, then shift/scale as (b+1)/2 before the int32 cast
            # (presumably maps -1/+1 onto 0/1 - confirm against binarize)
            w_data = tf.cast((binarize(w_data, H=1.) + np.ones_like(w_data)) / 2, dtype="int32")
            # Collapse every leading dimension so that FC and CONV kernels
            # both become (inputs, output channels) matrices
            w_shape = tf.shape(w_data)
            if len(w_shape) > 1:
                w_data = tf.reshape(w_data, (-1, w_shape[-1]))
                w_shape = tf.shape(w_data)
            # Pad with zeros to reach the full Nrows x Ncols array size
            w_data = np.pad(w_data, ((0, Nrows - w_shape[0]), (0, Ncols - w_shape[1])))
            # Store layer dimensions (shape before padding)
            c_in_vec.append(w_shape[-2])
            c_out_vec.append(w_shape[-1])
            Nlayers_cim += 1
            # Flatten weights into 32b words, first bit of each group in the LSB
            weights_list.append(np.dot(np.reshape(w_data, (-1, 32)), 2 ** np.arange(32)))
        # Full-precision dense or conv layer
        elif any(tag in key for tag in ('dense', 'conv')):
            layer_group = f[key][key]
            param_names = list(layer_group.keys())
            w_data = layer_group[param_names[0]][()]
            # Scale floating-point weights and round them to signed int32
            w_data = np.int32(np.round(w_data * (2 ** 15) / fS_beta_fp))
            # Store weights as a column vector
            weights_FP_list.append(np.reshape(w_data, (-1, 1)))
            # Count one more FP layer
            Nlayers_fp += 1
        # Analog BN
        elif 'analog_bn' in key:
            layer_group = f[key][key]
            param_names = list(layer_group.keys())
            beta = layer_group[param_names[0]][()]
            gamma = layer_group[param_names[1]][()]
            # TODO: m_sigma is fixed to 1 for now; it may have to be read from
            # the file once training is updated
            m_sigma = 1
            mov_mean = layer_group[param_names[2]][()]
            mov_var = layer_group[param_names[3]][()]
            # // Retrieve hardware parameters //
            Vmax_beta = sramInfo.Vmax_beta
            Vlsb_beta = Vmax_beta / 2 ** (r_beta - 1)
            # // Equivalent gain computation //
            # Target standard deviation; the variance is normalised by its square
            sigma_goal = VDD / m_sigma
            # Custom renorm factor: a single gain shared by all columns
            sigma_DP_t = np.sqrt(np.mean(mov_var) / (sigma_goal * sigma_goal))
            # Equivalent gain, encoded as its rounded base-2 logarithm
            gamma_eq = gamma / (sigma_DP_t + epsilon)
            gamma_code = np.round(np.log2(gamma_eq))
            # // Equivalent offset computation //
            # Equivalent offset, encoded as a rounded number of beta LSBs
            beta_eq = beta / gamma_eq - mov_mean
            beta_code = np.round(beta_eq / Vlsb_beta)
            print(beta_code)
            # // Append gamma & beta configs (uint8, encoding done during C mapping)
            gamma_list.append(gamma_code.astype("uint8"))
            beta_list.append(beta_code.astype("uint8"))
        # Full-precision BN
        elif 'batch_normalization' in key:
            layer_group = f[key][key]
            param_names = list(layer_group.keys())
            beta = layer_group[param_names[0]][()]
            gamma = layer_group[param_names[1]][()]
            mov_mean = layer_group[param_names[2]][()]
            mov_var = layer_group[param_names[3]][()]
            # Equivalent gain and offset, so that BN reads gamma_eq*(x + beta_eq)
            gamma_eq = gamma / np.sqrt(mov_var)
            beta_eq = beta / gamma_eq - mov_mean
            # Scale the floating-point results and round them to signed int32
            beta_eq = np.int32(np.round(beta_eq / fS_beta_fp * (2 ** 15)))
            gamma_eq = np.int32(np.round(gamma_eq / fS_gamma_fp * (2 ** 16 - 1)))
            # Store results as column vectors
            gamma_FP_list.append(np.reshape(gamma_eq, (-1, 1)))
            beta_FP_list.append(np.reshape(beta_eq, (-1, 1)))
        # Nothing to do for activation/regularisation layers
# // Verify full-precision conversion gives expected output (converted to 64b) //
print("\n")
###################################################
########## Test FP output equivalence #############
###################################################
# Recomputes the full-precision layer from the integer-converted operands and
# compares it with the outputs read from file. Only possible when those
# outputs were stored (OUT_EN set).
if OUT_EN:
    print("--- Computing FP MAC equivalence ---")
    # Vectorised computation: MAC followed by BN, gamma*(mac + beta)
    # NOTE(review): weights come from the first FP layer while gamma/beta come
    # from the last one; these coincide only with a single FP layer - confirm.
    in_FP = np.reshape(outputs_list_test[-1], (Nimg_save, -1))
    temp_mac = np.dot(in_FP, np.reshape(weights_FP_list[0], (-1, classes)))
    actual_mac = np.int32(np.round(np.squeeze(gamma_FP_list[-1]) * (temp_mac + np.squeeze(beta_FP_list[-1]))))
    # Keep one row per image even when Nimg_save or classes equals 1, so the
    # 2-D indexing in the comparison loop below is always valid.
    actual_mac = np.reshape(actual_mac, (Nimg_save, -1))
    expected_mac = np.reshape(outputs_list[-1], (Nimg_save, -1))
    print(actual_mac); print(expected_mac)
    print(outputs_list_test[-1])
    # Detailed computation below: same operations, element by element, with
    # traces printed for the first image
    w_FP = np.reshape(weights_FP_list[0], (C_OUT_VEC[-1], classes))
    # Flattened so that indexing yields scalars rather than size-1 arrays
    gamma_FP = np.reshape(gamma_FP_list[0], -1)
    beta_FP = np.reshape(beta_FP_list[0], -1)
    mac_val = np.zeros((Nimg_save, classes), dtype="int32")
    for m in range(Nimg_save):
        # Perform MAC operations
        for i in range(C_OUT_VEC[-1]):
            # Fetch input
            act_val = in_FP[m][i]
            for j in range(classes):
                # Fetch weight
                w_val = w_FP[i][j]
                # MAC operation
                mac_val[m][j] = mac_val[m][j] + act_val * w_val
                if m == 0 and i < 8 and j == 0:
                    print('Input {} is {}'.format(j, act_val))
                    print('Weight {} is {}'.format(j, w_val))
                    print('DP {} at iter {} is {}'.format(j, i, mac_val[m][j]))
        # Print final DP value
        if m == 0:
            for j in range(classes):
                print('DP result {} is {}'.format(j, mac_val[m][j]))
        # Perform BN operations
        for j in range(classes):
            mac_val[m][j] = gamma_FP[j] * (mac_val[m][j] + beta_FP[j])
            if m == 0:
                print('BN result {} is {}'.format(j, mac_val[m][j]))
    # Compare vectorised and expected results, word by word
    count_error = 0
    n_out = np.shape(actual_mac)[-1]
    for i in range(Nimg_save):
        for j in range(n_out):
            diff = np.int32(actual_mac[i, j] - expected_mac[i, j])
            if expected_mac[i, j] == 0:
                # Relative error is undefined: any mismatch counts as an error
                perc_error = 0.0 if diff == 0 else np.inf
            else:
                perc_error = 100 * np.abs(diff / np.int32(expected_mac[i, j]))
            if perc_error > 1e-1:
                count_error += 1
                # Row-major index of the faulty word
                print("Error for FP computation {}: {} instead of {} ({:3f}%) !".format(n_out * i + j, hex(actual_mac[i, j]), hex(expected_mac[i, j]), perc_error))
    if count_error == 0:
        print('All results are correct to 0.1%, congrats !')
    else:
        print('There were {} word errors found !'.format(count_error))
else:
    print('Warning: output results not available after training, FP comparison bypassed')
####################################################
########## Store results to text files #############
####################################################
def _dump_hex(path, values):
    # Write the values to a text file in hexadecimal format
    np.savetxt(path, values, fmt='%x')


# Inputs
data_in = int_img.astype("uint32")
_dump_hex(file_out_inputs, data_in)
# Outputs: all layers concatenated for the FPGA files, plus one text file per
# layer (CIM and FP layers are written the same way)
if OUT_EN:
    cim_outputs = np.concatenate(outputs_list, axis=None).astype("uint64")
    for layer_idx, layer_out in enumerate(outputs_list):
        _dump_hex(file_out_outputs + '_layer_{}.txt'.format(layer_idx), layer_out.astype("uint64"))
# Inference results
_dump_hex(file_out_inference, np.array([inf_results]).astype("uint64"))
# CIM weights
weights_cim = np.concatenate(weights_list, axis=None).astype("uint64")
_dump_hex(file_out_weights + '.txt', weights_cim)
# Gamma file
gamma_cim = np.concatenate(gamma_list, axis=None)
_dump_hex(file_out_gamma + '.txt', gamma_cim)
# Beta file
beta_cim = np.concatenate(beta_list, axis=None)
_dump_hex(file_out_beta + '.txt', beta_cim)
# FP FC/CONV weights
weights_fp = np.concatenate(weights_FP_list, axis=None).astype("uint64")
_dump_hex(file_out_weights_FP + '.txt', weights_fp)
# FP BN weights
gamma_fp = np.concatenate(gamma_FP_list, axis=None).astype("uint64")
beta_fp = np.concatenate(beta_FP_list, axis=None).astype("uint64")
_dump_hex(file_out_gamma_FP + '.txt', gamma_fp)
_dump_hex(file_out_beta_FP + '.txt', beta_fp)
######################################################################
########## Generate final test files for on-chip testing #############
######################################################################
# // Parameters folding //
# Destination of the C header and of the FPGA memory files
filename_c = path_to_chip + './cim_config.h'
filename_fpga = [
    file_fpga_inputs,
    file_fpga_weights_cim,
    file_fpga_beta,
    file_fpga_weights_FP,
    file_fpga_inf_res,
    file_fpga_outputs,
]
# CNN info: number of CIM layers, number of FP layers, number of saved images
network_info = (Nlayers_cim, Nlayers_fp, Nimg_save)
# CIM dimensions
cim_dim = (Nrows, Ncols, Nimg_save)
# Dimensions/channels, precisions and timings
D_VEC = (dim, dim, c_in_vec, c_out_vec)
P_VEC = (R_IN, R_W, R_OUT, R_BETA, R_GAMMA)
T_VEC = (T_DP, T_PRE, T_MBIT, T_ADC)
# Data for FPGA; the concatenated layer outputs are only present when they
# were generated (OUT_EN set)
data_fpga = [data_in, weights_cim, beta_list, weights_fp, inf_results.astype("int32")]
if OUT_EN:
    data_fpga += [cim_outputs]
# // Generate C header file with hardware params //
create_C_header(filename_c, network_info, cim_dim, D_VEC, P_VEC, T_VEC, gamma_cim, beta_fp, gamma_fp)
# // Generate off-chip FPGA memory files //
create_fpga_files(filename_fpga, network_info, cim_dim, D_VEC, P_VEC, data_fpga)
# Completion banner
for banner_line in (
    '///////////////////////////////////////////////////////',
    '//////////////// FILES CONVERSION DONE ////////////////',
    '///////////////////////////////////////////////////////',
):
    print(banner_line)