# sw_to_chip.py

###############################################################################################################################
###################### Map CIM-QNN inputs/weights/outputs from Python to SystemVerilog/Hardware ###############################
###############################################################################################################################
import sys,os
import h5py
import numpy as np
import tensorflow as tf
from keras.models import load_model

from ctypes import c_uint32, c_uint64

from config.config_cim_cnn_param import*
from layers.binary_ops import binarize as binarize

from utils.config_hardware_model import SramInfo_charge as SramInfo

from chip_files.create_C_header import create_C_header
from chip_files.create_fpga_files import create_fpga_files

#################################################
########## Local variables definition ###########
#################################################
# Img dimension
H = dim
# Computing precision, copied from the configuration resolutions
R_IN = IAres
R_W = Wres
# NOTE(review): the output precision mirrors IAres, not OAres - confirm this is intended
R_OUT = IAres
R_BETA = r_beta
R_GAMMA = r_gamma
# Network length: one layer per entry of C_IN_VEC
Nlayers = len(C_IN_VEC)
# Flag for test files generation
# 1: output files per layer exist ; 0: they do not, prevent storage and comparison
OUT_EN = 0

# Create CIMU structure
sramInfo = SramInfo(
    arch, tech, typeT, VDD, BBN, BBP,
    IAres, Wres, OAres, r_gamma, r_beta,
    Nrows, [0, 0],
)
# Small constant added to the analog-BN gain denominator further below
epsilon = 1e-8

###################################################
########## Get files to map from config ###########
###################################################
# Format arguments shared by the output, weight and inference file templates
_train_fmt_args = (
    dataset_name, network_struct, cim_type, arch,
    IAres, Wres, OAres, r_gamma, r_beta,
    Niter, EN_SCALE, ANALOG_BN, EN_NOISE,
)
# The input file template only depends on the dataset and the input precision
in_file = path_to_out + in_file_template.format(dataset_name, cim_type, arch, IAres)
out_file = path_to_out + out_file_template.format(*_train_fmt_args)
w_file = path_to_out + w_file_template.format(*_train_fmt_args)
inference_file = path_to_out + inference_file_template.format(*_train_fmt_args)

#################################################
########## Get files to store outputs ###########
#################################################
# Format arguments shared by every chip template except the input one
_chip_fmt_args = (dataset_name, network_struct, cim_type, arch, IAres, Wres, OAres, EN_NOISE)

# Inputs only depend on the dataset, the network and the input precision
file_out_inputs = path_to_chip + chip_in_template.format(
    dataset_name, network_struct, cim_type, arch, IAres
)
# Per-layer outputs and inference results
file_out_outputs = path_to_chip + chip_out_template.format(*_chip_fmt_args)
file_out_inference = path_to_chip + chip_inference_template.format(*_chip_fmt_args)
# CIM layer parameters
file_out_weights = path_to_chip + chip_w_template.format(*_chip_fmt_args)
file_out_gamma = path_to_chip + chip_gamma_template.format(*_chip_fmt_args)
file_out_beta = path_to_chip + chip_beta_template.format(*_chip_fmt_args)
# Full-precision (FP) layer parameters
file_out_weights_FP = path_to_chip + chip_w_FP_template.format(*_chip_fmt_args)
file_out_gamma_FP = path_to_chip + chip_gamma_FP_template.format(*_chip_fmt_args)
file_out_beta_FP = path_to_chip + chip_beta_FP_template.format(*_chip_fmt_args)

####################################################
########## Define files for FPGA storage ###########
####################################################
# One memory-initialisation file per data set, all located under path_to_fpga
(
    file_fpga_inputs,
    file_fpga_weights_cim,
    file_fpga_beta,
    file_fpga_weights_FP,
    file_fpga_inf_res,
    file_fpga_outputs,
) = (
    path_to_fpga + mif_name
    for mif_name in (
        'inputs.mif',
        'weights_cim.mif',
        'beta_cim.mif',
        'weights_FP.mif',
        'inf_results.mif',
        'outputs.mif',
    )
)

##########################################
########## Post-process data #############
##########################################
# // Transform inputs sub-set into 32b words for SRAM encoding //
# Result: int_img, the inputs packed LSB-first into words, R_IN bits per sample.
C_IN = C_IN_VEC[0]
# Number of R_IN-bit samples that fit in one 32b word
_in_per_word = 32 // R_IN
with open(in_file, "r") as f:
    # Get inputs
    inputs = np.genfromtxt(f, delimiter=" ")
# One row per saved image
inputs = np.reshape(inputs, (Nimg_save, -1))
# Reshape depending upon the operation type
if OP_TYPE == "FC":
    # Zero-pad every image up to the channel count of the first layer
    inputs = np.pad(inputs, ((0, 0), (0, C_IN - np.shape(inputs)[-1])), mode="constant")
    inputs = inputs.flatten()
    if C_IN * R_IN < 32:
        # NOTE(review): this path reshapes to exactly C_IN samples and weights them
        # with 2**k (one bit per sample), so it only holds for a single saved image
        # with R_IN == 1 - confirm against the intended use.
        img_temp = np.reshape(inputs, (C_IN))[np.newaxis, :]
        int_img = np.dot(img_temp, 2 ** np.arange(C_IN))
    else:
        img_temp = np.reshape(inputs, (-1, _in_per_word))
        int_img = np.dot(img_temp, 2 ** np.arange(0, 32, R_IN))
elif OP_TYPE in ("CONV-1D", "CONV-2D"):
    # Both convolution modes use the same packing of the flat input stream
    img_temp = np.reshape(inputs, (-1, _in_per_word))
    int_img = np.dot(img_temp, 2 ** np.arange(0, 32, R_IN))
else:
    # Fail here with a clear message: int_img would otherwise stay undefined and
    # the script would stop much later with an unrelated NameError.
    raise ValueError(
        "Operation mode {!r} not supported (expected 'FC', 'CONV-1D' or 'CONV-2D')".format(OP_TYPE)
    )
    
# // Transform outputs sub-set into 32b words for SRAM encoding //
# Only runs when per-layer output files exist (OUT_EN). Fills:
#   outputs_list      : packed 32b words per CIM layer, then int32 FP-layer outputs
#   outputs_list_test : raw int32 CIM outputs, used by the FP equivalence test
if OUT_EN:
    outputs_list = []
    outputs_list_test = []
    # Number of R_OUT-bit samples per 32b word and the bit weight of each slot
    out_per_word = 32 // R_OUT
    word_weights = 2 ** (R_OUT * np.arange(out_per_word))
    for i in range(Nlayers):
        C_OUT = C_OUT_VEC[i]
        # Get outputs (only ADC outputs)
        with open(out_file + "_layer_{}.txt".format(i), "r") as f:
            outputs = np.genfromtxt(f, delimiter=" ")
        # Store raw outputs for FP test
        outputs_list_test.append(np.int32(outputs))
        # Reshaping depending upon operation type
        if OP_TYPE == "FC":
            if R_OUT * C_OUT <= 32:
                # All channels fit in one word: channel k sits at bit offset R_OUT*k.
                # (The exponent previously applied R_OUT twice, which misplaced
                # every channel but the first whenever R_OUT > 1.)
                int_dout = np.array([np.dot(outputs, 2 ** (R_OUT * np.arange(C_OUT)))])
            else:
                int_dout = np.dot(np.reshape(outputs, (-1, out_per_word)), word_weights)
        elif OP_TYPE == "CONV-1D":
            int_dout = np.dot(np.reshape(outputs, (-1, out_per_word)), word_weights)
        elif OP_TYPE == "CONV-2D":
            # Feature-map side length used for layer i by the original reshape
            side = H - 2 * (i + 1)
            # Swap rows and columns
            data_out = np.reshape(outputs, (-1, side, side, C_OUT))
            data_out = np.swapaxes(data_out, 1, 2)
            data_out = data_out.reshape(-1)
            # Pad with zeros when necessary to fill the last word. The pad is
            # counted in samples (not bits) so it is also valid for R_OUT > 1.
            Npads = (-data_out.size) % out_per_word
            data_out = np.pad(data_out, (0, Npads), mode="constant")
            # Encode into 32b words
            int_dout = np.dot(np.reshape(data_out, (-1, out_per_word)), word_weights)
        else:
            # int_dout would be undefined below, so stop with an explicit error
            raise ValueError("Operation type {!r} not supported".format(OP_TYPE))
        # Add result to outputs list
        outputs_list.append(int_dout.astype("uint64"))

    for i in range(Nl_fp):
        # Get outputs of the full-precision layers, stored after the CIM layers
        with open(out_file + "_layer_{}.txt".format(Nlayers + i), "r") as f:
            outputs = np.genfromtxt(f, delimiter=" ")
        # Transform into FP: same scaling as the int32 weights and BN gain
        outputs = np.int32(np.round(outputs * (2**16 - 1) * (2**15) / fS_beta_fp / fS_gamma_fp))
        # Append result
        outputs_list.append(outputs)
  
# // Read inference results from inference_file (they are written to the destination files further below) //
with open(inference_file,"r") as f:
  inf_results = np.genfromtxt(f,delimiter=" ");

# // Get weights for each layer and quantize them //
# Per-layer results collected while walking the HDF5 weight file:
#   weights_list / weights_FP_list : packed CIM weights / int32 full-precision weights
#   gamma_list / beta_list         : analog-BN gain and offset codes
#   gamma_FP_list / beta_FP_list   : int32 full-precision BN gain and offset
#   c_in_vec / c_out_vec           : weight-matrix dimensions of each CIM layer
weights_list = []; weights_FP_list = [];
gamma_list = []; beta_list = [];
gamma_FP_list = []; beta_FP_list = [];
c_in_vec = []; c_out_vec = [];
Nlayers_cim = 0;  Nlayers_fp = 0;
with h5py.File(w_file,"r") as f:
  # List all groups
  list_of_keys = list(f.keys())
  # print(list_of_keys)
  for key in list_of_keys:
    # // Different cases depending upon the layer type (key) //
    # The CIM test must come first: 'cim_charge_dense' also contains 'dense'
    # and would otherwise be caught by the full-precision branch.
    # CIM-QNN layer
    if(('cim_charge_conv2d' in key) or ('cim_charge_dense' in key)):
      dataset = f[key][key];
      local_keys = list(dataset.keys());
      # The first entry of the group is taken as the weight tensor
      w_data = dataset[local_keys[0]][()];
      # Binarize weights, then map through (x+1)/2
      # (presumably binarize returns +/-1, giving {0,1} bits - TODO confirm)
      w_data = tf.cast((binarize(w_data,H=1.)+np.ones_like(w_data))/2,dtype="int32");
      # Get weights shape (detect FC or CONV)
      w_shape = tf.shape(w_data);
      if(len(w_shape)>1):
        # Collapse every leading axis so that the last axis is kept as columns
        w_data    = tf.reshape(w_data,(-1,w_shape[-1]));
        w_shape   = tf.shape(w_data);
      # Pad with zeros to reach the full array size
      w_data = np.pad(w_data,((0,Nrows-w_shape[0]),(0,Ncols-w_shape[1])));
      # Store layer dimensions (taken before padding)
      # NOTE(review): w_shape comes from tf.shape, so these entries are TensorFlow
      # scalars rather than Python ints - confirm the consumers accept that.
      c_in_vec.append(w_shape[-2]); c_out_vec.append(w_shape[-1]); 
      Nlayers_cim += 1;
      
      # Flatten weights in 32b words: 32 consecutive bits per word, first bit as LSB
      int_weights = np.dot(np.reshape(w_data,(-1,32)),2**np.arange(32));
      # Store T_dp and weights to output file
      weights_list.append(int_weights);
    # Full-precision dense or conv layer
    elif(('dense' in key) or ('conv' in key)):
      dataset = f[key][key];
      local_keys = list(dataset.keys());
      w_data = dataset[local_keys[0]][()];
      # Transform floating-point weights into full-precision signed int32
      # (scaled by 2**15/fS_beta_fp, then rounded)
      w_data = np.round(w_data*(2**15)/fS_beta_fp);
      w_data = np.int32(w_data);
      # w_data = w_data*(2**15)/fS_beta_fp;
      # Store weights as a single column
      weights_FP_list.append(np.reshape(w_data,(-1,1)));
      # Count one more FP layer
      Nlayers_fp += 1;
    # Analog BN
    elif('analog_bn' in key):
      dataset = f[key][key];
      local_keys = list(dataset.keys());
      # NOTE(review): relies on the group listing its entries in the order
      # beta, gamma, moving mean, moving variance - confirm against the saved model.
      beta  = dataset[local_keys[0]][()];
      gamma = dataset[local_keys[1]][()];
      #m_sigma  = dataset[local_keys[2]][()]; # to be corrected with updated training, if necessary
      m_sigma = 1;
      mov_mean = dataset[local_keys[2]][()];
      mov_var  = dataset[local_keys[3]][()];
      
      # // Retrieve hardware parameters //
      Vmax_beta = sramInfo.Vmax_beta;
      # Offset LSB: Vmax_beta divided by 2**(r_beta-1)
      Vlsb_beta = Vmax_beta/2**(r_beta-1);
      
      # // Equivalent gain computation //
      # Target variance
      sigma_goal = VDD/m_sigma; var_goal = sigma_goal*sigma_goal;
      # Get custom renorm factors (single gain for all columns)
      mov_variance_DP_t = np.mean(mov_var)/var_goal;
      sigma_DP_t = np.sqrt(mov_variance_DP_t); 
      # Get equivalent coefficients (epsilon keeps the denominator non-zero)
      gamma_eq = gamma/(sigma_DP_t + epsilon);
      # Get gamma encoding: nearest power-of-two exponent of the gain
      gamma_code = np.round(np.log2(gamma_eq));
      
      # // Equivalent offset computation //
      beta_eq = beta/gamma_eq - mov_mean;
      # Get beta encoding: offset expressed in LSBs
      beta_code = np.round(beta_eq/Vlsb_beta);
      # Debug print of the offset codes
      print(beta_code)
      
      # // Append gamma & beta configs (uint8, encoding done during C mapping) 
      # NOTE(review): both codes can be negative (gamma_eq < 1, beta_eq < 0); the
      # cast of a negative float to uint8 is not a well-defined conversion -
      # confirm that the C mapping expects the resulting values.
      gamma_list.append(gamma_code.astype("uint8"));
      beta_list.append(beta_code.astype("uint8"));

      
    # Full-precision BN
    elif('batch_normalization' in key):
      dataset = f[key][key];
      local_keys = list(dataset.keys());
      # NOTE(review): same assumption on the entry order as for the analog BN
      beta  = dataset[local_keys[0]][()];
      gamma = dataset[local_keys[1]][()];
      mov_mean = dataset[local_keys[2]][()];
      mov_var  = dataset[local_keys[3]][()];
      # Get equivalent coefficients
      # NOTE(review): unlike the analog BN path, no epsilon is added here, so a
      # zero moving variance yields a division by zero - confirm this matches training.
      mov_sig = np.sqrt(mov_var);
      gamma_eq = gamma/mov_sig;
      beta_eq  = beta/gamma_eq - mov_mean;
      # Transform floating-point result into full-precision signed int32
      # (offset scaled by 2**15/fS_beta_fp, gain by (2**16-1)/fS_gamma_fp)
      beta_eq = np.int32(np.round(beta_eq/fS_beta_fp*(2**15)));
      gamma_eq = np.int32(np.round(gamma_eq/fS_gamma_fp*(2**16-1)));
      # beta_eq = beta_eq*(2**15)/fS_beta_fp;
      # gamma_eq = gamma_eq*(2**15)/fS_gamma_fp;
      
      # Store results as single columns
      gamma_FP_list.append(np.reshape(gamma_eq,(-1,1)));
      beta_FP_list.append(np.reshape(beta_eq,(-1,1)));
   # NOTHING TO DO FOR ACTIVATIONS/REGU LAYERS
   
# // Verify full-precision conversion gives expected output (converted to 64b) //
# print(np.shape(outputs_list_test[-1])); print(np.shape(weights_FP_list[0]));
# print(np.shape(gamma_FP_list[0])); print(np.shape(beta_FP_list[0]));
# print(np.shape(weights_FP_list[-1]))
# print(np.reshape(weights_FP_list[-1],(-1,classes)))
print("\n")

###################################################
########## Test FP output equivalence #############
###################################################
# Recomputes the full-precision MAC + BN from the integer-converted operands and
# compares it against the converted reference output of the last layer. Only
# runs when per-layer output files are available (OUT_EN).

if OUT_EN:
    print("--- Computing FP MAC equivalence ---")
    # Operands: raw outputs of the last CIM layer (one row per image) and the
    # first FP weight matrix laid out as (inputs, classes)
    in_FP = np.reshape(outputs_list_test[-1], (Nimg_save, -1))
    w_FP = np.reshape(weights_FP_list[0], (-1, classes))
    temp_mac = np.dot(in_FP, w_FP)
    actual_mac = np.int32(np.round(
        np.squeeze(gamma_FP_list[-1]) * (temp_mac + np.squeeze(beta_FP_list[-1]))
    ))
    # Keep both results 2-D so that [i, j] indexing also works for a single image
    actual_mac = np.reshape(actual_mac, (Nimg_save, -1))
    expected_mac = np.reshape(outputs_list[-1], (Nimg_save, -1))

    print(actual_mac); print(expected_mac)
    print(outputs_list_test[-1])

    # Detailed computation below (debug trace of the first image only)
    # NOTE(review): this trace uses the first FP BN layer ([0]) while actual_mac
    # uses the last one ([-1]); both agree only with a single FP BN layer.
    gamma_FP = np.reshape(gamma_FP_list[0], -1)
    beta_FP = np.reshape(beta_FP_list[0], -1)
    n_in = np.shape(w_FP)[0]
    mac_val = np.zeros((Nimg_save, classes), dtype="int32")
    for m in range(Nimg_save):
        # Perform MAC operations
        for i in range(n_in):
            # Fetch input (local name, so the module-level `inputs` is left untouched)
            in_val = in_FP[m][i]
            for j in range(classes):
                # Fetch weight
                w_val = w_FP[i][j]
                # MAC operation
                mac_val[m][j] = mac_val[m][j] + in_val * w_val
                if m == 0 and i < 8 and j == 0:
                    print('Input {} is {}'.format(j, in_val))
                    print('Weight {} is {}'.format(j, w_val))
                    print('DP {} at iter {} is {}'.format(j, i, mac_val[m][j]))
        # Print final DP value
        if m == 0:
            for j in range(classes):
                print('DP result {} is {}'.format(j, mac_val[m][j]))
        # Perform BN operations
        for j in range(classes):
            mac_val[m][j] = gamma_FP[j] * (mac_val[m][j] + beta_FP[j])
            if m == 0:
                print('BN result {} is {}'.format(j, mac_val[m][j]))

    # Word-by-word comparison with a 0.1% relative tolerance
    count_error = 0
    n_cols = np.shape(actual_mac)[-1]
    for i in range(Nimg_save):
        for j in range(n_cols):
            diff = np.int32(actual_mac[i, j] - expected_mac[i, j])
            ref = np.int32(expected_mac[i, j])
            if ref == 0:
                # No relative error exists for a zero reference: any mismatch is an error
                perc_error = 0.0 if diff == 0 else np.inf
            else:
                perc_error = 100 * np.abs(diff / ref)
            if perc_error > 1e-1:
                count_error += 1
                # Report the flat index of the word in the (Nimg_save, n_cols) result
                print("Error for FP computation {}: {} instead of {} ({:3f}%) !".format(
                    i * n_cols + j, hex(actual_mac[i, j]), hex(expected_mac[i, j]), perc_error))

    if count_error == 0:
        print('All results are correct to 0.1%, congrats !')
    else:
        print('There were {} word errors found !'.format(count_error))

else:
    print('Warning: output results not available after training, FP comparison bypassed')


####################################################
########## Store results to text files #############
####################################################
def _save_hex(path, values):
    """Write *values* to *path* through np.savetxt with the '%x' format."""
    np.savetxt(path, values, fmt='%x')


# Inputs
data_in = int_img.astype("uint32")
_save_hex(file_out_inputs, data_in)

# Outputs: one file per layer, CIM and FP layers alike
if OUT_EN:
    cim_outputs = np.concatenate(outputs_list, axis=None).astype("uint64")
    for layer_idx, layer_out in enumerate(outputs_list):
        layer_path = file_out_outputs + '_layer_{}.txt'.format(layer_idx)
        _save_hex(layer_path, layer_out.astype("uint64"))

# Inference results
_save_hex(file_out_inference, np.array([inf_results]).astype("uint64"))

# CIM weights
weights_cim = np.concatenate(weights_list, axis=None).astype("uint64")
_save_hex(file_out_weights + '.txt', weights_cim)

# Gamma file (codes are stored with the dtype they were collected with)
gamma_cim = np.concatenate(gamma_list, axis=None)
_save_hex(file_out_gamma + '.txt', gamma_cim)

# Beta file
beta_cim = np.concatenate(beta_list, axis=None)
_save_hex(file_out_beta + '.txt', beta_cim)

# FP FC/CONV weights
weights_fp = np.concatenate(weights_FP_list, axis=None).astype("uint64")
_save_hex(file_out_weights_FP + '.txt', weights_fp)

# FP BN weights
gamma_fp = np.concatenate(gamma_FP_list, axis=None).astype("uint64")
beta_fp = np.concatenate(beta_FP_list, axis=None).astype("uint64")
_save_hex(file_out_gamma_FP + '.txt', gamma_fp)
_save_hex(file_out_beta_FP + '.txt', beta_fp)
  
######################################################################
########## Generate final test files for on-chip testing #############
######################################################################
# // Parameters folding //
# Destination files: the C header and the FPGA memory files
filename_c = path_to_chip + './cim_config.h'
filename_fpga = [
    file_fpga_inputs,
    file_fpga_weights_cim,
    file_fpga_beta,
    file_fpga_weights_FP,
    file_fpga_inf_res,
    file_fpga_outputs,
]
# CNN info: layer counts found in the weight file and number of saved images
network_info = (Nlayers_cim, Nlayers_fp, Nimg_save)
# CIM dimensions
cim_dim = (Nrows, Ncols, Nimg_save)
# Dimensions and channels / precisions / timings
D_VEC = (dim, dim, c_in_vec, c_out_vec)
P_VEC = (R_IN, R_W, R_OUT, R_BETA, R_GAMMA)
T_VEC = (T_DP, T_PRE, T_MBIT, T_ADC)
# Data for FPGA; the packed CIM outputs are only added when they were generated
data_fpga = [
    data_in,
    weights_cim,
    beta_list,
    weights_fp,
    inf_results.astype("int32"),
] + ([cim_outputs] if OUT_EN else [])

# // Generate C header file with hardware params //
create_C_header(
    filename_c, network_info, cim_dim, D_VEC, P_VEC, T_VEC,
    gamma_cim, beta_fp, gamma_fp,
)
# // Generate off-chip FPGA memory files //
create_fpga_files(filename_fpga, network_info, cim_dim, D_VEC, P_VEC, data_fpga)

# Completion banner
for banner_line in (
    '///////////////////////////////////////////////////////',
    '//////////////// FILES CONVERSION DONE ////////////////',
    '///////////////////////////////////////////////////////',
):
    print(banner_line)