diff --git a/main.cpp b/main.cpp
index e035676c6c2905132c89b4e0c28de105f0cd0408..a36d5e6eec167c3431b2a17203e0ac60835da336 100644
--- a/main.cpp
+++ b/main.cpp
@@ -106,7 +106,14 @@ int main(int argc, char** argv) {
     std::cout << "Using Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
 
     cl::Context context(device);
-    cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE); // Keep profiling enabled
+    cl_int err = CL_SUCCESS; // Receives the status code written by clCreateCommandQueue.
+    cl_command_queue cq = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &err); // Keep profiling enabled
+    if (err != CL_SUCCESS) {
+        std::cerr << "Failed to create command queue: " << err << std::endl;
+        return 1; // Return instead of exit() so locals such as `context` are destroyed on the failure path.
+    }
+    cl::CommandQueue queue(cq, false); // Adopt the handle's existing reference; retaining (true) would add one that is never released.
+
 
     std::vector<cl::Device> devices_to_init = {device};
     try {
diff --git a/matrix_opencl.cpp b/matrix_opencl.cpp
index 6786c1c829d8c36002fa23d8a597fb59c53d56b2..bd052dc75ac7a1ff3d7837e80c5879db12d49416 100644
--- a/matrix_opencl.cpp
+++ b/matrix_opencl.cpp
@@ -77,7 +77,7 @@ const std::string kernel_source_transpose = R"(
     }
 )";
 // NAIVE
-/*const std::string kernel_source_matrix_mul = R"(
+const std::string kernel_source_matrix_mul = R"(
     __kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
         int row = get_global_id(0);
         int col = get_global_id(1);
@@ -85,10 +85,10 @@ const std::string kernel_source_transpose = R"(
             C[row * B_cols + col] += A[row * A_cols + k] * B[k * B_cols + col];
         }
     }
-)";*/
+)";
 
 // FASTER
-const std::string kernel_source_matrix_mul = R"(
+/*const std::string kernel_source_matrix_mul = R"(
     __kernel void matrix_mul(__global const float* A,
                          __global const float* B,
                          __global float* C,
@@ -120,7 +120,7 @@ const std::string kernel_source_matrix_mul = R"(
 
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-})";
+})";*/
 const std::string kernel_source_sigmoid = R"(
     __kernel void sigmoid(__global const float* input, __global float* output, int rows, int cols) {
         int idx = get_global_id(0);
@@ -348,7 +348,7 @@ MatrixCL MatrixCL::operator+(const MatrixCL& other) const {
 }
 
 // NAIVE VERSION
-/*MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
+MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
     if (cols_ != other.rows_)
         throw std::runtime_error("Matrix dimension error.");
 
@@ -368,10 +368,10 @@ MatrixCL MatrixCL::operator+(const MatrixCL& other) const {
     }
 
     return result;
-}*/
+}
 
 // FASTER VERSION
-MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
+/*MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
     if (cols_ != other.rows_)
         throw std::runtime_error("Matrix dimension error.");
 
@@ -404,7 +404,7 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
     }
 
     return result;
-}
+}*/
 
 
 MatrixCL MatrixCL::transpose() const {