diff --git a/benchmark.cpp b/benchmark.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a64a1bdfdbc2b2fd08d6ada662b3b3b28545334
--- /dev/null
+++ b/benchmark.cpp
@@ -0,0 +1,95 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <chrono>
+
+#include "matrix_opencl.hpp"
+
+std::vector<float> fill_random(int rows, int cols) {
+    std::mt19937 gen(42);
+    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+    std::vector<float> data(rows*cols);
+    for (int i = 0; i < rows * cols; ++i) {
+        data[i] = dist(gen);
+    }
+    return data;
+}
+
+int main(int argc, char** argv) {
+    if (argc < 3) {
+        std::cerr << "Usage: ./matrix_mul_exec <size> <runs>" << std::endl;
+        return 1;
+    }
+
+    int size = std::stoi(argv[1]);
+    int runs = std::stoi(argv[2]);
+
+    // 1. --- OpenCL Setup ---
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+    if (platforms.empty()) {
+        std::cerr << "No OpenCL platforms found." << std::endl;
+        return 1;
+    }
+    cl::Platform platform = platforms.front();
+
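+    // Prefer a GPU device; fall back to a CPU device if no GPU is available.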
+    std::vector<cl::Device> devices;
+    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    if (devices.empty()) {
+        platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
+        if (devices.empty()) {
+            std::cerr << "No OpenCL devices found." << std::endl;
+            return 1;
+        }
+    }
+    cl::Device device = devices.front();
+
+    cl::Context context(device);
+    // Create a profiling-enabled command queue with the C++ wrapper API.
+    cl_int err;
+    cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    if (err != CL_SUCCESS) {
+        std::cerr << "Failed to create command queue: " << err << std::endl;
+        return 1;
+    }
+
+
+    std::vector<cl::Device> devices_to_init = {device};
+    try {
+        MatrixCL::initializeKernels(context, devices_to_init);
+    } catch (const std::exception& e) {
+        // Catching std::exception here because initializeKernels wraps cl::Error
+        std::cerr << "FATAL ERROR during kernel initialization: " << e.what() << std::endl;
+        // If the error was a BuildError, the log should have been printed
+        // by the loadAndBuildProgram function within initializeKernels.
+        return 1;
+    }
+
+    std::chrono::duration<double, std::milli> total_time(0);
+    for (int i = 0; i < runs; ++i) {
+
+        // 2. --- Matrix Multiplication ---
+        std::vector<float> dataA = fill_random(size, size);
+        std::vector<float> dataB = fill_random(size, size);
+
+        MatrixCL A(size, size, context, queue, &dataA);
+        MatrixCL B(size, size, context, queue, &dataB);
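+        // Host-side data generation and the MatrixCL construction (which presumably uploads the data to device buffers) stay outside the timed region.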
+        
+        auto start = std::chrono::high_resolution_clock::now();
+
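+        // The multiplication only enqueues work; queue.finish() blocks until the device has completed it, so the timed interval covers the actual kernel execution.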
+        #ifdef FAST_MATMUL
+            MatrixCL C = A.matrix_mul_V2(B);
+        #else
+            MatrixCL C = A * B;
+        #endif
+        queue.finish();
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::milli> elapsed = end - start;
+        total_time += elapsed;
+    }
+    std::cout << total_time.count() / runs << std::endl;
+    return 0;
+}
diff --git a/matrix_opencl.cpp b/matrix_opencl.cpp
index f5551b651d5d3e6ff4542c86bb7bf339e5f4c5df..8c108ef2862eb898645431ee8f5f9e96a8267771 100644
--- a/matrix_opencl.cpp
+++ b/matrix_opencl.cpp
@@ -76,7 +76,7 @@ const std::string kernel_source_transpose = R"(
         B[output_idx] = A[input_idx];
     }
 )";
-/*const std::string kernel_source_matrix_mul = R"(
+const std::string kernel_source_matrix_mul = R"(
     __kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
         int row = get_global_id(0);
         int col = get_global_id(1);
@@ -84,9 +84,9 @@ const std::string kernel_source_transpose = R"(
             C[row * B_cols + col] += A[row * A_cols + k] * B[k * B_cols + col];
         }
     }
-)";*/
-const std::string kernel_source_matrix_mul = R"(
-    __kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
+)";
+const std::string kernel_source_matrix_mul_V2 = R"(
+    __kernel void matrix_mul_V2(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
         int row = get_global_id(0);
         int col = get_global_id(1);
 
@@ -191,6 +191,9 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi
         cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul");
         kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul");
 
+        cl::Program prog_matrix_mul_V2 = loadAndBuildProgram(context, devices, kernel_source_matrix_mul_V2, "matrix_mul_V2");
+        kernel_matrix_mul_V2 = cl::Kernel(prog_matrix_mul_V2, "matrix_mul_V2");
+
         cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid");
         kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid");
 
@@ -374,6 +377,33 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
     return result;
 }
 
+MatrixCL MatrixCL::matrix_mul_V2(const MatrixCL& other) const {
+    MatrixCL result(rows_, other.numCols(), context_, queue_);
+
+    cl::Kernel kernel = kernels_->kernel_matrix_mul_V2; 
+    kernel.setArg(0, buffer_);
+    kernel.setArg(1, other.getBuffer());
+    kernel.setArg(2, result.getBuffer()); 
+    kernel.setArg(3, rows_);
+    kernel.setArg(4, cols_);
+    kernel.setArg(5, other.numCols());
+
+    const size_t TILE_SIZE = 16;
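+    // Assumption: TILE_SIZE matches the work-group/tile size the matrix_mul_V2 kernel is written for; the two must be kept in sync.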
+
+    // Round the global work size up to the next multiple of TILE_SIZE
+    size_t global_rows = ((rows_ + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
+    size_t global_cols = ((other.numCols() + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
+
+    cl::NDRange global_work_size(global_rows, global_cols);
+    cl::NDRange local_work_size(TILE_SIZE, TILE_SIZE);
+
+    queue_.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size);
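+    // No finish() here: with an in-order queue (the default), later enqueued operations observe the completed result; callers timing this call must synchronize themselves, as benchmark.cpp does.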
+
+    return result;
+}
+
 
 MatrixCL MatrixCL::transpose() const {
     MatrixCL result(cols_, rows_, context_, queue_);
diff --git a/matrix_opencl.hpp b/matrix_opencl.hpp
index 42225db260d8715aa90e097016910e2ba4ddb077..6e9d5dfa9929aa5d59374d4833c3334aaced9d68 100644
--- a/matrix_opencl.hpp
+++ b/matrix_opencl.hpp
@@ -25,6 +25,7 @@ struct KernelCache {
     cl::Kernel kernel_sub_mul;
     cl::Kernel kernel_transpose;
     cl::Kernel kernel_matrix_mul;
+    cl::Kernel kernel_matrix_mul_V2;
     cl::Kernel kernel_sigmoid;
     cl::Kernel kernel_sigmoid_backward;
     cl::Kernel kernel_bce_elementwise;
@@ -93,6 +94,8 @@ public:
     // Matrix multiplication: C = A * B
     MatrixCL operator*(const MatrixCL& other) const;
 
+    MatrixCL matrix_mul_V2(const MatrixCL& other) const;
+
     // Transpose: returns a new Matrix that is the transpose (B = A^T)
     MatrixCL transpose() const;