diff --git a/Makefile b/Makefile
index 5d0505b8292649bca1ba96360d7c14de7ccc0702..e6406991b0c8bcf166241e9295621b1bcdb9a96c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,27 +1,18 @@
 CXX = g++
-CXXFLAGS = -std=c++17 -Wall -Wextra -O3 
+CXXFLAGS = -std=c++17 -Wall -Wextra -O0
 TARGET = main
-SRC = matrix_opencl.cpp mlp_sgd.cpp globals.cpp
-LIBS = -lOpenCL
+SRC = matrix_opencl.cpp benchmark.cpp mlp_sgd.cpp globals.cpp
 
 all:
 	$(MAKE) clean && $(MAKE) run
 
 $(TARGET): $(SRC)
-	$(CXX) $(CXXFLAGS) -o $(TARGET)$(SUFFIX) $(SRC) main.cpp $(LIBS)
+	$(CXX) $(CXXFLAGS) -o $(TARGET) $(SRC) -lOpenCL
 
 run: $(TARGET)
 	./$(TARGET)
 
-fast: benchmark.cpp
-	$(CXX) $(CXXFLAGS) -DFAST_MATMUL -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC)
-
-naive: benchmark.cpp
-	$(CXX) $(CXXFLAGS) -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC)
-
 clean:
 	rm -f $(TARGET)
-	rm -f benchmark
-
 
 .PHONY: all run clean
\ No newline at end of file
diff --git a/matrix_opencl.cpp b/matrix_opencl.cpp
index 190932ea5c7fe7c9581d1ba64457cfb6b00f6430..03a00eeebb121ac8c31a207a7424c1079257cfd0 100644
--- a/matrix_opencl.cpp
+++ b/matrix_opencl.cpp
@@ -85,47 +85,23 @@ const std::string kernel_source_matrix_mul = R"(
         }
     }
 )";
-const std::string kernel_source_fast_matrix_mul = R"(
-    __kernel void fast_matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
-        int row = get_global_id(0);
-        int col = get_global_id(1);
-
-        int local_row = get_local_id(0);
-        int local_col = get_local_id(1);
-
-        const int TILE_SIZE = 16; 
-
-        __local float As[TILE_SIZE][TILE_SIZE];
-        __local float Bs[TILE_SIZE][TILE_SIZE];
-
-        float sum = 0.0f;
-        
-        for (int t = 0; t < (A_cols + TILE_SIZE - 1) / TILE_SIZE; t++) {
-            int tile_row = t * TILE_SIZE + local_row;
-            int tile_col = t * TILE_SIZE + local_col;
-
-            if (row < A_rows && tile_col < A_cols)
-                As[local_row][local_col] = A[row * A_cols + tile_col];
-            else
-                As[local_row][local_col] = 0.0f;
-            
-            if (tile_row < A_cols && col < B_cols)
-                Bs[local_row][local_col] = B[tile_row * B_cols + col];
-            else
-                Bs[local_row][local_col] = 0.0f;
-            
-            barrier(CLK_LOCAL_MEM_FENCE);
-            
-            for (int k = 0; k < TILE_SIZE; ++k)
-                sum += As[local_row][k] * Bs[k][local_col];
-            
-            barrier(CLK_LOCAL_MEM_FENCE);
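+/* Disabled alternative: each work-item i computes a whole row of C after staging row i of A in Awrk, so K must not exceed 1024. */ /*const std::string kernel_source_matrix_mul = R"(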
+    __kernel void matrix_mul(__global const float* A,__global const float* B, __global float* C, int M, int K, int N) {
+        int i = get_global_id(0);
+        float Awrk[1024];
+        for (int k = 0; k < K; ++k) {
+            Awrk[k] = A[i * K + k];
         }
-        if (row < A_rows && col < B_cols) {
-            C[row * B_cols + col] = sum;
+
+        for (int j = 0; j < N; ++j) {
+            float tmp = 0.0f;
+            for (int k = 0; k < K; ++k) {
+                tmp += Awrk[k] * B[k * N + j];
+            }
+            C[i * N + j] = tmp;
         }
     }
-)";
+)";*/
 const std::string kernel_source_sigmoid = R"(
     __kernel void sigmoid(__global const float* input, __global float* output, int rows, int cols) {
         int idx = get_global_id(0);
@@ -191,9 +167,6 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi
         cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul");
         kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul");
 
-        cl::Program prog_fast_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_fast_matrix_mul, "fast_matrix_mul");
-        kernel_fast_matrix_mul = cl::Kernel(prog_fast_matrix_mul, "fast_matrix_mul");
-
         cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid");
         kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid");
 
@@ -377,31 +350,6 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
     return result;
 }
 
-MatrixCL MatrixCL::fast_matrix_mul(const MatrixCL& other) const {
-    MatrixCL result(rows_, other.numCols(), context_, queue_);
-
-    cl::Kernel kernel = kernels_->kernel_fast_matrix_mul; 
-    kernel.setArg(0, buffer_);
-    kernel.setArg(1, other.getBuffer());
-    kernel.setArg(2, result.getBuffer()); 
-    kernel.setArg(3, rows_);
-    kernel.setArg(4, cols_);
-    kernel.setArg(5, other.numCols());
-
-    const size_t TILE_SIZE = 16;
-
-    // Align global work size to the nearest multiple of TILE_SIZE
-    size_t global_rows = ((rows_ + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
-    size_t global_cols = ((other.numCols() + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
-
-    cl::NDRange global_work_size(global_rows, global_cols);
-    cl::NDRange local_work_size(TILE_SIZE, TILE_SIZE);
-
-    queue_.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size);
-
-    return result;
-}
-
 
 MatrixCL MatrixCL::transpose() const {
     MatrixCL result(cols_, rows_, context_, queue_);
diff --git a/matrix_opencl.hpp b/matrix_opencl.hpp
index 07c1bab5bef2f8315cd3b98dea33d0d17bc85f24..42225db260d8715aa90e097016910e2ba4ddb077 100644
--- a/matrix_opencl.hpp
+++ b/matrix_opencl.hpp
@@ -25,7 +25,6 @@ struct KernelCache {
     cl::Kernel kernel_sub_mul;
     cl::Kernel kernel_transpose;
     cl::Kernel kernel_matrix_mul;
-    cl::Kernel kernel_fast_matrix_mul;
     cl::Kernel kernel_sigmoid;
     cl::Kernel kernel_sigmoid_backward;
     cl::Kernel kernel_bce_elementwise;
@@ -94,9 +93,6 @@ public:
     // Matrix multiplication: C = A * B
     MatrixCL operator*(const MatrixCL& other) const;
 
-    // Fast matrix multiplication: C = A * B (optimized for large matrices)
-    MatrixCL fast_matrix_mul(const MatrixCL& other) const;
-
     // Transpose: returns a new Matrix that is the transpose (B = A^T)
     MatrixCL transpose() const;