diff --git a/Makefile b/Makefile index 5d0505b8292649bca1ba96360d7c14de7ccc0702..e6406991b0c8bcf166241e9295621b1bcdb9a96c 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,18 @@ CXX = g++ -CXXFLAGS = -std=c++17 -Wall -Wextra -O3 +CXXFLAGS = -std=c++17 -Wall -Wextra -O0 TARGET = main -SRC = matrix_opencl.cpp mlp_sgd.cpp globals.cpp -LIBS = -lOpenCL +SRC = matrix_opencl.cpp benchmark.cpp mlp_sgd.cpp globals.cpp all: $(MAKE) clean && $(MAKE) run $(TARGET): $(SRC) - $(CXX) $(CXXFLAGS) -o $(TARGET)$(SUFFIX) $(SRC) main.cpp $(LIBS) + $(CXX) $(CXXFLAGS) -o $(TARGET) $(SRC) -lOpenCL run: $(TARGET) ./$(TARGET) -fast: benchmark.cpp - $(CXX) $(CXXFLAGS) -DFAST_MATMUL -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC) - -naive: benchmark.cpp - $(CXX) $(CXXFLAGS) -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC) - clean: rm -f $(TARGET) - rm -f benchmark - .PHONY: all run clean \ No newline at end of file diff --git a/matrix_opencl.cpp b/matrix_opencl.cpp index 190932ea5c7fe7c9581d1ba64457cfb6b00f6430..03a00eeebb121ac8c31a207a7424c1079257cfd0 100644 --- a/matrix_opencl.cpp +++ b/matrix_opencl.cpp @@ -85,47 +85,23 @@ const std::string kernel_source_matrix_mul = R"( } } )"; -const std::string kernel_source_fast_matrix_mul = R"( - __kernel void fast_matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) { - int row = get_global_id(0); - int col = get_global_id(1); - - int local_row = get_local_id(0); - int local_col = get_local_id(1); - - const int TILE_SIZE = 16; - - __local float As[TILE_SIZE][TILE_SIZE]; - __local float Bs[TILE_SIZE][TILE_SIZE]; - - float sum = 0.0f; - - for (int t = 0; t < (A_cols + TILE_SIZE - 1) / TILE_SIZE; t++) { - int tile_row = t * TILE_SIZE + local_row; - int tile_col = t * TILE_SIZE + local_col; - - if (row < A_rows && tile_col < A_cols) - As[local_row][local_col] = A[row * A_cols + tile_col]; - else - As[local_row][local_col] = 0.0f; - - if (tile_row < A_cols && col < B_cols) - Bs[local_row][local_col] = B[tile_row * B_cols + col]; - else - Bs[local_row][local_col] = 0.0f; - - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < TILE_SIZE; ++k) - sum += As[local_row][k] * Bs[k][local_col]; - - barrier(CLK_LOCAL_MEM_FENCE); +/*const std::string kernel_source_matrix_mul = R"( + __kernel void matrix_mul(__global const float* A,__global const float* B, __global float* C, int M, int K, int N) { + int i = get_global_id(0); + float Awrk[1024]; + for (int k = 0; k < K; ++k) { + Awrk[k] = A[i * K + k]; } - if (row < A_rows && col < B_cols) { - C[row * B_cols + col] = sum; + + for (int j = 0; j < N; ++j) { + float tmp = 0.0f; + for (int k = 0; k < K; ++k) { + tmp += Awrk[k] * B[k * N + j]; + } + C[i * N + j] = tmp; } } -)"; +)";*/ const std::string kernel_source_sigmoid = R"( __kernel void sigmoid(__global const float* input, __global float* output, int rows, int cols) { int idx = get_global_id(0); @@ -191,9 +167,6 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul"); kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul"); - cl::Program prog_fast_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_fast_matrix_mul, "fast_matrix_mul"); - kernel_fast_matrix_mul = cl::Kernel(prog_fast_matrix_mul, "fast_matrix_mul"); - cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid"); kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid"); @@ -377,31 +350,6 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const { return result; } -MatrixCL MatrixCL::fast_matrix_mul(const MatrixCL& other) const { - MatrixCL result(rows_, other.numCols(), context_, queue_); - - cl::Kernel kernel = kernels_->kernel_fast_matrix_mul; - kernel.setArg(0, buffer_); - kernel.setArg(1, other.getBuffer()); - kernel.setArg(2, result.getBuffer()); - kernel.setArg(3, rows_); - kernel.setArg(4, cols_); - kernel.setArg(5, other.numCols()); - - const size_t TILE_SIZE = 16; - - // Align global work size to the nearest multiple of TILE_SIZE - size_t global_rows = ((rows_ + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE; - size_t global_cols = ((other.numCols() + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE; - - cl::NDRange global_work_size(global_rows, global_cols); - cl::NDRange local_work_size(TILE_SIZE, TILE_SIZE); - - queue_.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size); - - return result; -} - MatrixCL MatrixCL::transpose() const { MatrixCL result(cols_, rows_, context_, queue_); diff --git a/matrix_opencl.hpp b/matrix_opencl.hpp index 07c1bab5bef2f8315cd3b98dea33d0d17bc85f24..42225db260d8715aa90e097016910e2ba4ddb077 100644 --- a/matrix_opencl.hpp +++ b/matrix_opencl.hpp @@ -25,7 +25,6 @@ struct KernelCache { cl::Kernel kernel_sub_mul; cl::Kernel kernel_transpose; cl::Kernel kernel_matrix_mul; - cl::Kernel kernel_fast_matrix_mul; cl::Kernel kernel_sigmoid; cl::Kernel kernel_sigmoid_backward; cl::Kernel kernel_bce_elementwise; @@ -94,9 +93,6 @@ public: // Matrix multiplication: C = A * B MatrixCL operator*(const MatrixCL& other) const; - // Fast matrix multiplication: C = A * B (optimized for large matrices) - MatrixCL fast_matrix_mul(const MatrixCL& other) const; - // Transpose: returns a new Matrix that is the transpose (B = A^T) MatrixCL transpose() const;