diff --git a/benchmark.cpp b/benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a64a1bdfdbc2b2fd08d6ada662b3b3b28545334 --- /dev/null +++ b/benchmark.cpp @@ -0,0 +1,92 @@ +#include <iostream> +#include <vector> +#include <random> +#include <chrono> + +#include "matrix_opencl.hpp" + +std::vector<float> fill_random(int rows, int cols) { + std::mt19937 gen(42); + std::uniform_real_distribution<float> dist(-1.0f, 1.0f); + std::vector<float> data(rows*cols); + for (int i = 0; i < rows * cols; ++i) { + data[i] = dist(gen); + } + return data; +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "Usage: ./matrix_mul_exec <size> <runs>" << std::endl; + return 1; + } + + int size = std::stoi(argv[1]); + int runs = std::stoi(argv[2]); + + // 1. --- OpenCL Setup --- + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + if (platforms.empty()) { + std::cerr << "No OpenCL platforms found." << std::endl; + return 1; + } + cl::Platform platform = platforms.front(); + + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); + if (devices.empty()) { + platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + if (devices.empty()) { + std::cerr << "No OpenCL devices found." << std::endl; + return 1; + } + } + cl::Device device = devices.front(); + + cl::Context context(device); + cl_int err; + cl_command_queue cq = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &err); + if (err != CL_SUCCESS) { + std::cerr << "Failed to create command queue: " << err << std::endl; + exit(1); + } + cl::CommandQueue queue(cq, true); + + + std::vector<cl::Device> devices_to_init = {device}; + try { + MatrixCL::initializeKernels(context, devices_to_init); + } catch (const std::exception& e) { + // Catching std::exception here because initializeKernels wraps cl::Error + std::cerr << "FATAL ERROR during kernel initialization: " << e.what() << std::endl; + // If the error was a BuildError, the log should have been printed + // by the loadAndBuildProgram function within initializeKernels. + return 1; + } + + std::chrono::duration<double, std::milli> total_time(0); + for (int i = 0; i < runs; ++i) { + + // 2. --- Matrix Multiplication --- + std::vector<float> dataA = fill_random(size, size); + std::vector<float> dataB = fill_random(size, size); + + MatrixCL A(size, size, context, queue, &dataA); + MatrixCL B(size, size, context, queue, &dataB); + + auto start = std::chrono::high_resolution_clock::now(); + + #ifdef FAST_MATMUL + MatrixCL C = A.fast_matrix_mul(B); + #else + MatrixCL C = A * B; + #endif + queue.finish(); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double, std::milli> elapsed = end - start; + total_time += elapsed; + } + std::cout << total_time.count() / runs << std::endl; + return 0; +} \ No newline at end of file diff --git a/matrix_opencl.cpp b/matrix_opencl.cpp index f5551b651d5d3e6ff4542c86bb7bf339e5f4c5df..8c108ef2862eb898645431ee8f5f9e96a8267771 100644 --- a/matrix_opencl.cpp +++ b/matrix_opencl.cpp @@ -76,7 +76,7 @@ const std::string kernel_source_transpose = R"( B[output_idx] = A[input_idx]; } )"; -/*const std::string kernel_source_matrix_mul = R"( +const std::string kernel_source_matrix_mul = R"( __kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) { int row = get_global_id(0); int col = get_global_id(1); @@ -84,9 +84,9 @@ const std::string kernel_source_transpose = R"( C[row * B_cols + col] += A[row * A_cols + k] * B[k * B_cols + col]; } } -)";*/ -const std::string kernel_source_matrix_mul = R"( - __kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) { +)"; +const std::string kernel_source_matrix_mul_V2 = R"( + __kernel void fast_matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) { int row = get_global_id(0); int col = get_global_id(1); @@ -191,6 +191,9 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul"); kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul"); + cl::Program prog_matrix_mul_V2 = loadAndBuildProgram(context, devices, kernel_source_matrix_mul_V2, "matrix_mul_V2"); + kernel_matrix_mul_V2 = cl::Kernel(prog_matrix_mul_V2, "matrix_mul_V2"); + cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid"); kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid"); @@ -374,6 +377,31 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const { return result; } +MatrixCL MatrixCL::matrix_mul_V2(const MatrixCL& other) const { + MatrixCL result(rows_, other.numCols(), context_, queue_); + + cl::Kernel kernel = kernels_->kernel_matrix_mul_V2; + kernel.setArg(0, buffer_); + kernel.setArg(1, other.getBuffer()); + kernel.setArg(2, result.getBuffer()); + kernel.setArg(3, rows_); + kernel.setArg(4, cols_); + kernel.setArg(5, other.numCols()); + + const size_t TILE_SIZE = 16; + + // Align global work size to the nearest multiple of TILE_SIZE + size_t global_rows = ((rows_ + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE; + size_t global_cols = ((other.numCols() + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE; + + cl::NDRange global_work_size(global_rows, global_cols); + cl::NDRange local_work_size(TILE_SIZE, TILE_SIZE); + + queue_.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size); + + return result; +} + MatrixCL MatrixCL::transpose() const { MatrixCL result(cols_, rows_, context_, queue_); diff --git a/matrix_opencl.hpp b/matrix_opencl.hpp index 42225db260d8715aa90e097016910e2ba4ddb077..6e9d5dfa9929aa5d59374d4833c3334aaced9d68 100644 --- a/matrix_opencl.hpp +++ b/matrix_opencl.hpp @@ -25,6 +25,7 @@ struct KernelCache { cl::Kernel kernel_sub_mul; cl::Kernel kernel_transpose; cl::Kernel kernel_matrix_mul; + cl::Kernel kernel_matrix_mul_v2; cl::Kernel kernel_sigmoid; cl::Kernel kernel_sigmoid_backward; cl::Kernel kernel_bce_elementwise; @@ -93,6 +94,8 @@ public: // Matrix multiplication: C = A * B MatrixCL operator*(const MatrixCL& other) const; + MatrixCL matrix_mul_V2(const MatrixCL& other) const; + // Transpose: returns a new Matrix that is the transpose (B = A^T) MatrixCL transpose() const;