Skip to content
Extraits de code Groupes Projets
Valider 8b5cadf2 rédigé par JordanHanotiaux's avatar JordanHanotiaux
Parcourir les fichiers

update

parent 518e69b1
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include "matrix_opencl.hpp"
// Produce rows*cols floats drawn uniformly from [-1, 1).
// The engine is seeded with a fixed constant (42) so the generated
// data is identical on every call — runs are reproducible.
std::vector<float> fill_random(int rows, int cols) {
    const int count = rows * cols;
    std::mt19937 engine(42);
    std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
    std::vector<float> values;
    values.reserve(count);               // single allocation up front
    for (int i = 0; i < count; ++i) {
        values.push_back(uniform(engine));
    }
    return values;
}
// Benchmark driver: averages the wall time of `runs` square matrix
// multiplications of dimension `size` on the first available OpenCL
// device (GPU preferred, CPU fallback). Prints the mean time in ms.
int main(int argc, char** argv) {
    if (argc < 3) {
        std::cerr << "Usage: ./matrix_mul_exec <size> <runs>" << std::endl;
        return 1;
    }
    // std::stoi throws on non-numeric input; fail with a message instead
    // of an uncaught exception. Also reject non-positive values — runs == 0
    // would otherwise divide by zero when averaging below.
    int size = 0;
    int runs = 0;
    try {
        size = std::stoi(argv[1]);
        runs = std::stoi(argv[2]);
    } catch (const std::exception&) {
        std::cerr << "Invalid arguments: <size> and <runs> must be integers." << std::endl;
        return 1;
    }
    if (size <= 0 || runs <= 0) {
        std::cerr << "<size> and <runs> must be positive." << std::endl;
        return 1;
    }

    // 1. --- OpenCL Setup ---
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if (platforms.empty()) {
        std::cerr << "No OpenCL platforms found." << std::endl;
        return 1;
    }
    cl::Platform platform = platforms.front();

    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if (devices.empty()) {
        // No GPU on this platform — fall back to a CPU device.
        platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
        if (devices.empty()) {
            std::cerr << "No OpenCL devices found." << std::endl;
            return 1;
        }
    }
    cl::Device device = devices.front();
    cl::Context context(device);

    // Create the queue through the C++ wrapper instead of the deprecated
    // clCreateCommandQueue() + raw-handle retain dance: the wrapper owns the
    // handle (RAII) and releases it on every exit path.
    cl_int err = CL_SUCCESS;
    cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        std::cerr << "Failed to create command queue: " << err << std::endl;
        return 1;   // was exit(1): returning unwinds and releases OpenCL objects
    }

    std::vector<cl::Device> devices_to_init = {device};
    try {
        MatrixCL::initializeKernels(context, devices_to_init);
    } catch (const std::exception& e) {
        // Catching std::exception here because initializeKernels wraps cl::Error.
        std::cerr << "FATAL ERROR during kernel initialization: " << e.what() << std::endl;
        // If the error was a BuildError, the log should have been printed
        // by the loadAndBuildProgram function within initializeKernels.
        return 1;
    }

    // 2. --- Matrix Multiplication ---
    // fill_random is seeded with a fixed constant, so regenerating (and
    // re-uploading) the data every iteration produced identical matrices;
    // hoist the setup out of the loop. The timed region is unchanged.
    std::vector<float> dataA = fill_random(size, size);
    std::vector<float> dataB = fill_random(size, size);
    MatrixCL A(size, size, context, queue, &dataA);
    MatrixCL B(size, size, context, queue, &dataB);

    std::chrono::duration<double, std::milli> total_time(0);
    for (int i = 0; i < runs; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
#ifdef FAST_MATMUL
        MatrixCL C = A.fast_matrix_mul(B);
#else
        MatrixCL C = A * B;
#endif
        // Block until the device finishes so the wall-clock time covers the
        // actual kernel execution, not just the enqueue.
        queue.finish();
        auto end = std::chrono::high_resolution_clock::now();
        total_time += end - start;
    }
    std::cout << total_time.count() / runs << std::endl;
    return 0;
}
\ No newline at end of file
......@@ -76,7 +76,7 @@ const std::string kernel_source_transpose = R"(
B[output_idx] = A[input_idx];
}
)";
/*const std::string kernel_source_matrix_mul = R"(
const std::string kernel_source_matrix_mul = R"(
__kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
int row = get_global_id(0);
int col = get_global_id(1);
......@@ -84,9 +84,9 @@ const std::string kernel_source_transpose = R"(
C[row * B_cols + col] += A[row * A_cols + k] * B[k * B_cols + col];
}
}
)";*/
const std::string kernel_source_matrix_mul = R"(
__kernel void matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
)";
const std::string kernel_source_matrix_mul_V2 = R"(
__kernel void fast_matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
int row = get_global_id(0);
int col = get_global_id(1);
......@@ -191,6 +191,9 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi
cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul");
kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul");
cl::Program prog_matrix_mul_V2 = loadAndBuildProgram(context, devices, kernel_source_matrix_mul_V2, "matrix_mul_V2");
kernel_matrix_mul_V2 = cl::Kernel(prog_matrix_mul_V2, "matrix_mul_V2");
cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid");
kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid");
......@@ -374,6 +377,31 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
return result;
}
// Tiled ("V2") matrix multiply: returns C = (*this) * other on the device.
// No shape check is performed here — the caller must guarantee
// cols_ == other.numRows(); a mismatch would read out of bounds on the GPU.
// NOTE(review): the KernelCache fragment declares this member as
// kernel_matrix_mul_v2 (lowercase v) while it is read here as
// kernel_matrix_mul_V2 — one of the two spellings must be wrong; verify.
// NOTE(review): the V2 kernel source defines __kernel fast_matrix_mul, yet
// the compile step requests a kernel named "matrix_mul_V2" — confirm the
// kernel name passed to cl::Kernel matches the source, else creation fails
// with CL_INVALID_KERNEL_NAME.
MatrixCL MatrixCL::matrix_mul_V2(const MatrixCL& other) const {
// Result has this matrix's row count and other's column count.
MatrixCL result(rows_, other.numCols(), context_, queue_);
cl::Kernel kernel = kernels_->kernel_matrix_mul_V2;
kernel.setArg(0, buffer_);
kernel.setArg(1, other.getBuffer());
kernel.setArg(2, result.getBuffer());
kernel.setArg(3, rows_);
kernel.setArg(4, cols_);
kernel.setArg(5, other.numCols());
const size_t TILE_SIZE = 16;
// Align global work size to the nearest multiple of TILE_SIZE
// (required because the local work-group size below is TILE_SIZE x TILE_SIZE).
// Work-items beyond the matrix bounds are launched when the dimensions are
// not multiples of 16 — presumably the kernel guards row/col against
// A_rows/B_cols; TODO confirm the bounds check in the kernel source.
size_t global_rows = ((rows_ + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
size_t global_cols = ((other.numCols() + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
cl::NDRange global_work_size(global_rows, global_cols);
cl::NDRange local_work_size(TILE_SIZE, TILE_SIZE);
// Asynchronous enqueue: the result buffer is only valid after the queue is
// synchronized (e.g. queue_.finish() or a blocking read by the caller).
queue_.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size);
return result;
}
MatrixCL MatrixCL::transpose() const {
MatrixCL result(cols_, rows_, context_, queue_);
......
......@@ -25,6 +25,7 @@ struct KernelCache {
cl::Kernel kernel_sub_mul;
cl::Kernel kernel_transpose;
cl::Kernel kernel_matrix_mul;
cl::Kernel kernel_matrix_mul_v2;
cl::Kernel kernel_sigmoid;
cl::Kernel kernel_sigmoid_backward;
cl::Kernel kernel_bce_elementwise;
......@@ -93,6 +94,8 @@ public:
// Matrix multiplication: C = A * B
MatrixCL operator*(const MatrixCL& other) const;
MatrixCL matrix_mul_V2(const MatrixCL& other) const;
// Transpose: returns a new Matrix that is the transpose (B = A^T)
MatrixCL transpose() const;
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter