Skip to content
Extraits de code Groupes Projets
Valider b7861e74 rédigé par JordanHanotiaux's avatar JordanHanotiaux
Parcourir les fichiers

u

parent 01315542
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
CXX = g++
CXXFLAGS = -std=c++17 -Wall -Wextra -O3
CXXFLAGS = -std=c++17 -Wall -Wextra -O0
TARGET = main
SRC = matrix_opencl.cpp mlp_sgd.cpp globals.cpp
LIBS = -lOpenCL
SRC = matrix_opencl.cpp benchmark.cpp mlp_sgd.cpp globals.cpp
all:
$(MAKE) clean && $(MAKE) run
$(TARGET): $(SRC)
$(CXX) $(CXXFLAGS) -o $(TARGET)$(SUFFIX) $(SRC) main.cpp $(LIBS)
$(CXX) $(CXXFLAGS) -o $(TARGET) $(SRC) -lOpenCL
run: $(TARGET)
./$(TARGET)
fast: benchmark.cpp
$(CXX) $(CXXFLAGS) -DFAST_MATMUL -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC)
naive: benchmark.cpp
$(CXX) $(CXXFLAGS) -o benchmark$(SUFFIX) benchmark.cpp $(LIBS) $(SRC)
clean:
rm -f $(TARGET)
rm -f benchmark
.PHONY: all run clean
\ No newline at end of file
......@@ -85,47 +85,23 @@ const std::string kernel_source_matrix_mul = R"(
}
}
)";
const std::string kernel_source_fast_matrix_mul = R"(
__kernel void fast_matrix_mul(__global const float* A, __global const float* B, __global float* C, int A_rows, int A_cols, int B_cols) {
int row = get_global_id(0);
int col = get_global_id(1);
int local_row = get_local_id(0);
int local_col = get_local_id(1);
const int TILE_SIZE = 16;
__local float As[TILE_SIZE][TILE_SIZE];
__local float Bs[TILE_SIZE][TILE_SIZE];
float sum = 0.0f;
for (int t = 0; t < (A_cols + TILE_SIZE - 1) / TILE_SIZE; t++) {
int tile_row = t * TILE_SIZE + local_row;
int tile_col = t * TILE_SIZE + local_col;
if (row < A_rows && tile_col < A_cols)
As[local_row][local_col] = A[row * A_cols + tile_col];
else
As[local_row][local_col] = 0.0f;
if (tile_row < A_cols && col < B_cols)
Bs[local_row][local_col] = B[tile_row * B_cols + col];
else
Bs[local_row][local_col] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < TILE_SIZE; ++k)
sum += As[local_row][k] * Bs[k][local_col];
barrier(CLK_LOCAL_MEM_FENCE);
/*const std::string kernel_source_matrix_mul = R"(
__kernel void matrix_mul(__global const float* A,__global const float* B, __global float* C, int M, int K, int N) {
int i = get_global_id(0);
float Awrk[1024];
for (int k = 0; k < K; ++k) {
Awrk[k] = A[i * K + k];
}
if (row < A_rows && col < B_cols) {
C[row * B_cols + col] = sum;
for (int j = 0; j < N; ++j) {
float tmp = 0.0f;
for (int k = 0; k < K; ++k) {
tmp += Awrk[k] * B[k * N + j];
}
C[i * N + j] = tmp;
}
}
)";
)";*/
const std::string kernel_source_sigmoid = R"(
__kernel void sigmoid(__global const float* input, __global float* output, int rows, int cols) {
int idx = get_global_id(0);
......@@ -191,9 +167,6 @@ void KernelCache::compileKernels(cl::Context context, const std::vector<cl::Devi
cl::Program prog_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_matrix_mul, "matrix_mul");
kernel_matrix_mul = cl::Kernel(prog_matrix_mul, "matrix_mul");
cl::Program prog_fast_matrix_mul = loadAndBuildProgram(context, devices, kernel_source_fast_matrix_mul, "fast_matrix_mul");
kernel_fast_matrix_mul = cl::Kernel(prog_fast_matrix_mul, "fast_matrix_mul");
cl::Program prog_sigmoid = loadAndBuildProgram(context, devices, kernel_source_sigmoid, "sigmoid");
kernel_sigmoid = cl::Kernel(prog_sigmoid, "sigmoid");
......@@ -377,31 +350,6 @@ MatrixCL MatrixCL::operator*(const MatrixCL& other) const {
return result;
}
// Tiled matrix product C = A * B, where A is this matrix and B is `other`.
// Allocates a rows_ x other.numCols() result, binds the operands and their
// dimensions to the cached "fast_matrix_mul" kernel and enqueues it on queue_.
// The enqueue call is the last device interaction here; nothing in this
// function waits for the kernel to finish.
MatrixCL MatrixCL::fast_matrix_mul(const MatrixCL& other) const {
    // Edge length of one square work-group. Must stay equal to the TILE_SIZE
    // constant declared inside the fast_matrix_mul kernel source.
    const size_t tile = 16;

    // Smallest multiple of `tile` that is >= n.
    const auto pad_to_tile = [tile](size_t n) {
        return ((n + tile - 1) / tile) * tile;
    };

    const auto out_cols = other.numCols();
    MatrixCL product(rows_, out_cols, context_, queue_);

    cl::Kernel tiled_mul = kernels_->kernel_fast_matrix_mul;
    tiled_mul.setArg(0, buffer_);              // A
    tiled_mul.setArg(1, other.getBuffer());    // B
    tiled_mul.setArg(2, product.getBuffer());  // C
    tiled_mul.setArg(3, rows_);                // A_rows
    tiled_mul.setArg(4, cols_);                // A_cols
    tiled_mul.setArg(5, out_cols);             // B_cols

    // The global range covers the output padded up to whole tiles in both
    // dimensions, so it is an exact multiple of the local range.
    const cl::NDRange local_range(tile, tile);
    const cl::NDRange global_range(pad_to_tile(rows_), pad_to_tile(out_cols));

    queue_.enqueueNDRangeKernel(tiled_mul, cl::NullRange, global_range, local_range);
    return product;
}
MatrixCL MatrixCL::transpose() const {
MatrixCL result(cols_, rows_, context_, queue_);
......
......@@ -25,7 +25,6 @@ struct KernelCache {
cl::Kernel kernel_sub_mul;
cl::Kernel kernel_transpose;
cl::Kernel kernel_matrix_mul;
cl::Kernel kernel_fast_matrix_mul;
cl::Kernel kernel_sigmoid;
cl::Kernel kernel_sigmoid_backward;
cl::Kernel kernel_bce_elementwise;
......@@ -94,9 +93,6 @@ public:
// Matrix multiplication: C = A * B
MatrixCL operator*(const MatrixCL& other) const;
// Fast matrix multiplication: C = A * B (optimized for large matrices)
MatrixCL fast_matrix_mul(const MatrixCL& other) const;
// Transpose: returns a new Matrix that is the transpose (B = A^T)
MatrixCL transpose() const;
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter