Code runs faster with Rcpp than compiled with g++

I am running the exact same code in two ways: compiling it with the g++ compiler, and calling it from R with Rcpp. It turns out that it runs almost 4 times faster when I run it through R.
Why does this happen? Is it because the compiler used by Rcpp is different?
This is the code I'm running in C++:
#include <iostream>
#include <nlopt.hpp>
#include <time.h>

using namespace std;

int main()
{
    //--------------------------------//
    //         Initialization         //
    //--------------------------------//

    // Grid for x
    const int nx = 60000;
    float xgrid[nx];
    const float xstep = 4.0f / (nx - 1); // note: 4 / (nx - 1) is integer division and yields 0
    float it = 0;
    for (int i = 0; i < nx; i++) {
        xgrid[i] = it * xstep;
        it++;
    }

    // Initialize value function V
    size_t sizeV = nx * sizeof(float);
    float *V = (float *)malloc(sizeV);

    //--------------------------------//
    //          Computation           //
    //--------------------------------//

    // Variables for computation time
    double t0 = clock();
    double t = t0;

    float utility;
    float VV = pow(-10.0, 5.0);

    for (int ix = 0; ix < nx; ix++) {
        VV = pow(-10.0, 5.0);
        for (int ixp = 0; ixp < nx; ixp++) {
            utility = (xgrid[ix] + 1 - xgrid[ixp]) * (xgrid[ix] + 1 - xgrid[ixp]);
            if (utility >= VV) {
                VV = utility;
            }
        }
        V[ix] = VV;
    }

    t = clock() - t0;
    cout << "Time: " << ((float)t) / CLOCKS_PER_SEC << " seconds." << endl;
    return 0;
}
To run it I use:
g++ Cpp_main.cpp -o Cpp_main
The code in Rcpp is:
#include <Rcpp.h> // required for sourceCpp / [[Rcpp::export]]
#include <iostream>
#include <nlopt.hpp>
#include <time.h>

using namespace std;

// [[Rcpp::export]]
vector<double> value(int nx){
    //--------------------------------//
    //         Grid creation          //
    //--------------------------------//

    float xgrid[nx];
    const float xstep = 4.0f / (nx - 1); // note: 4 / (nx - 1) is integer division and yields 0
    float it = 0;
    for (int i = 0; i < nx; i++) {
        xgrid[i] = it * xstep;
        it++;
    }

    // Initialize value function V
    vector<double> V;
    V.resize(nx);

    //--------------------------------//
    //          Computation           //
    //--------------------------------//

    // Variables for computation time
    double t0 = clock();
    double t = t0;

    float utility;
    float VV = pow(-10.0, 5.0);

    for (int ix = 0; ix < nx; ix++) {
        VV = pow(-10.0, 5.0);
        for (int ixp = 0; ixp < nx; ixp++) {
            utility = (xgrid[ix] + 1 - xgrid[ixp]) * (xgrid[ix] + 1 - xgrid[ixp]);
            if (utility >= VV) {
                VV = utility;
            }
        }
        V[ix] = VV;
    }

    t = clock() - t0;
    cout << "Time: " << ((float)t) / CLOCKS_PER_SEC << " seconds." << endl;
    return V;
}
And I call it from R with:
library("Rcpp")
sourceCpp("Rcpp_main.cpp")
# Grid for x
nx = 60000;
V = value(nx);
The running time in C++ is twice the running time in Rcpp. Any clues why this happens?

Just looking at your main(), we get this:
edd@rob:/tmp/soQ$ g++ -o main main.cpp
edd@rob:/tmp/soQ$ ./main
Time: 8.42708 seconds.
edd@rob:/tmp/soQ$ g++ -o main -O3 -march=native main.cpp
edd@rob:/tmp/soQ$ ./main
Time: 1.59151 seconds.
edd@rob:/tmp/soQ$
That is already a factor of 5.3, and one of the weirdest examples I have seen in some time of the impact of -O3.
For R, I get about the same time, because R defaults to using -O3 here too.
R> Rcpp::sourceCpp("/tmp/soQ/rcppfunction.cpp")
R> V <- value(60000)
Time: 1.65224 seconds.
R>
So no real mystery here: you used different options, and it mattered.
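If you want the standalone build to match what R does, you can ask R for the compiler and flags it uses (they come from etc/Makeconf and can be overridden in ~/.R/Makevars), then pass the same flags to g++, for example:
$ R CMD config CXX       # the compiler R invokes
$ R CMD config CXXFLAGS  # the flags R uses, e.g. -g -O2 or -O3
$ g++ -O3 -march=native Cpp_main.cpp -o Cpp_main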

Related

CUFFT wrong result for batch 1D ifft

I am new to CUDA and cuFFT. When I was trying to recover the FFT result of cufftExecR2C(...) by applying the corresponding inverse transform cufftExecC2R(...), it went wrong: the recovered data and the original data are not identical.
Here is the code; the CUDA library I used was cuda-9.0.
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cufft.h"
#include <iostream>
#include <sys/time.h>
#include <cstdio>
#include <cmath>
using namespace std;
// cuda error check
#define gpuErrchk(ans) {gpuAssrt((ans), __FILE__, __LINE__);}
inline void gpuAssrt(cudaError_t code, const char* file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
// ifft scale for cufft
__global__ void IFFTScale(int scale_, cufftReal* real) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
real[idx] *= 1.0 / scale_;
}
void batch_1d_irfft2_test() {
const int BATCH = 3;
const int DATASIZE = 4;
/// RFFT
// --- Host side input data allocation and initialization
cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
hostInputData[i * DATASIZE + j] = (cufftReal)(i * DATASIZE + j + 1);
}
}
// DEBUG:print host input data
cout << "print host input data" << endl;
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
cout << hostInputData[i * DATASIZE + j] << ", ";
}
cout << endl;
}
cout << "=====================================================" << endl;
// --- Device side input data allocation and initialization
cufftReal *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Device side output data allocation
cufftComplex *deviceOutputData;
gpuErrchk(cudaMalloc(
(void**)&deviceOutputData,
(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
// Host sice input data copied to Device side
cudaMemcpy(deviceInputData,
hostInputData,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyHostToDevice);
// --- Batched 1D FFTs
cufftHandle handle;
int rank = 1; // --- 1D FFTs
int n[] = {DATASIZE}; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH; // --- Number of batched executions
cufftPlanMany(
&handle,
rank,
n,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_R2C,
batch);
cufftExecR2C(handle, deviceInputData, deviceOutputData);
// **************************************************************************
/// IRFFT
cufftReal *deviceOutputDataIFFT;
gpuErrchk(cudaMalloc((void**)&deviceOutputDataIFFT, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Batched 1D IFFTs
cufftHandle handleIFFT;
int n_ifft[] = {DATASIZE / 2 + 1}; // --- Size of the Fourier transform
idist = DATASIZE / 2 + 1; odist = DATASIZE; // --- Distance between batches
cufftPlanMany(
&handleIFFT,
rank,
n_ifft,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_C2R,
batch);
cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);
/* scale
// dim3 dimGrid(512);
// dim3 dimBlock(max((BATCH * DATASIZE + 512 - 1) / 512, 1));
// IFFTScale<<<dimGrid, dimBlock>>>((DATASIZE - 1) * 2, deviceOutputData);
*/
// host output data for ifft
cufftReal *hostOutputDataIFFT = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
cudaMemcpy(hostOutputDataIFFT,
deviceOutputDataIFFT,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyDeviceToHost);
// print IFFT recovered host output data
cout << "print host output IFFT data" << endl;
for (int i=0; i<BATCH; i++) {
for (int j=0; j<DATASIZE; j++) {
cout << hostOutputDataIFFT[i * DATASIZE + j] << ", ";
}
printf("\n");
}
cufftDestroy(handle);
gpuErrchk(cudaFree(deviceOutputData));
gpuErrchk(cudaFree(deviceInputData));
gpuErrchk(cudaFree(deviceOutputDataIFFT));
free(hostOutputDataIFFT);
free(hostInputData);
}
int main() {
batch_1d_irfft2_test();
return 0;
}
I compile the 'rfft_test.cu' file with nvcc -o rfft_test rfft_test.cu -lcufft. The result is as below:
print host input data
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
=====================================================
print IFFT recovered host output data
6, 8.5359, 15.4641, 0,
22, 24.5359, 31.4641, 0,
38, 40.5359, 47.4641, 0,
Specifically, I checked the scale issue for cufftExecC2R(...) and commented out the IFFTScale() kernel. I therefore expected the recovered output to be DATASIZE * the original batched 1D data, but even so, the result is not as expected.
I have checked the cuFFT manual and my code several times, and I also searched some NVIDIA forums and Stack Overflow answers, but I didn't find a solution. Any help is greatly appreciated.
Thanks in advance.
The size of your inverse transform is incorrect: it should be DATASIZE, not DATASIZE/2+1.
Following sections of cuFFT docs should help:
https://docs.nvidia.com/cuda/cufft/index.html#data-layout
https://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional
"In C2R mode an input array ( x 1 , x 2 , … , x ⌊ N 2 ⌋ + 1 ) of only non-redundant complex elements is required." - N is transform size you pass to plan function

C code translated to Qt crashes while executing

I'm a beginner at programming and Qt, but as I liked the framework, I'm trying to improve my skills and write my C++ code in it. I was given the task of writing a Ricker wavelet code and then plotting it.
I divided it into two tasks: first make the Ricker code work, and once it is running, implement a way to plot it; I will use QCustomPlot for that.
I got a code in C and I'm trying to adapt it to Qt. Although it doesn't give any errors during compilation, it crashes when executing, with the following message:
Invalid parameter passed to C runtime function. C:/Users/Flavio/Documents/qtTest/build-ricker2-Desktop_Qt_5_11_0_MinGW_32bit-Debug/debug/ricker2.exe
exited with code 255
The code I'm supposed to translate is:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

float *rickerwavelet(float fpeak, float dt, int *nwricker);

int main(int argc, char **argv)
{
    int i;
    float dt;
    float fpeak;
    float *wricker = NULL;
    int nwricker;
    fpeak = atof(argv[1]);
    dt = atof(argv[2]);
    wricker = rickerwavelet(fpeak, dt, &nwricker);
    /* show value of ricker wavelets */
    for (i = 0; i < nwricker; i++)
        printf("%i. %3.5f \n", i, wricker[i]);
    free(wricker);
    return (1);
}

/* ricker wavelet function, return an array ricker wavelets */
float *rickerwavelet(float fpeak, float dt, int *nwricker)
{
    int i, k;
    int nw;
    int nc;
    float pi;
    float nw1, alpha, beta;
    float *wricker = NULL;
    pi = 3.141592653589793;
    nw1 = 2.2 / fpeak / dt;
    nw = 2 * floor(nw1 / 2) + 1;
    nc = floor(nw / 2);
    wricker = (float *)calloc(nw, sizeof(float));
    for (i = 0; i < nw; i++)
    {
        k = i + 1;
        alpha = (nc - k + 1) * fpeak * dt * pi;
        beta = pow(alpha, 2.0);
        wricker[i] = (1 - (beta * 2)) * exp(-beta);
    }
    (*nwricker) = nw;
    return (wricker);
}
The code I wrote in Qt is:
#include <QCoreApplication>
#include <qmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <QDebug>

int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);
    int i, k, nw, nc;
    double *wricker = NULL;
    int nwricker = 60;
    int wavelet_freq = 30;
    int polarity = 1;
    int sampling_rate = 0.004;
    float nw1, alpha, beta;
    const double pi = 3.141592653589793238460;
    nw1 = 2.2 / wavelet_freq / sampling_rate;
    nw = 2 * floor(nw1 / 2) + 1;
    nc = floor(nw / 2);
    wricker = (double*)calloc(nw, sizeof(double));
    for (i = 0; i < nw; i++)
    {
        k = i + 1;
        alpha = (nc - k + 1) * wavelet_freq * sampling_rate * pi;
        beta = pow(alpha, 2.0);
        wricker[i] = polarity * ((1 - (beta * 2)) * exp(-beta));
    };
    /* show value of ricker wavelets */
    for (i = 0; i < nwricker; i++)
    {
        qDebug() << i << wricker[i];
    };
    free(wricker);
    return a.exec();
}
Analytic expression
The amplitude A of the Ricker wavelet with peak frequency f at time t is computed like so:
A = (1 - 2*pi^2*f^2*t^2) * e^(-pi^2*f^2*t^2)
A Python version of it would be:
import numpy as np
import matplotlib.pyplot as plt

def ricker(f, length=0.128, dt=0.001):
    t = np.arange(-length/2, (length-dt)/2, dt)
    y = (1.0 - 2.0*(np.pi**2)*(f**2)*(t**2)) * np.exp(-(np.pi**2)*(f**2)*(t**2))
    return t, y

f = 25 # A low frequency of 25 Hz
t, w = ricker(f)
Which seems quite simple.
Does anyone have any idea what is wrong in my code?
Doing a bit of debugging, I found the problem occurs when passing the vectors to qDebug; it gives this message:
THE INFERIOR STOPPED BECAUSE IT RECEIVED A SIGNAL FROM THE OPERATING SYSTEM. SIGNAL NAME: SIGSEGV SIGNAL MEANING: SEGMENTATION FAULT
I'll search for more information on what this signal means. I used qDebug only with the intention of showing the data on a terminal, to make sure the calculation was working.
Thanks in advance.
C++ is much more like Python than C. Even though your C code was particularly convoluted, it still isn't as nice as C++ can be.
A complete example that even plots the data can be very, very simple. If that doesn't show the combined power of C++ and Qt, I hardly know what will :)
wavelet-plot-50690312.pro file
QT = charts
SOURCES = main.cpp
main.cpp
// https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312
#include <QtCharts>
#include <cmath>

const double pi = 3.14159265358979323846;

QVector<QPointF> ricker(double f, double length = 2.0, double dt = 0.001) {
    size_t N = (length - dt/2.0)/dt;
    QVector<QPointF> w(N);
    for (size_t i = 0; i < N; ++i) {
        double t = -length/2 + i*dt;
        w[i].setX(t);
        w[i].setY((1.0 - 2*pi*pi*f*f*t*t) * exp(-pi*pi*f*f*t*t));
    }
    return w;
}

QLineSeries *rickerSeries(double f) {
    auto *series = new QLineSeries;
    series->setName(QStringLiteral("Ricker Wavelet for f=%1").arg(f, 2));
    series->replace(ricker(f));
    return series;
}

int main(int argc, char *argv[]) {
    QApplication app(argc, argv);
    QChartView view;
    view.chart()->addSeries(rickerSeries(1.0));
    view.chart()->addSeries(rickerSeries(2.0));
    view.chart()->createDefaultAxes();
    view.setMinimumSize(800, 600);
    view.show();
    return app.exec();
}
Of course, this looks nice in C++. How about C? Let's pretend we had some C binding for Qt. Then it might look as follows:
// https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312/c-binding
#include "qc_binding.h"
#include <math.h>
#include <stddef.h>
#include <stdio.h>

const double pi = 3.14159265358979323846;

CQVector *ricker(double f, double length, double dt) {
    scope_enter();
    size_t N = (length - dt/2.0)/dt;
    CQVector *vector = CQVector_size_(CQ_, CQPointF_type(), N);
    CQPointF *const points = CQPointF_(CQVector_data_at(vector, 0));
    for (size_t i = 0; i < N; ++i) {
        double t = -length/2 + i*dt;
        points[i].x = t;
        points[i].y = (1.0 - 2*pi*pi*f*f*t*t) * exp(-pi*pi*f*f*t*t);
    }
    return scope_leave_ptr(vector);
}

CQLineSeries *rickerSeries(double f) {
    scope_enter();
    CQLineSeries *series = CQLineSeries_(CQ_);
    CQLineSeries_setName(series, CQString_asprintf(CQ_, "Ricker Wavelet for f=%.2f", f));
    CQLineSeries_replaceVector(series, ricker(f, 2.0, 0.001));
    return scope_leave_ptr(series);
}

int main(int argc, char *argv[]) {
    scope_enter();
    CQApplication *app = CQApplication_(CQ_, &argc, argv);
    CQChartView *view = CQChartView_(CQ_);
    CQChart *chart = CQChartView_getChart(view);
    CQChart_addLineSeries(chart, rickerSeries(1.0));
    CQChart_addLineSeries(chart, rickerSeries(2.0));
    CQChart_createDefaultAxes(chart);
    CQWidget *view_ = CQWidget_cast(view);
    CQWidget_setMinimumSize(view_, 800, 600);
    CQWidget_show(view_);
    return scope_leave_int(CQApplication_exec(app));
}
With a little bit of work, a C binding can be made that is about as easy to use as C++: scopes are tracked, RAII works, destructors get called when needed, values about to be returned are not destructed, etc.
Such a minimum binding, implementing all that's needed to run the code above, is available at https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312/c-binding.

Why is the mclapply function in R more efficient than Rcpp + OpenMP?

I have a function with a loop (EstimateUniques) that is parallelized with OpenMP. I expected multithreading to be more efficient than multiprocessing, but when I compared this function with a simple run of mclapply, it showed lower performance. What is the proper way to achieve the same level of parallelization in C++ as in R? Am I doing something wrong?
Performance comparison (time in seconds):
#Cores    CPP       R
1         1.721s    1.538s
2         1.945s    1.080s
3         2.858s    0.801s
R code:
Rcpp::sourceCpp('ReproducibleExample.cpp')
arr <- 1:10000
n_rep <- 150
n_iters <- 200

EstimateUniquesR <- function(arr, n_iters, n_rep, cores) {
  parallel::mclapply(1:n_iters, function(i)
    GetNumberOfUniqSamples(arr, i * 10, n_rep), mc.cores=cores)
}

cpp_times <- sapply(1:3, function(threads)
  system.time(EstimateUniques(arr, n_iters, n_rep, threads))['elapsed'])
r_times <- sapply(1:3, function(cores)
  system.time(EstimateUniquesR(arr, n_iters, n_rep, cores))['elapsed'])
data.frame(CPP=cpp_times, R=r_times)
ReproducibleExample.cpp file:
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>  // required for sourceCpp / [[Rcpp::export]]
#include <algorithm>
#include <cmath>   // for std::round
#include <cstdlib> // for rand()
#include <vector>
#include <omp.h>

// [[Rcpp::export]]
int GetNumberOfUniqSamples(const std::vector<int> &bs_array, int size, unsigned n_rep) {
  unsigned long sum = 0;
  for (unsigned i = 0; i < n_rep; ++i) {
    std::vector<int> uniq_vals(size);
    for (int try_num = 0; try_num < size; ++try_num) {
      uniq_vals[try_num] = bs_array[rand() % bs_array.size()];
    }
    std::sort(uniq_vals.begin(), uniq_vals.end());
    sum += std::distance(uniq_vals.begin(), std::unique(uniq_vals.begin(), uniq_vals.end()));
  }
  return std::round(double(sum) / n_rep);
}

// [[Rcpp::export]]
std::vector<int> EstimateUniques(const std::vector<int> &bs_array, const int n_iters,
                                 const int n_rep = 1000, const int threads = 1) {
  std::vector<int> uniq_counts(n_iters);
  #pragma omp parallel for num_threads(threads) schedule(dynamic)
  for (int i = 0; i < n_iters; ++i) {
    uniq_counts[i] = GetNumberOfUniqSamples(bs_array, (i + 1) * 10, n_rep);
  }
  return uniq_counts;
}
I tried to use other types of scheduling in OpenMP, but they gave even worse results.
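No answer is recorded here, but one likely contributor (my conjecture, not something from the thread) is that rand() maintains a single global state, so calling it from several OpenMP threads at once either serializes the threads or makes them fight over the same state. A minimal sketch of a thread-safe variant, using one C++11 engine per thread (GetNumberOfUniqSamplesTS is a hypothetical name):
// Sketch: replace the shared rand() with a per-thread std::mt19937,
// removing cross-thread contention inside the OpenMP loop.
#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

int GetNumberOfUniqSamplesTS(const std::vector<int> &bs_array, int size, unsigned n_rep) {
  thread_local std::mt19937 gen(std::random_device{}()); // one engine per thread
  std::uniform_int_distribution<size_t> pick(0, bs_array.size() - 1);
  unsigned long sum = 0;
  for (unsigned i = 0; i < n_rep; ++i) {
    std::vector<int> uniq_vals(size);
    for (int try_num = 0; try_num < size; ++try_num) {
      uniq_vals[try_num] = bs_array[pick(gen)];
    }
    std::sort(uniq_vals.begin(), uniq_vals.end());
    sum += std::distance(uniq_vals.begin(), std::unique(uniq_vals.begin(), uniq_vals.end()));
  }
  return (int)std::round(double(sum) / n_rep);
}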

Implementing the RC4 algorithm

I need to implement the RC4 algorithm with the seed 1 2 3 6 and the plaintext "cryptology". I am following the guideline we were provided in class, but it's not initializing S correctly.
[screenshots comparing my output with the expected output omitted]
My code was previously printing negative values; not sure why, but I managed to fix that error. I thought everything was good to go, but it's not. Sorry for the pictures, I figured it was easier to explain what I was following for my code structure. I take the seed index mod 4 since the seed contains 4 characters; could that possibly be my error?
#include <iostream>
#include <string>
#include <string.h>
#include <cstdlib> // for system()

using std::endl;
using std::string;

void swap(unsigned int *x, unsigned int *y);

int main()
{
    string plaintext = "cryptology";
    char cipherText[256] = { ' ' };
    unsigned int S[256] = { 0 };
    unsigned int t[256] = { 0 };
    unsigned int seed[4] = { 1, 2, 3, 6 }; // seed used for test case 1
    unsigned int temp = 0;
    int runningTotal = 0;
    unsigned int key = 0;

    // initializing s and t
    for (int i = 0; i < 256; i++)
    {
        S[i] = i;
        t[i] = seed[i % 4];
    }
    for (int i = 0; i < 256; i++)
    {
        runningTotal += S[i] + t[i];
        runningTotal %= 256;
        swap(&S[runningTotal], &S[i]);
        std::cout << S[i] << " ";
    }
    runningTotal = 0;
    for (int i = 0; i < plaintext.size(); i++)
    {
        runningTotal %= 256;
        swap(&S[i], &S[runningTotal]);
        temp = (unsigned int)S[i] + (unsigned int)S[runningTotal];
        temp %= 256;
        key = S[temp];
        std::cout << endl;
        cipherText[i] = plaintext[i] ^ key;
    }
    std::cout << " this is cipher text " << endl;
    std::cout << cipherText << endl;
    system("pause");
    return 0;
}

void swap(unsigned int *x, unsigned int *y)
{
    unsigned int temp = 0;
    temp = *x;
    *x = *y;
    *y = temp;
}
Actually, I think you're generating S[] correctly. I can only assume you're supposed to do something different with the key. (Perhaps it's an ASCII string instead of four byte values? Check your assignment notes.)
There is a problem later on, however. In the stream generation loop, you're supposed to do the increment and swap operations before you fetch a byte from S[]:
int i = 0;        // S[] index; starts at 0, like runningTotal
runningTotal = 0;
for (int k = 0; k < plaintext.size(); k++)
{
    i = (i + 1) % 256;                          // increment S[] index
    runningTotal = (runningTotal + S[i]) % 256; // then swap bytes
    swap(&S[i], &S[runningTotal]);
    temp = (S[i] + S[runningTotal]) % 256;      // fetch byte from S and
    cipherText[k] = plaintext[k] ^ S[temp];     // XOR with plaintext
}
NOTE: Although unrelated to your question, your code could be made a lot tidier by using unsigned char values instead of ints. That would eliminate the % 256 instructions that are littered all over the place. (But be careful during initialization, because i<256 will always be true if i is an unsigned char.)
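For illustration, here is a sketch of that suggestion (my code, not part of the original answer): with unsigned char indices and S stored as unsigned char S[256], the wrap-around happens by itself.
// Sketch: unsigned char arithmetic wraps at 256 automatically, so the
// explicit % 256 operations disappear. std::swap comes from <utility>.
unsigned char i = 0, j = 0;
for (size_t k = 0; k < plaintext.size(); k++)
{
    i++;                           // 255 + 1 wraps back to 0
    j += S[i];
    std::swap(S[i], S[j]);
    unsigned char t = S[i] + S[j]; // sum also wraps mod 256
    cipherText[k] = plaintext[k] ^ S[t];
}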

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimensional vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000 and localSize = 128, globalSize should be 1024? Can we always set globalSize to some multiple of localSize that is larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2D problems, for example multiplying two matrices of dimension m*n and n*p where the result matrix is of order m*p, things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I found the indices where the matrix elements are wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...)
Why?
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
                  global float *A, global float *B, global float *C)
{
    int i = get_global_id(1);
    int j = get_global_id(0);
    if (i < 0 || j < 0 || i > Mdim || j > Pdim) return;
    else
    {
        float tmp = 0;
        for (int k = 0; k < Ndim; k++)
            tmp += A[i*Ndim+k] * B[k*Pdim+j];
        C[i*Pdim + j] = tmp;
    }
}
And here is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>

using namespace cl;

int main()
{
    // Create the two input matrices
    int m = 1000;
    int n = 1000;
    int p = 1000;
    float *A = new float[m*n];
    float *B = new float[n*p];
    for (int i = 0; i < m*n; i++)
    {
        A[i] = i;
    }
    for (int i = 0; i < n*p; i++)
    {
        B[i] = i;
    }

    try
    {
        // Get available platforms
        vector<Platform> platforms;
        Platform::get(&platforms);

        // Select the default platform and create a context using this platform and the GPU
        cl_context_properties cps[3] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(platforms[0])(),
            0
        };
        Context context(CL_DEVICE_TYPE_GPU, cps);

        // Get a list of devices on this platform
        vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create a command queue and use the first device
        CommandQueue queue = CommandQueue(context, devices[0]);

        // Read source file
        std::ifstream sourceFile("mmul.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));

        // Make program of the source code in the context
        Program program = Program(context, source);

        // Build program for these specific devices
        program.build(devices);

        // Make kernel
        Kernel kernel(program, "mmult");

        // Create memory buffers
        Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
        Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
        Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));

        // Copy lists A and B to the memory buffers
        queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
        queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);

        // Set arguments to kernel
        kernel.setArg(0, m);
        kernel.setArg(1, n);
        kernel.setArg(2, p);
        kernel.setArg(3, bufferA);
        kernel.setArg(4, bufferB);
        kernel.setArg(5, bufferC);

        // Run the kernel on specific ND range
        NDRange global((ceil((float)(p)/16))*16, (ceil((float)(m)/8))*8);
        NDRange local(16, 8);
        queue.enqueueNDRangeKernel(kernel, NullRange, global, local);

        // Read buffer C into a local list
        float *C = new float[m*p];
        queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);

        // check the correctness of the result
        float *c = new float[m*p];
        for (int i = 0; i < m; i++)
            for (int j = 0; j < p; j++)
            {
                float z = 0.0;
                for (int k = 0; k < n; k++)
                {
                    z += A[i*n+k] * B[k*p+j];
                }
                c[i*p+j] = z;
            }
        for (int i = 0; i < m*p; i++)
        {
            if (fabs(c[i]-C[i]) > 0.001)
                std::cout << i << " " << c[i] << " " << C[i] << std::endl;
        }
        delete []A;
        delete []B;
        delete []C;
    }
    catch (Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }
    return 0;
}
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
Let's assume that you have a float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine the size of the local group first:
size_t localSize[] = {16, 8};
Then determine how many local groups you need. Note that plain ceil(size/localSize[0]) would truncate before ceil is applied, so use integer ceiling division:
size_t numLocalGroups[] = {(size + localSize[0] - 1) / localSize[0],
                           (size + localSize[1] - 1) / localSize[1]};
Finally, determine the NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in the right-most local groups.
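Putting the pieces together for the question's m*p result matrix, the enqueue might look like this (a sketch reusing the question's host variables; dimension 0 maps to columns j and dimension 1 to rows i, matching the kernel):
// Round each global dimension up to a multiple of the local size; the
// kernel's bounds check then discards the padded work-items.
size_t localSize[2] = {16, 8};
size_t globalCols = ((p + localSize[0] - 1) / localSize[0]) * localSize[0]; // get_global_id(0) -> j
size_t globalRows = ((m + localSize[1] - 1) / localSize[1]) * localSize[1]; // get_global_id(1) -> i
queue.enqueueNDRangeKernel(kernel, NullRange,
                           NDRange(globalCols, globalRows),
                           NDRange(localSize[0], localSize[1]));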
