Related
Lot of code to post because I can't really tell where the issue is. I am trying to run this on an ESP32 chip and am writing the code in vscode with platformio.
The function
// Returns the index of the schedule point that currentTime currently belongs to.
// NOTE(review): the loop lets point reach maxPoints - 1 while the body indexes
// _points[point + 1]; with _points declared as point _points[maxPoints], the last
// iteration reads one element past the end of the array.
byte _previousPoint(){
for (byte point = 0; point < maxPoints; point ++){ //loop through active points.
// The following point is not active, so this point is returned as the result.
if (_points[point + 1].getActive() != 1){
return point;
}
else {
// Times of this point and of the following one, as computed by point_seconds().
long t1 = point_seconds(point);
long t2 = point_seconds(point + 1);
if (t1 <= currentTime){ //If current time is after this point.
if (t2 > currentTime){ //If current time is less than the next point (it is sandwiched).
return point;
}
}
}
}
// No point matched: fall back to index 0.
return 0;
}
Is not behaving as I expect. And it seems the cause is that the function
// Returns the stored _active flag of this point.
bool getActive(){
return _active;
}
under the class point is returning not a bool but ascending numbers for each channel. point.getActive() should return a bool value, and that bool value should default to 0 (as it is set in the class definition), which is why this makes no sense. The point that is returning the erroneous values is the last one in the _points[maxPoints] array for each channel, and that value seems to match the channel numbers (for red it is 0, for green it is 1, for blue it is 2, etc.)
If I print the value _active upon running the point.setActive() function it comes out correctly. So something is wrong with "getting" it later.
Here is full code below. Let me know if you need some clarification because I know it's a lot. And thanks for anybody patient enough to help.
#include <Arduino.h>
// Current time in whole seconds; loop() refreshes it from millis() / 1000.
long currentTime;
// Value of currentTime at the most recent call to setIntensities() (see loop()).
long lastUp;
// Number of entries of channels[] that setIntensities() and setup() iterate over.
byte totalChannels = 4;
// Number of elements in each channel's _points array.
const byte maxPoints = 3;
// One slot of a channel's schedule: a day, a time of day (hour, minute,
// second), an intensity, a mode and a flag telling whether the slot is in use.
// Every field starts at zero / inactive. The channel class uses the mode to
// pick the interpolation (0 = linear, 1 = cosine-shaped).
class point {
public:
    point() = default;

    // Puts every field back to its default value (inactive, all zeros).
    void clear() {
        *this = point();
    }

    // Read accessors. They are const so that a point can be inspected through
    // a const reference or a const object.
    bool getActive() const {
        return _active;
    }
    uint getDay() const {
        return _day;
    }
    byte getHour() const {
        return _hour;
    }
    byte getMinute() const {
        return _minute;
    }
    byte getSecond() const {
        return _second;
    }
    int getIntensity() const {
        return _intensity;
    }
    byte getMode() const {
        return _mode;
    }

    // Write accessors.
    void setActive(bool active) {
        _active = active;
    }
    void setDay(uint day) {
        _day = day;
    }
    void setHour(byte hour) {
        _hour = hour;
    }
    void setMinute(byte minute) {
        _minute = minute;
    }
    void setSecond(byte second) {
        _second = second;
    }
    // The value is stored in an int but arrives as a byte, so only 0-255 can be set.
    void setIntensity(byte intensity) {
        _intensity = intensity;
    }
    void setMode(byte mode) {
        _mode = mode;
    }

private:
    bool _active = false;
    uint _day = 0;
    byte _hour = 0;
    byte _minute = 0;
    byte _second = 0;
    int _intensity = 0;
    byte _mode = 0;
};
// One LED colour channel: a fixed table of maxPoints schedule points, a colour
// label and the value handed to ledcWrite() as its first argument (see
// setIntensities()).
//
// getIntensity() finds the pair of points that surrounds the global
// currentTime and interpolates between their intensities.
class channel {
public:
    // color: label of the LED colour; pin: value later returned by getPin().
    channel(String color, byte pin) : _pin(pin), _color(color) {
        // NOTE(review): init() is not a member of this class; presumably it
        // resolves to a function supplied by the Arduino core - confirm.
        init();
    }

    // Fills newPoint with the given values, marks it active and stores a copy
    // of it in slot `row` (for debug). A row outside the table is ignored
    // instead of writing past the end of _points.
    void setpoint(byte row, point &newPoint, uint day, byte hour, byte minute, byte second, byte intensity, byte mode){
        if (row >= maxPoints){
            return;
        }
        newPoint.setDay(day);
        newPoint.setHour(hour);
        newPoint.setMinute(minute);
        newPoint.setSecond(second);
        newPoint.setIntensity(intensity);
        newPoint.setMode(mode);
        newPoint.setActive(true);
        _points[row] = newPoint;
    }

    // Returns whether slot `point` is active. The slot itself is inspected
    // (index point, not point + 1), and an index outside the table reports
    // false rather than reading beyond the array.
    bool getPoint(byte point){
        return _isActive(point);
    }

    // Resets every slot of the table to its inactive default.
    void clearAllPoints(){
        for (auto &slot : _points){
            slot.clear();
        }
    }

    void setPin(byte pin){
        _pin = pin;
    }

    byte getPin(){
        return _pin;
    }

    // Returns the intensity for the global currentTime, interpolated between
    // the point in effect and the one that follows it. The mode of the point in
    // effect selects the curve: 0 = linear, 1 = cosine-shaped. Any other mode
    // holds the intensity of the point in effect (previously the result was an
    // uninitialised value).
    uint getIntensity(){
        const byte point1 = _previousPoint();
        const byte point2 = _nextPoint(point1);
        float level;
        switch (_points[point1].getMode()){
            case 0:
                level = _interpolate_lin(point1, point2);
                break;
            case 1:
                level = _interpolate_sin(point1, point2);
                break;
            default:
                level = _points[point1].getIntensity();
                break;
        }
        // Converting a negative (or NaN) float to an unsigned type is undefined,
        // so those results are reported as 0.
        if (!(level > 0)){
            return 0;
        }
        return (uint)level;
    }

private:
    //class attributes
    point _points[maxPoints]; //schedule table, maxPoints slots per channel.
    byte _pin; //PWM pin output for channel
    String _color; //LED color

    // True when `index` lies inside the table and that slot is active. All
    // look-ahead accesses (index + 1) go through here so they can never read
    // beyond _points.
    bool _isActive(unsigned int index){
        return index < maxPoints && _points[index].getActive();
    }

    // Straight-line interpolation between point1 and point2 at currentTime.
    // Points that share the same time give a slope of 0.
    float _interpolate_lin(byte point1, byte point2){
        const float idiff = _points[point2].getIntensity() - _points[point1].getIntensity();
        const float tdiff = point_seconds(point2) - point_seconds(point1);
        const float m = (tdiff != 0) ? (idiff / tdiff) : 0;
        const float t = currentTime - point_seconds(point1);
        const float b = _points[point1].getIntensity();
        //linear equation result
        return (m * t) + b;
    }

    // Half-cosine interpolation between point1 and point2 at currentTime.
    // Points that share the same time return the intensity of point1 instead
    // of dividing by zero.
    float _interpolate_sin(byte point1, byte point2){
        const float amplitude = _points[point2].getIntensity() - _points[point1].getIntensity();
        const float tdiff = point_seconds(point2) - point_seconds(point1);
        if (tdiff == 0){
            return _points[point1].getIntensity();
        }
        const float a = (-0.5 * amplitude);
        const float b = PI / tdiff; // same value as (2 * PI) / (2 * tdiff)
        const float t = (currentTime - point_seconds(point1));
        const float d = 0.5 * (amplitude < 0 ? -amplitude : amplitude);
        //cosine equation result
        return (a * cos(b * t)) + d;
    }

    // Index of the point currently in effect: the first point whose successor
    // is inactive (or does not exist), or the first point with
    // time(point) <= currentTime < time(point + 1).
    byte _previousPoint(){
        for (byte idx = 0; idx < maxPoints; idx++){ //loop through active points.
            if (!_isActive(idx + 1)){
                return idx;
            }
            if (point_seconds(idx) <= currentTime && currentTime < point_seconds(idx + 1)){
                return idx; //current time is sandwiched between idx and idx + 1.
            }
        }
        return 0;
    }

    // Index of the point that follows `point`; wraps to 0 when `point` is the
    // last active point of the cycle.
    byte _nextPoint(byte point){
        return _isActive(point + 1) ? (byte)(point + 1) : (byte)0;
    }

    // Time of day of slot `point` in seconds (hour, minute and second only;
    // the day field is not included).
    long point_seconds(byte point){
        auto &slot = _points[point];
        return (slot.getHour() * 3600L) + (slot.getMinute() * 60L) + slot.getSecond();
    }
};
//declaring channels and initializing channel array.
// The second constructor argument is stored as the channel's pin; setIntensities()
// passes it to ledcWrite() as the first argument.
channel red("red", 0);
channel green("green", 1);
channel blue("blue", 2);
channel royal("royal blue", 3);
// Pointers to the channel objects above, indexed by setIntensities() and setup().
channel *channels[] = {
&red,
&green,
&blue,
&royal
};
void setIntensities(){
for (byte ch = 0; ch < totalChannels; ch ++){
channel ledChannel = *channels[ch];
byte intensity = ledChannel.getIntensity();
ledcWrite(ledChannel.getPin(), intensity);
}
}
void setup() {
Serial.begin(9600);
//set up LED outputs
ledcAttachPin(12, 0);
ledcAttachPin(13, 1);
ledcAttachPin(16, 2);
ledcSetup(0, 1000, 8);
ledcSetup(1, 1000, 8);
ledcSetup(2, 1000, 8);
//clear all points
for (byte ch = 0; ch < totalChannels; ch++){
channel ledChannel = *channels[ch];
ledChannel.clearAllPoints();
}
//Set points for testing purposes
point red1;
point red2;
point red3;
point green1;
point green2;
point green3;
point blue1;
point blue2;
point blue3;
point royal1;
point royal2;
point royal3;
red.setpoint(0, red1, 0, 0, 0, 0, 0, 0);
red.setpoint(1, red2, 0, 0, 0, 10, 255, 0);
red.setpoint(2, red3, 0, 0, 0, 20, 0, 0);
green.setpoint(0, green1, 0, 0, 0, 10, 0, 0);
green.setpoint(1, green2, 0, 0, 0, 20, 255, 0);
green.setpoint(2, green3, 0, 0, 0, 30, 0, 0);
blue.setpoint(0, blue1, 0, 0, 0, 20, 0, 0);
blue.setpoint(1, blue2, 0, 0, 0, 30, 255, 0);
blue.setpoint(2, blue3, 0, 0, 0, 40, 0, 0);
royal.setpoint(0, royal1, 0, 0, 0, 30, 0, 0);
royal.setpoint(1, royal2, 0, 0, 0, 40, 255, 0);
royal.setpoint(2, royal3, 0, 0, 0, 50, 0, 0);
Serial.print("red: "); // <----------These are to debug. The channels are returning ascending bool values for point 3 (which should all be 0)
for (byte point = 0; point < 3; point++){
Serial.print(red.getPoint(point));
}
Serial.print(" green: ");
for (byte point = 0; point < 3; point++){
Serial.print(green.getPoint(point));
}
Serial.print(" blue: ");
for (byte point = 0; point < 3; point++){
Serial.print(blue.getPoint(point));
}
Serial.print(" royal: ");
for (byte point = 0; point < 3; point++){
Serial.print(royal.getPoint(point));
}
}
// Main loop: refreshes currentTime (whole seconds since start-up) and, once
// per new second, recomputes and writes all channel intensities.
void loop() {
    currentTime = millis() / 1000;
    if (currentTime <= lastUp) {
        return; // still within the second that was already handled
    }
    setIntensities();
    lastUp = currentTime;
}
The problem is with your getPoint() method:
// Accessor as posted in the question: it indexes _points with point + 1 rather than point.
bool getPoint(byte point){
return _points[point + 1].getActive();
}
You're indexing the array with point + 1 instead of point, so when you attempt to read the last item in the array you're actually reading past the end of the array and getting arbitrary data.
Just change this to:
// Corrected accessor: returns the active flag of the requested slot itself.
bool getPoint(byte point){
return _points[point].getActive();
}
You're also doing the same thing on your _previousPoint() method:
byte _previousPoint(){
for (byte point = 0; point < maxPoints; point ++){ //loop through active points.
if (_points[point + 1].getActive() != 1){
return point;
}
Again - I think you want to just replace [point + 1] with [point] to go through the list of points and prevent reading past the end of the array.
Imagine a binary operation (let's name it "+") that is associative. Then you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'm learning OpenCL and trying to implement this approach to sum all the elements of an array. I am a total newbie with this technology, so the program might look somewhat weird.
This is the kernel:
// Each work-item adds up one contiguous chunk of `input` and stores the sum in
// `output` at its own global index. The chunk length equals the work-group
// size, and the chunk of work-item g starts at element g * chunk.
__kernel void reduce (__global float *input, __global float *output)
{
    const size_t item = get_global_id (0);
    const size_t chunk = get_local_size (0);
    __global float *src = input + chunk * item;
    float sum = 0;
    int k = 0;
    while (k < chunk) {
        sum += src[k];
        k++;
    }
    output[item] = sum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
// Returns the time reported by gettimeofday() as a single double:
// whole seconds plus the microsecond part scaled to seconds.
double gettime ()
{
    struct timeval now;
    gettimeofday (&now, NULL);
    const double whole_seconds = (double)now.tv_sec;
    const double fraction = 0.000001 * (double)now.tv_usec;
    return whole_seconds + fraction;
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run the kernel until there is only one element left in the buffer. Is this the correct approach to computing the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.
Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
// For the work-item with global id `id`, counts how many Collatz steps
// (n -> n / 2 for even n, n -> 3n + 1 for odd n) it takes to bring n = id
// down to 1 or below, and stores the count in out[id]. `in` is not read.
// When n equals 1572066143, the value before and after the 3n + 1 step is
// printed for debugging.
__kernel void collatz(__global int* in, __global int* out)
{
    const uint id = get_global_id(0);
    unsigned long n = (unsigned long)id;
    uint steps = 0;
    for (; n > 1; steps = steps + 1) {
        if (n % 2 != 0) {
            if (n == 1572066143) {
                // Debug trace: repeat the computation on a copy and print it.
                unsigned long test = n;
                printf("BEFORE - %lu\n", n);
                test = (3 * test) + 1;
                printf("AFTER - %lu\n", test);
            }
            n = (3 * n) + 1;
        } else {
            n = n / 2;
        }
    }
    out[id] = steps;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
// Same 3n + 1 step as in the kernel, but starting from a literal instead of the loop variable n.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host-side and device-side values can have different sizes. On the host, long can be 32 or 64 bits wide, depending on the platform. On the device, long is always 64 bits.
printf() function, as defined in C says that %ld is to print long (host side long) numbers. You are using printf in a kernel, so.... It could be that the C-like parser is used, therefore printing the variable as a 32bits long.
Can you try printing it as %lld or as a floating point?
I have faced the same problem as here: How to effectively swap OpenCL memory buffers?. My first implementation was the same as has been described in the question, at each cycle it writes/reads memory buffers to/from the device. As pointed out this introduces useless read/write buffer overhead. The code (with memory overhead) below works fine:
//THIS WORKS!!!
f0_mem = clCreateBuffer(
context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof (int)*(capacity + 1),
NULL,
&err);
f1_mem = (..."the same as above"...);
m_d_mem = clCreateBuffer(..., CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof (int)*capacity,...);
// One kernel launch per item k. Every iteration uploads the f buffers, runs the
// kernel, reads the buffers back and copies the m_d row into row k of M.
for (int k = 0; k < numelem; k++) {
sumK = sumK - weight[k];
// The zero assignment is immediately overwritten by the max() below.
cmax = 0;
cmax = max(capacity - sumK, weight[k]);
total_elements = (size_t) (capacity - cmax + 1);
// f0/f1 are passed in alternating order on even and odd iterations.
// NOTE(review): presumably the first buffer is the kernel input and the
// second its output - confirm in setKernelArgs.
if (k % 2 == 0) {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f0_mem, f1_mem, f0, f1);
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f1_mem, f0_mem, f1, f0);
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer of cl_mem buffers
readBufferFromDevice(f0_mem, f1_mem, m_d_mem, f0, f1, m_d);
// Row k of M receives this iteration's m_d (capacity ints).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
EDIT: My kernel:
// One step of the knapsack recurrence for the work-items whose global id is
// below maxelem. Work-item g handles capacity index c = g + cmax:
// output_f[c] becomes the larger of input_f[c] and input_f[c - weightk] + pk,
// and m_d[c - 1] is set to 1 when the second alternative wins.
// m_d is left untouched otherwise.
void kernel knapsack(global int *input_f, global int *output_f, global int *m_d, int cmax, int weightk, int pk, int maxelem){
    const size_t gid = get_global_id(0);
    if (gid >= maxelem){
        return;
    }
    const int c = gid + cmax;
    const int without_item = input_f[c];
    const int with_item = input_f[c - weightk] + pk;
    if (without_item < with_item){
        output_f[c] = with_item;
        m_d[c-1] = 1;
        return;
    }
    output_f[c] = without_item;
}
After I have tried to implement the two suggested solutions:
simply swapping setKernelArgs(...)
create two kernels
For the first one this my code:
//ARGUMENTS SWAP
f0_mem = ...
f1_mem = ...
m_d_mem = ...
//clEnqueueWriteBuffer occurs hear
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
// Variant that keeps the f buffers on the device and only alternates the order
// in which they are passed to setKernelArgs.
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
if (k % 2 == 0) {
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
// Blocking read of the m_d flags for this iteration.
// NOTE(review): nothing visible in this loop clears m_d_mem between iterations
// and the kernel only ever writes 1 into it, so flags set by earlier iterations
// are still present here unless setKernelArgs resets the buffer - confirm.
err = clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
The second solution is implemented in this way:
//TWO KERNELS
f0_mem = ...
f1_mem = ...
m_d_mem = ...
//clEnqueueWriteBuffer occurs hear
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
// Variant with two kernel objects: kernel0 is enqueued on even iterations and
// kernel1 on odd ones, with the f buffers passed in opposite order.
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
if (k % 2 == 0) {
// NOTE(review): this call passes no kernel handle while the odd branch passes
// kernel1; verify that the arguments really end up on kernel0, which is the
// kernel enqueued on the next line.
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
} else {
setKernelArgs(kernel1, f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
}
// Blocking read of the m_d flags, then copy them into row k of M.
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
Neither of the two solutions work for me (it seems to me, no swapping occur at all!), what am I doing wrong?
Sub-question: in the last two solutions, is it possible to have memory buffers filled with zeroes without using writeBufferToDevice( f0_mem, f1_mem, m_d_mem...) before the for cycle?
This work is based on this article:
Solving knapsack problems on GPU by V. Boyera, D. El Baza, M. Elkihel
related work: Accelerating the knapsack problem on GPUs by Bharath Suri
Both attempted solutions look correct to me, but there may be some dependencies between iterations - you would have to post your kernel so that this can be checked.
It works fine in your solution probably because you are writing and reading each iteration which works slower so it's enough time to synchronize itself.
You can try to add clFinish(command); after each OpenCL API call to see if that makes a difference.
Apart from that there is 3rd solution you could try: swapping pointers in the kernel. You will need to move your loop from CPU to GPU.
// Exchanges the two global-memory pointers that A and B point to.
inline void swap_pointers(__global double **A, __global double **B)
{
    __global double *second = *B;
    *B = *A;
    *A = second;
}
// Sketch of a kernel that runs the whole k loop on the device and swaps its two
// buffer pointers after every iteration instead of swapping them on the host.
// numelem is not among the visible parameters; presumably it is part of the
// elided argument list.
__kernel void my_kernel(
__global double *pA,
__global double *pB,
...
)
{
for (int k = 0; k < numelem; k++)
{
// some stuff here
// Only this work-item's private copies of pA and pB are exchanged.
swap_pointers(&pA, &pB);
// NOTE(review): per the OpenCL C specification barrier() synchronizes the
// work-items of one work-group only, so iterations are ordered across the whole
// problem only if it runs as a single work-group - confirm.
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
}
}
Then read everything in one go on the host (m_d_mem must be big enough to store data from all iterations):
// Single blocking read of capacity * numelem ints from m_d_mem into m_d.
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity*numelem, m_d, 0, NULL, NULL);
Solution:
In each cycle, after m_d has been copied to M, m_d must be reset and written back to the m_d_mem buffer object with Knapsack::writeBuffer_m_d_ToDevice():
ksack.readBuffer_m_d_FromDevice();
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
ksack.writeBuffer_m_d_ToDevice();//resets m_d_mem
I have a native kernel setup but I don't know how to convert its void* argument into anything useful. In the native kernel of this snippet, how would I get the int (7) or the int[] (16 ints set to 0)?
// Native kernel entry point. `args` points to the runtime's copy of the
// argument block built in kernelCaller(): an array of two pointer-sized slots.
//   slot 0: the value 7, stored directly in the pointer slot
//   slot 1: filled in by the runtime with the host-accessible address of the
//           buffer listed in mem_list (the 16 ints)
// Both slots are only valid if the caller passes the block's full size in
// bytes as cb_args.
void __stdcall nativeKernel(void * args)
{
    void** slots = static_cast<void**>(args);
    // The integer was smuggled through a pointer-sized slot, so it is decoded
    // from the slot's value rather than by reading the block as an int.
    const int a1 = static_cast<int>(reinterpret_cast<size_t>(slots[0]));
    // The buffer contents are reachable as d1[0] .. d1[15].
    int* d1 = static_cast<int*>(slots[1]);
    (void)d1;
    cout << "a1-->: "<< a1 << endl;
}
void kernelCaller()
{
const int dim1Size = 16;
int dim1[dim1Size] = {};
cl_int status = 0;
cl_mem mem_d1 = clCreateBuffer(*context, 0, sizeof(int)*dim1Size, NULL, &status);
clEnqueueWriteBuffer(*queue, mem_d1, CL_TRUE, 0, sizeof(int)*dim1Size, dim1, 0, NULL, NULL);
const void* args[2] = {(void*)7, NULL};
cl_mem mem_list[1] = {mem_d1};
const void* args_mem_loc[1] = {&args[1]};
cl_event run;
status = clEnqueueNativeKernel(*queue, nativeKernel, args, 2, 1, mem_list, args_mem_loc, 0, NULL, &run);
status = clEnqueueReadBuffer(*queue, mem_d1, CL_TRUE, 0, sizeof(int)*dim1Size, dim1, 1, &run, NULL);
for(auto i = 0; i != dim1Size; i++)
cout << dim1[i] << " ";
}
Instead of juggling raw void* values, I would suggest using a struct.
create your parameter structure like:
// Parameter block handed to the native kernel.
struct myparams{
    int a;       // scalar parameter
    int arr[3];  // array parameter (renamed: two members cannot both be called a)
};
and then create and fill one struct myparams in your program and pass its address to the kernelcaller
struct myparams params;
params.a=3;
// cb_args is the size of the argument block in bytes. The struct holds no
// slot for a memory object, so no mem_list / args_mem_loc is passed: every
// args_mem_loc entry would have to point inside `params`.
status = clEnqueueNativeKernel(*queue, nativeKernel, (void*)&params, sizeof(params), 0, NULL, NULL, 0, NULL, &run);
and in the nativeKernel just unbox the void* into your parameter struct:
struct myparams *params=(myparams*)args;
beware: in the example above i passed a pointer of the stack...you might not want that ;)