OpenCL : UNREACHABLE executed

OpenCL : UNREACHABLE executed - opencl

I have a generic kernel that calculates partial sums of array elements and stores them in a temporary buffer.
// Selects the accumulation step FUNC(a, b) at compile time. The first of
// FUNC_SUM, FUNC_ABS_SUM, FUNC_SQR_SUM that evaluates to non-zero wins; if none
// does, the build stops at the #error below.
// Each expansion already ends with ';' and uses its arguments unparenthesized,
// so it is only suitable for simple variable arguments.
#if FUNC_SUM
// b accumulates a.
#define FUNC(a, b) b += a;
#elif FUNC_ABS_SUM
// b accumulates a when a >= 0, otherwise -a.
#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
#elif FUNC_SQR_SUM
// b accumulates a * a.
#define FUNC(a, b) b += a * a;
#else
#error No sum function
#endif
// Computes one partial sum per work-group: every work-item accumulates FUNC
// over a strided subset of src, the work-group combines those values in local
// memory, and work-item 0 stores the group's result in dst[get_group_id(0)].
//   cols, invalid_cols, offset - map a linear element index id to the src
//       index offset + id + (id / cols) * invalid_cols
//   elemnum  - upper bound (exclusive) for the linear element index
//   groupnum - a work-item advances by groupnum << 8 elements per iteration
//   src, dst - input elements and per-group output
// NOTE(review): the 128-entry local array, the lid > 127 / lid < 128 split and
// the groupnum << 8 stride all look like they assume a work-group size of
// exactly 256 — confirm against the host code. With a smaller work-group some
// localmem_sum entries read below would never have been written.
__kernel void sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global srcT *src, __global dstT *dst)
{
int lid = get_local_id(0);
int gid = get_group_id(0);
int id = get_global_id(0);
int idx = offset + id + (id / cols) * invalid_cols;
__local dstT localmem_sum[128];
dstT sum = (dstT)(0), temp;
// Per-work-item accumulation over elements id, id + grainSize, id + 2*grainSize, ...
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
{
idx = offset + id + (id / cols) * invalid_cols;
temp = convertToDstT(src[idx]);
FUNC(temp, sum);
}
// Work-items 128 and above park their private sums in local memory ...
if (lid > 127)
localmem_sum[lid - 128] = sum; // ??
barrier(CLK_LOCAL_MEM_FENCE);
// ... and work-items below 128 add their own sum to the parked value.
if (lid < 128)
localmem_sum[lid] = sum + localmem_sum[lid];
barrier(CLK_LOCAL_MEM_FENCE);
// Halving reduction over the 128 local entries: 64, 32, ..., 1 active work-items.
for (int lsize = 64; lsize > 0; lsize >>= 1)
{
if (lid < lsize)
{
int lid2 = lsize + lid;
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// localmem_sum[0] now holds the work-group total.
if (lid == 0)
dst[gid] = localmem_sum[0];
}
This code fails with the message "UNREACHABLE executed!" on the line marked with // ??
Is there anything wrong with this code? Is there a workaround to avoid this error?
Target platform: AMD GPU

Related

How to reduce code duplication between OpenCL kernels?

I have several similar kernels to generate random data and store it in global memory. I'm always using the same algorithm to randomize, but due to variable scope issues (I need to keep track of data) I fail to avoid severe code duplications.
Are there any ways to avoid this? Generating random data in OpenCL seems a fairly standard task, but it goes against any good coding standards to have this level of code duplication. For example, here are two of my kernels:
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to AWGN channel
////////////////////////////////////////////////////////////////////////////////
// Each work-item seeds a private generator state from seeds[globalID], draws
// uniform numbers in pairs, turns each pair into two normal deviates with the
// Box-Muller transform, and stores nPerRng results in
// d_Rand[globalID * nPerRng .. globalID * nPerRng + nPerRng - 1].
//   d_Rand  - output array, nPerRng values per work-item
//   seeds   - one seed per work-item
//   inputcw - indexed by iOut only, i.e. the same values are read by every work-item
//   nPerRng - number of outputs produced per work-item
//   sigma   - scale applied to each normal deviate before it is added to inputcw
// MT_NN, MT_MM, the MT_* masks/shifts, matrix_a, mask_b, mask_c and PI are
// defined outside this snippet.
__kernel void MersenneTwisterAWGN(__global double* d_Rand,
__global int* seeds,
__global long* inputcw,
int nPerRng, float sigma)
{
int globalID = get_global_id(0);
// Factor applied to the noisy sample inside the exp() below.
double c = 2.0/(sigma*sigma);
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN];
//Initialize current state
mt[0] = seeds[globalID];
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
// Two outputs are produced per iteration, hence the step of 2.
for (iOut = 0; iOut < nPerRng; iOut=iOut+2) {
// --- first uniform draw (u1) ---
iState1 = iState + 1;
iStateM = iState + MT_MM;
// Wrap both indices around the MT_NN-entry state array.
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
// Map x to (0, 1]; the +1 keeps u1 away from 0 so log(u1) below stays finite.
double u1 = ((double)x + 1.0f) / 4294967296.0f;
// --- second uniform draw (u2), same steps as above ---
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double u2 = ((double)x + 1.0f) / 4294967296.0f;
// Box-Muller: radius and angle from the two uniforms.
double r = sqrt(-2.0f * log(u1));
double phi = 2 * PI * u2;
// First output: inputcw[iOut] plus scaled noise, passed through
// 1/(1+exp(-c*y)) and then log((1-p)/p).
u1 = r * cos(phi);
u1 = inputcw[iOut]+sigma*u1;
u1=1/(1+exp(-c*u1));
d_Rand[globalID * nPerRng + iOut]=log((1-u1)/u1);
// Second output of the pair; skipped when iOut is the last index so that an
// odd nPerRng does not write past this work-item's slice.
if (iOut!=nPerRng-1) {
u2 = r * sin(phi);
u2 = inputcw[iOut+1]+sigma*u2;
u2=1/(1+exp(-c*u2));
u2=log((1-u2)/u2);
d_Rand[globalID * nPerRng + iOut+1]=u2;
}
}
}
and
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to BSC channel
////////////////////////////////////////////////////////////////////////////////
// Each work-item seeds a private generator state from seeds[globalID], draws
// nPerRng uniform numbers and stores one signed, scaled value per draw in
// d_Rand[globalID * nPerRng + iOut].
//   d_Rand   - output array, nPerRng values per work-item
//   seeds    - one seed per work-item
//   inputcw  - indexed by iOut only, i.e. the same values are read by every work-item
//   nPerRng  - number of outputs produced per work-item
//   flipProb - threshold the uniform draw is compared against; also determines
//              the magnitude factor log((1-flipProb)/flipProb)
// MT_NN, MT_MM, the MT_* masks/shifts, matrix_a, mask_b and mask_c are defined
// outside this snippet.
__kernel void MersenneTwisterBSC(__global double* d_Rand,
                                 __global int* seeds,
                                 __global long* inputcw,
                                 int nPerRng, float flipProb)
{
    int globalID = get_global_id(0);
    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN];

    // Depends only on flipProb, so it is computed once instead of per output.
    const double c = log((1-flipProb)/flipProb);

    //Initialize current state
    mt[0] = seeds[globalID];
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut=iOut+1) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        // Wrap both indices around the MT_NN-entry state array.
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        // Map x to (0, 1].
        double u = ((double)x + 1.0f) / 4294967296.0f;
        // Sign is +1 when u < flipProb and -1 otherwise; magnitude is inputcw[iOut] * c.
        u = (2*isless(u,flipProb)-1)*inputcw[iOut]*c;
        d_Rand[globalID * nPerRng + iOut]=u;
    }
}
Are there any ways, tricks or methods to avoid this? Subroutines seem unable to make proper use of the variables (especially mt), so I didn't manage to cut it down in the way other languages would allow to.
Or should I just accept this as a necessary evil in OpenCL and keep managing 10 different kernels this way?

At Khronos' site, it says
OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.
An example to generate random number between 0.0f and 1.0f per thread:
Core function to iterate a seed:
// Scrambles a 32-bit value through a fixed sequence of xor-shift and
// multiply steps and returns the result. Pure function: no state is kept.
uint wang_hash(uint seed)
{
    uint h = (seed ^ 61) ^ (seed >> 16);
    h = h * 9;
    h ^= h >> 4;
    h = h * 0x27d4eb2d;
    h ^= h >> 15;
    return h;
}
Initialization and iteration of each threads seed:
// Seeds one slot of the per-thread RNG state.
//   rnd - seed array in global memory, one unsigned int per thread
//   id  - thread id; selects the slot and is hashed to produce its initial seed
void wang_rnd_init(__global unsigned int * rnd,int id)
{
    rnd[id] = wang_hash(id);
}
// Advances the seed stored in rnd[id] and returns it scaled to a float.
//   rnd - seed array in global memory, one unsigned int per thread
//   id  - thread id; selects which seed is read, hashed and written back
// Returns the new seed divided by the largest 32-bit unsigned value, i.e. a
// number between 0.0f and 1.0f.
float wang_rnd(__global unsigned int * rnd,int id)
{
    const uint next = wang_hash(rnd[id]);
    rnd[id] = next;
    return (float)next / (float)0xFFFFFFFFu;
}
Usage in a random grayscale color pixel generator kernel:
// Writes one random grayscale pixel per work-item: the same 0..255 level is
// placed in all four bytes of rgba[id]. Advances the seed in rnd[id].
__kernel void rnd_1(__global unsigned int * rnd, __global int *rgba)
{
    const int id = get_global_id(0);
    // One random draw, scaled to an integer level and reused for every channel.
    const int level = (int)(wang_rnd(rnd, id) * 255);
    rgba[id] = (level << 24) | (level << 16) | (level << 8) | level;
}
and wang_rnd() can be used in other kernels without defining it twice if they are in same compiled context, same as putting all relevant kernels and functions in the same file to be compiled.
Auxiliary functions are not limited to registers and global memory. They can take local and constant memory parameters too. Since they mainly work with device-side memory, they can take and return structs too.

Parallel reduction using local memory in OpenCL

I implemented a reduce kernel in OpenCL to sum up all entries in the input vector of size N. For easier testing I initialize the input vector with 1.0f, so the result should be N. But it is not!
Here is my reduce-kernel:
// Work-group reduction: each work-item loads one element of input into local
// memory (0.0f when its global id is >= N), then the work-group adds the
// values pairwise, halving the number of active work-items each round, until
// cache[0] holds the group's sum.
//   input  - values to add up
//   output - receives the result (see the note on the last line)
//   N      - number of valid elements in input
//   cache  - local scratch, one float per work-item
// The halving loop starts at local_size >> 1, so it only covers every element
// when the work-group size is a power of two.
kernel void reduce(global float* input, global float* output, const unsigned int N, local float* cache)
{
const uint local_id = get_local_id(0);
const uint global_id = get_global_id(0);
const uint local_size = get_local_size(0);
// Pad with 0.0f so out-of-range work-items do not affect the sum.
cache[local_id] = (global_id < N) ? input[global_id] : 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int s = local_size >> 1; s > 0; s >>= 1) {
if (local_id < s) {
cache[local_id] += cache[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// NOTE(review): the index is local_size, which is the same for every
// work-group, so all groups store their sum in the same output slot.
if (local_id == 0) output[local_size] = cache[0];
}
And here is the setting for OpenCL:
// Host-side setup for the "reduce" kernel: fill the input with 1.0f, create
// the buffers, launch one work-item per (rounded-up) element and read the
// result buffer back into b.
const uint N = 8196;
cl_float a[N];
cl_float b[N];
for (uint i=0; i<N; i++) {
    a[i] = 1.0f;
    b[i] = 0.0f;
}
// The access flags describe how the KERNEL uses a buffer: the input is only
// read by the kernel, the result is only written by it.
cl::Buffer inputBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*N);
cl::Buffer resultBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*N);
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, sizeof(cl_float)*N, a);
queue.enqueueWriteBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b);
cl::Kernel addVectorKernel = cl::Kernel(program, "reduce");
size_t localSize = addVectorKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); // e.g. => 512
size_t globalSize = roundUp(localSize, N); // rounds up to a multiple of localSize
addVectorKernel.setArg(0, inputBuffer);
addVectorKernel.setArg(1, resultBuffer);
addVectorKernel.setArg(2, N);
// Local scratch for the kernel: one float per work-item, no host data.
addVectorKernel.setArg(3, (sizeof(cl_float) * localSize), NULL);
queue.enqueueNDRangeKernel(
    addVectorKernel,
    cl::NullRange,
    cl::NDRange(globalSize),
    cl::NDRange(localSize)
);
queue.finish(); // wait for ending
queue.enqueueReadBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b); // e.g. => 1024
The result depends on the workgroup size. What am I doing wrong? Is it the kernel itself or is it the settings for OpenCL?

You should be using the group's id when writing the sum back to global memory.
if (local_id == 0) output[local_size] = cache[0];
That line will write to output[512] repeatedly. You need each work group to write to a dedicated location in the output.
// Work-group reduction that produces one partial sum per work-group.
//   input  - values to add up
//   output - output[g] receives the sum computed by work-group g
//   N      - number of valid elements in input
//   cache  - local scratch, one float per work-item
// The halving loop starts at half the work-group size, so it only covers
// every element when the work-group size is a power of two.
kernel void reduce(global float* input, global float* output, const unsigned int N, local float* cache)
{
    const uint lid = get_local_id(0);
    const uint gid = get_global_id(0);

    // Work-items past the end of the input contribute 0.0f.
    float value = 0.0f;
    if (gid < N) {
        value = input[gid];
    }
    cache[lid] = value;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Pairwise tree reduction: the active half adds in the other half.
    uint stride = get_local_size(0) >> 1;
    while (stride > 0) {
        if (lid < stride) {
            cache[lid] = cache[lid] + cache[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        stride >>= 1;
    }

    // A dedicated slot per work-group keeps the groups from overwriting each other.
    if (lid == 0) {
        output[get_group_id(0)] = cache[0];
    }
}
Then you need to sum the values from the output on the host. Note that 'b' in the host code does not need to hold N elements. Only one element for each work group will be used.
// Add the per-work-group partial sums b[1..] into b[0] on the host.
// Replace (globalSize/localSize) with the pre-calculated/known number of work groups.
for (i=1; i<(globalSize/localSize); i++) {
b[0] += b[i];
}
Now b[0] is your grand total.

In the reduction for loop, you need this:
for(unsigned int s = localSize >> 1; s > 0; s >>= 1)
You are shifting one more bit than you should when initializing s.
After that's fixed, let's look at what your kernel is doing. The host code executes it with globalSize of 8192 and localSize of 512, which results in 16 work groups. Inside the kernel you first sum the data from the two consecutive memory locations at index 2*global_id. For work group with id 15, work item 0, that will be at index 15*512*2 = 15,360 and 15,361, which is outside the boundaries of your input array. I am surprised you don't get a crash. At the same time, this explains why you have double the values that you expect.
To fix it, you can do this:
cache[localID] = input[globalID];
Or specify a global size that's half of the number of the current one.

Getting segmentation fault (or bad access) for some inputs and the program halts

#include <iostream>
#include <vector>
#include <string>
using namespace std;
// One selection-sort step: finds the smallest element of a[idx..size-1] and
// swaps it into position idx. Leaves the vector untouched when that element
// is already at idx (or when the range is empty).
void step_selection_sort(vector <int> &a, int size, int idx){
    int min = idx;
    for (int j = idx + 1; j < size; j++)
    {
        if (a[min] > a[j])
            min = j;
    }
    if (min != idx)
    {
        int temp = a[idx];
        a[idx] = a[min];
        a[min] = temp;
    }
}
// Sorts a[idx..size-1] into ascending order in place; elements before idx are
// not touched. The start position must advance on every step: repeating the
// step at the same idx would only place a single element.
void selection_sort(vector <int> &a, int size, int idx){
    for (int i = idx; i < size; i++)
    {
        step_selection_sort(a, size, i);
    }
}
// One selection-sort step in descending order: finds the largest element of
// a[idx..size-1] and swaps it into position idx. Leaves the vector untouched
// when that element is already at idx (or when the range is empty).
void step_desc_sort(vector <int>& a, int size, int idx){
    int max = idx;
    for (int j = idx + 1; j < size; j++)
    {
        if (a[max] < a[j])
            max = j;
    }
    if (max != idx)
    {
        int temp = a[idx];
        a[idx] = a[max];
        a[max] = temp;
    }
}
// Sorts a[idx..size-1] into descending order in place; elements before idx
// are not touched. The start position must advance on every step: repeating
// the step at the same idx would only place a single element.
void desc_sort(vector <int>& a, int size, int idx){
    for (int i = idx; i < size; i++)
    {
        step_desc_sort(a, size, i);
    }
}
// Exchanges the values of the two referenced integers.
void swap (int & a, int & b)
{
    const int saved = b;
    b = a;
    a = saved;
}
// Returns the index, within nums[begin..end], of the smallest element that is
// greater than `first`. The search starts with `begin` as the current best, so
// `begin` is returned when no later element is both greater than `first` and
// smaller than the current best.
int findCeil (vector <int>& nums, int first, int begin, int end)
{
    int best = begin;
    int pos = begin;
    while (++pos <= end)
    {
        const int candidate = nums[pos];
        if (candidate > first && candidate < nums[best])
            best = pos;
    }
    return best;
}
// Returns the index, within nums[begin..end], of the largest element that is
// smaller than `first`. The search starts with `begin` as the current best, so
// `begin` is returned when no later element is both smaller than `first` and
// larger than the current best.
int findBottom(vector <int>& nums,int first,int begin,int end)
{
    int best = begin;
    int pos = begin;
    while (++pos <= end)
    {
        const int candidate = nums[pos];
        if (candidate < first && candidate > nums[best])
            best = pos;
    }
    return best;
}
// Prints the first `num` elements of nums on one line, then repeatedly
// advances nums to its next arrangement and prints it, until the elements are
// in non-increasing order.
//   nums - starting arrangement (taken by value; the caller's vector is untouched)
//   num  - number of elements of nums to print and permute
// Each step finds the rightmost position k with nums[k] < nums[k+1], swaps
// nums[k] with the smallest larger element to its right (findCeil) and relies
// on selection_sort to put the elements after k into ascending order.
// This is a loop rather than a self-call: one recursion level per printed line
// (each with its own copy of the vector) exhausts the stack for larger inputs.
void sortedPermutations_ASC (vector <int> nums,int num)
{
    for (;;)
    {
        for (int i = 0; i < num; i++)
            cout << nums[i];
        cout << endl;

        int k;
        for (k = num - 2; k >= 0; --k)
            if (nums[k] < nums[k+1])
                break;
        // No ascending pair left (also covers num < 2): nothing more to print.
        if (k < 0)
            return;

        int ceilIndex = findCeil(nums, nums[k], k + 1, num - 1);
        swap(nums[k], nums[ceilIndex]);
        selection_sort(nums, num, k + 1);
    }
}
// Prints the first `num` elements of nums on one line, then repeatedly
// steps nums back to its previous arrangement and prints it, until the
// elements are in non-decreasing order.
//   nums - starting arrangement (taken by value; the caller's vector is untouched)
//   num  - number of elements of nums to print and permute
// Each step finds the rightmost position k with nums[k] > nums[k+1], swaps
// nums[k] with the largest smaller element to its right (findBottom) and
// relies on desc_sort to put the elements after k into descending order.
// This is a loop rather than a self-call: one recursion level per printed line
// (each with its own copy of the vector) exhausts the stack for larger inputs.
void sortedPermutations_DESC (vector <int> nums,int num)
{
    for (;;)
    {
        for (int i = 0; i < num; i++)
            cout << nums[i];
        cout << endl;

        int k;
        for (k = num - 2; k >= 0; --k)
            if (nums[k] > nums[k+1])
                break;
        // No descending pair left (also covers num < 2): nothing more to print.
        if (k < 0)
            return;

        int bottomIndex = findBottom(nums, nums[k], k + 1, num - 1);
        swap(nums[k], nums[bottomIndex]);
        desc_sort(nums, num, k + 1);
    }
}
// Reads one line of the form "<count> <ASC|DESC>" from standard input and
// prints the permutations of 1..count starting from the ascending (ASC) or
// descending (DESC) arrangement. Any other keyword prints nothing.
// stoi throws if the line does not start with a number.
int main(){
    vector <int> nums;
    string line;
    getline(cin,line);

    // Position of the first space (or end of line); must start at 0.
    size_t j = 0;
    while(j<line.size() && line[j]!=' ')
        j++;
    int num=stoi(line.substr(0,j));

    // Everything after the space is the keyword; empty when there is no space.
    string kind;
    if(j<line.size())
        kind=line.substr(j+1);

    if(kind=="ASC"){
        for(int k=0;k<num;k++)
            nums.push_back(k+1);
        sortedPermutations_ASC(nums,num);
    }
    if(kind=="DESC"){
        for(int k=0;k<num;k++)
            nums.push_back(num-k);
        sortedPermutations_DESC(nums,num);
    }
    return 0;
}
Here is my code. It prints the permutations of the numbers 1 to n. It works properly when the input is between 1 and 8, but it doesn't work with numbers bigger than 8.
for example if I give
9 ASC (it means in Ascending order)
to the program , I get "Segmentation Fault:11" in terminal (mac) after printing some of the permutations .
I tried running it in Xcode . with the same input it says :
Thread 1:EXC_BAD_ACCESS(code=2,address=0x7ffff5f3fffc8)
for the line that I put comment in front of it .
I don't know what to do anymore ...
Any help would be appreciated - thanks in advance

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimension vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000, and localSize = 128, globalSize should be 1024? Can we always set globalSize some multiple of localSize and larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2d problems, for example, multiply two matrices of dimension m*n and n*p, the result matrix is of order m*p, things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I find out the indices where the matrix element is wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...)
Why?
Here is my kernel function:
// Matrix multiply, one work-item per output element: work-item (i, j) writes
// C[i*Pdim + j] = sum over k < Ndim of A[i*Ndim + k] * B[k*Pdim + j].
//   Mdim, Ndim, Pdim - dimensions used for the bounds check and the indexing above
//   A, B             - input matrices, read with the row-major indexing above
//   C                - output matrix
// i comes from dimension 1 of the NDRange and j from dimension 0.
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
{
int i = get_global_id(1);
int j = get_global_id(0);
// NOTE(review): the upper bounds use '>', so work-items with i == Mdim or
// j == Pdim are not rejected. With j == Pdim the store below lands on
// C[(i+1)*Pdim], i.e. the first element of the next row — confirm whether
// '>=' was intended.
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
else
{
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
}
}
And then it is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
{
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
{
A[i] = i;
}
for(int i = 0; i < n*p; i++)
{
B[i] = i;
}
try
{
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("mmul.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
{
float z = 0.0;
for(int k = 0; k < n; k++)
{
z += A[i*n+k] * B[k*p+j];
}
c[i*p+j] = z;
}
for(int i = 0; i < m*p; i++)
{
if(fabs(c[i]-C[i])>0.001)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
}
delete []A;
delete []B;
delete []C;
}
catch(Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
return 0;
}

Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;

Let's assume that you have a float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine size of Local Group first:
size_t localSize[] = {16, 8};
Then determine, how many Local Groups do you need:
// Round up with integer arithmetic: size/localSize[i] is an integer division,
// so wrapping it in ceil() would not round up (1000/16 would give 62, not 63).
size_t numLocalGroups[] = {(size + localSize[0] - 1) / localSize[0], (size + localSize[1] - 1) / localSize[1]};
Finally, determine NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in right-most Local Groups.

Unhandled exception error with two dimensional array

This dynamic programming algorithm fails with an unhandled exception, probably because of the two-dimensional arrays that I allocate for various (and very large) input sizes. I can't figure out the issue. The complete program is as follows:
// A Dynamic Programming based solution for 0-1 Knapsack problem
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#define MAX 10000
int size;
int Weight;
int p[MAX];
int w[MAX];
// Returns the larger of the two integers (b when they are equal).
int maximum(int a, int b)
{
    if (a > b)
        return a;
    return b;
}
// Returns the maximum value that can be put in a knapsack of capacity W.
//   W   - knapsack capacity
//   wt  - weights of the n items; must be non-negative
//   val - values of the n items
//   n   - number of items
// Returns 0 when n <= 0 or W <= 0, and -1 when the work array cannot be
// allocated.
// Only one row of W + 1 entries is kept instead of an (n + 1) x (W + 1) table:
// walking the capacities downwards means K[w - wt[i]] still holds the value
// from before item i was considered, which is all the recurrence needs. The
// full table would need several gigabytes for the largest inputs used here.
int knapSack(int W, int wt[], int val[], int n)
{
    int i, w;
    int retVal;
    int *K;

    if (n <= 0 || W <= 0)
        return 0;

    K = (int*)calloc((size_t)W + 1, sizeof(int));
    if (K == NULL)
        return -1;

    for (i = 0; i < n; i++)
    {
        // K[0] (empty knapsack) always stays 0, so capacities below 1 are skipped.
        int lowest = wt[i] > 1 ? wt[i] : 1;
        for (w = W; w >= lowest; w--)
        {
            int with_item = val[i] + K[w - wt[i]];
            if (with_item > K[w])
                K[w] = with_item;
        }
    }
    retVal = K[W];
    free(K);
    return retVal;
}
// Returns a pseudo-random integer r with min <= r < max, drawn with rand().
// Draws that fall into the incomplete last bucket of rand()'s range are
// rejected and redrawn so that every result is equally likely.
//   min - smallest value that can be returned
//   max - exclusive upper bound; max - min must not exceed RAND_MAX
// Returns min when the range is empty (max - min <= 0 as an int), which would
// otherwise divide by zero.
int random_in_range(unsigned int min, unsigned int max)
{
    int range = max - min;
    int remainder, bucket, base_random;

    if (range <= 0)
        return min;

    remainder = RAND_MAX % range;
    bucket = RAND_MAX / range;
    // Retry in a loop instead of recursing; RAND_MAX itself is always rejected.
    do {
        base_random = rand();
    } while (base_random >= RAND_MAX - remainder);
    return min + base_random / bucket;
}
// Runs knapSack once per batch size on freshly generated random items and
// prints "| <size>  <result>" for each batch, then waits for a key press.
// Returns 1 if the item arrays for a batch cannot be allocated.
int main()
{
    srand((unsigned int)time(NULL));
    int i, j;
    //each input set is contained in an array
    int batch[] = { 10, 20, 30, 40, 50, 5000, 10000 };
    int sizeOfBatch = sizeof(batch) / sizeof(batch[0]);
    //algorithms are called per size of the input array
    for (i = 0; i < sizeOfBatch; i++){
        printf("\n");
        //dynamic array allocation (variable length to avoid stack overflow
        //calloc is used to avoid garbage values
        //these locals shadow the file-scope arrays p and w
        int *p = (int*)calloc(batch[i], sizeof(int));
        int *w = (int*)calloc(batch[i], sizeof(int));
        if (p == NULL || w == NULL) {
            fprintf(stderr, "out of memory for batch size %d\n", batch[i]);
            free(p);
            free(w);
            return 1;
        }
        for (j = 0; j < batch[i]; j++){
            p[j] = random_in_range(1, 500);
            w[j] = random_in_range(1, 100);
        }
        size = batch[i];
        Weight = batch[i] * 25;
        printf("| %d ", batch[i]);
        printf(" %d", knapSack(Weight, w, p, size));
        free(p);
        free(w);
    }
    _getch();
    return 0;
}

Change this:
for (i = 0; i < size + 1; i++)
free(K[i]);
free(K);
return K[size][Weight];
To this:
int retVal;
...
retVal = K[size][Weight];
for (i = 0; i < size + 1; i++)
free(K[i]);
free(K);
return retVal;

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

OpenCL : UNREACHABLE executed - opencl

Related

How to reduce code duplication between OpenCL kernels?

Parallel reduction using local memory in OpenCL

Getting segmentation fault (or bad access) for some inputs and the program halts

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

Unhandled exception error with two dimensional array

Categories

Resources