Algorithm implemented with openCL working till size exceeds 768 - opencl

I've implemented sorting algorithm using openCL. Its using one work group per array to sort (arrays are connected in __global float *array, all have the same size).
Im testing results using 200 random arrays and result are deterministic.
With one parameter, its working correctly till array size exceeds of array 768
With two parameters, its working correctly till arrays size exceeds 768
With three parameters, its working correctly till arrays size exceeds 317
What could be the reason of correct processing of just 768 (CL_KERNEL_WORK_GROUP_SIZE returns 1024 elements). Is it some memory constraints? What is the best way of invastigation such issue?
Gpu specs (4th answer):
Kernel code below:
__kernel void assort(
__global float *array,
__local float *currentOutput,
__local float *stimulations,
__local int *noOfValuesAdded,
__local float *addedValue,
__local float *positionToInsert,
__local int *activatedIdx,
__local float *range,
int size
) {
int id = get_local_id(0);
int gid = get_group_id(0);
if (id == 0)
{
if (array[gid*size]<array[gid*size+1])
{
currentOutput[0] = array[gid*size];
currentOutput[1] = array[gid*size + 1];
}
else
{
currentOutput[1] = array[gid*size];
currentOutput[0] = array[gid*size + 1];
}
noOfValuesAdded[0] = 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 2; i < size; i++)
{
int maxIdx = noOfValuesAdded[0] - 1;
if (id == 0)
{
addedValue[0] = array[gid*size + i];
positionToInsert[0] = -100.0f;
activatedIdx[0] = -2;
range[0] = currentOutput[maxIdx] - currentOutput[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id < noOfValuesAdded[0])
{
if (id == 0)
{
stimulations[id] = (currentOutput[maxIdx] - addedValue[0]) / range[0];
float stimulation = stimulations[id];
if ( fabs(stimulation - 1.0f) < 0.000001)
activatedIdx[0] = 0;
else if (stimulation > 1.0f)
{
activatedIdx[0] = -1;
}
}
else if (id == maxIdx)
{
stimulations[maxIdx] = (addedValue[0] - currentOutput[0]) / range[0];
float stimulations = (addedValue[0] - currentOutput[0]) / range[0];
if ( fabs(stimulations - 1.0f) < 0.000001 )
activatedIdx[0] = maxIdx;
else
if (stimulations > 1)
activatedIdx[0] = maxIdx + 1;
}
else
{
stimulations[id] = 1.0f - (fabs((currentOutput[id] - addedValue[0])) / range[0]);
if ( fabs(stimulations[id] - 1.0f) < 0.000001)
activatedIdx[0] = id;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (activatedIdx[0] == -2 && id < noOfValuesAdded[0])
{
if (noOfValuesAdded[0] == 2)
{
positionToInsert[0] = 0.9f;
}
else if (id != 0 &&
id != maxIdx &&
stimulations[id] >= stimulations[(id - 1)] &&
stimulations[id] >= stimulations[(id + 1)] )
{
if ((1.0f - (fabs(currentOutput[(id - 1)] - currentOutput[id]) / range[0]) ) < stimulations[(id - 1)])
positionToInsert[0] = (float)id - 0.1f;
else
positionToInsert[0] = (float)id + 0.9f;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (activatedIdx[0] == -2)
{
if (id == 0 && positionToInsert[0] < -90.0f) // default value maintained
{
if (stimulations[0] > stimulations[1])
positionToInsert[0] = 0.9f;
else
positionToInsert[0] = (float)maxIdx - 0.1f;
}
}
else
{
if (activatedIdx[0] == -1)
positionToInsert[0] = -0.1f;
else if (activatedIdx[0] == (maxIdx + 1))
{
positionToInsert[0] = (float)maxIdx + 0.9f;
}
else
{
currentOutput[activatedIdx[0]] = addedValue[0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (positionToInsert[0] > -50.0f) // default value changed
{
float temp = 0.0f;
if ((float)id>positionToInsert[0])
{
temp = currentOutput[id];
currentOutput[id + 1] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((float)id > positionToInsert[0])
{
temp = currentOutput[id];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id == round(positionToInsert[0]))
{
currentOutput[id] = addedValue[0];
noOfValuesAdded[0] = noOfValuesAdded[0] + 1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
array[gid*size + id] = currentOutput[id];
return;
}

Related

Trouble when rendering voxels with pathtracing

I'm currently working on a pathtracer in c and open cl.
I'm using this algorithm for rendering. The first collision works well, however, from the second collision onwards there is a dark shadow on the lower side of the voxels.
This is the color of the voxel the initial ray hits:
result
This is the color of the voxel that the second ray hits:
result
And this is the result after rendering to a depth of 1000:
result
This is the code I used (openCL):
int cast_ray(Renderer *r, Ray ray, float3 *hitPos, int3 *normal, Material *material) {
int3 voxel = convert_int3(ray.origin);
int3 step = {
(ray.direction.x >= 0) ? 1 : -1,
(ray.direction.y >= 0) ? 1 : -1,
(ray.direction.z >= 0) ? 1 : -1
};
float3 tMax = {
(ray.direction.x != 0) ? (voxel.x + step.x - ray.origin.x) / ray.direction.x : MAXFLOAT,
(ray.direction.y != 0) ? (voxel.y + step.y - ray.origin.y) / ray.direction.y : MAXFLOAT,
(ray.direction.z != 0) ? (voxel.z + step.z - ray.origin.z) / ray.direction.z : MAXFLOAT
};
float3 tDelta = {
(ray.direction.x != 0) ? 1 / ray.direction.x * step.x : MAXFLOAT,
(ray.direction.y != 0) ? 1 / ray.direction.y * step.y : MAXFLOAT,
(ray.direction.z != 0) ? 1 / ray.direction.z * step.z : MAXFLOAT
};
int side;
while(1) {
if(tMax.x < tMax.y) {
if(tMax.x < tMax.z) {
voxel.x += step.x;
tMax.x += tDelta.x;
side = 0;
} else {
voxel.z += step.z;
tMax.z += tDelta.z;
side = 2;
}
} else {
if(tMax.y < tMax.z) {
voxel.y += step.y;
tMax.y += tDelta.y;
side = 1;
} else {
voxel.z += step.z;
tMax.z += tDelta.z;
side = 2;
}
}
if(out_of_scene(r, voxel))
return 0;
MaterialID id = get_material_ID(r, voxel);
if(id == 0)
continue;
*material = get_material(r, id);
switch(side) {
case 0:
hitPos->x = (float)voxel.x;
hitPos->y = ray.origin.y + (hitPos->x - ray.origin.x) * ray.direction.y / ray.direction.x;
hitPos->z = ray.origin.z + (hitPos->x - ray.origin.x) * ray.direction.z / ray.direction.x;
*normal = (int3){-step.x, 0, 0};
break;
case 1:
hitPos->y = (float)voxel.y;
hitPos->x = ray.origin.x + (hitPos->y - ray.origin.y) * ray.direction.x / ray.direction.y;
hitPos->z = ray.origin.z + (hitPos->y - ray.origin.y) * ray.direction.z / ray.direction.y;
*normal = (int3){0, -step.y, 0};
break;
case 2:
hitPos->z = (float)voxel.z;
hitPos->y = ray.origin.y + (hitPos->z - ray.origin.z) * ray.direction.y / ray.direction.z;
hitPos->x = ray.origin.x + (hitPos->z - ray.origin.z) * ray.direction.x / ray.direction.z;
*normal = (int3){0, 0, -step.z};
break;
}
return 1;
}
}
float3 get_color(Renderer *r, Ray ray) {
float3 mask = 1;
float3 color = 0;
int maxDepth = 1000;
for(int i = 0; i < maxDepth; i++) {
float3 hitPos;
int3 iNormal;
Material material;
if(cast_ray(r, ray, &hitPos, &iNormal, &material)) {
float3 fNormal = convert_float3(iNormal);
if(material.type == 1) {
color = mask * material.color;
break;
} else if(material.type == 2) {
float3 direction = fNormal + random_unit_vector(r->rng);
ray = (Ray){hitPos, direction};
mask *= material.color;
} else if(material.type == 3) {
float3 direction = reflection_dir(ray.direction, fNormal) + random_unit_vector(r->rng) * material.fuzzyness;
ray = (Ray){hitPos, direction};
mask = mask * (1 - material.tint) + mask * material.color * material.tint;
}
} else {
color = mask * r->bgColor;
break;
}
// if(i == 1)
// return material.color;
}
return color;
}
I think that the problem is that the new origin of the ray is somehow not correct, but I can't find a way to fix it.

Why do I get a StackOverFlow error on the first recursive call?

Below is the code I am referring to, and the first recursive call # checkDirections(grid, i - 1, j) is giving me a StackOverFlow error. I understand that this means the code is not hitting the base case, but I do not understand why.
class Solution {
public int orangesRotting(int[][] grid) {
int rowLength = grid.length;
int colLength = grid[0].length;
int minMinutes = 0;
for (int i = 0; i < rowLength; i++) {
for (int j = 0; j < colLength; j++) {
if (grid[i][j] == 2) {
checkDirections(grid, i, j);
}
}
}
return minMinutes;
}
public void checkDirections(int[][] grid, int i, int j) {
if ((i < 0 || i > grid.length || j < 0 || j > grid[0].length) || grid[i][j] == 0) {
return;
} else if (grid[i][j] == 1) {
grid[i][j] = 2;
return;
}
//check left
checkDirections(grid, i - 1, j);
//check right
checkDirections(grid, i + 1, j);
//check up
checkDirections(grid, i, j - 1);
//check down
checkDirections(grid, i, j + 1);
}
}

How to convert Ximea xiAPI camera data into QImage?

I have data from a camera in mono 8bit.
This is converted into an int vector using
std::vector<int> grayVector(size);
// convert / copy pointer data into vector: 8 bit
if (static_cast<XI_IMG_FORMAT>(format) == XI_MONO8)
{
quint8* imageIterator = reinterpret_cast<quint8*> (pMemVoid);
for (size_t count = 0; count < size; ++count)
{
grayVector[count] = static_cast<int>(*imageIterator);
imageIterator++;
}
}
Next, I need to convert this into a QImage. If I set the image format to QImage::Format_Mono the app crashes. With QImage::Format_RGB16 I get strippes, and with QImage::Format_RGB32 everything is black.
I would like to know how to do this the best, efficient and correct way?
// convert gray values into QImage data
QImage image = QImage(static_cast<int>(sizeX), static_cat<int>(sizeY), QImage::Format_RGB16);
for ( int y = 0; y < sizeY; ++y )
{
int yoffset = sizeY*y;
QRgb *line = reinterpret_cast<QRgb *>(image.scanLine(y)) ;
for ( int x = 0; x < sizeX ; ++x )
{
int pos = x + yoffset;
int color = grayVector[static_cast<size_t>(pos)];
*line++ = qRgb(color, color, color);
}
}
The conversion to int is unnecessary and you do it in a very inefficient way; all you need is to use the QImage::Format_Grayscale8 available since Qt 5.5 (mid-2015).
Anyway, what you really want is a way to go from XI_IMG to QImage. The default BP_UNSAFE buffering policy should be adequate - the QImage will do a format conversion, so taking the data from XiApi's internal buffer is OK. Thus the following - all of the conversions are implemented in Qt and are quite efficient - much better than most any naive code.
I didn't check whether some Xi formats may need a BGR swap. If so, then the swap can be set to true in the format selection code and the rest will happen automatically.
See also: xiAPI manual.
static QVector<QRgb> grayScaleColorTable() {
static QVector<QRgb> table;
if (table.isEmpty()) {
table.resize(256);
auto *data = table.data();
for (int i = 0; i < table.size(); ++i)
data[i] = qRgb(i, i, i);
}
return table;
}
constexpr QImage::Format grayScaleFormat() {
return (QT_VERSION >= QT_VERSION_CHECK(5,5,0))
? QImage::Format_Grayscale8
: QImage::Format_Indexed8;
}
QImage convertToImage(const XI_IMG *src, QImage::Format f) {
Q_ASSERT(src->fmt == XI_MONO16);
Q_ASSERT((src->padding_x % 2) == 0);
if (src->fmt != XI_MONO16) return {};
const quint16 *s = static_cast<const quint16*>(src->bp);
const int s_pad = src->padding_x/2;
if (f == QImage::Format_BGR30 ||
f == QImage::Format_A2BGR30_Premultiplied ||
f == QImage::Format_RGB30 ||
f == QImage::Format_A2RGB30_Premultiplied)
{
QImage ret{src->width, src->height, f};
Q_ASSERT((ret->bytesPerLine() % 4) == 0);
const int d_pad = ret->bytesPerLine()/4 - ret->width();
quint32 *d = (quint32*)ret.bits();
if (s_pad == d_pad) {
const int N = (src->width + s_pad) * src->height - s_pad;
for (int i = 0; i < N; ++i) {
quint32 const v = (*s++) >> (16-10);
*d++ = 0xC0000000 | v << 20 | v << 10 | v;
}
} else {
for (int j = 0; j < src->height; ++j) {
for (int i = 0; i < src->width; ++i) {
quint32 const v = (*s++) >> (16-10);
*d++ = 0xC0000000u | v << 20 | v << 10 | v;
}
s += s_pad;
d += d_pad;
}
}
return ret;
}
QImage ret{src->width, src->height, grayScaleFormat()};
const int d_pad = ret->bytesPerLine() - ret->width();
auto *d = ret.bits();
if (s_pad == d_pad) {
const int N = (src->width + s_pad) * src->height - s_pad;
for (int i = 0; i < N; ++i) {
*d++ = (*s++) >> 8;
} else {
for (int j = 0; j < src->height; ++j) {
for (int i = 0; i < src->width; ++i)
*d++ = (*s++) >> 8;
s += s_pad;
d += d_pad;
}
}
return ret;
}
QImage fromXiImg(const XI_IMG *src, QImage::Format dstFormat = QImage::Format_ARGB32Premultiplied) {
Q_ASSERT(src->width > 0 && src->height > 0 && src->padding_x >= 0 && src->bp_size > 0);
Q_ASSERT(dstFormat != QImage::Format_Invalid);
bool swap = false;
int srcPixelBytes = 0;
bool externalConvert = false;
QImage::Format srcFormat = QImage::Format_Invalid;
switch (src->fmt) {
case XI_MONO8:
srcPixelBytes = 1;
srcFormat = grayScaleFormat();
break;
case XI_MONO16:
srcPixelBytes = 2;
externalConvert = true;
break;
case XI_RGB24:
srcPixelBytes = 3;
srcFormat = QImage::Format_RGB888;
break;
case XI_RGB32:
srcPixelBytes = 4;
srcFormat = QImage::Format_RGB32;
break;
};
if (srcFormat == QImage::Format_Invalid && !externalConvert) {
qWarning("Unhandled XI_IMG image format");
return {};
}
Q_ASSERT(srcPixelBytes > 0 && srcPixelBytes <= 4);
int bytesPerLine = src->width * srcPixelBytes + src->padding_x;
if ((bytesPerLine * src->height - src->padding_x) > src->bp_size) {
qWarning("Inconsistent XI_IMG data");
return {};
}
QImage ret;
if (!externalConvert)
ret = QImage{static_cast<const uchar*>(src->bp), src->width, src->height,
bytesPerLine, srcFormat};
else
ret = convertToImage(src, dstFormat);
if (ret.format() == QImage::Format_Indexed8)
ret.setColorTable(grayScaleColorTable());
if (ret.format() != dstFormat)
ret = std::move(ret).convertToFormat(dstFormat);
if (swap)
ret = std::move(ret).rgbSwapped();
if (!ret.isDetached()) // ensure that we don't share XI_IMG's data buffer
ret.detach();
return ret;
}

Random tree procedure in ANSI-C - what generates SIGSEGV?

In main.c the tree is declared and initiated, but procedure:
void MakeTree(Tree T, int n)
{
int i, r;
Node *x, *p;
time_t t;
srand((unsigned) time(&t));
for(i = 0; i < n; i++)
{
Node * new = malloc(sizeof(Node));
new->key = rand() % 100;
x = T.root;
NodeInit(p);
while(x != NULL)
{
p = x;
r = rand() % 2;
if(r == 0) x = p->left;
else x = p->right;
}
new->parent = p;
if (T.root == NULL)
T.root = new;
else
if (r == 0)
{
p->left = new;
}
else
{
p->right = new;
}
}
}
generates memory violation error in WHILE loop in line below:
else x = p->right;
What have I missed in this assignment?

Changing method to be completely recursive

void merge(List<E> l, int lower, int upper) {
ArrayList<E> array = new ArrayList<E>();
for (int i = lower; i <= upper; i++)
array.add(list.get(i));
int front= 0;
int front2= (array.size() + 1) / 2;
for (int i = lower; i <= upper; i++) {
if (front2 >= array.size() ||
(first < ((array.size() + 1) / 2) &&
(array.get(first).compareTo(array.get(second)) <= 0))) {
l.set(i, array.get(front));
front++;
}// end if
else {
l.set(i, array.get(front2));
front2++;
}
}
}
This is my method. I want to change it to be completely recursive(I don't want to use for loops), but I simply don't see how. Is there a way to make this recursive or avoid using loops?

Resources