RISC-V undefined reference to `memcpy' - memcpy

I am using RISC-V 32E toolchain to compile some standalone CPP code. I got errors below.
/opt/riscv32i/bin/riscv32-unknown-elf-g++ -Os -ffreestanding -o firmware/firmware.elf \
-Wl,-Bstatic,-T,firmware/sections.lds,-Map,firmware/firmware.map,--strip-debug \
firmware/start.o firmware/print.o firmware/stream.o firmware/main.o firmware/accel.o firmware/data_redir_m.o -lgcc -fno-threadsafe-statics -nostdlib
/opt/riscv32i/lib/gcc/riscv32-unknown-elf/8.2.0/../../../../riscv32-unknown-elf/bin/ld: firmware/data_redir_m.o: in function `.L31':
data_redir_m.cpp:(.text+0x442): undefined reference to `memcpy'
collect2: error: ld returned 1 exit status
Makefile:46: recipe for target 'firmware/firmware.elf' failed
make: *** [firmware/firmware.elf] Error 1
I want to keep the hex file as small as possible, so my preference is to not use -lstdc++ library. What confused me a lot is why it still complains about such errors even if I manually add the memcpy() function definition in data_redir_m.cpp file.
#include "typedefs.h"
void * memcpy ( void * destination, const void * source, int num ){
int i=0;
*((int*)destination) = *((int*)source);
}
static int check_clockwise( Triangle_2D triangle_2d )
{
int cw;
cw = (triangle_2d.x2 - triangle_2d.x0) * (triangle_2d.y1 - triangle_2d.y0)
- (triangle_2d.y2 - triangle_2d.y0) * (triangle_2d.x1 - triangle_2d.x0);
return cw;
}
// swap (x0, y0) (x1, y1) of a Triangle_2D
static void clockwise_vertices( Triangle_2D *triangle_2d )
{
bit8 tmp_x, tmp_y;
tmp_x = triangle_2d->x0;
tmp_y = triangle_2d->y0;
triangle_2d->x0 = triangle_2d->x1;
triangle_2d->y0 = triangle_2d->y1;
triangle_2d->x1 = tmp_x;
triangle_2d->y1 = tmp_y;
}
// find the min from 3 integers
static bit8 find_min( bit8 in0, bit8 in1, bit8 in2 )
{
if (in0 < in1)
{
if (in0 < in2)
return in0;
else
return in2;
}
else
{
if (in1 < in2)
return in1;
else
return in2;
}
}
// find the max from 3 integers
static bit8 find_max( bit8 in0, bit8 in1, bit8 in2 )
{
if (in0 > in1)
{
if (in0 > in2)
return in0;
else
return in2;
}
else
{
if (in1 > in2)
return in1;
else
return in2;
}
}
// project a 3D triangle to a 2D triangle
void projection(
bit32 input_lo,
bit32 input_mi,
bit32 input_hi,
Triangle_2D *triangle_2d
)
{
#pragma HLS INLINE off
Triangle_3D triangle_3d;
// Setting camera to (0,0,-1), the canvas at z=0 plane
// The 3D model lies in z>0 space
// The coordinate on canvas is proportional to the corresponding coordinate
// on space
bit2 angle = 0;
triangle_3d.x0 = bit8(input_lo( 7, 0));
triangle_3d.y0 = bit8(input_lo(15, 8));
triangle_3d.z0 = bit8(input_lo(23, 16));
triangle_3d.x1 = bit8(input_lo(31, 24));
triangle_3d.y1 = bit8(input_mi( 7, 0));
triangle_3d.z1 = bit8(input_mi(15, 8));
triangle_3d.x2 = bit8(input_mi(23, 16));
triangle_3d.y2 = bit8(input_mi(31, 24));
triangle_3d.z2 = bit8(input_hi( 7, 0));
if(angle == 0)
{
triangle_2d->x0 = triangle_3d.x0;
triangle_2d->y0 = triangle_3d.y0;
triangle_2d->x1 = triangle_3d.x1;
triangle_2d->y1 = triangle_3d.y1;
triangle_2d->x2 = triangle_3d.x2;
triangle_2d->y2 = triangle_3d.y2;
triangle_2d->z = triangle_3d.z0 / 3 + triangle_3d.z1 / 3 + triangle_3d.z2 / 3;
}
else if(angle == 1)
{
triangle_2d->x0 = triangle_3d.x0;
triangle_2d->y0 = triangle_3d.z0;
triangle_2d->x1 = triangle_3d.x1;
triangle_2d->y1 = triangle_3d.z1;
triangle_2d->x2 = triangle_3d.x2;
triangle_2d->y2 = triangle_3d.z2;
triangle_2d->z = triangle_3d.y0 / 3 + triangle_3d.y1 / 3 + triangle_3d.y2 / 3;
}
else if(angle == 2)
{
triangle_2d->x0 = triangle_3d.z0;
triangle_2d->y0 = triangle_3d.y0;
triangle_2d->x1 = triangle_3d.z1;
triangle_2d->y1 = triangle_3d.y1;
triangle_2d->x2 = triangle_3d.z2;
triangle_2d->y2 = triangle_3d.y2;
triangle_2d->z = triangle_3d.x0 / 3 + triangle_3d.x1 / 3 + triangle_3d.x2 / 3;
}
}
// calculate bounding box for a 2D triangle
void rasterization1 (
Triangle_2D triangle_2d,
hls::stream<ap_uint<32> > & Output_1,
hls::stream<ap_uint<32> > & Output_2
)
{
Triangle_2D triangle_2d_same;
bit8 max_min[5];
max_min[0]=0;
max_min[1]=0;
max_min[2]=0;
max_min[3]=0;
max_min[4]=0;
bit16 max_index[1];
max_index[0]=0;
bit32 tmp1, tmp2, tmp3, tmp4;
static int parity = 0;
#pragma HLS INLINE off
// clockwise the vertices of input 2d triangle
if ( check_clockwise( triangle_2d ) == 0 ){
tmp1(7,0) = 1;
tmp1(15, 8) = triangle_2d_same.x0;
tmp1(23,16) = triangle_2d_same.y0;
tmp1(31,24) = triangle_2d_same.x1;
tmp2(7,0) = triangle_2d_same.y1;
tmp2(15, 8) = triangle_2d_same.x2;
tmp2(23,16) = triangle_2d_same.y2;
tmp2(31,24) = triangle_2d_same.z;
tmp3(15,0) = max_index[0];
tmp3(23,16) = max_min[0];
tmp3(31,24) = max_min[1];
tmp4(7,0) = max_min[2];
tmp4(15, 8) = max_min[3];
tmp4(23,16) = max_min[4];
tmp4(31,24) = 0;
if(parity==0){
Output_1.write(tmp1);
Output_1.write(tmp2);
Output_1.write(tmp3);
Output_1.write(tmp4);
parity = 1;
}else{
Output_2.write(tmp1);
Output_2.write(tmp2);
Output_2.write(tmp3);
Output_2.write(tmp4);
parity = 0;
}
#ifdef PROFILE
data_redir_m_out_1+=4;
#endif
return;
}
if ( check_clockwise( triangle_2d ) < 0 )
clockwise_vertices( &triangle_2d );
// copy the same 2D triangle
triangle_2d_same.x0 = triangle_2d.x0;
triangle_2d_same.y0 = triangle_2d.y0;
triangle_2d_same.x1 = triangle_2d.x1;
triangle_2d_same.y1 = triangle_2d.y1;
triangle_2d_same.x2 = triangle_2d.x2;
triangle_2d_same.y2 = triangle_2d.y2;
triangle_2d_same.z = triangle_2d.z ;
// find the rectangle bounds of 2D triangles
max_min[0] = find_min( triangle_2d.x0, triangle_2d.x1, triangle_2d.x2 );
max_min[1] = find_max( triangle_2d.x0, triangle_2d.x1, triangle_2d.x2 );
max_min[2] = find_min( triangle_2d.y0, triangle_2d.y1, triangle_2d.y2 );
max_min[3] = find_max( triangle_2d.y0, triangle_2d.y1, triangle_2d.y2 );
max_min[4] = max_min[1] - max_min[0];
// calculate index for searching pixels
max_index[0] = (max_min[1] - max_min[0]) * (max_min[3] - max_min[2]);
tmp1(7,0) = 0;
tmp1(15,8) = triangle_2d_same.x0;
tmp1(23,16) = triangle_2d_same.y0;
tmp1(31,24) = triangle_2d_same.x1;
tmp2(7,0) = triangle_2d_same.y1;
tmp2(15,8) = triangle_2d_same.x2;
tmp2(23,16) = triangle_2d_same.y2;
tmp2(31,24) = triangle_2d_same.z;
tmp3(15,0) = max_index[0];
tmp3(23,16) = max_min[0];
tmp3(31,24) = max_min[1];
tmp4(7,0) = max_min[2];
tmp4(15,8) = max_min[3];
tmp4(23, 16) = max_min[4];
tmp4(31, 24) = 0;
if(parity==0){
Output_1.write(tmp1);
Output_1.write(tmp2);
Output_1.write(tmp3);
Output_1.write(tmp4);
parity = 1;
}else{
Output_2.write(tmp1);
Output_2.write(tmp2);
Output_2.write(tmp3);
Output_2.write(tmp4);
parity = 0;
}
return;
}
void data_redir_m (
hls::stream<ap_uint<32> > & Input_1,
hls::stream<ap_uint<32> > & Output_1,
hls::stream<ap_uint<32> > & Output_2
)
{
#pragma HLS INTERFACE ap_hs port=Input_1
#pragma HLS INTERFACE ap_hs port=Output_1
#pragma HLS INTERFACE ap_hs port=Output_2
bit32 input_lo;
bit32 input_mi;
bit32 input_hi;
bit128 input_tmp;
hls::stream<ap_uint<32> > Output_1_1;
hls::stream<ap_uint<32> > Output_2_2;
Triangle_2D triangle_2ds_1;
Triangle_2D triangle_2ds_2;
input_lo = Input_1.read();
input_mi = Input_1.read();
input_hi = Input_1.read();
#ifdef PROFILE
data_redir_m_in_1+=3;
#endif
projection (input_lo,input_mi,input_hi,&triangle_2ds_1);
rasterization1 (triangle_2ds_1, Output_1, Output_2);
}

If you want to build gcc for non-os environment, you need to build newlib in advance. And tell gcc where to find the libc, the way is to declare sysroot when configure the project.
You can use this script(ian910297/build-riscv-gnu-toolchain) to build riscv gnu toolchain. If you choose this method, be care of the directory name.
Otherwise, riscv official also provide similar script to build, like: riscv/riscv-gnu-toolchain)

Related

Trouble when rendering voxels with pathtracing

I'm currently working on a pathtracer in c and open cl.
I'm using this algorithm for rendering. The first collision works well, however, from the second collision onwards there is a dark shadow on the lower side of the voxels.
This is the color of the voxel the initial ray hits:
result
This is the color of the voxel that the second ray hits:
result
And this is the result after rendering to a depth of 1000:
result
This is the code I used (openCL):
int cast_ray(Renderer *r, Ray ray, float3 *hitPos, int3 *normal, Material *material) {
int3 voxel = convert_int3(ray.origin);
int3 step = {
(ray.direction.x >= 0) ? 1 : -1,
(ray.direction.y >= 0) ? 1 : -1,
(ray.direction.z >= 0) ? 1 : -1
};
float3 tMax = {
(ray.direction.x != 0) ? (voxel.x + step.x - ray.origin.x) / ray.direction.x : MAXFLOAT,
(ray.direction.y != 0) ? (voxel.y + step.y - ray.origin.y) / ray.direction.y : MAXFLOAT,
(ray.direction.z != 0) ? (voxel.z + step.z - ray.origin.z) / ray.direction.z : MAXFLOAT
};
float3 tDelta = {
(ray.direction.x != 0) ? 1 / ray.direction.x * step.x : MAXFLOAT,
(ray.direction.y != 0) ? 1 / ray.direction.y * step.y : MAXFLOAT,
(ray.direction.z != 0) ? 1 / ray.direction.z * step.z : MAXFLOAT
};
int side;
while(1) {
if(tMax.x < tMax.y) {
if(tMax.x < tMax.z) {
voxel.x += step.x;
tMax.x += tDelta.x;
side = 0;
} else {
voxel.z += step.z;
tMax.z += tDelta.z;
side = 2;
}
} else {
if(tMax.y < tMax.z) {
voxel.y += step.y;
tMax.y += tDelta.y;
side = 1;
} else {
voxel.z += step.z;
tMax.z += tDelta.z;
side = 2;
}
}
if(out_of_scene(r, voxel))
return 0;
MaterialID id = get_material_ID(r, voxel);
if(id == 0)
continue;
*material = get_material(r, id);
switch(side) {
case 0:
hitPos->x = (float)voxel.x;
hitPos->y = ray.origin.y + (hitPos->x - ray.origin.x) * ray.direction.y / ray.direction.x;
hitPos->z = ray.origin.z + (hitPos->x - ray.origin.x) * ray.direction.z / ray.direction.x;
*normal = (int3){-step.x, 0, 0};
break;
case 1:
hitPos->y = (float)voxel.y;
hitPos->x = ray.origin.x + (hitPos->y - ray.origin.y) * ray.direction.x / ray.direction.y;
hitPos->z = ray.origin.z + (hitPos->y - ray.origin.y) * ray.direction.z / ray.direction.y;
*normal = (int3){0, -step.y, 0};
break;
case 2:
hitPos->z = (float)voxel.z;
hitPos->y = ray.origin.y + (hitPos->z - ray.origin.z) * ray.direction.y / ray.direction.z;
hitPos->x = ray.origin.x + (hitPos->z - ray.origin.z) * ray.direction.x / ray.direction.z;
*normal = (int3){0, 0, -step.z};
break;
}
return 1;
}
}
float3 get_color(Renderer *r, Ray ray) {
float3 mask = 1;
float3 color = 0;
int maxDepth = 1000;
for(int i = 0; i < maxDepth; i++) {
float3 hitPos;
int3 iNormal;
Material material;
if(cast_ray(r, ray, &hitPos, &iNormal, &material)) {
float3 fNormal = convert_float3(iNormal);
if(material.type == 1) {
color = mask * material.color;
break;
} else if(material.type == 2) {
float3 direction = fNormal + random_unit_vector(r->rng);
ray = (Ray){hitPos, direction};
mask *= material.color;
} else if(material.type == 3) {
float3 direction = reflection_dir(ray.direction, fNormal) + random_unit_vector(r->rng) * material.fuzzyness;
ray = (Ray){hitPos, direction};
mask = mask * (1 - material.tint) + mask * material.color * material.tint;
}
} else {
color = mask * r->bgColor;
break;
}
// if(i == 1)
// return material.color;
}
return color;
}
I think that the problem is that the new origin of the ray is somehow not correct, but I can't find a way to fix it.

Doppler radar (HB100) Arduino code : Why do we use bit-shifting?

I've been working on my doppler radar speed project for a while. I found this very helpful link and the code below:
// Based on the Adafruit Trinket Sound-Reactive LED Color Organ
// http://learn.adafruit.com/trinket-sound-reactive-led-color-organ/code
#define RADAR A5 // RADAR inut is attached to A7
#define MICRODELAY 100 // 100microseconds ~10000hz
#define MAXINDEX 1024 // 10 bits
#define TOPINDEX 1023 // 10 bits
byte collect[MAXINDEX];
int mean;
int minimum;
int maximum;
int hysteresis; // 1/16 of max-min
bool currentphase; // are value above mean + hysteresis;
int lastnull; // index for last null passing value
int prevnull; // index for previous null passing value
int deltaindex;
int deltadeltaindex;
int index;
bool phasechange = false;
void setup() {
// put your setup code here, to run once:
Serial.begin(115200);
while (!Serial) {}
index = 0;
mean = 0;
maximum = 255;
minimum = 0;
hysteresis = 0;
currentphase = false;
lastnull = 0;
prevnull = 0;
Serial.print("deltadeltaindex");
Serial.print("\t");
Serial.print("deltaindex");
Serial.print("\t");
Serial.println("delta");
}
void loop() {
int newVal = analogRead(RADAR); // Raw reading from amplified radar
mean -= (collect[index] >> 2);
mean += (newVal >> 2);
collect[index]= newVal;
minimum = newVal < minimum ? newVal : minimum + 1;
maximum = newVal > maximum ? newVal : maximum - 1;
hysteresis = abs(maximum - minimum) >> 5;
if(newVal > (mean + hysteresis))
{
if(false == currentphase)
{
currentphase = true;
phasechange = true;
}
}
else if(newVal < (mean - hysteresis))
{
if(currentphase)
{
currentphase = false;
phasechange = true;
}
}
if(phasechange)
{
prevnull = lastnull;
lastnull = index;
int delta = (prevnull > lastnull) ?
(lastnull - prevnull + MAXINDEX) :
(lastnull - prevnull);
deltadeltaindex = abs(deltaindex - delta);
deltaindex = delta;
Serial.print(deltadeltaindex);
Serial.print("\t");
Serial.print(deltaindex);
Serial.print("\t");
Serial.println(delta);
}
index = index == TOPINDEX ? 0 : index + 1;
phasechange = false;
//delayMicroseconds(10);
}
I tried it out on my Arduino with HB100(model with breakout board), and it works just fine.
However, what I really wanted to do was to understand the mechanism behind the code. I read some articles on hysteresis and bit-shifting, but I simply cannot understand why the programmer here used bit-shifting.
What would mean -= (collect[index] >> 2); and mean += (newVal >> 2); do to the values exactly?
Help will be appreciated.

Magnetometer biases issue

I am using lis3mdl magnetometer in my quadcopter project to compensate gyroscope drift. Unfortunatelly Im having problems probably with calibrating.
Ive achive max and min values (what is weird they are 14 bits instead of 16) and calculated biases like that :
biases[i] = (MAX_VALUES[i]+MIN_VALUES[i])/2;
(where i represent each of 3 axis).
Ive substracted biases from raw values x = (double)new_x-biases[0]; (etc), and then wanted to calculate heading like that :
heading = atan2(x,y);
heading += declinationAngle;
where declination angle is calculated.
Outcome are angles (conversion from radians heading*(180/M_PI)), and it does change when Iam rotating quad in yaw axi, BUT when I am rotating it in roll and pitch axi value change either. I want to achive stable yaw value which does not change when Iam rotating object in other axis. Maybe some type of fusing with accelerometer?
I am not sure when Ive made mistake in my calculations...
Whole class:
class Magnetometer {
int x=0,y=0,z=0;
LIS3MDL mag;
int running_min[3] = {32767, 32767, 32767}, running_max[3] = {-32768, -32768, -32768};
double mag_norm = 0.0;
double declinationAngle = 0.0;
double heading=0.0;
const int MAX_VALUES[3] = {3014,3439,10246};
const int MIN_VALUES[3] = {-4746, -4110, 492};
double biases[3] = {0.0};
double scales[3] = {0.0};
double avg_scale = 0.0;
ButterworthDLPF xyz_filter[3];
double DLPF_ON = true;
const float sample_rate = MAG_SAMPLE_RATE;
const float cutoff_freq = 4.0;
public:
Magnetometer() {}
void Init() {
declinationAngle = (6.0 + (8.0 / 60.0)) / (180.0 / M_PI);
for(int i=0; i<3; i++) {
biases[i] = (MAX_VALUES[i]+MIN_VALUES[i])/2;
scales[i] = (MAX_VALUES[i]-MIN_VALUES[i])/2;
}
avg_scale = (scales[0]+scales[1]+scales[2])/3.0;
for(int i=0; i<3; i++) scales[i] = avg_scale / scales[i];
Serial.println("Turning on magnetometer. . .");
if(!mag.init()) {
Serial.println("Failed to detect magnetometer!");
ESP.restart();
}
mag.enableDefault();
//Calibrate();
for(int i=0; i<3; i++) xyz_filter[i].Init(sample_rate, cutoff_freq);
Serial.println("9DOF readdy!");
}
void Calibrate() {
delay(100);
while(true) {
mag.read();
if(running_max[0]<mag.m.x) running_max[0] = mag.m.x;
if(running_max[1]<mag.m.y) running_max[1] = mag.m.y;
if(running_max[2]<mag.m.z) running_max[2] = mag.m.z;
if(running_min[0]>mag.m.x) running_min[0] = mag.m.x;
if(running_min[1]>mag.m.y) running_min[1] = mag.m.y;
if(running_min[2]>mag.m.z) running_min[2] = mag.m.z;
Serial.println((String)running_max[0]+" "+(String)running_max[1]+" "+(String)running_max[2]+ " "+(String)running_min[0] +" "+(String)running_min[1]+" "+(String)running_min[2]);
delay(20);
}
}
void Update(){
mag.read();
xyz_filter[0].Update(mag.m.x);
xyz_filter[1].Update(mag.m.y);
xyz_filter[2].Update(mag.m.z);
//Serial.println(xyz_filter[0].getData());
/*x = ((double)xyz_filter[0].getData()-biases[0])*scales[0];
y = ((double)xyz_filter[1].getData()-biases[1])*scales[1];
z = ((double)xyz_filter[2].getData()-biases[2])*scales[2];*/
x = ((double)mag.m.x-biases[0])*scales[0];
y = ((double)mag.m.y-biases[1])*scales[1];
z = ((double)mag.m.z-biases[2])*scales[2];
CalculateHeading();
}
void CalculateHeading() {
heading = atan2(y,x);
heading += declinationAngle;
//if(heading<0) heading += 2*PI;
//else if(heading>2*PI) heading -= 2*PI;
heading=MOD(heading*(180/M_PI));
}
double GetHeading() {return heading;}
void ShowRawValues(bool names=false) {
if(names) Serial.print("X: "+(String)x+" Y: "+ (String)y+ " Z: " + (String)z);
else Serial.print((String)x+" "+ (String)y+ " " + (String)z);
}
};

How to convert Ximea xiAPI camera data into QImage?

I have data from a camera in mono 8bit.
This is converted into an int vector using
std::vector<int> grayVector(size);
// convert / copy pointer data into vector: 8 bit
if (static_cast<XI_IMG_FORMAT>(format) == XI_MONO8)
{
quint8* imageIterator = reinterpret_cast<quint8*> (pMemVoid);
for (size_t count = 0; count < size; ++count)
{
grayVector[count] = static_cast<int>(*imageIterator);
imageIterator++;
}
}
Next, I need to convert this into a QImage. If I set the image format to QImage::Format_Mono the app crashes. With QImage::Format_RGB16 I get strippes, and with QImage::Format_RGB32 everything is black.
I would like to know how to do this the best, efficient and correct way?
// convert gray values into QImage data
QImage image = QImage(static_cast<int>(sizeX), static_cat<int>(sizeY), QImage::Format_RGB16);
for ( int y = 0; y < sizeY; ++y )
{
int yoffset = sizeY*y;
QRgb *line = reinterpret_cast<QRgb *>(image.scanLine(y)) ;
for ( int x = 0; x < sizeX ; ++x )
{
int pos = x + yoffset;
int color = grayVector[static_cast<size_t>(pos)];
*line++ = qRgb(color, color, color);
}
}
The conversion to int is unnecessary and you do it in a very inefficient way; all you need is to use the QImage::Format_Grayscale8 available since Qt 5.5 (mid-2015).
Anyway, what you really want is a way to go from XI_IMG to QImage. The default BP_UNSAFE buffering policy should be adequate - the QImage will do a format conversion, so taking the data from XiApi's internal buffer is OK. Thus the following - all of the conversions are implemented in Qt and are quite efficient - much better than most any naive code.
I didn't check whether some Xi formats may need a BGR swap. If so, then the swap can be set to true in the format selection code and the rest will happen automatically.
See also: xiAPI manual.
static QVector<QRgb> grayScaleColorTable() {
static QVector<QRgb> table;
if (table.isEmpty()) {
table.resize(256);
auto *data = table.data();
for (int i = 0; i < table.size(); ++i)
data[i] = qRgb(i, i, i);
}
return table;
}
constexpr QImage::Format grayScaleFormat() {
return (QT_VERSION >= QT_VERSION_CHECK(5,5,0))
? QImage::Format_Grayscale8
: QImage::Format_Indexed8;
}
QImage convertToImage(const XI_IMG *src, QImage::Format f) {
Q_ASSERT(src->fmt == XI_MONO16);
Q_ASSERT((src->padding_x % 2) == 0);
if (src->fmt != XI_MONO16) return {};
const quint16 *s = static_cast<const quint16*>(src->bp);
const int s_pad = src->padding_x/2;
if (f == QImage::Format_BGR30 ||
f == QImage::Format_A2BGR30_Premultiplied ||
f == QImage::Format_RGB30 ||
f == QImage::Format_A2RGB30_Premultiplied)
{
QImage ret{src->width, src->height, f};
Q_ASSERT((ret->bytesPerLine() % 4) == 0);
const int d_pad = ret->bytesPerLine()/4 - ret->width();
quint32 *d = (quint32*)ret.bits();
if (s_pad == d_pad) {
const int N = (src->width + s_pad) * src->height - s_pad;
for (int i = 0; i < N; ++i) {
quint32 const v = (*s++) >> (16-10);
*d++ = 0xC0000000 | v << 20 | v << 10 | v;
}
} else {
for (int j = 0; j < src->height; ++j) {
for (int i = 0; i < src->width; ++i) {
quint32 const v = (*s++) >> (16-10);
*d++ = 0xC0000000u | v << 20 | v << 10 | v;
}
s += s_pad;
d += d_pad;
}
}
return ret;
}
QImage ret{src->width, src->height, grayScaleFormat()};
const int d_pad = ret->bytesPerLine() - ret->width();
auto *d = ret.bits();
if (s_pad == d_pad) {
const int N = (src->width + s_pad) * src->height - s_pad;
for (int i = 0; i < N; ++i) {
*d++ = (*s++) >> 8;
} else {
for (int j = 0; j < src->height; ++j) {
for (int i = 0; i < src->width; ++i)
*d++ = (*s++) >> 8;
s += s_pad;
d += d_pad;
}
}
return ret;
}
QImage fromXiImg(const XI_IMG *src, QImage::Format dstFormat = QImage::Format_ARGB32Premultiplied) {
Q_ASSERT(src->width > 0 && src->height > 0 && src->padding_x >= 0 && src->bp_size > 0);
Q_ASSERT(dstFormat != QImage::Format_Invalid);
bool swap = false;
int srcPixelBytes = 0;
bool externalConvert = false;
QImage::Format srcFormat = QImage::Format_Invalid;
switch (src->fmt) {
case XI_MONO8:
srcPixelBytes = 1;
srcFormat = grayScaleFormat();
break;
case XI_MONO16:
srcPixelBytes = 2;
externalConvert = true;
break;
case XI_RGB24:
srcPixelBytes = 3;
srcFormat = QImage::Format_RGB888;
break;
case XI_RGB32:
srcPixelBytes = 4;
srcFormat = QImage::Format_RGB32;
break;
};
if (srcFormat == QImage::Format_Invalid && !externalConvert) {
qWarning("Unhandled XI_IMG image format");
return {};
}
Q_ASSERT(srcPixelBytes > 0 && srcPixelBytes <= 4);
int bytesPerLine = src->width * srcPixelBytes + src->padding_x;
if ((bytesPerLine * src->height - src->padding_x) > src->bp_size) {
qWarning("Inconsistent XI_IMG data");
return {};
}
QImage ret;
if (!externalConvert)
ret = QImage{static_cast<const uchar*>(src->bp), src->width, src->height,
bytesPerLine, srcFormat};
else
ret = convertToImage(src, dstFormat);
if (ret.format() == QImage::Format_Indexed8)
ret.setColorTable(grayScaleColorTable());
if (ret.format() != dstFormat)
ret = std::move(ret).convertToFormat(dstFormat);
if (swap)
ret = std::move(ret).rgbSwapped();
if (!ret.isDetached()) // ensure that we don't share XI_IMG's data buffer
ret.detach();
return ret;
}

Algorithm implemented with openCL working till size exceeds 768

I've implemented sorting algorithm using openCL. Its using one work group per array to sort (arrays are connected in __global float *array, all have the same size).
Im testing results using 200 random arrays and result are deterministic.
With one parameter, its working correctly till array size exceeds of array 768
With two parameters, its working correctly till arrays size exceeds 768
With three parameters, its working correctly till arrays size exceeds 317
What could be the reason of correct processing of just 768 (CL_KERNEL_WORK_GROUP_SIZE returns 1024 elements). Is it some memory constraints? What is the best way of invastigation such issue?
Gpu specs (4th answer):
Kernel code below:
__kernel void assort(
__global float *array,
__local float *currentOutput,
__local float *stimulations,
__local int *noOfValuesAdded,
__local float *addedValue,
__local float *positionToInsert,
__local int *activatedIdx,
__local float *range,
int size
) {
int id = get_local_id(0);
int gid = get_group_id(0);
if (id == 0)
{
if (array[gid*size]<array[gid*size+1])
{
currentOutput[0] = array[gid*size];
currentOutput[1] = array[gid*size + 1];
}
else
{
currentOutput[1] = array[gid*size];
currentOutput[0] = array[gid*size + 1];
}
noOfValuesAdded[0] = 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 2; i < size; i++)
{
int maxIdx = noOfValuesAdded[0] - 1;
if (id == 0)
{
addedValue[0] = array[gid*size + i];
positionToInsert[0] = -100.0f;
activatedIdx[0] = -2;
range[0] = currentOutput[maxIdx] - currentOutput[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id < noOfValuesAdded[0])
{
if (id == 0)
{
stimulations[id] = (currentOutput[maxIdx] - addedValue[0]) / range[0];
float stimulation = stimulations[id];
if ( fabs(stimulation - 1.0f) < 0.000001)
activatedIdx[0] = 0;
else if (stimulation > 1.0f)
{
activatedIdx[0] = -1;
}
}
else if (id == maxIdx)
{
stimulations[maxIdx] = (addedValue[0] - currentOutput[0]) / range[0];
float stimulations = (addedValue[0] - currentOutput[0]) / range[0];
if ( fabs(stimulations - 1.0f) < 0.000001 )
activatedIdx[0] = maxIdx;
else
if (stimulations > 1)
activatedIdx[0] = maxIdx + 1;
}
else
{
stimulations[id] = 1.0f - (fabs((currentOutput[id] - addedValue[0])) / range[0]);
if ( fabs(stimulations[id] - 1.0f) < 0.000001)
activatedIdx[0] = id;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (activatedIdx[0] == -2 && id < noOfValuesAdded[0])
{
if (noOfValuesAdded[0] == 2)
{
positionToInsert[0] = 0.9f;
}
else if (id != 0 &&
id != maxIdx &&
stimulations[id] >= stimulations[(id - 1)] &&
stimulations[id] >= stimulations[(id + 1)] )
{
if ((1.0f - (fabs(currentOutput[(id - 1)] - currentOutput[id]) / range[0]) ) < stimulations[(id - 1)])
positionToInsert[0] = (float)id - 0.1f;
else
positionToInsert[0] = (float)id + 0.9f;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (activatedIdx[0] == -2)
{
if (id == 0 && positionToInsert[0] < -90.0f) // default value maintained
{
if (stimulations[0] > stimulations[1])
positionToInsert[0] = 0.9f;
else
positionToInsert[0] = (float)maxIdx - 0.1f;
}
}
else
{
if (activatedIdx[0] == -1)
positionToInsert[0] = -0.1f;
else if (activatedIdx[0] == (maxIdx + 1))
{
positionToInsert[0] = (float)maxIdx + 0.9f;
}
else
{
currentOutput[activatedIdx[0]] = addedValue[0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (positionToInsert[0] > -50.0f) // default value changed
{
float temp = 0.0f;
if ((float)id>positionToInsert[0])
{
temp = currentOutput[id];
currentOutput[id + 1] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((float)id > positionToInsert[0])
{
temp = currentOutput[id];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id == round(positionToInsert[0]))
{
currentOutput[id] = addedValue[0];
noOfValuesAdded[0] = noOfValuesAdded[0] + 1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
array[gid*size + id] = currentOutput[id];
return;
}

Resources