Searching for the best prescaler (closest integer multiple) - math

Let's say we have a timer with a counter (cnt) and a prescaler (psc), which triggers clk_freq / div times per second (Hz), where div = cnt * psc and clk_freq is a clock frequency, which is irrelevant. Value range for both counter and prescaler is [1;65536].
Which is the best (quickest) method of picking cnt and psc so that cnt * psc would be as close as possible to some desired_div value?
Example:
desired_div = 3849586
psc = 251, cnt = 15337, psc * cnt = 3849587, distance = 1
psc = 313, cnt = 12299, distance = 1
psc = 1757, cnt = 2191, distance = 1
psc = 2191, cnt = 1757, distance = 1
psc = 12299, cnt = 313, distance = 1
psc = 15337, cnt = 251, distance = 1
The code used to acquire these values (not effective enough):
int main(int argc, char *argv[])
{
uint32_t val, cnt, psc;
if (argc < 2) {
printf("Usage: %s <u32_value>\n", argv[0]);
return 1;
}
sscanf(argv[1], "%u", &val);
printf("Searching for %u\n", val);
uint32_t hlimit = min(65536, val);
uint32_t llimit = max(1, val / 65536);
uint32_t minpsc = 0, minredif = 0xFFFFFFFF;
for (psc = llimit; psc <= hlimit; ++psc) {
cnt = min(65536, val / psc);
uint32_t redif = abs(val - cnt * psc);
if (cnt < 65536) {
redif = min(redif, abs((cnt + 1) * psc - val));
}
if (redif < minredif) {
minpsc = psc;
minredif = redif;
}
printf("%u %u\n", psc, redif);
}
printf("Optimal psc: %u, difference: %u\n", minpsc, minredif);
return 0;
}

not sure how much faster or slower, not calling any libraries so that helps (well other than printf).
#include <stdio.h>
void fun ( unsigned int x )
{
unsigned int ra;
unsigned int rb;
unsigned int rc;
unsigned int rd;
unsigned int min;
min=0; min--;
for(ra=1;ra<32768;ra++)
{
rb=x/ra;
if(rb>65536) continue;
if(1)
{
rc=rb*ra;
if(rc>x) rd=rc-x;
else rd=x-rc;
if(rd<=min)
{
printf("%u %u %u\n",ra,rb,rd);
min=rd;
}
}
if(1)
{
rc=(rb+1)*ra;
if(rc>x) rd=rc-x;
else rd=x-rc;
if(rd<=min)
{
printf("%u %u %u\n",ra,rb+1,rd);
min=rd;
}
}
}
}
for your value the +1 gave the closest results.
59 65247 13
61 63108 2
122 31554 2
183 21036 2
244 15777 2
251 15337 1
313 12299 1
1757 2191 1
2191 1757 1
12299 313 1
15337 251 1
but dont know off hand if that will always be the case (expect not).
You shouldnt need to go past the halfway point as you are just testing the same pairs twice. ab = ba

Related

ILI9488 TFT LCD Arduino Shield - Can't use Touch and SDcard reading in the same program

I have a problem with my ILI9488 TFT Touch LCD module (Arduino Uno Shield). (320x480)
I can show .bmp pictures on the screen, read out of a SD-card.
In another testprogram, I can Serial.print() a char when I touch the display. (That's all it needs to do)
But when I merge the two programs together, it doesnt't work anymore.
I think the libraries are interferring with each other.
Is there any way to fix this, or can anyone recommend two compatible libraries to fix this?
Thanks in advance!
//----------------------------------------------------------------------------------------
// Declaration MCUFRIEND
//----------------------------------------------------------------------------------------
#include "MCUFRIEND_kbv.h"
MCUFRIEND_kbv tft;
#define LOWFLASH (defined(__AVR_ATmega328P__) && defined(MCUFRIEND_KBV_H_))
//----------------------------------------------------------------------------------------
// Declaration SDCard & Dependencies
//----------------------------------------------------------------------------------------
#include <SPI.h>
#include <SD.h>
#include <Adafruit_GFX.h>
#include <stdint.h>
#include "TouchScreen.h"
#define YP A2
#define XM A3
#define YM 8
#define XP 9
#define MINPRESSURE 1000
#define MAXPRESSURE 10000
//---------------------------------------------------------------------------------------
///Variables & Constants
#define BLACK 0x0000 /* 0, 0, 0 */
#define NAVY 0x000F /* 0, 0, 128 */
#define DARKGREEN 0x03E0 /* 0, 128, 0 */
#define DARKCYAN 0x03EF /* 0, 128, 128 */
#define MAROON 0x7800 /* 128, 0, 0 */
#define PURPLE 0x780F /* 128, 0, 128 */
#define OLIVE 0x7BE0 /* 128, 128, 0 */
#define LIGHTGREY 0xC618 /* 192, 192, 192 */
#define DARKGREY 0x7BEF /* 128, 128, 128 */
#define BLUE 0x001F /* 0, 0, 255 */
#define GREEN 0x07E0 /* 0, 255, 0 */
#define CYAN 0x07FF /* 0, 255, 255 */
#define RED 0xF800 /* 255, 0, 0 */
#define MAGENTA 0xF81F /* 255, 0, 255 */
#define YELLOW 0xFFE0 /* 255, 255, 0 */
#define WHITE 0xFFFF /* 255, 255, 255 */
#define ORANGE 0xFDA0 /* 255, 180, 0 */
#define GREENYELLOW 0xB7E0 /* 180, 255, 0 */
#define PINK 0xFC9F
#define SD_CS 10 // Chip Select from SPI Interface
#define LOGO "default"
#define AUTHUSER "authuser"
#define ADDUSER "adduser"
#define PALETTEDEPTH 4
File root;
char ReadMode = '0'; // Sleep = 0, Default = 1, Finger ID = 2, Finger Enroll = 3
char namebuf[32] = "/"; // BMP=Files in the root directory
int x, y, pathlen, count;
TouchScreen ts = TouchScreen(XP, YP, XM, YM, 300);
bool pressed = false;
void setup()
{
uint16_t ID = tft.readID();
Serial.begin(9600);
tft.begin(ID);
//Please select the mode you want to use
//Sleep --> 0
//Default --> 1
//FingerID --> 2
//FingerEnroll --> 3
//Getting SDCard Ready
bool good = SD.begin(SD_CS);
root = SD.open(namebuf);
pathlen = strlen(namebuf);
x = 0;
y = 0;
}
void loop(void)
{
if (Serial.available()) {
ReadMode = Serial.read();
}
if (ReadMode == '0') {
tft.fillScreen(NULL);
}
if(ReadMode == '1') // The Default HomeScreen displays, Read from the SDCard
{
for(int i = 0; i < 5; i++)
{
char *nm = namebuf + pathlen;
File f = root.openNextFile();
uint8_t ret;
uint32_t start;
if (f != NULL)
{
#ifdef USE_SDFAT
f.getName(nm, 32 - pathlen);
#else
strcpy(nm, (char *)f.name());
#endif
f.close();
strlwr(nm);
if (strstr(nm, ".bmp") != NULL && strstr(nm, LOGO) != NULL)
{
ret = showBMP(namebuf, x, y);
}
}
else root.rewindDirectory();
}
}
if (ReadMode == '2') // The AuthUser displays, Read from the SDCard
{
for(int i = 0; i < 5; i++)
{
char *nm = namebuf + pathlen;
File f = root.openNextFile();
uint8_t ret;
uint32_t start;
if (f != NULL)
{
#ifdef USE_SDFAT
f.getName(nm, 32 - pathlen);
#else
strcpy(nm, (char *)f.name());
#endif
f.close();
strlwr(nm);
if (strstr(nm, ".bmp") != NULL && strstr(nm, AUTHUSER) != NULL)
{
ret = showBMP(namebuf, x, y);
}
}
else root.rewindDirectory();
}
}
TSPoint p = ts.getPoint();
if (p.z > MINPRESSURE && p.z < MAXPRESSURE) { // TOUCH
Serial.println('X');
}
}
//----------------------------------------------------------------------------------------
// Methods for reading from SDCard
//----------------------------------------------------------------------------------------
#define BMPIMAGEOFFSET 54
#define BUFFPIXEL 20
uint16_t read16(File& f) {
uint16_t result; // read little-endian
f.read((uint8_t*)&result, sizeof(result));
return result;
}
uint32_t read32(File& f) {
uint32_t result;
f.read((uint8_t*)&result, sizeof(result));
return result;
}
uint8_t showBMP(char *nm, int x, int y)
{
File bmpFile;
int bmpWidth, bmpHeight; // W+H in pixels
uint8_t bmpDepth; // Bit depth (currently must be 24, 16, 8, 4, 1)
uint32_t bmpImageoffset; // Start of image data in file
uint32_t rowSize; // Not always = bmpWidth; may have padding
uint8_t sdbuffer[3 * BUFFPIXEL]; // pixel in buffer (R+G+B per pixel)
uint16_t lcdbuffer[(1 << PALETTEDEPTH) + BUFFPIXEL], *palette = NULL;
uint8_t bitmask, bitshift;
boolean flip = true; // BMP is stored bottom-to-top
int w, h, row, col, lcdbufsiz = (1 << PALETTEDEPTH) + BUFFPIXEL, buffidx;
uint32_t pos; // seek position
boolean is565 = false; //
uint16_t bmpID;
uint16_t n; // blocks read
uint8_t ret;
if ((x >= tft.width()) || (y >= tft.height()))
return 1; // off screen
bmpFile = SD.open(nm); // Parse BMP header
bmpID = read16(bmpFile); // BMP signature"
(void) read32(bmpFile); // Read & ignore file size
(void) read32(bmpFile); // Read & ignore creator bytes
bmpImageoffset = read32(bmpFile); // Start of image data
(void) read32(bmpFile); // Read & ignore DIB header size
bmpWidth = read32(bmpFile);
bmpHeight = read32(bmpFile);
n = read16(bmpFile); // # planes -- must be '1'
bmpDepth = read16(bmpFile); // bits per pixel
pos = read32(bmpFile); // format
if (bmpID != 0x4D42) ret = 2; // bad ID
else if (n != 1) ret = 3; // too many planes
else if (pos != 0 && pos != 3) ret = 4; // format: 0 = uncompressed, 3 = 565
else if (bmpDepth < 16 && bmpDepth > PALETTEDEPTH) ret = 5; // palette
else {
bool first = true;
is565 = (pos == 3); // ?already in 16-bit format
// BMP rows are padded (if needed) to 4-byte boundary
rowSize = (bmpWidth * bmpDepth / 8 + 3) & ~3;
if (bmpHeight < 0) { // If negative, image is in top-down order.
bmpHeight = -bmpHeight;
flip = false;
}
w = bmpWidth;
h = bmpHeight;
if ((x + w) >= tft.width()) // Crop area to be loaded
w = tft.width() - x;
if ((y + h) >= tft.height()) //
h = tft.height() - y;
if (bmpDepth <= PALETTEDEPTH) { // these modes have separate palette
bmpFile.seek(bmpImageoffset - (4<<bmpDepth)); //54 for regular, diff for colorsimportant
bitmask = 0xFF;
if (bmpDepth < 8)
bitmask >>= bmpDepth;
bitshift = 8 - bmpDepth;
n = 1 << bmpDepth;
lcdbufsiz -= n;
palette = lcdbuffer + lcdbufsiz;
for (col = 0; col < n; col++) {
pos = read32(bmpFile); //map palette to 5-6-5
palette[col] = ((pos & 0x0000F8) >> 3) | ((pos & 0x00FC00) >> 5) | ((pos & 0xF80000) >> 8);
}
}
// Set TFT address window to clipped image bounds
tft.setAddrWindow(x, y, x + w - 1, y + h - 1);
for (row = 0; row < h; row++) { // For each scanline...
uint8_t r, g, b, *sdptr;
int lcdidx, lcdleft;
if (flip) // Bitmap is stored bottom-to-top order (normal BMP)
pos = bmpImageoffset + (bmpHeight - 1 - row) * rowSize;
else // Bitmap is stored top-to-bottom
pos = bmpImageoffset + row * rowSize;
if (bmpFile.position() != pos) { // Need seek?
bmpFile.seek(pos);
buffidx = sizeof(sdbuffer); // Force buffer reload
}
for (col = 0; col < w; ) { //pixels in row
lcdleft = w - col;
if (lcdleft > lcdbufsiz) lcdleft = lcdbufsiz;
for (lcdidx = 0; lcdidx < lcdleft; lcdidx++) { // buffer at a time
uint16_t color;
// Time to read more pixel data?
if (buffidx >= sizeof(sdbuffer)) { // Indeed
bmpFile.read(sdbuffer, sizeof(sdbuffer));
buffidx = 0; // Set index to beginning
r = 0;
}
switch (bmpDepth) { // Convert pixel from BMP to TFT format
case 24:
b = sdbuffer[buffidx++];
g = sdbuffer[buffidx++];
r = sdbuffer[buffidx++];
color = tft.color565(r, g, b);
break;
case 16:
b = sdbuffer[buffidx++];
r = sdbuffer[buffidx++];
if (is565)
color = (r << 8) | (b);
else
color = (r << 9) | ((b & 0xE0) << 1) | (b & 0x1F);
break;
case 1:
case 4:
case 8:
if (r == 0)
b = sdbuffer[buffidx++], r = 8;
color = palette[(b >> bitshift) & bitmask];
r -= bmpDepth;
b <<= bmpDepth;
break;
}
lcdbuffer[lcdidx] = color;
}
tft.pushColors(lcdbuffer, lcdidx, first);
first = false;
col += lcdidx;
} // end cols
} // end rows
tft.setAddrWindow(0, 0, tft.width() - 1, tft.height() - 1); //restore full screen
ret = 0; // good render
}
bmpFile.close();
return (ret);
}

Getting segmentation fault (or bad access) for some inputs and the program halts

#include <iostream>
#include <vector>
#include <string>
using namespace std;
void step_selection_sort(vector <int> &a, int size, int idx){
int i,j,min,temp;
i = idx;
min = i;
for (j=i+1;j<size;j++)
{
if (a[min]>a[j])
min=j;
}
if (min!=i)
{
temp = a[i];
a[i] = a[min];
a[min] = temp;
}
idx++;
}
void selection_sort(vector <int> &a, int size, int idx){
int i;
for(i=0;i<size;i++)
{
step_selection_sort(a,size,idx);
}
}
void step_desc_sort(vector <int>& a, int size, int idx){
int i,j,max,temp;
i = idx;
max = i;
for (j=i+1;j<size;j++)
{
if (a[max]<a[j])
max=j;
}
if (max!=i)
{
temp = a[i];
a[i] = a[max];
a[max] = temp;
}
idx++;
}
void desc_sort(vector <int>& a, int size, int idx){
int i;
for(i=0;i<size;i++)
{
step_desc_sort(a,size,idx);
}
}
void swap (int & a, int & b)
{
int t = a;
a = b;
b = t;
}
int findCeil (vector <int>& nums, int first, int begin, int end)
{
int ceilIndex = begin;
for (int i = begin+1; i <= end; i++)
if (nums[i] > first && nums[i] < nums[ceilIndex])
ceilIndex = i;
return ceilIndex;
}
int findBottom(vector <int>& nums,int first,int begin,int end)
{
int bottomIndex = begin;
for (int i = begin+1; i <= end; i++)
if (nums[i] < first && nums[i] > nums[bottomIndex])
bottomIndex = i;
return bottomIndex;
}
void sortedPermutations_ASC (vector <int> nums,int num)
{
bool isfinished=false;
if(isfinished==false)
for(int i=0;i<num;i++)
cout << nums[i]; //bad access when giving inputs bigger than 8
cout << endl;
int k;
for ( k = num - 2; k >= 0; --k )
if (nums[k] < nums[k+1])
break;
if ( k == -1 )
isfinished=true;
else
{
int ceilIndex = findCeil( nums, nums[k], k + 1, num - 1 );
swap( nums[k], nums[ceilIndex] );
selection_sort(nums,num,k+1);
sortedPermutations_ASC(nums,num);
}
}
void sortedPermutations_DESC (vector <int> nums,int num)
{
int i;
bool isfinished=false;
if(isfinished==false)
for(i=0;i<num;i++)
cout << nums[i];
cout << endl;
int k;
for ( k = num - 2; k >= 0; --k )
if (nums[k] > nums[k+1])
break;
if ( k == -1 )
isfinished=true;
else
{
int bottomIndex = findBottom( nums, nums[k], k + 1, num - 1 );
swap( nums[k], nums[bottomIndex] );
desc_sort(nums,num,k+1);
sortedPermutations_DESC(nums,num);
}
return;
}
int main(){
vector <int> nums;
string line,temp;
int num,j,k;
getline(cin,line);
while(j<line.size() && line[j]!=' ')
j++;
num=stoi(line.substr(0,j));
string kind;
j++;
kind=line.substr(j);
if(kind=="ASC"){
for(k=0;k<num;k++)
nums.push_back(k+1);
sortedPermutations_ASC(nums,num);
}
if(kind=="DESC"){
for(k=0;k<num;k++)
nums.push_back(num-k);
sortedPermutations_DESC(nums,num);
}
return 0;
}
here's is my code. it gives the permutations of a number.It works properly when inputs are between 1 and 8 .But it doesn't work with numbers bigger than 8 .
for example if I give
9 ASC (it means in Ascending order)
to the program , I get "Segmentation Fault:11" in terminal (mac) after printing some of the permutations .
I tried running it in Xcode . with the same input it says :
Thread 1:EXC_BAD_ACCESS(code=2,address=0x7ffff5f3fffc8)
for the line that I put comment in front of it .
I don't know what to do anymore ...
Any help would be appreciated - thanks in advance

SSE2 intrinsics - comparing 2 __m128i's containing 4 int32's each to see how many are equal

I'm diving in SSE2 intrinsics for the first time and I'm not sure how to do this.
I want to compare 4 int32's to 4 other int32's and count how many are equal.
So I read my first 4 int32's, set them in a __m128i, do the same for the second set, and use _mm_cmpeq_epi32 for the comparison.
This should result in a __m128i containing 4 int32's, each one either 0xffffffff or 0 depending on whether the ints were equal.
But I have no idea how to get from that resulting __m128i to a count specifying how many were actually equal.
Can anyone point me in the right direction ?
The code as far as I'm piecing it together :
int* source = blah;
int* reference = otherblah;
// Load the 4 source int32's (they are actually 4 int32s apart)
__m128i first_4_int32s = _mm_set_epi32(*(source + 12), *(source + 8), *(source + 4), *(source));
// Load the 4 source int32's (also actually 4 int32s apart)
__m128i second_4_int32s = _mm_set_epi32(*(reference + 12), *(reference + 8), *(reference + 4), *(reference));
// Compare the int32's
__m128i result = _mm_cmpeq_epi32(first_4_int32s, second_4_int32s);
// Perform magic here that counts whether 0, 1, 2, 3 or all 4 ints were equal ?!?!
You can AND the compare result with a vector of ones to create a vector of zeros and ones. Then use a horizontal add operation to count the ones. Here are some possibilities.
#include "stdio.h"
#include "stdint.h"
#include "intrin.h"
//----------------------------------------------------------------------------
// non-SSE method (reference for result check)
static int method0 (__m128i value)
{
int index, total = 0;
uint32_t *buffer = (void *) &value;
for (index = 0; index < 4; index++)
total += buffer [index] == 0xFFFFFFFF;
return total;
}
//----------------------------------------------------------------------------
//
// horizontalAddBytes - return integer total of all 16 bytes in xmm argument
//
static int horizontalAddBytes (__m128i byteArray)
{
__m128i total;
const __m128i zero = _mm_setzero_si128 ();
total = _mm_sad_epu8 (byteArray, zero);
return _mm_cvtsi128_si64 (_mm_add_epi32 (total, _mm_shuffle_epi32 (total, 0xAA)));
}
//----------------------------------------------------------------------------
// requires SSE2
static int method1 (__m128i value)
{
return horizontalAddBytes (_mm_srli_epi32 (value, 31));
}
//----------------------------------------------------------------------------
// requires SSE3
static int method2 (__m128i value)
{
__m128 count;
const __m128 mask = _mm_set1_ps (1);
count = _mm_and_ps (_mm_castsi128_ps (value), mask);
count = _mm_hadd_ps (count, count);
count = _mm_hadd_ps (count, count);
return _mm_cvtss_si32 (count);
}
//----------------------------------------------------------------------------
// requires SSSE3
static int method3 (__m128i value)
{
__m128i count;
count = _mm_srli_epi32 (value, 31);
count = _mm_hadd_epi32 (count, count);
count = _mm_hadd_epi32 (count, count);
return _mm_cvtsi128_si32 (count);
}
//----------------------------------------------------------------------------
static void createTestData (uint32_t *data, int mask)
{
int index;
for (index = 0; index < 4; index++)
data [index * 4] = (mask & (1 << index)) != 0;
}
//----------------------------------------------------------------------------
int main (void)
{
int index1, index2, expected, result1, result2, result3;
uint32_t source [16];
uint32_t reference [16];
for (index1 = 0; index1 < 16; index1++)
for (index2 = 0; index2 < 16; index2++)
{
__m128i first_4_int32s, second_4_int32s, result;
createTestData (source, index1);
createTestData (reference, index2);
// Load the 4 source int32's (they are actually 4 int32s apart)
first_4_int32s = _mm_set_epi32(*(source + 12), *(source + 8), *(source + 4), *(source));
// Load the 4 source int32's (also actually 4 int32s apart)
second_4_int32s = _mm_set_epi32(*(reference + 12), *(reference + 8), *(reference + 4), *(reference));
// Compare the int32's
result = _mm_cmpeq_epi32(first_4_int32s, second_4_int32s);
expected = method0 (result);
result1 = method1 (result);
result2 = method2 (result);
result3 = method3 (result);
if (result1 != expected) printf ("method1, index %d,%d expected %d, actual %d\n", index1, index2, expected, result1);
if (result2 != expected) printf ("method2, index %d,%d expected %d, actual %d\n", index1, index2, expected, result2);
if (result3 != expected) printf ("method3, index %d,%d expected %d, actual %d\n", index1, index2, expected, result3);
}
return 0;
}
//----------------------------------------------------------------------------

Unhandled exception error with two dimensional array

This dynamic programming algorithm is returning unhandled exception error probably due to the two dimensional arrays that I am using for various (and very large) number of inputs. I can't seem to figure out the issue here. The complete program as follows:
// A Dynamic Programming based solution for 0-1 Knapsack problem
#include<stdio.h>
#include<stdlib.h>
#define MAX 10000
int size;
int Weight;
int p[MAX];
int w[MAX];
// A utility function that returns maximum of two integers
int maximum(int a, int b) { return (a > b) ? a : b; }
// Returns the maximum value that can be put in a knapsack of capacity W
int knapSack(int W, int wt[], int val[], int n)
{
int i, w;
int retVal;
int **K;
K = (int**)calloc(n+1, sizeof(int*));
for (i = 0; i < n + 1; ++i)
{
K[i] = (int*)calloc(W + 1, sizeof(int));
}
// Build table K[][] in bottom up manner
for (i = 0; i <= n; i++)
{
for (w = 0; w <= W; w++)
{
if (i == 0 || w == 0)
K[i][w] = 0;
else if (wt[i - 1] <= w)
K[i][w] = maximum(val[i - 1] + K[i - 1][w - wt[i - 1]], K[i - 1][w]);
else
K[i][w] = K[i - 1][w];
}
}
retVal = K[n][W];
for (i = 0; i < size + 1; i++)
free(K[i]);
free(K);
return retVal;
}
int random_in_range(unsigned int min, unsigned int max)
{
int base_random = rand();
if (RAND_MAX == base_random) return random_in_range(min, max);
int range = max - min,
remainder = RAND_MAX % range,
bucket = RAND_MAX / range;
if (base_random < RAND_MAX - remainder) {
return min + base_random / bucket;
}
else {
return random_in_range(min, max);
}
}
int main()
{
srand(time(NULL));
int val = 0;
int i, j;
//each input set is contained in an array
int batch[] = { 10, 20, 30, 40, 50, 5000, 10000 };
int sizeOfBatch = sizeof(batch) / sizeof(batch[0]);
//algorithms are called per size of the input array
for (i = 0; i < sizeOfBatch; i++){
printf("\n");
//dynamic array allocation (variable length to avoid stack overflow
//calloc is used to avoid garbage values
int *p = (int*)calloc(batch[i], sizeof(int));
int *w = (int*)calloc(batch[i], sizeof(int));
for (j = 0; j < batch[i]; j++){
p[j] = random_in_range(1, 500);
w[j] = random_in_range(1, 100);
}
size = batch[i];
Weight = batch[i] * 25;
printf("| %d ", batch[i]);
printf(" %d", knapSack(Weight, w, p, size));
free(p);
free(w);
}
_getch();
return 0;
}
Change this:
for (i = 0; i < size + 1; i++)
free(K[i]);
free(K);
return K[size][Weight];
To this:
int retVal;
...
retVal = K[size][Weight];
for (i = 0; i < size + 1; i++)
free(K[i]);
free(K);
return retVal;

OpenCL : UNREACHABLE executed

I have the generic kernel that calculates part sums of array elements in temporary buffer.
#if FUNC_SUM
#define FUNC(a, b) b += a;
#elif FUNC_ABS_SUM
#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
#elif FUNC_SQR_SUM
#define FUNC(a, b) b += a * a;
#else
#error No sum function
#endif
__kernel void sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global srcT *src, __global dstT *dst)
{
int lid = get_local_id(0);
int gid = get_group_id(0);
int id = get_global_id(0);
int idx = offset + id + (id / cols) * invalid_cols;
__local dstT localmem_sum[128];
dstT sum = (dstT)(0), temp;
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
{
idx = offset + id + (id / cols) * invalid_cols;
temp = convertToDstT(src[idx]);
FUNC(temp, sum);
}
if (lid > 127)
localmem_sum[lid - 128] = sum; // ??
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 128)
localmem_sum[lid] = sum + localmem_sum[lid];
barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = 64; lsize > 0; lsize >>= 1)
{
if (lid < lsize)
{
int lid2 = lsize + lid;
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lid == 0)
dst[gid] = localmem_sum[0];
}
And this code fails with the message "UNREACHABLE executed!" on the line marked as // ??
Any wrong in this code? does some workaround exist to avoid this error?
Target platform: AMD GPU

Resources