int main() {
const int i = 1;
const int* p = &i;
int j = 2;
const int* q = &j;
j = 3;
printf("%d", *p + *q);
return 0;
}
I have this code, and I try to understand how it compiles. p and q are pointers to constant integers but j isn't declared as constant. Moreover, j changes into 3.
How does it work?
Thanks!
On the 5th line, you're assigning the address of variable j to q. This do not enforce any constraint over j, just on pointer q. Through q the compiler won't allow you to change pointed value, however j remains writeable and the line j = 3; is legal.
See What is the difference between const int*, const int * const, and int const *?
Related
I am trying to include a local atomic similar to that described by DarkZeros here within a working reduction kernel. The kernel finds a largest value within a set of points; the aim of the local atomic is to allow me to filter selected point_ids into an output array without any gaps.
At present when I use the local atomic to increment the addition to a local array the kernel runs but produces a wrong overall highest point. If the atomic line is commented out then a correct result returns.
What is going on here and how do I fix it?
Simplified kernel code:
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items, //points and index
__global int* output, __local float4* shared, const unsigned int n, //finding highest
__global int* filtered, __global const float2* tri_input, const unsigned int pass, //finding filtered
__global int* global_count //global count
){
//set everything up
const unsigned int group_id = get_global_id(0) / get_local_size(0);
const unsigned int local_id = get_local_id(0);
const unsigned int group_size = items;
const unsigned int group_stride = 2 * group_size;
const int local_stride = group_stride * group_size;
__local float4 *zeroIt = &shared[local_id];
zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;
volatile __local int local_count_set_1;
volatile __local int global_val_set_1;
volatile __local int filter_local[64];
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
int i = group_id * group_stride + local_id;
while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
int ia = input[i];
float4 a = dataSet[ia-1];
int ib = input[i + group_size];
float4 b = dataSet[ib-1];
//on the first pass kernel increment a local count
if(pass == 0){
filter_local[atomic_inc(&local_count_set_1)] = 1; //including this line causes an erroneous highest point result
//filter_local[local_id] = 1; //but including this line does not
//atomic_inc(&local_count_set_1); //and neither does this one
}
//find the highest of the pair
float4 result;
if(a.z>b.z) result = a;
else result = b;
//load up the previous highest result locally
float4 s = shared[local_id];
//if the previous highest beat this, stick, else twist
if(s.z>result.z){ result = s; }
shared[local_id] = result;
i += local_stride;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (group_size >= 512){
if (local_id < 256) {
__local float4 *a = &shared[local_id];
__local float4 *b = &shared[local_id+256];
if(b->z>a->z){ shared[local_id] = shared[local_id+256]; }
}}
//repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
//finally, return the filtered highest result of shared to the global level
barrier(CLK_LOCAL_MEM_FENCE);
if(local_id == 0){
__local float4 *v = &shared[0];
int send = v->w ;
output[group_id] = send+1;
}}
[UPDATE]: When the atomic_inc line is included the 'wrong' highest point result is always a point near the end of the test dataset. I'm guessing that this means that the atomic_inc is affecting a latter comparison, but I'm not sure exactly what or where yet.
[UPDATE]: Edited code to simplify/clarify/update with debugging tweaks. Still not working and it is driving me loopy.
Total face-palm moment. In the setup phase of the kernel there are the lines:
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
When these are split and the local_count_set_1 is included within the while loop, the error does not occur. i.e:
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
while (i < n){
if(local_id==0) local_count_set_1 = 0;
barrier(CLK_LOCAL_MEM_FENCE);
....
if(pass = 0){
filter_local[atomic_inc(&local_count_set_1)] = 1;
}
....
I'm hoping this fixes the issue // will update if not.
Aaaand that's a weekend I'll never get back.
I'm hoping everyone is familiar with the standard "naive" method of multiplying two (n x n square for simplicity) matrices. In C this is:
for(int i = 0; i < n; ++i)
for(int j = 0; j < n; ++j)
for(int k = 0; k < n; ++k)
C[i*n + j] += A[i*n + k] * B[k*n + j];
The above method computes the dot (inner) product of a row of A with a column of B and is easy to implement in OpenCL as follows:
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
)
{
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
for(int i = 0; i < n; i++)
C[row*n + col] += A[row*n + i]*B[i*n + col];
}
Interchanging the two inner-most loops of the original C implementation results in a method that computes outer products, i.e., it computes rank-1 updates of the rows of the C matrix:
for(int i = 0; i < n; ++i)
for(int k = 0; k < n; ++k)
for(int j = 0; j < n; ++j)
C[i*n + j] += A[i*n + k] * B[k*n + j];
Does anybody know how to properly implement the above outer-product method in OpenCL? I have two of my attempts pasted below but I just can't seem to nail it
Attempt 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
)
{
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
__local float r;
r = A[row*n + col];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = 0; i < n; ++i)
C[row*n + i] += r * B[col*n + i];
}
Attempt 2
#define TS 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
int n)
{
// Thread coordinates
const int row = get_local_id(1); // row
const int col = get_local_id(0); // col
// Group tile coordinates
const int by = get_group_id(1); // row
const int bx = get_group_id(0); // col
A += TS*by + TS*bx*n + n*row + (col);
B += TS*by*n + n*row + (col);
C += TS*bx*n + n*(row) + col;
__global const float *Blast = B + n;
float c[2] = {0.0f,0.0f};
float* cptr = &c[0];
__local float bs[2];
do
{
bs[0] = B[0];
bs[1] = B[n];
barrier(CLK_LOCAL_MEM_FENCE);
*cptr += A[0] * bs[0];
*cptr++ += A[0] * bs[1];
B++;
barrier(CLK_LOCAL_MEM_FENCE);
} while( B < Blast );
C[0] += c[0];
C[1] += c[1];
}
The OpenCL implementation of the common algorithm maps the outer two loops to the OpenCL NDRange implicit loops. This works because the outer two loops can be safely run in parallel.
There are a few problems with Attempt 1:
The __local variable r is assigned different values from multiple work-items simultaneously. There is a race condition here, the value of r is undefined. This could be fixed by just making r a private variable instead.
The more serious problem is that there is a race condition in the assignment of C. Every value of col (NDRange dimension 0) will be running its own loop over i in parallel.
There isn't a simple way around the second issue. The loop over k (in the transposed version) cannot be run in parallel. You can only map either the outer loop or the inner loop to a single dimensional NDRange in OpenCL.
struct a{
double array[2][3];
};
struct b{
double array[3][4];
};
void main(){
a x = {{1,2,3,4,5,6}};
b y = {{1,2,3,4,5,6,7,8,9,10,11,12}};
}
I have two structs, inside which there are two dim arrays with different sizes. If I want to define only one function, which can deal with both x and y (one for each time), i.e., the function allows both x.array and y.array to be its argument. How can I define the input argument? I think I should use a pointer.... But **x.array seems not to work.
For example, I want to write a function PrintArray which can print the input array.
void PrintArray( ){}
What should I input into the parenthesis? double ** seems not work for me... (we can let dimension to be the PrintArray's argument as well, telling them its 2*3 array)
Write a function that takes three parameters: a pointer, the number of rows, and the number of columns. When you call the function, reduce the array to a pointer.
void PrintArray(const double *a, int rows, int cols) {
int r, c;
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
printf("%3.1f ", a[r * cols + c]);
}
printf("\n");
}
}
int main(){
struct a x = {{{1,2,3},{4,5,6}}};
struct b y = {{{1,2,3,4},{5,6,7,8},{9,10,11,12}}};
PrintArray(&x.array[0][0], 2, 3);
PrintArray(&y.array[0][0], 3, 4);
return 0;
}
I have int A, B, C. And A is in range 0-9999, B is 0-99, C is 0-99.
Because the function must return only one double, I think of putting them all into one number. Otherwise I need to call function three times.
But I cannot write an efficient code to do this. This will be called millions times, so it should be quite effective, but no ASM.
I need a function double pack3int_to_double(int A, int B, int C) {}
Couldn't you just store A + 1000B + 100000C?
For example, if you wanted to store A = 1234, B = 6, and C = 89, you'd just store
89061234
CCBAAAA
You can then extract the numbers by casting the double to an int and using standard integer division and modulus tricks to recover the individual values.
Hope this helps!
If A<10,000 and B & C <100, A can be expressed with 14 bits, and B & C with 8 bits. Thus you need 30 bits in total.
You could therefore pack/unpack the integers by shifting it to the right place:
int packed = A + B<<14 + C<<22;
A = packed & 0x3FFF; B = (packed >> 14) & 0xFF; C = (packed >> 22) & 0xFF;
Bit shifting is of course MUCH faster than multiply/divide, and you can cast the int to a double and vice versa.
This is technically not legal C code, so you would use this at your own risk:
typedef union {
double x;
struct {
unsigned a : 14;
unsigned b : 7;
unsigned c : 7;
} y;
} result_t;
The C standard doesn't allow using a union member to write a value and a different one to read it out, but I am not aware of a compiler that does the static analysis to diagnose such a problem (it doesn't mean one won't do so in the future). Also, using certain int values may result in a trap representation for a double. But, if you know your system will not generate any trap representations, you can consider using this.
double pack3int_to_double(int A, int B, int C) {
result_t r;
r.y.a = A;
r.y.b = B;
r.y.c = C;
return r.x;
}
void unpack3int_from_double (double X, int *A, int *B, int *C) {
result_t r = { X };
*A = r.y.a;
*B = r.y.b;
*C = r.y.c;
}
You can use out parameters in function call and retrieve all 3 int variables.
You could return a NaN double with the data stored in the mantissa. That gives you 53 bits to utilize. Should be plenty.
http://en.m.wikipedia.org/wiki/NaN
Inspired by your answers, this is what I come up so far. This should be quite efficient, and only 32 bits are used, so the exponent of the double is not touched.
struct pack_abc {
unsigned short a;
unsigned char b, c;
int safety;
};
double pack3int_to_double(int A, int B, int C) {
struct pack_abc R = {A, B, C, 0}; // or 0 could be replaced with something smater, like NaN?
return *(double*)&R;
}
void main() {
int w = 1234, a = 56, d = 78;
int W, A, D, i;
double p = pack3int_to_double(w, a, d);
// we got the data packed into 'p', now let's unpack it
struct pack_abc *R = (struct pack_abc*) & p;
printf("%i %i %i\n", (int)R->a, (int)R->b, (int)R->c);
}
I am working on a piece of OpencL code for a specialized matrix function: for a Dx1 vector v, two DxD matrices A and B and a constant c, return 1xD vector r where r[i] = c * sum_over_j (v[j] * A[i][j] * B[i][j])
Below is what I have so far, but it runs freakishly slow. A version without summing that returns a DxD matrix is about ten times faster. It's called from PyOpenCL if that makes any difference.
Is anything done wrong? Could it be optimized?
#define D 1000
...
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
int y = get_global_id(1);
float sum = 0;
for(int k = 0; k < D; k++)
{
sum += vector[k] * matrix[(y*D) + k]
* matrix2[(y*D) + k ];
}
result[y] = sum * factor;
}
Cheers!
Optimization #1: make vector __local.
My first pass at this got a decent improvement in performance. I noticed that each vector[k] is read a total of D times, so I copied it to a __local. This is only possible because D is small enough to allow this. The kernel as you have it above suffers from a terrible ALU:fetch ratio of 0.08 on both the 5870 and the 6970 gpus. Even the slower gpus are still waiting on the memory access.
#define D 1000
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
int y = get_global_id(0);
float sum = 0;
__local float vectCopy[D];
int ls = get_local_size(0);
int lid = get_local_id(0);
for(int i=0;i<D;i+=ls){
vectCopy[i+lid] = vector[i+lid];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
for(int k = 0; k < D; k++)
{
sum += vectCopy[k] * matrix[(y*D) + k] * matrix2[(y*D) + k ];
}
result[y] = sum * factor;
}
With this change, APP profiler is showing a new ALU:fetch ratio of 0.20 for the 5870 and 6970 gpus. Average times changed from 1513-->1034, and 1261-->861 on the same cards. The low end gpus are now bound by ALU instead of fetch. (greater than 4:1 ratio)
Opimization #2: calculate each result[y] using an entire work group.
You would have to do this id D were much larger (100k+). The idea is to get the best memory access pattern by using the work group to compute a single element of the result at a time. I defined ls (local size) to be 64 here, because it works on my hardware, as well as most vendors'. The workgroup size you use from the host-side will have to be 64 unless you change that definition. It needs to be defined to create the sum[ls] storage as __local, and I don't like passing variable sized __local vars into my kernels.
results: 5870 ALU:fetch=0.59:1, avg=708. 6970 ALU:fetch=0.72, avg=590. According to APP profiler, this is about twice as fast as your original listing.
#define D 1000
#define ls 64
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
__local float vectCopy[D];
int lid = get_local_id(0);
for(int i=0;i<D;i+=ls){
vectCopy[i+lid] = vector[i+lid];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
int ng = get_num_groups(0);
int gid = get_group_id(0);
int y, k;
__local float sum[ls];
for(y = gid; y < D; y+=ng){
for(k = lid; k < D; k+=ls)
{
sum[lid] += vectCopy[k] * matrix[(y*D) + k] * matrix2[(y*D) + k ];
}
if(lid==0){
result[y] = sum[0];
for(k=1;k<ls;k++){
result[y] += sum[k];
}
result[y] *= factor;
}
mem_fence(CLK_LOCAL_MEM_FENCE);
}
}
EDIT: APP profiler = AMD APP KernelAnalyzer