OpenCL: Speed comparison of using global memory and private memory

I am learning OpenCL and I've stumbled upon these two code snippets, and now I am wondering why using private memory is so much faster than just using global memory.
kernel void mmul(
    const int N,
    global float* A,
    global float* B,
    global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float tmp;
    if (i < N) {
        for (j = 0; j < N; j++) {
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += A[i*N+k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}
and this one:
kernel void mmul(
    const int N,
    global float* A,
    global float* B,
    global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float Awrk[2048];
    float tmp;
    if (i < N) {
        for (k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];
        for (j = 0; j < N; j++) {
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += Awrk[k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}
In the bottom snippet, the code allocates a private array, Awrk[2048], and copies data into it from the global float array A, which I would have thought was a wasted operation. However, the bottom code is much faster (4.27 seconds) than the top one (about 14 seconds). Why is that?
Thank you.

Related

PyOpenCL - not seeing expected speedup

In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).
To debug this, I then tried naive matrix multiplication as a comparison, and saw only a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.
Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time
size = 4000
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)
a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)
prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
                               __global const float * b, __global float * res) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    res[size * i + j] = 0;
    for (int k = 0; k < size; k++)
    {
        res[size * i + j] += a[k + size * j] * b[i + size * k];
    }
}
""").build()
t = time.time()
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)
task.wait()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
Following the suggestion to use a private register to accumulate the results, I modified my code as follows, getting about 90 GFLOPs at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the GFLOPs would require a more sophisticated matrix multiplication algorithm which reuses the same data stored in cache multiple times, but that is outside the scope of this question.
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
                               __global const float * b, __global float * res) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    float temp = 0;
    for (int k = 0; k < size; k++)
    {
        temp += a[k + size * j] * b[i + size * k];
    }
    res[size * i + j] = temp;
}
EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the example below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.
prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A,
                               __global const float * B, __global float * res) {
    int ig = get_group_id(0);
    int jg = get_group_id(1);
    int il = get_local_id(0);
    int jl = get_local_id(1);
    const int memtile = 64;
    const int regtile = 4;
    volatile int il2;
    volatile int jl2;
    int iglob = memtile*ig + regtile*il;
    int jglob = memtile*jg + regtile*jl;
    __local float Asub[64][64];
    __local float Bsub[64][64];
    float acc[4][4];
    float Areg;
    float Breg[4];
    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            acc[k][m] = 0;
        }
    }
    for (int l = 0; l < size/memtile; l++) {
        for (int k = 0; k < regtile; k++) {
            for (int m = 0; m < regtile; m++) {
                il2 = il*regtile + k;
                jl2 = jl*regtile + m;
                Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
                Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < regtile; k++) {
            for (int r = 0; r < regtile; r++) {
                Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
            }
            for (int m = 0; m < regtile; m++) {
                Areg = Asub[il*regtile+m][jl*regtile+k];
                for (int r = 0; r < regtile; r++) {
                    acc[k][m] += Areg*Breg[r];
                }
            }
        }
    }
    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            res[size*(iglob+k)+jglob+m] = acc[k][m];
        }
    }
}
""").build()
t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
queue.finish()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))

How to write a nested loop on the kernel side in OpenCL

I'm a beginner in OpenCL and I'm trying to implement an OpenCL application. I'm not sure how to write the kernel code; I have included the original C code below.
Question: how do I convert the given C code into an OpenCL kernel?
ORIGINAL C CODE:
int i, j;
// initialization of indexes
for (i = 0; i < n; i++)
    Index[i] = i;
// Bubble sort
for (i = 0; i < n - 1; i++)
{
    for (j = i + 1; j < n; j++)
    {
        if (I[i] > I[j])
        {
            double z = I[i]; // exchange attractiveness
            I[i] = I[j];
            I[j] = z;
            z = f[i]; // exchange fitness
            f[i] = f[j];
            f[j] = z;
            int k = Index[i]; // exchange indexes
            Index[i] = Index[j];
            Index[j] = k;
        }
    }
}
Example for 4096-element arrays (alternate bubble1 and bubble2 at least 2048 times ---> 4096 (N) kernel executions).
Index init stays on the host side since it's just an assignment.
Auxiliary functions:
void swap2p(__private int * I, int i, int j)
{
    int tmp = I[i];
    I[i] = I[j];
    I[j] = tmp;
}
void swap2g(__global int * I, int i, int j)
{
    int tmp = I[i];
    I[i] = I[j];
    I[j] = tmp;
}
Alternating kernel-1:
__kernel void bubble1(__global int * I, __global int * f, __global int * Index) {
    int threadId = get_global_id(0);
    __private int vals[2];
    if (threadId*2 + 1 < 4096)
    {
        vals[0] = I[threadId*2];
        vals[1] = I[threadId*2 + 1];
        if (vals[0] > vals[1])
        {
            swap2p(vals, 0, 1); // swap the two private copies
            swap2g(f, threadId*2, threadId*2 + 1);
            swap2g(Index, threadId*2, threadId*2 + 1);
            I[threadId*2] = vals[0];
            I[threadId*2 + 1] = vals[1];
        }
    }
}
Alternating kernel-2:
__kernel void bubble2(__global int * I, __global int * f, __global int * Index) {
    int threadId = get_global_id(0);
    __private int vals[2];
    if (threadId*2 + 2 < 4096)
    {
        vals[0] = I[threadId*2 + 1];
        vals[1] = I[threadId*2 + 2];
        if (vals[0] > vals[1])
        {
            swap2p(vals, 0, 1); // swap the two private copies
            swap2g(f, threadId*2 + 1, threadId*2 + 2);
            swap2g(Index, threadId*2 + 1, threadId*2 + 2);
            I[threadId*2 + 1] = vals[0];
            I[threadId*2 + 2] = vals[1];
        }
    }
}
Global thread number: N/2 (2048)
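A minimal host-side sketch of the alternating launches could look like this (the names queue, bubble1_k, and bubble2_k are assumptions for handles created earlier; kernel arguments are set beforehand with clSetKernelArg, and error checking is omitted):
size_t gsize = 2048;                       /* N/2 work-items per pass */
for (int pass = 0; pass < 2048; pass++) {  /* N/2 rounds of each kernel */
    /* even phase: compare pairs (0,1), (2,3), ... */
    clEnqueueNDRangeKernel(queue, bubble1_k, 1, NULL, &gsize, NULL, 0, NULL, NULL);
    /* odd phase: compare pairs (1,2), (3,4), ... */
    clEnqueueNDRangeKernel(queue, bubble2_k, 1, NULL, &gsize, NULL, 0, NULL, NULL);
}
clFinish(queue);                           /* wait for the final pass */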

OpenCL: Move data between __global memory

I am trying to move some data between two global memory buffers before running a kernel on them.
Here buffer contains data that needs to be written into array, but sadly not contiguously:
void exchange_2_halo_write(
    __global float2 *array,
    __global float *buffer,
    const unsigned int im,
    const unsigned int jm,
    const unsigned int km
) {
    const unsigned int v_dim = 2;
    unsigned int i, j, k, v, i_buf = 0;
    // Which vector component, ie along v_dim
    for (v = 0; v < v_dim; v++) {
        // top halo
        for (k = 0; k < km; k++) {
            for (i = 0; i < im; i++) {
                ((__global float*)&array[i + k*im*jm])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // bottom halo
        for (k = 0; k < km; k++) {
            for (i = 0; i < im; i++) {
                ((__global float*)&array[i + k*im*jm + im*(jm-1)])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // left halo
        for (k = 0; k < km; k++) {
            for (j = 1; j < jm-1; j++) {
                ((__global float*)&array[j*im + k*im*jm])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // right halo
        for (k = 0; k < km; k++) {
            for (j = 1; j < jm-1; j++) {
                ((__global float*)&array[j*im + k*im*jm + (im-1)])[v] = buffer[i_buf];
                i_buf++;
            }
        }
    }
}
This works really fine in C (with a few minor changes), and for the data size I need (im = 150, jm = 150, km = 90, buf_sz = 107280), it runs in about 0.02s.
I had expected the same code to be slower on the GPU, but not that much slower: it actually takes about 90 minutes to do the same thing (that's about 250000x slower!).
Simply doing a straight copy loop takes about 15 minutes, which clearly shows it is not the way to go.
for (i = 0; i < buf_sz; i++) {
    array[i] = buffer[i];
}
In that case, I have seen that I can do something like this:
int xid = get_global_id(0);
array[xid] = buffer[xid];
which seems to work fine/quickly.
However, I do not know how to adapt this to use the conditions I have in the first code.
The top and bottom halo parts have im contiguous elements to transfer to array, which I think means they could be transferred easily. Sadly, the left and right halos don't.
Also, with better code, can I expect to get somewhat close to the CPU time? If it is impossible to do it in, say, under 1 s, it's probably going to be a waste.
Thank you.
Before the answer, one remark. When you write a for loop inside a kernel, like this:
for (i = 0; i < buf_sz; i++) {
    array[i] = buffer[i];
}
and you launch, say, 512 work items, you are doing the whole copy 512 times, not splitting it across 512 threads in parallel. So obviously it is going to be even slower: more than 512x slower!
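Each work item should instead copy exactly one element. A minimal sketch of that pattern, with a bounds check in case the global size gets rounded up (passing buf_sz as a kernel argument here is an assumption):
__kernel void copy_buf(
    __global const float *buffer,
    __global float *array,
    const unsigned int buf_sz)
{
    const unsigned int xid = get_global_id(0);
    if (xid < buf_sz)   /* guard: global size may exceed buf_sz */
        array[xid] = buffer[xid];
}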
That said, you can split it in this way:
2D Global size: km x max(im,jm)
void exchange_2_halo_write(
    __global float2 *array,
    __global float *buffer,
    const unsigned int im,
    const unsigned int jm
) {
    const unsigned int v_dim = 2;
    const unsigned int k = get_global_id(0);
    const unsigned int i = get_global_id(1);
    const unsigned int km = get_global_size(0);
    // Which vector component, ie along v_dim
    for (unsigned int v = 0; v < v_dim; v++) {
        if (i < im) {
            // top halo
            ((__global float*)&array[i + k*im*jm])[v] = buffer[v*(2*km*im + 2*km*(jm-2))+km*i];
            // bottom halo
            ((__global float*)&array[i + k*im*jm + im*(jm-1)])[v] = buffer[v*(2*km*im + 2*km*(jm-2))+km*im+km*i];
        }
        if (i < jm-1 && i > 0) {
            // left halo
            ((__global float*)&array[i*im + k*im*jm])[v] = buffer[v*(2*km*im + 2*km*(jm-2))+km*im*2+km*(i-1)];
            // right halo
            ((__global float*)&array[i*im + k*im*jm + (im-1)])[v] = buffer[v*(2*km*im + 2*km*(jm-2))+km*im*2+km*(jm-2)+km*(i-1)];
        }
    }
}
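A hedged host-side launch for that 2D global size could look like the following (queue and halo_kernel are assumed handles; the kernel arguments are set beforehand with clSetKernelArg):
size_t gsize[2];
gsize[0] = km;                   /* dimension 0 supplies k and get_global_size(0) */
gsize[1] = (im > jm) ? im : jm;  /* dimension 1 covers both halo directions */
clEnqueueNDRangeKernel(queue, halo_kernel, 2, NULL, gsize, NULL, 0, NULL, NULL);
clFinish(queue);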
Other options are possible, like using local memory, but that is tedious work...

OpenCL Matrix multiplication: inner product versus outer product

I'm hoping everyone is familiar with the standard "naive" method of multiplying two (n x n square for simplicity) matrices. In C this is:
for(int i = 0; i < n; ++i)
    for(int j = 0; j < n; ++j)
        for(int k = 0; k < n; ++k)
            C[i*n + j] += A[i*n + k] * B[k*n + j];
The above method computes the dot (inner) product of a row of A with a column of B and is easy to implement in OpenCL as follows:
__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    const int n
)
{
    const int row = get_global_id(1); // row
    const int col = get_global_id(0); // col
    for(int i = 0; i < n; i++)
        C[row*n + col] += A[row*n + i]*B[i*n + col];
}
Interchanging the two inner-most loops of the original C implementation results in a method that computes outer products, i.e., it computes rank-1 updates of the rows of the C matrix:
for(int i = 0; i < n; ++i)
    for(int k = 0; k < n; ++k)
        for(int j = 0; j < n; ++j)
            C[i*n + j] += A[i*n + k] * B[k*n + j];
Does anybody know how to properly implement the above outer-product method in OpenCL? I have two of my attempts pasted below, but I just can't seem to nail it.
Attempt 1
__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    const int n
)
{
    const int row = get_global_id(1); // row
    const int col = get_global_id(0); // col
    __local float r;
    r = A[row*n + col];
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = 0; i < n; ++i)
        C[row*n + i] += r * B[col*n + i];
}
Attempt 2
#define TS 1
__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    int n)
{
    // Thread coordinates
    const int row = get_local_id(1); // row
    const int col = get_local_id(0); // col
    // Group tile coordinates
    const int by = get_group_id(1); // row
    const int bx = get_group_id(0); // col
    A += TS*by + TS*bx*n + n*row + (col);
    B += TS*by*n + n*row + (col);
    C += TS*bx*n + n*(row) + col;
    __global const float *Blast = B + n;
    float c[2] = {0.0f, 0.0f};
    float* cptr = &c[0];
    __local float bs[2];
    do
    {
        bs[0] = B[0];
        bs[1] = B[n];
        barrier(CLK_LOCAL_MEM_FENCE);
        *cptr += A[0] * bs[0];
        *cptr++ += A[0] * bs[1];
        B++;
        barrier(CLK_LOCAL_MEM_FENCE);
    } while( B < Blast );
    C[0] += c[0];
    C[1] += c[1];
}
The OpenCL implementation of the common algorithm maps the outer two loops to the OpenCL NDRange implicit loops. This works because the outer two loops can be safely run in parallel.
There are a few problems with Attempt 1:
The __local variable r is assigned different values by multiple work-items simultaneously. There is a race condition here; the value of r is undefined. This could be fixed by simply making r a private variable instead.
The more serious problem is the race condition in the assignment of C: every value of col (NDRange dimension 0) runs its own loop over i in parallel, so work-items with the same row but different col all update the same elements of C.
There isn't a simple way around the second issue. The loop over k (in the transposed version) cannot be run in parallel. You can only map either the outer loop or the inner loop to a one-dimensional NDRange in OpenCL.
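For illustration, a minimal sketch of that one-dimensional mapping (one possible approach, not the only one): parallelize only the outer i loop and keep the k and j loops serial inside each work-item. This preserves the outer-product loop order while each row of C is owned by exactly one work-item, so neither race above can occur:
__kernel void matmul_outer_1d(
    __global const float *A,
    __global const float *B,
    __global float *C,   /* assumed zero-initialized, as in the C loops */
    const int n)
{
    const int i = get_global_id(0);      /* one work-item per row of C */
    if (i < n) {
        for (int k = 0; k < n; ++k) {
            const float a = A[i*n + k];  /* private copy: no __local race */
            for (int j = 0; j < n; ++j)  /* rank-1 update of row i */
                C[i*n + j] += a * B[k*n + j];
        }
    }
}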

How are the gather instructions in AVX2 implemented?

Suppose I'm using AVX2's VGATHERDPS - this should load 8 single-precision floats using 8 DWORD indices.
What happens when the data to be loaded exists in different cache-lines? Is the instruction implemented as a hardware loop which fetches cache-lines one by one? Or, can it issue a load to multiple cache-lines at once?
I read a couple of papers which state the former (and that's the one which makes more sense to me), but I would like to know a bit more about this.
Link to one paper: http://arxiv.org/pdf/1401.7494.pdf
I did some benchmarking of the AVX gather instructions (on a Haswell CPU) and it seems to be a fairly simple brute force implementation - even when the elements to be loaded are contiguous it seems that there is still one read cycle per element, so performance is really no better than just doing scalar loads.
NB: this answer is now obsolete as things have changed considerably since Haswell. See the accepted answer for full details (unless you happen to be targeting Haswell CPUs).
Gather was first implemented with Haswell but was not optimized until Broadwell (the first generation after Haswell).
I wrote my own code to test gather (see below). Here is a summary on Skylake, SkylakeX (with a dedicated AVX512 port), and KNL systems.
              scalar  auto  AVX2  AVX512
Skylake  GCC    0.47  0.38  0.38  NA
SkylakeX GCC    0.56  0.23  0.35  0.24
KNL      GCC    3.95  1.37  2.11  1.16
KNL      ICC    3.92  1.17  2.31  1.17
From the table it's clear that in all cases gather loads are faster than scalar loads (for the benchmark I used).
I'm not sure how Intel implements gather internally. The masks don't seem to have an effect on performance for gather. That's one thing Intel could optimize: if you only read one scalar value due to the mask, it should be faster than gathering all values and then applying the mask.
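To illustrate what the mask operand does (a sketch with made-up names, separate from the benchmark code below): lanes whose mask sign bit is clear are not gathered and simply keep the value of src, which is the case Intel could in principle fast-path:
#include <immintrin.h>

/* Gather a[idx[k]] into the even lanes only; the odd lanes keep src. */
__m256d gather_even_lanes(const double *a, __m128i vidx) {
    __m256d src  = _mm256_setzero_pd();
    /* A lane is gathered only if its mask sign bit is set. */
    __m256d mask = _mm256_castsi256_pd(_mm256_setr_epi64x(-1, 0, -1, 0));
    return _mm256_mask_i32gather_pd(src, a, vidx, mask, 8);
}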
The Intel manual shows some nice figures on gather
https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
DCU = L1 Data Cache Unit. MCU = mid-level = L2 cache. LLC = last-level = L3 cache. L3 is shared, L2 and L1d are per-core private.
Intel is just benchmarking gathers, not using the result for anything.
//gather.c
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>

#define N 1024
#define R 1000000

void foo_auto(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n);
void foo1(double * restrict a, double * restrict b, int *idx, int n);
void foo2(double * restrict a, double * restrict b, int *idx, int n);
void foo3(double * restrict a, double * restrict b, int *idx, int n);

double test(int *idx, void (*fp)(double * restrict a, double * restrict b, int *idx, int n)) {
    double a[N];
    double b[N];
    double dtime;
    for(int i=0; i<N; i++) a[i] = 1.0*N;
    for(int i=0; i<N; i++) b[i] = 1.0;
    fp(a, b, idx, N);
    dtime = -omp_get_wtime();
    for(int i=0; i<R; i++) fp(a, b, idx, N);
    dtime += omp_get_wtime();
    return dtime;
}

int main(void) {
    //for(int i=0; i<N; i++) idx[i] = N - i - 1;
    //for(int i=0; i<N; i++) idx[i] = i;
    //for(int i=0; i<N; i++) idx[i] = rand()%N;
    //for(int i=0; i<R; i++) foo2(a, b, idx, N);
    int idx[N];
    double dtime;
    int ntests = 2;
    void (*fp[4])(double * restrict a, double * restrict b, int *idx, int n);
    fp[0] = foo_auto;
    fp[1] = foo_AVX2;
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
    fp[2] = foo_AVX512;
    ntests = 3;
#endif
    for(int i=0; i<ntests; i++) {
        for(int i=0; i<N; i++) idx[i] = 0;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f ", dtime);
        for(int i=0; i<N; i++) idx[i] = i;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f ", dtime);
        for(int i=0; i<N; i++) idx[i] = N-i-1;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f ", dtime);
        for(int i=0; i<N; i++) idx[i] = rand()%N;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f\n", dtime);
    }
    for(int i=0; i<N; i++) idx[i] = 0;
    test(idx, foo1);
    dtime = test(idx, foo1);
    printf("%.2f ", dtime);
    for(int i=0; i<N; i++) idx[i] = i;
    test(idx, foo2);
    dtime = test(idx, foo2);
    printf("%.2f ", dtime);
    for(int i=0; i<N; i++) idx[i] = N-i-1;
    test(idx, foo3);
    dtime = test(idx, foo3);
    printf("%.2f ", dtime);
    printf("NA\n");
}
//foo2.c
#include <x86intrin.h>

void foo_auto(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i++) b[i] = a[idx[i]];
}

void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i+=4) {
        __m128i vidx = _mm_loadu_si128((__m128i*)&idx[i]);
        // base pointer is a (not &a[i]): idx holds absolute indices, as in foo_auto
        __m256d av = _mm256_i32gather_pd(a, vidx, 8);
        _mm256_storeu_pd(&b[i], av);
    }
}

#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i+=8) {
        __m256i vidx = _mm256_loadu_si256((__m256i*)&idx[i]);
        __m512d av = _mm512_i32gather_pd(vidx, a, 8);
        _mm512_storeu_pd(&b[i], av);
    }
}
#endif

void foo1(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i++) b[i] = a[0];
}
void foo2(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i++) b[i] = a[i];
}
void foo3(double * restrict a, double * restrict b, int *idx, int n) {
    for(int i=0; i<n; i++) b[i] = a[n-i-1];
}
