OpenCL: Move data between __global memory - opencl

I am trying to move some data between 2 global memory before running a kernel on it.
Here buffer contains data that needs to be written in array, but sadly not contiguously:
/*
 * Sequentially unpack `buffer` into the four lateral halo faces of `array`.
 *
 * For each vector component v (0, 1) the buffer is consumed in this order:
 * top face (j = 0), bottom face (j = jm-1), left face (i = 0, corners
 * excluded), right face (i = im-1, corners excluded); each face is walked
 * k-major. i_buf is a single running read index over the whole buffer, so
 * buffer must hold 2 * (2*km*im + 2*km*(jm-2)) floats.
 *
 * No work-item id is used here: every work-item that executes this kernel
 * performs the entire copy by itself.
 *
 * NOTE(review): the `__kernel` qualifier, the closing braces and the
 * `i_buf++` increments were missing from the listing and have been restored;
 * confirm against the original source.
 */
__kernel void exchange_2_halo_write(
    __global float2 *array,
    __global float *buffer,
    const unsigned int im,
    const unsigned int jm,
    const unsigned int km
) {
    const unsigned int v_dim = 2;
    unsigned int i, j, k, v, i_buf = 0;
    // Which vector component, ie along v_dim
    for (v = 0; v < v_dim; v++) {
        // top halo
        for (k = 0; k < km; k++) {
            for (i = 0; i < im; i++) {
                ((__global float*)&array[i + k*im*jm])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // bottom halo
        for (k = 0; k < km; k++) {
            for (i = 0; i < im; i++) {
                ((__global float*)&array[i + k*im*jm + im*(jm-1)])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // left halo
        for (k = 0; k < km; k++) {
            for (j = 1; j < jm-1; j++) {
                ((__global float*)&array[j*im + k*im*jm])[v] = buffer[i_buf];
                i_buf++;
            }
        }
        // right halo
        for (k = 0; k < km; k++) {
            for (j = 1; j < jm-1; j++) {
                ((__global float*)&array[j*im + k*im*jm + (im-1)])[v] = buffer[i_buf];
                i_buf++;
            }
        }
    }
}
This works really fine in C (with a few minor changes), and for the data size I need (im = 150, jm = 150, km = 90, buf_sz = 107280), it runs in about 0.02s.
I had expected the same code to be slower on the GPU, but not that much slower: it actually takes about 90 minutes to do the same thing (that's about 250000x slower!).
Simply doing a straight element-by-element copy takes about 15 minutes, which clearly shows this is not the way to go.
for (i = 0; i < buf_sz; i++) {
array[i] = buffer[i];
In that case, I have seen that I can do something like this:
int xid = get_global_id(0);
array[xid] = buffer[xid];
which seems to work fine/quickly.
However, I do not know how to adapt this to use the conditions I have in the first code.
The top and bottom_halo parts have im contiguous elements to transfer to array, which I think means it could be ok to transfer easily. Sadly the left and right_halos don't.
Also with better code, can I expect to get somewhat close to the CPU time? If it is impossible to do it in, say, under 1s, it's probably going to be a waste.
Thank you.

Before the answer, 1 remark. When you do a for loop inside a kernel, like this:
for (i = 0; i < buf_sz; i++) {
array[i] = buffer[i];
and you launch, e.g., 512 work-items, you are doing the whole copy 512 times, not doing it once in parallel with 512 threads. So obviously it is going to be even slower — more than 512x slower!
That said, you can split it in this way:
2D Global size: km x max(im,jm)
/*
 * Parallel halo unpack: one work-item per (k, i) pair.
 *
 * Launch with a 2D global size of km x max(im, jm). Dimension 0 selects the
 * k-plane (km is taken from get_global_size(0)); dimension 1 is the position
 * along a halo edge and is reused as the i index for the top/bottom faces
 * and as the j index for the left/right faces.
 *
 * NOTE(review): the buffer offsets assume the packed layout produced by a
 * sequential v / face / k / i-or-j loop with a running index: component v
 * outermost, then top, bottom, left, right faces, each stored k-major.
 * Confirm against the code that fills `buffer`.
 */
__kernel void exchange_2_halo_write(
    __global float2 *array,
    __global float *buffer,
    const unsigned int im,
    const unsigned int jm
) {
    const unsigned int v_dim = 2;
    const unsigned int k = get_global_id(0);
    const unsigned int i = get_global_id(1);
    const unsigned int km = get_global_size(0);

    // Elements stored per k-plane in each kind of face.
    const unsigned int row_len = im;                     // top / bottom
    const unsigned int col_len = (jm > 2) ? jm - 2 : 0;  // left / right, corners excluded
    // Floats stored per vector component.
    const unsigned int per_v = 2*km*row_len + 2*km*col_len;
    // Offset of this work-item's k-plane inside array.
    const unsigned int plane = k*im*jm;

    // Which vector component, ie along v_dim
    for (unsigned int v = 0; v < v_dim; v++) {
        // Start of each face inside buffer for this component.
        const unsigned int top    = v*per_v;
        const unsigned int bottom = top + km*row_len;
        const unsigned int left   = bottom + km*row_len;
        const unsigned int right  = left + km*col_len;

        if (i < im) {
            // top halo
            ((__global float*)&array[i + plane])[v] = buffer[top + k*row_len + i];
            // bottom halo
            ((__global float*)&array[i + plane + im*(jm-1)])[v] = buffer[bottom + k*row_len + i];
        }
        // Tested independently of (i < im): dimension 1 spans max(im, jm),
        // so jm may exceed im. Written as i + 1 < jm to avoid unsigned
        // wrap-around of jm - 1 when jm == 0.
        if (i > 0 && i + 1 < jm) {
            // left halo
            ((__global float*)&array[i*im + plane])[v] = buffer[left + k*col_len + (i-1)];
            // right halo
            ((__global float*)&array[i*im + plane + (im-1)])[v] = buffer[right + k*col_len + (i-1)];
        }
    }
}
Other options are possible, like using local memory, but that is tedious work.


How do I plot E8 (Exceptional Lie Group order 8) in 2D?

For the last week or so I have been struggling to find a resource that will allow me to make something like the 2D petrie polygon diagrams in this article.
My main trouble is finding out what the rules are for the edge and node connections.
I.e., in this plot, is there a simple way to make the image from scratch (even if it is not fully representative of the bigger theory behind it)?
Any help is massively appreciated!
Here is how I solved this problem!
// to run
// clink -c Ex8
// ./Ex8
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "dislin.h"
// method to generate all permutations of a set with repeated elements:
// the root system
float root_sys[240][8];
int count = 0;
/// Decides whether base[curr] may be swapped into the slot being filled.
/// Returns 0 when a value equal to base[curr] already occurs in
/// base[start .. curr-1], and 1 otherwise. The caller uses a 0 result to
/// skip the swap, which avoids emitting the same arrangement twice when the
/// input contains repeated values.
int shouldSwap(float base[], int start, int curr)
{
    for (int i = start; i < curr; i++) {
        if (base[i] == base[curr]) {
            return 0;
        }
    }
    return 1;
}
/// performs permutations of root system
void permutations(float base[], int index, int n)
if (index >= n) {
for(int i = 0; i < n; i++){
root_sys[count][i] = base[i];
for (int i = index; i < n; i++) {
int check = shouldSwap(base, index, i);
if (check) {
float temp_0 = base[index];
float temp_1 = base[i];
base[index] = temp_1;
base[i] = temp_0;
permutations(base, index + 1, n);
float temp_2 = base[index];
float temp_3 = base[i];
base[index] = temp_3;
base[i] = temp_2;
// Squared Euclidean distance between two 8-component vectors.
// Despite its name this is not an inner product: it returns
// sum_i (vect_0[i] - vect_1[i])^2 over i = 0..7.
float inner_product(float * vect_0, float * vect_1){
    float sum = 0;
    for(int i = 0; i < 8; i++){
        float diff = vect_0[i] - vect_1[i];
        sum = sum + (diff * diff);
    }
    return sum;
}
/// Inner (dot) product of two 8-component vectors:
/// returns sum_i vect_0[i] * vect_1[i] over i = 0..7.
float inner_product_plus(float * vect_0, float * vect_1){
    float sum = 0;
    for(int i = 0; i < 8; i++){
        sum = sum + (vect_0[i] * vect_1[i]);
    }
    return sum;
}
int main(void){
// base vector permutations of E8 root system
float base_sys[8][8] = {
//permute the base vectors
for(int i = 0; i < 8; i++){
//calculating distances between all roots, outputting correspondence matrix
int distance_matrix[240][240];
for(int i = 0; i < 240; i++){
int dist_m = 100;
for(int ii = 0; ii < 240; ii++){
float dist = inner_product(root_sys[i], root_sys[ii]);
if(dist == 2){ //connecting distance in E8
distance_matrix[i][ii] = 1;
}else{distance_matrix[i][ii] = 0;}; // assign, not compare: distance_matrix is an uninitialized local, so unconnected pairs must be cleared explicitly
//use another program to calculate eigenvectors of root system . . . after some fiddling, these vectors appear
float re[8] = {0.438217070641, 0.205187681291,
0.36459828198, 0.0124511903657,
-0.0124511903657, -0.36459828198,
-0.205187681291, -0.67645247517};
float im[8] = {-0.118465163028, 0.404927414852,
0.581970822973, 0.264896157496,
0.501826483552, 0.345040496917,
0.167997088796, 0.118465163028};
//define co-ordinate system for relevent points
float rings_x[240];
float rings_y[240];
//decide on which points belong to the system
for(int i = 0; i < 240; i++){
float current_point[8];
for(int ii = 0; ii < 8; ii++){
current_point[ii] = root_sys[i][ii];
rings_x[i] = inner_product_plus(current_point, re);
rings_y[i] = inner_product_plus(current_point, im);
//graph the system using DISLIN library
graf(-1.2, 1.2, -1.2, 1.2, -1.2, 1.2, -1.2, 1);
// a connection appears depending on the previously calculated distance matrix
for(int i = 0; i < 240; i++){
for(int ii = 0; ii < 240; ii++){
int connect = distance_matrix[i][ii];
if(connect == 1){
rline(rings_x[i], rings_y[i], rings_x[ii], rings_y[ii]);
distance_matrix[ii][i] = 0;
// More DISLIN functions
titlin("E8", 1);
name("R-axis", "x");
name("I-axis", "y");
qplsca(rings_x, rings_y, 240);
return 0;
Extra points to anyone who can explain how to rotate the 2d plot to create a 3-d animation of this object

PyOpenCL - not seeing expected speedup

In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).
To debug this, I then tried naive matrix multiplication as a comparison, and only see a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.
Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time
size = 4000
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)
a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)
prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
res[size * i + j] = 0;
for (int k = 0; k < size; k++)
res[size * i + j] += a[k + size * j] * b[i + size * k];
t = time.time()
# Arguments follow the kernel signature (size, a, b, res); the device arrays
# are passed through their underlying buffers.
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
Following the suggestion to use a local register to accumulate the results, I modified my code as follows, getting about 90 gflops at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the gflops would require a more sophisticated matrix multiplication algorithm which reuses the same data stored in cache multiple times, but is outside the scope of this question.
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
float temp = 0;
for (int k = 0; k < size; k++)
temp += a[k + size * j] * b[i + size * k];
res[size * i + j] = temp;
EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.
prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A,
__global const float * B, __global float * res) {
int ig = get_group_id(0);
int jg = get_group_id(1);
int il = get_local_id(0);
int jl = get_local_id(1);
const int memtile = 64;
const int regtile = 4;
volatile int il2;
volatile int jl2;
int iglob = memtile*ig + regtile*il;
int jglob = memtile*jg + regtile*jl;
__local float Asub[64][64];
__local float Bsub[64][64];
float acc[4][4];
float Areg;
float Breg[4];
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
acc[k][m] = 0;
for (int l = 0; l < size/memtile; l++) {
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
il2 = il*regtile + k;
jl2 = jl*regtile + m;
Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
for (int k = 0; k < regtile; k++) {
for (int r = 0; r < regtile; r++) {
Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
for (int m = 0; m < regtile; m++) {
Areg = Asub[il*regtile+m][jl*regtile+k];
for (int r = 0; r < regtile; r++) {
acc[k][m] += Areg*Breg[r];
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
res[size*(iglob+k)+jglob+m] = acc[k][m];
t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
# Arguments follow the kernel signature (size, A, B, res); the device arrays
# are passed through their underlying buffers.
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))

OpenCl : Speed comparison of using global memory and private memory

I am learning OpenCL and I've stumbled upon these two code snippets, and now I am wondering why using private memory is much faster than just using global memory.
kernel void mmul(
const int N,
global float* A,
global float* B,
global float* C)
int k, j;
int i = get_global_id(0);
float tmp;
if (i < N) {
for (j = 0; j < N; j++) {
tmp = 0.0f;
for (k = 0; k < N; k++)
tmp += A[i*N+k] * B[k*N+j];
C[i*N+j] = tmp;
and between this
kernel void mmul(
const int N,
global float* A,
global float* B,
global float* C)
int k, j;
int i = get_global_id(0);
float Awrk[2048];
float tmp;
if (i < N) {
for (k = 0; k < N; k++)
Awrk[k] = A[i*N+k];
for (j = 0; j < N; j++) {
tmp = 0.0;
for (k = 0; k < N; k++)
tmp += Awrk[k] * B[k*N+j];
C[i*N+j] = tmp;
In the bottom code snippet, the code allocates a private array, Awrk[2048], and copies data into it from the global float array A, which I would think is a wasted operation. However, the bottom code is much faster (4.27 seconds) than the top one (about 14 seconds). Why is that?
Thank you.

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimension vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000, and localSize = 128, globalSize should be 1024? Can we always set globalSize some multiple of localSize and larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2d problems, for example, multiply two matrices of dimension m*n and n*p, the result matrix is of order m*p, things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I find out the indices where the matrix element is wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...)
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
int i = get_global_id(1);
int j = get_global_id(0);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
And then it is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
A[i] = i;
for(int i = 0; i < n*p; i++)
B[i] = i;
// Get available platforms
vector<Platform> platforms;
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("");
std::string sourceCode(
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices;
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
float z = 0.0;
for(int k = 0; k < n; k++)
z += A[i*n+k] * B[k*p+j];
c[i*p+j] = z;
for(int i = 0; i < m*p; i++)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
delete []A;
delete []B;
delete []C;
catch(Error error)
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
return 0;
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
Let's assume, that you have float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine size of Local Group first:
size_t localSize[] = {16, 8};
Then determine, how many Local Groups do you need:
// Round up with integer arithmetic: size/localSize[n] is an integer division,
// so wrapping it in ceil() would not recover the truncated remainder.
size_t numLocalGroups[] = {(size + localSize[0] - 1) / localSize[0], (size + localSize[1] - 1) / localSize[1]};
Finally, determine NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in right-most Local Groups.

OpenCL Matrix multiplication: inner product versus outer product

I'm hoping everyone is familiar with the standard "naive" method of multiplying two (n x n square for simplicity) matrices. In C this is:
for(int i = 0; i < n; ++i)
for(int j = 0; j < n; ++j)
for(int k = 0; k < n; ++k)
C[i*n + j] += A[i*n + k] * B[k*n + j];
The above method computes the dot (inner) product of a row of A with a column of B and is easy to implement in OpenCL as follows:
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
for(int i = 0; i < n; i++)
C[row*n + col] += A[row*n + i]*B[i*n + col];
Interchanging the two inner-most loops of the original C implementation results in a method that computes outer products, i.e., it computes rank-1 updates of the rows of the C matrix:
for(int i = 0; i < n; ++i)
for(int k = 0; k < n; ++k)
for(int j = 0; j < n; ++j)
C[i*n + j] += A[i*n + k] * B[k*n + j];
Does anybody know how to properly implement the above outer-product method in OpenCL? I have two of my attempts pasted below but I just can't seem to nail it
Attempt 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
__local float r;
r = A[row*n + col];
for(int i = 0; i < n; ++i)
C[row*n + i] += r * B[col*n + i];
Attempt 2
#define TS 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
int n)
// Thread coordinates
const int row = get_local_id(1); // row
const int col = get_local_id(0); // col
// Group tile coordinates
const int by = get_group_id(1); // row
const int bx = get_group_id(0); // col
A += TS*by + TS*bx*n + n*row + (col);
B += TS*by*n + n*row + (col);
C += TS*bx*n + n*(row) + col;
__global const float *Blast = B + n;
float c[2] = {0.0f,0.0f};
float* cptr = &c[0];
__local float bs[2];
bs[0] = B[0];
bs[1] = B[n];
*cptr += A[0] * bs[0];
*cptr++ += A[0] * bs[1];
} while( B < Blast );
C[0] += c[0];
C[1] += c[1];
The OpenCL implementation of the common algorithm maps the outer two loops to the OpenCL NDRange implicit loops. This works because the outer two loops can be safely run in parallel.
There are a few problems with Attempt 1:
The __local variable r is assigned different values from multiple work-items simultaneously. There is a race condition here, the value of r is undefined. This could be fixed by just making r a private variable instead.
The more serious problem is that there is a race condition in the assignment of C. Every value of col (NDRange dimension 0) will be running its own loop over i in parallel.
There isn't a simple way around the second issue. The loop over k (in the transposed version) cannot be run in parallel. You can only map either the outer loop or the inner loop to a single dimensional NDRange in OpenCL.
