How are the gather instructions in AVX2 implemented?

Suppose I'm using AVX2's VGATHERDPS - this should load 8 single-precision floats using 8 DWORD indices.
What happens when the data to be loaded resides in different cache lines? Is the instruction implemented as a hardware loop that fetches cache lines one by one, or can it issue loads to multiple cache lines at once?
I read a couple of papers which state the former (and that's the one which makes more sense to me), but I would like to know a bit more about this.
Link to one paper: http://arxiv.org/pdf/1401.7494.pdf

I did some benchmarking of the AVX2 gather instructions (on a Haswell CPU) and it seems to be a fairly simple brute-force implementation: even when the elements to be loaded are contiguous, there still appears to be one read cycle per element, so performance is really no better than just doing scalar loads.
NB: this answer is now obsolete as things have changed considerably since Haswell. See the accepted answer for full details (unless you happen to be targeting Haswell CPUs).
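To make the observation about contiguous elements concrete, here is a minimal sketch (mine, not the answerer's original benchmark) that times a VGATHERDPS loop with contiguous indices against plain scalar copies; the array size, repeat count, and timing via omp_get_wtime are arbitrary choices.

/* Build e.g. with: gcc -O2 -mavx2 -fopenmp -fno-tree-vectorize gather_vs_scalar.c
 * (-fno-tree-vectorize keeps the "scalar" loop scalar). */
#include <stdio.h>
#include <immintrin.h>
#include <omp.h>

#define N 4096
#define REPEAT 100000

float a[N], b[N];

int main(void) {
    for (int i = 0; i < N; i++) a[i] = (float)i;

    /* Indices 0..7 relative to the moving base pointer: a contiguous gather. */
    const __m256i vidx = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);

    double t = -omp_get_wtime();
    for (int r = 0; r < REPEAT; r++)
        for (int i = 0; i < N; i += 8) {
            __m256 v = _mm256_i32gather_ps(&a[i], vidx, 4);  /* vgatherdps */
            _mm256_storeu_ps(&b[i], v);
        }
    t += omp_get_wtime();
    printf("gather, contiguous indices: %8.3f s  (b[17]=%g)\n", t, b[17]);

    t = -omp_get_wtime();
    for (int r = 0; r < REPEAT; r++)
        for (int i = 0; i < N; i++)
            b[i] = a[i];                                     /* scalar loads */
    t += omp_get_wtime();
    printf("scalar copies             : %8.3f s  (b[17]=%g)\n", t, b[17]);
    return 0;
}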

Gather was first implemented with Haswell but was not optimized until Broadwell (the first generation after Haswell).
I wrote my own code to test gather (see below). Here is a summary on Skylake, SkylakeX (with a dedicated AVX512 port), and KNL systems.
(run time in seconds, lower is better)

              scalar    auto    AVX2   AVX512
Skylake GCC     0.47    0.38    0.38       NA
SkylakeX GCC    0.56    0.23    0.35     0.24
KNL GCC         3.95    1.37    2.11     1.16
KNL ICC         3.92    1.17    2.31     1.17
From the table it's clear that in all cases gather loads are faster than scalar loads (for the benchmark I used).
I'm not sure how Intel implements gather internally. The mask doesn't seem to have any effect on gather performance. That's one thing Intel could optimize: if the mask leaves only one element to be read, it should be faster to load just that value than to gather all the values and then apply the mask.
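For reference, a small sketch (added for illustration, not from the original answer) of what "the mask" refers to here: _mm256_mask_i32gather_pd loads only the lanes whose mask bit is set and passes src through for the rest.

#include <immintrin.h>

/* Gather a[idx[0..3]] only in lanes whose mask sign bit is set; other lanes
 * keep the corresponding value from src. The observation above is that
 * masking lanes off does not appear to make the instruction any faster. */
static inline __m256d gather4_masked(const double *a, const int *idx,
                                     __m256d src, __m256d mask) {
    __m128i vidx = _mm_loadu_si128((const __m128i *)idx);
    return _mm256_mask_i32gather_pd(src, a, vidx, mask, 8);
}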
The Intel optimization manual has some nice figures on gather:
https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
DCU = L1 Data Cache Unit. MCU = mid-level = L2 cache. LLC = last-level = L3 cache. L3 is shared, L2 and L1d are per-core private.
Intel is just benchmarking gathers, not using the result for anything.
//gather.c
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>

#define N 1024
#define R 1000000

void foo_auto(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n);
void foo1(double * restrict a, double * restrict b, int *idx, int n);
void foo2(double * restrict a, double * restrict b, int *idx, int n);
void foo3(double * restrict a, double * restrict b, int *idx, int n);

double test(int *idx, void (*fp)(double * restrict a, double * restrict b, int *idx, int n)) {
  double a[N];
  double b[N];
  double dtime;

  for(int i=0; i<N; i++) a[i] = 1.0*N;
  for(int i=0; i<N; i++) b[i] = 1.0;
  fp(a, b, idx, N);
  dtime = -omp_get_wtime();
  for(int i=0; i<R; i++) fp(a, b, idx, N);
  dtime += omp_get_wtime();
  return dtime;
}

int main(void) {
  //for(int i=0; i<N; i++) idx[i] = N - i - 1;
  //for(int i=0; i<N; i++) idx[i] = i;
  //for(int i=0; i<N; i++) idx[i] = rand()%N;
  //for(int i=0; i<R; i++) foo2(a, b, idx, N);
  int idx[N];
  double dtime;
  int ntests=2;
  void (*fp[4])(double * restrict a, double * restrict b, int *idx, int n);
  fp[0] = foo_auto;
  fp[1] = foo_AVX2;
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
  fp[2] = foo_AVX512;
  ntests=3;
#endif

  for(int i=0; i<ntests; i++) {
    for(int i=0; i<N; i++) idx[i] = 0;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f ", dtime);

    for(int i=0; i<N; i++) idx[i] = i;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f ", dtime);

    for(int i=0; i<N; i++) idx[i] = N-i-1;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f ", dtime);

    for(int i=0; i<N; i++) idx[i] = rand()%N;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f\n", dtime);
  }

  for(int i=0; i<N; i++) idx[i] = 0;
  test(idx, foo1);
  dtime = test(idx, foo1);
  printf("%.2f ", dtime);

  for(int i=0; i<N; i++) idx[i] = i;
  test(idx, foo2);
  dtime = test(idx, foo2);
  printf("%.2f ", dtime);

  for(int i=0; i<N; i++) idx[i] = N-i-1;
  test(idx, foo3);
  dtime = test(idx, foo3);
  printf("%.2f ", dtime);
  printf("NA\n");
}
//foo2.c
#include <x86intrin.h>

void foo_auto(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[idx[i]];
}

void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i+=4) {
    __m128i vidx = _mm_loadu_si128((__m128i*)&idx[i]);
    __m256d av = _mm256_i32gather_pd(a, vidx, 8); // base is a, so this matches b[i] = a[idx[i]]
    _mm256_storeu_pd(&b[i],av);
  }
}

#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i+=8) {
    __m256i vidx = _mm256_loadu_si256((__m256i*)&idx[i]);
    __m512d av = _mm512_i32gather_pd(vidx, a, 8); // base is a, matching the scalar version
    _mm512_storeu_pd(&b[i],av);
  }
}
#endif

void foo1(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[0];
}

void foo2(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[i];
}

void foo3(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[n-i-1];
}

Related

PyOpenCL - not seeing expected speedup

While experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on the CPU than on the GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).
To debug this, I then tried naive matrix multiplication as a comparison, and only saw a 2x speedup on the GPU vs the CPU (~20 GFLOPs vs ~10 GFLOPs). My system is an i7 8750H + GTX 1070 Max-Q.
Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time

size = 4000
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)

ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)

a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)

prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
                               __global const float * b, __global float * res) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    res[size * i + j] = 0;
    for (int k = 0; k < size; k++)
    {
        res[size * i + j] += a[k + size * j] * b[i + size * k];
    }
}
""").build()

t = time.time()
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)
task.wait()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
Following the suggestion to use a register to accumulate the results, I modified my code as follows, getting about 90 GFLOPs at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the GFLOPs further would require a more sophisticated matrix multiplication algorithm that reuses the same data from cache multiple times, but that is outside the scope of this question.
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
                               __global const float * b, __global float * res) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    float temp = 0;
    for (int k = 0; k < size; k++)
    {
        temp += a[k + size * j] * b[i + size * k];
    }
    res[size * i + j] = temp;
}
EDIT: For those looking for an example of fast matrix multiplication that showcases using local memory with workgroups as well as 2D register tiling, I have created the example below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.
prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A,
                               __global const float * B, __global float * res) {
    int ig = get_group_id(0);
    int jg = get_group_id(1);
    int il = get_local_id(0);
    int jl = get_local_id(1);

    const int memtile = 64;
    const int regtile = 4;
    volatile int il2;
    volatile int jl2;

    int iglob = memtile*ig + regtile*il;
    int jglob = memtile*jg + regtile*jl;

    __local float Asub[64][64];
    __local float Bsub[64][64];
    float acc[4][4];
    float Areg;
    float Breg[4];

    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            acc[k][m] = 0;
        }
    }

    for (int l = 0; l < size/memtile; l++) {
        for (int k = 0; k < regtile; k++) {
            for (int m = 0; m < regtile; m++) {
                il2 = il*regtile + k;
                jl2 = jl*regtile + m;
                Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
                Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        for (int k = 0; k < regtile; k++) {
            for (int r = 0; r < regtile; r++) {
                Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
            }
            for (int m = 0; m < regtile; m++) {
                Areg = Asub[il*regtile+m][jl*regtile+k];
                for (int r = 0; r < regtile; r++) {
                    acc[k][m] += Areg*Breg[r];
                }
            }
        }
    }

    for (int k = 0; k < regtile; k++) {
        for (int m = 0; m < regtile; m++) {
            res[size*(iglob+k)+jglob+m] = acc[k][m];
        }
    }
}
""").build()

t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
queue.finish()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))

OpenCL: Speed comparison of using global memory and private memory

I am learning OpenCL and I've stumbled upon these two code snippets, and now I am wondering why using private memory is much faster than just using global memory.
kernel void mmul(
    const int N,
    global float* A,
    global float* B,
    global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float tmp;
    if (i < N) {
        for (j = 0; j < N; j++) {
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += A[i*N+k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}
and this one:
kernel void mmul(
    const int N,
    global float* A,
    global float* B,
    global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float Awrk[2048];
    float tmp;
    if (i < N) {
        for (k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];
        for (j = 0; j < N; j++) {
            tmp = 0.0;
            for (k = 0; k < N; k++)
                tmp += Awrk[k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}
In the bottom snippet, the code allocates an array, Awrk[2048], and copies the data in from the global float array A, which I thought was a wasted operation. However, the bottom code is much faster (4.27 seconds) than the top one (about 14 seconds). Why is that?
Thank you.

vectorization and parallelization Xeon Phi

I am looking for a simple example where using vectorization and parallelization on Xeon Phi gives better performance than a Xeon alone. Could you help me please?
I am trying with the example below. I comment out lines 14, 18 and 19 to run on the Xeon only, and uncomment them for the Xeon Phi, but the Xeon alone still has better performance than the Xeon Phi.
1.  void main(){
2.    double *a, *b, *c;
3.    int i,j,k, ok, n=100;
4.    int nPadded = ( n%8 == 0 ? n : n + (8-n%8) );
5.    ok = posix_memalign((void**)&a, 64, n*nPadded*sizeof(double));
6.    ok = posix_memalign((void**)&b, 64, n*nPadded*sizeof(double));
7.    ok = posix_memalign((void**)&c, 64, n*nPadded*sizeof(double));
8.    for(i=0; i<n; i++)
9.    {
10.     a[i] = (int) rand();
11.     b[i] = (int) rand();
12.     c[i] = 0.0;
13.   }
14.   #pragma offload target(mic) in(a,b:length(n*nPadded)) inout(c:length(n*nPadded))
15.   #pragma omp parallel for
16.   for( i = 0; i < n; i++ )
17.     for( k = 0; k < n; k++ )
18.       #pragma vector aligned
19.       #pragma ivdep
20.       for( j = 0; j < n; j++ ){
21.         c[i*nPadded+j] = c[i*nPadded+j] + a[i*nPadded+k]*b[k*nPadded+j];
22.       }
23.   }
First, a couple of words about autovectorization. The advantage of autovectorization is simplicity: you set some keywords, then magic happens and the compiler makes fast code for you. If you want to go this way, try this manual.
The disadvantage of this approach is that there is no easy way to understand how the compiler does its work. In the vectorization report you will see "LOOP WAS VECTORIZED" or "LOOP WAS NOT VECTORIZED". But if you truly want to understand how your code works, the only way is to look at your program's assembly. Getting the assembly is not a problem: you need to compile the program with -fcode-asm. But I think that if you need to read assembly just to check how the "simple autovectorization" method works, then it is not so simple.
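As a concrete illustration (added here, not the answer author's code), this is roughly what the "set some keywords" approach looks like; the compile flags are for recent ICC and are given only as an example:

/* Compile e.g. with: icc -O3 -qopt-report=2 -qopt-report-phase=vec saxpyish.c
 * and the vectorization report will say whether this loop was vectorized. */
void scale_add(float *restrict c, const float *restrict a,
               const float *restrict b, float s, int n) {
    #pragma ivdep                  /* promise: no loop-carried dependence */
    for (int i = 0; i < n; i++)
        c[i] = a[i] + s * b[i];
}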
An alternative to autovectorization is intrinsics (actually, it is not the only alternative). Think of intrinsics as assembly wrapped in C functions. Many intrinsics internally wrap a single assembly instruction.
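A tiny example of that wrapping (again added for illustration): each intrinsic below maps to essentially one instruction, e.g. _mm_mul_ps to mulps and _mm_add_ps to addps.

#include <immintrin.h>

/* y = a*x + y on four packed floats: one mulps and one addps (or a single
 * fused multiply-add if the compiler is allowed to contract them). */
static inline __m128 axpy4(__m128 a, __m128 x, __m128 y) {
    return _mm_add_ps(_mm_mul_ps(a, x), y);
}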
I recommend using this intrinsics guide.
So my simple approach is:
Make a single-threaded reference implementation. You will use it to check the correctness of the intrinsics version.
Implement an SSE intrinsics version. SSE intrinsics are much simpler and can be tested on the Xeon.
Implement an AVX-512 version for the Xeon Phi.
Measure your speed.
Let's do it with your program.
There are many differences from your program:
I use float instead of double.
I use _mm_malloc instead of posix_memalign.
I assume n is divisible by 16 with no remainder (there are 16 floats in an AVX-512 vector register). I don't deal with loop peeling in this example (a sketch of a peeled loop is shown below, after these notes).
I use native mode instead of offload mode. KNL is bootable, so it is no longer necessary to use offload mode.
Also, I think your program is not correct, because it modifies the c array from several threads at the same time. But let's say that is not important, since we just need some computational work.
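As an aside, here is a sketch (for illustration only, not part of the code below) of the loop peeling skipped above, assuming an AVX-512F target such as KNL: run full 16-float vectors first, then finish the remainder with a scalar tail (AVX-512 masked loads/stores would be the other option).

#include <immintrin.h>

/* One (i,k) row update with a peeled tail. */
static void row_update_peeled(const float *a, const float *b, float *c,
                              int i, int k, int n, int nPadded) {
    __m512 aPart = _mm512_set1_ps(a[i*nPadded + k]);
    int j = 0;
    for (; j + 16 <= n; j += 16) {                       /* full 16-wide vectors */
        __m512 bPart = _mm512_loadu_ps(b + k*nPadded + j);
        __m512 cPart = _mm512_loadu_ps(c + i*nPadded + j);
        _mm512_storeu_ps(c + i*nPadded + j, _mm512_fmadd_ps(aPart, bPart, cPart));
    }
    for (; j < n; j++)                                   /* the peel: scalar remainder */
        c[i*nPadded + j] += a[i*nPadded + k] * b[k*nPadded + j];
}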
My code's run times:
Intel Xeon 5680
reference calc time: 97.677505 seconds
Intrinsics calc time: 6.189296 seconds
Intel Xeon Phi (KNC) SE10X
reference calc time: 199.0 seconds
Intrinsics calc time: 2.78 seconds
Code:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#include "immintrin.h"
#include <assert.h>

#define F_E_Q(X,Y,N) (round((X) * pow(10, N)-(Y) * pow(10, N)) == 0)

/* Minimal stand-ins for the debug macros used below; the original test
   harness was not included in the post. */
#define debug(fmt, ...) printf(fmt "\n", __VA_ARGS__)
#define debug_arr(name, fmt, arr, start, end, step)        \
    do {                                                   \
        printf("%s:", name);                               \
        for (int _i = (start); _i <= (end); _i += (step))  \
            printf(" " fmt, (arr)[_i]);                    \
        printf("\n");                                      \
    } while (0)

void reference(float* a, float* b, float* c, int n, int nPadded);
void intrinsics(float* a, float* b, float* c, int n, int nPadded);

char *test(){
    int n=4800;
    int nPadded = n;
    assert(n%16 == 0);

    float* a    = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
    float* b    = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
    float* cRef = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
    float* c    = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
    assert(a != NULL);
    assert(b != NULL);
    assert(cRef != NULL);
    assert(c != NULL);

    for(int i=0, max = n*nPadded; i<max; i++){
        a[i] = (int) rand() / 1804289408.0;
        b[i] = (int) rand() / 1804289408.0;
        cRef[i] = 0.0;
        c[i] = 0.0;
    }
    debug_arr("a", "%f", a, 0, 9, 1);
    debug_arr("b", "%f", b, 0, 9, 1);
    debug_arr("cRef", "%f", cRef, 0, 9, 1);
    debug_arr("c", "%f", c, 0, 9, 1);

    double t1 = omp_get_wtime();
    reference(a, b, cRef, n, nPadded);
    double t2 = omp_get_wtime();
    debug("reference calc time: %f", t2-t1);

    t1 = omp_get_wtime();
    intrinsics(a, b, c, n, nPadded);
    t2 = omp_get_wtime();
    debug("Intrinsics calc time: %f", t2-t1);

    debug_arr("cRef", "%f", cRef, 0, 9, 1);
    debug_arr("c", "%f", c, 0, 9, 1);

    for(int i=0, max = n*nPadded; i<max; i++){
        assert(F_E_Q(cRef[i], c[i], 2));
    }

    _mm_free(a);
    _mm_free(b);
    _mm_free(cRef);
    _mm_free(c);
    return NULL;
}

int main(void){  /* added so the snippet builds as a standalone program */
    test();
    return 0;
}

void reference(float* a, float* b, float* c, int n, int nPadded){
    for(int i = 0; i < n; i++ )
        for(int k = 0; k < n; k++ )
            for(int j = 0; j < n; j++ )
                c[i*nPadded+j] = c[i*nPadded+j] + a[i*nPadded+k]*b[k*nPadded+j];
}

#if __MIC__
void intrinsics(float* a, float* b, float* c, int n, int nPadded){
    #pragma omp parallel for
    for(int i = 0; i < n; i++ )
        for(int k = 0; k < n; k++ )
            for(int j = 0; j < n; j+=16 ){
                __m512 aPart = _mm512_extload_ps(a + i*nPadded+k, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
                __m512 bPart = _mm512_load_ps(b + k*nPadded+j);
                __m512 cPart = _mm512_load_ps(c + i*nPadded+j);
                cPart = _mm512_add_ps(cPart, _mm512_mul_ps(aPart, bPart));
                _mm512_store_ps(c + i*nPadded+j, cPart);
            }
}
#else
void intrinsics(float* a, float* b, float* c, int n, int nPadded){
    #pragma omp parallel for
    for(int i = 0; i < n; i++ )
        for(int k = 0; k < n; k++ )
            for(int j = 0; j < n; j+=4 ){
                __m128 aPart = _mm_load_ps1(a + i*nPadded+k);
                __m128 bPart = _mm_load_ps(b + k*nPadded+j);
                __m128 cPart = _mm_load_ps(c + i*nPadded+j);
                cPart = _mm_add_ps(cPart, _mm_mul_ps(aPart, bPart));
                _mm_store_ps(c + i*nPadded+j, cPart);
            }
}
#endif

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimensional vectors, the global work size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work-group size. Is this correct? If N = 1000 and localSize = 128, globalSize should be 1024? Can we always set globalSize to some multiple of localSize that is larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2D problems, for example multiplying an m*n matrix by an n*p matrix (the result matrix is of size m*p), things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I found the indices where the matrix elements are wrong.
It seems that the result is wrong at (i,j) where i*p + j is a multiple of n (i.e. i*p + j = n*c for c = 1, 2, 3, ...).
Why?
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
                  global float *A, global float *B, global float *C)
{
    int i = get_global_id(1);
    int j = get_global_id(0);
    if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
    else
    {
        float tmp = 0;
        for(int k = 0; k < Ndim; k++)
            tmp += A[i*Ndim+k] * B[k*Pdim+j];
        C[i*Pdim + j] = tmp;
    }
}
And here is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;

int main()
{
    // Create the two input matrices
    int m = 1000;
    int n = 1000;
    int p = 1000;
    float *A = new float[m*n];
    float *B = new float[n*p];
    for(int i = 0; i < m*n; i++)
    {
        A[i] = i;
    }
    for(int i = 0; i < n*p; i++)
    {
        B[i] = i;
    }

    try
    {
        // Get available platforms
        vector<Platform> platforms;
        Platform::get(&platforms);

        // Select the default platform and create a context using this platform and the GPU
        cl_context_properties cps[3] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(platforms[0])(),
            0
        };
        Context context( CL_DEVICE_TYPE_GPU, cps);

        // Get a list of devices on this platform
        vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create a command queue and use the first device
        CommandQueue queue = CommandQueue(context, devices[0]);

        // Read source file
        std::ifstream sourceFile("mmul.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));

        // Make program of the source code in the context
        Program program = Program(context, source);

        // Build program for these specific devices
        program.build(devices);

        // Make kernel
        Kernel kernel(program, "mmult");

        // Create memory buffers
        Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
        Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
        Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));

        // Copy lists A and B to the memory buffers
        queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
        queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);

        // Set arguments to kernel
        kernel.setArg(0, m);
        kernel.setArg(1, n);
        kernel.setArg(2, p);
        kernel.setArg(3, bufferA);
        kernel.setArg(4, bufferB);
        kernel.setArg(5, bufferC);

        // Run the kernel on specific ND range
        NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
        NDRange local(16,8);
        queue.enqueueNDRangeKernel(kernel, NullRange, global, local);

        // Read buffer C into a local list
        float *C = new float[m*p];
        queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);

        // Check the correctness of the result
        float *c = new float[m*p];
        for(int i = 0; i < m; i++)
            for(int j = 0; j < p; j++)
            {
                float z = 0.0;
                for(int k = 0; k < n; k++)
                {
                    z += A[i*n+k] * B[k*p+j];
                }
                c[i*p+j] = z;
            }

        for(int i = 0; i < m*p; i++)
        {
            if(fabs(c[i]-C[i])>0.001)
                std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
        }

        delete []A;
        delete []B;
        delete []C;
    }
    catch(Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }
    return 0;
}
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
Let's assume that you have a float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine the size of the Local Group first:
size_t localSize[] = {16, 8};
Then determine how many Local Groups you need (round up, so the division must not truncate):
size_t numLocalGroups[] = {ceil(size/(float)localSize[0]), ceil(size/(float)localSize[1])}; // cast before dividing so ceil actually rounds up
Finally, determine the NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in right-most Local Groups.
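For illustration, here is a hedged sketch of how those sizes feed into the enqueue call; the helper name is made up, and queue and kernel are assumed to come from the usual setup (as in the question's host code):

#include <CL/cl.h>

/* Enqueue a 2-D kernel over a size x size matrix with a {16,8} work-group,
   rounding the global size up as described above. */
static cl_int enqueue_mmult(cl_command_queue queue, cl_kernel kernel, size_t size) {
    size_t localSize[2]  = {16, 8};
    size_t globalSize[2] = {
        ((size + localSize[0] - 1) / localSize[0]) * localSize[0],
        ((size + localSize[1] - 1) / localSize[1]) * localSize[1]
    };
    return clEnqueueNDRangeKernel(queue, kernel,
                                  2,            /* work_dim         */
                                  NULL,         /* no global offset */
                                  globalSize,
                                  localSize,
                                  0, NULL, NULL);
}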

OpenCL Matrix multiplication: inner product versus outer product

I'm hoping everyone is familiar with the standard "naive" method of multiplying two (n x n square for simplicity) matrices. In C this is:
for(int i = 0; i < n; ++i)
    for(int j = 0; j < n; ++j)
        for(int k = 0; k < n; ++k)
            C[i*n + j] += A[i*n + k] * B[k*n + j];
The above method computes the dot (inner) product of a row of A with a column of B and is easy to implement in OpenCL as follows:
__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    const int n
    )
{
    const int row = get_global_id(1); // row
    const int col = get_global_id(0); // col
    for(int i = 0; i < n; i++)
        C[row*n + col] += A[row*n + i]*B[i*n + col];
}
Interchanging the two inner-most loops of the original C implementation results in a method that computes outer products, i.e., it computes rank-1 updates of the rows of the C matrix:
for(int i = 0; i < n; ++i)
    for(int k = 0; k < n; ++k)
        for(int j = 0; j < n; ++j)
            C[i*n + j] += A[i*n + k] * B[k*n + j];
Does anybody know how to properly implement the above outer-product method in OpenCL? I have two of my attempts pasted below, but I just can't seem to nail it.
Attempt 1
__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    const int n
    )
{
    const int row = get_global_id(1); // row
    const int col = get_global_id(0); // col
    __local float r;
    r = A[row*n + col];
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = 0; i < n; ++i)
        C[row*n + i] += r * B[col*n + i];
}
Attempt 2
#define TS 1

__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    int n)
{
    // Thread coordinates
    const int row = get_local_id(1); // row
    const int col = get_local_id(0); // col

    // Group tile coordinates
    const int by = get_group_id(1); // row
    const int bx = get_group_id(0); // col

    A += TS*by + TS*bx*n + n*row + (col);
    B += TS*by*n + n*row + (col);
    C += TS*bx*n + n*(row) + col;

    __global const float *Blast = B + n;

    float c[2] = {0.0f,0.0f};
    float* cptr = &c[0];
    __local float bs[2];

    do
    {
        bs[0] = B[0];
        bs[1] = B[n];
        barrier(CLK_LOCAL_MEM_FENCE);

        *cptr += A[0] * bs[0];
        *cptr++ += A[0] * bs[1];
        B++;
        barrier(CLK_LOCAL_MEM_FENCE);
    } while( B < Blast );

    C[0] += c[0];
    C[1] += c[1];
}
The OpenCL implementation of the common algorithm maps the outer two loops to the OpenCL NDRange implicit loops. This works because the outer two loops can be safely run in parallel.
There are a few problems with Attempt 1:
The __local variable r is assigned different values from multiple work-items simultaneously. There is a race condition here; the value of r is undefined. This could be fixed by just making r a private variable instead.
The more serious problem is that there is a race condition in the assignment of C. Every value of col (NDRange dimension 0) will be running its own loop over i in parallel.
There isn't a simple way around the second issue. The loop over k (in the transposed version) cannot be run in parallel. You can only map either the outer loop or the inner loop to a single dimensional NDRange in OpenCL.
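To make the first point concrete, here is a sketch (for illustration only, not a complete fix) of Attempt 1 with r made private; the race on C described in the second point is still present, so the result is still undefined:

__kernel void matmul_ocl(
    __global const float *A,
    __global const float *B,
    __global float *C,
    const int n)
{
    const int row = get_global_id(1);
    const int col = get_global_id(0);      // plays the role of k in the outer-product form
    const float r = A[row*n + col];        // private now: no race on r
    for(int i = 0; i < n; ++i)
        C[row*n + i] += r * B[col*n + i];  // still racy: every col updates C[row*n + i]
}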
