Related
Using Qt5 (5.12) and OpenGL 4.3, code below won't draw the rectangle I want.
compiles fine, but just shows the black background (color changes when I change the value, so at least something working) but doesn't show any the triangle
The example is from early chapters of the OpenGL blue book. Since thats the only resource I have and can comfortably follow, would like to code directly with opengl functions and avoid using Qt's class's at this point if possible.
> glxinfo
OpenGL core profile version string: 4.6.0 NVIDIA 390.77
OpenGL core profile shading language version string: 4.60 NVIDIA
OpenGL core profile context flags: (none)
OpenGL core profile profile mask: core profile
OpenGL core profile extensions:
OpenGL version string: 4.6.0 NVIDIA 390.77
OpenGL shading language version string: 4.60 NVIDIA
please note Window is initialized with OpenGL, CoreProfile, 4.3 format
class Window : public QOpenGLWindow, protected QOpenGLFunctions_4_3_Core
{
Q_OBJECT
...
// QOpenGLWindow interface
protected:
void initializeGL() override;
void resizeGL(int w, int h) override;
void paintGL() override;
GLuint rendering_program;
GLuint vertex_array_object;
}
initializeGL() ----------------
{
/**** COMPILE SHADERS ****/
GLuint vertex_shader, fragment_shader, program;
static const GLchar *vertex_shader_source[] = {
"#version 430 core \n",
" \n",
"void main(void) \n",
"{ \n",
" const vec4 vertices[3] = vec4[3]( \n",
" vec4( 0.25, -0.25, 0.5, 1.0), \n",
" vec4(-0.25, -0.25, 0.5, 1.0), \n",
" vec4( 0.25, 0.25, 0.5, 1.0)); \n",
" \n",
" gl_Position = vertices[gl_VertexID]; \n",
"} \n",
};
static const GLchar *fragment_shader_source[] = {
"#version 430 core \n",
" \n",
"out vec4 color; \n",
" \n",
"void main(void) \n",
"{ \n",
" color = vec4(0.0, 0.8, 1.0, 1.0); \n",
"} \n",
};
// create and compile vertex shader
vertex_shader = glCreateShader(GL_VERTEX_SHADER);
glShaderSource(vertex_shader, 1, vertex_shader_source, NULL);
glCompileShader(vertex_shader);
// create and compile fragment shader
fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(fragment_shader, 1, fragment_shader_source, NULL);
glCompileShader(fragment_shader);
// crate a program, attach shaders to it
program = glCreateProgram();
glAttachShader(program, vertex_shader);
glAttachShader(program, fragment_shader);
glLinkProgram(program);
glDeleteShader(vertex_shader);
glDeleteShader(fragment_shader);
rendering_program = program;
glGenVertexArrays(1, &vertex_array_object);
glBindVertexArray(vertex_array_object);
}
paintGL() --------------------------
{
const GLfloat color[] = { 0.0f, 0.0f, 0.0f, 1.0f };
glClearBufferfv(GL_COLOR, 0, color);
glUseProgram(rendering_program);
glBindVertexArray(vertex_array_object);
glDrawArrays(GL_TRIANGLES, 0, 3);
glFlush();
}
This was hard to see.
The declaration of vertex_shader_source
static const GLchar *vertex_shader_source[] = {
"#version 430 core \n",
" \n",
"void main(void) \n",
"{ \n",
" const vec4 vertices[3] = vec4[3]( \n",
" vec4( 0.25, -0.25, 0.5, 1.0), \n",
" vec4(-0.25, -0.25, 0.5, 1.0), \n",
" vec4( 0.25, 0.25, 0.5, 1.0)); \n",
" \n",
" gl_Position = vertices[gl_VertexID]; \n",
"} \n",
};
and fragment_shader_source
static const GLchar *fragment_shader_source[] = {
"#version 430 core \n",
" \n",
"out vec4 color; \n",
" \n",
"void main(void) \n",
"{ \n",
" color = vec4(0.0, 0.8, 1.0, 1.0); \n",
"} \n",
};
are not of type const char*, but of const char*[]. vertex_shader has 11 elements, and fragment_shader_source has 8 elements.
The 2nd paramter of glShaderSource has to be the number of elements in the array.
So it has to be:
glShaderSource(vertex_shader, 11, vertex_shader_source, NULL);
respectively
glShaderSource(fragment_shader, 8, fragment_shader_source, NULL);
I recommend to use Raw string literal instead of arrays:
const char *vertex_shader_source = R"(
#version 430 core
void main(void)
{
const vec4 vertices[3] = vec4[3](
vec4( 0.25, -0.25, 0.5, 1.0),
vec4(-0.25, -0.25, 0.5, 1.0),
vec4( 0.25, 0.25, 0.5, 1.0));
gl_Position = vertices[gl_VertexID];
}
)";
const char *fragment_shader_source = R"(
#version 430 core
out vec4 color;
void main(void)
{
color = vec4(0.0, 0.8, 1.0, 1.0);
}
)";
glShaderSource(vertex_shader, 1, &vertex_shader_source, NULL);
glShaderSource(fragment_shader, 1, &fragment_shader_source, NULL);
Further, check if the compilation of the shaders succeeded:
e.g.
GLint status;
glCompileShader(vertex_shader);
glGetShaderiv( vertex_shader, GL_COMPILE_STATUS, &status );
if ( status == GL_FALSE )
{
GLint logLen;
glGetShaderiv( vertex_shader, GL_INFO_LOG_LENGTH, &logLen );
std::vector< char >log( logLen );
GLsizei written;
glGetShaderInfoLog( vertex_shader, logLen, &written, log.data() );
std::cout << "compile error:" << std::endl << log.data() << std::endl;
}
and if linking the program succeeded:
e.g.
glLinkProgram(program);
glGetProgramiv( program, GL_LINK_STATUS, &status );
if ( status == GL_FALSE )
{
GLint logLen;
glGetProgramiv( program, GL_INFO_LOG_LENGTH, &logLen );
std::vector< char >log( logLen );
GLsizei written;
glGetProgramInfoLog( program, logLen, &written, log.data() );
std::cout << "link error:" << std::endl << log.data() << std::endl;
}
I need help with OpenCL.
The task is as follows:
There is an input parameter of type string. It is necessary to generate a SHA-256 hash using the resources of the video card.
It is necessary to create a cycle to select a hash. Each time add some postfix to the original string.
Result*Hash should start with 5 zeros "00000 ...".
For example, the entrance. parameter: "strela".
SHA-256: "7d7ceecdee08ea1c0ac46b27657a79395af36526b3214b59a92f8351ccf8f762"
Next, you need to add a postfix. For example, "strela1"
Here the hash will be: a2afd15651f44f19f3e4e216bf3ead22d5f5937e9f9dc250382ff1f764ba219f
then continue to add the postfix until the resulting hash begins to start with "00000.."
It is necessary to use all the cores of the video card, i.e. use parallelization. Each core will use its postfix.
As soon as some kernel computes the hash we need, interrupt all calculations on the cores and display the hash we need.
Source:
main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
#include <stdio.h>
#include < string.h >
void crypt_and_print(char input[])
{
char result[65];
char diff[65] = "00000";
char *istr;
char buffer2[20];
int temp;
char str2[20];
for (int i = 0; i < 1; i++)
{
char string[] = "1qqq";
sprintf(buffer2, "%d", i);
temp = 8 - strlen(buffer2);
str2[0] = '\0';
while (strlen(str2) != temp)
strcat(str2, "0");
strcat(str2, buffer2);
strcat(string, str2);
sha256_crypt(string, result);
istr = strstr(result, diff);
if (istr != NULL) {
printf(istr);
break;
}
}
}
int main()
{
char result[65];
sha256_init(2048);
crypt_and_print((char*)"");
}
sha256.c
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size=3;
static size_t local_work_size=1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void crypt_all();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
void sha256_crypt(char input[], char* output)
{
int i;
string_len = strlen(input);
global_work_size = 3;
datai[0] = SHA256_PLAINTEXT_LENGTH;
datai[1] = global_work_size;
datai[2] = string_len;
memcpy(saved_plain, input, string_len+1);
crypt_all();
for(i=0; i<SHA256_RESULT_SIZE; i++)
{
sprintf(output+i*8,"%08x", partial_hashes[i]);
}
printf("'%s':\n%s\n", input, output);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE *fp;
fp = fopen("/sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
}
void create_clobj(){
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
}
sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
//printf(get_global_id(0));
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0] = H0;
digest[1] = H1;
digest[2] = H2;
digest[3] = H3;
digest[4] = H4;
digest[5] = H5;
digest[6] = H6;
digest[7] = H7;
for(item=0; item<total; item++)
{
A = digest[0];
B = digest[1];
C = digest[2];
D = digest[3];
E = digest[4];
F = digest[5];
G = digest[6];
H = digest[7];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop+get_global_id(0) ; t++){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3];
// printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
// printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
digest[5] += F;
digest[6] += G;
digest[7] += H;
}
printf("hi");
}
How can i use here paralelism (all GPU cores) to calculate needed hash code?
Is it real to do task like this using OPENCL ?
Below is a simple QT program using a VBO to draw 6 points. The points appear as squares on Android and single pixels on the desktop. When I change gl_PointSize in the vertex shader, the squares change to the appropriate size on Android but remain single pixels on the desktop.
example.pro
QT += core gui widgets opengl
TARGET = example
TEMPLATE = app
SOURCES = main.cpp
HEADERS = main.h
main.h
#include <QGLWidget>
#include <QGLFunctions>
#include <QGLShader>
class glview : public QGLWidget, protected QGLFunctions
{
Q_OBJECT
public:
explicit glview(QWidget *parent = 0);
protected:
void initializeGL();
void resizeGL(int w, int h);
void paintGL();
private:
quint32 vbo_id[1];
QGLShaderProgram *program;
};
main.cpp
#include <QApplication>
#include "main.h"
struct vrtx {
GLfloat x;
GLfloat y;
GLfloat z;
GLfloat r;
GLfloat g;
GLfloat b;
}__attribute__((packed)) geomrtry[] = {
// x, y, z r, g, b
{1, 1, 0, 1, 0, 0},
{1.5, 2, 0, 0, 1, 0},
{2, 1, 0, 0, 0, 1},
{3, 1, 0, 1, 0, 1},
{3.5, 2, 0, 1, 1, 0},
{4, 1, 0, 0, 1, 1},
};
int main(int argc, char *argv[])
{
QApplication app(argc, argv);
glview widget;
widget.show();
return app.exec();
}
glview::glview(QWidget *parent) : QGLWidget(parent)
{
}
void glview::initializeGL()
{
initializeGLFunctions();
qglClearColor(Qt::white);
QGLShader *vshader = new QGLShader(QGLShader::Vertex, this);
const char *vsrc =
"attribute highp vec4 vertex;\n"
"attribute mediump vec4 colour;\n"
"varying mediump vec4 f_colour;\n"
"uniform mediump mat4 matrix;\n"
"void main(void)\n"
"{\n"
" gl_Position = matrix * vertex;\n"
" f_colour = colour;\n"
" gl_PointSize = 12.0;\n"
"}\n";
vshader->compileSourceCode(vsrc);
QGLShader *fshader = new QGLShader(QGLShader::Fragment, this);
const char *fsrc =
"varying mediump vec4 f_colour;\n"
"void main(void)\n"
"{\n"
" gl_FragColor = f_colour;\n"
"}\n";
fshader->compileSourceCode(fsrc);
program = new QGLShaderProgram(this);
program->addShader(vshader);
program->addShader(fshader);
program->link();
glGenBuffers(1, vbo_id);
glBindBuffer(GL_ARRAY_BUFFER, vbo_id[0]);
glBufferData(GL_ARRAY_BUFFER, sizeof(geomrtry), geomrtry, GL_STATIC_DRAW);
glEnable(GL_DEPTH_TEST);
}
void glview::resizeGL(int w, int h)
{
glViewport(0, 0, w, h);
}
void glview::paintGL()
{
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
QMatrix4x4 matrix;
matrix.ortho(0, 5, 0, 3, -1, 1);
program->bind();
program->setUniformValue("matrix", matrix);
glBindBuffer(GL_ARRAY_BUFFER, vbo_id[0]);
int vertexLocation = program->attributeLocation("vertex");
program->enableAttributeArray(vertexLocation);
glVertexAttribPointer(vertexLocation, 3, GL_FLOAT, GL_FALSE, sizeof(struct vrtx), 0);
int colourLocation = program->attributeLocation("colour");
program->enableAttributeArray(colourLocation);
glVertexAttribPointer(colourLocation, 3, GL_FLOAT, GL_FALSE, sizeof(struct vrtx), ((char*)NULL + 12));
//glDrawArrays(GL_TRIANGLES, 0, sizeof(geomrtry) / sizeof(struct vrtx));
glDrawArrays(GL_POINTS, 0, sizeof(geomrtry) / sizeof(struct vrtx));
glFlush();
}
You're using functionality where the size of rendered points is taken from a built-in gl_PointSize variable set in the vertex shader. This functionality is the default in ES 2.0.
The same functionality is available in desktop OpenGL as well, but it is disabled by default. It can be enabled by calling:
glEnable(GL_PROGRAM_POINT_SIZE);
If this setting is not enabled, desktop OpenGL uses the point size set with the glPointSize() API call.
Continuing with my OpenCL adventure, this is what I have till now from my CUDA implementation. I was trying to check if at least the first kernel call was working but I got error 48 and have no idea what am I missing. I was following the example in this page
KERNEL
__kernel
void clut_distributePixels(__global int *pixelGroup, int c_rows, int c_cols, int c_numColors){
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= c_cols || y >= c_rows) return;
int index = y * c_cols + x;
pixelGroup[index] = index/c_numColors;
}
Read Kernel from file
char *file_contents(const char *filename, int *length){
FILE *f = fopen(filename, "r");
void *buffer;
if (!f) {
fprintf(stderr, "Unable to open %s for reading\n", filename);
return NULL;
}
fseek(f, 0, SEEK_END);
*length = ftell(f);
fseek(f, 0, SEEK_SET);
buffer = malloc(*length+1);
*length = fread(buffer, 1, *length, f);
fclose(f);
((char*)buffer)[*length] = '\0';
return (char*)buffer;
}
CODE
#include <iostream>
#include <OpenCL/OpenCL.h>
#include "Utilities.hpp"
int main(int argc, const char * argv[]){
if (argc < 3) {
std::cout << "Use: {GPU|CPU} nColors" << std::endl;
return 1;
}
/************************************************
HOST SIDE INITIALIZATION
************************************************/
int h_numColors = atoi(argv[2]);
Color *h_image;
int h_rows, h_cols;
if (readText2RGB("LenaOriginal.txt", &h_image, &h_rows, &h_cols) != SUCCESS){
return 1;
}
int *h_pixelGroup = new int[h_rows*h_cols];
Color *h_groupRep = new Color[h_numColors];
Color *h_clutImage = new Color[h_rows*h_cols];
int h_change = 0;
/************************************************
PLATFORM AND DEVICE SETUP
************************************************/
cl_int errorStatus;
//Use the first platform
cl_platform_id platform;
errorStatus = clGetPlatformIDs(1, &platform, NULL);
//Use the first device that matches the type selected
cl_device_id device;
if (strcmp(argv[1], "CPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}else if (strcmp(argv[1], "GPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
}else{
std::cout << "Unknown device type. Choose either CPU or GPU" << std::endl;
return 1;
}
//Define context properties and create context
cl_context_properties contextProps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context context = clCreateContext(contextProps, 1, &device, NULL, NULL, &errorStatus);
//Create the command queue
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &errorStatus);
/************************************************
DEVICE VARIABLE SETUP
************************************************/
cl_mem d_image;
cl_mem d_pixelGroup;
cl_mem d_groupRep;
cl_mem d_clutImage;
cl_mem d_change;
d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Color)*h_rows*h_cols, h_image, &errorStatus);
d_pixelGroup = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*h_rows*h_cols, NULL, &errorStatus);
d_groupRep = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_numColors, NULL, &errorStatus);
d_clutImage = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_rows*h_cols, NULL, &errorStatus);
d_change = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &errorStatus);
/************************************************
CREATE, COMPILE PROGRAM and CREATE KERNEL
************************************************/
int pl;
size_t sourceLength;
char * sourceCode = file_contents("vectorQuantization.cl", &pl);
sourceLength = (size_t)pl;
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourceCode, &sourceLength, &errorStatus);
errorStatus = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel k_clut_distributePixels = clCreateKernel(program, "clut_distributePixels", &errorStatus);
errorStatus = clSetKernelArg(k_clut_distributePixels, 0, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_distributePixels, 1, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_distributePixels, 2, sizeof(cl_mem), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_distributePixels, 3, sizeof(cl_mem), (void*)&h_numColors);
cl_kernel k_clut_checkDistances = clCreateKernel(program, "clut_checkDistances", &errorStatus);
errorStatus = clSetKernelArg(k_clut_checkDistances, 0, sizeof(cl_mem), (void*)&d_image);
errorStatus = clSetKernelArg(k_clut_checkDistances, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_checkDistances, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_checkDistances, 3, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_checkDistances, 4, sizeof(cl_mem), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_checkDistances, 5, sizeof(cl_mem), (void*)&h_numColors);
errorStatus = clSetKernelArg(k_clut_checkDistances, 6, sizeof(cl_mem), (void*)&d_change);
cl_kernel k_clut_createImage = clCreateKernel(program, "clut_createImage", &errorStatus);
errorStatus = clSetKernelArg(k_clut_createImage, 0, sizeof(cl_mem), (void*)&d_clutImage);
errorStatus = clSetKernelArg(k_clut_createImage, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_createImage, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_createImage, 3, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_createImage, 4, sizeof(cl_mem), (void*)&h_cols);
/************************************************
EXECUTE PROGRAM AND GET RESULTS
************************************************/
/*STEP 1: evenly distribute pixels among the colors in the CLUT */
size_t grid[2] = {static_cast<size_t>(h_rows), static_cast<size_t>(h_cols)};
errorStatus = clEnqueueNDRangeKernel(queue, k_clut_distributePixels, 2, NULL, grid, NULL, 0, NULL, NULL);
clFinish(queue);
/*********/
/* ERROR */
/*********/
errorStatus = clEnqueueReadBuffer(queue, d_pixelGroup, CL_TRUE, 0, sizeof(int)*h_rows*h_cols, h_pixelGroup, 0, NULL, NULL);
std::cout << h_pixelGroup[7] << ", " << h_pixelGroup[8] << ", " << h_pixelGroup[9] << ", " << h_pixelGroup[10] << std::endl;
//do {
/*STEP 2: compute reprenstative */
/*STEP 3: compute distances and reassign pixel to group */
//copyFromConstantMemory
//} while (h_change != 0);
std::cout << "Done !!" << std::endl;
return 0;
}
I found my error. First of all Always check return values when you are learning new stuff. I just remember that from when I was learning CUDA, so with this simple macro I started checking everything
#define CL_SUCCESS_OR_RETURN(code) do { \
assert(code == CL_SUCCESS); \
if (code != CL_SUCCESS) { return code; } \
}while (0);
And the error was at the very beginning when I check if it is CPU or GPU. I forgot that strcmp returns 0 when the strings are equal. After fixing this, all worked beautifully !!
Anyways, if you have any other suggestion or advise or you see something ugly or not a best practice in the code please comment.
EDIT: It was my cards fault... The local memory kernel goes a few times faster, sorry all!!
I am writing a simple sgemm (square, alpha=1, beta=0) that is supposed to take advantage of local memory, but it performs at half the speed of a naive version.
Here are the kernels:
const char* matrixMultiplySource =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int i = get_local_id(0);\n"
" int j = get_local_id(1);\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n"
" __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n"
" float val=0.0f;\n"
" for ( int index = 0; index < sizeG0; index += BLOCK_SIZE )\n"
" {\n"
" localA[j][i] = A[ig + sizeG0 * (index+j)];\n"
" localB[j][i] = B[index+i + sizeG0 * jg];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" #pragma unroll\n"
" for ( int kk = 0; kk < BLOCK_SIZE; ++kk)\n"
" {\n"
" val = val + localA[kk][i] * localB[j][kk];\n"
" }\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
const char* matrixMultiplySource2 =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" float val=0;\n"
" for ( int k = 0; k < sizeG0; k++)\n"
" {\n"
" val = val + A[ig + k * sizeG0] * B[k + jg * sizeG0];\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
BLOCK_SIZE is 16 and I am using 1024x1024 matrices as well as warming up.
// Create OpenCL context
context = mycl::myclCreateContext( NULL, ret_num_devices, devices, NULL, NULL, &ret);
// Create Command Queue
command_queue = mycl::myclCreateCommandQueue(context, devices[0], 0, &ret);
// Create Memory Buffer
memobjA = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = mycl::myclCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = mycl::myclEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = mycl::myclEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
// Create Kernel Program from the source
program = mycl::myclCreateProgramWithSource(context, 1, (const char **)&matrixMultiplySource,
NULL, &ret);
// Build Kernel Program
ret = mycl::myclBuildProgram(program, ret_num_devices, devices, "-D BLOCK_SIZE=16", NULL, NULL);
if(ret != CL_SUCCESS){cout << "PROBREM! " << ret << endl;return -1;}
// Create OpenCL Kernel
kernel = mycl::myclCreateKernel(program, "matrixMultiply", &ret);
size_t globalThreads[2] = {heightA, widthB};
size_t localThreads[2] = {BLOCK_SIZE, BLOCK_SIZE};
// Set OpenCL Kernel Arguments
ret = mycl::myclSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = mycl::myclSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = mycl::myclSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
// Time the kernel
struct timeval timev1, timev2;
float time_seconds = 0.0f;
mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
mycl::myclFinish(command_queue);
gettimeofday(&timev1, NULL);
ret = mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
ret = mycl::myclFinish(command_queue);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
gettimeofday(&timev2,NULL);
time_seconds=(timev2.tv_sec-timev1.tv_sec)+0.000001*(timev2.tv_usec- timev1.tv_usec);
Have you looked at the two kernels in the AMD APP KernelAnalyzer or equivalent tools? These tools compile the Kernels and show their predicted performance characteristics
You use
barrier(CLK_GLOBAL_MEM_FENCE);
where I would expect to see
barrier(CLK_LOCAL_MEM_FENCE);
as you write in the loop to local memory.
Further I doubt that the copy to localA does help you -- at one time every items there is only accessed once.