How do you represent an image in OpenCL?

I have sample code, but it completely leaves out what my (void*) should be!
I setup a cl_image_desc, cl_image_format, buffer, origin, and region:
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = width;
desc.image_height = height;
desc.image_depth = 0;
desc.image_array_size = 0;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.num_mip_levels = 0;
desc.num_samples = 0;
desc.buffer = NULL;
cl_image_format format;
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_FLOAT;
cl_mem bufferSourceImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, NULL);
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height,1};
In this next snippet, sourceImage is a void pointer to my image. But what is my image? For every pixel there are r, g, b, a, x, and y values.
clEnqueueWriteImage(queue, bufferSourceImage, CL_TRUE, origin, region, 0, 0, sourceImage, 0, NULL, NULL);
How do I turn my image (a bunch of (r,g,b,a,x,y)'s) into a suitable array?
This is the kernel they provide:
__kernel void convolution(__read_only image2d_t sourceImage, __write_only image2d_t outputImage, int rows, int cols, __constant float* filter, int filterWidth, sampler_t sampler)
{
int column = get_global_id(0);
int row = get_global_id(1);
int halfWidth = (int)(filterWidth/2);
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
int filterIdx = 0;
int2 coords;
for(int i = -halfWidth; i <= halfWidth; i++)
{
coords.y = row + i;
for(int i2 = -halfWidth; i2 <= halfWidth; i2++)
{
coords.x = column + i2;
float4 pixel;
pixel = read_imagef(sourceImage, sampler, coords);
sum.x += pixel.x * filter[filterIdx++];
}
}
if(row < rows && column < cols)
{
coords.x = column;
coords.y = row;
write_imagef(outputImage, coords, sum);
}
}

Set up the cl_image_format as you like, and then you just have to follow the format you selected. With your current settings each channel value must be a "single precision floating-point value" (image_channel_data_type = CL_FLOAT), and only one of the (R, G, B, A) channels is stored per pixel, which feeds the expected R channel (image_channel_order = CL_R).
Your kernel expects floats:
float4 pixel;
pixel = read_imagef(sourceImage, sampler, coords);
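So for CL_R / CL_FLOAT, the buffer you pass as sourceImage is just a tightly packed width*height array of floats, one value per pixel, in row-major order; the x and y of each pixel are implied by its position in the array, not stored with it. A minimal sketch (the interleaved 8-bit rgba input buffer and the choice of the red channel are assumptions for illustration):
float *sourceImage = (float *)malloc(sizeof(float) * width * height);
for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
        int p = y * width + x;
        sourceImage[p] = (float)rgba[p * 4 + 0]; // one channel per pixel, e.g. R (or a gray mix of R,G,B)
    }
}
clEnqueueWriteImage(queue, bufferSourceImage, CL_TRUE, origin, region,
                    0, 0, sourceImage, 0, NULL, NULL);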

Related

Libavcodec mpeg4 avi encoding framerate and timebase problem

I'm trying to generate a video with a timebase more precise than 1/fps (the camera frame rate is not constant). But the generated AVI does not seem to take into account the framerate that I indicate.
#include <iostream>
extern "C" {
#include "libavformat/avformat.h"
#include "libavcodec/avcodec.h"
}
#pragma comment(lib,"avcodec.lib")
#pragma comment(lib,"avformat.lib")
constexpr int TIMEFACTOR = 10;
static void encodeFrame(AVCodecContext *avctx, AVFormatContext *ctx, AVFrame* frame)
{
int ret = avcodec_send_frame(avctx, frame);
AVPacket packet;
av_init_packet(&packet);
ret = 0;
while (ret >= 0) {
ret = avcodec_receive_packet(avctx, &packet);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
return; // nothing to write
};
//packet.pts = (frame) ? frame->pts : packet.pts;
av_packet_rescale_ts(&packet, avctx->time_base, ctx->streams[0]->time_base);
packet.duration = TIMEFACTOR;
av_interleaved_write_frame(ctx, &packet);
av_packet_unref(&packet);
}
}
static void fill_yuv_frame(AVFrame* frame, int width, int height, int frameId)
{
// Y
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
frame->data[0][y * frame->linesize[0] + x] = x + y + frameId * 3;
}
}
// Cb and Cr
for (int y = 0; y < height / 2; y++) {
for (int x = 0; x < width / 2; x++) {
frame->data[1][y * frame->linesize[1] + x] = 128 + y + frameId * 2;
frame->data[2][y * frame->linesize[2] + x] = 64 + x + frameId * 5;
}
}
}
int main(int argc, char** argv)
{
const char filename[] = "output.avi";
const char encoder[] = "mpeg4";
constexpr int WIDTH = 640;
constexpr int HEIGHT = 480;
av_log_set_level(AV_LOG_DEBUG);
//delete file because libavcodec reload file
remove(filename);
AVFormatContext* ofmtCtx;
int ret = avformat_alloc_output_context2(&ofmtCtx, NULL, "avi", filename);
AVCodec* avcodec = avcodec_find_encoder_by_name(encoder);
AVCodecContext* avctx = avcodec_alloc_context3(avcodec);
avctx->width = WIDTH ;
avctx->height = HEIGHT ;
avctx->sample_aspect_ratio = { 1, 1 };
avctx->pix_fmt = AV_PIX_FMT_YUV420P; //Do not work with other type
avctx->codec_id = AV_CODEC_ID_MPEG4;
avctx->bit_rate = 4 * 1000 * 1000;
avctx->time_base = av_make_q(1, 25 * TIMEFACTOR);
avctx->framerate = av_make_q(25, 1);
avctx->ticks_per_frame = TIMEFACTOR;
avctx->gop_size = 10;
avctx->max_b_frames = 1;
AVStream* m_outStream = avformat_new_stream(ofmtCtx, NULL);
ret = avcodec_open2(avctx, avcodec, NULL);
ret = avcodec_parameters_from_context(m_outStream->codecpar, avctx);
m_outStream->time_base = avctx->time_base;
m_outStream->r_frame_rate = avctx->framerate;
m_outStream->avg_frame_rate = avctx->framerate;
ret = avio_open(&(ofmtCtx->pb),
filename,
AVIO_FLAG_WRITE);
ret = avformat_write_header(ofmtCtx, NULL);
av_dump_format(ofmtCtx, 0, filename, 1);
AVFrame * avframe = av_frame_alloc();
avframe->format = avctx->pix_fmt;
avframe->width = avctx->width;
avframe->height = avctx->height;
ret = av_frame_get_buffer(avframe, 0);
ret = av_frame_make_writable(avframe);
for (int i = 0; i < 25; ++i) {
fflush(stdout);
fill_yuv_frame(avframe, avctx->width, avctx->height, i);
avframe->pts = i * TIMEFACTOR;
encodeFrame(avctx, ofmtCtx, avframe);
}
encodeFrame(avctx, ofmtCtx, NULL);
av_write_trailer(ofmtCtx);
return 0;
}
And this is the dump output:
[mpeg4 @ 000002AA64FA1880] intra_quant_bias = 0 inter_quant_bias = -64
[file @ 000002AA64F69AC0] Setting default whitelist 'file,crypto'
[avi @ 000002AA64F73400] reserve_index_space:0 master_index_max_size:256
[avi @ 000002AA64F73400] duration_est:36000.000, filesize_est:18.4GiB, master_index_max_size:256
Output #0, avi, to 'output.avi':
Metadata:
ISFT : Lavf58.12.100
Stream #0:0, 0, 1/250: Video: mpeg4, 1 reference frame (FMP4 / 0x34504D46), yuv420p, 640x480 (0x0) [SAR 1:1 DAR 4:3], 0/1, q=2-31, 4000 kb/s, 25 fps, 25 tbr, 250 tbn
But if I open the video with the ffmpeg command line, the fps is 250 and the video contains 250 frames, with many dropped frames.
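For reference, the timing the code above aims for can be sanity-checked with libavutil's rational helpers (a sketch only): with time_base = 1/(25*TIMEFACTOR) = 1/250 and pts stepping by TIMEFACTOR = 10, consecutive frames should land 0.04 s apart, i.e. 25 fps.
#include <stdio.h>
#include <stdint.h>
#include <libavutil/rational.h>
int main(void)
{
    const int TIMEFACTOR = 10;
    AVRational tb = av_make_q(1, 25 * TIMEFACTOR);   // 1/250, as set on the codec context
    for (int i = 0; i < 3; ++i) {
        int64_t pts = (int64_t)i * TIMEFACTOR;       // 0, 10, 20, ...
        printf("frame %d: pts=%lld -> %.3f s\n", i, (long long)pts, pts * av_q2d(tb));
    }
    return 0;                                        // prints 0.000 s, 0.040 s, 0.080 s
}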

Using clEnqueueNDRangeKernel in OpenCL

I need help with one function in OpenCL. When I start using clEnqueueNDRangeKernel instead of clEnqueueTask, it takes much more time for the program to finish. Why is that? As I understand it, the program should use the data-parallel model and run faster; am I wrong? And if I am, how can I change the code to see the data-parallel model actually at work?
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
for (int i = 0, j = 0; i < InSize; i+=4, j++)
{
unsigned char Value = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
pDataOut[j] = Value;
}
}
int iWidth, iHeight, iBpp;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
int err = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
if (err != 0 || pDataIn.size() == 0 || iBpp != 32)
{
std::cout << "error load input file!\n";
}
pDataOut.resize(pDataIn.size()/4);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
unsigned int SizeIn, SizeOut;
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
FILE *fp;
char fileName[] = "./kernel.cl";
char *source_str;
size_t source_size;
//Loading kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
system("PAUSE");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
//Getting Platform and Device
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//Create context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
//create kernel program
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
//build it
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//create queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//create bufer
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, pDataIn.size(), NULL, &ret);
memobj1 = clCreateBuffer(context, CL_MEM_READ_WRITE,pDataOut.size(), NULL, &ret);
//copy buffer to kernel
ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, pDataIn.size(), pDataIn.data(), 0, NULL, NULL);
//create opencl kernel
kernel = clCreateKernel(program, "red_to_green", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
ret = clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&SizeIn);
ret = clSetKernelArg(kernel, 3, sizeof(unsigned int), (void *)&SizeOut);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = 8;
size_t local_item_size = 4;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
//copy from buffer
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
ret = clFinish(command_queue);
float end_time = clock();
search_time = end_time - start_time;
//float last_time = last_time + search_time;
cout << search_time << endl;
}
clock_t t1 = clock();
double time_seconds = (t1-t0)*CLOCKS_PER_SEC/cycles_max;
cout << time_seconds/1000 <<endl;
WriteBmpFile(L"3840x2160_wb.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);
system("PAUSE");
From the docs page:
The kernel is executed using a single work-item.
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with
work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1,
and local_work_size[0] set to 1.
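In other words, the commented-out clEnqueueTask call ran the whole loop in a single work-item; it is equivalent to this (sketch):
size_t one = 1;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &one, &one, 0, NULL, NULL);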
When you use clEnqueueNDRangeKernel, you are using 2 work groups of 4 work items, but they are all doing the same work. They all read from the same global memory, but more importantly, they all try to write to the same locations in global memory.
You need to take the work-item's global id into account when doing your computations:
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
int gid = get_global_id(0);
int gsize = get_global_size(0);
for (int j = gid; j < (InSize >> 2); j+= gsize)
{
unsigned char Value = (pDataIn[j*4] + pDataIn[j*4 + 1] + pDataIn[j*4 + 2]) / 3;
pDataOut[j] = Value;
}
}
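With this grid-stride version every work-item processes a strided subset of the pixels, so you can also launch far more than the 8 work-items from the question; the sizes below are only an illustrative assumption:
size_t global_item_size = 1024;  // many work-items; the stride loop covers the remaining pixels
size_t local_item_size = 64;     // must evenly divide global_item_size
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, &local_item_size, 0, NULL, NULL);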
It looks like you are iterating over all pixels of an input image in your kernel. This will cause all threads to calculate the image intensity for all pixels. Try to launch a single thread for each pixel instead. To do so, change your kernel source code to only calculate the output value for one pixel:
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut) {
int j = get_global_id(0);
int i = j*4;
pDataOut[j] = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
}
This code now averages the RGB values of your RGBA input image for a single pixel: work-item j reads the four input bytes starting at i = j*4 and writes one output byte at index j. Now all you need to do is launch as many work-items as your image has pixels. Relevant changes:
//create opencl kernel
kernel = clCreateKernel(program, "black_white_img", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = iWidth * iHeight;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);
This should give a considerable speedup compared to your code.

OpenCL RGB->HSL and back

I've been through every resource and can't fix my problem.
My host code calls the rgb2hsl kernel, then calls the hsl2rgb kernel. I should end up with the same image I started with, but I do not. The hue of my new image is off in certain areas.
The red areas should not be there.
Here is the screen shot of what happens:
Here is the original picture
Here is the code:
#define E .0000001f
bool fEqual(float x, float y)
{
return (x+E > y && x-E < y);
}
__kernel void rgb2hsl(__global float *values, int numValues)
{
// thread index and total
int idx = get_global_id(0);
int idxVec3 = idx*3;
float3 gMem;
if (idx < numValues)
{
gMem.x = values[idxVec3];
gMem.y = values[idxVec3+1];
gMem.z = values[idxVec3+2];
}
barrier(CLK_LOCAL_MEM_FENCE);
gMem /= 255.0f; //convert from 256 color to float
//calculate chroma
float M = max(gMem.x, gMem.y);
M = max(M, gMem.z);
float m = min(gMem.x, gMem.y);
m = min(m, gMem.z);
float chroma = M-m; //calculate chroma
float lightness = (M+m)/2.0f;
float saturation = chroma/(1.0f-fabs(2.0f*lightness-1.0f));
float hue = 0;
if (fEqual(gMem.x, M))
hue = (int)((gMem.y - gMem.z)/chroma) % 6;
if (fEqual(gMem.y, M))
hue = (((gMem.z - gMem.x))/chroma) + 2;
if (fEqual(gMem.z, M))
hue = (((gMem.x - gMem.y))/chroma) + 4;
hue *= 60.0f;
barrier(CLK_LOCAL_MEM_FENCE);
if (idx < numValues)
{
values[idxVec3] = hue;
values[idxVec3+1] = saturation;
values[idxVec3+2] = lightness;
}
}
__kernel void hsl2rgb(__global float *values, int numValues)
{
// thread index and total
int idx = get_global_id(0);
int idxVec3 = idx*3;
float3 gMem;
if (idx < numValues)
{
gMem.x = values[idxVec3];
gMem.y = values[idxVec3+1];
gMem.z = values[idxVec3+2];
}
barrier(CLK_LOCAL_MEM_FENCE);
float3 rgb = (float3)(0,0,0);
//calculate chroma
float chroma = (1.0f - fabs( (float)(2.0f*gMem.z - 1.0f) )) * gMem.y;
float H = gMem.x/60.0f;
float x = chroma * (1.0f - fabs( fmod(H, 2.0f) - 1.0f ));
switch((int)H)
{
case 0:
rgb = (float3)(chroma, x, 0);
break;
case 1:
rgb = (float3)(x, chroma, 0);
break;
case 2:
rgb = (float3)(0, chroma, x);
break;
case 3:
rgb = (float3)(0, x, chroma);
break;
case 4:
rgb = (float3)(x, 0, chroma);
break;
case 5:
rgb = (float3)(chroma, 0, x);
break;
default:
rgb = (float3)(0, 0, 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
rgb += gMem.z - .5f*chroma;
rgb *= 255;
if (idx < numValues)
{
values[idxVec3] = rgb.x;
values[idxVec3+1] = rgb.y;
values[idxVec3+2] = rgb.z;
}
}
The problem was this line:
hue = (int)((gMem.y - gMem.z)/chroma) % 6;
It should be
hue = fmod((gMem.y - gMem.z)/chroma, 6.0f);
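The reason: when gMem.y is smaller than gMem.z the hue term is negative, and the (int) cast discards the fractional part before the % 6, while fmod keeps it. A quick host-side illustration (values chosen arbitrarily):
#include <math.h>
#include <stdio.h>
int main(void)
{
    float h = -0.75f;                   // e.g. (gMem.y - gMem.z)/chroma when G < B
    printf("%d\n", (int)h % 6);         // 0         -> the fractional hue is lost
    printf("%f\n", fmodf(h, 6.0f));     // -0.750000 -> preserved, as in the fix
    return 0;
}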
I made some more changes to remove artifacts:
#define E .0000001f
bool fEqual(float x, float y)
{
return (x+E > y && x-E < y);
}
__kernel void rgb2hsl(__global float *values, int numValues)
{
// thread index and total
int idx = get_global_id(0);
int idxVec3 = idx*3;
float3 gMem;
if (idx < numValues)
{
gMem.x = values[idxVec3];
gMem.y = values[idxVec3+1];
gMem.z = values[idxVec3+2];
}
barrier(CLK_LOCAL_MEM_FENCE);
gMem /= 255.0f; //convert from 256 color to float
//calculate chroma
float M = max(gMem.x, gMem.y);
M = max(M, gMem.z);
float m = min(gMem.x, gMem.y);
m = min(m, gMem.z);
float chroma = M-m; //calculate chroma
float lightness = (M+m)/2.0f;
float saturation = chroma/(1.0f-fabs(2.0f*lightness-1.0f));
float hue = 0;
if (fEqual(gMem.x, M))
hue = fmod((gMem.y - gMem.z)/chroma, 6.0f);
if (fEqual(gMem.y, M))
hue = (((gMem.z - gMem.x))/chroma) + 2;
if (fEqual(gMem.z, M))
hue = (((gMem.x - gMem.y))/chroma) + 4;
hue *= 60.0f;
barrier(CLK_LOCAL_MEM_FENCE);
if (M == m)
hue = saturation = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
if (idx < numValues)
{
//NOTE: ARTIFACTS SHOW UP if we do not cast to integer!
values[idxVec3] = (int)hue;
values[idxVec3+1] = saturation;
values[idxVec3+2] = lightness;
}
}

OpenCL Error Computing Matrix Multiplication during Runtime

I have been debugging for the past few days and cannot get this OpenCL matrix multiplication kernel to run. Whenever I run the program, the output from the GPU results in large negative numbers similar to -198746573.0000. I was wondering if someone with HPC experience could point out an error in my code or if it is an error with the driver.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define widthA 2
#define heightA 2
#define widthB heightA
#define heightB 2
#define widthC widthA
#define heightC heightB
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <opencl.h>
#endif
#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
float * A = (float *)malloc(sizeof(float)*widthA*heightA);
float * B = (float *)malloc(sizeof(float)*widthB*heightB);
float * C = (float *)malloc(sizeof(float)*widthC*heightC);
float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
float * D= (float *)malloc(sizeof(float)*widthC*heightC);
float ref[widthC][heightC];
int i, j, k;
FILE * fp1 = fopen("matAdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightA; j++) {
float p=(rand()%100)/7.0;
//*(A+i*heightA+j)=rand()%100 + p;
*(A+i*heightA+j)=4.0;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
fp1 = fopen("matBdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthB; i++)
{
for(j=0; j < heightB; j++) {
float p=(rand()%100)/7.0;
//*((B+i*heightB+j))=rand()%100 + p;
*((B+i*heightB+j))=4.0;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobjA = NULL;
cl_mem memobjB = NULL;
cl_mem memobjC = NULL;
cl_mem rowA = NULL;
cl_mem colC = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id[10];
cl_platform_id platform = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event GPUDone[1];
//char string[MEM_SIZE];
FILE *fp;
char fileName[] = "matrixMultiplication.cl";
char *source_str;
size_t source_size;
int row = widthA;
int col = heightC;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
/* Get Platform and Device Info */
ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms);
char cBuffer[1024];
cl_uint c;
for(c = 0; c < ret_num_platforms; c++)
{
clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
if (strstr(cBuffer, "NVIDIA") != NULL)
{
platform = platform_id[c];
break;
}
}
printf("Found Platform %s\n", cBuffer);
ret = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Found %d devices.\n", ret_num_devices);
/* Create OpenCL context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "matrixMultiplication", &ret);
/* Set OpenCL Kernel Arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
/* Execute OpenCL Kernel */
//ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
size_t globalThreads[2] = {widthA, heightB};
size_t localThreads[2] = {16,16};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
//clFlush(command_queue);
//clFinish(command_queue);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]);
printf("Buffer Read ended with %d.\n", ret);
clWaitForEvents(1, GPUDone);
fp1 = fopen("matGPURes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(Res+i*heightC+j));
ref[i][j] = *(Res+i*heightC+j);
printf("GPU Output: %f\n", *(Res+i*heightC+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobjA);
ret = clReleaseMemObject(memobjB);
ret = clReleaseMemObject(memobjC);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
ret = clReleaseEvent(GPUDone[0]);
free(source_str);
float sum=0.0;
for(i = 0;i < widthA; i++)
{
for(j = 0; j < heightC; j++)
{
sum = 0;
for(k = 0; k < widthB; k++)
{
sum += A[i*col+k] * B[k*row+j];
printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]);
}
D[i*heightC+j] = sum;
}
}
fp1 = fopen("matNormalMultiplicationRes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n");
exit(1);
}
for(i = 0; i<widthA; i++)
{
for(j = 0; j<heightA; j++)
{
if (ref[i][j] != D[i*heightA+j])
{
printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]);
}
}
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(D+i*heightC+j));
}
fprintf(fp1, "\n");
}
free(A);
free(B);
free(C);
free(D);
free(Res);
return 0;
}
Here is the kernel
#define BLOCK_SIZE 16
__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB )
{
//int i = get_global_id(0);
//int j = get_global_id(1);
float Csub = 0.0f;
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
for (int a = aBegin, b=bBegin;
a <= aEnd;
a += aStep, b+=bStep)
{
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
for( int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
/*
float value=0;
for ( int k = 0; k < widthA; k++)
{
value = value + A[k + j * widthA] * B[k*widthB + i];
}
C[i + widthA * j] = value;
*/
}
I have double-checked over and over again but simply cannot find any errors. I want to make sure it's not a code error before I conclude it's a driver issue.
Thanks!
Do you really need a complex kernel like that? If you just want to do a simple matrix multiplication, you can write a simple kernel like this, which is easy to debug:
__kernel void matrixMultiplication (__global float* A,
__global float* B,
__global float* C,
int widthA, int widthB )
{
//y direction
int row = get_global_id(1);
int col = get_global_id(0);
float cSum = 0.0f;
//calculate the result
for (int i=0; i<widthA; i++)
{
cSum += A[row*widthA+ i] * B[i*widthB+col];
}
C[row*widthB+col] = cSum;
}
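A usage sketch for this kernel, reusing the question's buffers (the 2D launch configuration is my assumption; with the question's 2x2 matrices every dimension is 2): launch one work-item per element of C, with the x dimension covering the columns (widthB) and the y dimension the rows (heightA).
int wA = widthA, wB = widthB;                       // matrix widths passed to the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&wB);
size_t globalThreads[2] = {widthB, heightA};        // x = columns of C, y = rows of C
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                             globalThreads, NULL, 0, NULL, NULL);  // let the runtime pick the local size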
Case is probably closed already, but for the sake of those arriving from Google:
Shouldn't the shared memory be explicitly allocated on the host and passed as a kernel argument, instead of being declared inside the kernel source? The __local keyword is not the one you are looking for in this case.
See the post How to declare local memory in OpenCL? for the detailed explanation.
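For completeness, passing work-group local memory from the host looks roughly like this; the extra __local parameters and their positions are assumptions for illustration, and BLOCK_SIZE (16) would have to be known on the host as well:
// Assumed kernel signature:
// __kernel void matrixMultiplication(__global float* A, __global float* B, __global float* C,
//                                    int wA, int wB,
//                                    __local float* As, __local float* Bs)
size_t tileBytes = 16 * 16 * sizeof(float);        // BLOCK_SIZE x BLOCK_SIZE floats
ret = clSetKernelArg(kernel, 5, tileBytes, NULL);  // NULL value pointer => allocate local memory
ret = clSetKernelArg(kernel, 6, tileBytes, NULL);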
Check the functionality of your host. Here are a few things to get you started ...
1) You don't need to create a buffer and enqueue it for a scalar constant int like row and col. Just set it as a kernel arg.
2) Wait for the clEnqueueNDRangeKernel with an event. You want to be sure the calculation has completed.
3) Add a printf statement in the kernel to print selected values, to check that the input and output values are what you expect. For example:
if (get_local_id(0) % 8 == 0)
{
printf("As=%f Bs=%f Csub=%f\n", As[ty][tx], Bs[ty][tx], Csub);
}
4) Try the host code with a dumb kernel that copies an input array to an output array. That will confirm that your buffer creation and enqueue read/write handling is correct! (See the sketch below.)
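A minimal copy kernel for that last sanity check could be as simple as this (sketch):
__kernel void copy_test(__global const float* in, __global float* out)
{
    int i = get_global_id(0);
    out[i] = in[i];
}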

JOGL ArcBall not working

I'm trying to embed an existing implementation of ArcBall in JOGL into my own project. It compiles and runs, but it doesn't work! I can't play around with the view.
I took the implementation (two classes) from here:
http://www.mdimension.com/page/Software?appNum=1
And I followed the instructions for embedding it into my own project. Here's the class I'm using ArcBall in:
public class GLRenderer implements GLEventListener {
private static final int MAP_SIZE = 1024;
private static final int STEP_SIZE = 16;
private static final float HEIGHT_RATIO = 1.5f;
private float[][] temperatureMap = new float[MAP_SIZE][MAP_SIZE];
private float scaleValue = 0.15f;
private GLU glu = new GLU();
private ArcBall arcBall = new ArcBall();
public void init(GLAutoDrawable drawable) {
GL gl = drawable.getGL();
gl.glShadeModel(GL.GL_SMOOTH);
gl.glClearColor(0.0f, 0.0f, 0.0f, 0.5f);
gl.glClearDepth(1.0f);
gl.glEnable(GL.GL_DEPTH_TEST);
gl.glDepthFunc(GL.GL_LEQUAL);
gl.glHint(GL.GL_PERSPECTIVE_CORRECTION_HINT, GL.GL_NICEST);
loadValuesToMap();
arcBall.registerDrawable(drawable);
}
public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
GL gl = drawable.getGL();
gl.glViewport(0, 0, width, height);
gl.glMatrixMode(GL.GL_PROJECTION); // Select The Projection Matrix
gl.glLoadIdentity();
glu.gluPerspective(30,(float)width/(float)height,1.0f,650.0);
gl.glMatrixMode(GL.GL_MODELVIEW);
gl.glLoadIdentity();
arcBall.reshape(width, height);
}
public void display(GLAutoDrawable drawable) {
arcBall.displayUpdateRotations();
GL gl = drawable.getGL();
gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
gl.glClear(GL.GL_COLOR_BUFFER_BIT);
gl.glClear(GL.GL_DEPTH_BUFFER_BIT); //added
gl.glMatrixMode(GL.GL_PROJECTION);
gl.glLoadIdentity();
setLight(gl);
positionCamera(glu, gl);
drawXYZ(gl);
arcBall.displayTransform(gl);
drawMap(glu, gl);
gl.glFlush();
}
public void setVertexColor(GL gl, int x, int y) {
float fColor = -0.15f + (temperatureMap[x][y] / 256.0f);
gl.glColor3f(0.0f, 0.0f, fColor);
}
public void drawMap(GLU glu, GL gl) {
int x, z;
float y;
gl.glBegin(GL.GL_QUADS);
for(int X = 0; X <(MAP_SIZE - STEP_SIZE); X += STEP_SIZE) {
for(int Y = 0; Y < (MAP_SIZE -STEP_SIZE); Y += STEP_SIZE) {
// Get The (X, Y, Z) Value For The Bottom Left Vertex
x = X;
y = temperatureMap[X][Y];
z = Y;
// Set The Color Value Of The Current Vertex
setVertexColor(gl, x, z);
gl.glVertex3f(x, y, z);
// Get The (X, Y, Z) Value For The Top Left Vertex
x = X;
y = temperatureMap[X][Y + STEP_SIZE];
z = Y + STEP_SIZE ;
// Set The Color Value Of The Current Vertex
setVertexColor(gl, x, z);
gl.glVertex3f(x, y, z); // Send This Vertex To OpenGL To Be Rendered
// Get The (X, Y, Z) Value For The Top Right Vertex
x = X + STEP_SIZE;
y = temperatureMap[X + STEP_SIZE][Y + STEP_SIZE];
z = Y + STEP_SIZE ;
// Set The Color Value Of The Current Vertex
setVertexColor(gl, x, z);
gl.glVertex3f(x, y, z); // Send This Vertex To OpenGL To Be Rendered
// Get The (X, Y, Z) Value For The Bottom Right Vertex
x = X + STEP_SIZE;
y = temperatureMap[X + STEP_SIZE][Y];
z = Y;
// Set The Color Value Of The Current Vertex
setVertexColor(gl, x, z);
gl.glVertex3f(x, y, z); // Send This Vertex To OpenGL To Be Rendered
}
}
gl.glEnd();
gl.glColor4f(1.0f, 1.0f, 1.0f, 1.0f);
gl.glTranslated(0.1, 0.1, -0.5);
gl.glColor3f(0.0f, 0.0f, 1.0f);
glu.gluSphere(glu.gluNewQuadric(), 0.05f, 32, 32);
gl.glTranslated(0.1, 0.1, -0.1);
gl.glColor3f(0.0f, 1.0f, 0.0f);
glu.gluSphere(glu.gluNewQuadric(), 0.05f, 32, 32);
gl.glTranslated(0.1, -0.1, 0.1);
gl.glColor3f(1.0f, 0.0f, 0.0f);
glu.gluSphere(glu.gluNewQuadric(), 0.05f, 32, 32);
}
public void positionCamera(GLU glu, GL gl) {
glu.gluPerspective(75.0f,1.09,0.1f,500.0f);
glu.gluLookAt(194, 80, 194,
131, 55, 131,
0, 1, 0);
gl.glScalef(scaleValue, scaleValue * HEIGHT_RATIO, scaleValue);
}
public void setLight(GL gl) {
// Prepare light parameters.
float SHINE_ALL_DIRECTIONS = 1;
float[] lightPos = {0, -30, 0, SHINE_ALL_DIRECTIONS};
float[] lightColorAmbient = {0.5f, 0.5f, 0.5f, 0.5f};
float[] diffuseLight = { 0.8f, 0.8f, 0.8f, 1.0f };
float[] lightColorSpecular = {0.5f, 0.5f, 0.5f, 0.5f};
// Set light parameters.
gl.glLightfv(GL.GL_LIGHT1, GL.GL_POSITION, lightPos, 1);
gl.glLightfv(GL.GL_LIGHT1, GL.GL_AMBIENT, lightColorAmbient, 0);
gl.glLightfv(GL.GL_LIGHT1, GL.GL_DIFFUSE, diffuseLight, 0);
gl.glLightfv(GL.GL_LIGHT1, GL.GL_SPECULAR, lightColorSpecular, 0);
// Enable lighting in GL.
gl.glEnable(GL.GL_LIGHT1);
gl.glEnable(GL.GL_LIGHTING);
// Set material properties.
gl.glEnable(GL.GL_COLOR_MATERIAL);
}
public void drawXYZ(GL gl) {
gl.glMatrixMode(GL.GL_MODELVIEW);
gl.glBegin(GL.GL_LINES);
gl.glColor3d(1.0, 0.0, 0.0); //red (x)
gl.glVertex3d(-0.1, 0.0, 0.0);
gl.glVertex3d(1500.0, 0.0, 0.0);
gl.glColor3d(0.0, 1.0, 0.0); //green (y)
gl.glVertex3d(0.0, -0.1, 0.0);
gl.glVertex3d(0.0, 1500.0, 0.0);
gl.glColor3d(0.0, 0.0, 1.0); //blue (z)
gl.glVertex3d(0.0, 0.0, 0.1);
gl.glVertex3d(0.0, 0.0, 1500.0);
gl.glEnd();
}
public void displayChanged(GLAutoDrawable drawable, boolean modeChanged, boolean deviceChanged) {
init(drawable);
}
private void loadValuesToMap() {
for(int i = 0; i < MAP_SIZE; i++) {
for(int j = 0; j< MAP_SIZE; j++) {
if(i > 300 && i < 700 && j > 300 && j < 700)
temperatureMap[i][j] = 150;
else
temperatureMap[i][j] = 100;
}
}
}
}
I'm new to OpenGL, so it might be a stupid mistake. I'd appreciate any help though.
Thanks
The source code is not complete. Where is your frame (AWT Frame or Swing JFrame)? Please look at the example of JOGL on Wikipedia.
