This question already has answers here:
Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D
(3 answers)
Closed 5 years ago.
I'm trying to allocate matrix on device, fill it with some number in kernel and then copy it back to host. Problem is that on host only one row seems to be filled.
I got something like this:
9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1
Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
tab[col * pitch + row] = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
h_tab = (int*)malloc(realSize);
check( cudaMemset(d_tab, 0, realSize) );
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );
showMatrix2(h_tab, width, height);
printf("\nPitch size: %d \n", pitch);
getchar();
return 0;
}
Any time you are having trouble with a CUDA code, in addition to doing error checking, run your code with cuda-memcheck. If you had done so, you would have gotten at least a hint as to what is going on, and then you could use techniques like this to continue your own debug. Even if you can't figure it out, the cuda-memcheck output will be useful to others trying to help you.
You have invalid writes in your kernel. There are multiple errors here. To properly access a pitched allocation in kernel code, I strongly recommend studying the example given in the documentation for cudaMallocPitch. In a nutshell, this kind of index generation is just broken:
tab[col * pitch + row]
Firstly, pitch returned by cudaMallocPitch is a width in bytes. You cannot use it as an adjustment to an index for quantities like int or float (study the documentation). Secondly, the pitch value should ultimately multiply a row index, not a column index.
not related to your problem, but your final printf statement has an incorrect format specifier if you are on a 64-bit platform, it should be %ld (or better, %lu).
Here is a code that has the indexing issue fixed, it seems to work correctly for me:
$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
*( ((int *)(((char *)tab) + (row * pitch))) + col) = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
h_tab = (int*)malloc(realSize);
check( cudaMemset(d_tab, 0, realSize) );
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );
showMatrix2(h_tab, width, height);
printf("\nPitch size: %ld \n", pitch);
return 0;
}
$ nvcc -arch=sm_61 -o t109 t109.cu
$ cuda-memcheck ./t109
========= CUDA-MEMCHECK
no error
no error
no error
---------------------
9 9 9 9
9 9 9 9
9 9 9 9
9 9 9 9
Pitch size: 512
========= ERROR SUMMARY: 0 errors
$
Related
Hello I have a task in MPI. In this task I will implement a parallel image processing algorithm. This algorihm calculates the average of each pixel's value and its eight neighbors. You can think of an image as a 2-dimensional array of color values, that is, a matrix, the smoothing algorithm can be applied to all values of this matrix, that is, the pixels of an image.
The figure below shows the softening process of the midpoint and is the average of 8 neighbors after softening. 3x3 Smoothing (20+40+10+10+20+20+10+20+30)/9 = 20
My program applies a smoothing algorithm to the input image and then stores the results in a new image. So, I need to write a sequential program program.c that takes two inputs; first for the name of the input image and the second for the name of the output image. This section does not need to include any parallel processing.
I have two library for reading and writing: <stb_image.h> and <stb_image_write.h>
mpcc task.c -lm -o task_mpi
./task_mpi -n 2 input.jpg output.jpg
I tried to work on the program a bit, but I could not make fruitful progress.
Mycode:
#include <stdint.h>
#include <stdio.h>
#include <mpi.h>
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#define CHANNEL_NUM 1
int main(int argc,char* argv[]) {
int width, height, bpp, total,rgb;
MPI_Init(NULL,NULL);
int id ;
int size;
MPI_Comm_rank(MPI_COMM_WORLD,&id);
MPI_Comm_size(MPI_COMM_WORLD,&size);
int *img_mpi;
int local_image = (height/3)*width / size;
int* recv_buf = (int*) malloc(sizeof(int)*local_image);
int full_image[height*width];
// Reading the image
uint8_t* rgb_image = stbi_load(argv[1], &width, &height, &bpp, CHANNEL_NUM);
printf("Width: %d Height: %d \n",width,height);
for(int ii = 0; ii < height*width;ii++)
{
full_image[ii]=&rgb_image[ii];
}
MPI_Scatter(full_image, local_image ,MPI_INT, &recv_buf,local_image, MPI_INT, 1, MPI_COMM_WORLD);
if(id == 1)
{
for(int i=1;i<height;i++)
{
for(int j=1;j<width;j++)
{
total =
recv_buf,[(i-1)*width +(j-1)] +
recv_buf,[(i-1)*width +j] +
recv_buf,[(i-1)*width +(j+1)] +
recv_buf,[(i)*width +(j-1)] +
recv_buf,[i*width + j] +
recv_buf,[(i)*width +(j+1)] +
recv_buf,[(i+1)*width +(j-1)] +
recv_buf,[(i+1)*width +j] +
recv_buf,[(i+1)*width +(j+1)];
rgb = (total / 9);
recv_buf[i*width + j]= rgb;
}
}
}
MPI_Gather(&recv_buf, local_image ,MPI_INT, full_image, local_image, MPI_INT, 0, MPI_COMM_WORLD);
if(id == 0)
{
rgb_image = &recv_buf;
}
// Stoing the image
stbi_write_jpg(argv[2], width, height, CHANNEL_NUM, rgb_image, 100);
stbi_image_free(rgb_image);
MPI_Finalize();
return 0;
}
I just bought a 8x32 lattice board (led matrix) and I control it with Arduino. The problem is that I can only use text with the library I got on github. But not numbers, how can I do it?
I'm going to put the code below, the code of the scrolling text and the part of the code in the library that specifies the function used to set the text.
The arduino code that program the scrolling text is here:
#include <HT1632.h>
#include <font_5x4.h>
#include <images.h>
int i = 0;
int wd;
char disp[] = "Hello, how are you?";
int x = 10;
void setup() {
HT1632.begin(A5, A4, A3);
wd = HT1632.getTextWidth(disp, FONT_5X4_END, FONT_5X4_HEIGHT);
}
void loop() {
HT1632.renderTarget(1);
HT1632.clear();
HT1632.drawText(disp, OUT_SIZE - i, 2, FONT_5X4, FONT_5X4_END,
FONT_5X4_HEIGHT);
HT1632.render();
i = (i + 1) % (wd + OUT_SIZE);
delay(100);
}
The library code that specifies the printing of the text is this:
void HT1632Class::drawText(const char text[], int x, int y, const byte font[],
int font_end[], uint8_t font_height,
uint8_t gutter_space) {
int curr_x = x;
char i = 0;
char currchar;
// Check if string is within y-bounds
if (y + font_height < 0 || y >= COM_SIZE)
return;
while (true) {
if (text[i] == '\0')
return;
currchar = text[i] - 32;
if (currchar >= 65 &&
currchar <=
90) // If character is lower-case, automatically make it upper-case
currchar -= 32; // Make this character uppercase.
if (currchar < 0 || currchar >= 64) { // If out of bounds, skip
++i;
continue; // Skip this character.
}
// Check to see if character is not too far right.
if (curr_x >= OUT_SIZE)
break; // Stop rendering - all other characters are no longer within the
// screen
// Check to see if character is not too far left.
int chr_width = getCharWidth(font_end, font_height, currchar);
if (curr_x + chr_width + gutter_space >= 0) {
drawImage(font, chr_width, font_height, curr_x, y,
getCharOffset(font_end, currchar));
// Draw the gutter space
for (char j = 0; j < gutter_space; ++j)
drawImage(font, 1, font_height, curr_x + chr_width + j, y, 0);
}
curr_x += chr_width + gutter_space;
++i;
}
}
You need to look at snprintf. This allows you to format a string of characters just like printf. It allows you to convert something like a int into a part of a string.
an example:
int hour = 10;
int minutes = 50;
char buffer[60];
int status = snprintf(buffer, 60, "the current time is: %i:%i\n", hour, minutes);
buffer now contains:"the current time is: 10:50" (and several empty characters past the \0).
I need to implement a Rc4 algorithm with a seed: 1 2 3 6 and the plain text cryptology. I am following this guideline we were provided in class, but it's not initializing S correctly.
my output is
and needs to be
My code was previously printing negative values , not sure why but I managed to fix that error. Thought everything was good to go but it's not. Sorry for the pictures, I figured it was easier to explain what I was following for my code structure. I am mod 4 the seed since it contains 4 characters, could that possibly be my error?
#include <iostream>
#include <string>
#include <string.h>
using std::endl;
using std::string;
void swap(unsigned int *x, unsigned int *y);
int main()
{
string plaintext = "cryptology";
char cipherText[256] = { ' ' };
unsigned int S[256] = { 0 };
unsigned int t[256] = { 0 };
unsigned int seed[4] = { 1, 2, 3, 6 }; // seed used for test case 1
unsigned int temp = 0;
int runningTotal = 0;
unsigned int key = 0;
// inilializing s and t
for (int i = 0; i < 256; i++)
{
S[i] = i;
t[i] = seed[i % 4];
}
for (int i = 0; i < 256; i++)
{
runningTotal += S[i] + t[i];
runningTotal %= 256;
swap(&S[runningTotal], &S[i]);
std::cout << S[i] <<" ";
}
runningTotal = 0;
for (int i = 0; i < plaintext.size(); i++)
{
runningTotal %= 256;
swap(&S[i], &S[runningTotal]);
temp = (unsigned int)S[i] + (unsigned int)S[runningTotal];
temp %= 256;
key = S[temp];
std::cout << endl;
cipherText[i] = plaintext[i] ^ key;
}
std::cout << " this is cipher text " << endl;
std::cout << cipherText << endl;
system("pause");
return 0;
}
void swap(unsigned int *x, unsigned int *y)
{
unsigned int temp = 0;
temp = *x;
*x = *y;
*y = temp;
}
Actually I think you're generating S[] correctly. I can only assume you're supposed to do something different with the key. (Perhaps's its an ASCII string instead of four byte values? Check your assignment notes.)
There is a problem later on, however. In the stream generation loop, you're supposed to do the increment and swap operations before you fetch a byte from S[].
for (int k = 0; k < plaintext.size(); k++)
{
i = (i+1) % 256; // increment S[] index
runningTotal = (runningTotal + S[i]) % 256; // swap bytes
swap(&S[i], &S[runningTotal]);
temp = (S[i] + S[runningTotal]) % 256; // fetch byte from S and
cipherText[k] = plaintext[k] ^ S[temp]; // XOR with plaintext
}
NOTE: Although unrelated to your question, your code could be made a lot tidier by using unsigned char values instead of ints. That would eliminate the % 256 instructions that are littered all over the place. (But be careful during initialization, because i<256 will always be true if i is an unsigned char.)
In OpenCL, if I want to add two N-dimension vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000, and localSize = 128, globalSize should be 1024? Can we always set globalSize some multiple of localSize and larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2d problems, for example, multiply two matrices of dimension m*n and n*p, the result matrix is of order m*p, things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I find out the indices where the matrix element is wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...)
Why?
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
{
int i = get_global_id(1);
int j = get_global_id(0);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
else
{
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
}
}
And then it is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
{
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
{
A[i] = i;
}
for(int i = 0; i < n*p; i++)
{
B[i] = i;
}
try
{
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("mmul.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
{
float z = 0.0;
for(int k = 0; k < n; k++)
{
z += A[i*n+k] * B[k*p+j];
}
c[i*p+j] = z;
}
for(int i = 0; i < m*p; i++)
{
if(fabs(c[i]-C[i])>0.001)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
}
delete []A;
delete []B;
delete []C;
}
catch(Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
return 0;
}
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
Let's assume, that you have float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine size of Local Group first:
size_t localSize[] = {16, 8};
Then determine, how many Local Groups do you need:
size_t numLocalGroups[] = {ceil(size/localSize[0]), ceil(size/localSize[1])};
Finally, determine NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in right-most Local Groups.
I need to find average for thousands (20,000+) images represented by unsigned short arrays. Could you please check me, it looks for me that this code is not optimal:
my kernel:
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
register float divider = div;
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] / divider + B[idx];
}
//__syncthreads();
}
kernel wrapper:
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
unsigned short* deviceData1;
float* deviceData2;
cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
cudaMalloc((void**)&deviceData2, length * sizeof(float));
cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree( deviceData1 );
cudaFree( deviceData2 );
}`
and I do
float* avrg2f = (float*)malloc( width * height * sizeof(float));
memset( avrg2f, 0.0, sizeof(float) * width * height);
for (int k = 0; k < count; k++) {
imageObjectList.at( curObj )->getImage( k );
kernel_wrapper( avrg1, avrg2f, height * width, (float)count);
}
as result may averaged image will be in avrg2f;
Thank you.
If the images are all the same size, then your wrapper function need not do cudaMalloc and cudaFree operations on every call.
Pre-allocate that storage needed, and don't allocated and free it on every call to the wrapper.
In addition you may see something like a ~2x speedup (for the cudaMemcpy operations) if you use pinned allocations (cudaHostAlloc) on the host side for your image storage.
Finally, for the duration of your loop, there's no need to copy the results back to the host. Do this after you're done computing the average. This will save 2 out of the 3 cudaMemcpy operations you are doing in the wrapper.
While we're at it, in my opinion using memset to initialize a float array is questionable. It works for a zero value, but essentially no other. Furthermore, I would expect passing 0.0 as the second parameter to memset to at least throw a compiler warning.
The following code shows the above optimizations, and demonstrates about an 8x speedup over your code in my test case:
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
register float divider = div;
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] / divider + B[idx];
}
//__syncthreads();
}
__global__ void VecAdd2(unsigned short *A, float *B, unsigned int Size, float mult){
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] * mult + B[idx];
}
}
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
unsigned short* deviceData1;
float* deviceData2;
cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
cudaMalloc((void**)&deviceData2, length * sizeof(float));
cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree( deviceData1 );
cudaFree( deviceData2 );
}
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2, unsigned int length, float my_mult)
{
cudaMemcpy(d_pixels1, h_pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2, length, my_mult );
}
int main(){
const int count = 2000;
const int width = 1280;
const int height = 960;
timeval t1, t2;
unsigned long et;
unsigned short *h1_image;
h1_image = (unsigned short *)malloc(height*width*sizeof(unsigned short));
float* avrg2f = (float*)malloc( width * height * sizeof(float));
for (int i = 0; i<height*width; i++){
h1_image[i] = (i%256);
avrg2f[i] = 0.0f;
}
gettimeofday(&t1,NULL);
for (int k = 0; k < count; k++) {
kernel_wrapper( h1_image, avrg2f, height * width, (float)count);
}
gettimeofday(&t2,NULL);
et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
printf("time 1 = %ld us\n", et);
unsigned short *h2_image;
float* avrg3f = (float*)malloc( width * height * sizeof(float));
cudaHostAlloc((void **)&h2_image, height*width*sizeof(unsigned short), cudaHostAllocDefault);
for (int i = 0; i<height*width; i++){
h2_image[i] = (i%256);
avrg3f[i] = 0.0f;
}
gettimeofday(&t1,NULL);
unsigned short *d_image;
float *d_result;
cudaMalloc((void **)&d_image, height*width*sizeof(unsigned short));
cudaMalloc((void **)&d_result, height*width*sizeof(float));
cudaMemcpy(d_result, avrg3f, height*width*sizeof(float), cudaMemcpyHostToDevice);
for (int k = 0; k < count; k++) {
kernel_wrapper2( h2_image, d_image, d_result, height * width, (float)(1/(float)count));
}
cudaMemcpy(avrg3f, d_result, height*width*sizeof(float), cudaMemcpyDeviceToHost);
gettimeofday(&t2,NULL);
et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
printf("time 2 = %ld us\n", et);
for (int i = 0; i < (height*width); i++)
if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]); return 1;}
return 0;
}