cuFFT wrong result for batched 1D IFFT (inverse)

I am new to CUDA and cuFFT. When I tried to recover the original data from the result of cufftExecR2C(...) by applying the corresponding cufftExecC2R(...), it went wrong: the recovered data and the original data are not identical.
Here is the code; the CUDA version I used was CUDA 9.0.
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cufft.h"
#include <iostream>
#include <sys/time.h>
#include <cstdio>
#include <cmath>
using namespace std;
// cuda error check
#define gpuErrchk(ans) {gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
// ifft scale for cufft
__global__ void IFFTScale(int scale_, cufftReal* real) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
real[idx] *= 1.0 / scale_;
}
void batch_1d_irfft2_test() {
const int BATCH = 3;
const int DATASIZE = 4;
/// RFFT
// --- Host side input data allocation and initialization
cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
hostInputData[i * DATASIZE + j] = (cufftReal)(i * DATASIZE + j + 1);
}
}
// DEBUG:print host input data
cout << "print host input data" << endl;
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
cout << hostInputData[i * DATASIZE + j] << ", ";
}
cout << endl;
}
cout << "=====================================================" << endl;
// --- Device side input data allocation and initialization
cufftReal *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Device side output data allocation
cufftComplex *deviceOutputData;
gpuErrchk(cudaMalloc(
(void**)&deviceOutputData,
(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
// Host side input data copied to Device side
cudaMemcpy(deviceInputData,
hostInputData,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyHostToDevice);
// --- Batched 1D FFTs
cufftHandle handle;
int rank = 1; // --- 1D FFTs
int n[] = {DATASIZE}; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH; // --- Number of batched executions
cufftPlanMany(
&handle,
rank,
n,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_R2C,
batch);
cufftExecR2C(handle, deviceInputData, deviceOutputData);
// **************************************************************************
/// IRFFT
cufftReal *deviceOutputDataIFFT;
gpuErrchk(cudaMalloc((void**)&deviceOutputDataIFFT, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Batched 1D IFFTs
cufftHandle handleIFFT;
int n_ifft[] = {DATASIZE / 2 + 1}; // --- Size of the Fourier transform
idist = DATASIZE / 2 + 1; odist = DATASIZE; // --- Distance between batches
cufftPlanMany(
&handleIFFT,
rank,
n_ifft,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_C2R,
batch);
cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);
/* scale
// dim3 dimGrid(512);
// dim3 dimBlock(max((BATCH * DATASIZE + 512 - 1) / 512, 1));
// IFFTScale<<<dimGrid, dimBlock>>>((DATASIZE - 1) * 2, deviceOutputData);
*/
// host output data for ifft
cufftReal *hostOutputDataIFFT = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
cudaMemcpy(hostOutputDataIFFT,
deviceOutputDataIFFT,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyDeviceToHost);
// print IFFT recovered host output data
cout << "print host output IFFT data" << endl;
for (int i=0; i<BATCH; i++) {
for (int j=0; j<DATASIZE; j++) {
cout << hostOutputDataIFFT[i * DATASIZE + j] << ", ";
}
printf("\n");
}
cufftDestroy(handle);
gpuErrchk(cudaFree(deviceOutputData));
gpuErrchk(cudaFree(deviceInputData));
gpuErrchk(cudaFree(deviceOutputDataIFFT));
free(hostOutputDataIFFT);
free(hostInputData);
}
int main() {
batch_1d_irfft2_test();
return 0;
}
I compile the 'rfft_test.cu' file with nvcc -o rfft_test rfft_test.cu -lcufft. The result is as below:
print host input data
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
=====================================================
print IFFT recovered host output data
6, 8.5359, 15.4641, 0,
22, 24.5359, 31.4641, 0,
38, 40.5359, 47.4641, 0,
Specifically, I checked the scaling issue for cufftExecC2R(...) and commented out the IFFTScale() kernel. So I expected the recovered output to be DATASIZE * input_batched_1d_data, but even so, the result is not as expected.
I have checked the cuFFT manual and my code several times, and I also searched NVIDIA forums and StackOverflow answers, but I didn't find a solution. Any help is greatly appreciated.
Thanks in advance.

The size of your inverse transform is incorrect: it should be DATASIZE, not DATASIZE/2+1.
The following sections of the cuFFT docs should help:
https://docs.nvidia.com/cuda/cufft/index.html#data-layout
https://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional
"In C2R mode an input array (x_1, x_2, ..., x_{⌊N/2⌋+1}) of only non-redundant complex elements is required." Here N is the transform size you pass to the plan function.

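Putting the above together, something like this (an untested sketch that reuses the variable names from the code in the question) is what the corrected inverse plan plus the normalization could look like:
// Batched 1D C2R plan: the logical transform length stays DATASIZE;
// only idist/odist describe the (DATASIZE/2 + 1) complex input layout.
cufftHandle handleIFFT;
int n_ifft[] = {DATASIZE};            // transform size, NOT DATASIZE / 2 + 1
idist = DATASIZE / 2 + 1;             // complex elements per input batch
odist = DATASIZE;                     // real elements per output batch
cufftPlanMany(&handleIFFT, rank, n_ifft,
              inembed, istride, idist,
              onembed, ostride, odist,
              CUFFT_C2R, batch);
cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);
// cuFFT transforms are unnormalized, so R2C followed by C2R returns the
// input multiplied by DATASIZE. Launching the question's IFFTScale kernel
// with exactly DATASIZE * BATCH threads (it has no bounds check) and the
// scale DATASIZE undoes that:
IFFTScale<<<1, DATASIZE * BATCH>>>(DATASIZE, deviceOutputDataIFFT);
gpuErrchk(cudaDeviceSynchronize());
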
Related

Getting undesired behavior when sending-receiving messages using MPI

I'm exploring MPI in C++ and I wanted to parallelize the creation of a picture of the Mandelbrot set. I'm using the ppm format. Each processor builds its part and sends it back to the main process that receives it as MPI_CHAR. This is the code:
#include "mpi.h"
#include <iostream>
#include <string>
#include <fstream>
#include <complex>
#include <sstream> // for std::stringstream used below
using namespace std;
int mandelbrot(int x, int y, int width, int height, int max) {
complex<float> point((float) (y - height/2.0) * 4.0/width, (float) (x - width/2.0) * 4.0/width);
complex<float> z(0, 0);
unsigned int iteration = 0;
while (abs(z) < 4 && iteration < max) {
z = z * z + point;
iteration++;
}
return iteration;
}
int main(int argc, char **argv) {
int numprocs;
int myid;
int buff_size = 404270; // 200x200
char buff[buff_size];
int i;
MPI_Status stat;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
int width = 200, height = 200, max_iter = 1000;
if (myid == 0) {
ofstream image("mandel.ppm");
image << "P3\n" << width << " " << height << " 255\n";
for(i=1; i < numprocs; i++) {
MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
int length;
MPI_Get_count(&stat, MPI_CHAR, &length);
MPI_Recv(buff, length, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
image << buff;
}
} else {
stringstream ss;
// proc rank: 1, 2, ..., n
int part = height/(numprocs-1), start = (myid - 1) * part, end = part * myid;
printf("%d -> %d\n", start, end);
for (int row = start; row < end; row++) {
for (int col = 0; col < width; col++) {
int iteration = mandelbrot(row, col, width, height, max_iter);
if (row == start) ss << 255 << ' ' << 255 << ' ' << 255 << "\n";
else if (iteration < max_iter) ss << iteration * 255 << ' ' << iteration * 20 << ' ' << iteration * 5 << "\n";
else ss << 0 << ' ' << 0 << ' ' << 0 << "\n";
}
}
printf("\n sizeof = %d\n", ss.str().length());
MPI_Send(ss.str().c_str(), ss.str().length(), MPI_CHAR, 0, 0, MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
Code compilation:
$ mpic++ -std=c++0x mpi.mandel.cpp -o mpi.mandel
Running with 3 processes (process main + process rank 1 and 2)
$ mpirun -np 3 ./mpi.mandel
Resulting ppm pictures when running with 3, 4, and 5 processes (images not included here):
It seems that the point-to-point send/receive communication mixes up the results when more than 3 processes send their MPI_CHAR data to the main process. How can I avoid this behavior?
It works when the buffer buff is created with the same length as the received message and null-terminated; the original fixed-size buff was never null-terminated, so image << buff wrote whatever bytes happened to follow the received data:
.
.
for (int i = 1; i < numprocs; i++) {
    MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
    int length;
    MPI_Get_count(&stat, MPI_CHAR, &length);
    printf("\nfrom %d <<-- %d (stat.source=%d) Receiving %d chars\n", myid, i, stat.MPI_SOURCE, length);
    char buff[length + 1];
    MPI_Recv(buff, length, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    buff[length] = '\0';
    image << buff;
}
.
.
Thus, we no longer need the declarations at the beginning: int buff_size = 404270; and char buff[buff_size];
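If you want to avoid the variable-length array (a compiler extension in C++), an equivalent untested sketch with std::vector<char> writes exactly length bytes, so no terminating '\0' is needed (requires #include <vector>):
for (int i = 1; i < numprocs; i++) {
    MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
    int length;
    MPI_Get_count(&stat, MPI_CHAR, &length);
    std::vector<char> buff(length);                 // sized per incoming message
    MPI_Recv(buff.data(), length, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    image.write(buff.data(), length);               // write exactly length bytes
}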

Implementing Rc4 algorithm

I need to implement the RC4 algorithm with the seed 1 2 3 6 and the plaintext cryptology. I am following the guideline we were provided in class, but it's not initializing S correctly.
My output and the required output were shown as screenshots (not included here).
My code was previously printing negative values; I'm not sure why, but I managed to fix that error. I thought everything was good to go, but it's not. Sorry for the pictures; I figured it was easier to show what I was following for my code structure. I take the seed index mod 4 since the seed contains 4 values, could that possibly be my error?
#include <iostream>
#include <string>
#include <string.h>
using std::endl;
using std::string;
void swap(unsigned int *x, unsigned int *y);
int main()
{
string plaintext = "cryptology";
char cipherText[256] = { ' ' };
unsigned int S[256] = { 0 };
unsigned int t[256] = { 0 };
unsigned int seed[4] = { 1, 2, 3, 6 }; // seed used for test case 1
unsigned int temp = 0;
int runningTotal = 0;
unsigned int key = 0;
// initializing S and t
for (int i = 0; i < 256; i++)
{
S[i] = i;
t[i] = seed[i % 4];
}
for (int i = 0; i < 256; i++)
{
runningTotal += S[i] + t[i];
runningTotal %= 256;
swap(&S[runningTotal], &S[i]);
std::cout << S[i] <<" ";
}
runningTotal = 0;
for (int i = 0; i < plaintext.size(); i++)
{
runningTotal %= 256;
swap(&S[i], &S[runningTotal]);
temp = (unsigned int)S[i] + (unsigned int)S[runningTotal];
temp %= 256;
key = S[temp];
std::cout << endl;
cipherText[i] = plaintext[i] ^ key;
}
std::cout << " this is cipher text " << endl;
std::cout << cipherText << endl;
system("pause");
return 0;
}
void swap(unsigned int *x, unsigned int *y)
{
unsigned int temp = 0;
temp = *x;
*x = *y;
*y = temp;
}
Actually I think you're generating S[] correctly. I can only assume you're supposed to do something different with the key. (Perhaps it's an ASCII string instead of four byte values? Check your assignment notes.)
There is a problem later on, however. In the stream generation loop, you're supposed to do the increment and swap operations before you fetch a byte from S[].
for (int k = 0; k < plaintext.size(); k++)
{
    i = (i + 1) % 256;                           // increment S[] index
    runningTotal = (runningTotal + S[i]) % 256;  // swap bytes
    swap(&S[i], &S[runningTotal]);
    temp = (S[i] + S[runningTotal]) % 256;       // fetch byte from S and
    cipherText[k] = plaintext[k] ^ S[temp];      // XOR with plaintext
}
NOTE: Although unrelated to your question, your code could be made a lot tidier by using unsigned char values instead of ints. That would eliminate the % 256 instructions that are littered all over the place. (But be careful during initialization, because i<256 will always be true if i is an unsigned char.)
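For illustration only, here is an untested sketch of that keystream loop with unsigned char indices (i2, j2, t2 are hypothetical names; S, plaintext and cipherText as in your code; std::swap from <utility> replaces the custom swap):
#include <utility>                               // std::swap
// ...
unsigned char i2 = 0, j2 = 0;                    // both start at 0 for the keystream
for (size_t k = 0; k < plaintext.size(); k++)
{
    i2 = (unsigned char)(i2 + 1);                // wraps past 255 automatically
    j2 = (unsigned char)(j2 + S[i2]);
    std::swap(S[i2], S[j2]);
    unsigned char t2 = (unsigned char)(S[i2] + S[j2]);
    cipherText[k] = (char)(plaintext[k] ^ S[t2]);
}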

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimension vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000, and localSize = 128, globalSize should be 1024? Can we always set globalSize some multiple of localSize and larger than needed?
I tried many times and it worked well for 1-dimension problems.
However, when it comes to 2d problems, for example, multiply two matrices of dimension m*n and n*p, the result matrix is of order m*p, things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimension case but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I find out the indices where the matrix element is wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...)
Why?
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
{
int i = get_global_id(1);
int j = get_global_id(0);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
else
{
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
}
}
And then it is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
{
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
{
A[i] = i;
}
for(int i = 0; i < n*p; i++)
{
B[i] = i;
}
try
{
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("mmul.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
{
float z = 0.0;
for(int k = 0; k < n; k++)
{
z += A[i*n+k] * B[k*p+j];
}
c[i*p+j] = z;
}
for(int i = 0; i < m*p; i++)
{
if(fabs(c[i]-C[i])>0.001)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
}
delete []A;
delete []B;
delete []C;
}
catch(Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
return 0;
}
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
Let's assume that you have a float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine size of Local Group first:
size_t localSize[] = {16, 8};
Then determine how many Local Groups you need (rounding up, since integer division truncates):
size_t numLocalGroups[] = {(size + localSize[0] - 1) / localSize[0], (size + localSize[1] - 1) / localSize[1]};
Finally, determine NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds access in right-most Local Groups.
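Applied to the C++ wrapper code in the question (m, p, kernel and queue as declared there), a rough, untested sketch of that sizing would be:
// Round each global dimension up to a multiple of the work-group size.
size_t localX = 16, localY = 8;                          // 16*8 = 128 work-items per group
size_t globalX = ((p + localX - 1) / localX) * localX;   // columns, j = get_global_id(0)
size_t globalY = ((m + localY - 1) / localY) * localY;   // rows,    i = get_global_id(1)
queue.enqueueNDRangeKernel(kernel, NullRange,
                           NDRange(globalX, globalY),
                           NDRange(localX, localY));
// The padded work-items with i >= Mdim or j >= Pdim must return early,
// which is exactly what the corrected >= bounds check in the kernel does.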

Spellcheck program using MPI

So, my assignment is to write a spell-check program and then parallelize it using OpenMPI. My approach was to load the words from a text file into an array called dict[], which serves as my dictionary. Next, I get input from the user, and the program is supposed to go through the dictionary array and check whether each word is within the threshold percentage; if it is, print it out. But I'm only supposed to print a certain number of words. My problem is that my suggestions[] array doesn't fill up the way I need it to: it gets a lot of blank spots in it, even though, the way I wrote it, a word should only be added when it is within the threshold, so there shouldn't be any blanks until no more words are being added. I think it's close to being finished, but I can't figure this part out. Any help is appreciated.
#include <stdio.h>
#include <mpi.h>
#include <string.h>
#include <stdlib.h>
#define SIZE 30
#define max(x,y) (((x) > (y)) ? (x) : (y))
char *dict[50000];
char *suggestions[50000];
char enterWord[50];
char *myWord;
int wordsToPrint = 20;
int threshold = 40;
int i;
int words_added = 0;
int levenshtein(const char *word1, int len1, const char *word2, int len2){
int matrix[len1 + 1][len2 + 1];
int a;
for(a=0; a<= len1; a++){
matrix[a][0] = a;
}
for(a=0;a<=len2;a++){
matrix[0][a] = a;
}
for(a = 1; a <= len1; a++){
int j;
char c1;
c1 = word1[a-1];
for(j = 1; j <= len2; j++){
char c2;
c2 = word2[j-1];
if(c1 == c2){
matrix[a][j] = matrix[a-1][j-1];
}
else{
int delete, insert, substitute, minimum;
delete = matrix[a-1][j] +1;
insert = matrix[a][j-1] +1;
substitute = matrix[a-1][j-1] +1;
minimum = delete;
if(insert < minimum){
minimum = insert;
}
if(substitute < minimum){
minimum = substitute;
}
matrix[a][j] = minimum;
}//else
}//for
}//for
return matrix[len1][len2];
}//levenshtein
void prompt(){
printf("Enter word to search for: \n");
scanf("%s", &enterWord);
}
int p0_compute_output(int num_processes, char *word1){
int totalNumber = 0;
int k = 0;
int chunk = 50000 / num_processes;
for(i = 0; i < chunk; i++){
int minedits = levenshtein(word1, strlen(word1), dict[i], strlen(dict[i]));
int thresholdPercentage = (100 * minedits) / max(strlen(word1), strlen(dict[i]));
if(thresholdPercentage < threshold){
suggestions[totalNumber] = dict[i];
totalNumber = totalNumber + 1;
}
}//for
return totalNumber;
}//p0_compute_output
void p0_receive_output(int next_addition){
int num_to_add;
MPI_Comm comm;
MPI_Status status;
MPI_Recv(&num_to_add,1,MPI_INT,MPI_ANY_SOURCE, MPI_ANY_TAG,MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("--%d\n", num_to_add);
suggestions[next_addition] = dict[num_to_add];
next_addition = next_addition + 1;
}
void compute_output(int num_processes, int me, char *word1){
int chunk = 0;
int last_chunk = 0;
MPI_Comm comm;
if(50000 % num_processes == 0){
chunk = 50000 / num_processes;
last_chunk = chunk;
int start = me * chunk;
int end = me * chunk + chunk;
for(i = start; i < end;i++){
int minedits = levenshtein(word1, strlen(word1), dict[i], strlen(dict[i]));
int thresholdPercentage = (100 * minedits) / max(strlen(word1), strlen(dict[i]));
if(thresholdPercentage < threshold){
int number_to_send = i;
MPI_Send(&number_to_send, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
}
}
}
else{
chunk = 50000 / num_processes;
last_chunk = 50000 - ((num_processes - 1) * chunk);
if(me != num_processes){
int start = me * chunk;
int end = me * chunk + chunk;
for(i = start; i < end; i++){
int minedits = levenshtein(word1, strlen(word1), dict[i], strlen(dict[i]));
int thresholdPercentage = (100 * minedits) / max(strlen(word1), strlen(dict[i]));
if(thresholdPercentage < threshold){
int number_to_send = i;
MPI_Send(&number_to_send, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
}//if
}//for
}//if me != num_processes
else{
int start = me * chunk;
int end = 50000 - start;
for(i = start; i < end; i++){
int minedits = levenshtein(word1, strlen(word1), dict[i], strlen(dict[i]));
int thresholdPercentage = (100 * minedits) / max(strlen(word1), strlen(dict[i]));
if(thresholdPercentage < threshold){
int number_to_send = i;
MPI_Send(&number_to_send, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
}
}
}//me == num_processes
}//BIG else
return;
}//COMPUTE OUTPUT
void set_data(){
prompt();
MPI_Bcast(&enterWord,20 ,MPI_CHAR, 0, MPI_COMM_WORLD);
}//set_data
//--------------------------MAIN-----------------------------//
int main(int argc, char **argv){
int ierr, num_procs, my_id, loop;
FILE *myFile;
loop = 0;
for(i=0;i<50000;i++){
suggestions[i] = calloc(SIZE, sizeof(char));
}
ierr = MPI_Init(NULL, NULL);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &my_id);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
printf("Check in from %d of %d processors\n", my_id, num_procs);
set_data();
myWord = enterWord;
myFile = fopen("words", "r");
if(myFile != NULL){
for(i=0;i<50000;i++){
dict[i] = calloc(SIZE, sizeof(char));
fscanf(myFile, "%s", dict[i]);
}//for
fclose(myFile);
}//read word list into dictionary
else printf("File not found");
if(my_id == 0){
words_added = p0_compute_output(num_procs, enterWord);
printf("words added so far: %d\n", words_added);
p0_receive_output(words_added);
printf("Threshold: %d\nWords To print: %d\n%s\n", threshold, wordsToPrint, myWord);
ierr = MPI_Finalize();
}
else{
printf("my word %s*\n", enterWord);
compute_output(num_procs, my_id, enterWord);
// printf("Process %d terminating...\n", my_id);
ierr = MPI_Finalize();
}
for(i=0;i<wordsToPrint;i++){
printf("*%s\n", suggestions[i]);
}//print suggestions
return (0);
}//END MAIN
Here are a few problems I see with what you're doing:
prompt() should only be called by rank 0.
The dictionary file should be read only by rank 0, which then broadcasts the array to the other ranks.
Alternatively, have rank 1 read the file while rank 0 is waiting for input, and broadcast both the input and the dictionary afterwards.
You're making the compute_output step overly complex. You can merge p0_compute_output and compute_output into one routine.
Store an array of indices into dict in each rank
This array will not be the same size in every rank, so the simplest approach is to send from each rank a single integer indicating the size of the array, and then send the array itself with that size (the receiving rank must know how much data to expect); a sketch of this follows the list. You could also use the sizes with MPI_Gatherv, but I expect that is more than you want to do right now.
Once you have a single array of indices in rank 0, then use this to fill suggestions.
Save the MPI_Finalize call until immediately before the return call
For the final printf call, only rank 0 should be printing that. I suspect this is causing a large part of the "incorrect" result. As you have it, all ranks are printing suggestions, but it is only filled in rank 0. So the others will all be printing blank entries.
Try some of these changes, especially the last one, and see if that helps.
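As a rough, untested sketch of the "send the count, then the indices" idea (matches and numMatches are hypothetical names; each worker would collect the indices of its matching dict entries into a local array instead of issuing one MPI_Send per word):
/* worker ranks: matches[] holds indices into dict, numMatches is how many */
MPI_Send(&numMatches, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
MPI_Send(matches, numMatches, MPI_INT, 0, 1, MPI_COMM_WORLD);
/* rank 0: receive a count and an index array from every worker;
   rank 0's own matches (words_added of them) are already in suggestions[] */
int total = words_added;
for (int src = 1; src < num_procs; src++) {
    int numMatches;
    MPI_Recv(&numMatches, 1, MPI_INT, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    int *matches = (int*)malloc(numMatches * sizeof(int));
    MPI_Recv(matches, numMatches, MPI_INT, src, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    for (int k = 0; k < numMatches; k++)
        strcpy(suggestions[total++], dict[matches[k]]);   /* suggestions[] was calloc'd in main */
    free(matches);
}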

mpi dot product using point to point operations fails when data is large

I have the code below to get the dot product of two vectors of size VECTORSIZE. The code works fine for VECTORSIZE up to 10000, but beyond that it gives unrelated results. When I tried to debug the program, I saw that processor 0 (the root) finishes its job before all processors have sent their local results. I get the same situation when I use MPI_Reduce() (code part 2). However, if I use MPI_Scatter() before MPI_Reduce(), it is OK.
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#define VECTORSIZE 10000000
#define ROOT 0
//[[## operation ConstructVectorPart()
void ConstructVectorPart(double * vector, int start, int size, short vectorEnu)
{
int i = 0;
if(vectorEnu == 1) // i.e. vector 1
{
for(i = 0; i < size; i++)
{
vector[i] = 0.1 + ((start + i) % 20) * 0.1;
}
}
else if(vectorEnu == 2) // i.e. vector 2
{
for(i = 0; i < size; i++)
{
vector[i] = 2 - ((start + i) % 20) * 0.1;
}
}
}
//[[## operation dotproduct()
double dotproduct(double* a, double* b, int length)
{
double result = 0;
int i = 0;
for (i = 0; i<length; i++)
result += a[i] * b[i];
return result;
}
int main(int argc, char **argv)
{
int processorID, numofProcessors;
int partialVectorSize ;
double t1, t2, localDotProduct, result;
MPI_Init( &argc, &argv );
MPI_Comm_size( MPI_COMM_WORLD, &numofProcessors );
MPI_Comm_rank( MPI_COMM_WORLD, &processorID );
if(processorID == 0)
t1 = MPI_Wtime();
// all processors constitute their own vector parts and
// calculates corresponding partial dot products
partialVectorSize = VECTORSIZE/ numofProcessors;
double *v1, *v2;
v1 = (double*)(malloc((partialVectorSize) * sizeof(double)));
v2 = (double*)(malloc((partialVectorSize) * sizeof(double)));
ConstructVectorPart(v1,0,partialVectorSize,1);
ConstructVectorPart(v2,0,partialVectorSize,2);
localDotProduct = dotproduct(v1,v2, partialVectorSize);
printf(" I am processor %d \n",processorID);
//----------------- code part 1 ---------------------------------------------
if( processorID != 0 ) // if not a master
{ // send partial result to master
MPI_Send( &localDotProduct, 1, MPI_DOUBLE, 0,0, MPI_COMM_WORLD );
}
else // master
{ // collect results
result = localDotProduct; // own result
int j;
for( j=1; j<numofProcessors; ++j )
{
MPI_Recv( &localDotProduct, 1, MPI_DOUBLE, j, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE);
result += localDotProduct;
}
t2 = MPI_Wtime();
printf(" result = %f TimeConsumed = %f \n",result, t2-t1);
}
//----------------------------------------------------------------------------
/*
//--------------------- code part 2 ----------------
MPI_Reduce(&localDotProduct, &result, 1, MPI_DOUBLE, MPI_SUM, 0,MPI_COMM_WORLD);
if(processorID == 0)
{
t2 = MPI_Wtime();
printf(" result = %f TimeConsumed = %f \n",result, t2-t1);
}
//---------------------------------------------------
*/
MPI_Finalize();
free(v1);
free(v2);
return 0;
}
