Image Smoothing in MPI using Scatter and Gather - mpi

Hello I have a task in MPI. In this task I will implement a parallel image processing algorithm. This algorihm calculates the average of each pixel's value and its eight neighbors. You can think of an image as a 2-dimensional array of color values, that is, a matrix, the smoothing algorithm can be applied to all values of this matrix, that is, the pixels of an image.
The figure below shows the softening process of the midpoint and is the average of 8 neighbors after softening. 3x3 Smoothing (20+40+10+10+20+20+10+20+30)/9 = 20
My program applies a smoothing algorithm to the input image and then stores the results in a new image. So, I need to write a sequential program program.c that takes two inputs; first for the name of the input image and the second for the name of the output image. This section does not need to include any parallel processing.
I have two library for reading and writing: <stb_image.h> and <stb_image_write.h>
mpcc task.c -lm -o task_mpi
./task_mpi -n 2 input.jpg output.jpg
I tried to work on the program a bit, but I could not make fruitful progress.
Mycode:
#include <stdint.h>
#include <stdio.h>
#include <mpi.h>
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#define CHANNEL_NUM 1
int main(int argc,char* argv[]) {
int width, height, bpp, total,rgb;
MPI_Init(NULL,NULL);
int id ;
int size;
MPI_Comm_rank(MPI_COMM_WORLD,&id);
MPI_Comm_size(MPI_COMM_WORLD,&size);
int *img_mpi;
int local_image = (height/3)*width / size;
int* recv_buf = (int*) malloc(sizeof(int)*local_image);
int full_image[height*width];
// Reading the image
uint8_t* rgb_image = stbi_load(argv[1], &width, &height, &bpp, CHANNEL_NUM);
printf("Width: %d Height: %d \n",width,height);
for(int ii = 0; ii < height*width;ii++)
{
full_image[ii]=&rgb_image[ii];
}
MPI_Scatter(full_image, local_image ,MPI_INT, &recv_buf,local_image, MPI_INT, 1, MPI_COMM_WORLD);
if(id == 1)
{
for(int i=1;i<height;i++)
{
for(int j=1;j<width;j++)
{
total =
recv_buf,[(i-1)*width +(j-1)] +
recv_buf,[(i-1)*width +j] +
recv_buf,[(i-1)*width +(j+1)] +
recv_buf,[(i)*width +(j-1)] +
recv_buf,[i*width + j] +
recv_buf,[(i)*width +(j+1)] +
recv_buf,[(i+1)*width +(j-1)] +
recv_buf,[(i+1)*width +j] +
recv_buf,[(i+1)*width +(j+1)];
rgb = (total / 9);
recv_buf[i*width + j]= rgb;
}
}
}
MPI_Gather(&recv_buf, local_image ,MPI_INT, full_image, local_image, MPI_INT, 0, MPI_COMM_WORLD);
if(id == 0)
{
rgb_image = &recv_buf;
}
// Stoing the image
stbi_write_jpg(argv[2], width, height, CHANNEL_NUM, rgb_image, 100);
stbi_image_free(rgb_image);
MPI_Finalize();
return 0;
}

Related

How to print a number on with HT1632 only accepting text

I just bought a 8x32 lattice board (led matrix) and I control it with Arduino. The problem is that I can only use text with the library I got on github. But not numbers, how can I do it?
I'm going to put the code below, the code of the scrolling text and the part of the code in the library that specifies the function used to set the text.
The arduino code that program the scrolling text is here:
#include <HT1632.h>
#include <font_5x4.h>
#include <images.h>
int i = 0;
int wd;
char disp[] = "Hello, how are you?";
int x = 10;
void setup() {
HT1632.begin(A5, A4, A3);
wd = HT1632.getTextWidth(disp, FONT_5X4_END, FONT_5X4_HEIGHT);
}
void loop() {
HT1632.renderTarget(1);
HT1632.clear();
HT1632.drawText(disp, OUT_SIZE - i, 2, FONT_5X4, FONT_5X4_END,
FONT_5X4_HEIGHT);
HT1632.render();
i = (i + 1) % (wd + OUT_SIZE);
delay(100);
}
The library code that specifies the printing of the text is this:
void HT1632Class::drawText(const char text[], int x, int y, const byte font[],
int font_end[], uint8_t font_height,
uint8_t gutter_space) {
int curr_x = x;
char i = 0;
char currchar;
// Check if string is within y-bounds
if (y + font_height < 0 || y >= COM_SIZE)
return;
while (true) {
if (text[i] == '\0')
return;
currchar = text[i] - 32;
if (currchar >= 65 &&
currchar <=
90) // If character is lower-case, automatically make it upper-case
currchar -= 32; // Make this character uppercase.
if (currchar < 0 || currchar >= 64) { // If out of bounds, skip
++i;
continue; // Skip this character.
}
// Check to see if character is not too far right.
if (curr_x >= OUT_SIZE)
break; // Stop rendering - all other characters are no longer within the
// screen
// Check to see if character is not too far left.
int chr_width = getCharWidth(font_end, font_height, currchar);
if (curr_x + chr_width + gutter_space >= 0) {
drawImage(font, chr_width, font_height, curr_x, y,
getCharOffset(font_end, currchar));
// Draw the gutter space
for (char j = 0; j < gutter_space; ++j)
drawImage(font, 1, font_height, curr_x + chr_width + j, y, 0);
}
curr_x += chr_width + gutter_space;
++i;
}
}
You need to look at snprintf. This allows you to format a string of characters just like printf. It allows you to convert something like a int into a part of a string.
an example:
int hour = 10;
int minutes = 50;
char buffer[60];
int status = snprintf(buffer, 60, "the current time is: %i:%i\n", hour, minutes);
buffer now contains:"the current time is: 10:50" (and several empty characters past the \0).

MPI Send and receive a pointer in MPI_Type_struct

I want to send a set of data with the MPI_Type_struct and one of them is a pointer to an array (because the matrices that I'm going to use are going to be very large and I need to do malloc). The problem I see is that all the data is passed correctly except the matrix. I know that it is possible to pass a matrix through the pointer since if I only send the pointer of the matrix, correct results are observed.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
void main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
int size, rank;
int m,n;
m=n=2;
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
typedef struct estruct
{
float *array;
int sizeM, sizeK, sizeN, rank_or;
} ;
struct estruct kernel, server;
MPI_Datatype types[5] = {MPI_FLOAT, MPI_INT,MPI_INT,MPI_INT,MPI_INT};
MPI_Datatype newtype;
int lengths[5] = {n*m,1,1,1,1};
MPI_Aint displacements[5];
displacements[0] = (size_t) & (kernel.array[0]) - (size_t)&kernel;
displacements[1] = (size_t) & (kernel.sizeM) - (size_t)&kernel;
displacements[2] = (size_t) & (kernel.sizeK) - (size_t)&kernel;
displacements[3] = (size_t) & (kernel.sizeN) - (size_t)&kernel;
displacements[4] = (size_t) & (kernel.rank_or) - (size_t)&kernel;
MPI_Type_struct(5, lengths, displacements, types, &newtype);
MPI_Type_commit(&newtype);
if (rank == 0)
{
kernel.array = (float *)malloc(m * n * sizeof(float));
for(int i = 0; i < m*n; i++) kernel.array[i] = i;
kernel.sizeM = 5;
kernel.sizeK = 5;
kernel.sizeN = 5;
kernel.rank_or = 5;
MPI_Send(&kernel, 1, newtype, 1, 0, MPI_COMM_WORLD);
}
else
{
server.array = (float *)malloc(m * n * sizeof(float));
MPI_Recv(&server, 1, newtype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("%i \n", server.sizeM);
printf("%i \n", server.sizeK);
printf("%i \n", server.sizeN);
printf("%i \n", server.rank_or);
for(int i = 0; i < m*n; i++) printf("%f\n",server.array[i]);
}
MPI_Finalize();
}
Assuming that only two processes are executed,I expect that process with rank = 1 receive and display the correct elements of the matrix on the screen (the other elements are well received), but the actual output is:
5
5
5
5
0.065004
0.000000
0.000000
0.000000
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 26206 RUNNING AT pmul
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions
I hope someone can help me.

C code translated to Qt crashes while executing

I´m a beginner on programming and Qt, but as liked the framework I´m trying to improve my skills and write my C++ codes on it. I got a task of writing a Ricker wavelet code and then plot it.
I divided it in two tasks, first make the ricker code works, and when it is running, then implement a way to plot it, I will use qcustomplot for it.
I got a code from C and I´m trying to adapt it to Qt. Although it doesn´t give any errors during compilation, when executing it crashes, with the following message:
Invalid parameter passed to C runtime function. C:/Users/Flavio/Documents/qtTest/build-ricker2-Desktop_Qt_5_11_0_MinGW_32bit-Debug/debug/ricker2.exe
exited with code 255
The code I´m supposed to translate is:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
float *rickerwavelet(float fpeak, float dt, int *nwricker);
int main(int argc, char **argv)
{
int i;
float dt;
float fpeak;
float *wricker=NULL;
int nwricker;
fpeak = atof(argv[1]);
dt = atof(argv[2]);
wricker = rickerwavelet(fpeak, dt, &nwricker);
/* show value of ricker wavelets */
for (i=0; i<nwricker; i++)
printf("%i. %3.5f \n", i, wricker[i]);
free(wricker);
return(1);
}
/* ricker wavelet function, return an array ricker wavelets */
float *rickerwavelet(float fpeak, float dt, int *nwricker)
{
int i, k;
int nw;
int nc;
float pi;
float nw1, alpha, beta;
float *wricker=NULL;
pi = 3.141592653589793;
nw1 = 2.2/fpeak/dt;
nw = 2*floor(nw1/2)+1;
nc = floor(nw/2);
wricker = (float*) calloc (nw, sizeof(float));
for (i=0; i<nw; i++)
{
k = i+1;
alpha = (nc-k+1)fpeakdtpi;
beta = pow(alpha, 2.0);
wricker[i] = (1 - (beta2)) * exp(-beta);
}
(*nwricker) = nw;
return(wricker);
}
The code i wrote on Qt is:
#include <QCoreApplication>
#include <qmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <QDebug>
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
int i,k,nw,nc;
double *wricker=NULL;
int nwricker = 60;
int wavelet_freq = 30;
int polarity=1;
int sampling_rate=0.004;
float nw1, alpha, beta;
const double pi = 3.141592653589793238460;
nw1 = 2.2/wavelet_freq/sampling_rate;
nw = 2*floor(nw1/2)+1;
nc = floor(nw/2);
wricker = (double*)calloc (nw, sizeof(double));
for (i=0; i<nw; i++)
{
k = i+1;
alpha = (nc-k+1)wavelet_freqsampling_ratepi;
beta = pow(alpha, 2.0);
wricker[i] = polarity((1 - (beta2)) * exp(-beta));
};
/* show value of ricker wavelets */
for (i=0; i<nwricker; i++)
{
qDebug()<<i<<wricker[i];
};
free(wricker);
return a.exec();
}
Analytic expression
The amplitude A of the Ricker wavelet with peak frequency f at time t is computed like so:
A = (1-2 pi^2 f^2* t^2) e^{-pi^2* f^2* t^2}
A py code for it would be:
import numpy as np
import matplotlib.pyplot as plt
def ricker(f, length=0.128, dt=0.001):
t = np.arange(-length/2, (length-dt)/2, dt)
y = (1.0 - 2.0*(np.pi2)(f2)(t2)) * np.exp(-(np.pi2)(f2)(t**2))
return t, y
f = 25 # A low wavelength of 25 Hz
t, w = ricker(f)
What seems quite simple.
Does anyone have any idea what is wrong in my code???
Doing a bit of Debugging I found the problem is when passing the vectors to qDebug, it give a message:
THE INFERIOR STOPPED BECAUSE IT RECEIVED A SIGNAL FROM THE OPERATING
SYSTEM . SIGNAL NAME: SIGSEGV SIGNAL MEANING: SEGMENTATION FAULT
I´ll search for more information on this signal meaning. I used qDebug only with the intention of showing the data on a terminal to make sure the calculation was working.
Thanks in advance.
C++ is much more like Python than C. Even though your C code was particularly convoluted, it still isn't as nice a C++ can be.
A complete example that even plots the data can be very, very simple. If that doesn't show the combined power of C++ and Qt, I hardly know what will :)
wavelet-plot-50690312.pro file
QT = charts
SOURCES = main.cpp
main.cpp
// https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312
#include <QtCharts>
#include <cmath>
const double pi = 3.14159265358979323846;
QVector<QPointF> ricker(double f, double length = 2.0, double dt = 0.001) {
size_t N = (length - dt/2.0)/dt;
QVector<QPointF> w(N);
for (size_t i = 0; i < N; ++i) {
double t = -length/2 + i*dt;
w[i].setX(t);
w[i].setY((1.0 - 2*pi*pi*f*f*t*t) * exp(-pi*pi*f*f*t*t));
}
return w;
}
QLineSeries *rickerSeries(double f) {
auto *series = new QLineSeries;
series->setName(QStringLiteral("Ricker Wavelet for f=%1").arg(f, 2));
series->replace(ricker(f));
return series;
}
int main(int argc, char *argv[]) {
QApplication app(argc, argv);
QChartView view;
view.chart()->addSeries(rickerSeries(1.0));
view.chart()->addSeries(rickerSeries(2.0));
view.chart()->createDefaultAxes();
view.setMinimumSize(800, 600);
view.show();
return app.exec();
}
Of course, this looks nice in C++. How about C? Let's pretend we had some C binding for Qt. Then it might look as follows:
// https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312/c-binding
#include "qc_binding.h"
#include <math.h>
#include <stddef.h>
#include <stdio.h>
const double pi = 3.14159265358979323846;
CQVector *ricker(double f, double length, double dt) {
scope_enter();
size_t N = (length - dt/2.0)/dt;
CQVector *vector = CQVector_size_(CQ_, CQPointF_type(), N);
CQPointF *const points = CQPointF_(CQVector_data_at(vector, 0));
for (size_t i = 0; i < N; ++i) {
double t = -length/2 + i*dt;
points[i].x = t;
points[i].y = (1.0 - 2*pi*pi*f*f*t*t) * exp(-pi*pi*f*f*t*t);
}
return scope_leave_ptr(vector);
}
CQLineSeries *rickerSeries(double f) {
scope_enter();
CQLineSeries *series = CQLineSeries_(CQ_);
CQLineSeries_setName(series, CQString_asprintf(CQ_, "Ricker Wavelet for f=%.2f", f));
CQLineSeries_replaceVector(series, ricker(f, 2.0, 0.001));
return scope_leave_ptr(series);
}
int main(int argc, char *argv[]) {
scope_enter();
CQApplication *app = CQApplication_(CQ_, &argc, argv);
CQChartView *view = CQChartView_(CQ_);
CQChart *chart = CQChartView_getChart(view);
CQChart_addLineSeries(chart, rickerSeries(1.0));
CQChart_addLineSeries(chart, rickerSeries(2.0));
CQChart_createDefaultAxes(chart);
CQWidget *view_ = CQWidget_cast(view);
CQWidget_setMinimumSize(view_, 800, 600);
CQWidget_show(view_);
return scope_leave_int(CQApplication_exec(app));
}
With a little bit of work, a C binding can be made that is about as easy to use as C++: scopes are tracked, RAII works, destructors get called when needed, values about to be returned are not destructed, etc.
Such a minimum binding, implementing all that's needed to run the code above, is available at https://github.com/KubaO/stackoverflown/tree/master/questions/wavelet-plot-50690312/c-binding.

CUDA two dimensional array [duplicate]

This question already has answers here:
Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D
(3 answers)
Closed 5 years ago.
I'm trying to allocate matrix on device, fill it with some number in kernel and then copy it back to host. Problem is that on host only one row seems to be filled.
I got something like this:
9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1
Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
tab[col * pitch + row] = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
h_tab = (int*)malloc(realSize);
check( cudaMemset(d_tab, 0, realSize) );
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );
showMatrix2(h_tab, width, height);
printf("\nPitch size: %d \n", pitch);
getchar();
return 0;
}
Any time you are having trouble with a CUDA code, in addition to doing error checking, run your code with cuda-memcheck. If you had done so, you would have gotten at least a hint as to what is going on, and then you could use techniques like this to continue your own debug. Even if you can't figure it out, the cuda-memcheck output will be useful to others trying to help you.
You have invalid writes in your kernel. There are multiple errors here. To properly access a pitched allocation in kernel code, I strongly recommend studying the example given in the documentation for cudaMallocPitch. In a nutshell, this kind of index generation is just broken:
tab[col * pitch + row]
Firstly, pitch returned by cudaMallocPitch is a width in bytes. You cannot use it as an adjustment to an index for quantities like int or float (study the documentation). Secondly, the pitch value should ultimately multiply a row index, not a column index.
not related to your problem, but your final printf statement has an incorrect format specifier if you are on a 64-bit platform, it should be %ld (or better, %lu).
Here is a code that has the indexing issue fixed, it seems to work correctly for me:
$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
*( ((int *)(((char *)tab) + (row * pitch))) + col) = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
h_tab = (int*)malloc(realSize);
check( cudaMemset(d_tab, 0, realSize) );
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );
showMatrix2(h_tab, width, height);
printf("\nPitch size: %ld \n", pitch);
return 0;
}
$ nvcc -arch=sm_61 -o t109 t109.cu
$ cuda-memcheck ./t109
========= CUDA-MEMCHECK
no error
no error
no error
---------------------
9 9 9 9
9 9 9 9
9 9 9 9
9 9 9 9
Pitch size: 512
========= ERROR SUMMARY: 0 errors
$

Open MPI's MPI_reduce not combining array sums

I am very new to Open MPI. I have made a small program that computes the sum of an array, by splitting array into pieces equal to the number of processes. The problem in my program is that each process is computing right sum of its share of the array, but the individually computed sums are not summed by MPI_reduce function. I tried my best to solve and also consulted the Open MPI manual, but there is still something that I might be missing. I would be grateful for any kind of guidance. Below is the program I made:
#include "mpi.h"
#include <stdio.h>
int main(int argc, char *argv[])
{
int n, rank, nrofProcs, i;
int sum, ans;
// 0,1,2, 3,4,5, 6,7,8, 9
int myarr[] = {1,5,9, 2,8,3, 7,4,6, 10};
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nrofProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
n = 10;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
sum = 0.0;
int remaining = n % nrofProcs;
int lower =rank*(n/nrofProcs);
int upper = (lower+(n/nrofProcs))-1;
for (i = lower; i <= upper; i++)
{
sum = sum + myarr[i];
}
if(rank==nrofProcs-1)
{
while(i<=remaining)
{
sum = sum + myarr[i];
i++;
}
}
/* (PROBLEM IS HERE, IT IS NOT COMBINING "sums") */
MPI_Reduce(&sum, &ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
// if (rank == 0)
printf( "rank: %d, Sum ans: %d\n", rank, sum);
/* shut down MPI */
MPI_Finalize();
return 0;
}
Output:
rank: 2, Sum ans: 17
rank: 1, Sum ans: 13
rank: 0, Sum ans: 15
(Output should be rank: 0, Sum ans: 55)
Sorry, I made some mistakes, that I corrected after running parallel debugging on my program. Here I am sharing code to split an array of length N on M processes, where N and M can have any value:
/*
An MPI program split an array of length N on M processes, where N and M can have any value
*/
#include <math.h>
#include "mpi.h"
#include <iostream>
#include <vector>
using namespace std;
int main(int argc, char *argv[])
{
int n, rank, nrofProcs, i;
int sum, ans;
// 0,1,2, 3,4,5, 6,7,8, 9, 10
int myarr[] = {1,5,9, 2,8,3, 7,4,6,11,10};
vector<int> myvec (myarr, myarr + sizeof(myarr) / sizeof(int) );
n = myvec.size(); // number of items in array
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nrofProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
sum = 0.0;
int remaining = n % nrofProcs;
int lower =rank*(n/nrofProcs);
int upper = (lower+(n/nrofProcs))-1;
for (i = lower; i <= upper; i++)
{
sum = sum + myvec[i];
}
if(rank==nrofProcs-1)
{
int ctr=0;
while(ctr<remaining)
{
sum = sum + myvec[i];
ctr++;
i++;
}
}
/* combine everyone's calculations */
MPI_Reduce(&sum, &ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
cout << "rank: " <<rank << " Sum ans: " << ans<< endl;
/* shut down MPI */
MPI_Finalize();
return 0;
}

Resources