I'm trying to use one-sided communications in MPI.
The following example consists of an array of 4 doubles that is split between 2 processes.
The first process writes 0, 1, 2, 3 in the distributed array while the second one subsequently tries to read it. Unfortunately, it doesn't work. I must be doing something wrong somewhere.
#include <mpi.h>
#include <iostream>
// Example program from the question: every rank exposes n doubles through an MPI
// RMA window; the n*size-element "distributed array" is then written with MPI_Put
// and read back with MPI_Get, addressing element i as (rank i/n, displacement i%n).
// NOTE(review): the closing braces of both loops and of main, and any rank checks
// around the loops, are not present in this excerpt.
int main(){
MPI_Init(0, nullptr);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// Number of elements owned by each rank.
int n=2;
// data is a pointer to double; x is a plain double used as the transfer buffer.
double* data, x;
MPI_Win window;
// The local buffer is allocated as n doubles ...
MPI_Alloc_mem(n*sizeof(double), MPI_INFO_NULL, &data);
// ... but the window is created with a size and displacement unit based on
// sizeof(float), which does not match the double elements stored in data.
MPI_Win_create(data, n*sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &window);
int i;
MPI_Win_fence(0, window);
for(i=0; i<n*size; ++i){
// x is not assigned anywhere in the visible lines before it is put.
MPI_Put(&x, 1, MPI_DOUBLE, i/n, i%n, 1, MPI_DOUBLE, window);
MPI_Win_fence(0, window);
MPI_Win_fence(0, window);
for(i=0; i<n*size; ++i){
MPI_Get(&x, 1, MPI_DOUBLE, i/n, i%n, 1, MPI_DOUBLE, window);
// NOTE(review): x is printed right after MPI_Get with no fence in between in the
// visible lines; an RMA get is only guaranteed complete after the next
// synchronization call on the window.
std::cout << i << " " << i/n << " " << i%n << " => " << x << "\n";
return 0;
I'm trying to send information from one process to the next around a ring, starting from an offset process, using MPI_Sendrecv, but I get a deadlock. What is wrong in my code? I need to use MPI_Sendrecv to solve this kind of problem.
#include <stdio.h>
#include <unistd.h>
#include <mpi.h>
/* Example program from the question: ranks are arranged in a ring (next/prev are
 * computed modulo size) and each rank passes sendval = value + rank to its
 * successor. The rank equal to `offset` uses MPI_Sendrecv; the remaining calls are
 * a plain MPI_Recv followed by MPI_Send.
 * NOTE(review): braces (and presumably an `else`) are not present in this excerpt,
 * so which statements belong to which branch cannot be read off the text. */
int main (int argc, char *argv[])
/* Rank that starts the ring; it only exists when at least 10 processes run. */
int offset = 9;
int size, rank, value, next, prev, sendval, recval, namelen;
/* t0 and t are not used in the visible lines. */
double t0, t;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name, &namelen);
value = 5;
if (size > 1)
/* Ring neighbours; adding size keeps prev non-negative for rank 0. */
next = (rank + 1)% size;
prev = (size+rank - 1)% size;
sendval = value + rank;
if (rank == offset)
/* The send half uses tag 1 while the receive half uses tag 10. */
MPI_Sendrecv(&sendval, 1, MPI_INT, next, 1, &recval, 1, MPI_INT, prev, 10, MPI_COMM_WORLD, &status);
/* These two calls both use tag 10, so the tag-1 message sent above has no
 * matching receive among the visible calls. */
MPI_Recv(&recval, 1, MPI_INT, prev, 10, MPI_COMM_WORLD, &status);
MPI_Send(&sendval, 1, MPI_INT, next, 10, MPI_COMM_WORLD);
return 0;
You have mismatched message tags:
/* The send half of MPI_Sendrecv uses tag 1 (the argument right after `next`): */
MPI_Sendrecv(&sendval, 1, MPI_INT, next, 1, &recval, 1, MPI_INT, prev, 10, MPI_COMM_WORLD, &status);
/* ... while the receive posted by the other ranks expects tag 10 (the argument
 * right after `prev`): */
MPI_Recv(&recval, 1, MPI_INT, prev, 10, MPI_COMM_WORLD, &status);
The tag in the send part of the send-receive operation should also be 10.
I need to send array pieces to all processes using MPI_Scatter then to get sum of all elements. Where should I initialize array then to scatter it? In root rank?
If I initialize the array only on the root rank, the other ranks don't get their data. Otherwise I can initialize the array on every rank (outside the if (rank == root) ... else), but that means I create the array several times.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <iostream>
#include <time.h>
using namespace std;
// Example program from the question: rank 0 generates arr_size random integers,
// MPI_Scatter hands `block` of them to each rank, every rank sums its part and
// MPI_Reduce adds the partial sums on rank 0.
// NOTE(review): brace-only lines are not present in this excerpt, so the extent of
// the `if (rank == 0)` branch and of the loops cannot be read off the text.
int main(int argc, char* argv[])
int size;
int rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Two elements per rank: block always evaluates to 2.
int arr_size = size * 2;
int block = arr_size / (size);
// Per-rank receive buffer for the scattered part.
int* B = new int[block];
if (rank == 0)
// A is declared inside the rank-0 branch, so it is not visible outside of it.
int* A = new int[arr_size];
cout << "generated array: " << endl;
for (int i = 0; i < arr_size; i++)
A[i] = rand() % 100;
cout << A[i] << " ";
cout << endl;
// NOTE(review): because A only exists in the rank-0 branch, this call presumably
// sat inside that branch as well; the output quoted after the program (only
// rank 0 shows real data) is consistent with that.
MPI_Scatter(A, block, MPI_INT, B, block, MPI_INT, 0, MPI_COMM_WORLD);
cout << "process " << rank << " received: " << endl;
for (int i = 0; i < block; i++)
cout << B[i] << " ";
cout << endl;
int local_sum = 0;
for (int i = 0; i < block; i++)
local_sum += B[i];
cout << "sum in process " << rank << " = " << local_sum << endl;
cout << endl;
// Only meaningful on rank 0 (the root of the reduction).
int global_sum;
MPI_Reduce(&local_sum, &global_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
cout << "sum = " << global_sum << endl;
return 0;
I get something like this (only root rank got its data):
process 1 received:
process 3 received:
-842150451 -842150451
-842150451 -842150451
sum in process 1 = -1684300902
sum in process 3 = -1684300902
process 2 received:
-842150451 -842150451
sum in process 2 = -1684300902
process 0 received:
4 9
sum in process 0 = 13
sum = -757935397
MPI_Scatter() is a collective operation and must hence be invoked by all the ranks.
Declare int *A = NULL; on all ranks and only allocate and populate on rank zero.
// The send buffer is only significant on the root, so every rank declares A but
// only rank 0 allocates and fills it; the other ranks pass NULL.
int* A = NULL;
// Per-rank receive buffer.
int* B = new int[block];
if (rank == 0)
{
    A = new int[arr_size];
    cout << "generated array: " << endl;
    for (int i = 0; i < arr_size; i++)
    {
        A[i] = rand() % 100;
        cout << A[i] << " ";
    }
    cout << endl;
}
// Collective call: it must be reached by all ranks, hence outside the rank-0 branch.
MPI_Scatter(A, block, MPI_INT, B, block, MPI_INT, 0, MPI_COMM_WORLD);
I'm exploring MPI in C++ and I wanted to parallelize the creation of a picture of the Mandelbrot set. I'm using the ppm format. Each processor builds its part and sends it back to the main process that receives it as MPI_CHAR. This is the code:
#include "mpi.h"
#include <iostream>
#include <string>
#include <fstream>
#include <complex>
using namespace std;
/// @brief Escape-time iteration count for one pixel of the Mandelbrot image.
///
/// The pixel is mapped to the complex plane with a scale of 4/width per pixel,
/// centred on (height/2, width/2); note that y selects the real part and x the
/// imaginary part.
///
/// @param x       pixel coordinate mapped to the imaginary axis
/// @param y       pixel coordinate mapped to the real axis
/// @param width   image width in pixels (also sets the scale of both axes)
/// @param height  image height in pixels
/// @param max     iteration cap
/// @return number of iterations performed before |z| reached 4, or `max` when the
///         cap was hit first; 0 when `max` is not positive.
int mandelbrot(int x, int y, int width, int height, int max) {
    const std::complex<float> point(
        static_cast<float>(y - height / 2.0) * 4.0 / width,
        static_cast<float>(x - width / 2.0) * 4.0 / width);
    std::complex<float> z(0, 0);
    // Signed counter: it is compared with the signed cap and returned as int.
    int iteration = 0;
    while (std::abs(z) < 4 && iteration < max) {
        z = z * z + point;
        // Without this increment the loop never ends for points that do not escape
        // and reports 0 for every point that does.
        ++iteration;
    }
    return iteration;
}
// Example program from the question: rank 0 writes the PPM header and then appends
// the text chunk produced by each worker rank; every other rank renders a band of
// `part` rows into a stringstream and sends the text to rank 0 as MPI_CHAR.
// NOTE(review): several lines are not present in this excerpt: the closing braces,
// the MPI_Init / MPI_Comm_rank / MPI_Comm_size calls that would set myid and
// numprocs, and the receive that would fill buff.
int main(int argc, char **argv) {
int numprocs;
int myid;
int buff_size = 404270; // 200x200
// Fixed-size receive buffer; its length is a runtime variable, i.e. a
// variable-length array, which is a compiler extension in C++.
char buff[buff_size];
int i;
MPI_Status stat;
int width = 200, height = 200, max_iter = 1000;
if (myid == 0) {
ofstream image("mandel.ppm");
image << "P3\n" << width << " " << height << " 255\n";
// Workers are handled in rank order 1 .. numprocs-1.
for(i=1; i < numprocs; i++) {
MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
int length;
MPI_Get_count(&stat, MPI_CHAR, &length);
// buff is streamed as a C string, so output stops at the first '\0' found in it;
// nothing in the visible lines terminates the data at `length`.
image << buff;
} else {
// NOTE(review): stringstream is declared in <sstream>, which is not among the
// includes listed before this program.
stringstream ss;
// proc rank: 1, 2, ..., n
// Rows are split between the numprocs-1 workers; rows left over by the integer
// division are not assigned to any rank.
int part = height/(numprocs-1), start = (myid - 1) * part, end = part * myid;
printf("%d -> %d\n", start, end);
for (int row = start; row < end; row++) {
for (int col = 0; col < width; col++) {
int iteration = mandelbrot(row, col, width, height, max_iter);
// The first row of every band is painted white, which marks the band boundaries.
if (row == start) ss << 255 << ' ' << 255 << ' ' << 255 << "\n";
// NOTE(review): these products can exceed the maximum colour value 255 declared
// in the header.
else if (iteration < max_iter) ss << iteration * 255 << ' ' << iteration * 20 << ' ' << iteration * 5 << "\n";
else ss << 0 << ' ' << 0 << ' ' << 0 << "\n";
// NOTE(review): length() returns a size_t, which does not match %d.
printf("\n sizeof = %d\n", ss.str().length());
// The text is sent without a terminating '\0' (length() characters).
MPI_Send(ss.str().c_str(), ss.str().length(), MPI_CHAR, 0, 0, MPI_COMM_WORLD);
return 0;
Code compilation:
$ mpic++ -std=c++0x mpi.mandel.cpp -o mpi.mandel
Running with 3 processes (process main + process rank 1 and 2)
$ mpirun -np 3 ./mpi.mandel
Resulting ppm pictures when running with 3, 4, and 5 process:
It seems that the point-to-point send/receive communication mixes up the results when more than 3 processes try to send their MPI_CHAR elements to the main process. How can I avoid this behavior?
It works when creating the buffer buff with the same length as the receiving message:
// Rank 0 collects one text chunk from each worker, in rank order, and appends it
// to the image file.
for (int i = 1; i < numprocs; i++) {
    // Probe first so that the receive buffer can be sized to the incoming message.
    MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
    int length;
    MPI_Get_count(&stat, MPI_CHAR, &length);
    printf("\nfrom %d <<-- %d (stat.source=%d) Receiving %d chars\n", myid, i, stat.MPI_SOURCE, length);
    // One extra byte for the terminator. (A runtime-sized array is a compiler
    // extension in C++.)
    char buff[length + 1];
    // Receive the message that was just probed; without this call buff is
    // never filled and the stream below would print indeterminate bytes.
    MPI_Recv(buff, length, MPI_CHAR, i, 0, MPI_COMM_WORLD, &stat);
    // Terminate exactly at the received length so that streaming stops there.
    buff[length] = '\0';
    image << buff;
}
Thus, we no longer need the declarations at the beginning: neither int buff_size = 404270; nor char buff[buff_size];
How can I read an external input file in an MPI program? I need to read one integer from an external file (zadanie4_vstup.txt) to compute a simple factorial. I tried substituting the second argument of MPI_Init() with the address of an int variable (n), but that appears to be nonsense.
Thank you.
#include <stdio.h>
#include <mpi.h>
/* Example program from the question: n is read from zadanie4_vstup.txt, every rank
 * multiplies its own slice of 1..n into fakt, the partial products are sent to
 * rank 0, and rank 0 writes the result to zadanie4_vystup.txt.
 * NOTE(review): brace-only lines (and possibly other statements) are not present
 * in this excerpt. */
int main(int argc, char ** argv)
FILE *fr, *fw;
/* Both files are opened by every rank, before MPI_Init; the results of fopen and
 * fscanf are not checked and no fclose appears in the visible lines. */
fr = fopen("zadanie4_vstup.txt", "r");
fw = fopen("zadanie4_vystup.txt", "w");
int nproc, me;
int fakt=1, i, buff, n;
MPI_Status stat;
fscanf(fr, "%d", &n);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
/* Rank `me` handles the factors me*n/nproc+1 .. (me+1)*n/nproc. */
#pragma omp parallel for private(i) reduction(*:fakt)
for(i=me*n/nproc+1; i<=(me+1)*n/nproc; i++) {
fakt *= i;
if(nproc > 1) {
if(me == 0) {
for(i=1; i<nproc; i++) {
/* NOTE(review): the received partial product lands in buff, but no visible line
 * multiplies it into fakt. */
MPI_Recv(&buff, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &stat);
} else {
MPI_Send(&fakt, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
if(me == 0) {
fprintf(fw, "%d! = %d\n", n, fakt);
Here is a version of your program that reads n from the command line.
Note that I simplified the communication by using MPI_Reduce().
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
/*
 * Computes n! in parallel: n is taken from the first command-line argument, every
 * rank multiplies its own slice of 1..n, and MPI_Reduce with MPI_PROD combines the
 * partial products on rank 0, which prints the result.
 *
 * Usage: mpirun -np <procs> ./fakt <n>
 * Returns 0 on success and 1 when the argument is missing.
 * The product is an int, so it overflows for n > 12.
 */
int main(int argc, char *argv[]) {
    int nproc, me;
    /* res is only significant on rank 0 after the reduction. */
    int fakt = 1, res = 1, i, n;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    /* argv[1] must exist before it is handed to atoi(). */
    if (argc < 2) {
        if (me == 0) {
            fprintf(stderr, "usage: %s n\n", argv[0]);
        }
        MPI_Finalize();
        return 1;
    }
    n = atoi(argv[1]);

    /* Rank `me` handles the factors me*n/nproc+1 .. (me+1)*n/nproc. */
    #pragma omp parallel for private(i) reduction(*:fakt)
    for (i = me*n/nproc + 1; i <= (me+1)*n/nproc; i++) {
        fakt *= i;
    }

    MPI_Reduce(&fakt, &res, 1, MPI_INT, MPI_PROD, 0, MPI_COMM_WORLD);
    if (me == 0) {
        printf("%d! = %d\n", n, res);
    }

    /* Every MPI program has to shut the library down before exiting. */
    MPI_Finalize();
    return 0;
}
for example
$ mpirun -np 4 ./fakt 6
6! = 720
I would appreciate it if somebody could tell me why this simple MPI send and receive code doesn't run on two processors when n=40 (at line 20), but works for n <= 30. In other words, if the message size goes beyond a specific number (which is not that large, roughly a 1-D array of size 8100), MPI deadlocks.
#include "mpi.h"
#include "stdio.h"
#include "stdlib.h"
#include "iostream"
#include "math.h"
using namespace std;
// Example program from the question: ranks 0 and 1 each send a buffer of
// n*n*N_p0+1 doubles to the other with a blocking MPI_Send, and a blocking
// MPI_Recv follows.
// NOTE(review): brace-only lines are not present in this excerpt.
int main(int argc, char *argv[])
int processor_count, processor_rank;
double *buff_H, *buff_send_H;
int N_pa_prim1, l, n, N_p0;
MPI_Status status;
MPI_Init (&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank);
// With n=40 and N_p0=7 each message holds 40*40*7+1 = 11201 doubles;
// N_pa_prim1 and l are not used in the visible lines.
N_pa_prim1=14; l=7; n=40; N_p0=7;
buff_H = new double [n*n*N_p0+1]; //Receive buffer allocation
buff_send_H = new double [n*n*N_p0+1]; //Send buffer allocation
for (int j = 0; j < n*n*N_p0+1; j++)
buff_send_H[j] = 1e-8*rand();
// Both ranks issue their blocking send before any receive is posted.
if (processor_rank == 0)
MPI_Send(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 1, 163, MPI_COMM_WORLD);
else if(processor_rank == 1)
MPI_Send(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 0, 163, MPI_COMM_WORLD);
// The receive comes after the sends in program order.
MPI_Recv(buff_H, n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163, MPI_COMM_WORLD, &status);
cout << "Received successfully by " << processor_rank << endl;
return 0;
The deadlocking is correct behaviour; you have a deadlock in your code.
The MPI Specification allows MPI_Send to behave as MPI_Ssend -- that is, to be blocking. A blocking communications primitive does not return until the communications "have completed" in some sense, which (in the case of a blocking send) probably means the receive has started.
Your code looks like:
If Processor 0:
    Send to processor 1
If Processor 1:
    Send to processor 0
Receive
That is -- the receive doesn't start until the sends have completed. You're sending, but the sends will never return, because no one is receiving! (The fact that this works for small messages is an implementation artifact - most MPI implementations use a so-called "eager protocol" for "small enough" messages; but this can't be counted upon in general.)
Note that there are other logic errors here, too -- this program will also deadlock for more than 2 processors, as processors of rank >= 2 will be waiting for a message which never comes.
You can fix your program by alternating sends and receives by rank:
// Order the calls by rank so that every blocking send meets a posted receive:
// rank 0 sends first and then receives, rank 1 receives first and then sends.
if (processor_rank == 0) {
    MPI_Send(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 1, 163, MPI_COMM_WORLD);
    MPI_Recv(buff_H, n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163, MPI_COMM_WORLD, &status);
} else if (processor_rank == 1) {
    MPI_Recv(buff_H, n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163, MPI_COMM_WORLD, &status);
    MPI_Send(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 0, 163, MPI_COMM_WORLD);
}
or by using MPI_Sendrecv (which is a blocking (send + receive), rather than a blocking send + a blocking receive):
// Partner rank for the exchange; it stays -1 on ranks that do not take part, so it
// never holds an indeterminate value.
int sendto = -1;
if (processor_rank == 0 || processor_rank == 1) {
    // Ranks 0 and 1 exchange with each other: 0 -> 1 and 1 -> 0.
    sendto = 1 - processor_rank;
    // Combined send and receive: MPI schedules both halves, so the two ranks
    // cannot block each other.
    MPI_Sendrecv(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, sendto, 163,
                 buff_H, n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163,
                 MPI_COMM_WORLD, &status);
}
Or by using non-blocking sends and receives:
// reqs[0] tracks the send and reqs[1] the receive.
MPI_Request reqs[2];
MPI_Status statuses[2];
// Only ranks 0 and 1 communicate. The wait stays inside this guard because any
// other rank never starts a request, so its reqs would be uninitialized.
if (processor_rank == 0 || processor_rank == 1) {
    if (processor_rank == 0) {
        MPI_Isend(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 1, 163, MPI_COMM_WORLD, &reqs[0]);
    } else {
        MPI_Isend(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 0, 163, MPI_COMM_WORLD, &reqs[0]);
    }
    MPI_Irecv(buff_H, n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163, MPI_COMM_WORLD, &reqs[1]);
    // Neither buffer may be touched until both operations have completed.
    MPI_Waitall(2, reqs, statuses);
}
Thank you Jonathan for your help. Here I have chosen the third solution and written a similar code to yours except adding "for" loops to send a number of messages. This time it doesn't deadlock; however processors keep on receiving only the last message. (since the messages are long, I've only printed their last elements to check the consistency)
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
using namespace std;
// Follow-up program from the question: ranks 0 and 1 each start N_pa_prim1
// non-blocking sends to the other, N_pa_prim1 non-blocking receives are posted,
// all requests are waited on, and the last element of every received message is
// printed.
// NOTE(review): brace-only lines are not present in this excerpt.
int main(int argc, char *argv[])
int processor_count, processor_rank;
//Initialize MPI
MPI_Init (&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &processor_count);
MPI_Comm_rank (MPI_COMM_WORLD, &processor_rank);
double **buff_H, *buff_send_H;
// l, count and temp are not used in the visible lines.
int N_pa_prim1, l, n, N_p0, count, temp;
N_pa_prim1=5; l=7; n=50; N_p0=7;
// Both arrays have N_pa_prim1 entries, yet the receives below store requests at
// indices N_pa_prim1 .. 2*N_pa_prim1-1 and MPI_Waitall is told to wait on
// 2*N_pa_prim1 entries, i.e. past the end of both arrays.
MPI_Request reqs[N_pa_prim1];
MPI_Status statuses[N_pa_prim1];
buff_H = new double *[N_pa_prim1]; //Receive buffer allocation
for (int i = 0; i < N_pa_prim1; i++)
buff_H[i] = new double [n*n*N_p0+1];
// A single send buffer is shared by all N_pa_prim1 sends of a rank.
buff_send_H = new double [n*n*N_p0+1]; //Send buffer allocation
if (processor_rank == 0) {
for (int i = 0; i < N_pa_prim1; i++){
// The shared buffer is refilled on every iteration although no wait on the
// previous MPI_Isend has happened yet; a send buffer must not be modified until
// its request completes.
for (int j = 0; j < n*n*N_p0+1; j++)
buff_send_H[j] = 2.0325e-8*rand();
cout << processor_rank << "\t" << buff_send_H[n*n*N_p0] << "\t" << "Send" << "\t" << endl;
MPI_Isend(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 1, 163, MPI_COMM_WORLD, &reqs[i]);
else if (processor_rank == 1) {
for (int i = 0; i < N_pa_prim1; i++){
// Same reuse of the shared send buffer as in the rank-0 branch.
for (int j = 0; j < n*n*N_p0+1; j++)
buff_send_H[j] = 3.5871e-8*rand();
cout << processor_rank << "\t" << buff_send_H[n*n*N_p0] << "\t" << "Send" << "\t" << endl;
MPI_Isend(buff_send_H, n*n*N_p0+1, MPI_DOUBLE, 0, 163, MPI_COMM_WORLD, &reqs[i]);
// Each receive gets its own buffer buff_H[i].
for (int i = 0; i < N_pa_prim1; i++)
MPI_Irecv(buff_H[i], n*n*N_p0+1, MPI_DOUBLE, MPI_ANY_SOURCE, 163, MPI_COMM_WORLD, &reqs[N_pa_prim1+i]);
MPI_Waitall(2*N_pa_prim1, reqs, statuses);
for (int i = 0; i < N_pa_prim1; i++)
cout << processor_rank << "\t" << buff_H[i][n*n*N_p0] << "\t" << "Receive" << endl;
return 0;