Is there some way of avoiding this implicit MPI_Allreduce() synchronisation? - mpi

I'm writing an MPI program that uses a library which makes its own MPI calls. In my program, I have a loop that calls a function from the library. The function that I'm calling from the library makes use of MPI_Allreduce.
The problem here is that in my program, some of the ranks can exit the loop before others and this causes the MPI_Allreduce call to just hang since not all ranks will be calling MPI_Allreduce again.
Is there any way of programming around this without modifying the sources of the library I'm using?
Below is the code for an example which demonstrates the execution pattern.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <math.h>
#include <assert.h>
#define N_ITEMS 100000
#define ITERATIONS 32
/* Allocate and fill an array of num_elements floats drawn uniformly from
 * [0, 1] via rand().  Aborts (assert) if the allocation fails.  The caller
 * owns the returned buffer and must free() it. */
float *create_rand_nums(int num_elements) {
  float *vals = (float *)malloc(sizeof(float) * num_elements);
  assert(vals != NULL);
  for (int idx = 0; idx < num_elements; ++idx) {
    vals[idx] = rand() / (float)RAND_MAX;
  }
  return vals;
}
/*
 * Compute the mean and standard deviation of num_elements_per_proc random
 * floats per rank.  Collective: every rank of MPI_COMM_WORLD must call this,
 * because it performs an MPI_Allreduce (global sum of the local sums)
 * followed by an MPI_Reduce (sum of squared deviations) to rank 0, which
 * prints the result.
 *
 * FIX: the seed was `time(NULL)*world_rank`, which makes rank 0 always seed
 * with 0 (anything times zero) and leaves the other ranks with strongly
 * correlated seeds.  Adding the rank keeps the seeds distinct per rank
 * without zeroing any of them.
 */
void reduce_stddev(int world_rank, int world_size, int num_elements_per_proc)
{
fprintf(stdout, "Calling %s: %d\n", __func__, world_rank);
fflush(stdout);
srand((unsigned)time(NULL) + world_rank);
float *rand_nums = create_rand_nums(num_elements_per_proc);
float local_sum = 0;
int i;
for (i = 0; i < num_elements_per_proc; i++) {
local_sum += rand_nums[i];
}
float global_sum;
fprintf(stdout, "%d: About to call all reduce\n", world_rank);
fflush(stdout);
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM,
MPI_COMM_WORLD);
fprintf(stdout, "%d: done calling all reduce\n", world_rank);
fflush(stdout);
/* Allreduce gave every rank the global sum, so all can compute the mean. */
float mean = global_sum / (num_elements_per_proc * world_size);
float local_sq_diff = 0;
for (i = 0; i < num_elements_per_proc; i++) {
local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
}
float global_sq_diff;
MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0,
MPI_COMM_WORLD);
/* Only rank 0 holds the reduced squared-difference total. */
if (world_rank == 0) {
float stddev = sqrt(global_sq_diff /
(num_elements_per_proc * world_size));
printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
}
free(rand_nums);
}
/*
 * Driver that reproduces the hang described above: every rank repeatedly
 * calls reduce_stddev() (which contains an MPI_Allreduce), but the even
 * ranks break out of the loop after ITERATIONS/2.  The odd ranks then block
 * forever inside the next MPI_Allreduce, because the collective is no longer
 * entered by every rank of MPI_COMM_WORLD.
 */
int main(int argc, char* argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: avg num_elements_per_proc\n");
exit(1);
}
int num_elements_per_proc = atoi(argv[1]);
MPI_Init(NULL, NULL);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
unsigned long long j = 0;
for(j = 0; j < ITERATIONS; j++)
{
/* Function which calls MPI_Allreduce */
reduce_stddev(world_rank, world_size, num_elements_per_proc);
/* Simulates some processes leaving the loop early */
if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
{
fprintf(stdout, "%d exiting\n", world_rank);
fflush(stdout);
break;
}
}
/* Never reached by the ranks that are stuck in MPI_Allreduce above. */
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
return EXIT_SUCCESS;
}

This is always an issue in MPI - how do you tell all the other ranks when one rank is finished? The easiest approach is for everyone to set a true/false flag and then do an allreduce to see if anyone finished. Using this code at the end seems to work
/*
 * Fixed loop from the answer: instead of breaking out unilaterally, each
 * rank records completion in a local `finished` flag and all ranks agree
 * via an extra MPI_Allreduce (logical OR).  Every rank therefore leaves
 * the loop in the same iteration and no collective is left unmatched.
 */
for(j = 0; j < ITERATIONS; j++)
{
/* Function which calls MPI_Allreduce */
reduce_stddev(world_rank, world_size, num_elements_per_proc);
int finished = 0;
/* Simulates some processes leaving the loop early */
if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
{
fprintf(stdout, "%d finished\n", world_rank);
fflush(stdout);
finished = 1;
}
/* Check to see if anyone has finished */
int anyfinished;
MPI_Allreduce(&finished, &anyfinished, 1, MPI_INT, MPI_LOR,
MPI_COMM_WORLD);
/* All ranks receive the same OR-ed result, so all break together. */
if (anyfinished)
{
fprintf(stdout, "%d exiting\n", world_rank);
break;
}
}
OK - I just reread your question and maybe I misunderstood it. Do you want everyone else to keep calculating?

Related

Segmentation fault inside range

#include <iostream>
#include <vector>
#include <algorithm>
#include <queue> // std::priority_queue
using std::vector;
using std::cin;
using std::cout;
// A scheduled job slot: which worker holds it (indexI) and the time at
// which that worker becomes free again (freeT).
struct fj{
    int indexI=0;
    int freeT=0;
};

// Comparator that orders fj values so the *smallest* freeT ends up on top
// of a std::priority_queue (i.e. a min-heap on free time).
// FIX: the original inherited from std::binary_function, which was
// deprecated in C++11 and removed in C++17, so the code no longer compiles
// under modern standards.  The typedefs it provided are unnecessary here.
// The arguments are now taken by const reference to avoid copying.
struct DereferenceCompareNode
{
    bool operator()(const fj& lhs, const fj& rhs) const
    {
        return lhs.freeT > rhs.freeT;
    }
};
/*
 * Reads a worker count and a list of job durations from stdin, assigns each
 * job (in input order) to the worker that becomes free earliest, and prints
 * "<worker> <start_time>" per job.
 *
 * FIXES versus the original:
 *  - AssignJobs called freeJob.top() while freeJob was empty (and never
 *    pushed into it): calling top() on an empty priority_queue is undefined
 *    behavior and was the cause of the reported segfault.
 *  - assigned_workers_/start_times_ were resize()d and then push_back()ed,
 *    doubling their length, while jobs_ was emptied with erase() so
 *    WriteResponse printed nothing.  Results are now stored by index and
 *    jobs_ is left intact.
 *  - The O(n) per-job erase(begin()) scan was replaced by an O(m log n)
 *    min-heap of (next_free_time, worker) pairs; debug prints removed.
 */
class JobQueue {
private:
    int num_workers_;
    std::vector<int> jobs_;
    std::vector<int> assigned_workers_;
    std::vector<long long> start_times_;

    // Print one "<worker> <start_time>" line per job, in input order.
    void WriteResponse() const {
        for (std::size_t i = 0; i < jobs_.size(); ++i) {
            std::cout << assigned_workers_[i] << " " << start_times_[i] << "\n";
        }
    }

    // Read the worker count, the job count m, and m job durations.
    void ReadData() {
        int m;
        std::cin >> num_workers_ >> m;
        jobs_.resize(m);
        for (int i = 0; i < m; i++)
            std::cin >> jobs_[i];
    }

    // Greedy assignment: each job goes to the worker with the smallest
    // next-free time, ties broken by the smaller worker index (which is
    // exactly how std::pair ordering behaves in the min-heap).
    void AssignJobs() {
        assigned_workers_.assign(jobs_.size(), 0);
        start_times_.assign(jobs_.size(), 0);
        typedef std::pair<long long, int> TimeWorker; // (free time, worker id)
        std::priority_queue<TimeWorker, std::vector<TimeWorker>,
                            std::greater<TimeWorker> > heap;
        for (int w = 0; w < num_workers_; ++w)
            heap.push(TimeWorker(0LL, w));
        for (std::size_t i = 0; i < jobs_.size(); ++i) {
            TimeWorker next = heap.top();
            heap.pop();
            assigned_workers_[i] = next.second;
            start_times_[i] = next.first;
            heap.push(TimeWorker(next.first + jobs_[i], next.second));
        }
    }

public:
    void Solve() {
        ReadData();
        AssignJobs();
        WriteResponse();
    }
};
int main() {
std::ios_base::sync_with_stdio(false);
JobQueue job_queue;
job_queue.Solve();
return 0;
}
I am getting a segmentation fault in the function ReadData while reading the inputs for the vector jobs_.
I am getting the fault even though I stay inside the bounds of the defined size.
Everything was fine before I wrote the AssignJobs function.
Am I doing something wrong with some bounds or taking illegal inputs format or messing with some other stuff?
Am I doing something wrong
Yes, you are: freeJob starts out empty, so this is undefined behavior:
if(freeJob.top().freeT == counter){
In fact, you never push anything into freeJob, you only pop() things from it.

making tree function in xv6

I want to make a tree command in xv6; in case you don't know it, tree lists out the directory hierarchy on the terminal. I know this is probably easy for you, but the code so far is:
#include "types.h"
#include "stat.h"
#include "user.h"
#include "fcntl.h"
#include "fs.h"
#include "file.h"
/*
 * Attempted `tree` command for xv6 (from the question).  As posted it does
 * not compile: it includes kernel-only headers ("fs.h", "file.h" -- the
 * latter drags in struct sleeplock, producing the reported error), calls
 * tree() which is never defined, and getinode() is not an xv6 user-space
 * call.  See the answer below for an open/fstat/read-based rewrite.
 */
int
main(int argc, char *argv[])
{
if(argc < 2){
printf(2, "Usage: tree [path]...\n");
exit();
}
tree(argv[1]);
int fd = open(argv[1],O_RDONLY);
if(fd<0)
return -1;
struct dirent dir;
/* Reads raw directory entries straight from the descriptor. */
while(read(fd,&dir,sizeof(dir))!=0){
/* NOTE(review): dir.name is a character array, so "%d" looks wrong --
 * presumably "%s" was intended; confirm against xv6's printf. */
printf(1,"|_ %d,%d",dir.name,dir.inum);
//struct stat *st;
struct inode ip;
ip= getinode(dir.inum);
if(ip.type==T_DIR){
int i;
/* Dumps the direct block addresses of a directory's inode. */
for(i=0;i<NDIRECT;i++ ){
uint add=ip.addrs[i];
printf(1,"%d",add);
}
}
}
return 0;
}
and it has been giving me numerous error on the terminal the first being file.h:17:20: error: field ‘lock’ has incomplete type
struct sleeplock lock; // protects everything below here
^~~~
I'm searching for sleeplock and there is nothing like that in the code. What is wrong with the code? Thank you for your help
You cannot use kernel headers (like file.h) in a user code. To use kernel functionnalities in your code, you must use system calls.
To achieve what you want, you could start from ls function and make it recursive.
One example made quickly:
I added a parameter to the ls function to display the depth of crawling
and call itself on each directory elements but two first which are . and ..
/*
 * Recursive ls from the answer: prints one line per entry, indented by
 * `decal` spaces per nesting level, and recurses into sub-directories.
 * The first two entries of every directory are skipped via the `skip`
 * counter -- per the answer text these are "." and "..", which keeps the
 * walk from looping on itself.
 */
void
ls(char *path, int decal)
{
char buf[512], *p;
int fd, i, skip = 2;
struct dirent de;
struct stat st;
if((fd = open(path, 0)) < 0){
printf(2, "tree: cannot open %s\n", path);
return;
}
if(fstat(fd, &st) < 0){
printf(2, "tree: cannot stat %s\n", path);
close(fd);
return;
}
switch(st.type){
case T_FILE:
/* Plain file: indent, then print name, type, inode and size. */
for (i = 0; i < decal; i++)
printf(1, " ");
printf(1, "%s %d %d %d\n", fmtname(path), st.type, st.ino, st.size);
break;
case T_DIR:
if(strlen(path) + 1 + DIRSIZ + 1 > sizeof buf){
printf(1, "tree: path too long\n");
break;
}
/* Build "<path>/" in buf; p points just past the slash so each entry
 * name can be spliced in without rebuilding the prefix. */
strcpy(buf, path);
p = buf+strlen(buf);
*p++ = '/';
while(read(fd, &de, sizeof(de)) == sizeof(de)){
if(de.inum == 0)
continue;
memmove(p, de.name, DIRSIZ);
p[DIRSIZ] = 0;
if(stat(buf, &st) < 0){
printf(1, "tree: cannot stat %s\n", buf);
continue;
}
for (i = 0; i < decal; i++)
printf(1, " ");
printf(1, "%s %d %d %d\n", fmtname(buf), st.type, st.ino, st.size);
/* First two entries are skipped; everything else recurses deeper. */
if (skip)
skip--;
else
ls(buf, decal+1);
}
break;
}
close(fd);
}

Using of MPI Barrier lead to fatal error

I get strange behavior from my simple MPI program. I spent time trying to find an answer myself, but I couldn't. I read some questions here, like OpenMPI MPI_Barrier problems, MPI_SEND stops working after MPI_BARRIER, Using MPI_Bcast for MPI communication. I read the MPI tutorial on mpitutorial.
My program just modify array that was broadcasted from root process and then gather modified arrays to one array and print them.
So, the problem is, that when I use code listed below with uncommented MPI_Barrier(MPI_COMM_WORLD) I get an error.
#include "mpi/mpi.h"
#define N 4

/* Scale each of the N entries of `row` by the factor k, in place. */
void transform_row(int* row, const int k) {
    int* const stop = row + N;
    for (int* cell = row; cell != stop; ++cell) {
        *cell *= k;
    }
}
const int root = 0;
/*
 * Question code: root broadcasts an N-int array, every rank scales its copy
 * by rank*100, and the results are gathered and printed by root.
 *
 * BUG (explained in the answer below): the root rank calls MPI_Bcast twice
 * -- once inside the rank==root branch with the initialized arr, and once
 * afterwards with a freshly allocated, uninitialized arr -- while every
 * other rank calls it only once.  The collectives' call counts no longer
 * match across ranks, so enabling the MPI_Barrier makes the stray broadcast
 * collide with the barrier's internal traffic ("Message truncated").
 */
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int rank, ranksize;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &ranksize);
if (rank == root) {
int* arr = new int[N];
for (int i = 0; i < N; ++i) {
arr[i] = i * i + 1;
}
MPI_Bcast(arr, N, MPI_INT, root, MPI_COMM_WORLD);
}
/* This second arr shadows nothing on non-root ranks, but on root it is a
 * new, uninitialized buffer that triggers the extra broadcast. */
int* arr = new int[N];
MPI_Bcast(arr, N, MPI_INT, root, MPI_COMM_WORLD);
//MPI_Barrier(MPI_COMM_WORLD);
transform_row(arr, rank * 100);
int* transformed = new int[N * ranksize];
MPI_Gather(arr, N, MPI_INT, transformed, N, MPI_INT, root, MPI_COMM_WORLD);
if (rank == root) {
for (int i = 0; i < ranksize; ++i) {
for (int j = 0; j < N ; j++) {
printf("%i ", transformed[i * N + j]);
}
printf("\n");
}
}
MPI_Finalize();
return 0;
}
The error appears when the number of processes is > 1. The error:
Fatal error in PMPI_Barrier: Message truncated, error stack:
PMPI_Barrier(425)...................: MPI_Barrier(MPI_COMM_WORLD) failed
MPIR_Barrier_impl(332)..............: Failure during collective
MPIR_Barrier_impl(327)..............:
MPIR_Barrier(292)...................:
MPIR_Barrier_intra(150).............:
barrier_smp_intra(111)..............:
MPIR_Bcast_impl(1452)...............:
MPIR_Bcast(1476)....................:
MPIR_Bcast_intra(1287)..............:
MPIR_Bcast_binomial(239)............:
MPIC_Recv(353)......................:
MPIDI_CH3U_Request_unpack_uebuf(568): Message truncated; 16 bytes received but buffer size is 1
I understand that some problem with a buffer exists, but when I use MPI_Buffer_attach to attach a big buffer to MPI, it doesn't help.
It seems I need to increase this buffer, but I don't know how to do that.
XXXXXX#XXXXXXXXX:~/test_mpi$ mpirun --version
HYDRA build details:
Version: 3.2
Release Date: Wed Nov 11 22:06:48 CST 2015
So help me please.
One issue is MPI_Bcast() is invoked twice by the root rank, but only once by the other ranks. And then root rank uses an uninitialized arr.
MPI_Barrier() might only hide the problem, but it cannot fix it.
Also, note that if N is "large enough", then the second MPI_Bcast() invoked by root rank will likely hang.
Here is how you can revamp the init/broadcast phase to fix these issues.
int* arr = new int[N];
if (rank == root) {
for (int i = 0; i < N; ++i) {
arr[i] = i * i + 1;
}
MPI_Bcast(arr, N, MPI_INT, root, MPI_COMM_WORLD);
Note in this case, you can simply initialize arr on all the ranks so you do not even need to broadcast the array.
As a side note, MPI program typically
#include <mpi.h>
and then use mpicc for the compilation/linking
(this is a wrapper that invoke the real compiler after setting the include/library paths and using the MPI libs)

Anomalous MPI behavior

I am wondering if anyone can offer an explanation.
I'll start with the code:
/*
Barrier implemented using tournament-style coding
*/
// Constraints: Number of processes must be a power of 2, e.g.
// 2,4,8,16,32,64,128,etc.
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
void mybarrier(MPI_Comm);
// global debug bool
int verbose = 1;
/*
 * Driver: verifies the process count is a power of two (checked on rank 0
 * only; MPI_Abort tears all ranks down on failure), does some trivial work,
 * synchronises through the hand-rolled mybarrier(), then finishes.  `sum`
 * is never printed -- it only gives the ranks something to do on either
 * side of the barrier.
 */
int main(int argc, char * argv[]) {
int rank;
int size;
int i;
int sum = 0;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int check = size;
// check to make sure the number of processes is a power of 2
if (rank == 0){
while(check > 1){
if (check % 2 == 0){
check /= 2;
} else {
printf("ERROR: The number of processes must be a power of 2!\n");
MPI_Abort(MPI_COMM_WORLD, 1);
return 1;
}
}
}
// simple task, with barrier in the middle
for (i = 0; i < 500; i++){
sum ++;
}
mybarrier(MPI_COMM_WORLD);
for (i = 0; i < 500; i++){
sum ++;
}
if (verbose){
printf("process %d arrived at finalize\n", rank);
}
MPI_Finalize();
return 0;
}
/*
 * Hand-rolled tournament barrier (requires a power-of-two communicator
 * size).  Phase 1 (gather): ranks pair up in doubling strides and the lower
 * rank of each pair receives a zero-length message from the upper one until
 * every notification reaches rank 0.  Phase 2 (scatter): rank 0 releases
 * the ranks back down the same tree in halving strides.
 *
 * FIX (per the answer below): the original declared `int * data` and
 * `MPI_Status * status` without ever allocating or initializing them and
 * handed both to MPI_Recv -- undefined behavior that writes through a wild
 * pointer.  We now use an ordinary int as the (zero-count) transfer buffer
 * and MPI_STATUS_IGNORE, since the status is never examined.  The unused
 * locals `i` and `a` were removed.
 */
void mybarrier(MPI_Comm comm){
// MPI variables
int rank;
int size;
int data = 0; /* valid buffer for the zero-count sync messages */
// Loop variables
int skip;
int complete = 0;
int currentCycle = 1;
// Initialize MPI vars
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
// step 1, gathering
while (!complete){
skip = currentCycle * 2;
// if currentCycle divides rank evenly, then it is a target
if ((rank % currentCycle) == 0){
// if skip divides rank evenly, then it needs to receive
if ((rank % skip) == 0){
MPI_Recv(&data, 0, MPI_INT, rank + currentCycle, 99, comm, MPI_STATUS_IGNORE);
if (verbose){
printf("1: %d from %d\n", rank, rank + currentCycle);
}
// otherwise, it needs to send. Once sent, the process is done
} else {
if (verbose){
printf("1: %d to %d\n", rank, rank - currentCycle);
}
MPI_Send(&data, 0, MPI_INT, rank - currentCycle, 99, comm);
complete = 1;
}
}
currentCycle *= 2;
// main process will never send, so this code will allow it to complete
if (currentCycle >= size){
complete = 1;
}
}
complete = 0;
currentCycle = size / 2;
// step 2, scattering
while (!complete){
// if currentCycle is 1, then this is the last loop
if (currentCycle == 1){
complete = 1;
}
skip = currentCycle * 2;
// if currentCycle divides rank evenly then it is a target
if ((rank % currentCycle) == 0){
// if skip divides rank evenly, then it needs to send
if ((rank % skip) == 0){
if (verbose){
printf("2: %d to %d\n", rank, rank + currentCycle);
}
MPI_Send(&data, 0, MPI_INT, rank + currentCycle, 99, comm);
// otherwise, it needs to receive
} else {
if (verbose){
printf("2: %d waiting for %d\n", rank, rank - currentCycle);
}
MPI_Recv(&data, 0, MPI_INT, rank - currentCycle, 99, comm, MPI_STATUS_IGNORE);
if (verbose){
printf("2: %d from %d\n", rank, rank - currentCycle);
}
}
}
currentCycle /= 2;
}
}
Expected behavior
The code is to increment a sum to 500, wait for all other processes to reach that point using blocking MPI_Send and MPI_Recv calls, and then increment sum to 1000.
Observed behavior on cluster
Cluster behaves as expected
Anomalous behavior observed on my machine
All processes in main function are reported as being 99, which I have linked specifically to the tag of the second while loop of mybarrier.
In addition
My first draft was written with for loops, and with that one, the program executes as expected on the cluster as well, but on my machine execution never finishes, even though all processes call MPI_Finalize (but none move beyond it).
MPI Versions
My machine is running OpenRTE 2.0.2
The cluster is running OpenRTE 1.6.3
The questions
I have observed that my machine seems to run unexpectedly all of the time, while the cluster executes normally. This is true with other MPI code I have written as well. Were there major changes between 1.6.3 and 2.0.2 that I'm not aware of?
At any rate, I'm baffled, and I was wondering if anyone could offer some explanation as to why my machine seems to not run MPI correctly. I hope I have provided enough details, but if not, I will be happy to provide whatever additional information you require.
There is a problem with your code, maybe that's what causing the weird behavior you are seeing.
You are passing to the MPI_Recv routines a status object that hasn't been allocated. In fact, that pointer is not even initialized, so if it happens not to be NULL, the MPI_Recv will endup writing wherever in memory causing undefined behavior. The correct form is the following:
MPI_Status status;
...
MPI_Recv(..., &status);
Or if you want to use the heap:
MPI_Status *status = malloc(sizeof(MPI_Status));
...
MPI_Recv(..., status);
...
free(status);
Also since you are not using the value returned by the receive, you should instead use MPI_STATUS_IGNORE instead:
MPI_Recv(..., MPI_STATUS_IGNORE);

MPI hangs on MPI_Send for large messages

There is a simple program in C++ / MPI (mpich2) which sends an array of type double. If the size of the array is more than 9000, then during the call to MPI_Send my program hangs. If the array is smaller than 9000 (8000, for example) the program works fine. Source code is below:
main.cpp
using namespace std;
Cube** cubes;
int cubesLen;
/* Build a heap-allocated array of N doubles holding 1, 2, ..., N.
 * The caller owns the result and must delete[] it. */
double* InitVector(int N) {
    double* vec = new double[N];
    for (int idx = 0; idx < N; ++idx) {
        vec[idx] = idx + 1;
    }
    return vec;
}
/* Populate the global cube table: 12 heap-allocated Cubes of 9000
 * coefficients each, with cubesLen recording the count. */
void CreateCubes() {
    cubesLen = 12;
    cubes = new Cube*[cubesLen];
    for (int idx = 0; idx < cubesLen; ++idx) {
        cubes[idx] = new Cube(9000);
    }
}
/*
 * Ring exchange: each non-root rank sends cube 0's coefficient block to
 * rank+1 (wrapping back to 1) and only afterwards posts its receive from
 * rank-1.  Because every rank enters the blocking MPI_Send before anyone
 * posts a receive, the ring deadlocks once the message exceeds the
 * implementation's eager-protocol threshold (see the answer below; the
 * question reports hangs for arrays larger than ~9000 doubles).
 * Note also that coefImOut is malloc'd but never filled before sending.
 */
void SendSimpleData(int size, int rank) {
Cube* cube = cubes[0];
int nodeDest = rank + 1;
if (nodeDest > size - 1) {
nodeDest = 1;
}
double* coefImOut = (double *) malloc(sizeof (double)*cube->coefficentsImLength);
cout << "Before send" << endl;
int count = cube->coefficentsImLength;
/* Blocks here for large count: no matching receive has been posted. */
MPI_Send(coefImOut, count, MPI_DOUBLE, nodeDest, 0, MPI_COMM_WORLD);
cout << "After send" << endl;
free(coefImOut);
MPI_Status status;
double *coefIm = (double *) malloc(sizeof(double)*count);
int nodeFrom = rank - 1;
if (nodeFrom < 1) {
nodeFrom = size - 1;
}
MPI_Recv(coefIm, count, MPI_DOUBLE, nodeFrom, 0, MPI_COMM_WORLD, &status);
free(coefIm);
}
/*
 * Sets up MPI and the global cube table, then every non-root rank runs the
 * ring exchange.  Rank 0 never enters SendSimpleData, so the ring consists
 * of ranks 1..size-1 only.
 */
int main(int argc, char *argv[]) {
int size, rank;
const int root = 0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
CreateCubes();
if (rank != root) {
SendSimpleData(size, rank);
}
MPI_Finalize();
return 0;
}
class Cube
/*
 * Owns a heap-allocated coefficient array (coefficentsIm) of length
 * coefficentsImLength; the destructor releases it, so copies must deep-copy
 * the array to stay safe (rule of three).
 */
class Cube {
public:
Cube(int size);
Cube(const Cube& orig);
virtual ~Cube();
int Id() { return id; }
void Id(int id) { this->id = id; }
int coefficentsImLength;  // number of doubles in coefficentsIm
double* coefficentsIm;    // owned; released in ~Cube
private:
int id;
};
// Construct a cube whose coefficient array holds `size` doubles, each
// initialised to 1.
Cube::Cube(int size) {
    coefficentsImLength = size;
    coefficentsIm = new double[size];
    for (int n = size - 1; n >= 0; --n) {
        coefficentsIm[n] = 1;
    }
}
/*
 * Deep-copy constructor.  The original body was empty, which left every
 * member of the copy uninitialized: destroying such a copy ran delete[] on
 * a garbage pointer (undefined behavior), and the copy shared nothing with
 * its source.  Since a Cube owns its coefficient array, copying must clone
 * the array (rule of three).
 */
Cube::Cube(const Cube& orig) {
id = orig.id;
coefficentsImLength = orig.coefficentsImLength;
coefficentsIm = new double[coefficentsImLength];
for (int i = 0; i < coefficentsImLength; ++i) {
coefficentsIm[i] = orig.coefficentsIm[i];
}
}
// Release the coefficient array allocated by the constructors.
Cube::~Cube() {
delete[] coefficentsIm;
}
The program runs on 4 processes:
mpiexec -n 4 ./myApp1
Any ideas?
The details of the Cube class aren't relevant here: consider a simpler version
#include <mpi.h>
#include <cstdlib>
using namespace std;
/*
 * Minimal reproducer from the answer: every non-root rank performs a
 * blocking MPI_Send around the ring and only afterwards posts its MPI_Recv,
 * so for messages beyond the eager-protocol threshold every rank blocks in
 * MPI_Send and the program deadlocks (the point being demonstrated).
 *
 * FIX: the original passed `&data` -- the address of the pointer variable,
 * an int** -- as the MPI buffer argument.  The buffer must be the array
 * itself, `data`; with `&data` the calls read/write `datasize` ints
 * starting at the pointer variable's stack slot, which is undefined
 * behavior.
 */
int main(int argc, char *argv[]) {
int size, rank;
const int root = 0;
int datasize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank != root) {
/* Ring neighbours among ranks 1..size-1 (root stays out). */
int nodeDest = rank + 1;
if (nodeDest > size - 1) {
nodeDest = 1;
}
int nodeFrom = rank - 1;
if (nodeFrom < 1) {
nodeFrom = size - 1;
}
MPI_Status status;
int *data = new int[datasize];
for (int i=0; i<datasize; i++)
data[i] = rank;
cout << "Before send" << endl;
MPI_Send(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);
cout << "After send" << endl;
MPI_Recv(data, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
delete [] data;
}
MPI_Finalize();
return 0;
}
where running gives
$ mpirun -np 4 ./send 1
Before send
After send
Before send
After send
Before send
After send
$ mpirun -np 4 ./send 65000
Before send
Before send
Before send
If in DDT you looked at the message queue window, you'd see everyone is sending, and no one is receiving, and you have a classic deadlock.
MPI_Send's semantics, weirdly, aren't well defined, but it is allowed to block until "the receive has been posted". MPI_Ssend is clearer in this regard; it will always block until the receive has been posted. Details about the different send modes can be seen here.
The reason it worked for smaller messages is an accident of the implementation; for "small enough" messages (for your case, it looks to be <64kB), your MPI_Send implementation uses an "eager send" protocol and doesn't block on the receive; for larger messages, where it isn't necessarily safe just to keep buffered copies of the message kicking around in memory, the Send waits for the matching receive (which it is always allowed to do anyway).
There's a few things you could do to avoid this; all you have to do is make sure not everyone is calling a blocking MPI_Send at the same time. You could (say) have even processors send first, then receive, and odd processors receive first, and then send. You could use nonblocking communications (Isend/Irecv/Waitall). But the simplest solution in this case is to use MPI_Sendrecv, which is a blocking (Send + Recv), rather than a blocking send plus a blocking receive. The send and receive will execute concurrently, and the function will block until both are complete. So this works
#include <mpi.h>
#include <cstdlib>
using namespace std;
/*
 * Deadlock-free version from the answer: each non-root rank exchanges
 * `datasize` ints with its ring neighbours -- sending to rank+1 (wrapping
 * to 1) while receiving from rank-1 (wrapping to size-1).  MPI_Sendrecv
 * runs the send and the receive concurrently and blocks until both finish,
 * so the ring cannot deadlock regardless of message size.
 */
int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank != root) {
        /* Ring neighbours among ranks 1..size-1 (root stays out). */
        int dest = (rank + 1 > size - 1) ? 1 : rank + 1;
        int src = (rank - 1 < 1) ? size - 1 : rank - 1;
        MPI_Status status;
        int *outdata = new int[datasize];
        int *indata = new int[datasize];
        for (int k = 0; k < datasize; ++k) {
            outdata[k] = rank;
        }
        cout << "Before sendrecv" << endl;
        MPI_Sendrecv(outdata, datasize, MPI_INT, dest, 0,
                     indata, datasize, MPI_INT, src, 0, MPI_COMM_WORLD, &status);
        cout << "After sendrecv" << endl;
        delete [] outdata;
        delete [] indata;
    }
    MPI_Finalize();
    return 0;
}
Running gives
$ mpirun -np 4 ./send 65000
Before sendrecv
Before sendrecv
Before sendrecv
After sendrecv
After sendrecv
After sendrecv

Resources