I have this code:
#include <cstdint>
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0)
        MPI_Barrier(MPI_COMM_WORLD);
    cout << "Some output\n";

    if (rank == 1)
        MPI_Barrier(MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    cout << "end\n";

    MPI_Finalize();
    return 0;
}
When I run as
mpiexec -n 2 MPI.exe
the program works; the output is:
Some output
End
Some output
End
However, when I run as
mpiexec -n 3 MPI.exe
the program does not work properly. I expected output like this:
rank 3 - Some output
rank 2 - Some output
rank 3 - End
rank 0 - Some output
At this step, I expect the program to stop.
You need to make sure the number of Barrier calls is the same for each process. In your particular case, when n=3 there are two Barrier calls for rank 0 and for rank 1, but only one for rank 2. Ranks 0 and 1 will block at their last Barrier, which can never complete because rank 2 never calls Barrier again.
Here is what should be happening for n=3:
together:
rank 0 will reach barrier 1 then block
rank 1 will print "some output", reach barrier 2 then block
rank 2 will print "some output", reach barrier 3 then block
together:
rank 0 will print "some output", reach barrier 3 then block
rank 1 will reach barrier 3 then block
rank 2 will print "end" then hit finalize
Having one process reach finalize while the others are blocked in a barrier that can never complete means the program hangs; such unmatched collective calls are erroneous.
Doing the same analysis for n=2:
together:
rank 0 will reach barrier 1 then block
rank 1 will print "some output", reach barrier 2 then block
together:
rank 0 will print "some output", reach barrier 3 then block
rank 1 will reach barrier 3 then block
together:
rank 0 will print "end" then hit finalize
rank 1 will print "end" then hit finalize
This suggests the output should be:
some output
some output
end
end
however you are getting:
some output
end
some output
end
This has to do with how the MPI infrastructure buffers and forwards stdout from the various ranks. We can see the behaviour more clearly if we flush after each print and introduce a delay, giving the launcher time to forward each rank's output:
#include <cstdint>
#include <unistd.h>
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        cout << rank << " Barrier 1\n" << flush;
        MPI_Barrier(MPI_COMM_WORLD);
    }

    cout << rank << " Some output \n" << flush;
    usleep(1000000);

    if (rank == 1) {
        cout << rank << " Barrier 2\n" << flush;
        MPI_Barrier(MPI_COMM_WORLD);
    }

    cout << rank << " Barrier 3\n" << flush;
    MPI_Barrier(MPI_COMM_WORLD);
    cout << rank << " end\n" << flush;
    usleep(1000000);

    MPI_Finalize();
    return 0;
}
which produces:
$ mpiexec -n 2 ./a.out
0 Barrier 1
1 Some output
0 Some output
1 Barrier 2
1 Barrier 3
0 Barrier 3
0 end
1 end
$ mpiexec -n 3 ./a.out
2 Some output
0 Barrier 1
1 Some output
0 Some output
1 Barrier 2
1 Barrier 3
2 Barrier 3
2 end
0 Barrier 3
^Cmpiexec: killing job...
Alternatively, look at the time stamps from the following C++11 code:
#include <cstdint>
#include <chrono>
#include <mpi.h>
#include <iostream>
using namespace std;

inline unsigned long int time(void) {
    return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        MPI_Barrier(MPI_COMM_WORLD);
    }
    cout << rank << " " << time() << " Some output\n";

    if (rank == 1) {
        MPI_Barrier(MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    cout << rank << " " << time() << " end\n";

    MPI_Finalize();
    return 0;
}
output:
$ mpiexec -n 2 ./a.out
0 1464100768220965374 Some output
0 1464100768221002105 end
1 1464100768220902046 Some output
1 1464100768221000693 end
sorted by timestamp:
$ mpiexec -n 2 ./a.out
1 1464100768220902046 Some output
0 1464100768220965374 Some output
1 1464100768221000693 end
0 1464100768221002105 end
The conclusion is that Barrier is behaving as expected, and that the order of printed output is not a reliable indicator of the order in which the ranks actually executed.
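For reference, here is a minimal sketch (one possible fix, not the only one) of a variant that terminates for any number of ranks: drop the rank-specific barriers so every rank executes exactly the same, single Barrier call.

#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    cout << "rank " << rank << " - Some output\n" << flush;
    MPI_Barrier(MPI_COMM_WORLD);   // every rank participates exactly once
    cout << "rank " << rank << " - end\n" << flush;

    MPI_Finalize();
    return 0;
}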
Edit: 2016-05-24 to show detailed analysis of program behaviour.
I have some sample code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>

int main(int argc, char** argv) {
    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Find out rank, size
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // We are assuming at least 2 processes for this task
    if (world_size < 2) {
        fprintf(stderr, "World size must be greater than 1 for %s\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    int number;
    if (world_rank == 1) {
        number = -1;
        MPI_Send(&number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        raise(SIGSEGV);
    } else if (world_rank == 0) {
        MPI_Recv(&number, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("Process 0 received number %d from process 1\n", number);
    }

    printf("rank %d finalize\n", world_rank);
    MPI_Finalize();
}
Rank 1 raises a signal to simulate a crash. After the raise(), rank 1 exits, but rank 0 still prints rank 0 finalize.
Is there any way for rank 0 to know that rank 1 crashed in this case? Is it possible to have mpirun kill rank 0 when rank 1 crashes?
Note that there is a race condition here: mpirun might not have enough time to notice that task 1 crashed and kill task 0 before the message is printed.
You can force Open MPI to kill all tasks as soon as a crash is detected with the option below:
mpirun -mca orte_abort_on_non_zero_status 1 ...
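For example, assuming the sample program above was compiled to a.out (the binary name here is hypothetical), the full command line would look like:
mpirun -mca orte_abort_on_non_zero_status 1 -np 2 ./a.out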
I want to create a list of matrices that I am updating in a loop and return it to R. I have
std::vector<IntegerMatrix> zb_list;
and
IntegerMatrix tb(J,nmax), zb(J,nmax);
before the loop. Inside the loop, I update zb and then have
zb_list.push_back(zb);
I also have
Rcout << (zb_list[itr]) << "\n";
Rcout << (zb) << "\n\n";
where itr counts the iterations. These both confirm that zb is changing inside the loop and zb_list keeps track of it.
Then I return zb_list after the loop. When accessing the result in R, the list contains copies of the same zb, the last one computed in the loop. I suspect there is some pass-by-reference going on, but I can't figure it out. I don't have a good understanding of what is happening (I tried return(wrap(zb_list)) without luck), but clearly something is wrong. I also tried defining it as List zb_list;, which doesn't help. Any suggestions?
Edit: Here is a minimal working example:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
List test_weird(int ITRmax = 2) {
    IntegerMatrix zb(2, 2);
    std::vector<IntegerMatrix> zb_list;
    int itr = 0;

    while (itr < ITRmax) {
        zb((1 + itr) % 2, (1 + itr) % 2)++;
        zb_list.push_back(zb);
        Rcout << (zb) << (zb_list[itr]) << "\n\n";
        ++itr;
    }

    return List::create(_["zb"] = zb,
                        _["zb_list"] = zb_list);
}

/*** R
res <- test_weird()
res$zb_list
*/
This is the output while the loop is running:
0 0
0 1
0 0
0 1
1 0
0 1
1 0
0 1
... and this is the output from R:
> res$zb_list
[[1]]
[,1] [,2]
[1,] 1 0
[2,] 0 1
[[2]]
[,1] [,2]
[1,] 1 0
[2,] 0 1
As you can see both items in the list are the last zb in the loop.
The problem is that push_back(something) makes a copy of something. But if something is a pointer, then subsequent changes will affect all copies of that pointer. In plain C++:
#include <vector>
#include <iostream>

int main() {
    std::vector<int*> v;
    int* p = new int;                 // a single int, shared by every element we push
    for (int i = 0; i < 2; ++i) {
        *p = i;
        v.push_back(p);               // copies the pointer, not the pointed-to value
        std::cout << *p << " " << *v[i] << std::endl;
    }
    std::cout << *v[0] << " " << *v[1] << std::endl;   // both show the last value written
    delete p;
    return 0;
}
produces
$ ./pointer_fun
0 0
1 1
1 1
So if the something is a pointer-like object, which is the case for all Rcpp objects, then you need a deep copy/clone of the object, i.e.
zb_list.push_back(clone(zb));
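Applied to the minimal example above, only the push_back line needs to change; clone() forces a deep copy of the underlying R object, so each stored matrix stays independent of later updates to zb. A sketch (the function name test_weird_fixed is made up here):

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
List test_weird_fixed(int ITRmax = 2) {
    IntegerMatrix zb(2, 2);
    std::vector<IntegerMatrix> zb_list;
    int itr = 0;
    while (itr < ITRmax) {
        zb((1 + itr) % 2, (1 + itr) % 2)++;
        zb_list.push_back(clone(zb));   // deep copy: later changes to zb no longer affect stored entries
        ++itr;
    }
    return List::create(_["zb"] = zb,
                        _["zb_list"] = zb_list);
}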
After updating OpenMPI from 1.8.4 to 2.0.2 I ran into erroneous time measurements with MPI_Wtime(). With version 1.8.4 the result was the same as that returned by the omp_get_wtime() timer; now MPI_Wtime() runs about twice as fast.
What could cause such behaviour?
My sample code:
#include <omp.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int some_work(int rank, int tid){
    int count = 10000;
    int arr[count];
    for (int i = 0; i < count; i++)
        arr[i] = i + tid + rank;
    for (int val = 0; val < 4000000; val++)
        for (int i = 0; i < count - 1; i++)
            arr[i] = arr[i + 1];
    return arr[0];
}

int main (int argc, char *argv[]) {
    MPI_Init(NULL, NULL);
    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0)
        printf("there are %d mpi processes\n", size);
    MPI_Barrier(MPI_COMM_WORLD);

    double omp_time1 = omp_get_wtime();
    double mpi_time1 = MPI_Wtime();
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        if (tid == 0) {
            int nthreads = omp_get_num_threads();
            printf("There are %d threads for process %d\n", nthreads, rank);
            int result = some_work(rank, tid);
            printf("result for process %d thread %d is %d\n", rank, tid, result);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    double mpi_time2 = MPI_Wtime();
    double omp_time2 = omp_get_wtime();

    printf("process %d omp time: %f\n", rank, omp_time2 - omp_time1);
    printf("process %d mpi time: %f\n", rank, mpi_time2 - mpi_time1);
    printf("process %d ratio: %f\n", rank, (mpi_time2 - mpi_time1)/(omp_time2 - omp_time1));

    MPI_Finalize();
    return EXIT_SUCCESS;
}
Compiling
g++ -O3 src/example_main.cpp -o bin/example -fopenmp -I/usr/mpi/gcc/openmpi-2.0.2/include -L /usr/mpi/gcc/openmpi-2.0.2/lib -lmpi
And running
salloc -N2 -n2 mpirun --map-by ppr:1:node:pe=16 bin/example
Gives something like
there are 2 mpi processes
There are 16 threads for process 0
There are 16 threads for process 1
result for process 1 thread 0 is 10000
result for process 0 thread 0 is 9999
process 1 omp time: 5.066794
process 1 mpi time: 10.098752
process 1 ratio: 1.993125
process 0 omp time: 5.066816
process 0 mpi time: 8.772390
process 0 ratio: 1.731342
The ratio is not as consistent as I first wrote, but it is still large.
Results for OpenMPI 1.8.4 are OK:
g++ -O3 src/example_main.cpp -o bin/example -fopenmp -I/usr/mpi/gcc/openmpi-1.8.4/include -L /usr/mpi/gcc/openmpi-1.8.4/lib -lmpi -lmpi_cxx
Gives
result for process 0 thread 0 is 9999
result for process 1 thread 0 is 10000
process 0 omp time: 4.655244
process 0 mpi time: 4.655232
process 0 ratio: 0.999997
process 1 omp time: 4.655335
process 1 mpi time: 4.655321
process 1 ratio: 0.999997
I saw similar behaviour on my cluster (same OpenMPI version as yours, 2.0.2), and the problem was the default CPU frequency governor, 'conservative'.
Once the governor was set to 'performance', the output of MPI_Wtime() aligned with the correct timings (the output of 'time', in my case).
It appears that, for some older Xeon processors (like the Xeon E5620), some clocking function becomes skewed when overly aggressive dynamic frequency scaling policies are used; the same OpenMPI version does not suffer from this problem on newer Xeons within the same cluster.
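If you suspect the same issue, and assuming a Linux system with the cpufreq sysfs interface and the cpupower utility available (an assumption about your setup), you can inspect and change the governor with:
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
sudo cpupower frequency-set -g performance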
Maybe MPI_Wtime() is a costly operation in itself?
Do the results get more consistent if you avoid measuring the cost of MPI_Wtime() itself as part of the OpenMP time?
E.g.:
double mpi_time1 = MPI_Wtime();
double omp_time1 = omp_get_wtime();
/* do something */
double omp_time2 = omp_get_wtime();
double mpi_time2 = MPI_Wtime();
I have an MPI program that measures a sorting time. I run it with mpirun -np 2 mpiSort, which gives me the sorting time with 2 processes.
I want to measure the sorting time 5 times and average the results. How do I do that automatically?
If I put a loop inside the mpiSort program, it actually executes 5 (times) x 2 (processes) = 10 times.
Edit: mpiSort does the sort in parallel. Basically, I'm trying to run mpirun -np 2 mpiSort without typing it 5 times, because I also want to do the same for 4 cores and 8 cores.
You could run on five cores using mpirun -np 5 mpiSort and add an MPI_Gather at the end. Is the sort code actually using MPI (i.e. does it call MPI_Init at the beginning)? Assuming it is, you can run on 5 cores and simply average at the end with a reduce:
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char *argv[])
{
    int ierr, rank, nprocs, root = 0;
    double time, buf;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    time = 0.5;   // each process would put its own sort time here
    ierr = MPI_Reduce(&time, &buf, 1, MPI_DOUBLE,
                      MPI_SUM, root, MPI_COMM_WORLD);
    if (rank == root) {
        buf = buf / nprocs;   // average over processes
        cout << buf << "\n";
    }

    MPI_Finalize();
}
where time is each process's sort time.
Putting it in a loop is the way to go. I was confused because I got 10 values of endTime = MPI_Wtime() and only used 5 of them, from the root process. Thanks to @EdSmith for his MPI_Reduce code; the correct time is the average over the two processes, obtained with MPI_Reduce:
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
for (int run = 0; run < 5; run++) {
    ...
    endTime = MPI_Wtime();
    totalTime = endTime - startTime;
    MPI_Reduce(&totalTime, &workTime, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    if (rank == root) {
        paraTime = workTime / nProcs;
    }
    ...
}
MPI_Finalize();
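Putting the pieces together, here is a self-contained sketch of the whole pattern; the actual parallel sort is replaced by a placeholder do_sort() (a hypothetical stand-in, not part of the original program):

#include <mpi.h>
#include <iostream>
using namespace std;

// Hypothetical stand-in for the real parallel sort.
static void do_sort() { /* ... sorting work ... */ }

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank, nProcs, root = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nProcs);

    const int runs = 5;
    double sumTime = 0.0;
    for (int run = 0; run < runs; ++run) {
        double startTime = MPI_Wtime();
        do_sort();
        double totalTime = MPI_Wtime() - startTime;

        double workTime = 0.0;
        // Sum the per-process times on the root, then average over processes.
        MPI_Reduce(&totalTime, &workTime, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
        if (rank == root)
            sumTime += workTime / nProcs;
    }
    if (rank == root)
        cout << "average sort time over " << runs << " runs: " << sumTime / runs << "\n";

    MPI_Finalize();
    return 0;
}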
I am trying to sort key-value pairs with qsort. Every proc reads in a file whose filename is its proc id. MPI_Gather sends all the read values to proc 0, which sorts the keys and stores the key-value pairs in a file called "Output". The gather, however, does not seem to work. Any help is appreciated. Thanks!
I run the code as
mpirun -np 3 ./a.out
and my input files are:
File "0":
21 bbbb
2119 iiii
120 hhhh
File "1":
40 dddd
10 aaaa
100 gggg
File "2":
32 cccc
44 eeee
99 ffff
And the code is:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define BUFSIZE 3
#define BUFLEN 255

struct keyval{
    int key;
    char val[BUFLEN];
};

typedef struct keyval keyval_s;
typedef int (*compareptr)(const void*, const void*);

int compare (keyval_s * a, keyval_s * b)
{
    return ( a->key - b->key );
}

int main (int argc, char *argv[])
{
    int values[BUFSIZE];
    keyval_s kv[BUFSIZE], *recv;
    int n, i = 0, temp;
    FILE *in, *out;
    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    char filename[20];
    char data[20];

    if (rank == 0) {
        recv = (keyval_s *) malloc (size*BUFSIZE*sizeof(keyval_s));
    }

    sprintf(filename, "%d", rank);
    in = fopen(filename, "r");
    while (fscanf(in, "%d %s", &kv[i].key, kv[i].val) != EOF) {
        printf("Read key %d, data %s from file\n", kv[i].key, kv[i].val);
        i++;
    }

    MPI_Gather(kv, BUFSIZE, MPI_BYTE, recv, BUFSIZE, MPI_BYTE, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        qsort((void*)&kv, BUFSIZE, sizeof(keyval_s), (compareptr) compare);
        out = fopen("Output", "w");
        for (n = 0; n < BUFSIZE*size; n++)
            fprintf(out, "%d %s\n", recv[n].key, recv[n].val);
        free(recv);
        fclose(out);
    }

    fclose(in);
    return 0;
}
The size of the data in MPI_Gather is incorrect. It should be
MPI_Gather(kv,
           sizeof(keyval_s)*BUFSIZE,
           MPI_BYTE,
           recv,
           sizeof(keyval_s)*BUFSIZE,
           MPI_BYTE,
           0,
           MPI_COMM_WORLD);
Note that the recvcount parameter in MPI_Gather is for a message from a single rank, not the total size of the gathered data.
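As a side note, since the buffers are arrays of structs, an alternative is to describe the struct with a derived datatype so that the gather counts stay in elements rather than bytes. A sketch (make_keyval_type is a hypothetical helper, not part of the original code):

#include <stddef.h>   /* offsetof */

MPI_Datatype make_keyval_type(void)
{
    MPI_Datatype tmp, type;
    int          blocklens[2] = { 1, BUFLEN };
    MPI_Aint     displs[2]    = { offsetof(keyval_s, key), offsetof(keyval_s, val) };
    MPI_Datatype types[2]     = { MPI_INT, MPI_CHAR };

    MPI_Type_create_struct(2, blocklens, displs, types, &tmp);
    /* Force the extent to sizeof(keyval_s) so consecutive structs line up. */
    MPI_Type_create_resized(tmp, 0, sizeof(keyval_s), &type);
    MPI_Type_commit(&type);
    MPI_Type_free(&tmp);
    return type;
}

/* Usage, e.g. inside main(): */
MPI_Datatype kv_type = make_keyval_type();
MPI_Gather(kv, BUFSIZE, kv_type, recv, BUFSIZE, kv_type, 0, MPI_COMM_WORLD);
MPI_Type_free(&kv_type);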