OpenMPI has the MCA parameter mpi_abort_print_stack for printing a back trace after an MPI_ABORT, but the back trace is quite limited in detail. For example, if I compile (mpicxx broadcast.cxx) and run (mpiexec --mca mpi_abort_print_stack 1 -n 4 ./a.out) this simple example code:
#include <mpi.h>

const int N = 10;
int arr[N];

int main(int argc, char** argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Correct:
    MPI_Bcast(arr, N, MPI_INT, 0, MPI_COMM_WORLD);

    // Incorrect:
    const int my_size = (rank == 1) ? N+1 : N;
    MPI_Bcast(arr, my_size, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
I get the following output and back trace:
[manjaro:2223406] *** An error occurred in MPI_Bcast
[manjaro:2223406] *** reported by process [1082195969,3]
[manjaro:2223406] *** on communicator MPI_COMM_WORLD
[manjaro:2223406] *** MPI_ERR_TRUNCATE: message truncated
[manjaro:2223406] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[manjaro:2223406] *** and potentially your MPI job)
[manjaro:2223406] [0] func:/usr/lib/libopen-pal.so.40(opal_backtrace_buffer+0x3b) [0x7f3681231a9b]
[manjaro:2223406] [1] func:/usr/lib/libmpi.so.40(ompi_mpi_abort+0x160) [0x7f368183f040]
[manjaro:2223406] [2] func:/usr/lib/libmpi.so.40(ompi_mpi_errors_are_fatal_comm_handler+0xb9) [0x7f36818369b9]
[manjaro:2223406] [3] func:/usr/lib/libmpi.so.40(ompi_errhandler_invoke+0xd3) [0x7f3681836ab3]
[manjaro:2223406] [4] func:/usr/lib/libmpi.so.40(PMPI_Bcast+0x455) [0x7f36818514b5]
[manjaro:2223406] [5] func:./a.out(+0x7c48) [0x560d4d420c48]
[manjaro:2223406] [6] func:/usr/lib/libc.so.6(+0x29290) [0x7f36812d2290]
[manjaro:2223406] [7] func:/usr/lib/libc.so.6(__libc_start_main+0x8a) [0x7f36812d234a]
[manjaro:2223406] [8] func:./a.out(+0x7ac5) [0x560d4d420ac5]
So it tells me that there is a problem in some MPI_Bcast call, but not exactly which one.
Is it possible to get a more detailed back trace, including e.g. line numbers?
One possibility, as suggested by @talonmies in the comments, is to define a macro that does the error checking after each MPI call. This is similar to the kind of error checking you often see in CUDA code: What is the canonical way to check for errors using the CUDA runtime API?
#include <mpi.h>
#include <cstdio>

#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)

inline void __check_mpi_error(const char *file, const int line, const int n)
{
    char errbuffer[MPI_MAX_ERROR_STRING];
    int errlen;
    if (n != MPI_SUCCESS)
    {
        MPI_Error_string(n, errbuffer, &errlen);
        printf("MPI-error: %s\n", errbuffer);
        printf("Location: %s:%i\n", file, line);
        MPI_Abort(MPI_COMM_WORLD, n);
    }
}

const int N = 10;
int arr[N];

int main(int argc, char** argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    // Correct:
    int err;
    err = MPI_Bcast(arr, N, MPI_INT, 0, MPI_COMM_WORLD);
    check_mpi_error(err);

    // Incorrect:
    const int my_size = (rank == 1) ? N+1 : N;
    err = MPI_Bcast(arr, my_size, MPI_INT, 0, MPI_COMM_WORLD);
    check_mpi_error(err);

    MPI_Finalize();
    return 0;
}
If I compile (mpicxx -o bcast broadcast_debug.cxx) and run (mpiexec -n 4 bcast) this code, I get the following output:
MPI-error: MPI_ERR_TRUNCATE: message truncated
Location: broadcast_debug.cxx:38
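If you mainly want to map the addresses in the original back trace to source lines, another option (a general binutils technique, nothing MPI-specific) is to compile with debug info (mpicxx -g broadcast.cxx) and feed the executable offset that Open MPI prints, e.g. the ./a.out(+0x7c48) frame above, to addr2line; the exact offset will differ for your build:
addr2line -e ./a.out -f -C 0x7c48
With debug info present, this should print the function name and the file:line of the failing MPI_Bcast call.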
Related
I have some code in which, for testing purposes, I removed all sends and only have non-blocking receives. You can imagine my surprise when, using MPI_Test, the flags indicated that some of the requests were actually being completed. My code is set up on a Cartesian grid; a small replica is below, although it doesn't reproduce the error:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h> // for sleep
#include <mpi.h>

void test(int pos);

MPI_Comm comm_cart;

int main(int argc, char *argv[])
{
    int i, j;
    int rank, size;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* code for mpi cartesian grid topology */
    int dim[1];
    dim[0] = 2;
    int periods[1];
    periods[0] = 0;
    int reorder = 1;
    int coords[1];
    MPI_Cart_create(MPI_COMM_WORLD, 1, dim, periods, reorder, &comm_cart);
    MPI_Cart_coords(comm_cart, rank, 1, coords);

    test(coords[0]);

    MPI_Finalize();
    return (0);
}

void test(int pos)
{
    float placeholder[4];
    int other = (pos+1) % 2;
    MPI_Request reqs[8];
    int flags[4];

    for(int iter = 0; iter < 20; iter++){
        // Test requests from previous time cycle
        for(int i = 0; i < 4; i++){
            if(iter == 0) break;
            MPI_Test(&reqs[0], &flags[0], MPI_STATUS_IGNORE);
            printf("Flag: %d\n", flags[0]);
        }
        MPI_Irecv(&placeholder[0], 1, MPI_FLOAT, other, 0, comm_cart, &reqs[0]);
    }
}
Any help would be appreciated.
The issue is with MPI_Test and MPI_PROC_NULL. Quite often when using MPI_Cart_shift you end up with MPI_PROC_NULL neighbours: if you're on the edge of the grid, a neighbouring cell simply doesn't exist in some directions.
I can't find any documentation for this anywhere, so I had to discover it myself: when you post an MPI_Irecv with an MPI_PROC_NULL source, it completes instantly, and when tested using MPI_Test the flag returns true for a completed request. Example code below (with a second sketch after the output showing where those MPI_PROC_NULL neighbours come from):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int t;
    int flag;
    MPI_Request req;

    MPI_Irecv(&t, 1, MPI_INT, MPI_PROC_NULL, 0, MPI_COMM_WORLD, &req);
    MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
    printf("Flag: %d\n", flag);

    MPI_Finalize();
    return (0);
}
Which returns the following when run:
Flag: 1
Flag: 1
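For reference, here is a minimal sketch of where those MPI_PROC_NULL neighbours come from, assuming a 1-D, non-periodic grid of two processes like the one in the question (run with mpiexec -n 2): MPI_Cart_shift returns MPI_PROC_NULL for any shift that points off the edge of the grid, and a receive posted from such a neighbour behaves exactly as in the example above.
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    // 1-D, non-periodic Cartesian topology with 2 processes
    MPI_Comm comm_cart;
    int dim[1]     = {2};
    int periods[1] = {0};
    MPI_Cart_create(MPI_COMM_WORLD, 1, dim, periods, 1, &comm_cart);

    int rank;
    MPI_Comm_rank(comm_cart, &rank);

    // Shift along direction 0; the neighbour that lies off the edge
    // of the (non-periodic) grid comes back as MPI_PROC_NULL
    int left, right;
    MPI_Cart_shift(comm_cart, 0, 1, &left, &right);

    printf("rank %d: left %s, right %s\n", rank,
           (left  == MPI_PROC_NULL) ? "is MPI_PROC_NULL" : "exists",
           (right == MPI_PROC_NULL) ? "is MPI_PROC_NULL" : "exists");

    MPI_Finalize();
    return 0;
}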
I am encountering some strange behavior with MPICH. The following minimal example, which sends a message to the non-existing process with rank -1, causes a deadlock:
// Program: send-err
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(NULL, NULL);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // We are assuming exactly 2 processes for this task
    if (world_size != 2) {
        fprintf(stderr, "World size must be 2 for %s\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    int number;
    if (world_rank == 0) {
        number = -1;
        MPI_Send(&number,         // data buffer
                 1,               // buffer size
                 MPI_INT,         // data type
                 -1,              // destination
                 0,               // tag
                 MPI_COMM_WORLD); // communicator
    } else if (world_rank == 1) {
        MPI_Recv(&number,
                 1,               // buffer size
                 MPI_INT,         // data type
                 0,               // source
                 0,               // tag
                 MPI_COMM_WORLD,  // communicator
                 MPI_STATUS_IGNORE);
    }

    MPI_Finalize();
}
If the call to the send function,
MPI_Send( start, count, datatype, destination_rank, tag, communicator )
uses destination_rank = -2, then the program fails with the error message:
> mpirun -np 2 send-err
Abort(402250246) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Send: Invalid rank, error stack:
PMPI_Send(157): MPI_Send(buf=0x7fffeb411b44, count=1, MPI_INT, dest=MPI_ANY_SOURCE, tag=0, MPI_COMM_WORLD) failed
PMPI_Send(94).: Invalid rank has value -2 but must be nonnegative and less than 2
Based on the error message, I would expect a program that sends a message to the process with rank -1 to fail similarly to the program sending a message to the process with rank -2. What causes this difference in behavior?
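Since the error message shows the literal -2 being reported back as MPI_ANY_SOURCE, one thing worth checking is which reserved ranks your MPI implementation maps these negative values to. Here is a minimal diagnostic sketch, assuming only that the implementation defines the reserved ranks as plain integer constants (both MPICH and Open MPI do, with different values):
#include <stdio.h>
#include <mpi.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_rank == 0) {
        // The numeric values of these reserved ranks are implementation-specific
        printf("MPI_PROC_NULL  = %d\n", MPI_PROC_NULL);
        printf("MPI_ANY_SOURCE = %d\n", MPI_ANY_SOURCE);
    }

    MPI_Finalize();
}
If the literal -1 happens to coincide with MPI_PROC_NULL, then by the usual MPI_PROC_NULL semantics the send completes immediately without transferring anything, which would leave the matching MPI_Recv on rank 1 blocked instead of raising an invalid-rank error.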
I am writing a sample MPI program in which one process sends an integer to another process.
This is my source code:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    // Find out rank, size
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int number;
    if (world_rank == 0) {
        number = -1;
        MPI_Send(&number, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }
    else if (world_rank == 1) {
        MPI_Recv(&number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        printf("Process 1 received number %d from process 0\n",
               number);
    }
}
And this is the error I get when I run mpiexec in the Windows command line:
ERROR: Error reported: failed to set work directory to 'D:\study_documents\Thesis\Nam 4\Demo\Sample Codes\MPI_HelloWorld\Debug' on DESKTOP-EKN1RD3
Error (3) The system cannot find the path specified.
After making the changes mentioned in the comments, I am not getting any output. I'm new to MPI. If I run it with more than two processes, I get two additional lines on my console saying:
1- more process has sent help message help-mpi-errors.txt / mpi_errors_are_fatal
2- Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages.
What am I doing wrong?
This is the complete output on my terminal:
*** An error occurred in MPI_Bcast
*** reported by process [4248174593,1]
*** on communicator MPI_COMM_WORLD
*** MPI_ERR_TYPE: invalid datatype
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
1 more process has sent help message help-mpi-errors.txt / mpi_errors_are_fatal
Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char** argv)
{
    const int server = 0;
    const int source = server;
    float* array = (float*)NULL;
    int length;
    int num_procs, my_rank, mpi_error_code;
    int index;

    mpi_error_code = MPI_Init(&argc, &argv);
    mpi_error_code = MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    mpi_error_code = MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    /* input, allocate, initialize on server only */
    if(my_rank == server){
        scanf("%d", &length);
        array = (float*) malloc(sizeof(float) * length);
        for(index = 0; index < length; index++){
            array[index] = 0.0;
        }
    }

    /* broadcast, output on all processes */
    if(num_procs > 1){
        mpi_error_code = MPI_Bcast(&length, 1, MPI_INT, source, MPI_COMM_WORLD);
        if(my_rank != server){
            array = (float*) malloc(sizeof(float) * length);
        }
        mpi_error_code = MPI_Bcast(array, length, MPI_INT, source, MPI_COMM_WORLD);
        printf("%d: broadcast length = %d\n", my_rank, length);
    }

    mpi_error_code = MPI_Finalize();
}
I have the following code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

static int rank, size;
char msg[] = "This is a test message";

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != 2) {
        fprintf(stderr, "This test requires exactly 2 tasks (has: %d).\n", size);
        MPI_Finalize();
        return -1;
    }

    int run = 1;
    if (argc > 1) {
        run = atoi(argv[1]);
    }

    int len = strlen(msg) + 1;
    if (argc > 2) {
        len = atoi(argv[2]);
    }

    char buf[len];
    strncpy(buf, msg, len);

    MPI_Status statusArray[run];
    MPI_Request reqArray[run];

    double start = MPI_Wtime();

    for (int i = 0; i < run; i++) {
        if (!rank) {
            MPI_Isend(buf, len, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &reqArray[i]);
            printf("mpi_isend for run %d\n", i);
        } else {
            MPI_Irecv(buf, len, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &reqArray[i]);
            printf("mpi_irecv for run %d\n", i);
        }
    }

    int buflen = 512;
    char name[buflen];
    gethostname(name, buflen);
    printf("host: %s has rank %d\n", name, rank);
    printf("Reached here! for host %s before MPI_Waitall \n", name);

    if(!rank) {
        printf("calling mpi_waitall for sending side which is %s\n", name);
        MPI_Waitall(run, &reqArray[0], &statusArray[0]);
    }
    else {
        printf("calling mpi_waitall for receiving side which is %s\n", name);
        MPI_Waitall(run, &reqArray[0], &statusArray[0]);
    }

    printf("finished waiting! for host %s\n", name);
    double end = MPI_Wtime();

    if (!rank) {
        printf("Throughput: %.4f Gbps\n", 1e-9 * len * 8 * run / (end - start));
    }

    MPI_Finalize();
}
I got a seg-fault on the sending side before MPI_Waitall. The error message is:
[host1:27679] *** Process received signal ***
[host1:27679] Signal: Segmentation fault (11)
[host1:27679] Signal code: Address not mapped (1)
[host1:27679] Failing at address: 0x8
[host1:27679] [ 0] /lib64/libpthread.so.0() [0x3ce7e0f500]
[host1:27679] [ 1] /usr/lib64/openmpi/mca_btl_openib.so(+0x21dc7) [0x7f46695c1dc7]
[host1:27679] [ 2] /usr/lib64/openmpi/mca_btl_openib.so(+0x1cbe1) [0x7f46695bcbe1]
[host1:27679] [ 3] /lib64/libpthread.so.0() [0x3ce7e07851]
[host1:27679] [ 4] /lib64/libc.so.6(clone+0x6d) [0x3ce76e811d]
[host1:27679] *** End of error message ***
I think there is something wrong with the array of MPI_Request. Could someone point it out?
Thanks!
I ran your program without a problem (other than a warning for not including unistd.h), so the issue is probably related to your setup of Open MPI. Are you using a machine with an InfiniBand network? If not, you probably want to change to just use the default tcp implementation; your problem might be related to that.
If you want to specify that you'll only use tcp, you should run like this:
mpirun --mca btl tcp,self -n 2 <prog_name> <prog_args>
That will ensure that openib isn't accidentally detected and used when it shouldn't be.
If, on the other hand, you do mean to use InfiniBand, you might have discovered some sort of problem with Open MPI. I doubt that's the case though since you're not doing anything fancy.