MPI: how to distinguish send and recv in MPI_Wait

Let's say I use PMPI to write a wrapper for MPI_Wait, which waits for an MPI send or receive to complete.
/* ================== C Wrappers for MPI_Wait ================== */
_EXTERN_C_ int PMPI_Wait(MPI_Request *request, MPI_Status *status);
_EXTERN_C_ int MPI_Wait(MPI_Request *request, MPI_Status *status) {
    int _wrap_py_return_val = 0;
    _wrap_py_return_val = PMPI_Wait(request, status);
    return _wrap_py_return_val;
}
The wrapper is generated by this.
What I would like to do is:
/* ================== C Wrappers for MPI_Wait ================== */
_EXTERN_C_ int PMPI_Wait(MPI_Request *request, MPI_Status *status);
_EXTERN_C_ int MPI_Wait(MPI_Request *request, MPI_Status *status) {
    int _wrap_py_return_val = 0;
    if (/* is a send request */)
        printf("send\n");
    else /* is a recv request */
        printf("recv\n");
    _wrap_py_return_val = PMPI_Wait(request, status);
    return _wrap_py_return_val;
}
How can I distinguish a send request from a receive request here in Open MPI? Let's say I use Open MPI 3.0.0.

I think that since MPI_Request is opaque (I think in several releases it is just an int), your only option is to keep track of the created MPI_Request values yourself.
Here is a proposition (it is C++-oriented, because that's the way I like it):
#include <mpi.h>
#include <iostream>
#include <cstring> // memcpy
#include <map>
#include <string>

// To impose an ordering on the opaque MPI_Request type
struct RequestConverter
{
    char data[sizeof(MPI_Request)];
    RequestConverter(MPI_Request * mpi_request)
    {
        memcpy(data, mpi_request, sizeof(MPI_Request));
    }
    RequestConverter()
    { }
    RequestConverter(const RequestConverter & req)
    {
        memcpy(data, req.data, sizeof(MPI_Request));
    }
    RequestConverter & operator=(const RequestConverter & req)
    {
        memcpy(data, req.data, sizeof(MPI_Request));
        return *this;
    }
    bool operator<(const RequestConverter & request) const
    {
        for(size_t i=0; i<sizeof(MPI_Request); i++)
        {
            if(data[i]!=request.data[i])
            {
                return data[i]<request.data[i];
            }
        }
        return false;
    }
};
// To store the created MPI_Request
std::map<RequestConverter, std::string> request_holder;

extern "C"
{
    int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
                  MPI_Comm comm, MPI_Request *request)
    {
        int ier = PMPI_Isend(buf, count, datatype, dest, tag, comm, request);
        request_holder[RequestConverter(request)] = "sending";
        return ier;
    }

    int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
                  MPI_Comm comm, MPI_Request *request)
    {
        int ier = PMPI_Irecv(buf, count, datatype, source, tag, comm, request);
        request_holder[RequestConverter(request)] = "receiving";
        return ier;
    }

    int MPI_Wait(MPI_Request *request, MPI_Status *status)
    {
        int myid;
        MPI_Comm_rank(MPI_COMM_WORLD, &myid);
        std::cout << "waiting(" << myid << ")-> " << request_holder[RequestConverter(request)] << std::endl;
        request_holder.erase(RequestConverter(request));
        return PMPI_Wait(request, status);
    }
}
RequestConverter is just a way of imposing an ordering on the opaque MPI_Request type so that it can be used as a std::map key.
MPI_Isend stores the request in the global map, MPI_Irecv does the same, and MPI_Wait looks the request up and deletes it from the std::map.
A simple test gives:
int main(int argc, char **argv)
{
    int myid, numprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    int i = 123456789;
    MPI_Request request;
    MPI_Status status;
    if(myid == 0)
    {
        MPI_Isend(&i, 1, MPI_INT, 1, 44444, MPI_COMM_WORLD, &request);
        MPI_Wait(&request, &status);
        std::cout << myid << ' ' << i << std::endl;
    }
    else if(myid == 1)
    {
        MPI_Irecv(&i, 1, MPI_INT, 0, 44444, MPI_COMM_WORLD, &request);
        MPI_Wait(&request, &status);
        std::cout << myid << ' ' << i << std::endl;
    }

    int * sb = new int[numprocs];
    for(int j = 0; j < numprocs; j++) { sb[j] = (myid+1)*(j+1); }
    int * rb = new int[numprocs];
    MPI_Alltoall(sb, 1, MPI_INT, rb, 1, MPI_INT, MPI_COMM_WORLD);

    MPI_Finalize();
}
output :
waiting(0)-> sending
0 123456789
waiting(1)-> receiving
1 123456789
However, I added the MPI_Alltoall call to check whether collectives internally go through these wrappers: they call only the PMPI functions, so their internal requests are never recorded. No miracle there.
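The same bookkeeping can be extended to the other completion calls. Below is a rough, untested sketch of MPI_Test and MPI_Waitall wrappers reusing the request_holder map above (my addition, not part of the original answer); the same caveat applies, requests created inside collectives never pass through these wrappers.
extern "C"
{
    int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status)
    {
        // Copy the key before the call: a completed request is reset to MPI_REQUEST_NULL.
        RequestConverter key(request);
        int ier = PMPI_Test(request, flag, status);
        if(*flag && request_holder.count(key))
        {
            std::cout << "tested-> " << request_holder[key] << std::endl;
            request_holder.erase(key);
        }
        return ier;
    }

    int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[])
    {
        for(int i = 0; i < count; i++)
        {
            RequestConverter key(&array_of_requests[i]);
            if(request_holder.count(key))
            {
                std::cout << "waiting-> " << request_holder[key] << std::endl;
                request_holder.erase(key);
            }
        }
        return PMPI_Waitall(count, array_of_requests, array_of_statuses);
    }
}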

Related

How to solve this ==31==error Address sanitizer heap buffer overflow error?

/**
* Note: The returned array must be malloced, assume caller calls free().
*/
char memo[10];
int count[500] = {0,};

typedef struct Trie {
    int count;
    struct Trie *next[26];
} Trie;

void Initnode(struct Trie *node) {
    node->count = 0;
    for (int i = 0; i < 26; i++) {
        node->next[i] = NULL;
    }
}

void fixtrie(struct Trie *root, char *wordsitem) {
    int pw = 0;
    struct Trie *cur = root;
    while (wordsitem[pw] != '\0') {
        if (cur->next[wordsitem[pw]-'a'] == NULL) {
            cur->next[wordsitem[pw]-'a'] = malloc(sizeof(struct Trie));
            Initnode(cur->next[wordsitem[pw]-'a']);
            cur = cur->next[wordsitem[pw]-'a'];
            pw++;
        }
        else {
            cur = cur->next[wordsitem[pw]-'a'];
            pw++;
        }
    }
    (cur->count)++;
}

void dfs(struct Trie *root, char **result, int k, int pm) {
    if (root->count > 0) {
        for (int i = 0; i < k; i++) {
            if (root->count > count[i]) {
                for (int j = k-1; j > i; j--) {
                    for (int c = 0; c < 10; c++) result[j][c] = result[j-1][c];
                    count[j] = count[j-1];
                }
                count[i] = root->count;
                for (int j = 0; j < 10; j++) result[i][j] = 0;
                for (int j = 0; j < pm; j++) {
                    result[i][j] = memo[j];
                }
                printf("// %s //", result[i]);
                break;
            }
        }
    }
    printf(" %d ", pm);
    for (int i = 0; i < 26; i++) {
        if (root->next[i] != NULL) {
            printf(" %d ", pm);
            memo[pm] = i + 'a';
            printf("%s", memo);
            dfs(root->next[i], result, k, pm+1);
        }
    }
    return;
}

char **topKFrequent(char **words, int wordsSize, int k, int *returnSize) {
    for (int j = 0; j < 500; j++) {
        count[j] = 0;
    }
    for (int j = 0; j < 10; j++) {
        memo[j] = 0;
    }
    *returnSize = k;
    struct Trie *root = malloc(sizeof(struct Trie) * 1);
    Initnode(root);
    for (int i = 0; i < wordsSize; i++) {
        fixtrie(root, words[i]);
    }
    for (int i = 0; i < 26; i++) {
        if (root->next[i] != NULL) {
        }
    }
    char **result = (char **) malloc(sizeof(char *) * k);
    for (int i = 0; i < k; i++) {
        result[i] = malloc(sizeof(char) * 10);
    }
    dfs(root, result, k, 0);
    printf("happy");
    for (int i = 0; i < k; i++) {
        printf("\n %s", result[i]);
    }
    printf("\n");
    return result;
}
This is problem 692 on LeetCode.
I found that the error occurs when a word's length equals 10, but I don't know how to fix my code.
Input which occurs error is that ["glarko","zlfiwwb","nsfspyox","pwqvwmlgri","qggx","qrkgmliewc","zskaqzwo","zskaqzwo","ijy","htpvnmozay","jqrlad","ccjel","qrkgmliewc","qkjzgws","fqizrrnmif","jqrlad","nbuorw","qrkgmliewc","htpvnmozay","nftk","glarko","hdemkfr","axyak","hdemkfr","nsfspyox","nsfspyox","qrkgmliewc","nftk","nftk","ccjel","qrkgmliewc","ocgjsu","ijy","glarko","nbuorw","nsfspyox","qkjzgws","qkjzgws","fqizrrnmif","pwqvwmlgri","nftk","qrkgmliewc","jqrlad","nftk","zskaqzwo","glarko","nsfspyox","zlfiwwb","hwlvqgkdbo","htpvnmozay","nsfspyox","zskaqzwo","htpvnmozay","zskaqzwo","nbuorw","qkjzgws","zlfiwwb","pwqvwmlgri","zskaqzwo","qengse","glarko","qkjzgws","pwqvwmlgri","fqizrrnmif","nbuorw","nftk","ijy","hdemkfr","nftk","qkjzgws","jqrlad","nftk","ccjel","qggx","ijy","qengse","nftk","htpvnmozay","qengse","eonrg","qengse","fqizrrnmif","hwlvqgkdbo","qengse","qengse","qggx","qkjzgws","qggx","pwqvwmlgri","htpvnmozay","qrkgmliewc","qengse","fqizrrnmif","qkjzgws","qengse","nftk","htpvnmozay","qggx","zlfiwwb","bwp","ocgjsu","qrkgmliewc","ccjel","hdemkfr","nsfspyox","hdemkfr","qggx","zlfiwwb","nsfspyox","ijy","qkjzgws","fqizrrnmif","qkjzgws","qrkgmliewc","glarko","hdemkfr","pwqvwmlgri"]
14
I expect a correct answer, but instead it fails with the AddressSanitizer heap-buffer-overflow error shown in the (omitted) screenshot.
How can I fix my code?
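A likely culprit (my guess, not a confirmed fix): memo and each result[i] are exactly 10 bytes, so a 10-letter word such as "hwlvqgkdbo" fills the buffer completely and leaves no room for a terminating '\0'; printf("%s", ...) then reads past the end of the array, which AddressSanitizer reports as a buffer overflow. A minimal, self-contained reproduction of that out-of-bounds read, assuming 10-character input words:
#include <stdio.h>
#include <string.h>

int main(void) {
    char memo[10];                    /* same size as in the question            */
    memcpy(memo, "hwlvqgkdbo", 10);   /* a 10-letter word from the input fills   */
                                      /* the buffer with no room for '\0'        */
    printf("%s\n", memo);             /* %s keeps reading past memo[9];          */
                                      /* AddressSanitizer flags this read        */
    return 0;
}
If that is indeed the problem, sizing memo and each result[i] at 11 bytes and writing a terminating '\0' after the last copied character should make the overflow go away.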

MPI_Test returning true flags for requests despite never sending?

I have some code in which, for testing purposes, I removed all sends and only have non-blocking receives. You can imagine my surprise when, using MPI_Test, the flags indicated that some of the requests were actually being completed. My code is set up on a Cartesian grid, with a small replica below, although this doesn't reproduce the error:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h> // for sleep
#include <mpi.h>

void test(int pos);
MPI_Comm comm_cart;

int main(int argc, char *argv[])
{
    int i, j;
    int rank, size;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* code for MPI cartesian grid topology */
    int dim[1];
    dim[0] = 2;
    int periods[1];
    periods[0] = 0;
    int reorder = 1;
    int coords[1];
    MPI_Cart_create(MPI_COMM_WORLD, 1, dim, periods, 1, &comm_cart);
    MPI_Cart_coords(comm_cart, rank, 2, coords);

    test(coords[0]);
    MPI_Finalize();
    return (0);
}

void test(int pos)
{
    float placeholder[4];
    int other = (pos+1) % 2;
    MPI_Request reqs[8];
    int flags[4];
    for(int iter = 0; iter < 20; iter++){
        // Test requests from previous time cycle
        for(int i=0;i<4;i++){
            if(iter == 0) break;
            MPI_Test(&reqs[0], &flags[0], MPI_STATUS_IGNORE);
            printf("Flag: %d\n", flags[0]);
        }
        MPI_Irecv(&placeholder[0], 1, MPI_FLOAT, other, 0, comm_cart, &reqs[0]);
    }
}
Any help would be appreciated.
The issue is with MPI_Test and MPI_PROC_NULL. Quite often when using MPI_Cart_shift you end up with MPI_PROC_NULL neighbours: if you're on the edge of the grid, a neighbouring cell simply doesn't exist in some directions.
I can't find any documentation for this anywhere, so I had to discover it myself, but when you do an MPI_Irecv with an MPI_PROC_NULL source, it will instantly complete and when tested using MPI_Test, the flag will return true for a completed request. Example code below:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int t;
    int flag;
    MPI_Request req;

    MPI_Irecv(&t, 1, MPI_INT, MPI_PROC_NULL, 0, MPI_COMM_WORLD, &req);
    MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
    printf("Flag: %d\n", flag);

    MPI_Finalize();
    return (0);
}
Which returns the following when run:
Flag: 1
Flag: 1
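If the instant completion is unwanted, one option (my sketch, not from the original answer) is to query the Cartesian neighbours first and only post and test receives for ranks that are not MPI_PROC_NULL:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int dim[1] = {2}, periods[1] = {0};
    MPI_Comm comm_cart;
    MPI_Cart_create(MPI_COMM_WORLD, 1, dim, periods, 1, &comm_cart);

    if (comm_cart != MPI_COMM_NULL) {            // ranks outside the 1x2 grid get MPI_COMM_NULL
        int left, right;
        MPI_Cart_shift(comm_cart, 0, 1, &left, &right);   // edges get MPI_PROC_NULL (periods[0] == 0)

        int rank;
        MPI_Comm_rank(comm_cart, &rank);
        printf("rank %d: left=%s right=%s\n", rank,
               left  == MPI_PROC_NULL ? "MPI_PROC_NULL" : "real",
               right == MPI_PROC_NULL ? "MPI_PROC_NULL" : "real");
        // Only call MPI_Irecv/MPI_Test for the "real" neighbours; requests
        // involving MPI_PROC_NULL complete immediately, as shown above.
    }

    MPI_Finalize();
    return 0;
}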

Is there some way of avoiding this implicit MPI_Allreduce() synchronisation?

I'm writing an MPI program that uses a library which makes its own MPI calls. In my program, I have a loop that calls a function from the library. The function that I'm calling from the library makes use of MPI_Allreduce.
The problem here is that in my program, some of the ranks can exit the loop before others and this causes the MPI_Allreduce call to just hang since not all ranks will be calling MPI_Allreduce again.
Is there any way of programming around this without modifying the sources of the library I'm using?
Below is the code for an example which demonstrates the execution pattern.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <math.h>
#include <assert.h>

#define N_ITEMS 100000
#define ITERATIONS 32

float *create_rand_nums(int num_elements) {
    float *rand_nums = (float *)malloc(sizeof(float) * num_elements);
    assert(rand_nums != NULL);
    int i;
    for (i = 0; i < num_elements; i++) {
        rand_nums[i] = (rand() / (float)RAND_MAX);
    }
    return rand_nums;
}

void reduce_stddev(int world_rank, int world_size, int num_elements_per_proc)
{
    fprintf(stdout, "Calling %s: %d\n", __func__, world_rank);
    fflush(stdout);
    srand(time(NULL)*world_rank);
    float *rand_nums = NULL;
    rand_nums = create_rand_nums(num_elements_per_proc);

    float local_sum = 0;
    int i;
    for (i = 0; i < num_elements_per_proc; i++) {
        local_sum += rand_nums[i];
    }

    float global_sum;
    fprintf(stdout, "%d: About to call all reduce\n", world_rank);
    fflush(stdout);
    MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM,
                  MPI_COMM_WORLD);
    fprintf(stdout, "%d: done calling all reduce\n", world_rank);
    fflush(stdout);
    float mean = global_sum / (num_elements_per_proc * world_size);

    float local_sq_diff = 0;
    for (i = 0; i < num_elements_per_proc; i++) {
        local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
    }

    float global_sq_diff;
    MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0,
               MPI_COMM_WORLD);

    if (world_rank == 0) {
        float stddev = sqrt(global_sq_diff /
                            (num_elements_per_proc * world_size));
        printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
    }
    free(rand_nums);
}

int main(int argc, char* argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: avg num_elements_per_proc\n");
        exit(1);
    }
    int num_elements_per_proc = atoi(argv[1]);

    MPI_Init(NULL, NULL);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    unsigned long long j = 0;
    for(j = 0; j < ITERATIONS; j++)
    {
        /* Function which calls MPI_Allreduce */
        reduce_stddev(world_rank, world_size, num_elements_per_proc);

        /* Simulates some processes leaving the loop early */
        if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
        {
            fprintf(stdout, "%d exiting\n", world_rank);
            fflush(stdout);
            break;
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
This is always an issue in MPI: how do you tell all the other ranks when one rank is finished? The easiest approach is for every rank to set a true/false flag and then do an allreduce to see if anyone has finished. Using this code for the loop seems to work:
for(j = 0; j < ITERATIONS; j++)
{
    /* Function which calls MPI_Allreduce */
    reduce_stddev(world_rank, world_size, num_elements_per_proc);

    int finished = 0;

    /* Simulates some processes leaving the loop early */
    if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
    {
        fprintf(stdout, "%d finished\n", world_rank);
        fflush(stdout);
        finished = 1;
    }

    /* Check to see if anyone has finished */
    int anyfinished;
    MPI_Allreduce(&finished, &anyfinished, 1, MPI_INT, MPI_LOR,
                  MPI_COMM_WORLD);
    if (anyfinished)
    {
        fprintf(stdout, "%d exiting\n", world_rank);
        break;
    }
}
OK - I just reread your question and maybe I misunderstood it. Do you want everyone else to keep calculating?
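If that is the intent, a hedged variant of the same idea (my sketch, untested, not part of the original answer) is to flip the reduction around: every rank keeps calling the library routine, and the loop is only left once all ranks report they are finished, so the library's internal MPI_Allreduce always sees every rank participating:
for (j = 0; j < ITERATIONS; j++)
{
    /* Function which calls MPI_Allreduce */
    reduce_stddev(world_rank, world_size, num_elements_per_proc);

    /* This rank no longer needs more iterations (sticky condition) */
    int done = (j >= ITERATIONS/2) && (world_rank % 2 == 0);

    /* Leave only when *every* rank is done; until then, "finished" ranks
       keep calling reduce_stddev so its collectives stay matched */
    int alldone = 0;
    MPI_Allreduce(&done, &alldone, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
    if (alldone)
        break;
}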

What might cause OpenCL to crash on cl::Program.build?

This program crashes when I call cl::Program::build(), but I don't know why. It crashes on the last line of this block of code:
#define __NO_STD_VECTOR
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <CL/cl.h>

using namespace std;
using namespace cl;

int _tmain(int argc, _TCHAR* argv[])
{
    int tmpSize = 1024;
    float **my2D = new float*[tmpSize];
    for(int i = 0; i < tmpSize; i++)
    {
        my2D[i] = new float[tmpSize];
        for(int i2 = 0; i2 < tmpSize; i2++)
        {
            my2D[i][i2] = 5;
        }
    }

    cl::vector <Platform> platforms;
    Platform::get(&platforms);

    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[1]()), 0};
    Context context(CL_DEVICE_TYPE_ALL, cps);
    cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CommandQueue queue = CommandQueue(context, devices[0], 0);

    int W = tmpSize; //i.width();
    int H = tmpSize; //i.height();

    Buffer d_ip = Buffer(context, CL_MEM_READ_ONLY, W*H*sizeof(float));
    Buffer d_op = Buffer(context, CL_MEM_WRITE_ONLY, W*H*sizeof(float));
    queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), my2D);

    std::ifstream sourceFileName("c:\\users\\me\\desktop\\img_rotate_kernel.cl");
    std::string sourceFile(istreambuf_iterator<char>(sourceFileName), (istreambuf_iterator<char>()));
    Program::Sources rotn_source(1, std::make_pair(sourceFile.c_str(), sourceFile.length() + 1));
    Program rotn_program(context, rotn_source);
    rotn_program.build(devices); // <----- CRASHES HERE
}
using this kernel
__kernel void img_rotate(__global float* dest_data, __global float* src_data, int W, int H, float sinTheta, float cosTheta)
    const int ix = get_global_id(0);
    const int iy = get_global_id(1);
    float x0 = W/2.0f;
    float y0 = W/2.0f;
    float xOff = ix - x0;
    float yOff = iy - y0;
    int xpos = (int)(xOff*cosTheta + yOff*sinTheta + x0);
    int ypos = (int)(yOff*cosTheta - yOff*sinTheta + y0);
    if(((int)xpos>=0) && ((int)xpos < W) && ((int)ypos>=0) && ((int)ypos<H))
    {
        dest_data[iy*W+ix] = src_data[ypos*W+xpos];
    }
}
Here is the exception dialog I get when it crashes (screenshot omitted).
From the OpenCL C++ wrapper spec:
cl::Program::Program returns a valid program object and err is set to CL_SUCCESS if the program object is
created successfully. Otherwise, it returns one of the following error values returned in err [...]
Your program object was likely not created properly; change your program construction call to use the err parameter, following this signature:
cl::Program::Program(const Context& context, const Sources& sources, cl_int * err = NULL)
And make sure err == CL_SUCCESS before doing anything else with your program object.
Most OpenCL calls allow you to pass a pointer to an error parameter. You should really do so and check it after your calls (at least in debug builds I guess) to reduce future headaches.
OK, so I modified your source code a little. Here it is; I'll explain my changes right after.
#define __NO_STD_VECTOR
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <CL/cl.h>

#define ARRAY_SIZE 128

using namespace std;
using namespace cl;

int main(int, char**)
{
    int err;

    float my2D[ARRAY_SIZE * ARRAY_SIZE] = { 0 };
    for(int i = 0; i < ARRAY_SIZE * ARRAY_SIZE; i++)
    {
        my2D[i] = 5;
    }

    cl::vector <Platform> platforms;
    err = Platform::get(&platforms);
    if(err != CL_SUCCESS) {
        std::cout << "Platform::get failed - " << err << std::endl;
        std::cin.get();
    }

    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0]()), 0 };
    Context context(CL_DEVICE_TYPE_ALL, cps, nullptr, nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Context::Context failed - " << err << std::endl;
        std::cin.get();
    }

    cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(&err);
    if(err != CL_SUCCESS) {
        std::cout << "Context::getInfo failed - " << err << std::endl;
        std::cin.get();
    }

    CommandQueue queue = CommandQueue(context, devices[0], 0, &err);
    if(err != CL_SUCCESS) {
        std::cout << "CommandQueue::CommandQueue failed - " << err << std::endl;
        std::cin.get();
    }

    int W = ARRAY_SIZE; //i.width();
    int H = ARRAY_SIZE; //i.height();

    Buffer d_ip = Buffer(context, CL_MEM_READ_ONLY, W*H*sizeof(float), nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Buffer::Buffer 1 failed - " << err << std::endl;
        std::cin.get();
    }

    Buffer d_op = Buffer(context, CL_MEM_WRITE_ONLY, W*H*sizeof(float), nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Buffer::Buffer 2 failed - " << err << std::endl;
        std::cin.get();
    }

    err = queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), &my2D[0]);
    if(err != CL_SUCCESS) {
        std::cout << "Queue::enqueueWriteBuffer failed - " << err << std::endl;
        std::cin.get();
    }

    std::ifstream sourceFileName("so_question.cl");
    std::string sourceFile(std::istreambuf_iterator<char>(sourceFileName), (std::istreambuf_iterator<char>()));
    Program::Sources rotn_source(1, std::make_pair(sourceFile.c_str(), sourceFile.length() + 1));

    Program rotn_program(context, rotn_source, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Program::Program failed - " << err << std::endl;
        std::cin.get();
    }

    err = rotn_program.build(devices);
    if(err != CL_SUCCESS) {
        std::cout << "Program::build failed - " << err << std::endl;
        std::cin.get();
    }
}
You'll notice I added a lot more error checks. This allowed me to find out that the call to Context::Context actually did fail in your initial program. The issue was likely that platforms[1] didn't exist (there was only one element in the vector), so I switched it to platforms[0].
Once that was fixed, I was getting an access violation on the queue.enqueueWriteBuffer() call. The issue was that your 2-dimensional array was actually an array of heap-allocated arrays. That's a problem because OpenCL expects to read the data from contiguous memory, which is not the case when allocating with new in a loop like you did: there is no guarantee that the row arrays end up next to each other in memory.
To fix this, I allocated a one-dimensional array on the stack (see the loop at the beginning). The call then becomes:
queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), &my2D[0]);
However, you probably won't be able to do that with a 1024 x 1024 array of float, because you'll run out of stack space. If you need an array that big, you probably want to new a single one-dimensional array large enough to contain your data and do the index arithmetic yourself, as sketched below. This ensures you get the entire storage space as one contiguous chunk.
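For completeness, here is a minimal sketch of that contiguous heap-allocated alternative (my example, not part of the original answer); the names mirror the ones used above:
#include <cstdio>

int main()
{
    const int W = 1024, H = 1024;
    float *my2D = new float[W * H];      // one contiguous block on the heap
    for (int y = 0; y < H; y++)
        for (int x = 0; x < W; x++)
            my2D[y * W + x] = 5.0f;      // element (x, y) lives at index y*W + x

    // The whole block can then be uploaded with a single call, e.g.
    //   queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W * H * sizeof(float), my2D);
    std::printf("%f\n", my2D[123 * W + 456]);
    delete[] my2D;
    return 0;
}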
The code now errors with CL_BUILD_PROGRAM_FAILURE on the err = rotn_program.build() call which means there's probably an error in your CL program code. Since this is an entirely different issue, I'll let you figure this one out.
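Two hedged pointers for that last step (my addition, not from the original answer): first, if the kernel file really matches what is pasted in the question, the opening '{' after the kernel signature is missing, which by itself would make the build fail; second, on CL_BUILD_PROGRAM_FAILURE the compiler's messages can be read back from the build log via cl::Program::getBuildInfo, for example appended after the build() call in the modified program above:
if (err == CL_BUILD_PROGRAM_FAILURE) {
    // Print the device compiler's diagnostics for the failed build
    std::string log = rotn_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
    std::cout << "Build log:" << std::endl << log << std::endl;
}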

MPI hangs on MPI_Send for large messages

There is a simple program in C++/MPI (MPICH2) which sends an array of type double. If the size of the array is more than 9000, then the program hangs during the call to MPI_Send. If the array is smaller than 9000 (8000, for example), the program works fine. Source code is below:
main.cpp
#include <mpi.h>
#include <iostream>
#include <cstdlib>

using namespace std;

Cube** cubes;
int cubesLen;

double* InitVector(int N) {
    double* x = new double[N];
    for (int i = 0; i < N; i++) {
        x[i] = i + 1;
    }
    return x;
}

void CreateCubes() {
    cubes = new Cube*[12];
    cubesLen = 12;
    for (int i = 0; i < 12; i++) {
        cubes[i] = new Cube(9000);
    }
}

void SendSimpleData(int size, int rank) {
    Cube* cube = cubes[0];
    int nodeDest = rank + 1;
    if (nodeDest > size - 1) {
        nodeDest = 1;
    }
    double* coefImOut = (double *) malloc(sizeof (double)*cube->coefficentsImLength);
    cout << "Before send" << endl;
    int count = cube->coefficentsImLength;
    MPI_Send(coefImOut, count, MPI_DOUBLE, nodeDest, 0, MPI_COMM_WORLD);
    cout << "After send" << endl;
    free(coefImOut);

    MPI_Status status;
    double *coefIm = (double *) malloc(sizeof(double)*count);
    int nodeFrom = rank - 1;
    if (nodeFrom < 1) {
        nodeFrom = size - 1;
    }
    MPI_Recv(coefIm, count, MPI_DOUBLE, nodeFrom, 0, MPI_COMM_WORLD, &status);
    free(coefIm);
}

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    CreateCubes();

    if (rank != root) {
        SendSimpleData(size, rank);
    }

    MPI_Finalize();
    return 0;
}
class Cube
class Cube {
public:
    Cube(int size);
    Cube(const Cube& orig);
    virtual ~Cube();
    int Id() { return id; }
    void Id(int id) { this->id = id; }
    int coefficentsImLength;
    double* coefficentsIm;
private:
    int id;
};

Cube::Cube(int size) {
    this->coefficentsImLength = size;
    coefficentsIm = new double[size];
    for (int i = 0; i < size; i++) {
        coefficentsIm[i] = 1;
    }
}

Cube::Cube(const Cube& orig) {
}

Cube::~Cube() {
    delete[] coefficentsIm;
}
The program runs on 4 processes:
mpiexec -n 4 ./myApp1
Any ideas?
The details of the Cube class aren't relevant here: consider a simpler version
#include <mpi.h>
#include <cstdlib>
#include <iostream>
using namespace std;

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank != root) {
        int nodeDest = rank + 1;
        if (nodeDest > size - 1) {
            nodeDest = 1;
        }
        int nodeFrom = rank - 1;
        if (nodeFrom < 1) {
            nodeFrom = size - 1;
        }

        MPI_Status status;
        int *data = new int[datasize];
        for (int i=0; i<datasize; i++)
            data[i] = rank;

        cout << "Before send" << endl;
        MPI_Send(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);
        cout << "After send" << endl;
        MPI_Recv(data, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);

        delete [] data;
    }
    MPI_Finalize();
    return 0;
}
where running gives
$ mpirun -np 4 ./send 1
Before send
After send
Before send
After send
Before send
After send
$ mpirun -np 4 ./send 65000
Before send
Before send
Before send
If in DDT you looked at the message queue window, you'd see everyone is sending, and no one is receiving, and you have a classic deadlock.
MPI_Send's semantics, weirdly, aren't well defined, but it is allowed to block until "the receive has been posted". MPI_Ssend is clearer in this regard; it will always block until the receive has been posted. Details about the different send modes can be seen here.
The reason it worked for smaller messages is an accident of the implementation; for "small enough" messages (for your case, it looks to be <64kB), your MPI_Send implementation uses an "eager send" protocol and doesn't block on the receive; for larger messages, where it isn't necessarily safe just to keep buffered copies of the message kicking around in memory, the Send waits for the matching receive (which it is always allowed to do anyway).
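One way to convince yourself of this (my sketch, not from the original answer): swap MPI_Send for MPI_Ssend in the simpler test program above; without the eager path, the ring deadlocks even for a one-element message.
// In place of the MPI_Send/MPI_Recv pair in the test program above:
MPI_Ssend(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);   // always waits for the matching receive
MPI_Recv (data, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);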
There are a few things you could do to avoid this; all you have to do is make sure not everyone is calling a blocking MPI_Send at the same time. You could (say) have even processors send first, then receive, and odd processors receive first and then send. You could use nonblocking communications (Isend/Irecv/Waitall); a sketch of that variant follows after the output at the end of this answer. But the simplest solution in this case is to use MPI_Sendrecv, which is a blocking (send + receive) in a single call, rather than a blocking send followed by a blocking receive. The send and the receive execute concurrently, and the function blocks until both are complete. So this works:
#include <mpi.h>
#include <cstdlib>
#include <iostream>
using namespace std;

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank != root) {
        int nodeDest = rank + 1;
        if (nodeDest > size - 1) {
            nodeDest = 1;
        }
        int nodeFrom = rank - 1;
        if (nodeFrom < 1) {
            nodeFrom = size - 1;
        }

        MPI_Status status;
        int *outdata = new int[datasize];
        int *indata  = new int[datasize];
        for (int i=0; i<datasize; i++)
            outdata[i] = rank;

        cout << "Before sendrecv" << endl;
        MPI_Sendrecv(outdata, datasize, MPI_INT, nodeDest, 0,
                     indata, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
        cout << "After sendrecv" << endl;

        delete [] outdata;
        delete [] indata;
    }
    MPI_Finalize();
    return 0;
}
Running gives
$ mpirun -np 4 ./send 65000
Before sendrecv
Before sendrecv
Before sendrecv
After sendrecv
After sendrecv
After sendrecv
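For reference, the nonblocking Isend/Irecv/Waitall variant mentioned earlier would look roughly like this in place of the MPI_Sendrecv call (my sketch, untested, not part of the original answer):
MPI_Request reqs[2];
MPI_Irecv(indata,  datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &reqs[0]);
MPI_Isend(outdata, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD, &reqs[1]);
// Both transfers are in flight before either is completed, so no rank
// blocks while its partner is still only sending.
MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);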
