I am working on code that implements Cannon's matrix multiplication algorithm.
Cannon's algorithm is described in the following fragment in pseudocode:
Executed in parallel:
    circular shift of the submatrices A_{i,x} by i positions to the left
    circular shift of the submatrices B_{x,j} by j positions upwards
for k = 0 to n/p - 1
    Executed in parallel:
        C_{i,j} = C_{i,j} + A_{i,j} * B_{i,j}
        circular shift of the submatrices A_{i,x} by 1 position to the left
        circular shift of the submatrices B_{x,j} by 1 position upwards
However my code seems to get blocked in the for loop after sending the submatrix B.
int main(int argc, char* argv[])
read_input_files(argc, argv);
int rank, size, i, j, shift;
//print_matrix(N, A, 0);
//print_matrix(N, B, 0);
//print_matrix(N, AB);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm comm;
MPI_Status status;
int left, right, up, down;
int shiftsource, shiftdest;
int dims[2] = { 0, 0 }, periods[2] = { 1, 1 }, coords[2];
MPI_Dims_create(size, 2, dims);
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm);
MPI_Cart_coords(comm, rank, 2, coords);
MPI_Cart_shift(comm, 1, -1, &right, &left);
MPI_Cart_shift(comm, 0, -1, &down, &up);
//printf("%d --- %d %d %d %d.\n", rank, right, left, up, down);
if (dims[0] != dims[1]) {
printf("The number of processors must be a perfect square.\n");
if (rank == 0)
printf("The number of processors must be a perfect square.\n");
return 0;
int block_size = N / sqrt(size);
cout << rank << " : dims " << dims[0] << "---------------------------------------" << endl;
int* A_sub = make_sub(A, rank, block_size, size);
int* B_sub = make_sub(B, rank, block_size, size);
int* AB_sub = (int*)calloc(block_size * block_size, sizeof(int));
//print_submatrix(block_size, A_sub, rank);
cout << rank << " : coords " << coords[0] << " * " << coords[1] << "---------------------------------------" << endl;
MPI_Cart_shift(comm, 0, -coords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(A_sub, block_size * block_size, MPI_INT, shiftdest, 1, shiftsource, 1, comm, &status);
cout << rank << " : MPI_Sendrecv_replace A_sub " << endl;
//print_submatrix(block_size, A_sub, rank);
MPI_Cart_shift(comm, 1, -coords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(B_sub, block_size * block_size, MPI_INT, shiftdest, 1, shiftsource, 1, comm, &status);
cout << rank << " : MPI_Sendrecv_replace B_sub " << endl;
for (shift = 0;shift < dims[0];shift++) {
for (i = 0;i < block_size;i++) {
for (j = 0;j < block_size;j++)
for (k = 0;k < block_size;k++) {
AB_sub[i * block_size + j] += A_sub[i * block_size + k] * B_sub[k * block_size + j];
if(shift == dims[0]-1) print_submatrix(block_size, AB_sub, rank);
MPI_Cart_shift(comm, 1, 1, &left, &right);
MPI_Sendrecv_replace(A_sub, block_size * block_size, MPI_INT, left, 1, right, 1, comm, MPI_STATUS_IGNORE);
cout << rank << " : MPI_Sendrecv_replace A " << endl;
MPI_Cart_shift(comm, 0, 1, &up, &down);
MPI_Sendrecv_replace(B_sub, block_size * block_size, MPI_INT, up, 1, down, 1, comm, MPI_STATUS_IGNORE);
cout << rank << " : MPI_Sendrecv_replace B " <<endl;
//print_matrix(N, AB, rank);
//cout << rank << " : coords " << coords[0] << " * " << coords[1] << "---------------------------------------" << endl;
MPI_Gather(&AB_sub, block_size*block_size, MPI_INT, AB, N*N, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Finalize();// MPI_Comm_free(&comm); Free up communicator
//print_matrix(N, AB, 0);
return 0;
read_input_files() is a function that reads the two matrices from the files given as command-line arguments.
A matrix after reading file:
0 1 2 3 4 5
6 7 8 9 10 11
12 13 14 15 16 17
18 19 20 21 22 23
24 25 26 27 28 29
30 31 32 33 34 35
B matrix after reading file:
0 1 2 3 4 5
6 7 8 9 10 11
12 13 14 15 16 17
18 19 20 21 22 23
24 25 26 27 28 29
30 31 32 33 34 35
N is the size of matrix, N is 6 in this case.
Your call MPI_Cart_shift(comm, 1, -coords[1], has a strange shift parameter: you're shifting by something depending on the coordinate. That should probably be MPI_Cart_shift(comm,1,-1.
I'm trying to use one-sided communications in MPI.
The following example consists of an array of 4 doubles that is split between 2 processes.
The first process writes 0, 1, 2, 3 in the distributed array while the second one subsequently tries to read it. Unfortunately, it doesn't work. I must be doing something wrong somewhere.
#include <mpi.h>
#include <iostream>
int main(){
MPI_Init(0, nullptr);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int n=2;
double* data, x;
MPI_Win window;
MPI_Alloc_mem(n*sizeof(double), MPI_INFO_NULL, &data);
MPI_Win_create(data, n*sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &window);
int i;
MPI_Win_fence(0, window);
for(i=0; i<n*size; ++i){
MPI_Put(&x, 1, MPI_DOUBLE, i/n, i%n, 1, MPI_DOUBLE, window);
MPI_Win_fence(0, window);
MPI_Win_fence(0, window);
for(i=0; i<n*size; ++i){
MPI_Get(&x, 1, MPI_DOUBLE, i/n, i%n, 1, MPI_DOUBLE, window);
std::cout << i << " " << i/n << " " << i%n << " => " << x << "\n";
return 0;
I need to send pieces of an array to all processes using MPI_Scatter and then compute the sum of all elements. Where should I initialize the array that is to be scattered? On the root rank?
If I initialize the array on the root rank, the other ranks don't get their data. Otherwise, I can initialize the array on every rank (outside of if(rank == root)...else), but that means I create the array several times.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <iostream>
#include <time.h>
using namespace std;
// Scatters an array of 2 * size random ints from rank 0, sums the local piece
// on every rank and reduces the partial sums onto rank 0.
// NOTE(review): the brace-only lines of this listing are missing, so the exact
// extent of the `if (rank == 0)` branch below cannot be read from the text.
int main(int argc, char* argv[])
int size;
int rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Two elements per process.
int arr_size = size * 2;
int block = arr_size / (size);
// Receive buffer for this rank's piece.
int* B = new int[block];
if (rank == 0)
// NOTE(review): A is declared inside the rank-0 branch. For the MPI_Scatter
// call below to name A it has to sit in that same branch, in which case only
// rank 0 takes part in the collective and B stays uninitialised elsewhere.
int* A = new int[arr_size];
cout << "generated array: " << endl;
for (int i = 0; i < arr_size; i++)
A[i] = rand() % 100;
cout << A[i] << " ";
cout << endl;
MPI_Scatter(A, block, MPI_INT, B, block, MPI_INT, 0, MPI_COMM_WORLD);
cout << "process " << rank << " received: " << endl;
for (int i = 0; i < block; i++)
cout << B[i] << " ";
cout << endl;
// Partial sum of the elements held in B on this rank.
int local_sum = 0;
for (int i = 0; i < block; i++)
local_sum += B[i];
cout << "sum in process " << rank << " = " << local_sum << endl;
cout << endl;
// global_sum is written by MPI_Reduce on the root (rank 0).
int global_sum;
MPI_Reduce(&local_sum, &global_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
cout << "sum = " << global_sum << endl;
// NOTE(review): no MPI_Finalize call and no delete[] for A / B appear in this listing.
return 0;
I get something like this (only root rank got its data):
process 1 received:
process 3 received:
-842150451 -842150451
-842150451 -842150451
sum in process 1 = -1684300902
sum in process 3 = -1684300902
process 2 received:
-842150451 -842150451
sum in process 2 = -1684300902
process 0 received:
4 9
sum in process 0 = 13
sum = -757935397
MPI_Scatter() is a collective operation and must hence be invoked by all the ranks.
Declare int *A = NULL; on all ranks and only allocate and populate on rank zero.
// A is declared on every rank so that all ranks can make the MPI_Scatter call,
// but only the root allocates and fills it; the send buffer argument is
// significant only at the root.
int* A = nullptr;
int* B = new int[block];
if (rank == 0)
{
    A = new int[arr_size];
    cout << "generated array: " << endl;
    for (int i = 0; i < arr_size; i++)
    {
        A[i] = rand() % 100;
        cout << A[i] << " ";
    }
    cout << endl;
}
// Collective call: executed by all ranks, outside the rank-0 branch.
MPI_Scatter(A, block, MPI_INT, B, block, MPI_INT, 0, MPI_COMM_WORLD);
I'm exploring MPI in C++ and I wanted to parallelize the creation of a picture of the Mandelbrot set. I'm using the ppm format. Each processor builds its part and sends it back to the main process that receives it as MPI_CHAR. This is the code:
#include "mpi.h"
#include <iostream>
#include <string>
#include <fstream>
#include <complex>
using namespace std;
// Escape-time iteration count for one pixel.
//
// The pixel is mapped to the complex point
//   ((y - height/2) * 4 / width, (x - width/2) * 4 / width)
// and z = z*z + point is iterated from z = 0 while |z| < 4.
//
// Returns the number of iterations performed, capped at `max`; a return value
// equal to `max` means the point did not escape within the budget.
int mandelbrot(int x, int y, int width, int height, int max) {
    std::complex<float> point(static_cast<float>(y - height/2.0) * 4.0/width,
                              static_cast<float>(x - width/2.0) * 4.0/width);
    std::complex<float> z(0, 0);
    // Signed counter: it is compared against the signed `max`.
    int iteration = 0;
    while (std::abs(z) < 4 && iteration < max) {
        z = z * z + point;
        ++iteration;  // without this the loop never ends for bounded points
    }
    return iteration;
}
// Master/worker rendering of a width x height picture into "mandel.ppm":
// ranks other than 0 format their rows as text and send them to rank 0,
// which writes the PPM header and then the received text.
// NOTE(review): no MPI_Init / MPI_Comm_rank / MPI_Comm_size / MPI_Finalize
// calls appear in this listing, so myid and numprocs are never assigned here.
int main(int argc, char **argv) {
int numprocs;
int myid;
int buff_size = 404270; // 200x200
char buff[buff_size];
int i;
MPI_Status stat;
int width = 200, height = 200, max_iter = 1000;
if (myid == 0) {
ofstream image("mandel.ppm");
// Plain-text PPM header: magic number, dimensions, maximum colour value.
image << "P3\n" << width << " " << height << " 255\n";
for(i=1; i < numprocs; i++) {
// Wait for the message from worker i and query its length in chars.
MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
int length;
MPI_Get_count(&stat, MPI_CHAR, &length);
// NOTE(review): no MPI_Recv into buff is visible, and `length` is not used.
// Streaming a char array writes up to the first '\0', so buff must be
// NUL-terminated at `length` for this to emit exactly one message.
image << buff;
} else {
// NOTE(review): stringstream needs <sstream>, which is not among the
// includes shown above this function.
stringstream ss;
// proc rank: 1, 2, ..., n
// Each worker takes `part` consecutive rows; integer division means any
// remainder rows of `height` are not assigned to a worker.
int part = height/(numprocs-1), start = (myid - 1) * part, end = part * myid;
printf("%d -> %d\n", start, end);
for (int row = start; row < end; row++) {
for (int col = 0; col < width; col++) {
int iteration = mandelbrot(row, col, width, height, max_iter);
// The first row of each worker's band is drawn white.
if (row == start) ss << 255 << ' ' << 255 << ' ' << 255 << "\n";
else if (iteration < max_iter) ss << iteration * 255 << ' ' << iteration * 20 << ' ' << iteration * 5 << "\n";
else ss << 0 << ' ' << 0 << ' ' << 0 << "\n";
// NOTE(review): length() returns a size type but is printed with %d.
printf("\n sizeof = %d\n", ss.str().length());
// The characters are sent without a terminating '\0'.
MPI_Send(ss.str().c_str(), ss.str().length(), MPI_CHAR, 0, 0, MPI_COMM_WORLD);
return 0;
Code compilation:
$ mpic++ -std=c++0x mpi.mandel.cpp -o mpi.mandel
Running with 3 processes (process main + process rank 1 and 2)
$ mpirun -np 3 ./mpi.mandel
Resulting ppm pictures when running with 3, 4, and 5 process:
It seems that the point-to-point send/receive communication is mixing up the results when more than 3 processes try to send their MPI_CHAR elements to the main process. How can I avoid this behavior?
It works when creating the buffer buff with the same length as the receiving message:
// Rank 0 collects one text chunk per worker, in rank order, and appends it to
// the image file.
for (int i = 1; i < numprocs; i++) {
    // Probe first so the receive buffer can be sized to the incoming message.
    MPI_Probe(i, 0, MPI_COMM_WORLD, &stat);
    int length;
    MPI_Get_count(&stat, MPI_CHAR, &length);
    printf("\nfrom %d <<-- %d (stat.source=%d) Receiving %d chars\n", myid, i, stat.MPI_SOURCE, length);
    // A std::string of exactly `length` chars replaces the variable-length
    // char array (not standard C++) and needs no manual '\0' terminator.
    std::string buff(length, '\0');
    MPI_Recv(&buff[0], length, MPI_CHAR, i, 0, MPI_COMM_WORLD, &stat);
    image << buff;
}
Thus, we no longer need the declarations at the beginning: neither int buff_size = 404270; nor char buff[buff_size];
I am a newbie to MPI programming. I was trying to write a matrix multiplication and went through the post "MPI Matrix Multiplication with scatter gather", which is about matrix multiplication using the scatter and gather routines.
I tried modifying the code available in the above post as below...
#define N 4
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stddef.h>
#include "mpi.h"
void print_results(char *prompt, int a[N][N]);
// Multiplies the N x N matrices a and b: a is scattered over the processes,
// each process computes the cc values for its piece and the pieces are
// gathered into c on rank 0.
// NOTE(review): aa and cc hold N ints, so the count N*N/size used below only
// fits when the program runs with exactly N processes.
int main(int argc, char *argv[])
// NOTE(review): k, tag and blksz are not used in the statements shown.
int i, j, k, rank, size, tag = 99, blksz, sum = 0;
int a[N][N]={{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7,}};
int b[N][N]={{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7,}};
int c[N][N];
int aa[N],cc[N];
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//scatter rows of first matrix to different processes
MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT,0,MPI_COMM_WORLD);
//broadcast second matrix to all processes
//perform vector multiplication by all processes
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
// NOTE(review): b[i][j] walks row i of b, so cc[i] becomes (row of a) . (row i
// of b); a matrix product needs column i, i.e. b[j][i].
sum = sum + aa[j] * b[i][j];
cc[i] = sum;
sum = 0;
// Only the root's c is filled by the gather.
MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
// NOTE(review): called without a rank check, so every process prints its own
// c, which is uninitialised on all ranks except the root.
print_results("C = ", c);
void print_results(char *prompt, int a[N][N])
int i, j;
printf ("\n\n%s\n", prompt);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(" %d", a[i][j]);
printf ("\n");
printf ("\n\n");
I ran above program as
$mpirun -np 4 ./a.out
For above program I am getting following incorrect output..
C =
0 0 -562242168 32766
1 0 4197933 0
-562242176 32766 0 0
4197856 0 4196672 0
C =
0 0 -1064802792 32765
1 0 4197933 0
-1064802800 32765 0 0
4197856 0 4196672 0
C =
30 70 29 60
70 174 89 148
29 89 95 74
60 148 74 126
C =
0 0 -1845552920 32765
1 0 4197933 0
-1845552928 32765 0 0
4197856 0 4196672 0
I have following queries
1. Why result matrix C is getting printed by all processes. It is
supposed to be printed by only main process.
2. Why incorrect result is being printed?
Corrections and help in this regard will be appreciated.
The result matrix c is getting printed by all processes because every process executes the function void print_results(char *prompt, int a[N][N]). Since you are gathering at the process having rank 0, add a statement if (rank == 0) before calling the print_results(...) function. Further, the result is incorrect because of a wrong loop logic in :
for (j = 0; j < N; j++)
sum = sum + aa[j] * b[i][j];
This should be :
for (j = 0; j < N; j++)
sum = sum + aa[j] * b[j][i];
Also, there is no need to broadcast b, because every process already has its own copy of it, and you can drop the MPI_Barrier() call. The complete program then becomes:
#define N 4
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stddef.h>
#include "mpi.h"
void print_results(char *prompt, int a[N][N]);
int main(int argc, char *argv[])
int i, j, k, rank, size, tag = 99, blksz, sum = 0;
int a[N][N]={{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7,}};
int b[N][N]={{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7,}};
int c[N][N];
int aa[N],cc[N];
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//scatter rows of first matrix to different processes
MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT,0,MPI_COMM_WORLD);
//broadcast second matrix to all processes
//perform vector multiplication by all processes
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum = sum + aa[j] * b[j][i]; //MISTAKE_WAS_HERE
cc[i] = sum;
sum = 0;
MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == 0) //I_ADDED_THIS
print_results("C = ", c);
void print_results(char *prompt, int a[N][N])
int i, j;
printf ("\n\n%s\n", prompt);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(" %d", a[i][j]);
printf ("\n");
printf ("\n\n");
Then c =
C =
54 37 47 57
130 93 119 145
44 41 56 71
111 79 101 123
Calling MPI_Finalize does not mean that all the MPI processes are terminated, unlike in OpenMP!
In most MPI implementations, all the processes execute the instructions that come before MPI_Init and after MPI_Finalize.
A good practice is to do nothing before MPI_Init and nothing after MPI_Finalize.
I have a very weird problem. I've written some code in MPI in which one process should print something, but amazingly the code terminates without any output. I can't understand what is wrong...
PS: this code is supposed to multiply two matrices.
// Multiplies two N x N matrices of doubles (N = 2^M, M taken from argv[1])
// with a rank-0 coordinator and worker ranks that send their results back.
// NOTE(review): many lines of the original program (braces, the loops around
// the worker computation, the coordinator's send/receive calls) are missing
// from this listing; the comments below only describe what is visible.
int main( int argc, char *argv[] )
// NOTE(review): argv[1] is read without checking argc, and before MPI_Init.
int M = atoi(argv[1]);
// N = 2 ^ M
N = (unsigned int) pow (2.0, M); //you need to modify this code!
int my_rank, comm_sz,mt;
int rows,offset,extra,averow ,dest;
int i,j,k;
time_t t1, t2;
double dt; //t2-t1
double tavg=0.0;
//input array
// Every rank allocates all three full N x N matrices.
A = (double*) malloc ( sizeof(double) * N * N );
B = (double*) malloc ( sizeof(double) * N * N );
C = (double*) malloc ( sizeof(double) * N * N );
//int r; for (r = 0; r < REP; r++)
//fill in matrix A and B with random numbers
//t1 = time(0);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// NOTE(review): `=` assigns 0 to my_rank, so this condition is false on every
// rank; the later `my_rank !=0` test is then false as well, and neither
// branch runs. A comparison (`==`) was presumably intended.
if (my_rank =0){
printf("mpi_mm has started with %d tasks.\n",comm_sz);
printf("Initializing arrays...\n");
// Rows per task, and the number of tasks that receive one extra row.
averow = N/comm_sz;
extra = N%comm_sz;
offset = 0;
// mt is used as the message tag in the MPI_Send calls below.
mt = 0;
// NOTE(review): dest runs up to comm_sz inclusive, while ranks in
// MPI_COMM_WORLD are numbered 0 .. comm_sz-1 — confirm the intended bound.
for ( dest=1;dest<=comm_sz;dest++){
rows = (dest <=extra) ? averow+1 : averow;
mt = 1;
for (i=1; i<=comm_sz; i++){
printf("Received results from task %d\n",i);
/* Print results */
printf("Result Matrix:\n");
for (i=0; i<N; i++)
for (j=0; j<N; j++)
printf("%6.2f ", C[i*N+j]);
printf ("Done.\n");
if(my_rank !=0){
mt = 0;
// Worker computation: C(j,i) accumulates A(j,k) * B(k,i) in row-major storage.
C[j*N+i] =0.0;
C[j*N+i] += A[j*N+k]*B[k*N+i];
mt = 1;
MPI_Send(&offset, 1, MPI_INT, 0, mt, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, 0, mt, MPI_COMM_WORLD);
// NOTE(review): C is assigned from malloc above, so &C is the address of the
// pointer variable, not of the matrix data; C (or &C[offset*N]) was
// presumably intended.
MPI_Send(&C, rows*N, MPI_DOUBLE, 0, mt, MPI_COMM_WORLD);
Found it.
You say
if (my_rank =0)
This should be
if (my_rank == 0)