I'm trying to multiply two matrices in C using the MPI collective operations MPI_Scatter and MPI_Gather.
My code runs and gives the correct result, but when I test what each process holds after MPI_Scatter, I get zeros on every process.
So I suspect there is a problem with the sendcount and recvcount parameters of the MPI_Scatter and MPI_Gather calls.
Thanks for helping :)
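For reference, this is the MPI_Scatter signature I am working from; as I understand it, sendcount is the number of elements sent to each single process, not the total:

int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int recvcount, MPI_Datatype recvtype,
                int root, MPI_Comm comm);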
// matrix multiplication using MPI_Scatter/MPI_Gather
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define N 10

int main(int argc, char *argv[])
{
    int a[N][N], b[N][N], c[N][N], r1, c1, r2, c2, i, j, k, rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // at least 2 processes are required
    if (size < 2) {
        fprintf(stderr, "Number of processes must be at least 2 for %s\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // rank 0 initializes matrices a and b
    if (rank == 0) {
        printf("Insert number of rows and columns for matrix a: ");
        scanf("%d %d", &r1, &c1);
        printf("Insert number of rows and columns for matrix b: ");
        scanf("%d %d", &r2, &c2);

        // if the number of columns of matrix a isn't equal to the number of
        // rows of matrix b, ask the user to enter the dimensions again
        while (c1 != r2) {
            printf("Error! number of columns of matrix a isn't equal to number of rows of matrix b\n\n");
            printf("Enter rows and columns for matrix a: ");
            scanf("%d %d", &r1, &c1);
            printf("Enter rows and columns for matrix b: ");
            scanf("%d %d", &r2, &c2);
        }

        // enter elements of matrix a
        printf("\nEnter elements of matrix a:\n");
        for (i = 0; i < r1; ++i) {
            for (j = 0; j < c1; ++j) {
                printf("Enter element a%d%d: ", i + 1, j + 1);
                scanf("%d", &a[i][j]);
            }
        }

        // enter elements of matrix b
        printf("\nEnter elements of matrix b:\n");
        for (i = 0; i < r2; ++i) {
            for (j = 0; j < c2; ++j) {
                printf("Enter element b%d%d: ", i + 1, j + 1);
                scanf("%d", &b[i][j]);
            }
        }
    }

    // scatter blocks of rows of matrix a to the processes in the group
    MPI_Scatter(a, N*N/size, MPI_INT, a, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(b, N*N, MPI_INT, 0, MPI_COMM_WORLD);

    for (i = 0; i < r1; ++i) {
        for (j = 0; j < c2; ++j) {
            c[i][j] = 0;    // initialize the result matrix c
            for (k = 0; k < c1; ++k)
                c[i][j] += a[i][k] * b[k][j];   // multiply a and b, store the result in c
        }
    }

    MPI_Gather(c, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);

    // print the result
    if (rank == 0) {
        printf("\nResult of matrix multiplication:\n");
        for (i = 0; i < r1; ++i) {
            for (j = 0; j < c2; ++j) {
                printf("%d ", c[i][j]);
                if (j == c2 - 1)
                    printf("\n\n");
            }
        }
    }

    MPI_Finalize();
}
I am getting the following error output while executing MPI_Recv:
MPI_Recv(buf=0x000000D62C56FC60, count=1, MPI_INT, src=3, tag=0, MPI_COMM_WORLD, status=0x0000000000000001) failed
Message truncated; 8 bytes received but buffer size is 4
My function needs to find the index of the row that has the maximum element in column ind.
The function's code is below:
int find_row(Matr matr, int ind)
{
    int max = ind;
    for (int i = ind + 1 + CurP; i < N; i += Pnum)
        if (matr[i][ind] > matr[max][ind])
            max = i;
    int ans = max;
    if (CurP != 0)
    {
        MPI_Send(&max, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);
    }
    else
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for (int i = 1; i < Pnum; i++)
        {
            MPI_Recv(&max, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("max %d %Lf! Process %d;\n", max, matr[max][ind], i);
            fflush(stdout);
            if (matr[max][ind] > matr[ans][ind])
                ans = max;
        }
    }
    return ans;
}
Matr is the following type definition: typedef vector<vector<long double> >& Matr;
CurP and Pnum are initialized in the following way:
MPI_Comm_size(MPI_COMM_WORLD, &Pnum);
MPI_Comm_rank(MPI_COMM_WORLD, &CurP);
Please help me solve this issue. Thanks!
It was my own mistake: in another part of my code I was executing MPI_Bcast from only some of the processes, not all of them.
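To illustrate (the buffer here is made up, not from my actual code): MPI_Bcast is a collective, so every rank in the communicator has to reach the call. A rank-guarded broadcast like the commented-out form below is exactly the kind of mismatch that caused my error:

#include "mpi.h"

int main(int argc, char *argv[])
{
    int rank, buf[4] = {1, 2, 3, 4};
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* WRONG: only rank 0 would reach the collective; the other ranks
       never call MPI_Bcast, which is erroneous and fails in confusing ways */
    // if (rank == 0)
    //     MPI_Bcast(buf, 4, MPI_INT, 0, MPI_COMM_WORLD);

    /* RIGHT: every rank calls MPI_Bcast; rank 0 supplies the data,
       all the other ranks receive it into buf */
    MPI_Bcast(buf, 4, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}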
I am new to MPI programming. I was trying to write matrix multiplication and went through the post MPI Matrix Multiplication with scatter gather about matrix multiplication using the scatter and gather routines.
I tried modifying the code from that post as below:
#define N 4
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stddef.h>
#include "mpi.h"

void print_results(char *prompt, int a[N][N]);

int main(int argc, char *argv[])
{
    int i, j, k, rank, size, tag = 99, blksz, sum = 0;
    int a[N][N] = {{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7}};
    int b[N][N] = {{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7}};
    int c[N][N];
    int aa[N], cc[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // scatter rows of first matrix to different processes
    MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);

    // broadcast second matrix to all processes
    MPI_Bcast(b, N*N, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    // perform vector multiplication by all processes
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            sum = sum + aa[j] * b[i][j];
        }
        cc[i] = sum;
        sum = 0;
    }

    MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    print_results("C = ", c);
}

void print_results(char *prompt, int a[N][N])
{
    int i, j;

    printf("\n\n%s\n", prompt);
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf(" %d", a[i][j]);
        }
        printf("\n");
    }
    printf("\n\n");
}
void print_results(char *prompt, int a[N][N])
{
int i, j;
printf ("\n\n%s\n", prompt);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(" %d", a[i][j]);
}
printf ("\n");
}
printf ("\n\n");
}
I ran the above program as
$ mpirun -np 4 ./a.out
For the above program I am getting the following incorrect output:
C =
0 0 -562242168 32766
1 0 4197933 0
-562242176 32766 0 0
4197856 0 4196672 0
C =
0 0 -1064802792 32765
1 0 4197933 0
-1064802800 32765 0 0
4197856 0 4196672 0
C =
30 70 29 60
70 174 89 148
29 89 95 74
60 148 74 126
C =
0 0 -1845552920 32765
1 0 4197933 0
-1845552928 32765 0 0
4197856 0 4196672 0
I have the following queries:
1. Why is the result matrix C printed by all processes? It is
supposed to be printed only by the main process.
2. Why is an incorrect result printed?
Corrections and help in this regard will be appreciated.
The result matrix c is printed by all processes because every process executes the function void print_results(char *prompt, int a[N][N]). Since you gather at the process with rank 0, add an if (rank == 0) check before the call to print_results(...). Further, the result is incorrect because of wrong loop logic in:
for (j = 0; j < N; j++)
{
    sum = sum + aa[j] * b[i][j];
}

This should be:

for (j = 0; j < N; j++)
{
    sum = sum + aa[j] * b[j][i];
}
Also, there is no need to broadcast b, since all processes already have a copy of it, and you can avoid the MPI_Barrier() calls as well. The complete program then becomes:
#define N 4
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stddef.h>
#include "mpi.h"

void print_results(char *prompt, int a[N][N]);

int main(int argc, char *argv[])
{
    int i, j, k, rank, size, tag = 99, blksz, sum = 0;
    int a[N][N] = {{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7}};
    int b[N][N] = {{1,2,3,4},{5,6,7,8},{9,1,2,3},{4,5,6,7}};
    int c[N][N];
    int aa[N], cc[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // scatter rows of first matrix to different processes
    MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);

    // no MPI_Bcast needed: every process already initializes its own copy of b,
    // and no MPI_Barrier is needed around the collectives either

    // perform vector multiplication by all processes
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            sum = sum + aa[j] * b[j][i]; //MISTAKE_WAS_HERE
        }
        cc[i] = sum;
        sum = 0;
    }

    MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Finalize();

    if (rank == 0) //I_ADDED_THIS
        print_results("C = ", c);
}

void print_results(char *prompt, int a[N][N])
{
    int i, j;

    printf("\n\n%s\n", prompt);
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf(" %d", a[i][j]);
        }
        printf("\n");
    }
    printf("\n\n");
}
The output is then:
C =
54 37 47 57
130 93 119 145
44 41 56 71
111 79 101 123
A call to MPI_Finalize does not mean that all the MPI processes are terminated, unlike in OpenMP!
In most MPI implementations, all processes execute the instructions both before MPI_Init and after MPI_Finalize.
A good practice is to do nothing before MPI_Init and nothing after MPI_Finalize.
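A minimal sketch of that structure (illustrative only):

#include "mpi.h"

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);   /* first statement: initialize MPI */

    /* ... all of the program's work happens in between ... */

    MPI_Finalize();           /* last statement before return */
    return 0;                 /* nothing else after MPI_Finalize */
}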
The thing I am still not too certain about is what happens with the root process in MPI_Scatter / MPI_Scatterv.
If I divide an array as I try to in my code, do I need to include the root process in the number of receivers (hence making sendcounts of size nproc), or is it excluded?
In my example code for matrix multiplication, I still get an error where one of the processes runs into aberrant behaviour and terminates the program prematurely:
void readMatrix();

double StartTime;
int rank, nproc, proc;
//double matrix_A[N_ROWS][N_COLS];
double **matrix_A;
//double matrix_B[N_ROWS][N_COLS];
double **matrix_B;
//double matrix_C[N_ROWS][N_COLS];
double **matrix_C;
int low_bound = 0;   //lower bound of the rows of [A] handled by each process
int upper_bound = 0; //upper bound of the rows of [A] handled by each process
int portion = 0;     //number of rows of [A] handled by each process

int main (int argc, char *argv[]) {
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    matrix_A = (double **)malloc(N_ROWS * sizeof(double*));
    for (int i = 0; i < N_ROWS; i++) matrix_A[i] = (double *)malloc(N_COLS * sizeof(double));
    matrix_B = (double **)malloc(N_ROWS * sizeof(double*));
    for (int i = 0; i < N_ROWS; i++) matrix_B[i] = (double *)malloc(N_COLS * sizeof(double));
    matrix_C = (double **)malloc(N_ROWS * sizeof(double*));
    for (int i = 0; i < N_ROWS; i++) matrix_C[i] = (double *)malloc(N_COLS * sizeof(double));

    int *counts = new int[nproc](); // array to hold the number of items to be sent to each process

    // -------------------> If we have more than one process, we can distribute the work through scatterv
    if (nproc > 1) {
        // -------------------> Process 0 initializes the matrices and scatters the portions of the [A] matrix
        if (rank == 0) {
            readMatrix();
        }
        StartTime = MPI_Wtime();
        int counter = 0;
        for (int proc = 0; proc < nproc; proc++) {
            counts[proc] = N_ROWS / nproc;
            counter += N_ROWS / nproc;
        }
        counter = N_ROWS - counter;
        counts[nproc-1] = counter;

        // set the bounds for each process
        low_bound = rank * (N_ROWS / nproc);
        portion = counts[rank];
        upper_bound = low_bound + portion;
        printf("I am process %i and my lower bound is %i and my portion is %i and my upper bound is %i \n", rank, low_bound, portion, upper_bound);

        // scatter the work among the processes
        int *displs = new int[nproc]();
        displs[0] = 0;
        for (int proc = 1; proc < nproc; proc++) displs[proc] = displs[proc-1] + (N_ROWS / nproc);
        MPI_Scatterv(matrix_A, counts, displs, MPI_DOUBLE, &matrix_A[low_bound][0], portion, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        // broadcast [B] to all the slaves
        MPI_Bcast(&matrix_B, N_ROWS*N_COLS, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        // -------------------> Everybody does their work
        for (int i = low_bound; i < upper_bound; i++) {   // iterate through the given set of rows of [A]
            for (int j = 0; j < N_COLS; j++) {            // iterate through the columns of [B]
                for (int k = 0; k < N_ROWS; k++) {        // iterate through the rows of [B]
                    matrix_C[i][j] += (matrix_A[i][k] * matrix_B[k][j]);
                }
            }
        }

        // -------------------> Process 0 gathers the work
        MPI_Gatherv(&matrix_C[low_bound][0], portion, MPI_DOUBLE, matrix_C, counts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    ...
The root process also takes part on the receiver side. If you are not interested in that, just set sendcounts[root] = 0.
See the documentation of MPI_Scatterv for the exact values you have to pass.
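As a minimal sketch of excluding the root (all names here are illustrative, not from your code): every rank still calls MPI_Scatterv, but sendcounts[0] = 0, so the root distributes data without keeping any for itself.

#include <stdlib.h>
#include "mpi.h"

int main(int argc, char *argv[])
{
    int rank, nproc;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    int *sendcounts = (int *)calloc(nproc, sizeof(int));
    int *displs = (int *)calloc(nproc, sizeof(int));
    for (int p = 1; p < nproc; p++) {  /* the root (rank 0) is skipped */
        sendcounts[p] = 1;
        displs[p] = p - 1;
    }

    double *sendbuf = (double *)calloc(nproc, sizeof(double)); /* significant at root only */
    double recvbuf = -1.0;
    MPI_Scatterv(sendbuf, sendcounts, displs, MPI_DOUBLE,
                 &recvbuf, sendcounts[rank], MPI_DOUBLE,
                 0, MPI_COMM_WORLD);
    /* recvbuf stays untouched on the root and holds sendbuf[rank-1] elsewhere */

    free(sendcounts);
    free(displs);
    free(sendbuf);
    MPI_Finalize();
    return 0;
}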
However, take care with what you are doing. I strongly suggest that you change the way you allocate your matrix: make it a one-dimensional array, using a single malloc like this:
double* matrix = (double*) malloc( N_ROWS * N_COLS * sizeof(double) );
If you still want to use a two-dimensional array, then you may need to define your own MPI derived datatype.
The datatype you are passing is not valid if you want to send more than one row in a single MPI transfer:
with MPI_DOUBLE you are telling MPI that the buffer contains a contiguous array of count MPI_DOUBLE values,
and since you are allocating the two-dimensional array with multiple malloc calls, your data is not contiguous.
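For example, here is a sketch of a contiguous allocation that still supports matrix[i][j] indexing, and of a scatter over it; rows_per_proc, counts_elems, displs_elems and local are illustrative names, and note that the counts are given in elements (rows * N_COLS) rather than rows, because MPI counts MPI_DOUBLE values:

/* one contiguous block holds all the data ... */
double *data = (double *)malloc(N_ROWS * N_COLS * sizeof(double));

/* ... and an optional array of row pointers into it keeps the
   familiar matrix[i][j] syntax working */
double **matrix = (double **)malloc(N_ROWS * sizeof(double *));
for (int i = 0; i < N_ROWS; i++)
    matrix[i] = data + i * N_COLS;

/* whole rows are now contiguous, so they can be scattered directly;
   assume here that N_ROWS divides evenly among the nproc processes */
int rows_per_proc = N_ROWS / nproc;
int *counts_elems = (int *)malloc(nproc * sizeof(int));
int *displs_elems = (int *)malloc(nproc * sizeof(int));
for (int p = 0; p < nproc; p++) {
    counts_elems[p] = rows_per_proc * N_COLS;      /* counts in elements, not rows */
    displs_elems[p] = p * rows_per_proc * N_COLS;
}
double *local = (double *)malloc(rows_per_proc * N_COLS * sizeof(double));
MPI_Scatterv(data, counts_elems, displs_elems, MPI_DOUBLE,
             local, rows_per_proc * N_COLS, MPI_DOUBLE,
             0, MPI_COMM_WORLD);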
So I have some code where I am using MPI_Bcast to send information from the root node to all nodes, but instead I want my P0 to send chunks of the array to individual processes.
How do I do this with MPI_Send and MPI_Recv?
I've never used them before, and I don't know if I need to loop my MPI_Recv to effectively send everything, or what.
I've put giant caps-lock comments in the code where I need to replace my MPI_Bcast(); sorry in advance for the waterfall of code.
Code:
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#define MAXSIZE 10000000
int add(int *A, int low, int high)
{
int res = 0, i;
for(i=low; i<=high; i++)
res += A[i];
return(res);
}
int main(argc,argv)
int argc;
char *argv[];
{
int myid, numprocs, x;
int data[MAXSIZE];
int i, low, high, myres, res;
double elapsed_time;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
if (myid == 0)
{
for(i=0; i<MAXSIZE; i++)
data[i]=1;
}
/* star the timer */
elapsed_time = -MPI_Wtime();
//THIS IS WHERE I GET CONFUSED ABOUT MPI_SEND AND MPI_RECIEVE!!!
MPI_Bcast(data, MAXSIZE, MPI_INT, 0, MPI_COMM_WORLD);
x = MAXSIZE/numprocs;
low = myid * x;
high = low + x - 1;
if (myid == numprocs - 1)
high = MAXSIZE-1;
myres = add(data, low, high);
printf("I got %d from %d\n", myres, myid);
MPI_Reduce(&myres, &res, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
/* stop the timer*/
elapsed_time += MPI_Wtime();
if (myid == 0)
printf("The sum is %d, time taken = %f.\n", res,elapsed_time);
MPI_Barrier(MPI_COMM_WORLD);
printf("The sum is %d at process %d.\n", res,myid);
MPI_Finalize();
return 0;
}
You need MPI_Scatter. A good intro is here: http://mpitutorial.com/tutorials/mpi-scatter-gather-and-allgather/
I think in your code it could look like this:
int elements_per_proc = MAXSIZE / numprocs;
// Create a buffer that will hold a chunk of the global array
int *data_chunk = malloc(sizeof(int) * elements_per_proc);
MPI_Scatter(data, elements_per_proc, MPI_INT, data_chunk,
            elements_per_proc, MPI_INT, 0, MPI_COMM_WORLD);
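Each rank then sums only its own chunk, e.g. myres = add(data_chunk, 0, elements_per_proc - 1);, and the existing MPI_Reduce collects the partial sums unchanged.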
If you really want to use MPI_Send and MPI_Recv, then you can use something like this:
int x = MAXSIZE / numprocs;
int *procData = malloc(sizeof(int) * x);
if (myid == 0) {
    for (int i = 1; i < numprocs; i++) {
        MPI_Send(data + i*x, x, MPI_INT, i, 0, MPI_COMM_WORLD);
    }
} else {
    MPI_Recv(procData, x, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
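Note that with this variant rank 0 never sends to itself, so it should work directly on its own chunk (the first x elements of data), while every other rank works on procData.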
I am very new to Open MPI. I have written a small program that computes the sum of an array by splitting the array into pieces, one per process. The problem is that each process computes the right sum for its share of the array, but the individually computed sums are not combined by the MPI_Reduce call. I tried my best to solve this and also consulted the Open MPI manual, but there is still something I must be missing. I would be grateful for any kind of guidance. Below is the program I made:
#include "mpi.h"
#include <stdio.h>
int main(int argc, char *argv[])
{
int n, rank, nrofProcs, i;
int sum, ans;
// 0,1,2, 3,4,5, 6,7,8, 9
int myarr[] = {1,5,9, 2,8,3, 7,4,6, 10};
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nrofProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
n = 10;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
sum = 0.0;
int remaining = n % nrofProcs;
int lower =rank*(n/nrofProcs);
int upper = (lower+(n/nrofProcs))-1;
for (i = lower; i <= upper; i++)
{
sum = sum + myarr[i];
}
if(rank==nrofProcs-1)
{
while(i<=remaining)
{
sum = sum + myarr[i];
i++;
}
}
/* (PROBLEM IS HERE, IT IS NOT COMBINING "sums") */
MPI_Reduce(&sum, &ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
// if (rank == 0)
printf( "rank: %d, Sum ans: %d\n", rank, sum);
/* shut down MPI */
MPI_Finalize();
return 0;
}
Output:
rank: 2, Sum ans: 17
rank: 1, Sum ans: 13
rank: 0, Sum ans: 15
(Output should be rank: 0, Sum ans: 55)
Sorry, I had made some mistakes, which I corrected after running a parallel debugger on my program. Here I am sharing code that splits an array of length N over M processes, where N and M can have any values:
/*
  An MPI program that splits an array of length N over M processes,
  where N and M can have any values
*/
#include <math.h>
#include "mpi.h"
#include <iostream>
#include <vector>
using namespace std;

int main(int argc, char *argv[])
{
    int n, rank, nrofProcs, i;
    int sum, ans;
    //             0,1,2, 3,4,5, 6,7,8, 9, 10
    int myarr[] = {1,5,9, 2,8,3, 7,4,6,11,10};
    vector<int> myvec(myarr, myarr + sizeof(myarr) / sizeof(int));
    n = myvec.size(); // number of items in the array

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nrofProcs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    sum = 0;
    int remaining = n % nrofProcs;
    int lower = rank * (n / nrofProcs);
    int upper = (lower + (n / nrofProcs)) - 1;
    for (i = lower; i <= upper; i++)
    {
        sum = sum + myvec[i];
    }
    if (rank == nrofProcs - 1)
    {
        int ctr = 0;
        while (ctr < remaining)
        {
            sum = sum + myvec[i];
            ctr++;
            i++;
        }
    }

    /* combine everyone's calculations */
    MPI_Reduce(&sum, &ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        cout << "rank: " << rank << " Sum ans: " << ans << endl;

    /* shut down MPI */
    MPI_Finalize();
    return 0;
}
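Since this version uses vector and iostream it is C++, so it can be built and run with something like this (the file name is mine):

$ mpic++ split_sum.cpp -o split_sum
$ mpirun -np 3 ./split_sum

which should print rank: 0 Sum ans: 66 for the 11-element array above.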