Here is my problem. I want to parallelize AES-128 encryption in C using Openmp. I am hardly getting any speedup with the following code using openmp. My machine is Quadcore intel i5 machine.
Here is the code. Any suggestion how to parallelize this code further would be really really appreciated. Please take a look at the main function which is at the end of the code. AES code below consists of a few functions to achieve its functionality. Please suggest how to best extract parallelism from this.
Thanks so much.
/*
******************************************************************
** Advanced Encryption Standard implementation in C. **
** By Niyaz PK **
** E-mail: niyazpk#gmail.com **
** Downloaded from Website: www.hoozi.com **
******************************************************************
This is the source code for encryption using the latest AES algorithm.
******************************************************************
*/
// Include stdio.h for standard input/output.
// Used for giving output to the screen.
#include<omp.h>
#include<stdio.h>
#include<time.h>
#include<stdlib.h>
// The number of columns comprising a state in AES. This is a constant in AES. Value=4
#define Nb 4
// The number of rounds in AES Cipher. It is simply initiated to zero. The actual value is recieved in the program.
int Nr=0;
// The number of 32 bit words in the key. It is simply initiated to zero. The actual value is recieved in the program.
int Nk=0;
// in - it is the array that holds the plain text to be encrypted.
// out - it is the array that holds the output CipherText after encryption.
// state - the array that holds the intermediate results during encryption.
unsigned char in[16], out[16], state[4][4];
// The array that stores the round keys.
unsigned char RoundKey[240];
// The Key input to the AES Program
unsigned char Key[32];
int getSBoxValue(int num)
{
int sbox[256] = {
//0 1 2 3 4 5 6 7 8 9 A B C D E F
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, //0
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, //1
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, //2
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, //3
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, //4
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, //5
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, //6
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, //7
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, //8
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, //9
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, //A
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, //B
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, //C
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, //D
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, //E
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; //F
return sbox[num];
}
// The round constant word array, Rcon[i], contains the values given by
// x to th e power (i-1) being powers of x (x is denoted as {02}) in the field GF(28)
// Note that i starts at 1, not 0).
int Rcon[255] = {
0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a,
0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8,
0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef,
0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc,
0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b,
0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3,
0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94,
0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35,
0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f,
0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04,
0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63,
0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd,
0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb };
// This function produces Nb(Nr+1) round keys. The round keys are used in each round to encrypt the states.
void KeyExpansion()
{
int i,j;
unsigned char temp[4],k;
// The first round key is the key itself.
for(i=0;i<Nk;i++)
{
RoundKey[i*4]=Key[i*4];
RoundKey[i*4+1]=Key[i*4+1];
RoundKey[i*4+2]=Key[i*4+2];
RoundKey[i*4+3]=Key[i*4+3];
}
// All other round keys are found from the previous round keys.
while (i < (Nb * (Nr+1)))
{
for(j=0;j<4;j++)
{
temp[j]=RoundKey[(i-1) * 4 + j];
}
if (i % Nk == 0)
{
// This function rotates the 4 bytes in a word to the left once.
// [a0,a1,a2,a3] becomes [a1,a2,a3,a0]
// Function RotWord()
{
k = temp[0];
temp[0] = temp[1];
temp[1] = temp[2];
temp[2] = temp[3];
temp[3] = k;
}
// SubWord() is a function that takes a four-byte input word and
// applies the S-box to each of the four bytes to produce an output word.
// Function Subword()
{
temp[0]=getSBoxValue(temp[0]);
temp[1]=getSBoxValue(temp[1]);
temp[2]=getSBoxValue(temp[2]);
temp[3]=getSBoxValue(temp[3]);
}
temp[0] = temp[0] ^ Rcon[i/Nk];
}
else if (Nk > 6 && i % Nk == 4)
{
// Function Subword()
{
temp[0]=getSBoxValue(temp[0]);
temp[1]=getSBoxValue(temp[1]);
temp[2]=getSBoxValue(temp[2]);
temp[3]=getSBoxValue(temp[3]);
}
}
RoundKey[i*4+0] = RoundKey[(i-Nk)*4+0] ^ temp[0];
RoundKey[i*4+1] = RoundKey[(i-Nk)*4+1] ^ temp[1];
RoundKey[i*4+2] = RoundKey[(i-Nk)*4+2] ^ temp[2];
RoundKey[i*4+3] = RoundKey[(i-Nk)*4+3] ^ temp[3];
i++;
}
}
// This function adds the round key to state.
// The round key is added to the state by an XOR function.
void AddRoundKey(int round)
{
int i,j;
for(i=0;i<4;i++)
{
for(j=0;j<4;j++)
{
state[j][i] ^= RoundKey[round * Nb * 4 + i * Nb + j];
}
}
}
// The SubBytes Function Substitutes the values in the
// state matrix with values in an S-box.
void SubBytes()
{
int i,j;
for(i=0;i<4;i++)
{
for(j=0;j<4;j++)
{
state[i][j] = getSBoxValue(state[i][j]);
}
}
}
// The ShiftRows() function shifts the rows in the state to the left.
// Each row is shifted with different offset.
// Offset = Row number. So the first row is not shifted.
void ShiftRows()
{
unsigned char temp;
// Rotate first row 1 columns to left
temp=state[1][0];
state[1][0]=state[1][1];
state[1][1]=state[1][2];
state[1][2]=state[1][3];
state[1][3]=temp;
// Rotate second row 2 columns to left
temp=state[2][0];
state[2][0]=state[2][2];
state[2][2]=temp;
temp=state[2][1];
state[2][1]=state[2][3];
state[2][3]=temp;
// Rotate third row 3 columns to left
temp=state[3][0];
state[3][0]=state[3][3];
state[3][3]=state[3][2];
state[3][2]=state[3][1];
state[3][1]=temp;
}
// xtime is a macro that finds the product of {02} and the argument to xtime modulo {1b}
#define xtime(x) ((x<<1) ^ (((x>>7) & 1) * 0x1b))
// MixColumns function mixes the columns of the state matrix
// The method used may look complicated, but it is easy if you know the underlying theory.
// Refer the documents specified above.
void MixColumns()
{
int i;
unsigned char Tmp,Tm,t;
for(i=0;i<4;i++)
{
t=state[0][i];
Tmp = state[0][i] ^ state[1][i] ^ state[2][i] ^ state[3][i] ;
Tm = state[0][i] ^ state[1][i] ; Tm = xtime(Tm); state[0][i] ^= Tm ^ Tmp ;
Tm = state[1][i] ^ state[2][i] ; Tm = xtime(Tm); state[1][i] ^= Tm ^ Tmp ;
Tm = state[2][i] ^ state[3][i] ; Tm = xtime(Tm); state[2][i] ^= Tm ^ Tmp ;
Tm = state[3][i] ^ t ; Tm = xtime(Tm); state[3][i] ^= Tm ^ Tmp ;
}
}
// Cipher is the main function that encrypts the PlainText.
void Cipher()
{
int i,j,round=0;
//Copy the input PlainText to state array.
for(i=0;i<4;i++)
{
for(j=0;j<4;j++)
{
state[j][i] = in[i*4 + j];
}
}
// Add the First round key to the state before starting the rounds.
AddRoundKey(0);
// There will be Nr rounds.
// The first Nr-1 rounds are identical.
// These Nr-1 rounds are executed in the loop below.
for(round=1;round<Nr;round++)
{
SubBytes();
ShiftRows();
MixColumns();
AddRoundKey(round);
}
// The last round is given below.
// The MixColumns function is not here in the last round.
SubBytes();
ShiftRows();
AddRoundKey(Nr);
// The encryption process is over.
// Copy the state array to output array.
for(i=0;i<4;i++)
{
for(j=0;j<4;j++)
{
out[i*4+j]=state[j][i];
}
}
}
void encrypt(int *K,int *PT,int *CT)
{
int i;
// int ct;
// Calculate Nk and Nr from the received value.
Nr = 128;
Nk = Nr / 32;
Nr = Nk + 6;
// Copy the Key and PlainText
for(i=0;i<Nk*4;i++)
{
Key[i]=K[i];
in[i]=PT[i];
}
/*
printf("\nKey for encryption:\n");
for(i=0; i < Nk*4; i++)
printf("%02x",Key[i]);
printf("\n");
*/
/*
printf("\nText before encryption:\n");
for(i=0; i < Nk*4; i++)
printf("%02x",in[i]);
printf("\n");
*/
// The KeyExpansion routine must be called before encryption.
KeyExpansion();
// The next function call encrypts the PlainText with the Key using AES algorithm.
Cipher();
// Output the encrypted text.
//io_printf("\nText after encryption:\n");
for(i=0; i < Nk*4; i++)
{
CT[i] = out[i];
printf("%02x",out[i]);
}
printf("\n");
// ct = out[15];
// return ct;
}
//main function
int main()
{
srand(time(NULL));
unsigned int rnd[4];
int key[16];
int pt[16];
int ct[16];
unsigned int i,j;
#pragma omp parallel for num_threads(4) schedule(dynamic)
for(i=0; i<65000*10; i++)
{
rnd[0]=rand();
rnd[1]=rand();
rnd[2]=rand();
rnd[3]=rand();
for(j=0; j < 4; j++)
{
key[4*j] = (rnd[j] & 0xff);
pt[4*j] = key[4*j];
key[4*j+1] = ((rnd[j] >> 8) & 0xff) ;
pt[4*j+1] = key[4*j+1];
key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
pt[4*j+2] = key[4*j+2];
key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
pt[4*j+3] = key[4*j+3];
}
#pragma omp task
encrypt(key,pt,ct);
}
return 0;
}
I have modified the code as suggested by Hristo. Thanks for your effort. Here is how the code looks. I don't understand how to make encrypt( ) function use local variables. Can you explain. Please add code where it should be. Thanks again for your efforts.
Secondly, if there is no printf statements, how would you see if the output is correct or not. I mean are there other mechanisms to display or save the output. Lastly, the code as shown below is still slower than the serial execution (i.e., without openmp). There is no printf in the serial version either to make the comparison fair.
void encrypt(int *K,int *PT,int *CT)
{
int i;
// int ct;
// Calculate Nk and Nr from the received value.
Nr = 128;
Nk = Nr / 32;
Nr = Nk + 6;
// Copy the Key and PlainText
for(i=0;i<Nk*4;i++)
{
Key[i]=K[i];
in[i]=PT[i];
}
/*
printf("\nKey for encryption:\n");
for(i=0; i < Nk*4; i++)
printf("%02x",Key[i]);
printf("\n");
*/
/*
printf("\nText before encryption:\n");
for(i=0; i < Nk*4; i++)
printf("%02x",in[i]);
printf("\n");
*/
// The KeyExpansion routine must be called before encryption.
KeyExpansion();
// The next function call encrypts the PlainText with the Key using AES algorithm.
Cipher();
// Output the encrypted text.
//io_printf("\nText after encryption:\n");
for(i=0; i < Nk*4; i++)
{
CT[i] = out[i];
// printf("%02x",out[i]);
}
// printf("\n");
// ct = out[15];
// return ct;
}
//main function
int main()
{
srand(time(NULL));
unsigned int rnd[4];
// printf("rand_key = %2x%2x%2x%2x\n",rnd[0],rnd[1],rnd[2],rnd[3]);
int key[16];
int pt[16];
int ct[16];
unsigned int i,j;
#pragma omp parallel for private(key,pt,ct) num_threads(2) schedule(static)
for(i=0; i<65000; i++)
{
rnd[0]=rand();
rnd[1]=rand();
rnd[2]=rand();
rnd[3]=rand();
for(j=0; j < 4; j++)
{
key[4*j] = (rnd[j] & 0xff);
pt[4*j] = key[4*j];
key[4*j+1] = ((rnd[j] >> 8) & 0xff) ;
pt[4*j+1] = key[4*j+1];
key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
pt[4*j+2] = key[4*j+2];
key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
pt[4*j+3] = key[4*j+3];
}
encrypt(key,pt,ct);
}
return 0;
}
You need neither schedule(dynamic) nor task constructs. As far as I am aware of the intrinsics of AES, this is a completely regular problem - each encryption takes exactly the same number of cycles and therefore the same wall-clock time, no matter what the key. This completly rules out the necessity to use dynamic scheduling and tasks. Even in the case of umbalanced problems, simply adding schedule(dynamic) is a very bad idea. The reason for that is that the default chunk size for dynamic is 1, which means that each thread executes a single iteration and then asks the OpenMP runtime for another one. In your case the overhead is multiplied 650000 times. Dynamic scheduling, when actually applicable, is very powerful but one should carefully choose the optimal chunk size with the latter often involving lots of trials until the optimal value is found.
Besides that you generate 650000 tasks. Each task has a certain overhead associated with its creation and subsequent consumption by the worker threads. Given that AES takes about 18 cycles per byte on Pentium Pro (ref: Wikipedia), each call to encrypt() probably would have taken about the same amount of time as the OpenMP runtime needs in order to execute the task if it wasn't for the printf() statement inside. printf() outputs to the terminal or to a file stream (if redirected) and doing I/O with the same descriptor is esentially a serial operation, i.e. it serialises the threads. See this answer to get an idea of how much printf() impacts parallel performance.
But the worst problem of your code is actually the multitude of data races. encrypt() depends on and changes the values of several global variables. This not only leads to slow down due to true cache sharing, but most likely also results in completely wrong ciphertexts. These global variables should be all made local to encrypt() or made threadprivate if it is necessary that they stay global. Then the parallel loop uses several shared variables, namely key, pt and ct. These should be made private.
Summary: make encrypt() use only local variables; make key, pt and ct private; change the loop schedule to static; remove the task construct; remove all printf statements that output information at each iteration.
Bonus: rand() keeps its state in global variables too.
There are so many global variables. Just make them thread-private. Add the following OpenMP pragma just after the definition of the last global variable:
...
// The Key input to the AES Program
unsigned char Key[32];
#pragma omp threadprivate(Nr,Nk,in,out,state,RoundKey,Key)
...
Also change your main() function as follows:
unsigned int i;
#pragma omp parallel for num_threads(2) schedule(static)
for(i = 0; i < 65000; i++)
{
unsigned int rnd[4];
int key[16];
int pt[16];
int ct[16];
unsigned int j;
// Per-thread PRNG initialisation
// It could be done better - this is for illustration purposes only
unsigned int rand_state = time(NULL) + 1337*omp_get_thread_num();
rnd[0] = rand_r(&rand_state);
rnd[1] = rand_r(&rand_state);
rnd[2] = rand_r(&rand_state);
rnd[3] = rand_r(&rand_state);
for(j = 0; j < 4; j++)
{
key[4*j] = (rnd[j] & 0xff);
pt[4*j] = key[4*j];
key[4*j+1] = ((rnd[j] >> 8) & 0xff) ;
pt[4*j+1] = key[4*j+1];
key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
pt[4*j+2] = key[4*j+2];
key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
pt[4*j+3] = key[4*j+3];
}
encrypt(key, pt, ct);
}
Notice - variables such as key, pt, j, etc. are defined in the scope where they are used. This frees you from the necessity to put them all in a private clause since such variables are predetermined to be private. Also each thread now has its own PRNG state.