Modular exponentiation using MIPS - recursion

Write a program which prompts the user for three positive numbers x, n and p, and outputs
x^n mod p. Your program should use the recursive version of modular exponentiation.
.text
main:
sub $sp,$sp,4 # save return address on stack
sw $ra, 0($sp)
li $v0, 4 # prompt user for int x
la $a0, S1
syscall
li $v0, 5 # read int
syscall
move $s0, $v0 # cin >> x //and store x in $s0
li $v0, 4 # prompt user for int n
la $a0, S1
syscall
li $v0, 5 # read int
syscall
move $s1, $v0 # cin >> n //and store n in $s1
li $v0, 4 # prompt user for int p
la $a0, S1
syscall
li $v0, 5 # read int
syscall
move $s2, $v0 # cin >> p //and store p in $s2
li $t0, 0 #return value
li $t1, 2 #constant 2
li $t2, 0 #variable y
beq $s0, $zero, L1 #if x == 0, return 1
beq $s1, $zero, L2 #if n is 0, return 0
jal evenMod
L0:
lw $ra, 0($sp) # read registers from stack
lw $s0, 4($sp)
lw $s1, 8($sp)
addi $sp, $sp, 12 # bring back stack pointer
jr $ra
L1:
li $v0, 4
la $a0, S3
syscall
j L0
L2:
li $v0, 4
la $a0, S4
syscall
j L0
L3:
li $v0, 1
move $a0, $s1
syscall
j L0
evenMod:
beq $s0, $zero, L1 #if x == 0, return 1
beq $s1, $zero, L2 #if n is 0, return 0
rem $s3, $s1, $t1 #s3 = s1 % 2
bne $s3, $zero, oddMod #if n % 2 != 0, branch to the odd case
div $s1, $s1, 2 #n = n/2
mult $t2, $t2 #y = y*y
rem $t2, $t2, $s2 #y= (y*y)%c
jal evenMod
j L3
oddMod:
beq $s0, $zero, L1 #if x == 0, return 1
beq $s1, $zero, L2 #if n is 0, return 0
rem $s3, $s1, $t1 #s3 = s1 % 2
bne $s3, $zero, evenMod #if n % 2 != 0, branch to evenMod
rem $s3, $s1, $s2 #s3 = s1 % P
addi $s0, 0 #x stays the same
add $s1, $s1, -1 #n = n-1
addi $s2, 0 #p stays the same
jal oddMod #call oddmod with updated values
mult $t2, $t2 #multiply y*y
rem $t2, $t2, $s2 #y = y%P
j L3
.data
S1:
.asciiz "Enter an integer --> "
S3:
.asciiz "0"
S4:
.asciiz "1"
This is what I have so far, but I'm getting stuck on where the JALs should occur.

C code:
int exponentMod(int A, int B, int C) {
//base cases
if (A == 0) return 0;
if (B == 0) return 1;
//If B is even
long y;
if (B % 2 == 0) {
y = exponentMod(A, B / 2, C);
y = (y * y) % C;
}
//if B is odd
else {
y = A % C;
y = (y * exponentMod(A, B - 1, C) % C) % C;
}
//return the modular exponent
return (int)((y + C) % C);
}
Transformation, still in C, to if-goto-label style, as follows:
int exponentMod(int A, int B, int C) {
//base cases
if (A != 0) goto endIf1;
return 0;
endIf1:
if (B != 0) goto endIf2;
return 1;
endIf2: ; // null statement: in C (before C23), a label must precede a statement, not a declaration
long y;
if (B % 2 != 0) goto elseIf3;
// B is even
y = exponentMod(A, B / 2, C);
y = (y * y) % C;
goto endIf3;
elseIf3:
// B is odd
y = A % C;
y = (y * exponentMod(A, B - 1, C) % C) % C;
endIf3:
//return the modular exponent
return (int)((y + C) % C);
}
The control flow has been converted to the if-goto-label style of assembly, using logical transformations. (Note that the above transformation remains valid, runnable C, and will run the same as the structured-statement original.) The if-goto-label version is closer to assembly.
Of course, there are many other ways to translate the control flow, such as moving blocks of code to different places; however, I prefer the above form because it stays true to the statement ordering of the original C code, making it easier to follow when comparing the original against the assembly version.
So, what's left to do is translate the non-control-flow statements and expressions, which should appear exactly where they sit in this transformed version. In particular, each recursive call to exponentMod becomes a jal exponentMod at that exact spot; because jal overwrites $ra, the function's prologue must save $ra on the stack (along with any registers whose values are still needed after the call, here those holding A, C, and y), and its epilogue must restore them before jr $ra.
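Once the statements are translated, main still needs to read the inputs and print the result. For completeness, here is a minimal C driver for exponentMod, mirroring the prompt/read/print sequence the MIPS main performs (a sketch, not part of the original post):

#include <stdio.h>

int exponentMod(int A, int B, int C); // defined above

int main(void)
{
    int x, n, p;
    printf("Enter an integer --> ");  // the S1 prompt in the MIPS version
    scanf("%d", &x);
    printf("Enter an integer --> ");
    scanf("%d", &n);
    printf("Enter an integer --> ");
    scanf("%d", &p);
    printf("%d\n", exponentMod(x, n, p)); // e.g. x=5, n=3, p=7 prints 6 (125 mod 7)
    return 0;
}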

Related

Converting C to MIPS assembly with pointer

int x;
int *ptr;
x = 22;
ptr = &x;
*ptr = 100;
How can I convert this code to MIPS?
You just need to load the address of x into a register so you can store another word at that address. Here is the solution:
.data
x: .word 22 # int x = 22;
.text
main:
la $t0, x # ptr = &x;
addi $t1, $zero, 100 # put the value 100 in another temporary register
sw $t1, 0($t0) # *ptr = 100;
lw $t3, 0($t0) # if you print the value now, you will see
li $v0, 1 # that it prints "100"
move $a0, $t3
syscall
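For comparison, here is the same sequence as a runnable C program, with the matching MIPS instruction noted on each line (a sketch, not part of the original answer):

#include <stdio.h>

int x = 22;            // x: .word 22

int main(void)
{
    int *ptr = &x;     // la $t0, x
    *ptr = 100;        // sw $t1, 0($t0)
    printf("%d\n", x); // li $v0, 1 / syscall -- prints 100
    return 0;
}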

Returning a value pointed to by a pointer in x86 NASM

I'm trying to write a function in x86 NASM assembly that takes a pointer to a structure (the structure contains a pointer to a buffer) and two ints (x, y), computes the address of the byte containing (x, y), and returns the value at that address. (The buffer contains a BMP file.) I have this function written in C and it works fine.
C function
int loadByte(imgInfo* pImg, int x, int y)
{
unsigned char *pPix = pImg->pImg + (((pImg->width + 31) >> 5) << 2) * y + (x >> 3);
return *pPix;
}
x86 function
load_byte:
push ebp ; prologue
mov ebp, esp
lea ecx, [ebp + 8]
mov ecx, [ecx] ; ecx = &imgInfo
mov eax, [ecx+0] ; eax = width
add eax, 31 ; eax = width + 31
sar eax, 5 ; eax = (width + 31) >> 5
sal eax, 2 ; eax = ((width + 31) >> 5) << 2
mul DWORD [ebp+16] ; eax *= y (unsigned multiply; also clobbers edx with the high half)
mov edx, [ebp+12] ; edx = x
sar edx, 3 ; edx = x>>3
add eax, edx ; eax = (((width + 31) >> 5) << 2) * y + (x >> 3)
mov edx, [ecx+8] ; edx = &pImg
add eax, edx
mov eax, [eax]
pop ebp ; epilogue
ret
I tried checking whether the address computed by both functions is the same, so I changed the C function to return pPix and commented out the line mov eax, [eax] in the x86 version, and to my surprise both functions returned the same number. But in the unchanged form (as in the code above) the x86 function always returns -1 for some reason. Is return *pPix not equivalent to mov eax, [eax]? What is wrong with my reasoning?
imgInfo struct
typedef struct
{
int width, height;
unsigned char* pImg; //buffer
int cX, cY;
int col;
} imgInfo;
load_byte C declaration
extern int load_byte(imgInfo* pInfo, int x, int y);
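A likely explanation (my addition, not from the original thread): pPix has type unsigned char*, so return *pPix loads one byte and zero-extends it, whereas mov eax, [eax] loads four bytes. If the neighboring buffer bytes are 0xFF (common in a BMP's white areas), the four-byte read is 0xFFFFFFFF, i.e. -1; the byte-sized equivalent would be movzx eax, byte [eax]. A small C sketch of the difference:

#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char buf[4] = {0xFF, 0xFF, 0xFF, 0xFF}; // e.g. bytes of a white BMP region
    unsigned char *pPix = buf;

    // What return *pPix does: load one byte, zero-extended.
    printf("%d\n", *pPix); // 255

    // What mov eax, [eax] does: read four bytes as one signed int.
    int v;
    memcpy(&v, pPix, sizeof v);
    printf("%d\n", v); // -1 (0xFFFFFFFF)
    return 0;
}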

MPI C - Gather 2d Array Segments into One Global Array

I am trying to print a dynamically allocated 2D array from my master process after receiving all its components from all other processes. By components I mean subarrays, or blocks.
I have made the code generic with respect to the number of processes. The following diagram will help you see how the blocks are arranged in the complete array. Each block is handled by one process. Just for this example, let's assume that I run the program with 12 processes (my machine has 8 cores), using the command:
mpiexec -n 12 ./gather2dArray
This is the diagram, which targets specifically the 12-process scenario (image not reproduced here; it shows a 3 x 4 arrangement of blocks, each 4 rows by 6 columns, one block per process):
The answer by Jonathan in this question helped me a great deal, but unfortunately I have not been able to fully implement what I want.
I first create the blocks in each process, which I name grid. Every array is a dynamically allocated 2D array. I also create the global array (universe), visible only to the master process (#0).
Finally, I have to use MPI_Gatherv(...) to assemble all the subarrays into the global array. Then I proceed to display the local arrays and the global array.
When I run the program with the command above, I get a segmentation fault when I reach the MPI_Gatherv(...) call. I can't figure out what I am doing incorrectly. I have provided the complete code (heavily commented) below:
EDIT
I have fixed some mistakes in the code. Now MPI_Gatherv() is somewhat successful. I am able to print the entire first row of the global array correctly (I check the individual elements of the processes and they always match). But when I reach the second row, some hieroglyphics appear, and finally a segmentation fault. I haven't been able to figure out what is wrong there. Still looking into it.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
void print2dCharArray(char** array, int rows, int columns);
int main(int argc, char** argv)
{
int master = 0, np, rank;
char version[10];
char processorName[20];
int strLen[10];
// Initialize MPI environment
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &np);
if (np != 12) { MPI_Abort(MPI_COMM_WORLD,1); }
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// We need a different seed for each process
srand(time(0) ^ (rank * 33 / 4));
int nDims = 2; // array dimensions
int rows = 4, columns = 6; // rows and columns of each block
int prows = 3, pcolumns = 4; // rows and columns of blocks. Each block is handled by 1 process
char** grid = malloc(rows * sizeof(char*));
for (int i = 0; i < rows; i++)
grid[i] = malloc(columns * sizeof(char));
char** universe = NULL; // Global array
char* recvPtr; // Pointer to start of Global array
int Rows = rows * prows; // Global array rows
int Columns = columns * pcolumns; // Global array columns
int sizes[2]; // No of elements in each dimension of the whole array
int subSizes[2]; // No of elements in each dimension of the subarray
int startCoords[2]; // Starting coordinates of each subarray
MPI_Datatype recvBlock, recvMagicBlock;
if (rank == master){ // For the master's eyes only
universe = malloc(Rows * sizeof(char*));
for (int i = 0; i < Rows; i++)
universe[i] = malloc(Columns * sizeof(char));
// Create a subarray (a rectangular block) datatype from a regular, 2d array
sizes[0] = Rows;
sizes[1] = Columns;
subSizes[0] = rows;
subSizes[1] = columns;
startCoords[0] = 0;
startCoords[1] = 0;
MPI_Type_create_subarray(nDims, sizes, subSizes, startCoords, MPI_ORDER_C, MPI_CHAR, &recvBlock);
// Now modify the newly created datatype to fit our needs by giving it a new,
// smaller extent (the lower bound stays 0): columns * sizeof(elementType) bytes.
// With the shrunk extent, consecutive blocks in the receive buffer start that
// many bytes apart, so displacements can be expressed in whole block extents.
MPI_Type_create_resized(recvBlock, 0, columns * sizeof(char), &recvMagicBlock);
MPI_Type_commit(&recvMagicBlock);
recvPtr = &universe[0][0];
}
double density = 0.5; // assumed value; the original post uses `density` below without declaring it
// populate arrays
for (int y = 0; y < rows; y++){
for (int x = 0; x < columns; x++){
if (( (double) rand() / RAND_MAX) <= density)
grid[y][x] = '#';
else
grid[y][x] = '.';
}
}
// display local array
for (int i = 0; i < np; i++){
if (i == rank) {
printf("\n[Rank] of [total]: No%d of %d\n", rank, np);
print2dCharArray(grid, rows, columns);
}
MPI_Barrier(MPI_COMM_WORLD);
}
/* MPI_Gathering.. */
int recvCounts[np], displacements[np];
// recvCounts: how many chunks of data each process has -- in units of blocks here --
for (int i = 0; i < np; i++)
recvCounts[i] = 1;
// prows * pcolumns = np
// displacements: displacement relative to global buffer (universe) at which to place the
// incoming data block from process i -- in block extents! --
int index = 0;
for (int p_row = 0; p_row < prows; p_row++)
for (int p_column = 0; p_column < pcolumns; p_column++)
displacements[index++] = p_column + p_row * (rows * pcolumns);
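// In units of the resized extent (columns bytes): each global row holds pcolumns
// extents and a block spans `rows` global rows, so moving one block-row down
// advances rows * pcolumns extents, and moving one block right advances 1 extent.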
// MPI_Gatherv(...) is a collective routine
// Gather the local arrays to the global array in the master process
// send type: MPI_CHAR (a char)
// recv type: recvMagicBlock (a block)
MPI_Gatherv(&grid[0][0], rows * columns, MPI_CHAR, //: parameters relevant to sender
recvPtr, recvCounts, displacements, recvMagicBlock, master, //: parameters relevant to receiver
MPI_COMM_WORLD);
// display global array
MPI_Barrier(MPI_COMM_WORLD);
if (rank == master){
printf("\n---Global Array---\n");
print2dCharArray(universe, Rows, Columns);
}
MPI_Finalize();
return 0;
}
void print2dCharArray(char** array, int rows, int columns)
{
int i, j;
for (i = 0; i < rows; i++){
for (j = 0; j < columns; j++){
printf("%c ", array[i][j]);
}
printf("\n");
}
fflush(stdout);
}
The following is the output I'm getting. No matter what I try, I cannot get past this. As you can see, the first line of the global array is printed properly, using the first four blocks from the first four processes. When jumping to the next line, we get hieroglyphics:
hostname#User:~/mpi$ mpiexec -n 12 ./gather2darray
MPICH Version: 3User
Processor name: User
[Rank] of [total]: No0 of 12
. . # . . #
# . # # # .
. . . # # .
. . # . . .
[Rank] of [total]: No1 of 12
. . # # . .
. . . . # #
. # . . # .
. . # . . .
[Rank] of [total]: No2 of 12
. # # # . #
. # . . . .
# # # . . .
. . . # # .
[Rank] of [total]: No3 of 12
. . # # # #
. . # # . .
# . # . # .
. . . # . .
[Rank] of [total]: No4 of 12
. # . . . #
# . # . # .
# . . . . .
# . . . . .
[Rank] of [total]: No5 of 12
# # . # # .
# . . # # .
. . . . # .
. # # . . .
[Rank] of [total]: No6 of 12
. . # # . #
. . # . # .
# . . . . .
. . . # # #
[Rank] of [total]: No7 of 12
# # . # # .
. # # . . .
. . . . . #
. . . # # .
[Rank] of [total]: No8 of 12
. # . . . .
# . # . # .
. . . # . #
# . # # # .
[Rank] of [total]: No9 of 12
. . . . . #
. . # . . .
. . # . . #
. . # # . .
[Rank] of [total]: No10 of 12
. . . . # .
# . . . . .
. . # # . .
. . . # . #
[Rank] of [total]: No11 of 12
. # . . # .
. # . # # .
. . . # . .
. # . # . #
---Global Array---
. . # . . # . . # # . . . # # # . # . . # # # #
� � < * � � e { � � � � � �
J
*** Error in `./gather2darray': double free or corruption (out): 0x0000000001e4c050 ***
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
*** stack smashing detected ***: ./gather2darray terminated
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 10979 RUNNING AT User
= EXIT CODE: 139
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions
Help will be very much appreciated. Thanks in advance.
Your code is almost correct; you have just forgotten an important MPI principle. When you use an array with MPI functions, MPI assumes that the array's memory is allocated contiguously. So you have to change your 2D array allocations.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
void print2dCharArray(char** array, int rows, int columns);
int main(int argc, char** argv)
{
int master = 0, np, rank;
char version[10];
char processorName[20];
int strLen[10];
// Initialize MPI environment
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &np);
if (np != 12) { MPI_Abort(MPI_COMM_WORLD,1); }
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// We need a different seed for each process
srand(time(0) ^ (rank * 33 / 4));
int nDims = 2; // array dimensions
int rows = 4, columns = 6; // rows and columns of each block
int prows = 3, pcolumns = 4; // rows and columns of blocks. Each block is handled by 1 process
char* pre_grid = (char*) malloc(rows * columns * sizeof(char));
char** grid = (char**) malloc(rows * sizeof(char*));
for (int i = 0; i < rows; i++)
grid[i] = &(pre_grid[i * columns]);
char** universe = NULL; // Global array
char* pre_universe = NULL;
char* recvPtr; // Pointer to start of Global array
int Rows = rows * prows; // Global array rows
int Columns = columns * pcolumns; // Global array columns
int sizes[2]; // No of elements in each dimension of the whole array
int subSizes[2]; // No of elements in each dimension of the subarray
int startCoords[2]; // Starting coordinates of each subarray
MPI_Datatype recvBlock, recvMagicBlock;
if (rank == master){ // For the master's eyes only
/* universe = malloc(Rows * sizeof(char*));*/
/* for (int i = 0; i < Rows; i++)*/
/* universe[i] = malloc(Columns * sizeof(char));*/
pre_universe = (char*) malloc(Rows * Columns * sizeof(char));
universe = (char**) malloc(Rows * sizeof(char*));
for (int i = 0; i < Rows; i++) {
universe[i] = &(pre_universe[i * Columns]);
}
// Create a subarray (a rectangular block) datatype from a regular, 2d array
sizes[0] = Rows;
sizes[1] = Columns;
subSizes[0] = rows;
subSizes[1] = columns;
startCoords[0] = 0;
startCoords[1] = 0;
MPI_Type_create_subarray(nDims, sizes, subSizes, startCoords, MPI_ORDER_C, MPI_CHAR, &recvBlock);
// Now modify the newly created datatype to fit our needs by giving it a new,
// smaller extent (the lower bound stays 0): columns * sizeof(elementType) bytes.
// With the shrunk extent, consecutive blocks in the receive buffer start that
// many bytes apart, so displacements can be expressed in whole block extents.
MPI_Type_create_resized(recvBlock, 0, columns * sizeof(char), &recvMagicBlock);
MPI_Type_commit(&recvMagicBlock);
recvPtr = &universe[0][0];
}
// populate arrays
for (int y = 0; y < rows; y++){
for (int x = 0; x < columns; x++){
grid[y][x] = rank + 65;
}
}
// display local array
for (int i = 0; i < np; i++){
if (i == rank) {
printf("\n[Rank] of [total]: No%d of %d\n", rank, np);
print2dCharArray(grid, rows, columns);
}
MPI_Barrier(MPI_COMM_WORLD);
}
/* MPI_Gathering.. */
int recvCounts[np], displacements[np];
// recvCounts: how many chunks of data each process has -- in units of blocks here --
for (int i = 0; i < np; i++)
recvCounts[i] = 1;
// prows * pcolumns = np
// displacements: displacement relative to global buffer (universe) at which to place the
// incoming data block from process i -- in block extents! --
int index = 0;
for (int p_row = 0; p_row < prows; p_row++)
for (int p_column = 0; p_column < pcolumns; p_column++)
displacements[index++] = p_column + p_row * (rows * pcolumns);
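// For the 12-process layout here (prows = 3, pcolumns = 4, rows = 4), this yields
// displacements = {0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35}, in block extents.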
// MPI_Gatherv(...) is a collective routine
// Gather the local arrays to the global array in the master process
// send type: MPI_CHAR (a char)
// recv type: recvMagicBlock (a block)
MPI_Gatherv(&grid[0][0], rows * columns, MPI_CHAR, //: parameters relevant to sender
recvPtr, recvCounts, displacements, recvMagicBlock, master, //: parameters relevant to receiver
MPI_COMM_WORLD);
// display global array
MPI_Barrier(MPI_COMM_WORLD);
if (rank == master){
printf("\n---Global Array---\n");
print2dCharArray(universe, Rows, Columns);
}
free(grid[0]);
free(grid);
if (rank == master) {
free(universe[0]);
free(universe);
MPI_Type_free(&recvMagicBlock);
MPI_Type_free(&recvBlock);
}
MPI_Finalize();
return 0;
}
void print2dCharArray(char** array, int rows, int columns)
{
int i, j;
for (i = 0; i < rows; i++){
for (j = 0; j < columns; j++){
printf("%c ", array[i][j]);
}
printf("\n");
}
fflush(stdout);
}
Output:
---Global Array---
A A A A A A B B B B B B C C C C C C D D D D D D
A A A A A A B B B B B B C C C C C C D D D D D D
A A A A A A B B B B B B C C C C C C D D D D D D
A A A A A A B B B B B B C C C C C C D D D D D D
E E E E E E F F F F F F G G G G G G H H H H H H
E E E E E E F F F F F F G G G G G G H H H H H H
E E E E E E F F F F F F G G G G G G H H H H H H
E E E E E E F F F F F F G G G G G G H H H H H H
I I I I I I J J J J J J K K K K K K L L L L L L
I I I I I I J J J J J J K K K K K K L L L L L L
I I I I I I J J J J J J K K K K K K L L L L L L
I I I I I I J J J J J J K K K K K K L L L L L L

How does the second recursive call, return numberOfPaths(m-1, n) + numberOfPaths(m, n-1), work?

How does the second recursive call, return numberOfPaths(m-1, n) + numberOfPaths(m, n-1), work?
#include <iostream>
using namespace std;
// Returns count of possible paths to reach cell at row number m and column
// number n from the topmost leftmost cell (cell at 1, 1)
int numberOfPaths(int m, int n)
{
// If either given row number is first or given column number is first
if (m == 1 || n == 1)
return 1;
// If diagonal movements are allowed then the last addition
// is required.
return numberOfPaths(m-1, n) + numberOfPaths(m, n-1);
// + numberOfPaths(m-1,n-1);
}
int main()
{
cout << numberOfPaths(3, 3);
return 0;
}
It calls the function numberOfPaths again, once with (m-1, n) and once with (m, n-1), and adds their results.
If you called numberOfPaths(2,2), it would call numberOfPaths(1,2) and numberOfPaths(2,1), which both return 1, so numberOfPaths(2,2) returns 2. Likewise, numberOfPaths(3,3) = numberOfPaths(2,3) + numberOfPaths(3,2) = (1 + 2) + (2 + 1) = 6, which is what main prints.
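If it helps to see the same recurrence without recursion, here is a small C sketch (not from the original thread) that fills a table bottom-up; t[i][j] holds numberOfPaths(i+1, j+1):

#include <stdio.h>

// Same recurrence, tabulated: paths(m, n) = paths(m-1, n) + paths(m, n-1),
// with paths(1, j) = paths(i, 1) = 1.
int numberOfPathsTable(int m, int n)
{
    int t[m][n]; // C99 variable-length array
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            t[i][j] = (i == 0 || j == 0) ? 1 : t[i-1][j] + t[i][j-1];
    return t[m-1][n-1];
}

int main(void)
{
    printf("%d\n", numberOfPathsTable(3, 3)); // prints 6, matching the recursive version
    return 0;
}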

Is there a known algorithm for power towers modulo a number that handles all cases?

I would like to have an implementation in PARI/GP
for the calculation of
a_1 ^ a_2 ^ ... ^ a_n (mod m)
which handles all cases, especially those where high powers appear in the phi-chain.
Does anyone know of such an implementation?
Here's a possibility using Chinese remainders to reduce to the case where the modulus is a prime power. This simplifies the computation of x^n mod m in the painful case where gcd(x, m) is not 1. The code assumes the a_i are > 1; most of it checks whether a_1^(a_2^...^a_n) vanishes mod p^e when the prime p divides a_1, while avoiding overflow.
\\ x[1]^x[2]^ ...^ x[#x] mod m, assuming x[i] > 1 for all i
tower(x, m) =
{ my(f = factor(m), P = f[,1], E = f[,2]);
chinese(vector(#P, i, towerp(x, P[i], E[i])));
}
towerp(x, p, e) =
{ my(q = p^e, i, t, v);
if (#x == 0, return (Mod(1, q)));
if (#x == 1, return (Mod(x[1], q)));
if (v = valuation(x[1], p),
t = x[#x]; i = #x;
while (i > 1,
if (t >= e, return (Mod(0, q)));
t = x[i]^t; i--);
if (t * v >= e, return (Mod(0, q)));
return (Mod(x[1], q)^t);
);
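\\ Reaching this point means p does not divide x[1], so the exponent tower may be
\\ reduced modulo any multiple of eulerphi(p^e) = (p-1)*p^(e-1); (p-1)*p^e is one such multiple.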
Mod(x[1], q)^lift(tower(x[^1], (p-1)*p^e));
}
For instance
? 5^(4^(3^2)) % 163 \\ direct computation, wouldn't scale
%1 = 158
? tower([5,4,3,2], 163)
%2 = Mod(158, 163)
