Performance degradation when using an alternative to the Intel SSSE3 intrinsic _mm_hadd_epi16

I am developing a performance-critical application which has to be ported to an Intel Atom processor that supports only MMX, SSE, SSE2 and SSE3. My previous application used SSSE3 as well as AVX; now I want to downgrade it to the Intel Atom feature set (MMX, SSE, SSE2, SSE3).
There is a serious performance drop when I replace one SSSE3 instruction in particular, _mm_hadd_epi16, with this code:
RegTemp1 = _mm_setr_epi16(RegtempRes1.m128i_i16[0], RegtempRes1.m128i_i16[2],
                          RegtempRes1.m128i_i16[4], RegtempRes1.m128i_i16[6],
                          Regfilter.m128i_i16[0], Regfilter.m128i_i16[2],
                          Regfilter.m128i_i16[4], Regfilter.m128i_i16[6]);
RegTemp2 = _mm_setr_epi16(RegtempRes1.m128i_i16[1], RegtempRes1.m128i_i16[3],
                          RegtempRes1.m128i_i16[5], RegtempRes1.m128i_i16[7],
                          Regfilter.m128i_i16[1], Regfilter.m128i_i16[3],
                          Regfilter.m128i_i16[5], Regfilter.m128i_i16[7]);
RegtempRes1 = _mm_add_epi16(RegTemp1, RegTemp2);
This is the best conversion I was able to come up with for this particular instruction, but the change has seriously affected the performance of the entire program, presumably because element-wise access through m128i_i16 forces the values out of the SIMD registers and through memory.
Can anyone suggest a more efficient alternative to _mm_hadd_epi16 that uses only MMX, SSE, SSE2 and SSE3 instructions? Thanks in advance.

_mm_hadd_epi16(a, b) can be simulated with the following code:
/* (b3, a3, b2, a2, b1, a1, b0, a0) */
__m128i ab0 = _mm_unpacklo_epi16(a, b);
/* (b7, a7, b6, a6, b5, a5, b4, a4) */
__m128i ba0 = _mm_unpackhi_epi16(a, b);
/* (b5, b1, a5, a1, b4, b0, a4, a0) */
__m128i ab1 = _mm_unpacklo_epi16(ab0, ba0);
/* (b7, b3, a7, a3, b6, b2, a6, a2) */
__m128i ba1 = _mm_unpackhi_epi16(ab0, ba0);
/* (b6, b4, b2, b0, a6, a4, a2, a0) */
__m128i ab2 = _mm_unpacklo_epi16(ab1, ba1);
/* (b7, b5, b3, b1, a7, a5, a3, a1) */
__m128i ba2 = _mm_unpackhi_epi16(ab1, ba1);
/* (b6+b7, b4+b5, b2+b3, b0+b1, a6+a7, a4+a5, a2+a3, a0+a1) */
__m128i c = _mm_add_epi16(ab2, ba2);
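If it helps to keep call sites tidy, the whole sequence can be wrapped in a small drop-in helper (a minimal sketch; the name hadd_epi16_sse2 is mine, not a standard intrinsic):
#include <emmintrin.h>  /* SSE2 */

/* SSE2-only stand-in for the SSSE3 _mm_hadd_epi16(a, b). */
static inline __m128i hadd_epi16_sse2(__m128i a, __m128i b)
{
    __m128i ab0 = _mm_unpacklo_epi16(a, b);     /* interleave low halves of a and b */
    __m128i ba0 = _mm_unpackhi_epi16(a, b);     /* interleave high halves of a and b */
    __m128i ab1 = _mm_unpacklo_epi16(ab0, ba0);
    __m128i ba1 = _mm_unpackhi_epi16(ab0, ba0);
    __m128i ab2 = _mm_unpacklo_epi16(ab1, ba1); /* even elements of a, then of b */
    __m128i ba2 = _mm_unpackhi_epi16(ab1, ba1); /* odd elements of a, then of b */
    return _mm_add_epi16(ab2, ba2);             /* pairwise sums */
}
In the question's code, RegtempRes1 = hadd_epi16_sse2(RegtempRes1, Regfilter); then replaces both _mm_setr_epi16 calls and keeps all values in XMM registers.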

If your goal is to take the horizontal sum of eight 16-bit values, you can do this with SSE2 like this:
__m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 4 high elements
__m128i sum2 = _mm_add_epi16(a,sum1); // 4 sums
__m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 2 high elements
__m128i sum4 = _mm_add_epi16(sum2,sum3); // 2 sums
__m128i sum5 = _mm_shufflelo_epi16(sum4,0x01); // 1 high element
__m128i sum6 = _mm_add_epi16(sum4,sum5); // 1 sum
int16_t sum7 = (int16_t)_mm_cvtsi128_si32(sum6); // 16 bit sum
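As a quick sanity check, the reduction can be compared against a plain scalar loop (throwaway test code of my own, assuming SSE2 headers):
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t v[8] = {1, -2, 3, 4, 5, 6, 7, 8};
    __m128i a = _mm_loadu_si128((const __m128i *)v);

    __m128i sum1 = _mm_shuffle_epi32(a, 0x0E);
    __m128i sum2 = _mm_add_epi16(a, sum1);
    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
    __m128i sum4 = _mm_add_epi16(sum2, sum3);
    __m128i sum5 = _mm_shufflelo_epi16(sum4, 0x01);
    __m128i sum6 = _mm_add_epi16(sum4, sum5);
    int16_t simd = (int16_t)_mm_cvtsi128_si32(sum6);

    int16_t scalar = 0;
    for (int i = 0; i < 8; i++)
        scalar += v[i];

    printf("simd=%d scalar=%d\n", simd, scalar); /* both print 32 */
    return 0;
}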

lbfgs: how to find gradient

I want to use the L-BFGS method to minimize a function. The problem is that the function is the Svensson function (see: Svensson function) and I do not know how to find the gradient of such a function, where tau (time) goes from 1 to 15.
Any help?
This is the gradient: take the partial derivatives with respect to each of the six parameters (b0, b1, b2, b3, l1, l2); tau enters only through the data.
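Written out for f(t) = b0 + b1·e^(−t/l1) + b2·(t/l1)·e^(−t/l1) + b3·(t/l2)·e^(−t/l2), with l1, l2 written as λ1, λ2, the partials corresponding to the terms a–f in the code below are:
$$
\frac{\partial f}{\partial b_0} = 1,\quad
\frac{\partial f}{\partial b_1} = e^{-t/\lambda_1},\quad
\frac{\partial f}{\partial b_2} = \frac{t}{\lambda_1}e^{-t/\lambda_1},\quad
\frac{\partial f}{\partial b_3} = \frac{t}{\lambda_2}e^{-t/\lambda_2},
$$
$$
\frac{\partial f}{\partial \lambda_1} = \frac{t\,e^{-t/\lambda_1}\bigl(b_1\lambda_1 + b_2(t-\lambda_1)\bigr)}{\lambda_1^3},\quad
\frac{\partial f}{\partial \lambda_2} = -\frac{b_3\,t\,e^{-t/\lambda_2}(\lambda_2-t)}{\lambda_2^3}.
$$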
We can numerically check the gradient is correct.
grad = function(t, b0, b1, b2, b3, l1, l2) {
  a = 1
  b = exp(-t/l1)
  c = t/l1 * exp(-t/l1)
  d = t/l2 * exp(-t/l2)
  e = t * exp(-t/l1) * (b1*l1 + b2*(t - l1)) / l1^3
  f = -b3 * t * exp(-t/l2) * (l2 - t) / l2^3
  return(c(a, b, c, d, e, f))
}
func = function(t, b0, b1, b2, b3, l1, l2) {
  return(b0 + b1*exp(-t/l1) + b2*t/l1*exp(-t/l1) + b3*t/l2*exp(-t/l2))
}
x0 = runif(6)
x1 = x0 + rnorm(6, 0, .01)
f0 = func(1, x0[1], x0[2], x0[3], x0[4], x0[5], x0[6])
f0grd = grad(1, x0[1], x0[2], x0[3], x0[4], x0[5], x0[6])
f1 = func(1, x1[1], x1[2], x1[3], x1[4], x1[5], x1[6])
f1 - f0
# [1] 0.009506896
sum(f0grd * (x1 - x0))  # first-order approximation of f1 - f0
# [1] 0.009467063

RISC-V Recursive Factorial Function Debugging

I'm trying to create a recursive factorial function in RISC-V but am having some problems.
Here's what we have so far:
.globl factorial
.data
n: .word 8
.text
main:
    la t0, n
    lw a0, 0(t0)
    jal ra, factorial
    addi a1, a0, 0
    addi a0, x0, 1
    ecall              # Print result
    addi a1, x0, '\n'
    addi a0, x0, 11
    ecall              # Print newline
    addi a0, x0, 10
    ecall              # Exit
factorial:
    la t1, n
    beq x0, t1, finish
    addi t0, t1, -1
    mul a0, t0, a0
    j factorial
finish:
    ret
    ecall
We tried adding and changing around the registers to use, but it's still not loading the correct values into the correct registers. We're also kind of stuck on how to do this recursively. Would love some help!
Your main code looks fine; all of the issues I see are in the factorial function. First, there are four clear issues:
factorial:
    # This loads the address of n, not the value at label n.
    # You need an additional lw t1, 0(t1) to get the value.
    la t1, n
    # t1 is never modified, so why would this loop ever terminate?
    beq x0, t1, finish
    # You should do these two operations in the opposite order:
    # if t1 = 1, a0 would become 0.
    addi t0, t1, -1
    mul a0, t0, a0
    j factorial
finish:
    ret
    # Why ecall here? You have already returned, so this is unreachable.
    ecall
However, you can't just fix those and expect it to work. Your current implementation lacks a plan for how to actually compute the factorial. I assume you were trying for an implementation like the following:
int factorial_recursive(int n) {
    if (n == 0) {
        return 1;
    }
    int recursive = factorial_recursive(n - 1);
    return n * recursive;
}
A direct translation of that C code would need to use the stack to save n and the return address, and to properly follow the calling convention. I am not prepared to write out a complete explanation of that here, so I will instead explain how to convert the looping version of factorial to get you started in the right direction (a sketch of the recursive version appears at the end).
The C code I will implement in RISC-V assembly is:
int factorial_loop(int n) {
    int out = 1;
    while (n > 0) {
        out *= n;
        n -= 1;
    }
    return out;
}
For this code, n will start out in a0, but it will eventually need to be moved out of the way so that we can return out. We will therefore allocate registers such that the function looks like:
int factorial_loop(int a0) {
    int a1 = 1;
    while (a0 > 0) {
        a1 *= a0;
        a0 -= 1;
    }
    a0 = a1;
    return a0;
}
From here it is pretty easy to do a direct conversion.
factorial_loop:
    li a1, 1           # int a1 = 1;
loop:
    beq a0, x0, finish # while (a0 > 0) {
    mul a1, a1, a0     #     a1 *= a0;
    addi a0, a0, -1    #     a0 -= 1;
    j loop             # }
finish:
    mv a0, a1          # a0 = a1;
    ret                # return a0;
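For completeness, here is a minimal, untested sketch of the recursive version, assuming the standard RISC-V calling convention: save ra and n on the stack around the recursive call.
factorial:
    beq a0, x0, base   # if (n == 0) return 1;
    addi sp, sp, -16   # allocate a stack frame
    sw ra, 12(sp)      # save the return address
    sw a0, 8(sp)       # save n
    addi a0, a0, -1    # argument = n - 1
    jal ra, factorial  # recursive = factorial(n - 1)
    lw t0, 8(sp)       # restore n
    lw ra, 12(sp)      # restore the return address
    addi sp, sp, 16    # release the frame
    mul a0, t0, a0     # return n * recursive;
    ret
base:
    li a0, 1           # return 1;
    ret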

MPI_Alltoall on an unequal number of grids and processes

I understand the general usage of MPI_Alltoall, where process i sends its j-th block to process j and receives its i-th block from each process (the usual illustration assumes process = grid = 4).
But in practice the number of processes is almost never equal to the number of grid cells, in which case I have rectangular grids. As an example, consider a similar all-to-all operation with an unequal number of grid cells and processes (grid = 8, process = 2).
My question is then very straightforward: how should I achieve that?
I have looked at MPI_Alltoallv, but I don't think it will work.
Any suggestions are welcome. Thank you.
a "natural" alltoall would be
MPI_Alltoall(sbuf, 4, MPI_INT, rbuf, 4, MPI_INT, MPI_COMM_WORLD);
and you would end up with
P0 = { A0, A1, A2, A3, C0, C1, C2, C3}
P1 = { B0, B1, B2, B3, D0, D1, D2, D3}
Your case is a bit convoluted, and you have to use (complex) derived datatypes; the resized extents below are in bytes, assuming a 4-byte int. (Note that I did not free the intermediate datatypes, in order to keep the code readable.)
MPI_Datatype tmp, stype, rtype;

/* derived datatype for send */
MPI_Type_vector(2, 1, 4, MPI_INT, &tmp);    /* elements {0, 4} */
MPI_Type_create_resized(tmp, 0, 4, &tmp);   /* extent 4 bytes: next type starts at element 1 */
MPI_Type_contiguous(2, tmp, &tmp);          /* elements {0, 4, 1, 5} */
MPI_Type_create_resized(tmp, 0, 8, &stype); /* extent 8 bytes: next type starts at element 2 (likely already the case) */
MPI_Type_commit(&stype);

/* derived datatype for recv */
MPI_Type_vector(2, 2, 4, MPI_INT, &tmp);    /* elements {0, 1, 4, 5} */
MPI_Type_create_resized(tmp, 0, 8, &rtype); /* extent 8 bytes: next type starts at element 2 */
MPI_Type_commit(&rtype);

/* all-to-all; thanks to the derived datatypes:
   P0 sends {A0, B0, A1, B1} to P0 and {A2, B2, A3, B3} to P1
   P0 receives {A0, B0, .., .., A1, B1, .., ..} from itself, and
               {.., .., C0, D0, .., .., C1, D1} from P1 */
MPI_Alltoall(sbuf, 1, stype, rbuf, 1, rtype, MPI_COMM_WORLD);
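If you want to verify the layout, here is a self-contained test harness of my own (not part of the original answer); run it with 2 ranks:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, sbuf[8], rbuf[8];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* P0 holds A0..A3, B0..B3 encoded as 0..3, 10..13;
       P1 holds C0..C3, D0..D3 encoded as 20..23, 30..33 */
    for (int i = 0; i < 4; i++) {
        sbuf[i]     = 20 * rank + i;
        sbuf[i + 4] = 20 * rank + 10 + i;
    }

    MPI_Datatype tmp, stype, rtype;
    MPI_Type_vector(2, 1, 4, MPI_INT, &tmp);
    MPI_Type_create_resized(tmp, 0, 4, &tmp);
    MPI_Type_contiguous(2, tmp, &tmp);
    MPI_Type_create_resized(tmp, 0, 8, &stype);
    MPI_Type_commit(&stype);
    MPI_Type_vector(2, 2, 4, MPI_INT, &tmp);
    MPI_Type_create_resized(tmp, 0, 8, &rtype);
    MPI_Type_commit(&rtype);

    MPI_Alltoall(sbuf, 1, stype, rbuf, 1, rtype, MPI_COMM_WORLD);

    /* expected on P0: 0 10 20 30 1 11 21 31, i.e. {A0, B0, C0, D0, A1, B1, C1, D1} */
    printf("P%d:", rank);
    for (int i = 0; i < 8; i++)
        printf(" %d", rbuf[i]);
    printf("\n");

    MPI_Finalize();
    return 0;
}
Compiled with mpicc and run with mpirun -np 2, P0 should print 0 10 20 30 1 11 21 31 and P1 should print 2 12 22 32 3 13 23 33, matching the comments in the answer.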

Determining whether there is a descending pattern between two sampled numbers

I have two numbers that are samples of two different quantities (it doesn't really matter what they are). Both fluctuate with time, and I have samples of each from two different points in time. Call them a0, a1, b0, b1. I can use the differences (a1-a0 and b1-b0), and the difference and sum of the differences ((a1-a0)-(b1-b0) and (a1-a0)+(b1-b0)).
My question is how you determine that both of them are descending, in a fashion that doesn't hard-code any constants. Let me explain.
I want to detect when both of these quantities have decreased by a certain amount, but that amount may change if I change the quantities I'm sampling, so I can't hard-code a constant.
I'm sorry if this is vague, but that's really all the information I have. I was just wondering if this is even solvable.
if (a1 - a0 < 0)
    if (b1 - b0 < 0) {
        // ... descending
    }
or:
if (a1 - a0 + b1 - b0 < a1 - a0)       // b1 - b0 is negative
    if (a1 - a0 + b1 - b0 < b1 - b0) { // a1 - a0 is negative
        // ... descending
    }
To add a threshold is simple:
if (a1 - a0 < -K)
    if (b1 - b0 < -K) {
        // ... descending by more than K
    }
or:
if (a1 - a0 + b1 - b0 < a1 - a0 - K)       // b1 - b0 is less than -K
    if (a1 - a0 + b1 - b0 < b1 - b0 - K) { // a1 - a0 is less than -K
        // ... descending by more than K
    }
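If you need to avoid a hard-coded K, one option is to let the caller derive it from the scale of the quantities being sampled. A minimal sketch along those lines (the helper name and the relative-threshold idea are mine, not from the question):
#include <math.h>
#include <stdbool.h>

/* True when both quantities dropped by more than a threshold that
   scales with the magnitude of the samples (rel = 0.05 means 5%). */
static bool both_descending(double a0, double a1,
                            double b0, double b1, double rel)
{
    double ka = rel * fabs(a0);
    double kb = rel * fabs(b0);
    return (a1 - a0 < -ka) && (b1 - b0 < -kb);
}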

Multiply multiple __m128 with a single entry of __m256

I have 8 __m128 registers and each register needs to be multiplied by a single entry of another __m256 register.
One solution that jumps to my mind would be:
INPUT: __m128 a[8]; __m256 b;
__m128 tmp = _mm256_extractf128_ps(b,0);
a[0] = _mm_mul_ps(a[0],_mm_shuffle_ps(tmp,tmp,0));
a[1] = _mm_mul_ps(a[1],_mm_shuffle_ps(tmp,tmp,0x55));
a[2] = _mm_mul_ps(a[2],_mm_shuffle_ps(tmp,tmp,0xAA));
a[3] = _mm_mul_ps(a[3],_mm_shuffle_ps(tmp,tmp,0xFF));
tmp = _mm256_extractf128_ps(b,1);
a[4] = _mm_mul_ps(a[4],_mm_shuffle_ps(tmp,tmp,0));
a[5] = _mm_mul_ps(a[5],_mm_shuffle_ps(tmp,tmp,0x55));
a[6] = _mm_mul_ps(a[6],_mm_shuffle_ps(tmp,tmp,0xAA));
a[7] = _mm_mul_ps(a[7],_mm_shuffle_ps(tmp,tmp,0xFF));
What would be the best way to achieve this? Thank you.
I think your solution is about as good as it's going to get, except that I would use explicit variables rather than an array, so that everything stays in registers as far as possible:
__m128 a0, a1, a2, a3, a4, a5, a6, a7;
__m256 b;
__m128 tmp = _mm256_extractf128_ps(b,0);
a0 = _mm_mul_ps(a0, _mm_shuffle_ps(tmp,tmp,0));
a1 = _mm_mul_ps(a1, _mm_shuffle_ps(tmp,tmp,0x55));
a2 = _mm_mul_ps(a2, _mm_shuffle_ps(tmp,tmp,0xAA));
a3 = _mm_mul_ps(a3, _mm_shuffle_ps(tmp,tmp,0xFF));
tmp = _mm256_extractf128_ps(b,1);
a4 = _mm_mul_ps(a4, _mm_shuffle_ps(tmp,tmp,0));
a5 = _mm_mul_ps(a5, _mm_shuffle_ps(tmp,tmp,0x55));
a6 = _mm_mul_ps(a6, _mm_shuffle_ps(tmp,tmp,0xAA));
a7 = _mm_mul_ps(a7, _mm_shuffle_ps(tmp,tmp,0xFF));
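As a side note, the shuffle immediates can be spelled with the _MM_SHUFFLE macro, which makes the broadcast intent more obvious (same generated code; the SPLAT_PS name is mine):
#include <xmmintrin.h>

/* Broadcast lane i (0..3) of a __m128 across all four lanes:
   _MM_SHUFFLE(i, i, i, i) expands to 0x00, 0x55, 0xAA or 0xFF. */
#define SPLAT_PS(v, i) _mm_shuffle_ps((v), (v), _MM_SHUFFLE((i), (i), (i), (i)))
so, for example, a1 = _mm_mul_ps(a1, SPLAT_PS(tmp, 1)); is equivalent to the 0x55 line above.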
