I'm analyzing a piece of linear algebra code which is calling intrinsics directly, e.g.
v_dot0 = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
My test script computes the dot product of two double precision vectors of length 4 (so only one call to _mm256_fmadd_pd needed), repeated 1 billion times. When I count the number of operations with perf I get something as follows:
Performance counter stats for './main':
0 r5380c7 (skl::FP_ARITH:512B_PACKED_SINGLE) (49.99%)
0 r5340c7 (skl::FP_ARITH:512B_PACKED_DOUBLE) (49.99%)
0 r5320c7 (skl::FP_ARITH:256B_PACKED_SINGLE) (49.99%)
2'998'943'659 r5310c7 (skl::FP_ARITH:256B_PACKED_DOUBLE) (50.01%)
0 r5308c7 (skl::FP_ARITH:128B_PACKED_SINGLE) (50.01%)
1'999'928'140 r5304c7 (skl::FP_ARITH:128B_PACKED_DOUBLE) (50.01%)
0 r5302c7 (skl::FP_ARITH:SCALAR_SINGLE) (50.01%)
1'000'352'249 r5301c7 (skl::FP_ARITH:SCALAR_DOUBLE) (49.99%)
I was surprised that the number of 256B_PACKED_DOUBLE operations is approx. 3 billion, instead of 1 billion, as this is an instruction from my architecture's instruction set. Why does perf count 3 packed double operations per call to _mm256_fmadd_pd?
Note: to test that the code is not calling other floating point operations accidentally, I commented out the call to the above mentioned intrinsic, and perf counts exactly zero 256B_PACKED_DOUBLE operations, as expected.
Edit: MCVE, as requested:
ddot.c
#include <immintrin.h> // AVX
double ddot(int m, double *x, double *y) {
int ii;
double dot = 0.0;
__m128d u_dot0, u_x0, u_y0, u_tmp;
__m256d v_dot0, v_dot1, v_x0, v_x1, v_y0, v_y1, v_tmp;
v_dot0 = _mm256_setzero_pd();
v_dot1 = _mm256_setzero_pd();
u_dot0 = _mm_setzero_pd();
ii = 0;
for (; ii < m - 3; ii += 4) {
v_x0 = _mm256_loadu_pd(&x[ii + 0]);
v_y0 = _mm256_loadu_pd(&y[ii + 0]);
v_dot0 = _mm256_fmadd_pd(v_x0, v_y0, v_dot0);
}
// reduce
v_dot0 = _mm256_add_pd(v_dot0, v_dot1);
u_tmp = _mm_add_pd(_mm256_castpd256_pd128(v_dot0), _mm256_extractf128_pd(v_dot0, 0x1));
u_tmp = _mm_hadd_pd(u_tmp, u_tmp);
u_dot0 = _mm_add_sd(u_dot0, u_tmp);
_mm_store_sd(&dot, u_dot0);
return dot;
}
main.c:
#include <stdio.h>
double ddot(int, double *, double *);
int main(int argc, char const *argv[]) {
double x[4] = {1.0, 2.0, 3.0, 4.0}, y[4] = {5.0, 5.0, 5.0, 5.0};
double xTy;
for (int i = 0; i < 1000000000; ++i) {
ddot(4, x, y);
}
printf(" %f\n", xTy);
return 0;
}
I run perf as
sudo perf stat -e r5380c7 -e r5340c7 -e r5320c7 -e r5310c7 -e r5308c7 -e r5304c7 -e r5302c7 -e r5301c7 ./a.out
The disassembly of ddot looks as follows:
0000000000000790 <ddot>:
790: 83 ff 03 cmp $0x3,%edi
793: 7e 6b jle 800 <ddot+0x70>
795: 8d 4f fc lea -0x4(%rdi),%ecx
798: c5 e9 57 d2 vxorpd %xmm2,%xmm2,%xmm2
79c: 31 c0 xor %eax,%eax
79e: c1 e9 02 shr $0x2,%ecx
7a1: 48 83 c1 01 add $0x1,%rcx
7a5: 48 c1 e1 05 shl $0x5,%rcx
7a9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
7b0: c5 f9 10 0c 06 vmovupd (%rsi,%rax,1),%xmm1
7b5: c5 f9 10 04 02 vmovupd (%rdx,%rax,1),%xmm0
7ba: c4 e3 75 18 4c 06 10 vinsertf128 $0x1,0x10(%rsi,%rax,1),%ymm1,%ymm1
7c1: 01
7c2: c4 e3 7d 18 44 02 10 vinsertf128 $0x1,0x10(%rdx,%rax,1),%ymm0,%ymm0
7c9: 01
7ca: 48 83 c0 20 add $0x20,%rax
7ce: 48 39 c1 cmp %rax,%rcx
7d1: c4 e2 f5 b8 d0 vfmadd231pd %ymm0,%ymm1,%ymm2
7d6: 75 d8 jne 7b0 <ddot+0x20>
7d8: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0
7dc: c5 ed 58 d0 vaddpd %ymm0,%ymm2,%ymm2
7e0: c4 e3 7d 19 d0 01 vextractf128 $0x1,%ymm2,%xmm0
7e6: c5 f9 58 d2 vaddpd %xmm2,%xmm0,%xmm2
7ea: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0
7ee: c5 e9 7c d2 vhaddpd %xmm2,%xmm2,%xmm2
7f2: c5 fb 58 d2 vaddsd %xmm2,%xmm0,%xmm2
7f6: c5 f9 28 c2 vmovapd %xmm2,%xmm0
7fa: c5 f8 77 vzeroupper
7fd: c3 retq
7fe: 66 90 xchg %ax,%ax
800: c5 e9 57 d2 vxorpd %xmm2,%xmm2,%xmm2
804: eb da jmp 7e0 <ddot+0x50>
806: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
80d: 00 00 00
I just tested with an asm loop on SKL. An FMA instructions like vfmadd231pd ymm0, ymm1, ymm3 counts for 2 counts of fp_arith_inst_retired.256b_packed_double, even though it's a single uop!
I guess Intel really wanted a FLOP counter, not an instruction or uop counter.
Your 3rd 256-bit FP uop is probably coming from something else you're doing, like a horizontal sum that starts out doing a 256-bit shuffle and another 256-bit add, instead of reducing to 128-bit first. I hope you're not using _mm256_hadd_pd!
Test code inner loop:
$ asm-link -d -n "testloop.asm" # assemble with NASM -felf64 and link with ld into a static binary
mov ebp, 100000000 # setup stuff outside the loop
vzeroupper
0000000000401040 <_start.loop>:
401040: c4 e2 f5 b8 c3 vfmadd231pd ymm0,ymm1,ymm3
401045: c4 e2 f5 b8 e3 vfmadd231pd ymm4,ymm1,ymm3
40104a: ff cd dec ebp
40104c: 75 f2 jne 401040 <_start.loop>
$ taskset -c 3 perf stat -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,instructions,uops_issued.any,uops_executed.thread,fp_arith_inst_retired.256b_packed_double -r4 ./"$t"
Performance counter stats for './testloop-cvtss2sd' (4 runs):
102.67 msec task-clock # 0.999 CPUs utilized ( +- 0.00% )
2 context-switches # 24.510 M/sec ( +- 20.00% )
0 cpu-migrations # 0.000 K/sec
2 page-faults # 22.059 M/sec ( +- 11.11% )
400,388,898 cycles # 3925381.355 GHz ( +- 0.00% )
100,050,708 branches # 980889291.667 M/sec ( +- 0.00% )
400,256,258 instructions # 1.00 insn per cycle ( +- 0.00% )
300,377,737 uops_issued.any # 2944879772.059 M/sec ( +- 0.00% )
300,389,230 uops_executed.thread # 2944992450.980 M/sec ( +- 0.00% )
400,000,000 fp_arith_inst_retired.256b_packed_double # 3921568627.451 M/sec
0.1028042 +- 0.0000170 seconds time elapsed ( +- 0.02% )
400M counts of fp_arith_inst_retired.256b_packed_double for 200M FMA instructions / 100M loop iterations.
(IDK what up with perf 4.20.g8fe28c + kernel 4.20.3-arch1-1-ARCH. They calculate per-second stuff with the decimal in the wrong place for the unit. e.g. 3925381.355 kHz is correct, not GHz. Not sure if it's a bug in perf or the kernel.
Without vzeroupper, I'd sometimes see a latency of 5 cycles, not 4, for FMA. IDK if the kernel left a register in a polluted state or something.
Why do I get three though, and not two? (see MCVE added to original post)
Your ddot4 runs _mm256_add_pd(v_dot0, v_dot1); at the start of the cleanup, and since you call it with size=4, you get the cleanup once per FMA.
Note that your v_dot1 is always zero (because you didn't actually unroll with 2 accumulators like you're planning to?) So this is pointless, but the CPU doesn't know that. My guess was wrong, it's not a 256-bit hadd, it's just a useless 256-bit vertical add.
(For larger vectors, yes multiple accumulators are very valuable to hide FMA latency. You'll want at least 8 vectors. See Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? for more about unrolling with multiple accumulators. But then you'll want a cleanup loop that does 1 vector at a time until you're down to the last up-to-3 elements.)
Also, I think your final _mm_add_sd(u_dot0, u_tmp); is actually a bug: you've already added the last pair of elements with an inefficient 128-bit hadd, so this double-counts the lowest element.
See Get sum of values stored in __m256d with SSE/AVX for a way that doesn't suck.
Also note that GCC is splitting your unaligned loads into 128-bit halves with vinsertf128 because you compiled with the default -mtune=generic (which favours Sandybridge) instead of using -march=haswell to enable AVX+FMA and set -mtune=haswell. (Or use -march=native)
Given a base array, I need to compute the value of a function given below:
A[] = { a0, a1, a2, a3, .. an }
F(0,i) = ai [Base case]
F(1,i) = F(0,i) xor F(0,i+1) xor F(0,i+2) ... xor F(0,n)
F(2,i) = F(1,i) xor F(1,i+1) xor F(1,i+2) ... xor F(1,n)
.
.
F(x,i) = F(x-1,i) xor F(x-1,i+1) xor F(x-1,i+2) ... xor F(x-1,n)
0 < x < 10^18
0 < n < 10^5
I need to find F(x,0).
I am trying to solve this equation for the past 3 days. I have failed to optimise it and come up with a feasible solution. Any help to find F(x,0) in less than linear time is appreciated.
My observation (if it is of any importance) :
F(0,0) = a0
F(1,0) = a0^a1^a2^a3 ....
F(2,0) = a0^a2^a4 ....
F(3,0) = (a0^a4^a8...) ^ (a1^a5^a9...)
F(4,0) = a0^a4^a8 ....
F(5,0) = (a0^a1^a2^a3) ^ (a8^a9^a10^a11) ^ (a16^..a19) ^ ...
F(6,0) = (a0^a8^a16...) ^ (a2^a10^a18...)
F(7,0) = (a0^a8^a16...) ^ (a1^a9^a17...)
F(8,0) = a0^a8^a16^ ....
Maybe it becomes easier if you reverse the array, with the recurrence relation
F(0, i) = a[n-i]
F(x, i) = XOR[0 <= j < i]( F(x-1, j) )
that is, going from 0 to i instead of i to n. We are then looking for F(x, n). If we make a table:
| 0 1 2 3
--+-------------------------------------------------------------------------------------------
0 | A=a[n] B=a[n-1] C=a[n-2] D=a[n-3]
1 | A A^B A^B^C A^B^C^D
2 | A A^(A^B) A^(A^B)^(A^B^C) A^(A^B)^(A^B^C)^(A^B^C^D)^B^C^D
3 | A A^(A^(A^B)) A^(A^(A^B))^(A^(A^B)^(A^B^C)) A^(A^(A^B))^(A^(A^B)^(A^B^C))^(A^(A^B)^(A^B^C)^(A^B^C^D))
or let's stop using the XOR sign and just count how often each term appears:
| 0 1 2 3 4
--+-----------------------------------------------
0 | 1A 1B 1C 1D 1E
1 | 1A 1A1B 1A1B1C 1A 1B1C1D 1A 1B 1C1D1E
2 | 1A 2A1B 3A2B1C 4A 3B2C1D 5A 4B 3C2D1E
3 | 1A 3A1B 6A3B1C 10A 6B3C1D 15A10B 6C3D1E
4 | 1A 4A1B 10A4B1C 20A10B4C1D 35A20B10C4D1E
We can also see that the recurrence relation is the same as
F(x, i) = F(x-1, i) + F(x, i-1)
This is Pascal's Triangle. Specifically, the values we are after are the nth diagonal, which can of course be calculated on its own. Then check whether the number is even or odd, so that you know whether the XOR operations on that element cancel themselves out or not.
I am receiving data over UART from a heat meter, but I need some help to understand how i should deal with the data.
I have the documentation but that is not enough for me, I have to little experience with this kind of calculations.
Maybe someone with the right skill could explain to me how it should be done with a better example that I have from the documentation.
One value consists of the following bytes:
[number of bytes][sign+exponent] (integer)
(integer) is the register data value. The length of the integer value is
specified by [number of bytes]. [sign+exponent] is an 8-bit value that
specifies the sign of the data value and sign and value of the exponent. The
meaning of the individual bits in the [sign+exponent] byte is shown below:
Examples:
-123.45 = 04h, C2h, 0h, 0h, 30h, 39h
87654321*103 = 04h, 03h , 05h, 39h, 7Fh, B1h
255*103 = 01h, 03h , FFh
And now to one more example with actual data.
This is the information that I have from the documentation about this.
This is some data that I have received from my heat meter
10 00 56 25 04 42 00 00 1B E4
So in my example then 04 is the [number of bytes], 42 is the [sign+exponent] and 00 00 1B E4 is the (integer).
But I do not know how I should make the calculation to receive the actual value.
Any help?
Your data appears to be big-endian, according to your example. So here's how you break those bytes into the fields you need using bit shifting and masking.
n = b[0]
SI = (b[1] & 0x80) >> 7
SE = (b[1] & 0x40) >> 6
exponent = b[1] & 0x3f
integer = 0
for i = 0 to n-1:
integer = (integer << 8) + b[2+i]
The sign of the mantissa is obtained from the MSb of the Sign+exponent byte, by masking (byte & 80h != 0 => SI = -1).
The sign of the exponent is similarly obtained by byte & 40h != 0 => SE = -1.
The exponent value is EXP = byte & 3Fh.
The mantissa INT is the binary number formed by the four other bytes, which can be read as a single integer (but mind the indianness).
Finally, compute SI * INT * pow(10, SE * EXP).
In your example, SI = 1, SE = -1, EXP = 2, INT = 7140, hence
1 * 7140 * pow(10, -1 * 2) = +71.4
It is not in the scope of this answer to explain how to implement this efficiently.
We are given a unsigned integer, suppose. And without using any arithmetic operators ie + - / * or %, we are to find x mod 15. We may use binary bit manipulations.
As far as I could go, I got this based on 2 points.
a = a mod 15 = a mod 16 for a<15
Let a = x mod 15
then a = x - 15k (for some non-negative k).
ie a = x - 16k + k...
ie a mod 16 = ( x mod 16 + k mod 16 ) mod 16
ie a mod 15 = ( x mod 16 + k mod 16 ) mod 16
ie a = ( x mod 16 + k mod 16 ) mod 16
OK. Now to implement this. A mod16 operations is basically & OxF. and k is basically x>>4
So a = ( x & OxF + (x>>4) & OxF ) & OxF.
It boils down to adding 2 4-bit numbers. Which can be done by bit expressions.
sum[0] = a[0] ^ b[0]
sum[1] = a[1] ^ b[1] ^ (a[0] & b[0])
...
and so on
This seems like cheating to me. I'm hoping for a more elegant solution
This reminds me of an old trick from base 10 called "casting out the 9s". This was used for checking the result of large sums performed by hand.
In this case 123 mod 9 = 1 + 2 + 3 mod 9 = 6.
This happens because 9 is one less than the base of the digits (10). (Proof omitted ;) )
So considering the number in base 16 (Hex). you should be able to do:
0xABCE123 mod 0xF = (0xA + 0xB + 0xC + 0xD + 0xE + 0x1 + 0x2 + 0x3 ) mod 0xF
= 0x42 mod 0xF
= 0x6
Now you'll still need to do some magic to make the additions disappear. But it gives the right answer.
UPDATE:
Heres a complete implementation in C++. The f lookup table takes pairs of digits to their sum mod 15. (which is the same as the byte mod 15). We then repack these results and reapply on half as much data each round.
#include <iostream>
uint8_t f[256]={
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,
1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,1,
2,3,4,5,6,7,8,9,10,11,12,13,14,0,1,2,
3,4,5,6,7,8,9,10,11,12,13,14,0,1,2,3,
4,5,6,7,8,9,10,11,12,13,14,0,1,2,3,4,
5,6,7,8,9,10,11,12,13,14,0,1,2,3,4,5,
6,7,8,9,10,11,12,13,14,0,1,2,3,4,5,6,
7,8,9,10,11,12,13,14,0,1,2,3,4,5,6,7,
8,9,10,11,12,13,14,0,1,2,3,4,5,6,7,8,
9,10,11,12,13,14,0,1,2,3,4,5,6,7,8,9,
10,11,12,13,14,0,1,2,3,4,5,6,7,8,9,10,
11,12,13,14,0,1,2,3,4,5,6,7,8,9,10,11,
12,13,14,0,1,2,3,4,5,6,7,8,9,10,11,12,
13,14,0,1,2,3,4,5,6,7,8,9,10,11,12,13,
14,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0};
uint64_t mod15( uint64_t in_v )
{
uint8_t * in = (uint8_t*)&in_v;
// 12 34 56 78 12 34 56 78 => aa bb cc dd
in[0] = f[in[0]] | (f[in[1]]<<4);
in[1] = f[in[2]] | (f[in[3]]<<4);
in[2] = f[in[4]] | (f[in[5]]<<4);
in[3] = f[in[6]] | (f[in[7]]<<4);
// aa bb cc dd => AA BB
in[0] = f[in[0]] | (f[in[1]]<<4);
in[1] = f[in[2]] | (f[in[3]]<<4);
// AA BB => DD
in[0] = f[in[0]] | (f[in[1]]<<4);
// DD => D
return f[in[0]];
}
int main()
{
uint64_t x = 12313231;
std::cout<< mod15(x)<<" "<< (x%15)<<std::endl;
}
Your logic is somewhere flawed but I can't put a finger on it. Think about it yourself, your final formula operates on first 8 bits and ignores the rest. That could only be valid if the part you throw away (9+ bits) are always the multiplication of 15. However, in reality (in binary numbers) 9+ bits are always multiplications of 16 but not 15. For example try putting 1 0000 0000 and 11 0000 0000 in your formula. Your formula will give 0 as a result for both cases, while in reality the answer is 1 and 3.
In essense I'm almost sure that your task can not be solved without loops. And if you are allowed to use loops - then it's nothing easier than to implement bitwiseAdd function and do whatever you like with it.
Added:
Found your problem. Here it is:
... a = x - 15k (for some non-negative k).
... and k is basically x>>4
It equals x>>4 only by pure coincidence for some numbers. Take any big example, for instance x=11110000. By your calculation k = 15, while in reality it is k=16: 16*15 = 11110000.