I am trying to create a view of an array object to better utilise SIMD vectors on the x86_64 platform.
Here's the main idea:
--  Index of one 32-bit word inside the 256-bit word-wise view (8 words).
type Char_Set_Index is range 0 .. 7;
--  One unsigned 32-bit word of the word-wise view.
type Char_Set_Element is mod 2 ** 32;
--  Word-wise view: 8 components of 32 bits each (256 bits in total),
--  declared with Alignment => 32.
type Character_Set_Vector is array (Char_Set_Index) of Char_Set_Element
with Alignment => 32,Component_Size => 32, Object_Size => 256, Size => 256;
--  Bit-wise view: one Boolean per Character, packed at 1 bit per component.
--  It is given the same size (256 bits) and alignment as
--  Character_Set_Vector so that the two types can view the same storage.
type Character_Set is array (Character) of Boolean
with Alignment => 32, Component_Size => 1, Object_Size => 256, Size => 256;
Essentially, some of the operations in Ada.Strings.Maps can be processed more efficiently using SIMD arithmetic. For instance, the "=" operation, perhaps coded as,
function "="
(Left, Right : in Character_Set)
return Boolean
is
(for all k in Character_Set'Range =>
(Left(k) = Right(k)));
.. gives us the following output
.LFB4:
.cfi_startproc
movq %rdi, %r8
movq %rsi, %rdi
xorl %esi, %esi
jmp .L6
.p2align 4,,10
.p2align 3
.L10:
addl $1, %esi
cmpl $256, %esi
je .L9
.L6:
movl %esi, %edx
movl %esi, %ecx
sarl $3, %edx
andl $7, %ecx
movslq %edx, %rdx
movzbl (%rdi,%rdx), %eax
xorb (%r8,%rdx), %al
shrb %cl, %al
testb $1, %al
je .L10
xorl %eax, %eax
ret
.L9:
movl $1, %eax
ret
.cfi_endproc
Critically, it is comparing each bit, and GCC won't vectorise it. However, if we write,
--  Word-wise equality of two character sets.
--  Returns True when all eight 32-bit words of Left and Right are equal.
function "="
(Left, Right : in Character_Set)
return Boolean
is
--  u and v are placed at the addresses of Left and Right (Address aspect),
--  so they view the same 256 bits as eight 32-bit words instead of holding
--  a copy of them.
u : aliased constant Character_Set_Vector
with Import, Address => Left'Address;
v : aliased constant Character_Set_Vector
with Import, Address => Right'Address;
--  Per-word mismatch flags: 0 when the two words are equal, 1 otherwise.
Temp : array (Char_Set_Index) of Integer;
--  Number of mismatching words.
Sum : Integer;
begin
for j in Temp'Range loop
--  Ask the compiler to vectorise this loop.
pragma Loop_Optimize (Vector);
Temp(j) := (if u(j) = v(j) then 0 else 1);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
--  The sets are equal only if no word differed.
return Sum = 0;
end "=";
We get the branch-free SIMD instructions that we kind of expect,
.cfi_startproc
vmovdqa (%rdi), %ymm1
vpcmpeqd (%rsi), %ymm1, %ymm1
vpandn .LC0(%rip), %ymm1, %ymm1
vextracti128 $0x1, %ymm1, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
testl %eax, %eax
sete %al
vzeroupper
ret
.cfi_endproc
Which all works rather well. Now, the problem at hand. If you push this code through SPARK Ada, there are a number of complaints regarding alignment, aliasing, and constants, so you have to end up writing,
function "="
(Left, Right : in Character_Set)
return Boolean
is
Left_Aligned : constant Character_Set := Left
with Alignment => 32;
Right_Aligned : constant Character_Set := Right
with Alignment => 32;
u : aliased constant Character_Set_Vector
with Import, Alignment => 32, Address => Left_Aligned'Address;
v : aliased constant Character_Set_Vector
with Import, Alignment => 32, Address => Right_Aligned'Address;
Temp : array (Char_Set_Index) of Integer;
Sum : Integer;
begin
for j in Temp'Range loop
pragma Loop_Optimize (Vector);
Temp(j) := (if u(j) = v(j) then 0 else 1);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
return Sum = 0;
end "=";
which gives us an awful lot of precopying, presumably to ensure that everything is aligned OK - even though the declarations already have the correct alignment,
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-32, %rsp
vmovdqa (%rdi), %xmm2
vmovdqa 16(%rdi), %xmm3
vmovdqa (%rsi), %xmm4
vmovdqa 16(%rsi), %xmm5
vmovdqa %xmm2, -64(%rsp)
vmovdqa %xmm3, -48(%rsp)
vmovdqa -64(%rsp), %ymm6
vmovdqa %xmm4, -32(%rsp)
vmovdqa %xmm5, -16(%rsp)
vpcmpeqd -32(%rsp), %ymm6, %ymm1
vpandn .LC0(%rip), %ymm1, %ymm1
vextracti128 $0x1, %ymm1, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $8, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vpsrldq $4, %xmm0, %xmm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
testl %eax, %eax
sete %al
vzeroupper
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
Obviously, the only reason one would even bother with this is for greater performance, however, the SPARK Ada rules seem too restrictive in this case, hurting performance. So, my question is, is there a better way of doing this that doesn't result in the excessive moving data around, where, as far as I can tell, it's not required.
Incidentally, Ada.Unchecked_Conversion similarly does a lot of moving data around at the beginning, too.
Also, I realise that I can justify the SPARK Ada checks (false-positive) so I can use the Ada version, but I am hoping that I am missing something, here, and that there is an easier way to do this.
Perhaps there is a way of vectorising arrays of Booleans?
EDIT: I am compiling it using
gnatmake -O3 -mavx2 -gnatn -gnatp -S name-of-package.adb
The question of why the alignment of Left and Right is unknown within the body of the function is interesting. You indeed can neither assert on the alignment attribute nor add a precondition to the function stating a requirement on parameter alignment (at least for GNATprove FSF 11.2.0). There is some comment on the issue in the SPARK source code though (see line 3276 in spark_definition.adb).
On the other hand, it seems that you can work around the additional copying of the unchecked conversion by applying the conversion in the loop. Below is what I was able to achieve with GNAT FSF 11.3.1:
character_sets.ads
--  Bit-packed character set type together with its "=" operator.
package Character_Sets with SPARK_Mode is
--  One Boolean per Character, packed at 1 bit per component (256 bits in
--  total), declared with Alignment => 32.
type Character_Set is array (Character) of Boolean
with
Alignment => 32,
Component_Size => 1,
Object_Size => 256,
Size => 256;
--  Returns True when Left and Right are equal.
function "=" (Left, Right : in Character_Set) return Boolean;
end Character_Sets;
character_sets.adb
with Ada.Unchecked_Conversion;
--  Implements "=" by comparing the 256-bit set as eight 32-bit words.
package body Character_Sets with SPARK_Mode is
--  Index of one 32-bit word within the 256-bit set (8 words).
type Char_Set_Index is range 0 .. 7;
--  One unsigned 32-bit word.
type Char_Set_Element is mod 2 ** 32;
--  Same size (256 bits) and alignment as Character_Set, but seen as
--  8 components of 32 bits each.
type Character_Set_Vector is array (Char_Set_Index) of aliased Char_Set_Element
with
Alignment => 32,
Component_Size => 32,
Object_Size => 256,
Size => 256;
--  Reinterprets the 256 bits of a Character_Set as a Character_Set_Vector.
function To_Vector is new Ada.Unchecked_Conversion
(Source => Character_Set,
Target => Character_Set_Vector);
---------
-- "=" --
---------
--  Returns True when all eight 32-bit words of Left and Right are equal.
function "=" (Left, Right : in Character_Set) return Boolean is
--  Per-word mismatch flags: 0 when the two words are equal, 1 otherwise.
Temp : array (Char_Set_Index) of Integer;
--  Number of mismatching words.
Sum : Integer;
begin
for J in Temp'Range loop
pragma Loop_Optimize (Vector);
--  The conversion is applied per element inside the loop rather than
--  once into a local object.
Temp (J) := (if To_Vector (Left) (J) = To_Vector (Right) (J) then 0 else 1); -- !!!
end loop;
Sum := 0;
for J in Temp'Range loop
Sum := Sum + Temp (J);
end loop;
--  The sets are equal only if no word differed.
return Sum = 0;
end "=";
end Character_Sets;
default.gpr
project Default is
for Source_Dirs use ("src");
for Object_Dir use "obj";
for Main use ();
package Compiler is
for Switches ("ada") use ("-O3", "-mavx2", "-gnatn", "-gnatp");
end Compiler;
end Default;
output (objdump)
$ objdump -d -M intel ./obj/character_sets.o
./obj/character_sets.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <character_sets__Tcharacter_setBIP>:
0: c3 ret
1: 90 nop
2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
9: 00 00 00 00
d: 0f 1f 00 nop DWORD PTR [rax]
0000000000000010 <character_sets__Tcharacter_set_vectorBIP>:
10: c3 ret
11: 90 nop
12: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
19: 00 00 00 00
1d: 0f 1f 00 nop DWORD PTR [rax]
0000000000000020 <character_sets__Oeq>:
20: c5 fd 6f 0f vmovdqa ymm1,YMMWORD PTR [rdi]
24: c5 f5 76 0e vpcmpeqd ymm1,ymm1,YMMWORD PTR [rsi]
28: c5 f5 df 0d 00 00 00 vpandn ymm1,ymm1,YMMWORD PTR [rip+0x0] # 30 <character_sets__Oeq+0x10>
2f: 00
30: c4 e3 7d 39 c8 01 vextracti128 xmm0,ymm1,0x1
36: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
3a: c5 f1 73 d8 08 vpsrldq xmm1,xmm0,0x8
3f: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
43: c5 f1 73 d8 04 vpsrldq xmm1,xmm0,0x4
48: c5 f9 fe c1 vpaddd xmm0,xmm0,xmm1
4c: c5 f9 7e c0 vmovd eax,xmm0
50: 85 c0 test eax,eax
52: 0f 94 c0 sete al
55: c5 f8 77 vzeroupper
58: c3 ret
output (gnatprove)
$ gnatprove -P ./default.gpr -f
Phase 1 of 2: generation of Global contracts ...
Phase 2 of 2: flow analysis and proof ...
character_sets.adb:31:10: warning: pragma "Loop_Optimize" ignored (not yet supported)
31 | pragma Loop_Optimize (Vector);
| ^ here
Summary logged in /home/deedee/72423385-spark-ada-overlays-without-copying/obj/gnatprove/gnatprove.out
Here's the resulting (over-optimised) function after DeeDee's solution,
--  Word-wise equality of two character sets using -1 / 0 flags.
--  Returns True when all words of Left and Right are equal.
function "="
(Left, Right : in Character_Set)
return Boolean
is
--  Per-word flags: -1 when the two words are equal, 0 otherwise.
Temp : array (Char_Set_Index) of Integer;
--  Sum of the flags.
Sum : Integer;
begin
for j in Temp'Range loop
Temp(j) := (if To_Vector(Left)(j) = To_Vector(Right)(j) then -1 else 0);
end loop;
Sum := 0;
for j in Temp'Range loop
Sum := Sum + Temp(j);
end loop;
--  Every matching word contributed -1, so the sum equals -Temp'Length
--  only when all words matched.
return Sum = -Temp'Length;
end "=";
Note the change of Temp's values: they now match the all-ones (-1) / zero result that vpcmpeqd produces, as described in Intel's documentation. For all that effort (and complication) you get to drop one vpandn.
Also, it seems that moving the vector array type into the package body, instead of keeping it private in the specification, allows you to drop the pragma Loop_Optimize.
Indeed, if you don't have SIMD available you get,
.cfi_startproc
movl (%rsi), %eax
cmpl %eax, (%rdi)
sete %dl
movl 4(%rsi), %ecx
xorl %r9d, %r9d
movl 8(%rsi), %r10d
movzbl %dl, %r8d
movl 12(%rsi), %eax
negl %r8d
cmpl %ecx, 4(%rdi)
movl 16(%rsi), %ecx
sete %r9b
xorl %r11d, %r11d
subl %r9d, %r8d
cmpl %r10d, 8(%rdi)
movl 20(%rsi), %r10d
sete %r11b
xorl %edx, %edx
subl %r11d, %r8d
cmpl %eax, 12(%rdi)
movl 24(%rsi), %eax
sete %dl
xorl %r9d, %r9d
movl 28(%rsi), %esi
subl %edx, %r8d
cmpl %ecx, 16(%rdi)
sete %r9b
xorl %r11d, %r11d
subl %r9d, %r8d
cmpl %r10d, 20(%rdi)
sete %r11b
xorl %edx, %edx
subl %r11d, %r8d
cmpl %eax, 24(%rdi)
sete %dl
xorl %ecx, %ecx
subl %edx, %r8d
cmpl %esi, 28(%rdi)
sete %cl
subl %ecx, %r8d
cmpl $-8, %r8d
sete %al
ret
.cfi_endproc
with,
gnatmake -O2 -funroll-loops -gnatn -gnatp -S name-of-package.adb
which, if you want to avoid branching, seems better than the naive version.
My first thought on seeing this was, 'Why are you defining "=" for Character_Set?' It comes with "=" predefined.
Let's see what it does:
--  Small test package used to inspect the code generated for the predefined
--  "=" of a bit-packed Boolean array.
package Packed_Vectorization is
--  Parent type: one Boolean per Character, packed at 1 bit per component.
type CS is array (Character) of Boolean with
Component_Size => 1, Size => 256;
--  Derived type, so that a user-defined "=" can be declared on it.
type Character_Set is new CS with
Component_Size => 1, Size => 256;
--  Converts both operands to the parent type and calls the parent's "=".
function "=" (Left : in Character_Set; Right : in Character_Set) return Boolean is
(CS (Left) = CS (Right) );
end Packed_Vectorization;
The type derivation is there so we can see what code is produced for the predefined "=".
Compiling with
gnatmake -gnatnp -O3 -S packed_vectorization.ads
gives the important part as
packed_vectorization__Oeq:
.LFB2:
.cfi_startproc
movq %rsi, %rdx
movl $256, %ecx
movl $256, %esi
jmp system__bit_ops__bit_eq#PLT
.cfi_endproc
The compiler has a special function just for comparing bit-packed arrays, presumably to optimize this common action. You can look at the implementation of System.Bit_Ops.Bit_Eq; the important part seems to be
if LeftB (1 .. BLen) /= RightB (1 .. BLen) then
where LeftB and RightB are views of the two arrays as packed arrays of bytes. This is the predefined "/=" for the array-of-bytes type. I was unable to find an object file for System.Bit_Ops, but I'd guess that that "/=" is optimized, too.
Is this acceptable for your use? (I presume you need to optimize your "=" in order to meet your quantified timing requirements, as otherwise there's no reason to worry about this.) If so, then a lot of effort has been expended for nothing.
"Ada Outperforms Assembly: A Case Study", Proceedings of TRI-Ada '92, reports on an Ada (83) compiler producing faster and smaller code than assembler hand optimized by a team of experts. That was 30 years ago. Optimizer technology has no doubt improved since then. Typically, the compiler knows more about optimization than any of us ever will.
"Premature optimization is the root of all evil ..." -- Donald Knuth
Related
I am trying to code a recursive Fibonacci program in x86 assembly (AT&T syntax). I get a stack overflow in the form of a segmentation fault. Below is my code:
# Function signature:
# int fibonacci(int n)
#
# Recursive Fibonacci. The argument n is read from 8(%ebp) and the result is
# returned in EAX.
.text
# Offset of the argument n relative to EBP
.equ n, 8
# Offset of the local slot meant to hold fibonacci(n - 1)
.equ fibMinus1, -4
.global fibonacci
fibonacci:
# Prologue (prepare the stack frame)
push %ebp
mov %esp, %ebp
# Load n into ECX.
# NOTE(review): ECX is a caller-saved register under the usual 32-bit cdecl
# convention, so it is not preserved across the recursive calls below; the
# code reloads n from the stack after the first call.
movl n(%ebp), %ecx
# Make space on the stack frame to store fib(n - 1)
subl $4, %esp
# If n == 0:
cmpl $0, %ecx
# return 0
je retZero
# If n == 1:
cmpl $1, %ecx
# return 1
je retOne
# fibMinus1 = fibonacci(n - 1)
decl %ecx
push %ecx
call fibonacci
# NOTE(review): the slot reserved above sits at fibMinus1(%ebp). This store is
# relative to ESP, which has moved because of the push above (the pushed
# argument is not popped after the call), so it does not hit that slot -
# confirm whether fibMinus1(%ebp) was intended.
movl %eax, fibMinus1(%esp)
# EAX = fibonacci(n - 2)
movl n(%ebp), %ecx
subl $2, %ecx
push %ecx
call fibonacci
# fibonacci(n - 1) + fibonacci(n - 2)
# NOTE(review): ESP-relative again, and ESP differs from its value at the
# store above because of the second push - confirm.
addl fibMinus1(%esp), %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return fibonacci(n - 1) + fibonacci(n - 2)
ret
retZero:
# Base case n == 0: result is 0
movl $0, %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return 0
ret
retOne:
# Base case n == 1: result is 1
movl $1, %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return 1
ret
This seems strange, considering that I modify the necessary parameters when pushing them onto the stack before the function call, and after each function call, I execute the epilogue sequence to reframe the stack to its proper position such that the proper parameters are retrieved during the next call.
My code models the following Python recursive code:
I am trying to implement Knapsack's algorithm through recursion in x86 Assembly, modeling the following Java code. However, when I run my program, it seems as if the parameter taking in the capacity of the bag (the 4th parameter) is changed following a recursive call (specifically following the prologue).
The initial call is:
knapsack(weights, values, 2, 2, 0)
When I look in the debugger, initially all the parameters are taken in as correctly:
weights is the pointer to the correct array (0x5655b2f0) ($ebp + 8)
values is the pointer to the correct array (0x5655b300) ($ebp + 12)
num_items = 2 ($ebp + 16)
capacity = 2 ($ebp + 20)
cur_value = 0 ($ebp + 24)
However, in the code executed at maximizeItemUsage, I execute the following recursive call:
knapsack(weights, values, 2, 1, 0)
However, when I look at my debugger, after the following line of code in the prologue (in knapsack):
mov %esp, %ebp
I get the following data in the parameters:
0 ($ebp + 8)
2 ($ebp + 12)
1 ($ebp + 16)
0x5655b300 ($ebp + 20)
0x5655b2f0 ($ebp + 24)
This seems quite strange, considering that the prologue is supposed to align the stack properly. Below is my code:
# Function signature:
# int knapsack(int* w, int* v, int num_items,
# int capacity, int current);
.text
# Define our macros, which store the location of the parameters and values
# relative to the stack's base pointer (EBP)
.equ weights, 8
.equ values, 12
.equ num_items, 16
.equ capacity, 20
.equ cur_value, 24
.equ do_not_use, -4
.equ use, -8
.global knapsack
knapsack:
# solves the knapsack problem
# #weights: an array containing how much each item weighs
# #values: an array containing the value of each item
# #num_items: how many items that we have
# #capacity: the maximum amount of weight that can be carried
# #cur_weight: the current weight
# #cur_value: the current value of the items in the pack
# Prologue (prepare the stack frame)
push %ebp
mov %esp, %ebp
# Make space for local variables on the stack
# 3 variables (4 bytes each)
# Default values are 0
#
# 1. the value of the bag if we DO NOT use the current item
# 2. the value of the bag if we USE the current item
# 3. the maximum value of the bag currently
sub $8, %esp
movl $0, do_not_use(%ebp)
movl $0, use(%ebp)
# Base Case: we have utilized all the items or there is no more space left
# in the bag (num_items = 0 or capacity = 0)
cmpl $0, num_items(%ebp)
jle emptyBag
cmpl $0, capacity(%ebp)
jle emptyBag
# Case 1: We do not use the current element because adding it wil surpass capacity
# weights[n - 1] > capacity
# Push the new parameters in the stack (everything stays the same, except that
# n -> n - 1 since we are no longer using the current element)
# Compute weights[n - 1] (stored in ECX register)
# Get the memory address of the values array
movl weights(%ebp), %ecx
# Move to the memory address of weights[n - 1]
push %edx
movl num_items(%ebp), %edx # num_items
decl %edx # num_items - 1
imul $4, %edx # 4(num_items - 1)
addl %edx, %ecx # m_v + 4(num_items - 1)
# Get the actual value of weights[n - 1]
movl (%ecx), %ecx # Get value at address m_w + 4(num_items - 1)
pop %edx
# If weights[n - 1] > capacity
cmpl %ecx, capacity(%ebp)
# Shift to analyzing the previous items
jl analyzePreviousItems
# Case 2: We can use the current element (in this case, find the maximum of the values
# we use the element or if we do not use the element
jmp maximizeItemUsage
# Solidify the EAX register data
movl %eax, %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return the maximum possible weight of the bags :D
ret
maximizeItemUsage:
# knapsack(weights, values, num_items - 1, capacity, current_value)
# All parameters remain the same (except that n > n - 1)
push weights(%ebp)
push values(%ebp)
# num_items - 1
movl num_items(%ebp), %edx
decl %edx
push %edx
push capacity(%ebp)
push cur_value(%ebp)
# Call knapsack(weights, values, num_items - 1, capacity, cur_value)
call knapsack
# The value if we DO NOT use the current item
movl %eax, do_not_use(%ebp)
# knapsack(weights, values, n - 1, c - weights[n]) + values[n]
# All parameters remain the same (except that n > n - 1 and c > c - weights[n])
push weights(%ebp)
push values(%ebp)
# num_items - 1
movl num_items(%ebp), %edx
decl %edx
# capacity - weights[n] (stored in the ECX register)
# Get the memory address of the values array
movl weights(%ebp), %ecx
# Move to the memory address of weights[n]
push %edx
movl num_items(%ebp), %edx # num_items
imul $4, %edx # 4(num_items)
addl %edx, %ecx # m_w + 4(num_items)
# Restore the value of the EDX register
pop %edx
# Get the actual value of weights[n]
movl (%ecx), %ecx # Get value at address m_w + 4(num_items)
# -weights[n]
neg %ecx
# capacity - weights[n]
addl capacity(%ebp), %ecx
push %ecx
push cur_value(%ebp)
# Call knapsack(weights, values, num_items - 1, capacity - weights[n], cur_value)
call knapsack
# The value if we USE the current item
movl %eax, use(%ebp)
# Compute the maximum value of knapsack(weights, values, num_items - 1, capacity, cur_value)
# and knapsack(weights, values, n - 1, c - weights[n]) + values[n]
movl use(%ebp), %ecx
cmpl %ecx, do_not_use(%ebp)
jl setUseAsMax
movl do_not_use(%ebp), %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
ret
setUseAsMax:
movl use(%ebp), %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
ret
analyzePreviousItems:
# Recursive Call 1: knapsack(weights, values, n - 1, c)
# All parameters remain the same (except that n -> n - 1)
push weights(%ebp)
push values(%ebp)
# We use the EDX to contain the changed parameters since it is a
# non-volatile register
movl num_items(%ebp), %edx
decl %edx
push %edx
push capacity(%ebp)
push cur_value(%ebp)
# Call knapsack(weights, values, num_items - 1, capacity, cur_value)
call knapsack
# The value if we DO NOT use the current item
movl %eax, do_not_use(%ebp)
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return knapsack(weights, values, n - 1, c)
ret
emptyBag:
# Base case: intended to return 0 in EAX.
# NOTE(review): in AT&T syntax an operand without the `$` prefix is a memory
# reference, so this loads EAX from address 0 instead of setting it to zero -
# `movl $0, %eax` was probably intended.
movl 0, %eax
# Epilogue (cleanup/realign the stack frame)
mov %ebp, %esp
pop %ebp
# Return 0
ret
# 3 variables (4 bytes each)
# Default values are 0
#
# 1. the value of the bag if we DO NOT use the current item
# 2. the value of the bag if we USE the current item
# 3. the maximum value of the bag currently
sub $8, %esp
movl $0, do_not_use(%ebp)
movl $0, use(%ebp)
The comments claim to reserve space for 3 variables (12 bytes) but the code only has sub $8, %esp.
push weights(%ebp)
push values(%ebp)
# num_items - 1
movl num_items(%ebp), %edx
decl %edx
push %edx
push capacity(%ebp)
push cur_value(%ebp)
# Call knapsack(weights, values, num_items - 1, capacity, cur_value)
call knapsack
In all 3 recursive calls you have pushed the arguments in the wrong order!
This is the correct way:
push cur_value(%ebp)
push capacity(%ebp)
movl num_items(%ebp), %edx
decl %edx
push %edx
push values(%ebp)
push weights(%ebp)
call knapsack
push %edx
movl num_items(%ebp), %edx # num_items
imul $4, %edx # 4(num_items)
addl %edx, %ecx # m_w + 4(num_items)
# Restore the value of the EDX register
pop %edx
In the 2nd recursive call, you have 1 argument too few! Why is there a pop %edx ?
# Case 2: We can use the current element (in this case, find the maximum of the values
# we use the element or if we do not use the element
jmp maximizeItemUsage
# Solidify the EAX register data
movl %eax, %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return the maximum possible weight of the bags :D
ret
Please note that the code below the jmp maximizeItemUsage is unreachable. It will never run.
call knapsack # -> %EAX
# The value if we DO NOT use the current item
movl %eax, do_not_use(%ebp)
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
# Return knapsack(weights, values, n - 1, c)
ret
In analyzePreviousItems that movl %eax, do_not_use(%ebp) instruction is silly because the concerned local variable is just about to stop existing.
And an optimization for free
call knapsack
# The value if we USE the current item
movl %eax, use(%ebp)
# Compute the maximum value of knapsack(weights, values, num_items - 1, capacity, cur_value)
# and knapsack(weights, values, n - 1, c - weights[n]) + values[n]
movl use(%ebp), %ecx
cmpl %ecx, do_not_use(%ebp)
jl setUseAsMax
movl do_not_use(%ebp), %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
ret
setUseAsMax:
movl use(%ebp), %eax
# Epilogue (cleanup the stack frame)
mov %ebp, %esp
pop %ebp
ret
This part of your code gets much simpler if you don't reload the use variable to a different register when it is already present in %EAX.
call knapsack
movl %eax, use(%ebp) (*)
cmpl %eax, do_not_use(%ebp)
jl setUseAsMax
movl do_not_use(%ebp), %eax
setUseAsMax:
mov %ebp, %esp
pop %ebp
ret
(*) Here also you don't need to store movl %eax, use(%ebp) because the use variable is about to get terminated.
I know that Julia has a @time macro that outputs the amount of memory that is allocated, but is there any way to measure the number of assignments made in a function?
The problem with counting assignments is that by the time the machine runs the code, register or memory loads and stores no longer correspond to the assignments of the original code. For instance, the code
julia> g(x) = x^3
g (generic function with 1 method)
julia> @code_llvm g(1)
define i64 @julia_g_70778(i64) #0 {
top:
%1 = mul i64 %0, %0
%2 = mul i64 %1, %0
ret i64 %2
}
julia> @code_native g(1)
.text
Filename: REPL[7]
pushq %rbp
movq %rsp, %rbp
Source line: 1
movq %rdi, %rax
imulq %rax, %rax
imulq %rdi, %rax
popq %rbp
retq
nopw %cs:(%rax,%rax)
clearly has four "assignments", two movq and two imulq. But the original code did not have a single assignment.
The closest you can get, therefore, is to use a macro to rewrite assignments so that they increment a counter (in addition to actually doing the assigning). This will of course likely slow down your code substantially, so I do not recommend it.
I'm trying to let the user enter 2 digits, the first one is the base and the second one the exponent.
These two values are stored correctly. I know this by printing them (this printing code is currently commented out).
However, my loop to calculate the answer of the base^exponent is returning a wrong value.
Can anyone point me in the right direction or even solve my problem?
This is my code:
#/**
#* The pow subroutine calculates powers of natural bases
#* and exponents.
#*
#* Arguments:
#*
#* base - the exponential base
#* exp - the exponent
#*
#* Return value: 'base' raised to the power of 'exp'.
#*/
#int pow( int base, int exp )
# {
# int total = 1;
# while !(exp <= 0){
# total = total * base;
# exp = exp -1;
# }
# return total;
# }
.bss
# NOTE(review): `.long` with no operand emits no data in GNU as, so EXP, BASE
# and TOTAL may all label the same address - confirm (e.g. `.skip 4` or
# `.lcomm` would reserve 4 bytes for each).
EXP: .long
BASE: .long
TOTAL: .long
.text
# scanf format string (one decimal integer)
FSTR: .asciz "%d"
# printf format string (one decimal integer followed by a newline)
PSTR: .asciz "%d\n"
.global main
# inout: reads one decimal integer with scanf into a stack local and returns
# it in EAX.
inout:
pushl %ebp # Prolog: push the base pointer.
movl %esp, %ebp # and copy stack pointer to EBP.
subl $4, %esp # Reserve stack space for variable
leal -4(%ebp), %eax # Load address of stack var in eax
pushl %eax # Push second argument of scanf
pushl $FSTR # Push first argument of scanf
call scanf # Call scanf
movl -4(%ebp), %eax # Move result of scanf from stack to eax
movl %ebp, %esp # Clear local variables from stack.
popl %ebp # Restore caller's base pointer.
ret # return from subroutine.
# main: reads BASE and EXP, then multiplies TOTAL by BASE while EXP > 0 and
# prints a value with printf.
main:
call inout
movl %eax, BASE
#pushl BASE
#pushl $PSTR
#call printf
call inout
movl %eax, EXP
#pushl EXP
#pushl $PSTR
#call printf
#subl $4, %esp
#leal -4(%ebp), %eax
#movl %eax, TOTAL
movl $1, TOTAL
loop:
# Leave the loop once EXP <= 0
cmpl $0, EXP
jle end
movl TOTAL, %eax
# Unsigned multiply: EDX:EAX = EAX * BASE; only EAX is stored back
mull BASE
movl %eax, TOTAL
decl EXP
jmp loop
end:
# NOTE(review): this prints EAX rather than TOTAL. If the loop body never
# ran, EAX still holds the value returned by the last call to inout - confirm
# whether `pushl TOTAL` was intended.
pushl %eax
pushl $PSTR
call printf
#addl $4, %esp #\
pushl $0 #- Clean up and exit
call exit #/
Thanks in advance.
One possibility is to single step the code in a debugger and verify that the working registers contain the expected values.
How can I calculate the day number (the day of the month) from a Unix timestamp in a purely mathematical way, with a simple formula and without using any library functions?
1313905026 --> 8 (Today 08/21/2011)
A Unix timestamp doesn't include leap seconds, so we don't have to worry about that. Here is a branch-less¹, loop-less algorithm for getting the y/m/d fields from a Unix timestamp:
#include <iostream>
// Converts one hard-coded Unix timestamp into a Gregorian calendar date and
// prints it as month/day/year.
int
main()
{
// Example input: seconds since 1970-01-01 00:00:00 UTC.
int s = 1313905026;
// Whole days since the epoch, re-based by a constant offset.
// NOTE(review): presumably 719468 moves day 0 to 0000-03-01, as in the
// referenced derivation - confirm. Integer division truncates toward zero,
// so a negative s that is not a multiple of 86400 would round the wrong way.
int z = s / 86400 + 719468;
// Index of the 400-year era (146097 days each); the ternary turns the
// truncating division into a floor for negative z.
int era = (z >= 0 ? z : z - 146096) / 146097;
// Day of era, in [0, 146096].
unsigned doe = static_cast<unsigned>(z - era * 146097);
// Year of era, in [0, 399].
unsigned yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;
// Year number; still counted with years that start in March (fixed up below).
int y = static_cast<int>(yoe) + era * 400;
// Day of year, counted from the start of that March-based year, in [0, 365].
unsigned doy = doe - (365*yoe + yoe/4 - yoe/100);
// Month index with March = 0 ... February = 11.
unsigned mp = (5*doy + 2)/153;
// Day of month, in [1, 31].
unsigned d = doy - (153*mp+2)/5 + 1;
// Calendar month in [1, 12]; the -9 is converted to unsigned and the
// addition wraps around, which yields mp - 9.
unsigned m = mp + (mp < 10 ? 3 : -9);
// January and February belong to the following calendar year.
y += (m <= 2);
std::cout << m << '/' << d << '/' << y << '\n'; // 8/21/2011
}
This outputs:
8/21/2011
As you're not interested in y and m (only in d), you can eliminate the last couple of lines from the above computation.
This algorithm is described in excruciating detail here. The link includes a complete derivation, and unit tests spanning millions of years (which is overkill).
1 Branch-less: What looks like small branches in the algorithm above are optimized away by clang at -O3 on macOS:
__Z14get_day_numberi: ## #_Z14get_day_numberi
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
movslq %edi, %rax
imulq $-1037155065, %rax, %rcx ## imm = 0xFFFFFFFFC22E4507
shrq $32, %rcx
addl %ecx, %eax
movl %eax, %ecx
shrl $31, %ecx
sarl $16, %eax
leal (%rax,%rcx), %edx
leal 719468(%rax,%rcx), %esi
testl %esi, %esi
leal 573372(%rax,%rcx), %eax
cmovnsl %esi, %eax
cltq
imulq $963315389, %rax, %rcx ## imm = 0x396B06BD
movq %rcx, %rsi
shrq $63, %rsi
shrq $32, %rcx
sarl $15, %ecx
addl %esi, %ecx
imull $146097, %ecx, %ecx ## imm = 0x23AB1
movl %eax, %esi
subl %ecx, %esi
subl %eax, %esi
leal 719468(%rsi,%rdx), %eax
movl %eax, %ecx
shrl $2, %ecx
imulq $1506180313, %rcx, %rdx ## imm = 0x59C67CD9
shrq $39, %rdx
movl %eax, %esi
subl %edx, %esi
imulq $963321983, %rcx, %rcx ## imm = 0x396B207F
shrq $43, %rcx
addl %esi, %ecx
movl %eax, %edx
shrl $4, %edx
imulq $7525953, %rdx, %rdx ## imm = 0x72D641
shrq $36, %rdx
subl %edx, %ecx
imulq $1729753953, %rcx, %rsi ## imm = 0x6719F361
shrq $32, %rsi
movl %ecx, %r8d
subl %ecx, %eax
movl %ecx, %edi
movl $3855821599, %edx ## imm = 0xE5D32B1F
imulq %rcx, %rdx
subl %esi, %ecx
shrl %ecx
addl %esi, %ecx
shrl $8, %ecx
imull $365, %ecx, %ecx ## imm = 0x16D
subl %ecx, %r8d
shrl $2, %edi
imulq $1506180313, %rdi, %rcx ## imm = 0x59C67CD9
shrq $39, %rcx
shrq $47, %rdx
addl %r8d, %eax
subl %ecx, %eax
leal (%rax,%rdx), %ecx
leal 2(%rcx,%rcx,4), %esi
movl $3593175255, %edi ## imm = 0xD62B80D7
imulq %rsi, %rdi
shrq $39, %rdi
imull $153, %edi, %edi
subl %edi, %esi
leal 4(%rcx,%rcx,4), %ecx
subl %esi, %ecx
movl $3435973837, %esi ## imm = 0xCCCCCCCD
imulq %rcx, %rsi
shrq $34, %rsi
leal 1(%rax,%rdx), %eax
subl %esi, %eax
popq %rbp
retq
.cfi_endproc
t = unix time
second = t MOD 60
minute = INT(t / 60) MOD 60
hour = INT(t / 60 / 60) MOD 24
days = INT(t / 60 / 60 / 24)
years = INT(days / 365.25)
year = 1970 + years + 1
1970 started with a Thursday so, we can calculate the day of the week:
weekday = (days + 4) MOD 7
If Sunday is day 0. If you want Sunday to be day 1 just add 1.
Now, let's find out how many days we are into the year in question.
days = days - years * 365 - leapdays
Finally, we find the month and day of the month.
IF year MOD 4 = 0 THEN ly = 1 ELSE ly = 0
WHILE month <= 12
month = month + 1
IF month = 2 THEN
DaysInMonth = 28 + NOT(year MOD 4) + NOT(year MOD 100)
+ NOT(year MOD 400)
ELSE
DaysInMonth = 30 + (month + (month < 7)) MOD 2
END IF
IF days > DaysInMonth THEN days = days - DaysInMonth
END WHILE
This assumes Boolean values of TRUE = 1, FALSE = 0, NOT TRUE = 0, and NOT FALSE = 1.
Now we have the year, month, day of the month, hour, minute, and second calculated with adjustments for leap years.
There is no simple formula to do this. You would need to subtract the number of years (accounting for leap years) since the epoch, which would probably require a loop or a discrete calculation of some kind. Then use some type of loop to subtract out the number of seconds in each month for the current year. What you are left with is the number of seconds currently into the month.
I would do something like this.
x = ...//the number of seconds
year = 1970
while (x > /*one year*/){
x = x - /*seconds in january, and march-december*/
if(year % 4 == 0){
x -= /*leapeay seconds in february*/
}else{
x -= /*regular seconds in february*/
}
}
//Then something like this:
if(x > /*seconds in january*/){
x -= /*seconds in january*/
}
if(x > /*seconds in february*/){
x -= /*seconds in january*/
}
.
.
.
//After that just get the number of days from x seconds and you're set.
Edit
I recommend using date functions for simplicity, but here is a possible non-loopy alternative answer in case anyone needs it, or would care to develop it further.
First let t be the current time in seconds since the epoch.
Let F be the number of seconds in four years. That is three regular years and one leap year. That should be: 126230400.
Now if you take away all of the time contributed by F, you will get a remainder: y.
So y = t % F.
There are several cases now:
1. y is less than one year
2. y is less than two years
3. y is less than three years and less than two months
4. y is less than three years and greater than two months
5. y is less than four years
Note that 1972 was a leap year, so if you count up by four from 1970, wherever you left off will be a leap year in two years.
let jan, feb, febLY, mar, may, ..., dec be the number of seconds in each month (you'd need to calculate it out).
d represents the day number of the current month and D represents the number of seconds in a day (86400).
Y represents the number of seconds in a regular year, and yLY represents the number of seconds in a leap year.
// y: seconds elapsed inside the current four-year block. Blocks start at the
// epoch, so the years of a block are regular, regular, leap, regular
// (1970, 1971, 1972, 1973, ...).
// Y: seconds in a regular year, yLY: seconds in a leap year, D: seconds in a day.
//
// NOTE(review): the month checks below are independent ifs, so a later,
// shorter month can still be subtracted after an earlier month did not fit
// (e.g. day 30 of January is not > jan but is > feb). The cascade should stop
// at the first month that does not fit - confirm before use.
y = (t % F)
// First year of the block (regular year).
if(y < Y){
if(y > jan){
y -= jan
}
if(y > feb){
y -= feb
}
.
.
.
// Whole days already elapsed in the month, plus one, gives the day number.
d = y / D + 1
}
// Second year of the block (regular year). Compare against 2 * Y, the length
// of two regular years (the original compared y against 2 * y).
else if(y < 2 * Y){
y = y - Y
if(y > jan){
y -= jan
}
if(y > feb){
y -= feb
}
.
.
.
d = y / D + 1
}
// Third year of the block (leap year): February uses febLY.
else if(y < 2 * Y + yLY){
y = y - 2 * Y
if(y > jan){
y -= jan
}
if(y > febLY){
y -= febLY
}
.
.
.
d = y / D + 1
}
// Fourth year of the block (regular year).
else{
y = y - 2 * Y - yLY
if(y > jan){
y -= jan
}
if(y > feb){
y -= feb
}
.
.
.
d = y / D + 1
}
Not tested. Also, since the Earth doesn't spin at EXACTLY one rotation per 24 hours, occasional adjustments (leap seconds) have been made to civil time. You need to do a bit of research to factor that in.
In Rust:
/// Returns the current calendar date as `"Y-M-D"` (no zero padding),
/// optionally shifted by whole calendar months and by a timezone offset.
///
/// * `months_to_shift` - calendar months to add; negative values go back in
///   time. When the target month is shorter than the current day of month,
///   the day is clamped to the last day of that month.
/// * `timezone_shift` - hours added to UTC before the date is taken.
///
/// The date is computed with exact integer Gregorian-calendar arithmetic
/// instead of floating-point averages, so the day is always in `1..=31` and
/// the month in `1..=12`.
///
/// # Panics
///
/// Panics if the system clock is set before the Unix epoch.
fn date(months_to_shift: i32, timezone_shift: i32) -> String {
    let unix_secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("Before time!")
        .as_secs();
    // `as_secs` yields a u64; clamp first so the cast to i64 can never wrap.
    let unix_secs = unix_secs.min(i64::MAX as u64) as i64;
    date_at(unix_secs, months_to_shift, timezone_shift)
}

/// Formats the date of the instant `unix_secs` (seconds since the Unix epoch)
/// after applying `timezone_shift` hours and `months_to_shift` calendar months.
///
/// Split out of [`date`] so the calendar arithmetic does not depend on the clock.
fn date_at(unix_secs: i64, months_to_shift: i32, timezone_shift: i32) -> String {
    const SECS_PER_HOUR: i64 = 3_600;
    const SECS_PER_DAY: i64 = 86_400;

    let local_secs = unix_secs.saturating_add(i64::from(timezone_shift) * SECS_PER_HOUR);
    // Euclidean division floors, so instants before 1970 land on the right day.
    let (year, month, day) = civil_from_days(local_secs.div_euclid(SECS_PER_DAY));

    // Shift in whole months using a zero-based running month count.
    let month_count = year * 12 + (month - 1) + i64::from(months_to_shift);
    let year = month_count.div_euclid(12);
    let month = month_count.rem_euclid(12) + 1;
    // E.g. 31 January + 1 month becomes the last day of February.
    let day = day.min(days_in_month(year, month));

    format!("{}-{}-{}", year, month, day)
}

/// Converts a day count relative to 1970-01-01 into `(year, month, day)` of
/// the proleptic Gregorian calendar.
fn civil_from_days(days: i64) -> (i64, i64, i64) {
    // Re-base so that the internal year starts on 1 March; the leap day then
    // falls at the very end of the internal year.
    let z = days + 719_468;
    let era = z.div_euclid(146_097); // 400-year cycles of 146 097 days
    let doe = z.rem_euclid(146_097); // day of era, 0..=146_096
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year of era, 0..=399
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of March-based year, 0..=365
    let mp = (5 * doy + 2) / 153; // month index, March = 0 ..= February = 11
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the following calendar year.
    let year = yoe + era * 400 + i64::from(month <= 2);
    (year, month, day)
}

/// Number of days in `month` (1-based) of the Gregorian `year`.
fn days_in_month(year: i64, month: i64) -> i64 {
    let is_leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    match month {
        2 if is_leap => 29,
        2 => 28,
        4 | 6 | 9 | 11 => 30,
        _ => 31,
    }
}