Which is faster for reverse iteration, for or while loops? - for-loop

I am trying to implement the standard memmove function in Rust and I was wondering which method is faster for downwards iteration (where src < dest):
for i in (0..n).rev() {
//Do copying
}
or
let mut i = n;
while i != 0 {
i -= 1;
// Do copying
}
Will the rev() in the for-loop version significantly slow it down?

TL;DR: Use the for loop.
Both should be equally fast. We can check the compiler's ability to peel away the layers of abstraction involved in the for loop quite simply:
#[inline(never)]
fn blackhole() {}
#[inline(never)]
fn with_for(n: usize) {
for i in (0..n).rev() { blackhole(); }
}
#[inline(never)]
fn with_while(n: usize) {
let mut i = n;
while i > 0 {
blackhole();
i -= 1;
}
}
This generates this LLVM IR:
; Function Attrs: noinline nounwind readnone uwtable
define internal void @_ZN8with_for20h645c385965fcce1fhaaE(i64) unnamed_addr #0 {
entry-block:
ret void
}
; Function Attrs: noinline nounwind readnone uwtable
define internal void @_ZN10with_while20hc09c3331764a9434yaaE(i64) unnamed_addr #0 {
entry-block:
ret void
}
Even if you are not versed in LLVM, it is obvious that both functions compiled down to the same IR (and thus obviously to the same assembly).
Since their performance is the same, one should prefer the more explicit for loop and reserve the while loop for cases where the iteration is irregular.
EDIT: to address starblue's concern that the test above is unfit (the empty blackhole lets LLVM remove both loops entirely), here is a version that calls an opaque external function instead:
#[link(name = "snappy")]
extern {
fn blackhole(i: libc::c_int) -> libc::c_int;
}
#[inline(never)]
fn with_for(n: i32) {
for i in (0..n).rev() { unsafe { blackhole(i as libc::c_int); } }
}
#[inline(never)]
fn with_while(n: i32) {
let mut i = n;
while i > 0 {
unsafe { blackhole(i as libc::c_int); }
i -= 1;
}
}
compiles down to:
; Function Attrs: noinline nounwind uwtable
define internal void @_ZN8with_for20h7cf06f33e247fa35maaE(i32) unnamed_addr #1 {
entry-block:
%1 = icmp sgt i32 %0, 0
br i1 %1, label %match_case.preheader, label %clean_ast_95_
match_case.preheader: ; preds = %entry-block
br label %match_case
match_case: ; preds = %match_case.preheader, %match_case
%.in = phi i32 [ %2, %match_case ], [ %0, %match_case.preheader ]
%2 = add i32 %.in, -1
%3 = tail call i32 @blackhole(i32 %2)
%4 = icmp sgt i32 %2, 0
br i1 %4, label %match_case, label %clean_ast_95_.loopexit
clean_ast_95_.loopexit: ; preds = %match_case
br label %clean_ast_95_
clean_ast_95_: ; preds = %clean_ast_95_.loopexit, %entry-block
ret void
}
; Function Attrs: noinline nounwind uwtable
define internal void @_ZN10with_while20hee8edd624cfe9293IaaE(i32) unnamed_addr #1 {
entry-block:
%1 = icmp sgt i32 %0, 0
br i1 %1, label %while_body.preheader, label %while_exit
while_body.preheader: ; preds = %entry-block
br label %while_body
while_exit.loopexit: ; preds = %while_body
br label %while_exit
while_exit: ; preds = %while_exit.loopexit, %entry-block
ret void
while_body: ; preds = %while_body.preheader, %while_body
%i.05 = phi i32 [ %3, %while_body ], [ %0, %while_body.preheader ]
%2 = tail call i32 @blackhole(i32 %i.05)
%3 = add nsw i32 %i.05, -1
%4 = icmp sgt i32 %i.05, 1
br i1 %4, label %while_body, label %while_exit.loopexit
}
The core loops are:
; -- for loop
match_case: ; preds = %match_case.preheader, %match_case
%.in = phi i32 [ %2, %match_case ], [ %0, %match_case.preheader ]
%2 = add i32 %.in, -1
%3 = tail call i32 @blackhole(i32 %2)
%4 = icmp sgt i32 %2, 0
br i1 %4, label %match_case, label %clean_ast_95_.loopexit
; -- while loop
while_body: ; preds = %while_body.preheader, %while_body
%i.05 = phi i32 [ %3, %while_body ], [ %0, %while_body.preheader ]
%2 = tail call i32 @blackhole(i32 %i.05)
%3 = add nsw i32 %i.05, -1
%4 = icmp sgt i32 %i.05, 1
br i1 %4, label %while_body, label %while_exit.loopexit
And the only difference is that:
for decrements before calling blackhole, while decrements after
for compares against 0, while compares against 1
otherwise, it's the same core loop.
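For the memmove use case in the question, the downward-copy branch written with the preferred for loop might look like the following sketch (hypothetical code of mine, not from the question; src, dst, and n are raw-pointer parameters whose validity the caller must guarantee):
// Hypothetical sketch of the downward-copy branch of a memmove-style routine.
// Safety: the caller must guarantee that src and dst are valid for n bytes.
unsafe fn copy_backwards(src: *const u8, dst: *mut u8, n: usize) {
    for i in (0..n).rev() {
        // copy the highest remaining byte first so an overlapping destination
        // above the source is not clobbered
        *dst.add(i) = *src.add(i);
    }
}

fn main() {
    let src = [1u8, 2, 3, 4, 5];
    let mut dst = [0u8; 5];
    unsafe { copy_backwards(src.as_ptr(), dst.as_mut_ptr(), 5) };
    assert_eq!(dst, src);
}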

In short: They are (nearly) equally fast -- use the for loop!
Longer version:
First: rev() only works for iterators that implement DoubleEndedIterator, which provides a next_back() method. This method is expected to run in o(n) (sublinear time), usually even O(1) (constant time). And indeed, by looking at the implementation of next_back() for Range, we can see that it runs in constant time.
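As a small illustration (my own sketch, not part of the original answer), rev() simply hands out elements via next_back(), and for a Range that is just a decrement of the stored upper bound:
fn main() {
    let mut it = 0..5usize;
    assert_eq!(it.next_back(), Some(4)); // O(1): just decrements the stored end
    assert_eq!(it.next_back(), Some(3));

    let collected: Vec<usize> = (0..5usize).rev().collect();
    assert_eq!(collected, [4, 3, 2, 1, 0]);
}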
Now we know that both versions have asymptotically identical runtime. If this is the case, you should usually stop thinking about it and use the solution that is more idiomatic (which is for in this case). Thinking about optimization too early often decreases programming productivity, because performance matters only in a tiny percentage of all code you write.
But since you are implementing memmove, performance might actually really matter to you. So let's try to look at the resulting ASM. I used this code:
#![feature(start)]
#![feature(test)]
extern crate test;
#[inline(never)]
#[no_mangle]
fn with_for(n: usize) {
for i in (0..n).rev() {
test::black_box(i);
}
}
#[inline(never)]
#[no_mangle]
fn with_while(n: usize) {
let mut i = n;
while i > 0 {
test::black_box(i);
i -= 1;
}
}
#[start]
fn main(_: isize, vargs: *const *const u8) -> isize {
let random_enough_value = unsafe {
**vargs as usize
};
with_for(random_enough_value);
with_while(random_enough_value);
0
}
(Playground Link)
The #[no_mangle] is there to improve readability of the resulting ASM. The #[inline(never)], the random_enough_value, and the black_box are used to prevent LLVM from optimizing away the things we want to measure. The generated ASM of this (in release mode!) with some cleanup looks like:
with_for: | with_while:
testq %rdi, %rdi | testq %rdi, %rdi
je .LBB0_3 | je .LBB1_3
decq %rdi |
leaq -8(%rsp), %rax | leaq -8(%rsp), %rax
.LBB0_2: | .LBB1_2:
movq %rdi, -8(%rsp) | movq %rdi, -8(%rsp)
decq %rdi | decq %rdi
cmpq $-1, %rdi |
jne .LBB0_2 | jne .LBB1_2
.LBB0_3: | .LBB1_3:
retq | retq
The only difference is that with_while has two fewer instructions, because it counts down to 0 instead of to -1, as with_for does.
Conclusion: if you can tell that the asymptotic runtime is optimal, you should probably not think about optimization at all. Modern optimizers are clever enough to compile high-level constructs down to pretty much perfect ASM. Often, data layout and the resulting cache efficiency are much more important than a minimal instruction count, anyway.
If you actually need to think about optimization though, look at the ASM (or LLVM IR). In this case the for loop is actually a bit slower (more instructions, comparison with -1 instead of 0). But the number of cases where a Rust programmer should care about this is probably minuscule.

For small N, it really shouldn't matter.
Rust is lazy on iterators; 0..n won't cause any evaluation until you actually ask for an element. rev() asks for the last element first. As far as I know, the Rust counter iterator is clever and doesn't need to generate the first N-1 elements to get the Nth one. In this specific case, the rev method is probably even faster.
In the general case, it depends on what kind of access paradigm and access time your iterator has; make sure that accessing the end takes constant time, and then it makes no difference.
As with all benchmarking questions, it depends. Test for your N values yourself!
Premature optimization is also evil, so if your N is small and your loop isn't run very often... don't worry.
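If you do want numbers for your own N, a minimal timing sketch (my assumption: plain std with std::hint::black_box; a proper harness such as criterion would give more reliable results) could look like this:
use std::hint::black_box;
use std::time::Instant;

fn main() {
    let n: usize = black_box(100_000_000);

    let t = Instant::now();
    for i in (0..n).rev() {
        black_box(i); // keep the loop body from being optimized away
    }
    println!("for  (0..n).rev(): {:?}", t.elapsed());

    let t = Instant::now();
    let mut i = n;
    while i != 0 {
        i -= 1;
        black_box(i);
    }
    println!("while countdown  : {:?}", t.elapsed());
}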

Related

Which one is faster x+=x or x*=2

I recently made a program to render an image of the Mandelbrot set. To do this I wrote a function that returns whether a point belongs to the Mandelbrot set, and in this function I found two ways to do the calculation:
let temp=a;
a=a*a-b*b+x;
b=2.0*b*temp+y;
or
let temp=a;
a=a*a-b*b+x;
b*=temp;
b+=b+y;
Which one is faster, if either? (I'm using Rust, if that changes anything.)
I've put both of your snippets into the playground as public functions (assuming that your values are all floats, but this shouldn't make any real difference):
pub fn mult(mut a: f32, mut b: f32, x: f32, y: f32) -> f32 {
let temp = a;
a = a * a - b * b + x;
b = 2.0 * b * temp + y;
b
}
pub fn add(mut a: f32, mut b: f32, x: f32, y: f32) -> f32 {
let temp = a;
a = a * a - b * b + x;
b *= temp;
b += b + y;
b
}
The assembly generated in release mode is almost identical (just reordered):
playground::mult:
addss xmm1, xmm1
mulss xmm0, xmm1
addss xmm0, xmm3
ret
playground::add:
mulss xmm0, xmm1
addss xmm3, xmm0
addss xmm0, xmm3
ret
So, there should be no measurable difference. However, if you're worried, you should benchmark your real case to see whether one of these approaches leads to missed optimizations in the larger picture.
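A minimal way to do that with only the standard library might look like the sketch below (my own code, not from the answer; it re-declares the two functions so the snippet is self-contained, and uses std::hint::black_box so the work is not optimized away; a harness such as criterion would give more trustworthy numbers):
use std::hint::black_box;
use std::time::Instant;

fn mult(a: f32, b: f32, x: f32, y: f32) -> f32 {
    let temp = a;
    black_box(a * a - b * b + x); // the new `a` is dead here; keep it observable anyway
    2.0 * b * temp + y
}

fn add(mut a: f32, mut b: f32, x: f32, y: f32) -> f32 {
    let temp = a;
    a = a * a - b * b + x;
    black_box(a);
    b *= temp;
    b += b + y;
    b
}

fn main() {
    let variants: [(&str, fn(f32, f32, f32, f32) -> f32); 2] = [("mult", mult), ("add", add)];
    for (name, f) in variants {
        let t = Instant::now();
        let mut acc = 0.0f32;
        for i in 0..10_000_000u32 {
            let v = (i % 1000) as f32 * 1e-3;
            acc += f(black_box(v), black_box(v), 0.1, 0.2);
        }
        println!("{name}: {:?} (acc = {acc})", t.elapsed());
    }
}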

F# Performance Impact of Checked Calcs?

Is there a performance impact from using the Checked module? I've tested it out with sequences of type int and see no noticeable difference. Sometimes the checked version is faster and sometimes unchecked is faster, but generally not by much.
Seq.initInfinite (fun x-> x) |> Seq.item 1000000000;;
Real: 00:00:05.272, CPU: 00:00:05.272, GC gen0: 0, gen1: 0, gen2: 0
val it : int = 1000000000
open Checked
Seq.initInfinite (fun x-> x) |> Seq.item 1000000000;;
Real: 00:00:04.785, CPU: 00:00:04.773, GC gen0: 0, gen1: 0, gen2: 0
val it : int = 1000000000
Basically I'm trying to figure out if there would be any downside to always opening Checked. (I encountered an overflow that wasn't immediately obvious, so I'm now playing the role of the jilted lover who doesn't want another broken heart.) The only non-contrived reason I can come up with for not always using Checked is if there were some performance hit, but I haven't seen one yet.
When you measure performance it's usually not a good idea to include Seq, as Seq adds lots of overhead (at least compared to int operations), so you risk that most of the time is spent in Seq rather than in the code you'd like to test.
I wrote a small test program for (+):
let clock =
    let sw = System.Diagnostics.Stopwatch ()
    sw.Start ()
    fun () ->
        sw.ElapsedMilliseconds

let dbreak () = System.Diagnostics.Debugger.Break ()

let time a =
    let b = clock ()
    let r = a ()
    let n = clock ()
    let d = n - b
    d, r

module Unchecked =
    let run c () =
        let rec loop a i =
            if i < c then
                loop (a + 1) (i + 1)
            else
                a
        loop 0 0

module Checked =
    open Checked

    let run c () =
        let rec loop a i =
            if i < c then
                loop (a + 1) (i + 1)
            else
                a
        loop 0 0

[<EntryPoint>]
let main argv =
    let count = 1000000000

    let testCases =
        [|
            "Unchecked" , Unchecked.run
            "Checked"   , Checked.run
        |]

    for nm, a in testCases do
        printfn "Running %s ..." nm
        let ms, r = time (a count)
        printfn "... it took %d ms, result is %A" ms r

    0
The performance results are this:
Running Unchecked ...
... it took 561 ms, result is 1000000000
Running Checked ...
... it took 1103 ms, result is 1000000000
So it seems some overhead is added by using Checked. Since the cost of the int adds should be small compared to the loop overhead, the per-addition overhead of Checked is higher than the 2x the totals suggest, maybe closer to 4x.
Out of curiosity we can check the IL code using tools like ILSpy:
Unchecked:
IL_0000: nop
IL_0001: ldarg.2
IL_0002: ldarg.0
IL_0003: bge.s IL_0014
IL_0005: ldarg.0
IL_0006: ldarg.1
IL_0007: ldc.i4.1
IL_0008: add
IL_0009: ldarg.2
IL_000a: ldc.i4.1
IL_000b: add
IL_000c: starg.s i
IL_000e: starg.s a
IL_0010: starg.s c
IL_0012: br.s IL_0000
Checked:
IL_0000: nop
IL_0001: ldarg.2
IL_0002: ldarg.0
IL_0003: bge.s IL_0014
IL_0005: ldarg.0
IL_0006: ldarg.1
IL_0007: ldc.i4.1
IL_0008: add.ovf
IL_0009: ldarg.2
IL_000a: ldc.i4.1
IL_000b: add.ovf
IL_000c: starg.s i
IL_000e: starg.s a
IL_0010: starg.s c
IL_0012: br.s IL_0000
The only difference is that Unchecked uses add and Checked uses add.ovf. add.ovf is add with overflow check.
We can dig even deeper by looking at the jitted x86_64 code.
Unchecked:
; if i < c then
00007FF926A611B3 cmp esi,ebx
00007FF926A611B5 jge 00007FF926A611BD
; i + 1
00007FF926A611B7 inc esi
; a + 1
00007FF926A611B9 inc edi
; loop (a + 1) (i + 1)
00007FF926A611BB jmp 00007FF926A611B3
Checked:
; if i < c then
00007FF926A62613 cmp esi,ebx
00007FF926A62615 jge 00007FF926A62623
; a + 1
00007FF926A62617 add edi,1
; Overflow?
00007FF926A6261A jo 00007FF926A6262D
; i + 1
00007FF926A6261C add esi,1
; Overflow?
00007FF926A6261F jo 00007FF926A6262D
; loop (a + 1) (i + 1)
00007FF926A62621 jmp 00007FF926A62613
Now the reason for the Checked overhead is visible. After each operation the jitter inserts the conditional instruction jo which jumps to code that raises OverflowException if the overflow flag is set.
Instruction timing tables show that the cost of an integer add is less than 1 clock cycle; it can be less than 1 cycle because modern CPUs can execute certain instructions in parallel.
They also show that a branch that is correctly predicted by the CPU takes around 1-2 clock cycles.
So, assuming a throughput of at least 2, the cost of the two integer additions in the Unchecked example should be about 1 clock cycle.
In the Checked example we do add, jo, add, jo. Most likely the CPU can't parallelize these, so the cost should be around 4-6 clock cycles.
Another interesting difference is that the order of the additions changed. With checked additions the order of the operations matters, but with unchecked additions the jitter (and the CPU) has greater flexibility to move the operations around, possibly improving performance.
So long story short; for cheap operations like (+) the overhead of Checked should be around 4x-6x compared to Unchecked.
This assumes no overflow exception occurs. The cost of a .NET exception is probably around 100,000x that of an integer addition.

What is the fastest way to handle overflow on integer division/remainder without panic?

I'm still improving overflower to handle integer overflow. One goal was to be able to use #[overflow(wrap)] to avoid panics on overflow. However, I found out that the .wrapping_div(_) and .wrapping_rem(_) functions of the standard integer types do in fact panic when dividing by zero. Edit: To motivate this use case better: within interrupt handlers, we absolutely want to avoid panics. I assume that the div-by-zero condition is highly unlikely, but we still need to return a "valid" value for some definition of valid.
One possible solution is saturating the value (which I do when code is annotated with #[overflow(saturate)]), but this is likely relatively slow (especially since other operations are also saturated). So I want to add an #[overflow(no_panic)] mode that avoids panics completely and is almost as fast as #[overflow(wrap)] in all cases.
My question is: What is the fastest way to return something (don't care what) without panicking on dividing (or getting the remainder) by zero?
Disclaimer: this isn't really a serious answer. It is almost certainly slower than the naive solution of using an if statement to check whether the divisor is zero.
#![feature(asm)]
fn main() {
println!("18 / 3 = {}", f(18, 3));
println!("2555 / 10 = {}", f(2555, 10));
println!("-16 / 3 = {}", f(-16, 3));
println!("7784388 / 0 = {}", f(7784388, 0));
}
fn f(x: i32, y: i32) -> i32 {
let z: i32;
unsafe {
asm!(
"
test %ecx, %ecx
lahf
and $$0x4000, %eax
or %eax, %ecx
mov %ebx, %eax
cdq
idiv %ecx
"
: "={eax}"(z)
: "{ebx}"(x), "{ecx}"(y)
: "{edx}"
);
}
z
}
Rust Playground
pub fn nopanic_signed_div(x: i32, y: i32) -> i32 {
if y == 0 || y == -1 {
// Divide by -1 is equivalent to neg; we don't care what
// divide by zero returns.
x.wrapping_neg()
} else {
// (You can replace this with unchecked_div to make it more
// obvious this will never panic.)
x / y
}
}
This produces the following on x86-64 with "rustc 1.11.0-nightly (6e00b5556 2016-05-29)":
movl %edi, %eax
leal 1(%rsi), %ecx
cmpl $1, %ecx
ja .LBB0_2
negl %eax
retq
.LBB0_2:
cltd
idivl %esi
retq
It should produce something similar on other platforms.
At least one branch is necessary because LLVM IR considers divide by zero to be undefined behavior. Checking for 0 and -1 separately would involve an extra branch. With those constraints, there isn't really any other choice.
(It might be possible to come up with something slightly faster with inline assembly, but it would be a terrible idea because you would end up generating much worse code in the case of dividing by a constant.)
Whether this solution is actually appropriate probably depends on what your goal is; a divide by zero is probably a logic error, so silently accepting it seems like a bad idea.
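As a quick sanity check of the branchy version, here is a small usage sketch (my own, not from the answer; it repeats the function so it runs standalone). The two inputs that make plain / panic are division by zero and i32::MIN / -1:
pub fn nopanic_signed_div(x: i32, y: i32) -> i32 {
    // same logic as above, repeated so this snippet compiles on its own
    if y == 0 || y == -1 { x.wrapping_neg() } else { x / y }
}

fn main() {
    assert_eq!(nopanic_signed_div(18, 3), 6);
    assert_eq!(nopanic_signed_div(7, 0), -7);               // arbitrary "don't care" value
    assert_eq!(nopanic_signed_div(i32::MIN, -1), i32::MIN); // wraps instead of panicking
    println!("no panics");
}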

Replicate LLVM instructions

I'm trying to replicate an instruction (a binary addition, for example) and show both copies in the LLVM IR, but the following code only keeps the first instruction (add1) that I built. How can I get both built instructions to appear?
IRBuilder<> builder(op);
Value *lhs = op->getOperand(0);
Value *rhs = op->getOperand(1);
Value *add1 = builder.CreateAdd(lhs, rhs);
Value *add2 = builder.CreateAdd(lhs, rhs);
for (auto &U : op->uses()) {
User *user = U.getUser(); // A User is anything with operands.
user->setOperand(U.getOperandNo(), add1);
user->setOperand(U.getOperandNo(), add2);
}
Assume an add instruction: you have a BinaryOperator with two operands, e.g. %op = add i32 10, 32
You take them as Value *lhs = op->getOperand(0); and Value *rhs = op->getOperand(1);
So far so good. Now you are creating two new add instructions before the actual add, since you are constructing your IRBuilder with op as the insertion point.
%add1 = add i32 10, 32
%add2 = add i32 10, 32
%op = add i32 10, 32
Finally you update the Users of your original instruction e.g., something like another BinaryOperator: %0 = mul i32 %op, %op
When you look closely at your loop, you will see that you set the same operand of the User first to add1 and then to add2, so the second call overwrites the first. After your loop the multiplication will look like %0 = mul i32 %add2, %add2
If you dump the BasicBlock where the instructions are inserted directly after insertion, you should see something like:
%add1 = add i32 10, 32
%add2 = add i32 10, 32
%op = add i32 10, 32
%0 = mul i32 %add2, %add2
But if you run another LLVM Pass that performs dead code elimination (e.g., InstCombine) you will end up with:
%add2 = add i32 10, 32
%0 = mul i32 %add2, %add2
Because add1 has no users: you immediately replaced the use of add1 with add2. And op is also gone because all users now use add2 instead of op.
From your question it is hard to guess what you intended with your code, but this is why you will see only one of your instructions in the final IR.

What is tail call optimization?

Very simply, what is tail-call optimization?
More specifically, what are some small code snippets where it could be applied, and where not, with an explanation of why?
Tail-call optimization is where you are able to avoid allocating a new stack frame for a function because the calling function will simply return the value that it gets from the called function. The most common use is tail-recursion, where a recursive function written to take advantage of tail-call optimization can use constant stack space.
Scheme is one of the few programming languages that guarantee in the spec that any implementation must provide this optimization, so here are two examples of the factorial function in Scheme:
(define (fact x)
(if (= x 0) 1
(* x (fact (- x 1)))))
(define (fact x)
(define (fact-tail x accum)
(if (= x 0) accum
(fact-tail (- x 1) (* x accum))))
(fact-tail x 1))
The first function is not tail recursive because when the recursive call is made, the function needs to keep track of the multiplication it needs to do with the result after the call returns. As such, the stack looks as follows:
(fact 3)
(* 3 (fact 2))
(* 3 (* 2 (fact 1)))
(* 3 (* 2 (* 1 (fact 0))))
(* 3 (* 2 (* 1 1)))
(* 3 (* 2 1))
(* 3 2)
6
In contrast, the stack trace for the tail recursive factorial looks as follows:
(fact 3)
(fact-tail 3 1)
(fact-tail 2 3)
(fact-tail 1 6)
(fact-tail 0 6)
6
As you can see, we only need to keep track of the same amount of data for every call to fact-tail because we are simply returning the value we get right through to the top. This means that even if I were to call (fact 1000000), I need only the same amount of space as (fact 3). This is not the case with the non-tail-recursive fact, and as such large values may cause a stack overflow.
Let's walk through a simple example: the factorial function implemented in C.
We start with the obvious recursive definition
unsigned fac(unsigned n)
{
if (n < 2) return 1;
return n * fac(n - 1);
}
A function ends with a tail call if the last operation before the function returns is another function call. If this call invokes the same function, it is tail-recursive.
Even though fac() looks tail-recursive at first glance, it is not, as what actually happens is
unsigned fac(unsigned n)
{
if (n < 2) return 1;
unsigned acc = fac(n - 1);
return n * acc;
}
i.e. the last operation is the multiplication, not the function call.
However, it's possible to rewrite fac() to be tail-recursive by passing the accumulated value down the call chain as an additional argument and passing only the final result up again as the return value:
unsigned fac(unsigned n)
{
return fac_tailrec(1, n);
}
unsigned fac_tailrec(unsigned acc, unsigned n)
{
if (n < 2) return acc;
return fac_tailrec(n * acc, n - 1);
}
Now, why is this useful? Because we immediately return after the tail call, we can discard the previous stackframe before invoking the function in tail position, or, in case of recursive functions, reuse the stackframe as-is.
The tail-call optimization transforms our recursive code into
unsigned fac_tailrec(unsigned acc, unsigned n)
{
TOP:
if (n < 2) return acc;
acc = n * acc;
n = n - 1;
goto TOP;
}
This can be inlined into fac() and we arrive at
unsigned fac(unsigned n)
{
unsigned acc = 1;
TOP:
if (n < 2) return acc;
acc = n * acc;
n = n - 1;
goto TOP;
}
which is equivalent to
unsigned fac(unsigned n)
{
unsigned acc = 1;
for (; n > 1; --n)
acc *= n;
return acc;
}
As we can see here, a sufficiently advanced optimizer can replace tail-recursion with iteration, which is far more efficient as you avoid function call overhead and only use a constant amount of stack space.
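For comparison, here is the same accumulator transformation written in Rust (an illustrative sketch of my own; Rust makes no tail-call guarantee, although LLVM usually turns this pattern into a loop in release builds):
fn fac_tailrec(acc: u64, n: u64) -> u64 {
    if n < 2 { acc } else { fac_tailrec(n * acc, n - 1) }
}

// the hand-written iterative equivalent, matching the final C version above
fn fac(n: u64) -> u64 {
    let mut acc = 1;
    let mut n = n;
    while n > 1 {
        acc *= n;
        n -= 1;
    }
    acc
}

fn main() {
    assert_eq!(fac_tailrec(1, 10), 3_628_800);
    assert_eq!(fac(10), 3_628_800);
}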
TCO (Tail Call Optimization) is the process by which a smart compiler can make a call to a function and take no additional stack space. The only situation in which this happens is if the last instruction executed in a function f is a call to a function g (Note: g can be f). The key here is that f no longer needs stack space - it simply calls g and then returns whatever g would return. In this case the optimization can be made that g just runs and returns whatever value it would have to the thing that called f.
This optimization can make recursive calls take constant stack space, rather than explode.
Example: this factorial function is not TCOptimizable:
from dis import dis
def fact(n):
    if n == 0:
        return 1
    return n * fact(n-1)
dis(fact)
2 0 LOAD_FAST 0 (n)
2 LOAD_CONST 1 (0)
4 COMPARE_OP 2 (==)
6 POP_JUMP_IF_FALSE 12
3 8 LOAD_CONST 2 (1)
10 RETURN_VALUE
4 >> 12 LOAD_FAST 0 (n)
14 LOAD_GLOBAL 0 (fact)
16 LOAD_FAST 0 (n)
18 LOAD_CONST 2 (1)
20 BINARY_SUBTRACT
22 CALL_FUNCTION 1
24 BINARY_MULTIPLY
26 RETURN_VALUE
This function does things besides call another function in its return statement.
The function below is TCOptimizable:
def fact_h(n, acc):
    if n == 0:
        return acc
    return fact_h(n-1, acc*n)

def fact(n):
    return fact_h(n, 1)
dis(fact)
2 0 LOAD_GLOBAL 0 (fact_h)
2 LOAD_FAST 0 (n)
4 LOAD_CONST 1 (1)
6 CALL_FUNCTION 2
8 RETURN_VALUE
This is because the last thing to happen in any of these functions is to call another function.
Probably the best high level description I have found for tail calls, recursive tail calls and tail call optimization is the blog post
"What the heck is: A tail call"
by Dan Sugalski. On tail call optimization he writes:
Consider, for a moment, this simple function:
sub foo (int a) {
a += 15;
return bar(a);
}
So, what can you, or rather your language compiler, do? Well, what it can do is turn code of the form return somefunc(); into the low-level sequence pop stack frame; goto somefunc();. In our example, that means before we call bar, foo cleans itself up and then, rather than calling bar as a subroutine, we do a low-level goto operation to the start of bar. Foo's already cleaned itself out of the stack, so when bar starts it looks like whoever called foo has really called bar, and when bar returns its value, it returns it directly to whoever called foo, rather than returning it to foo which would then return it to its caller.
And on tail recursion:
Tail recursion happens if a function, as its last operation, returns
the result of calling itself. Tail recursion is easier to deal with
because rather than having to jump to the beginning of some random
function somewhere, you just do a goto back to the beginning of
yourself, which is a darned simple thing to do.
So that this:
sub foo (int a, int b) {
if (b == 1) {
return a;
} else {
return foo(a*a + a, b - 1);
}
}
gets quietly turned into:
sub foo (int a, int b) {
label:
if (b == 1) {
return a;
} else {
a = a*a + a;
b = b - 1;
goto label;
}
}
What I like about this description is how succinct and easy it is to grasp for those coming from an imperative language background (C, C++, Java).
GCC C minimal runnable example with x86 disassembly analysis
Let's see how GCC can automatically do tail call optimizations for us by looking at the generated assembly.
This will serve as an extremely concrete example of what was mentioned in other answers such as https://stackoverflow.com/a/9814654/895245 that the optimization can convert recursive function calls to a loop.
This in turn saves memory and improves performance, since memory accesses are often the main thing that makes programs slow nowadays.
As an input, we give GCC a non-optimized naive stack based factorial:
tail_call.c
#include <stdio.h>
#include <stdlib.h>
unsigned factorial(unsigned n) {
if (n == 1) {
return 1;
}
return n * factorial(n - 1);
}
int main(int argc, char **argv) {
int input;
if (argc > 1) {
input = strtoul(argv[1], NULL, 0);
} else {
input = 5;
}
printf("%u\n", factorial(input));
return EXIT_SUCCESS;
}
GitHub upstream.
Compile and disassemble:
gcc -O1 -foptimize-sibling-calls -ggdb3 -std=c99 -Wall -Wextra -Wpedantic \
-o tail_call.out tail_call.c
objdump -d tail_call.out
where -foptimize-sibling-calls is the name of the generalization of tail calls, according to man gcc:
-foptimize-sibling-calls
Optimize sibling and tail recursive calls.
Enabled at levels -O2, -O3, -Os.
as mentioned at: How do I check if gcc is performing tail-recursion optimization?
I choose -O1 because:
the optimization is not done with -O0. I suspect that this is because there are required intermediate transformations missing.
-O3 produces ungodly efficient code that would not be very educative, although it is also tail call optimized.
Disassembly with -fno-optimize-sibling-calls:
0000000000001145 <factorial>:
1145: 89 f8 mov %edi,%eax
1147: 83 ff 01 cmp $0x1,%edi
114a: 74 10 je 115c <factorial+0x17>
114c: 53 push %rbx
114d: 89 fb mov %edi,%ebx
114f: 8d 7f ff lea -0x1(%rdi),%edi
1152: e8 ee ff ff ff callq 1145 <factorial>
1157: 0f af c3 imul %ebx,%eax
115a: 5b pop %rbx
115b: c3 retq
115c: c3 retq
With -foptimize-sibling-calls:
0000000000001145 <factorial>:
1145: b8 01 00 00 00 mov $0x1,%eax
114a: 83 ff 01 cmp $0x1,%edi
114d: 74 0e je 115d <factorial+0x18>
114f: 8d 57 ff lea -0x1(%rdi),%edx
1152: 0f af c7 imul %edi,%eax
1155: 89 d7 mov %edx,%edi
1157: 83 fa 01 cmp $0x1,%edx
115a: 75 f3 jne 114f <factorial+0xa>
115c: c3 retq
115d: 89 f8 mov %edi,%eax
115f: c3 retq
The key difference between the two is that:
the -fno-optimize-sibling-calls uses callq, which is the typical non-optimized function call.
This instruction pushes the return address to the stack, therefore increasing it.
Furthermore, this version also does push %rbx, which pushes %rbx to the stack.
GCC does this because it stores edi, which is the first function argument (n) into ebx, then calls factorial.
GCC needs to do this because it is preparing for another call to factorial, which will use the new edi == n-1.
It chooses ebx because this register is callee-saved: What registers are preserved through a linux x86-64 function call so the subcall to factorial won't change it and lose n.
the -foptimize-sibling-calls does not use any instructions that push to the stack: it only does goto jumps within factorial with the instructions je and jne.
Therefore, this version is equivalent to a while loop, without any function calls. Stack usage is constant.
Tested in Ubuntu 18.10, GCC 8.2.
Note first of all that not all languages support it.
TCO applies to a special case of recursion. The gist of it is that if the last thing you do in a function is call itself (i.e. it calls itself from the "tail" position), the compiler can optimize this to act like iteration instead of standard recursion.
You see, normally during recursion, the runtime needs to keep track of all the recursive calls, so that when one returns it can resume at the previous call and so on. (Try manually writing out the result of a recursive call to get a visual idea of how this works.) Keeping track of all the calls takes up space, which gets significant when the function calls itself a lot. But with TCO, it can just say "go back to the beginning, only this time change the parameter values to these new ones." It can do that because nothing after the recursive call refers to those values.
Look here:
http://tratt.net/laurie/tech_articles/articles/tail_call_optimization
As you probably know, recursive function calls can wreak havoc on a stack; it is easy to quickly run out of stack space. Tail call optimization is a way to write a recursive-style algorithm that uses constant stack space, so the stack does not grow and grow until you get stack errors.
The recursive function approach has a problem. It builds up a call stack of size O(n), which makes our total memory cost O(n). This makes it vulnerable to a stack overflow error, where the call stack gets too big and runs out of space.
Tail call optimization (TCO) is a scheme by which the compiler can optimize such recursive functions so they do not build up a tall call stack, saving the memory cost.
Many languages perform TCO (for example JavaScript, Ruby, and some C compilers), whereas Python and Java do not.
The JavaScript language has it in its specification (ES2015): http://2ality.com/2015/06/tail-call-optimization.html
For TCO to kick in, the recursive call must be the very last thing the function does, with nothing left to execute after it returns.
Large-scale recursion benefits from this optimization, but at small scale the overhead of restructuring the code into a tail call can outweigh the benefit.
Note that TCO turns unbounded recursion like the following into a forever-running function rather than a stack overflow:
void eternity()
{
eternity();
}
In a functional language, tail call optimization is as if a function call could return a partially evaluated expression as the result, which would then be evaluated by the caller.
f x = g x
f 6 reduces to g 6. So if the implementation could return g 6 as the result, and then call that expression it would save a stack frame.
Also
f x = if c x then g x else h x.
f 6 reduces to either g 6 or h 6. So if the implementation evaluates c 6 and finds it is true, it can reduce
if true then g x else h x ---> g x
and if it is false,
if false then g x else h x ---> h x
A simple interpreter without tail call optimization might look like this:
class simple_expresion
{
...
public:
virtual simple_value *DoEvaluate() const = 0;
};
class simple_value
{
...
};
class simple_function : public simple_expresion
{
...
private:
simple_expresion *m_Function;
simple_expresion *m_Parameter;
public:
virtual simple_value *DoEvaluate() const
{
vector<simple_expresion *> parameterList;
parameterList.push_back(m_Parameter);
return m_Function->Call(parameterList);
}
};
class simple_if : public simple_function
{
private:
simple_expresion *m_Condition;
simple_expresion *m_Positive;
simple_expresion *m_Negative;
public:
simple_value *DoEvaluate() const
{
if (m_Condition->DoEvaluate()->IsTrue())
{
return m_Positive->DoEvaluate();
}
else
{
return m_Negative->DoEvaluate();
}
}
};
An interpreter with tail call optimization might look like this:
class tco_expresion
{
...
public:
virtual tco_expresion *DoEvaluate() const = 0;
virtual bool IsValue()
{
return false;
}
};
class tco_value
{
...
public:
virtual bool IsValue()
{
return true;
}
};
class tco_function : public tco_expresion
{
...
private:
tco_expresion *m_Function;
tco_expresion *m_Parameter;
public:
virtual tco_expresion *DoEvaluate() const
{
vector<tco_expresion *> parameterList;
tco_expresion *function = const_cast<tco_function *>(this);
while (!function->IsValue())
{
function = function->DoCall(parameterList);
}
return function;
}
tco_expresion *DoCall(vector<tco_expresion *> &p_ParameterList)
{
p_ParameterList.push_back(m_Parameter);
return m_Function;
}
};
class tco_if : public tco_function
{
private:
tco_expresion *m_Condition;
tco_expresion *m_Positive;
tco_expresion *m_Negative;
tco_expresion *DoEvaluate() const
{
if (m_Condition->DoEvaluate()->IsTrue())
{
return m_Positive;
}
else
{
return m_Negative;
}
}
};
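The same trampoline idea can be sketched in Rust (my own illustration, not part of the original answer): instead of recursing, each step returns either a finished value or a description of the next call, and a driver loop keeps evaluating until a value appears, so the stack stays constant.
enum Step {
    Done(u64),
    Call(u64, u64), // (acc, n) of the next "tail call"
}

fn fac_step(acc: u64, n: u64) -> Step {
    if n < 2 { Step::Done(acc) } else { Step::Call(n * acc, n - 1) }
}

// plays the role of DoEvaluate's while-loop above: constant stack space
fn fac(n: u64) -> u64 {
    let mut step = Step::Call(1, n);
    loop {
        match step {
            Step::Done(v) => return v,
            Step::Call(acc, m) => step = fac_step(acc, m),
        }
    }
}

fn main() {
    assert_eq!(fac(10), 3_628_800);
}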
