?

Log in

No account? Create an account

Game, Which is faster?

« previous entry | next entry »
Oct. 29th, 2009 | 12:41 pm

Last night, while working with someone I am tutoring, we got into a conversation about "what is faster". Below is the assembler that came out of it, generated from two different programs which do exactly the same thing, but in different ways.

The question is, which is faster?
Second question: which do you think is the more maintainable code?

	.file	"?"
	.text
.globl main
	.type	main, @function
# main (first listing): copy loop that moves eight bytes per inner iteration.
#
# Fills a stack buffer of 1048576 bytes with the byte value 64 via memset,
# then copies that buffer into a second stack buffer of the same size, one
# byte at a time with eight byte moves per inner-loop iteration. The whole
# copy is repeated while the counter is <= 4095 (so 4096 passes). Returns 0.
#
# The output looks unoptimized: both pointers live on the stack and are
# reloaded before every single byte move.
#
# Frame layout, as offsets from %rbp:
#   source buffer      : -2097184 up to (not including) -1048608
#   destination buffer : -1048608 up to (not including) -32
#   outer counter      : -20  (32-bit)
#   destination ptr    : -16
#   source ptr         : -8
main:
.LFB0:
	.cfi_startproc
	# Frame-pointer prologue.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	movq	%rsp, %rbp
	.cfi_offset 6, -16
	.cfi_def_cfa_register 6
	# Reserve room for both buffers (2 * 1048576 bytes) plus 32 bytes of locals.
	subq	$2097184, %rsp
	# memset(source buffer, 64, 1048576): pointer in %rdi, fill byte in %esi,
	# length in %edx.
	leaq	-2097184(%rbp), %rax
	movl	$1048576, %edx
	movl	$64, %esi
	movq	%rax, %rdi
	call	memset
	# Outer counter = 0, then go straight to the outer-loop test at .L2.
	movl	$0, -20(%rbp)
	jmp	.L2
.L5:
	# Start of one copy pass: point the destination and source pointers at
	# the first byte of their buffers.
	leaq	-1048608(%rbp), %rax
	movq	%rax, -16(%rbp)
	leaq	-2097184(%rbp), %rax
	movq	%rax, -8(%rbp)
	jmp	.L3
.L7:
	# Back-edge target of the inner loop; the nop itself does no work.
	nop
.L3:
	# Inner-loop body. The body runs before the end test, so the first eight
	# bytes are copied unconditionally.
	# dst[0] = src[0]
	movq	-8(%rbp), %rax
	movzbl	(%rax), %edx
	movq	-16(%rbp), %rax
	movb	%dl, (%rax)
	# dst[1] = src[1]
	movq	-16(%rbp), %rax
	leaq	1(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$1, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[2] = src[2]
	movq	-16(%rbp), %rax
	leaq	2(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$2, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[3] = src[3]
	movq	-16(%rbp), %rax
	leaq	3(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$3, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[4] = src[4]
	movq	-16(%rbp), %rax
	leaq	4(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$4, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[5] = src[5]
	movq	-16(%rbp), %rax
	leaq	5(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$5, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[6] = src[6]
	movq	-16(%rbp), %rax
	leaq	6(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$6, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[7] = src[7]
	movq	-16(%rbp), %rax
	leaq	7(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$7, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# Advance both pointers past the eight bytes just copied.
	addq	$8, -16(%rbp)
	addq	$8, -8(%rbp)
	# Recompute the end of the destination buffer (base + 1048576) and keep
	# looping until the destination pointer is equal to it.
	leaq	-1048608(%rbp), %rax
	addq	$1048576, %rax
	cmpq	-16(%rbp), %rax
	jne	.L7
.L4:
	# One full copy pass finished: increment the outer counter.
	addl	$1, -20(%rbp)
.L2:
	# Outer-loop test, unsigned (jbe): run another pass while counter <= 4095.
	cmpl	$4095, -20(%rbp)
	jbe	.L5
	# Return value 0, tear down the frame, return.
	movl	$0, %eax
	leave
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.ident	"GCC: (GNU) 4.4.1 20090725 (Red Hat 4.4.1-2)"
	.section	.note.GNU-stack,"",@progbits


	.file	"?"
	.text
.globl main
	.type	main, @function
# main (second listing): copy loop that moves one byte per inner iteration.
#
# Fills a stack buffer of 1048576 bytes with the byte value 64 via memset,
# then copies that buffer into a second stack buffer of the same size, one
# byte per inner-loop iteration. The whole copy is repeated while the counter
# is <= 4095 (so 4096 passes). Returns 0.
#
# The output looks unoptimized: both pointers live on the stack and are
# reloaded for every byte, and the end address is recomputed on every test.
#
# Frame layout, as offsets from %rbp:
#   source buffer      : -2097184 up to (not including) -1048608
#   destination buffer : -1048608 up to (not including) -32
#   outer counter      : -20  (32-bit)
#   destination ptr    : -16
#   source ptr         : -8
main:
.LFB0:
	.cfi_startproc
	# Frame-pointer prologue.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	movq	%rsp, %rbp
	.cfi_offset 6, -16
	.cfi_def_cfa_register 6
	# Reserve room for both buffers (2 * 1048576 bytes) plus 32 bytes of locals.
	subq	$2097184, %rsp
	# memset(source buffer, 64, 1048576): pointer in %rdi, fill byte in %esi,
	# length in %edx.
	leaq	-2097184(%rbp), %rax
	movl	$1048576, %edx
	movl	$64, %esi
	movq	%rax, %rdi
	call	memset
	# Outer counter = 0, then go straight to the outer-loop test at .L2.
	movl	$0, -20(%rbp)
	jmp	.L2
.L5:
	# Start of one copy pass: point the destination and source pointers at
	# the first byte of their buffers, then jump to the inner-loop test.
	leaq	-1048608(%rbp), %rax
	movq	%rax, -16(%rbp)
	leaq	-2097184(%rbp), %rax
	movq	%rax, -8(%rbp)
	jmp	.L3
.L4:
	# Inner-loop body: *dst = *src, then advance both pointers by one byte.
	movq	-8(%rbp), %rax
	movzbl	(%rax), %edx
	movq	-16(%rbp), %rax
	movb	%dl, (%rax)
	addq	$1, -16(%rbp)
	addq	$1, -8(%rbp)
.L3:
	# Inner-loop test (checked before the first byte is copied): recompute the
	# end of the destination buffer (base + 1048576) and run the body while
	# the destination pointer differs from it.
	leaq	-1048608(%rbp), %rax
	addq	$1048576, %rax
	cmpq	-16(%rbp), %rax
	jne	.L4
	# One full copy pass finished: increment the outer counter.
	addl	$1, -20(%rbp)
.L2:
	# Outer-loop test, unsigned (jbe): run another pass while counter <= 4095.
	cmpl	$4095, -20(%rbp)
	jbe	.L5
	# Return value 0, tear down the frame, return.
	movl	$0, %eax
	leave
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.ident	"GCC: (GNU) 4.4.1 20090725 (Red Hat 4.4.1-2)"
	.section	.note.GNU-stack,"",@progbits

Link | Leave a comment |

Comments {7}

Brian "Krow" Aker

(no subject)

from: krow
date: Oct. 30th, 2009 04:17 pm (UTC)
Link

You might be surprised about how poorly gcc handles the above :)

The second is much more maintainable, though not really by a lot (aka... it is about 9 lines of different code).

Reply | Parent | Thread

Lover of Ideas

(no subject)

from: omnifarious
date: Oct. 30th, 2009 10:20 pm (UTC)
Link

You know, after examining things more carefully, I'm guessing the second might well be faster. The second does have some mildly expensive operations for each loop, but the first has several inefficiencies with each section of unrolled code.

It's also clear that neither was compiled with compiler optimizations turned on. gcc can do a much better job than that.

This code is copying the least significant byte of an array of 2^18 4-byte values into a different array of chars or unsigned chars. They are both allocated on the stack.



Edited at 2009-10-30 10:21 pm (UTC)

Reply | Parent | Thread