?

Log in

No account? Create an account

Game, Which is faster?

« previous entry | next entry »
Oct. 29th, 2009 | 12:41 pm

Last night, while working with someone I am tutoring, we got into a conversation about "what is faster". Below is the assembler output that came out of it, for two different programs which do exactly the same thing, but in different ways.

The question is: which is faster?
Second question: which do you think is the more maintainable code?

	.file	"?"
	.text
	# main -- compiler-generated x86-64 code (see .ident at the end), AT&T syntax.
	# Unrolled variant: each pass of the inner loop copies eight bytes.
	#
	# Behaviour: fill a 1048576-byte source buffer with the byte value 64,
	# copy it byte-wise into a 1048576-byte destination buffer 4096 times,
	# then return 0 in %eax.
	#
	# Stack frame, as offsets from %rbp:
	#   -2097184 .. -1048609   source buffer (filled by memset)
	#   -1048608 .. -33        destination buffer
	#   -20                    32-bit outer-loop counter
	#   -16                    current destination pointer
	#   -8                     current source pointer
.globl main
	.type	main, @function
main:
.LFB0:
	.cfi_startproc
	# Standard frame-pointer prologue; the .cfi_* lines only emit unwind info.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	movq	%rsp, %rbp
	.cfi_offset 6, -16
	.cfi_def_cfa_register 6
	# Reserve both buffers plus the local slots in one adjustment.
	subq	$2097184, %rsp
	# memset(source, 64, 1048576): %rdi = start, %esi = fill byte, %edx = length.
	leaq	-2097184(%rbp), %rax
	movl	$1048576, %edx
	movl	$64, %esi
	movq	%rax, %rdi
	call	memset
	# counter = 0, then jump to the outer-loop test at .L2.
	movl	$0, -20(%rbp)
	jmp	.L2
.L5:
	# Outer-loop body: rewind both pointers to the start of their buffers.
	leaq	-1048608(%rbp), %rax
	movq	%rax, -16(%rbp)
	leaq	-2097184(%rbp), %rax
	movq	%rax, -8(%rbp)
	jmp	.L3
.L7:
	# Target of the inner loop's backward branch; falls through into .L3.
	nop
.L3:
	# Inner-loop body. Both pointers are reloaded from their stack slots for
	# every byte; nothing is kept in a register between the copies.
	# dst[0] = src[0]
	movq	-8(%rbp), %rax
	movzbl	(%rax), %edx
	movq	-16(%rbp), %rax
	movb	%dl, (%rax)
	# dst[1] = src[1]
	movq	-16(%rbp), %rax
	leaq	1(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$1, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[2] = src[2]
	movq	-16(%rbp), %rax
	leaq	2(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$2, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[3] = src[3]
	movq	-16(%rbp), %rax
	leaq	3(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$3, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[4] = src[4]
	movq	-16(%rbp), %rax
	leaq	4(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$4, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[5] = src[5]
	movq	-16(%rbp), %rax
	leaq	5(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$5, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[6] = src[6]
	movq	-16(%rbp), %rax
	leaq	6(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$6, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# dst[7] = src[7]
	movq	-16(%rbp), %rax
	leaq	7(%rax), %rdx
	movq	-8(%rbp), %rax
	addq	$7, %rax
	movzbl	(%rax), %eax
	movb	%al, (%rdx)
	# Advance both pointers past the eight bytes just copied.
	addq	$8, -16(%rbp)
	addq	$8, -8(%rbp)
	# Bottom-tested loop: %rax = one past the end of the destination buffer;
	# repeat until the destination pointer equals it.
	leaq	-1048608(%rbp), %rax
	addq	$1048576, %rax
	cmpq	-16(%rbp), %rax
	jne	.L7
.L4:
	# End of one full copy: counter++. (No branch in this listing targets .L4.)
	addl	$1, -20(%rbp)
.L2:
	# Outer-loop test: unsigned compare, continue while counter <= 4095.
	cmpl	$4095, -20(%rbp)
	jbe	.L5
	# Return 0; leave restores %rsp and %rbp from the frame pointer.
	movl	$0, %eax
	leave
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.ident	"GCC: (GNU) 4.4.1 20090725 (Red Hat 4.4.1-2)"
	.section	.note.GNU-stack,"",@progbits


	.file	"?"
	.text
	# main -- compiler-generated x86-64 code (see .ident at the end), AT&T syntax.
	# Simple variant: each pass of the inner loop copies a single byte.
	#
	# Behaviour: fill a 1048576-byte source buffer with the byte value 64,
	# copy it byte-wise into a 1048576-byte destination buffer 4096 times,
	# then return 0 in %eax.
	#
	# Stack frame, as offsets from %rbp:
	#   -2097184 .. -1048609   source buffer (filled by memset)
	#   -1048608 .. -33        destination buffer
	#   -20                    32-bit outer-loop counter
	#   -16                    current destination pointer
	#   -8                     current source pointer
.globl main
	.type	main, @function
main:
.LFB0:
	.cfi_startproc
	# Standard frame-pointer prologue; the .cfi_* lines only emit unwind info.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	movq	%rsp, %rbp
	.cfi_offset 6, -16
	.cfi_def_cfa_register 6
	# Reserve both buffers plus the local slots in one adjustment.
	subq	$2097184, %rsp
	# memset(source, 64, 1048576): %rdi = start, %esi = fill byte, %edx = length.
	leaq	-2097184(%rbp), %rax
	movl	$1048576, %edx
	movl	$64, %esi
	movq	%rax, %rdi
	call	memset
	# counter = 0, then jump to the outer-loop test at .L2.
	movl	$0, -20(%rbp)
	jmp	.L2
.L5:
	# Outer-loop body: rewind both pointers to the start of their buffers,
	# then enter the inner loop at its test (.L3).
	leaq	-1048608(%rbp), %rax
	movq	%rax, -16(%rbp)
	leaq	-2097184(%rbp), %rax
	movq	%rax, -8(%rbp)
	jmp	.L3
.L4:
	# Inner-loop body: *dst = *src, with both pointers reloaded from the stack.
	movq	-8(%rbp), %rax
	movzbl	(%rax), %edx
	movq	-16(%rbp), %rax
	movb	%dl, (%rax)
	# Advance both pointers by one byte.
	addq	$1, -16(%rbp)
	addq	$1, -8(%rbp)
.L3:
	# Top-tested loop: %rax = one past the end of the destination buffer;
	# keep copying while the destination pointer differs from it.
	leaq	-1048608(%rbp), %rax
	addq	$1048576, %rax
	cmpq	-16(%rbp), %rax
	jne	.L4
	# End of one full copy: counter++.
	addl	$1, -20(%rbp)
.L2:
	# Outer-loop test: unsigned compare, continue while counter <= 4095.
	cmpl	$4095, -20(%rbp)
	jbe	.L5
	# Return 0; leave restores %rsp and %rbp from the frame pointer.
	movl	$0, %eax
	leave
	ret
	.cfi_endproc
.LFE0:
	.size	main, .-main
	.ident	"GCC: (GNU) 4.4.1 20090725 (Red Hat 4.4.1-2)"
	.section	.note.GNU-stack,"",@progbits

Link | Leave a comment |

Comments {7}

Lumiere

(no subject)

from: lumiere
date: Oct. 30th, 2009 03:23 am (UTC)
Link

In what context?

Micro-benchmarks can have misleading results when taken out of context. For example, assuming the unrolled code segment runs slightly faster on its own, you might then want to check what effect it has on the L1 cache and other CPU resources, as these could make the overall system slower even if you're spending less time in this one section.

Of course, if that's the hottest inner loop--and it ought to be if you're doing performance tuning at this level--then this is unlikely...

Reply | Thread