patch-2.4.21 linux-2.4.21/arch/x86_64/lib/memcpy.S

diff -urN linux-2.4.20/arch/x86_64/lib/memcpy.S linux-2.4.21/arch/x86_64/lib/memcpy.S
@@ -12,103 +12,78 @@
  * rax original destination
  */	
 
- // #define FIX_ALIGNMENT
  	.globl __memcpy
 	.globl memcpy
-	.p2align
+	.p2align 4
 __memcpy:
 memcpy:		
 	pushq %rbx
 	movq %rdi,%rax
 
-#ifdef FIX_ALIGNMENT
-	movl %edi,%ecx
-	andl $7,%ecx
-	jnz  bad_alignment	
-after_bad_alignment:
-#endif
-
-	movq %rdx,%rcx
-	movl $64,%ebx
-	shrq $6,%rcx
-	jz handle_tail
+	movl %edx,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
 	
-loop_64:
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+
 	movq (%rsi),%r11
 	movq 8(%rsi),%r8
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+
 	movq %r11,(%rdi)
 	movq %r8,1*8(%rdi)
+
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+
 	movq %r9,2*8(%rdi)
 	movq %r10,3*8(%rdi)
 		
 	movq 4*8(%rsi),%r11
 	movq 5*8(%rsi),%r8
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
+
 	movq %r11,4*8(%rdi)
 	movq %r8,5*8(%rdi)
+
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+
 	movq %r9,6*8(%rdi)
 	movq %r10,7*8(%rdi)
 
-	addq %rbx,%rsi	
-	addq %rbx,%rdi
-	decl %ecx
-	jnz  loop_64
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
 
-handle_tail:
+.Lhandle_tail:
 	movl %edx,%ecx
 	andl $63,%ecx
 	shrl $3,%ecx
-	jz   handle_7
-	movl $8,%ebx
-loop_8: 
+	jz   .Lhandle_7
+	.p2align 4
+.Lloop_8: 
+	decl %ecx
 	movq (%rsi),%r8
 	movq %r8,(%rdi) 
-	addq %rbx,%rdi
-	addq %rbx,%rsi
-	decl %ecx
-	jnz  loop_8
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz  .Lloop_8
 
-handle_7:
+.Lhandle_7:
 	movl %edx,%ecx
 	andl $7,%ecx
-	jz ende
-loop_1:
+	jz .Lende
+	.p2align 4
+.Lloop_1:
 	movb (%rsi),%r8b
 	movb %r8b,(%rdi) 
 	incq %rdi
 	incq %rsi
 	decl %ecx
-	jnz loop_1
+	jnz .Lloop_1
 	
-ende: 	
-	sfence
+.Lende: 	
 	popq %rbx
 	ret
 
-
-#ifdef FIX_ALIGNMENT
-	/* align destination */
-	/* This is simpleminded. For bigger blocks it may make sense to align
-	   src and dst to their aligned subset and handle the rest separately */
-bad_alignment:
-	movl $8,%r9d
-	subl %ecx,%r9d
-	movl %r9d,%ecx
-	subq %r9,%rdx
-	js   small_alignment
-	jz   small_alignment
-align_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi) 
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz  align_1
-	jmp after_bad_alignment
-small_alignment:
-	addq %r9,%rdx
-	jmp handle_7
-#endif	
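
The rewritten routine copies in three stages: 64 bytes per iteration in .Lloop_64, eight bytes per iteration in .Lloop_8, then single bytes in .Lloop_1, with the loads issued in pairs ahead of the corresponding stores and the counter decremented at the top of each loop (leaq does not change the flags, so the closing jnz still tests the decl result). The patch also drops the disabled FIX_ALIGNMENT fixup and the trailing sfence. As a rough reference only, the same structure can be sketched in C as below; memcpy_sketch is a hypothetical name, not part of the patch, and the quadword accesses assume unaligned 64-bit loads/stores are acceptable, as they are on x86-64.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch of the copy strategy in the new memcpy.S above:
 * a 64-byte main loop, an 8-byte word loop, then a byte loop.
 * This models the structure only; it is not the kernel routine.
 */
static void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t blocks = len >> 6;	/* number of 64-byte blocks */
	size_t words = (len & 63) >> 3;	/* remaining 8-byte words   */
	size_t bytes = len & 7;		/* remaining single bytes   */

	while (blocks--) {
		uint64_t *dq = (uint64_t *)d;
		const uint64_t *sq = (const uint64_t *)s;

		/* eight quadword copies per iteration, as in .Lloop_64 */
		dq[0] = sq[0]; dq[1] = sq[1]; dq[2] = sq[2]; dq[3] = sq[3];
		dq[4] = sq[4]; dq[5] = sq[5]; dq[6] = sq[6]; dq[7] = sq[7];
		d += 64;
		s += 64;
	}
	while (words--) {		/* as in .Lloop_8 */
		*(uint64_t *)d = *(const uint64_t *)s;
		d += 8;
		s += 8;
	}
	while (bytes--)			/* as in .Lloop_1 */
		*d++ = *s++;

	return dst;	/* like the assembly, return the original destination */
}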
