patch-2.4.15 linux/arch/ia64/lib/copy_user.S

Next file: linux/arch/ia64/lib/do_csum.S
Previous file: linux/arch/ia64/lib/copy_page.S
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.4.14/linux/arch/ia64/lib/copy_user.S linux/arch/ia64/lib/copy_user.S
@@ -19,8 +19,8 @@
  *	ret0	0 in case of success. The number of bytes NOT copied in
  *		case of error.
  *
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
  *
  * Fixme:
  *	- handle the case where we have more than 16 bytes and the alignment
@@ -85,7 +85,7 @@
 	cmp.eq p8,p0=r0,len	// check for zero length
 	.save ar.lc, saved_lc
 	mov saved_lc=ar.lc	// preserve ar.lc (slow)
-(p8)	br.ret.spnt.few rp	// empty mempcy()
+(p8)	br.ret.spnt.many rp	// empty memcpy()
 	;;
 	add enddst=dst,len	// first byte after end of source
 	add endsrc=src,len	// first byte after end of destination
@@ -103,26 +103,26 @@
 	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
 
 	xor tmp=src,dst		// same alignment test prepare
-(p10)	br.cond.dptk.few long_copy_user
+(p10)	br.cond.dptk .long_copy_user
 	;;			// RAW pr.rot/p16 ?
 	//
 	// Now we do the byte by byte loop with software pipeline
 	//
 	// p7 is necessarily false by now
 1:
-	EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 1b
 	;;
 	mov ar.lc=saved_lc
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.pfs=saved_pfs		// restore ar.ec
-	br.ret.sptk.few rp	// end of short memcpy
+	br.ret.sptk.many rp		// end of short memcpy
 
 	//
 	// Not 8-byte aligned
 	//
-diff_align_copy_user:
+.diff_align_copy_user:
 	// At this point we know we have more than 16 bytes to copy
 	// and also that src and dest do _not_ have the same alignment.
 	and src2=0x7,src1				// src offset
@@ -153,7 +153,7 @@
 	// We know src1 is not 8-byte aligned in this case.
 	//
 	cmp.eq p14,p15=r0,dst2
-(p15)	br.cond.spnt.few 1f
+(p15)	br.cond.spnt 1f
 	;;
 	sub t1=8,src2
 	mov t2=src2
@@ -163,7 +163,7 @@
 	;;
 	sub lshift=64,rshift
 	;;
-	br.cond.spnt.few word_copy_user
+	br.cond.spnt .word_copy_user
 	;;
 1:
 	cmp.leu	p14,p15=src2,dst2
@@ -192,15 +192,15 @@
 	mov ar.lc=cnt
 	;;
 2:
-	EX(failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 2b
 	;;
 	clrrrb
 	;;
-word_copy_user:
+.word_copy_user:
 	cmp.gtu p9,p0=16,len1
-(p9)	br.cond.spnt.few 4f		// if (16 > len1) skip 8-byte copy
+(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
 	;;
 	shr.u cnt=len1,3		// number of 64-bit words
 	;;
@@ -232,24 +232,24 @@
 #define EPI_1		p[PIPE_DEPTH-2]
 #define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
 #define CASE(pred, shift)	\
-	(pred)	br.cond.spnt.few copy_user_bit##shift
+	(pred)	br.cond.spnt .copy_user_bit##shift
 #define BODY(rshift)						\
-copy_user_bit##rshift:						\
+.copy_user_bit##rshift:						\
 1:								\
-	EX(failure_out,(EPI) st8 [dst1]=tmp,8);			\
+	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
 (EPI_1) shrp tmp=val1[PIPE_DEPTH-3],val1[PIPE_DEPTH-2],rshift;	\
 	EX(3f,(p16) ld8 val1[0]=[src1],8);			\
-	br.ctop.dptk.few 1b;					\
+	br.ctop.dptk 1b;					\
 	;;							\
-	br.cond.sptk.few .diff_align_do_tail;			\
+	br.cond.sptk.many .diff_align_do_tail;			\
 2:								\
 (EPI)	st8 [dst1]=tmp,8;					\
 (EPI_1)	shrp tmp=val1[PIPE_DEPTH-3],val1[PIPE_DEPTH-2],rshift;	\
 3:								\
 (p16)	mov val1[0]=r0;						\
-	br.ctop.dptk.few 2b;					\
+	br.ctop.dptk 2b;					\
 	;;							\
-	br.cond.sptk.few failure_in2
+	br.cond.sptk.many .failure_in2
 
 	//
 	// Since the instruction 'shrp' requires a fixed 128-bit value
@@ -301,25 +301,25 @@
 	mov ar.lc=len1
 	;;
 5:
-	EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 5b
 	;;
 	mov ar.lc=saved_lc
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 
 	//
 	// Beginning of long mempcy (i.e. > 16 bytes)
 	//
-long_copy_user:
+.long_copy_user:
 	tbit.nz p6,p7=src1,0	// odd alignement
 	and tmp=7,tmp
 	;;
 	cmp.eq p10,p8=r0,tmp
 	mov len1=len		// copy because of rotation
-(p8)	br.cond.dpnt.few diff_align_copy_user
+(p8)	br.cond.dpnt .diff_align_copy_user
 	;;
 	// At this point we know we have more than 16 bytes to copy
 	// and also that both src and dest have the same alignment
@@ -327,11 +327,11 @@
 	// forward slowly until we reach 16byte alignment: no need to
 	// worry about reaching the end of buffer.
 	//
-	EX(failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
+	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
 (p6)	adds len1=-1,len1;;
 	tbit.nz p7,p0=src1,1
 	;;
-	EX(failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
+	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
 (p7)	adds len1=-2,len1;;
 	tbit.nz p8,p0=src1,2
 	;;
@@ -339,28 +339,28 @@
 	// Stop bit not required after ld4 because if we fail on ld4
 	// we have never executed the ld1, therefore st1 is not executed.
 	//
-	EX(failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
+	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
 	;;
-	EX(failure_out,(p6) st1 [dst1]=val1[0],1)
+	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
 	tbit.nz p9,p0=src1,3
 	;;
 	//
 	// Stop bit not required after ld8 because if we fail on ld8
 	// we have never executed the ld2, therefore st2 is not executed.
 	//
-	EX(failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
-	EX(failure_out,(p7) st2 [dst1]=val1[1],2)
+	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
+	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
 (p8)	adds len1=-4,len1
 	;;
-	EX(failure_out, (p8) st4 [dst1]=val2[0],4)
+	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
 (p9)	adds len1=-8,len1;;
 	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
 	;;
-	EX(failure_out, (p9) st8 [dst1]=val2[1],8)
+	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
 	tbit.nz p6,p0=len1,3
 	cmp.eq p7,p0=r0,cnt
 	adds tmp=-1,cnt			// br.ctop is repeat/until
-(p7)	br.cond.dpnt.few .dotail	// we have less than 16 bytes left
+(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
 	;;
 	adds src2=8,src1
 	adds dst2=8,dst1
@@ -370,12 +370,12 @@
 	// 16bytes/iteration
 	//
 2:
-	EX(failure_in3,(p16) ld8 val1[0]=[src1],16)
+	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
 (p16)	ld8 val2[0]=[src2],16
 
-	EX(failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
+	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
 (EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk.few 2b
+	br.ctop.dptk 2b
 	;;			// RAW on src1 when fall through from loop
 	//
 	// Tail correction based on len only
@@ -384,29 +384,28 @@
 	// is 16 byte aligned AND we have less than 16 bytes to copy.
 	//
 .dotail:
-	EX(failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
+	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
 	tbit.nz p7,p0=len1,2
 	;;
-	EX(failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
+	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
 	tbit.nz p8,p0=len1,1
 	;;
-	EX(failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
+	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
 	tbit.nz p9,p0=len1,0
 	;;
-	EX(failure_out, (p6) st8 [dst1]=val1[0],8)
+	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
 	;;
-	EX(failure_in1,(p9) ld1 val2[1]=[src1])		// only 1 byte left
+	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
 	mov ar.lc=saved_lc
 	;;
-	EX(failure_out,(p7) st4 [dst1]=val1[1],4)
+	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
 	mov pr=saved_pr,0xffffffffffff0000
 	;;
-	EX(failure_out, (p8)	st2 [dst1]=val2[0],2)
+	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
 	mov ar.pfs=saved_pfs
 	;;
-	EX(failure_out, (p9)	st1 [dst1]=val2[1])
-	br.ret.dptk.few rp
-
+	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
+	br.ret.sptk.many rp
 
 
 	//
@@ -433,32 +432,32 @@
 	//	  pipeline going. We can't really do this inline because
 	//	  p16 is always reset to 1 when lc > 0.
 	//
-failure_in_pipe1:
+.failure_in_pipe1:
 	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
 1:
 (p16)	mov val1[0]=r0
 (EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk.few 1b
+	br.ctop.dptk 1b
 	;;
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 
 	//
 	// This is the case where the byte by byte copy fails on the load
 	// when we copy the head. We need to finish the pipeline and copy
 	// zeros for the rest of the destination. Since this happens
 	// at the top we still need to fill the body and tail.
-failure_in_pipe2:
+.failure_in_pipe2:
 	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
 2:
 (p16)	mov val1[0]=r0
 (EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk.few 2b
+	br.ctop.dptk 2b
 	;;
 	sub len=enddst,dst1,1		// precompute len
-	br.cond.dptk.few failure_in1bis
+	br.cond.dptk.many .failure_in1bis
 	;;
 
 	//
@@ -533,9 +532,7 @@
 	// This means that we are in a situation similar the a fault in the
 	// head part. That's nice!
 	//
-failure_in1:
-//	sub ret0=enddst,dst1	// number of bytes to zero, i.e. not copied
-//	sub len=enddst,dst1,1
+.failure_in1:
 	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
 	sub len=endsrc,src1,1
 	//
@@ -546,18 +543,17 @@
 	// calling side.
 	//
 	;;
-failure_in1bis:			// from (failure_in3)
+.failure_in1bis:		// from (.failure_in3)
 	mov ar.lc=len		// Continue with a stupid byte store.
 	;;
 5:
 	st1 [dst1]=r0,1
-	br.cloop.dptk.few 5b
+	br.cloop.dptk 5b
 	;;
-skip_loop:
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 
 	//
 	// Here we simply restart the loop but instead
@@ -569,7 +565,7 @@
 	// we MUST use src1/endsrc here and not dst1/enddst because
 	// of the pipeline effect.
 	//
-failure_in3:
+.failure_in3:
 	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
 	;;
 2:
@@ -577,36 +573,36 @@
 (p16)	mov val2[0]=r0
 (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
 (EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk.few 2b
+	br.ctop.dptk 2b
 	;;
 	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
 	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk.few failure_in1bis
+(p6)	br.cond.dptk .failure_in1bis
 	;;
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 
-failure_in2:
+.failure_in2:
 	sub ret0=endsrc,src1
 	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
 	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk.few failure_in1bis
+(p6)	br.cond.dptk .failure_in1bis
 	;;
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 
 	//
 	// handling of failures on stores: that's the easy part
 	//
-failure_out:
+.failure_out:
 	sub ret0=enddst,dst1
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 
 	mov ar.pfs=saved_pfs
-	br.ret.dptk.few rp
+	br.ret.sptk.many rp
 END(__copy_user)

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)