patch-2.4.8 linux/arch/ia64/lib/do_csum.S

Next file: linux/arch/ia64/lib/swiotlb.c
Previous file: linux/arch/ia64/lib/csum_partial_copy.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.4.7/linux/arch/ia64/lib/do_csum.S linux/arch/ia64/lib/do_csum.S
@@ -11,6 +11,12 @@
  * Copyright (C) 1999, 2001 Hewlett-Packard Co
  * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
  *
+ * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
+ *		Clean up and optimize the software pipeline, loading two
+ *		back-to-back 8-byte words per loop. Clean up the initialization
+ *		for the loop. Support the cases where load latency = 1 or 2.
+ *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
+ *
  */
 
 #include <asm/asmmacro.h>
@@ -18,51 +24,54 @@
 //
 // Theory of operations:
 //	The goal is to go as quickly as possible to the point where
-//	we can checksum 8 bytes/loop. Before reaching that point we must
+//	we can checksum 16 bytes/loop. Before reaching that point we must
 //	take care of incorrect alignment of first byte.
 //
 //	The code hereafter also takes care of the "tail" part of the buffer
 //	before entering the core loop, if any. The checksum is a sum so it
-//	allows us to commute operations. So we do do the "head" and "tail"
+//	allows us to commute operations. So we do the "head" and "tail"
 //	first to finish at full speed in the body. Once we get the head and
 //	tail values, we feed them into the pipeline, very handy initialization.
 //
 //	Of course we deal with the special case where the whole buffer fits
 //	into one 8 byte word. In this case we have only one entry in the pipeline.
 //
-//	We use a (3+1)-stage pipeline in the loop to account for possible
-//	load latency and also to accomodate for head and tail.
+//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
+//	possible load latency and also to accommodate for head and tail.
 //
 //	The end of the function deals with folding the checksum from 64bits
 //	down to 16bits taking care of the carry.
 //
 //	This version avoids synchronization in the core loop by also using a
-//	pipeline for the accumulation of the checksum in result[].
+//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
 //
-//	 p[]
+//	 wordx[] (x=1,2)
 //	|---|
-//     0|   | r32 : new value loaded in pipeline
+//      |   | 0			: new value loaded in pipeline
 //	|---|
-//     1|   | r33 : in transit data
+//      |   | -			: in transit data
 //	|---|
-//     2|   | r34 : current value to add to checksum
+//      |   | LOAD_LATENCY	: current value to add to checksum
 //	|---|
-//     3|   | r35 : previous value added to checksum (previous iteration)
-//      |---|
+//      |   | LOAD_LATENCY+1	: previous value added to checksum
+//      |---|			(previous iteration)
 //
-//	result[]
+//	resultx[] (x=1,2)
 //	|---|
-//     0|   | r36 : new checksum
+//      |   | 0			: initial value
 //	|---|
-//     1|   | r37 : previous value of checksum
+//      |   | LOAD_LATENCY-1	: new checksum
 //	|---|
-//     2|   | r38 : final checksum when out of the loop (after 2 epilogue rots)
+//      |   | LOAD_LATENCY	: previous value of checksum
 //	|---|
+//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
+//      |---|
+//
 //
+//	See RFC1071 "Computing the Internet Checksum" for various techniques for
+//	calculating the Internet checksum.
 //
 // NOT YET DONE:
-//	- Take advantage of the MMI bandwidth to load more than 8byte per loop
-//	  iteration
 //	- use the lfetch instruction to augment the chances of the data being in
 //	  the cache when we need it.
 //	- Maybe another algorithm which would take care of the folding at the
@@ -71,14 +80,12 @@
 //	  to figure out if we could not split the function depending on the
 //	  type of packet or alignment we get. Like the ip_fast_csum() routine
 //	  where we know we have at least 20bytes worth of data to checksum.
-//	- Look at RFCs about checksums to see whether or not we can do better
-//
 //	- Do a better job of handling small packets.
-//
+
 #define saved_pfs	r11
 #define hmask		r16
 #define tmask		r17
-#define first		r18
+#define first1		r18
 #define firstval	r19
 #define firstoff	r20
 #define last		r21
@@ -89,32 +96,47 @@
 #define tmp1		r26
 #define tmp2		r27
 #define tmp3		r28
-#define carry		r29
+#define carry1		r29
+#define carry2		r30
+#define first2		r31
 
 #define buf		in0
 #define len		in1
 
-// unsigned long do_csum(unsigned char *buf,int len)
+#ifndef CONFIG_IA64_LOAD_LATENCY
+#define CONFIG_IA64_LOAD_LATENCY	2
+#endif
+
+#define LOAD_LATENCY	2	// XXX fix me
+
+#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
+# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
+#endif
+
+#define PIPE_DEPTH			(LOAD_LATENCY+2)
+#define ELD	p[LOAD_LATENCY]		// end of load
+#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
+
+// unsigned long do_csum(unsigned char *buf,long len)
 
 GLOBAL_ENTRY(do_csum)
 	.prologue
 	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,8,0,8
-
-	.rotr p[4], result[3]
+	alloc saved_pfs=ar.pfs,2,16,1,16
+	.rotr word1[4], word2[4],result1[4],result2[4]
+	.rotp p[PIPE_DEPTH]
 	mov ret0=r0		// in case we have zero length
-	cmp4.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
+	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
 	;;			// avoid WAW on CFM
 	mov tmp3=0x7		// a temporary mask/value
 	add tmp1=buf,len	// last byte's address
 (p6)	br.ret.spnt.few rp	// return if true (hope we can avoid that)
 
-	and firstoff=7,buf	// how many bytes off for first element
-	tbit.nz p10,p0=buf,0	// is buf an odd address ?
+	and firstoff=7,buf	// how many bytes off for first element
+	tbit.nz p15,p0=buf,0	// is buf an odd address ?
 	mov hmask=-1		// intialize head mask
 	;;
-
-	andcm first=buf,tmp3	// 8byte aligned down address of first element
+	andcm first1=buf,tmp3	// 8byte aligned down address of first element
 	mov tmask=-1		// initialize tail mask
 	adds tmp2=-1,tmp1	// last-1
 	;;
@@ -123,75 +145,125 @@
 	.save pr, saved_pr
 	mov saved_pr=pr		// preserve predicates (rotation)
 	;;
-	sub tmp3=last,first	// tmp3=distance from first to last
-	cmp.eq p8,p9=last,first	// everything fits in one word ?
+	sub tmp3=last,first1	// tmp3=distance from first1 to last
+	cmp.eq p8,p9=last,first1	// everything fits in one word ?
 	sub tmp1=8,lastoff	// complement to lastoff
-
-	ld8 firstval=[first],8	// load,ahead of time, "first" word
+	ld8 firstval=[first1],8	// load,ahead of time, "first1" word
 	shl tmp2=firstoff,3	// number of bits
 	;;
 	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
-
 (p9)	ld8 lastval=[last]	// load,ahead of time, "last" word, if needed
-(p8)	mov lastval=r0		// we don't need lastval if first==last
-	mov result[1]=r0	// initialize result
+(p9)	adds tmp3=-8,tmp3	// effectively loaded
 	;;
-
+(p8)	mov lastval=r0		// we don't need lastval if first1==last
 	shl tmp1=tmp1,3		// number of bits
-	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
+	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
 	;;
 	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
 	.save ar.lc, saved_lc
 	mov saved_lc=ar.lc	// save lc
 	;;
-
 	.body
+#define count tmp3
 
 (p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
-(p9)	and p[1]=lastval,tmask	// mask last it as appropriate
-	shr.u tmp3=tmp3,3	// we do 8 bytes per loop
+(p9)	and word2[0]=lastval,tmask	// mask last word as appropriate
+	shr.u count=count,3	// we do 8 bytes per loop (count)
 	;;
-	cmp.lt p6,p7=2,tmp3	// tmp3 > 2 ?
-	and p[2]=firstval,hmask	// and mask it as appropriate
-	add tmp1=-2,tmp3	// -2 = -1 (br.ctop) -1 (last-first)
+	// If count is odd, finish this 8-byte word so that we can
+	// load two back-to-back 8-byte words per loop thereafter.
+	tbit.nz p10,p11=count,0		// if (count is odd)
+	and word1[0]=firstval,hmask	// and mask it as appropriate
+	;;
+(p8)	mov result1[0]=word1[0]
+(p9)	add result1[0]=word1[0],word2[0]
+	;;
+	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
+	;;
+(p6)	adds result1[0]=1,result1[0]
+(p8)	br.cond.dptk.few do_csum_exit	// if (within an 8-byte word)
+	;;
+(p11)	br.cond.dptk.few do_csum16	// if (count is even)
+	;;
+	// Here count is odd.
+	ld8 word1[1]=[first1],8		// load an 8-byte word
+	cmp.eq p9,p10=1,count		// if (count == 1)
+	adds count=-1,count		// loaded an 8-byte word
+	;;
+	add result1[0]=result1[0],word1[1]
+	;;
+	cmp.ltu p6,p0=result1[0],word1[1]
+	;;
+(p6)	adds result1[0]=1,result1[0]
+	;;
+(p9)	br.cond.sptk.few do_csum_exit	// if (count == 1) exit
+	// Fall through to calculate the checksum, feeding result1[0] as
+	// the initial value in result1[0].
 	;;
-	// XXX Fixme: not very nice initialization here
-	//
-	// Setup loop control registers:
 	//
-	// tmp3=0 (1 word)   : lc=0, ec=2, p16=F
-	// tmp3=1 (2 words)  : lc=0, ec=3, p16=F
-	// tmp3=2 (3 words)  : lc=0, ec=4, p16=T
-	// tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T
+	// Calculate the checksum loading two 8-byte words per loop.
 	//
-	cmp.eq p8,p9=r0,tmp3	// tmp3 == 0 ?
-(p6)	mov ar.lc=tmp1
-(p7)	mov ar.lc=0
-	;;
-	cmp.lt p6,p7=1,tmp3	// tmp3 > 1 ?
-(p8)	mov ar.ec=2		// we need the extra rotation on result[]
-(p9)	mov ar.ec=3		// hard not to set it twice sometimes
-	;;
-	mov carry=r0			// initialize carry
-(p6)	mov ar.ec=4
-(p6)	mov pr.rot=0xffffffffffff0000	// p16=T, p18=T
-
-	cmp.ne p8,p0=r0,r0		// p8 is false
-	mov p[3]=r0			// make sure first compare fails
-(p7)	mov pr.rot=0xfffffffffffe0000	// p16=F, p18=T
+do_csum16:
+	mov saved_lc=ar.lc
+	shr.u count=count,1	// we do 16 bytes per loop
+	;;
+	cmp.eq p9,p10=r0,count	// if (count == 0)
+	brp.loop.imp 1f,2f
+	;;
+	adds count=-1,count
+	mov ar.ec=PIPE_DEPTH
+	;;
+	mov ar.lc=count	// set lc
+	;;
+	// result1[0] must be initialized in advance.
+	mov result2[0]=r0
 	;;
+	mov pr.rot=1<<16
+	;;
+	mov carry1=r0
+	mov carry2=r0
+	;;
+	add first2=8,first1
+	;;
+(p9)	br.cond.sptk.few do_csum_exit
+	;;
+	nop.m	0
+	nop.i	0
+	;;
+	.align 32
 1:
-(p16)	ld8 p[0]=[first],8		// load next
-(p8)	adds carry=1,carry		// add carry on prev_prev_value
-(p18)	add result[0]=result[1],p[2]	// new_res = prev_res + cur_val
-	cmp.ltu p8,p0=result[1],p[3]	// p8= prev_result < prev_val
-	br.ctop.dptk.few 1b		// loop until lc--==0
-	;;				// RAW on carry when loop exits
- (p8)	adds carry=1,carry;;		// correct for carry on prev_value
-	add result[2]=carry,result[2];;	// add carry to final result
-	cmp.ltu p6,p7=result[2], carry	// check for new carry
+(ELD_1)	cmp.ltu p31,p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
+(p32)	adds carry1=1,carry1
+(ELD_1)	cmp.ltu p47,p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
+(p48)	adds carry2=1,carry2
+(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
+(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
+2:
+(p16)	ld8 word1[0]=[first1],16
+(p16)	ld8 word2[0]=[first2],16
+	br.ctop.sptk.few 1b
+	;;
+	// Since len is a 32-bit value, carry cannot be larger than
+	// a 64-bit value.
+(p32)	adds carry1=1,carry1	// since we miss the last one
+(p48)	adds carry2=1,carry2
+	;;
+	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
+	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
+	;;
+	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
+	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
+	;;
+(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
+(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
 	;;
-(p6)	adds result[2]=1,result[1]	// correct if required
+	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
+	;;
+	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
+	;;
+(p6)	adds result1[0]=1,result1[0]
+	;;
+do_csum_exit:
 	movl tmp3=0xffffffff
 	;;
 	// XXX Fixme
@@ -199,33 +271,66 @@
 	// now fold 64 into 16 bits taking care of carry
 	// that's not very good because it has lots of sequentiality
 	//
-	and tmp1=result[2],tmp3
-	shr.u tmp2=result[2],32
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],32
 	;;
-	add result[2]=tmp1,tmp2
+	add result1[0]=tmp1,tmp2
 	shr.u tmp3=tmp3,16
 	;;
-	and tmp1=result[2],tmp3
-	shr.u tmp2=result[2],16
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
 	;;
-	add result[2]=tmp1,tmp2
+	add result1[0]=tmp1,tmp2
 	;;
-	and tmp1=result[2],tmp3
-	shr.u tmp2=result[2],16
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
 	;;
-	add result[2]=tmp1,tmp2
+	add result1[0]=tmp1,tmp2
 	;;
-	and tmp1=result[2],tmp3
-	shr.u tmp2=result[2],16
+	and tmp1=result1[0],tmp3
+	shr.u tmp2=result1[0],16
 	;;
 	add ret0=tmp1,tmp2
 	mov pr=saved_pr,0xffffffffffff0000
 	;;
 	// if buf was odd then swap bytes
 	mov ar.pfs=saved_pfs		// restore ar.ec
-(p10)	mux1 ret0=ret0,@rev		// reverse word
+(p15)	mux1 ret0=ret0,@rev		// reverse word
 	;;
 	mov ar.lc=saved_lc
-(p10)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
+(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
 	br.ret.sptk.few rp
+
+//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
+//	not much better than the original. So keep the original there so that
+//	someone else can challenge.
+//
+//	shr.u word1[0]=result1[0],32
+//	zxt4 result1[0]=result1[0]
+//	;;
+//	add result1[0]=result1[0],word1[0]
+//	;;
+//	zxt2 result2[0]=result1[0]
+//	extr.u word1[0]=result1[0],16,16
+//	shr.u carry1=result1[0],32
+//	;;
+//	add result2[0]=result2[0],word1[0]
+//	;;
+//	add result2[0]=result2[0],carry1
+//	;;
+//	extr.u ret0=result2[0],16,16
+//	;;
+//	add ret0=ret0,result2[0]
+//	;;
+//	zxt2 ret0=ret0
+//	mov ar.pfs=saved_pfs		 // restore ar.ec
+//	mov pr=saved_pr,0xffffffffffff0000
+//	;;
+//	// if buf was odd then swap bytes
+//	mov ar.lc=saved_lc
+//(p15)	mux1 ret0=ret0,@rev		// reverse word
+//	;;
+//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
+//	br.ret.sptk.few rp
+
 END(do_csum)

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)