#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.text

.globl	aes_t4_encrypt
.align	32
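! Single-block encryption.
! %o0 = input block (may be unaligned), %o1 = output block (may be
! unaligned), %o2 = expanded key schedule, round count at [%o2 + 240].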
aes_t4_encrypt:
	andcc		%o0, 7, %g1		! is input aligned?
	andn		%o0, 7, %o0

	ldx		[%o2 + 0], %g4
	ldx		[%o2 + 8], %g5

	ldx		[%o0 + 0], %o4
	bz,pt		%icc, 1f
	ldx		[%o0 + 8], %o5
	ldx		[%o0 + 16], %o0
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		%o0, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[%o2 + 240], %o3
	ldd		[%o2 + 16], %f12
	ldd		[%o2 + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	srl		%o3, 1, %o3
	ldd		[%o2 + 32], %f16
	sub		%o3, 1, %o3
	ldd		[%o2 + 40], %f18
	add		%o2, 48, %o2

.Lenc:
	.word	0x88cb0400 !aes_eround01	%f12,%f0,%f2,%f4
	.word	0x84cb8420 !aes_eround23	%f14,%f0,%f2,%f2
	ldd		[%o2 + 0], %f12
	ldd		[%o2 + 8], %f14
	sub		%o3,1,%o3
	.word	0x80cc0404 !aes_eround01	%f16,%f4,%f2,%f0
	.word	0x84cc8424 !aes_eround23	%f18,%f4,%f2,%f2
	ldd		[%o2 + 16], %f16
	ldd		[%o2 + 24], %f18
	brnz,pt		%o3, .Lenc
	add		%o2, 32, %o2

	andcc		%o1, 7, %o4		! is output aligned?
	.word	0x88cb0400 !aes_eround01	%f12,%f0,%f2,%f4
	.word	0x84cb8420 !aes_eround23	%f14,%f0,%f2,%f2
	.word	0x80cc0484 !aes_eround01_l	%f16,%f4,%f2,%f0
	.word	0x84cc84a4 !aes_eround23_l	%f18,%f4,%f2,%f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [%o1 + 0]
	retl
	std		%f2, [%o1 + 8]

2:	.word	0x93b24340 !alignaddrl	%o1,%g0,%o1
	mov		0xff, %o5
	srl		%o5, %o4, %o5

	.word	0x89b00900 !faligndata	%f0,%f0,%f4
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%o1 + %o5]0xc0	! partial store
	std		%f6, [%o1 + 8]
	add		%o1, 16, %o1
	orn		%g0, %o5, %o5
	retl
	stda		%f8, [%o1 + %o5]0xc0	! partial store
.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt

.globl	aes_t4_decrypt
.align	32
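! Single-block decryption; same register usage as aes_t4_encrypt, but
! %o2 must point at the decryption key schedule.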
aes_t4_decrypt:
	andcc		%o0, 7, %g1		! is input aligned?
	andn		%o0, 7, %o0

	ldx		[%o2 + 0], %g4
	ldx		[%o2 + 8], %g5

	ldx		[%o0 + 0], %o4
	bz,pt		%icc, 1f
	ldx		[%o0 + 8], %o5
	ldx		[%o0 + 16], %o0
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		%o0, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[%o2 + 240], %o3
	ldd		[%o2 + 16], %f12
	ldd		[%o2 + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	srl		%o3, 1, %o3
	ldd		[%o2 + 32], %f16
	sub		%o3, 1, %o3
	ldd		[%o2 + 40], %f18
	add		%o2, 48, %o2

.Ldec:
	.word	0x88cb0440 !aes_dround01	%f12,%f0,%f2,%f4
	.word	0x84cb8460 !aes_dround23	%f14,%f0,%f2,%f2
	ldd		[%o2 + 0], %f12
	ldd		[%o2 + 8], %f14
	sub		%o3,1,%o3
	.word	0x80cc0444 !aes_dround01	%f16,%f4,%f2,%f0
	.word	0x84cc8464 !aes_dround23	%f18,%f4,%f2,%f2
	ldd		[%o2 + 16], %f16
	ldd		[%o2 + 24], %f18
	brnz,pt		%o3, .Ldec
	add		%o2, 32, %o2

	andcc		%o1, 7, %o4		! is output aligned?
	.word	0x88cb0440 !aes_dround01	%f12,%f0,%f2,%f4
	.word	0x84cb8460 !aes_dround23	%f14,%f0,%f2,%f2
	.word	0x80cc04c4 !aes_dround01_l	%f16,%f4,%f2,%f0
	.word	0x84cc84e4 !aes_dround23_l	%f18,%f4,%f2,%f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [%o1 + 0]
	retl
	std		%f2, [%o1 + 8]

2:	.word	0x93b24340 !alignaddrl	%o1,%g0,%o1
	mov		0xff, %o5
	srl		%o5, %o4, %o5

	.word	0x89b00900 !faligndata	%f0,%f0,%f4
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%o1 + %o5]0xc0	! partial store
	std		%f6, [%o1 + 8]
	add		%o1, 16, %o1
	orn		%g0, %o5, %o5
	retl
	stda		%f8, [%o1 + %o5]0xc0	! partial store
.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
.globl	aes_t4_set_encrypt_key
.align	32
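! Key schedule setup.
! %o0 = user key (may be unaligned), %o1 = key length in bits
! (128/192/256), %o2 = output schedule; the round count is stored at
! [%o2 + 240] and 0 is returned in %o0.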
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
	and		%o0, 7, %o3
	.word	0x91b20300 !alignaddr	%o0,%g0,%o0
	cmp		%o1, 192
	ldd		[%o0 + 0], %f0
	bl,pt		%icc,.L128
	ldd		[%o0 + 8], %f2

	be,pt		%icc,.L192
	ldd		[%o0 + 16], %f4
	brz,pt		%o3, .L256aligned
	ldd		[%o0 + 24], %f6

	ldd		[%o0 + 32], %f8
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18908 !faligndata	%f6,%f8,%f6
.L256aligned:
	std		%f0, [%o2 + 0]
	.word	0x80c80106 !aes_kexpand1	%f0,%f6,0,%f0
	std		%f2, [%o2 + 8]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 16]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 24]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 32]
	.word	0x80c80306 !aes_kexpand1	%f0,%f6,1,%f0
	std		%f2, [%o2 + 40]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 48]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 56]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 64]
	.word	0x80c80506 !aes_kexpand1	%f0,%f6,2,%f0
	std		%f2, [%o2 + 72]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 80]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 88]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 96]
	.word	0x80c80706 !aes_kexpand1	%f0,%f6,3,%f0
	std		%f2, [%o2 + 104]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 112]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 120]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 128]
	.word	0x80c80906 !aes_kexpand1	%f0,%f6,4,%f0
	std		%f2, [%o2 + 136]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 144]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 152]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 160]
	.word	0x80c80b06 !aes_kexpand1	%f0,%f6,5,%f0
	std		%f2, [%o2 + 168]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 176]
	.word	0x89b12602 !aes_kexpand0	%f4,%f2,%f4
	std		%f6, [%o2 + 184]
	.word	0x8db1a624 !aes_kexpand2	%f6,%f4,%f6
	std		%f0, [%o2 + 192]
	.word	0x80c80d06 !aes_kexpand1	%f0,%f6,6,%f0
	std		%f2, [%o2 + 200]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 208]
	std		%f6, [%o2 + 216]
	std		%f0, [%o2 + 224]
	std		%f2, [%o2 + 232]

	mov		14, %o3
	st		%o3, [%o2 + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L192:
	brz,pt		%o3, .L192aligned
	nop

	ldd		[%o0 + 24], %f6
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
.L192aligned:
	std		%f0, [%o2 + 0]
	.word	0x80c80104 !aes_kexpand1	%f0,%f4,0,%f0
	std		%f2, [%o2 + 8]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 16]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 24]
	.word	0x80c80304 !aes_kexpand1	%f0,%f4,1,%f0
	std		%f2, [%o2 + 32]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 40]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 48]
	.word	0x80c80504 !aes_kexpand1	%f0,%f4,2,%f0
	std		%f2, [%o2 + 56]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 64]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 72]
	.word	0x80c80704 !aes_kexpand1	%f0,%f4,3,%f0
	std		%f2, [%o2 + 80]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 88]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 96]
	.word	0x80c80904 !aes_kexpand1	%f0,%f4,4,%f0
	std		%f2, [%o2 + 104]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 112]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 120]
	.word	0x80c80b04 !aes_kexpand1	%f0,%f4,5,%f0
	std		%f2, [%o2 + 128]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 136]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 144]
	.word	0x80c80d04 !aes_kexpand1	%f0,%f4,6,%f0
	std		%f2, [%o2 + 152]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 160]
	.word	0x89b12622 !aes_kexpand2	%f4,%f2,%f4
	std		%f0, [%o2 + 168]
	.word	0x80c80f04 !aes_kexpand1	%f0,%f4,7,%f0
	std		%f2, [%o2 + 176]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f4, [%o2 + 184]
	std		%f0, [%o2 + 192]
	std		%f2, [%o2 + 200]

	mov		12, %o3
	st		%o3, [%o2 + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L128:
	brz,pt		%o3, .L128aligned
	nop

	ldd		[%o0 + 16], %f4
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
.L128aligned:
	std		%f0, [%o2 + 0]
	.word	0x80c80102 !aes_kexpand1	%f0,%f2,0,%f0
	std		%f2, [%o2 + 8]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 16]
	.word	0x80c80302 !aes_kexpand1	%f0,%f2,1,%f0
	std		%f2, [%o2 + 24]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 32]
	.word	0x80c80502 !aes_kexpand1	%f0,%f2,2,%f0
	std		%f2, [%o2 + 40]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 48]
	.word	0x80c80702 !aes_kexpand1	%f0,%f2,3,%f0
	std		%f2, [%o2 + 56]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 64]
	.word	0x80c80902 !aes_kexpand1	%f0,%f2,4,%f0
	std		%f2, [%o2 + 72]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 80]
	.word	0x80c80b02 !aes_kexpand1	%f0,%f2,5,%f0
	std		%f2, [%o2 + 88]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 96]
	.word	0x80c80d02 !aes_kexpand1	%f0,%f2,6,%f0
	std		%f2, [%o2 + 104]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 112]
	.word	0x80c80f02 !aes_kexpand1	%f0,%f2,7,%f0
	std		%f2, [%o2 + 120]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 128]
	.word	0x80c81102 !aes_kexpand1	%f0,%f2,8,%f0
	std		%f2, [%o2 + 136]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 144]
	.word	0x80c81302 !aes_kexpand1	%f0,%f2,9,%f0
	std		%f2, [%o2 + 152]
	.word	0x85b0a620 !aes_kexpand2	%f2,%f0,%f2
	std		%f0, [%o2 + 160]
	std		%f2, [%o2 + 168]

	mov		10, %o3
	st		%o3, [%o2 + 240]
	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key

.globl	aes_t4_set_decrypt_key
.align	32
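! Builds the encryption schedule via .Lset_encrypt_key, then reverses
! the order of the 16-byte round keys in place (.Lkey_flip) to produce
! the schedule expected by the decryption routines.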
aes_t4_set_decrypt_key:
	mov		%o7, %o5
	call		.Lset_encrypt_key
	nop

	mov		%o5, %o7
	sll		%o3, 4, %o0		! %o3 is number of rounds
	add		%o3, 2, %o3
	add		%o2, %o0, %o0	! %o0=%o2+16*rounds
	srl		%o3, 2, %o3		! %o3=(rounds+2)/4

.Lkey_flip:
	ldd		[%o2 + 0],  %f0
	ldd		[%o2 + 8],  %f2
	ldd		[%o2 + 16], %f4
	ldd		[%o2 + 24], %f6
	ldd		[%o0 + 0],  %f8
	ldd		[%o0 + 8],  %f10
	ldd		[%o0 - 16], %f12
	ldd		[%o0 - 8],  %f14
	sub		%o3, 1, %o3
	std		%f0, [%o0 + 0]
	std		%f2, [%o0 + 8]
	std		%f4, [%o0 - 16]
	std		%f6, [%o0 - 8]
	std		%f8, [%o2 + 0]
	std		%f10, [%o2 + 8]
	std		%f12, [%o2 + 16]
	std		%f14, [%o2 + 24]
	add		%o2, 32, %o2
	brnz		%o3, .Lkey_flip
	sub		%o0, 32, %o0

	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
.align	32
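! Internal: encrypt one block held in %f0:%f2 (already xor-ed with
! rk[0]); round keys rk[1..10] are expected in %f16-%f54 and the
! result is left in %f0:%f2.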
_aes128_encrypt_1x:
	.word	0x88cc0400 !aes_eround01	%f16,%f0,%f2,%f4
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x80cd0404 !aes_eround01	%f20,%f4,%f2,%f0
	.word	0x84cd8424 !aes_eround23	%f22,%f4,%f2,%f2
	.word	0x88ce0400 !aes_eround01	%f24,%f0,%f2,%f4
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x80cf0404 !aes_eround01	%f28,%f4,%f2,%f0
	.word	0x84cf8424 !aes_eround23	%f30,%f4,%f2,%f2
	.word	0x88c84400 !aes_eround01	%f32,%f0,%f2,%f4
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x80c94404 !aes_eround01	%f36,%f4,%f2,%f0
	.word	0x84c9c424 !aes_eround23	%f38,%f4,%f2,%f2
	.word	0x88ca4400 !aes_eround01	%f40,%f0,%f2,%f4
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x80cb4404 !aes_eround01	%f44,%f4,%f2,%f0
	.word	0x84cbc424 !aes_eround23	%f46,%f4,%f2,%f2
	.word	0x88cc4400 !aes_eround01	%f48,%f0,%f2,%f4
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x80cd4484 !aes_eround01_l	%f52,%f4,%f2,%f0
	retl
	.word	0x84cdc4a4 !aes_eround23_l	%f54,%f4,%f2,%f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x

.align	32
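! Internal: same as _aes128_encrypt_1x, but processes two blocks,
! %f0:%f2 and %f4:%f6, with interleaved rounds.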
_aes128_encrypt_2x:
	.word	0x90cc0400 !aes_eround01	%f16,%f0,%f2,%f8
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c04 !aes_eround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c24 !aes_eround23	%f18,%f4,%f6,%f6
	.word	0x80cd0408 !aes_eround01	%f20,%f8,%f2,%f0
	.word	0x84cd8428 !aes_eround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c0a !aes_eround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c2a !aes_eround23	%f22,%f10,%f6,%f6
	.word	0x90ce0400 !aes_eround01	%f24,%f0,%f2,%f8
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c04 !aes_eround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c24 !aes_eround23	%f26,%f4,%f6,%f6
	.word	0x80cf0408 !aes_eround01	%f28,%f8,%f2,%f0
	.word	0x84cf8428 !aes_eround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c0a !aes_eround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c2a !aes_eround23	%f30,%f10,%f6,%f6
	.word	0x90c84400 !aes_eround01	%f32,%f0,%f2,%f8
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x94c84c04 !aes_eround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc24 !aes_eround23	%f34,%f4,%f6,%f6
	.word	0x80c94408 !aes_eround01	%f36,%f8,%f2,%f0
	.word	0x84c9c428 !aes_eround23	%f38,%f8,%f2,%f2
	.word	0x88c94c0a !aes_eround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc2a !aes_eround23	%f38,%f10,%f6,%f6
	.word	0x90ca4400 !aes_eround01	%f40,%f0,%f2,%f8
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c04 !aes_eround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc24 !aes_eround23	%f42,%f4,%f6,%f6
	.word	0x80cb4408 !aes_eround01	%f44,%f8,%f2,%f0
	.word	0x84cbc428 !aes_eround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c0a !aes_eround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc2a !aes_eround23	%f46,%f10,%f6,%f6
	.word	0x90cc4400 !aes_eround01	%f48,%f0,%f2,%f8
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c04 !aes_eround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc24 !aes_eround23	%f50,%f4,%f6,%f6
	.word	0x80cd4488 !aes_eround01_l	%f52,%f8,%f2,%f0
	.word	0x84cdc4a8 !aes_eround23_l	%f54,%f8,%f2,%f2
	.word	0x88cd4c8a !aes_eround01_l	%f52,%f10,%f6,%f4
	retl
	.word	0x8ccdccaa !aes_eround23_l	%f54,%f10,%f6,%f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x

.align	32
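! Internal: load the AES-128 schedule at %i3 - rk[0] into %g4:%g5,
! rk[1..10] into %f16-%f54.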
_aes128_loadkey:
	ldx		[%i3 + 0], %g4
	ldx		[%i3 + 8], %g5
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	ldd		[%i3 + 32], %f20
	ldd		[%i3 + 40], %f22
	ldd		[%i3 + 48], %f24
	ldd		[%i3 + 56], %f26
	ldd		[%i3 + 64], %f28
	ldd		[%i3 + 72], %f30
	ldd		[%i3 + 80], %f32
	ldd		[%i3 + 88], %f34
	ldd		[%i3 + 96], %f36
	ldd		[%i3 + 104], %f38
	ldd		[%i3 + 112], %f40
	ldd		[%i3 + 120], %f42
	ldd		[%i3 + 128], %f44
	ldd		[%i3 + 136], %f46
	ldd		[%i3 + 144], %f48
	ldd		[%i3 + 152], %f50
	ldd		[%i3 + 160], %f52
	ldd		[%i3 + 168], %f54
	retl
	nop
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey

.globl	aes128_t4_cbc_encrypt
.align	32
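! CBC encryption.
! %i0 = in, %i1 = out, %i2 = length in bytes, %i3 = key schedule,
! %i4 = ivec (updated on return).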
aes128_t4_cbc_encrypt:
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L128_cbc_enc_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f0
	ld		[%i4 + 4], %f1
	ld		[%i4 + 8], %f2
	ld		[%i4 + 12], %f3
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes128_load_enckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 127
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<128 ||
	brnz,pn		%l5, .L128cbc_enc_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	srlx		%i2, 4, %i2
	prefetch	[%i1], 22

.L128_cbc_enc_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes128_encrypt_1x
	add		%i0, 16, %i0

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L128_cbc_enc_loop
	add		%i1, 16, %i1
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
.L128_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_cbc_enc_loop+4
	orn		%g0, %l3, %l3
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128cbc_enc_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5

.L128_cbc_enc_blk_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 5f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i0 + 16+63], 20
	call		_aes128_encrypt_1x
	add		%i0, 16, %i0
	sub		%i2, 1, %i2

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		%i2, .L128_cbc_enc_blk_loop
	add		%i1, 8, %i1

	membar		#StoreLoad|#StoreStore
	brnz,pt		%l5, .L128_cbc_enc_loop
	mov		%l5, %i2
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore
.type	aes128_t4_cbc_encrypt,#function
.size	aes128_t4_cbc_encrypt,.-aes128_t4_cbc_encrypt
.globl	aes128_t4_ctr32_encrypt
.align	32
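! CTR mode with 32-bit counter.
! %i0 = in, %i1 = out, %i2 = number of 16-byte blocks, %i3 = key
! schedule, %i4 = counter block; only its least significant 32-bit
! word is incremented.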
aes128_t4_ctr32_encrypt:
	save		%sp, -STACK_FRAME, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes128_load_enckey
	sllx		%i2, 4, %i2

	ld		[%i4 + 0], %l4	! counter
	ld		[%i4 + 4], %l5
	ld		[%i4 + 8], %l6
	ld		[%i4 + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	.word	0x9db02304 !movxtod	%g4,%f14		! most significant 64 bits

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L128_ctr32_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L128_ctr32_loop2x
	srlx		%i2, 4, %i2
.L128_ctr32_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	.word	0x88cc040e !aes_eround01	%f16,%f14,%f2,%f4
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	call		_aes128_encrypt_1x+8
	add		%i0, 16, %i0

	.word	0x95b02308 !movxtod	%o0,%f10
	.word	0x99b02309 !movxtod	%o1,%f12
	.word	0x81b28d80 !fxor	%f10,%f0,%f0		! ^= inp
	.word	0x85b30d82 !fxor	%f12,%f2,%f2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L128_ctr32_loop2x
	add		%i1, 16, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_ctr32_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes128_encrypt_2x+16
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L128_ctr32_loop2x
	add		%i1, 32, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_ctr32_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L128_ctr32_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes128_encrypt_2x+16
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L128_ctr32_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L128_ctr32_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L128_ctr32_loop2x
	nop

	ret
	restore
.type	aes128_t4_ctr32_encrypt,#function
.size	aes128_t4_ctr32_encrypt,.-aes128_t4_ctr32_encrypt
.globl	aes128_t4_xts_encrypt
.align	32
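! XTS encryption.
! %i0 = in, %i1 = out, %i2 = length in bytes, %i3 = data key schedule,
! %i4 = key schedule used to encrypt the initial tweak, %i5 = 16-byte
! sector tweak. The tweak is multiplied by x in GF(2^128) (0x87
! reduction) for each block; a trailing partial block is handled by
! ciphertext stealing (.L128_xts_ensteal).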
aes128_t4_xts_encrypt:
	save		%sp, -STACK_FRAME-16, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	mov		%i5, %o0
	add		%fp, STACK_BIAS-16, %o1
	call		aes_t4_encrypt
	mov		%i4, %o2

	add		%fp, STACK_BIAS-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, STACK_BIAS-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	.word	0x81b5c320 !bmask	%l7,%g0,%g0		! byte swap mask

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes128_load_enckey
	and		%i2, 15,  %i5
	and		%i2, -16, %i2

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L128_xts_enblk !	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L128_xts_enloop2x
	srlx		%i2, 4, %i2
.L128_xts_enloop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes128_encrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L128_xts_enloop2x
	add		%i1, 16, %i1

	brnz,pn		%i5, .L128_xts_ensteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_xts_enloop2x+4
	orn		%g0, %l3, %l3

	brnz,pn		%i5, .L128_xts_ensteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_enloop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes128_encrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L128_xts_enloop2x
	add		%i1, 32, %i1

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_ensteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x95b00902 !faligndata	%f0,%f2,%f10
	.word	0x99b08904 !faligndata	%f2,%f4,%f12
	.word	0x9db10906 !faligndata	%f4,%f6,%f14
	.word	0x81b18906 !faligndata	%f6,%f6,%f0

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f10, [%i1 + 8]
	std		%f12, [%i1 + 16]
	std		%f14, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f0, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_xts_enloop2x+4
	orn		%g0, %l3, %l3

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_ensteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_enblk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L128_xts_enblk2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes128_encrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	subcc		%i2, 2, %i2
	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L128_xts_enblk2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L128_xts_enloop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L128_xts_enloop2x
	nop

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_ensteal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_ensteal:
	std		%f0, [%fp + STACK_BIAS-16]	! copy of output
	std		%f2, [%fp + STACK_BIAS-8]

	srl		%l0, 3, %l0
	add		%fp, STACK_BIAS-16, %l7
	add		%i0, %l0, %i0	! original %i0+%i2&-15
	add		%i1, %l2, %i1	! original %i1+%i2&-15
	mov		0, %l0
	nop					! align

.L128_xts_enstealing:
	ldub		[%i0 + %l0], %o0
	ldub		[%l7  + %l0], %o1
	dec		%i5
	stb		%o0, [%l7  + %l0]
	stb		%o1, [%i1 + %l0]
	brnz		%i5, .L128_xts_enstealing
	inc		%l0

	mov		%l7, %i0
	sub		%i1, 16, %i1
	mov		0, %l0
	sub		%i1, %l2, %i1
	ba		.L128_xts_enloop	! one more time
	mov		1, %i2				! %i5 is 0
	ret
	restore
.type	aes128_t4_xts_encrypt,#function
.size	aes128_t4_xts_encrypt,.-aes128_t4_xts_encrypt
.globl	aes128_t4_xts_decrypt
.align	32
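! XTS decryption; register usage as in aes128_t4_xts_encrypt, with
! %i3 pointing at the decryption schedule for the data key. When the
! length is not a multiple of 16, the last complete block is held
! back for ciphertext stealing (.L128_xts_desteal).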
aes128_t4_xts_decrypt:
	save		%sp, -STACK_FRAME-16, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	mov		%i5, %o0
	add		%fp, STACK_BIAS-16, %o1
	call		aes_t4_encrypt
	mov		%i4, %o2

	add		%fp, STACK_BIAS-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, STACK_BIAS-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	.word	0x81b5c320 !bmask	%l7,%g0,%g0		! byte swap mask

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes128_load_deckey
	and		%i2, 15,  %i5
	and		%i2, -16, %i2
	mov		0, %l7
	movrnz		%i5, 16,  %l7
	sub		%i2, %l7, %i2

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L128_xts_deblk !	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	brz,pn		%i2, .L128_xts_desteal
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L128_xts_deloop2x
	srlx		%i2, 4, %i2
.L128_xts_deloop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes128_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L128_xts_deloop2x
	add		%i1, 16, %i1

	brnz,pn		%i5, .L128_xts_desteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_xts_deloop2x+4
	orn		%g0, %l3, %l3

	brnz,pn		%i5, .L128_xts_desteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_deloop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes128_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L128_xts_deloop2x
	add		%i1, 32, %i1

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_desteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x95b00902 !faligndata	%f0,%f2,%f10
	.word	0x99b08904 !faligndata	%f2,%f4,%f12
	.word	0x9db10906 !faligndata	%f4,%f6,%f14
	.word	0x81b18906 !faligndata	%f6,%f6,%f0

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f10, [%i1 + 8]
	std		%f12, [%i1 + 16]
	std		%f14, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f0, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_xts_deloop2x+4
	orn		%g0, %l3, %l3

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_desteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_deblk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L128_xts_deblk2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes128_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	subcc		%i2, 2, %i2
	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L128_xts_deblk2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L128_xts_deloop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L128_xts_deloop2x
	nop

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L128_xts_desteal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_xts_desteal:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 8f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	.word	0x97b0c223 !addxc	%g3,%g3,%o3
	xor		%l7, %o2, %o2

	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	call		_aes128_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	std		%f0, [%fp + STACK_BIAS-16]
	std		%f2, [%fp + STACK_BIAS-8]

	srl		%l0, 3, %l0
	add		%fp, STACK_BIAS-16, %l7
	add		%i0, %l0, %i0	! original %i0+%i2&-15
	add		%i1, %l2, %i1	! original %i1+%i2&-15
	mov		0, %l0
	add		%i1, 16, %i1
	nop					! align

.L128_xts_destealing:
	ldub		[%i0 + %l0], %o0
	ldub		[%l7  + %l0], %o1
	dec		%i5
	stb		%o0, [%l7  + %l0]
	stb		%o1, [%i1 + %l0]
	brnz		%i5, .L128_xts_destealing
	inc		%l0

	mov		%l7, %i0
	sub		%i1, 16, %i1
	mov		0, %l0
	sub		%i1, %l2, %i1
	ba		.L128_xts_deloop	! one more time
	mov		1, %i2				! %i5 is 0
	ret
	restore
.type	aes128_t4_xts_decrypt,#function
.size	aes128_t4_xts_decrypt,.-aes128_t4_xts_decrypt
.globl	aes128_t4_cbc_decrypt
.align	32
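! CBC decryption.
! %i0 = in, %i1 = out, %i2 = length in bytes, %i3 = decryption key
! schedule, %i4 = ivec (updated on return).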
aes128_t4_cbc_decrypt:
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L128_cbc_dec_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f12	! load ivec
	ld		[%i4 + 4], %f13
	ld		[%i4 + 8], %f14
	ld		[%i4 + 12], %f15
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes128_load_deckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L128cbc_dec_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	srlx		%i2, 4, %i2
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L128_cbc_dec_loop2x
	prefetch	[%i1], 22
.L128_cbc_dec_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	.word	0x81b0230a !movxtod	%o2,%f0
	.word	0x85b0230b !movxtod	%o3,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes128_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L128_cbc_dec_loop2x
	add		%i1, 16, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
.L128_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128_cbc_dec_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes128_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L128_cbc_dec_loop2x
	add		%i1, 32, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6
	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L128_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L128cbc_dec_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L128_cbc_dec_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes128_decrypt_2x
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L128_cbc_dec_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L128_cbc_dec_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L128_cbc_dec_loop2x
	nop
	st		%f12, [%i4 + 0]	! write out ivec
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore
.type	aes128_t4_cbc_decrypt,#function
.size	aes128_t4_cbc_decrypt,.-aes128_t4_cbc_decrypt
.align	32
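! Internal: decrypt one block in %f0:%f2 (already xor-ed with rk[0]);
! round keys rk[1..10] of the decryption schedule are expected in
! %f16-%f54.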
_aes128_decrypt_1x:
	.word	0x88cc0440 !aes_dround01	%f16,%f0,%f2,%f4
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x80cd0444 !aes_dround01	%f20,%f4,%f2,%f0
	.word	0x84cd8464 !aes_dround23	%f22,%f4,%f2,%f2
	.word	0x88ce0440 !aes_dround01	%f24,%f0,%f2,%f4
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x80cf0444 !aes_dround01	%f28,%f4,%f2,%f0
	.word	0x84cf8464 !aes_dround23	%f30,%f4,%f2,%f2
	.word	0x88c84440 !aes_dround01	%f32,%f0,%f2,%f4
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x80c94444 !aes_dround01	%f36,%f4,%f2,%f0
	.word	0x84c9c464 !aes_dround23	%f38,%f4,%f2,%f2
	.word	0x88ca4440 !aes_dround01	%f40,%f0,%f2,%f4
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x80cb4444 !aes_dround01	%f44,%f4,%f2,%f0
	.word	0x84cbc464 !aes_dround23	%f46,%f4,%f2,%f2
	.word	0x88cc4440 !aes_dround01	%f48,%f0,%f2,%f4
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x80cd44c4 !aes_dround01_l	%f52,%f4,%f2,%f0
	retl
	.word	0x84cdc4e4 !aes_dround23_l	%f54,%f4,%f2,%f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x

.align	32
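! Internal: two-block variant of _aes128_decrypt_1x, blocks in
! %f0:%f2 and %f4:%f6.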
_aes128_decrypt_2x:
	.word	0x90cc0440 !aes_dround01	%f16,%f0,%f2,%f8
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c44 !aes_dround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c64 !aes_dround23	%f18,%f4,%f6,%f6
	.word	0x80cd0448 !aes_dround01	%f20,%f8,%f2,%f0
	.word	0x84cd8468 !aes_dround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c4a !aes_dround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c6a !aes_dround23	%f22,%f10,%f6,%f6
	.word	0x90ce0440 !aes_dround01	%f24,%f0,%f2,%f8
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c44 !aes_dround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c64 !aes_dround23	%f26,%f4,%f6,%f6
	.word	0x80cf0448 !aes_dround01	%f28,%f8,%f2,%f0
	.word	0x84cf8468 !aes_dround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c4a !aes_dround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c6a !aes_dround23	%f30,%f10,%f6,%f6
	.word	0x90c84440 !aes_dround01	%f32,%f0,%f2,%f8
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x94c84c44 !aes_dround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc64 !aes_dround23	%f34,%f4,%f6,%f6
	.word	0x80c94448 !aes_dround01	%f36,%f8,%f2,%f0
	.word	0x84c9c468 !aes_dround23	%f38,%f8,%f2,%f2
	.word	0x88c94c4a !aes_dround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc6a !aes_dround23	%f38,%f10,%f6,%f6
	.word	0x90ca4440 !aes_dround01	%f40,%f0,%f2,%f8
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c44 !aes_dround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc64 !aes_dround23	%f42,%f4,%f6,%f6
	.word	0x80cb4448 !aes_dround01	%f44,%f8,%f2,%f0
	.word	0x84cbc468 !aes_dround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c4a !aes_dround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc6a !aes_dround23	%f46,%f10,%f6,%f6
	.word	0x90cc4440 !aes_dround01	%f48,%f0,%f2,%f8
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c44 !aes_dround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc64 !aes_dround23	%f50,%f4,%f6,%f6
	.word	0x80cd44c8 !aes_dround01_l	%f52,%f8,%f2,%f0
	.word	0x84cdc4e8 !aes_dround23_l	%f54,%f8,%f2,%f2
	.word	0x88cd4cca !aes_dround01_l	%f52,%f10,%f6,%f4
	retl
	.word	0x8ccdccea !aes_dround23_l	%f54,%f10,%f6,%f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
.align	32
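! Internal: AES-192 single-block rounds; rk[1..12] are expected in
! %f16-%f62.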
_aes192_encrypt_1x:
	.word	0x88cc0400 !aes_eround01	%f16,%f0,%f2,%f4
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x80cd0404 !aes_eround01	%f20,%f4,%f2,%f0
	.word	0x84cd8424 !aes_eround23	%f22,%f4,%f2,%f2
	.word	0x88ce0400 !aes_eround01	%f24,%f0,%f2,%f4
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x80cf0404 !aes_eround01	%f28,%f4,%f2,%f0
	.word	0x84cf8424 !aes_eround23	%f30,%f4,%f2,%f2
	.word	0x88c84400 !aes_eround01	%f32,%f0,%f2,%f4
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x80c94404 !aes_eround01	%f36,%f4,%f2,%f0
	.word	0x84c9c424 !aes_eround23	%f38,%f4,%f2,%f2
	.word	0x88ca4400 !aes_eround01	%f40,%f0,%f2,%f4
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x80cb4404 !aes_eround01	%f44,%f4,%f2,%f0
	.word	0x84cbc424 !aes_eround23	%f46,%f4,%f2,%f2
	.word	0x88cc4400 !aes_eround01	%f48,%f0,%f2,%f4
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x80cd4404 !aes_eround01	%f52,%f4,%f2,%f0
	.word	0x84cdc424 !aes_eround23	%f54,%f4,%f2,%f2
	.word	0x88ce4400 !aes_eround01	%f56,%f0,%f2,%f4
	.word	0x84cec420 !aes_eround23	%f58,%f0,%f2,%f2
	.word	0x80cf4484 !aes_eround01_l	%f60,%f4,%f2,%f0
	retl
	.word	0x84cfc4a4 !aes_eround23_l	%f62,%f4,%f2,%f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x

.align	32
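! Internal: two-block variant of _aes192_encrypt_1x.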
_aes192_encrypt_2x:
	.word	0x90cc0400 !aes_eround01	%f16,%f0,%f2,%f8
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c04 !aes_eround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c24 !aes_eround23	%f18,%f4,%f6,%f6
	.word	0x80cd0408 !aes_eround01	%f20,%f8,%f2,%f0
	.word	0x84cd8428 !aes_eround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c0a !aes_eround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c2a !aes_eround23	%f22,%f10,%f6,%f6
	.word	0x90ce0400 !aes_eround01	%f24,%f0,%f2,%f8
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c04 !aes_eround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c24 !aes_eround23	%f26,%f4,%f6,%f6
	.word	0x80cf0408 !aes_eround01	%f28,%f8,%f2,%f0
	.word	0x84cf8428 !aes_eround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c0a !aes_eround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c2a !aes_eround23	%f30,%f10,%f6,%f6
	.word	0x90c84400 !aes_eround01	%f32,%f0,%f2,%f8
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x94c84c04 !aes_eround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc24 !aes_eround23	%f34,%f4,%f6,%f6
	.word	0x80c94408 !aes_eround01	%f36,%f8,%f2,%f0
	.word	0x84c9c428 !aes_eround23	%f38,%f8,%f2,%f2
	.word	0x88c94c0a !aes_eround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc2a !aes_eround23	%f38,%f10,%f6,%f6
	.word	0x90ca4400 !aes_eround01	%f40,%f0,%f2,%f8
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c04 !aes_eround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc24 !aes_eround23	%f42,%f4,%f6,%f6
	.word	0x80cb4408 !aes_eround01	%f44,%f8,%f2,%f0
	.word	0x84cbc428 !aes_eround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c0a !aes_eround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc2a !aes_eround23	%f46,%f10,%f6,%f6
	.word	0x90cc4400 !aes_eround01	%f48,%f0,%f2,%f8
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c04 !aes_eround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc24 !aes_eround23	%f50,%f4,%f6,%f6
	.word	0x80cd4408 !aes_eround01	%f52,%f8,%f2,%f0
	.word	0x84cdc428 !aes_eround23	%f54,%f8,%f2,%f2
	.word	0x88cd4c0a !aes_eround01	%f52,%f10,%f6,%f4
	.word	0x8ccdcc2a !aes_eround23	%f54,%f10,%f6,%f6
	.word	0x90ce4400 !aes_eround01	%f56,%f0,%f2,%f8
	.word	0x84cec420 !aes_eround23	%f58,%f0,%f2,%f2
	.word	0x94ce4c04 !aes_eround01	%f56,%f4,%f6,%f10
	.word	0x8ccecc24 !aes_eround23	%f58,%f4,%f6,%f6
	.word	0x80cf4488 !aes_eround01_l	%f60,%f8,%f2,%f0
	.word	0x84cfc4a8 !aes_eround23_l	%f62,%f8,%f2,%f2
	.word	0x88cf4c8a !aes_eround01_l	%f60,%f10,%f6,%f4
	retl
	.word	0x8ccfccaa !aes_eround23_l	%f62,%f10,%f6,%f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x

.align	32
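! Internal: AES-256 single-block rounds. Only 12 round keys fit in
! %f16-%f62, so rk[13] and rk[14] are reloaded from [%i3 + 208-232]
! mid-stream and %f16-%f22 are restored to rk[1]/rk[2] before return.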
_aes256_encrypt_1x:
	.word	0x88cc0400 !aes_eround01	%f16,%f0,%f2,%f4
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	ldd		[%i3 + 208], %f16
	ldd		[%i3 + 216], %f18
	.word	0x80cd0404 !aes_eround01	%f20,%f4,%f2,%f0
	.word	0x84cd8424 !aes_eround23	%f22,%f4,%f2,%f2
	ldd		[%i3 + 224], %f20
	ldd		[%i3 + 232], %f22
	.word	0x88ce0400 !aes_eround01	%f24,%f0,%f2,%f4
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x80cf0404 !aes_eround01	%f28,%f4,%f2,%f0
	.word	0x84cf8424 !aes_eround23	%f30,%f4,%f2,%f2
	.word	0x88c84400 !aes_eround01	%f32,%f0,%f2,%f4
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x80c94404 !aes_eround01	%f36,%f4,%f2,%f0
	.word	0x84c9c424 !aes_eround23	%f38,%f4,%f2,%f2
	.word	0x88ca4400 !aes_eround01	%f40,%f0,%f2,%f4
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x80cb4404 !aes_eround01	%f44,%f4,%f2,%f0
	.word	0x84cbc424 !aes_eround23	%f46,%f4,%f2,%f2
	.word	0x88cc4400 !aes_eround01	%f48,%f0,%f2,%f4
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x80cd4404 !aes_eround01	%f52,%f4,%f2,%f0
	.word	0x84cdc424 !aes_eround23	%f54,%f4,%f2,%f2
	.word	0x88ce4400 !aes_eround01	%f56,%f0,%f2,%f4
	.word	0x84cec420 !aes_eround23	%f58,%f0,%f2,%f2
	.word	0x80cf4404 !aes_eround01	%f60,%f4,%f2,%f0
	.word	0x84cfc424 !aes_eround23	%f62,%f4,%f2,%f2
	.word	0x88cc0400 !aes_eround01	%f16,%f0,%f2,%f4
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	.word	0x80cd0484 !aes_eround01_l	%f20,%f4,%f2,%f0
	.word	0x84cd84a4 !aes_eround23_l	%f22,%f4,%f2,%f2
	ldd		[%i3 + 32], %f20
	retl
	ldd		[%i3 + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x

.align	32
_aes256_encrypt_2x:
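! Two-block version of _aes256_encrypt_1x: the same fourteen-round schedule,
! with %f0:%f2 and %f4:%f6 interleaved and %f8/%f10 as temporaries.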
	.word	0x90cc0400 !aes_eround01	%f16,%f0,%f2,%f8
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c04 !aes_eround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c24 !aes_eround23	%f18,%f4,%f6,%f6
	ldd		[%i3 + 208], %f16
	ldd		[%i3 + 216], %f18
	.word	0x80cd0408 !aes_eround01	%f20,%f8,%f2,%f0
	.word	0x84cd8428 !aes_eround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c0a !aes_eround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c2a !aes_eround23	%f22,%f10,%f6,%f6
	ldd		[%i3 + 224], %f20
	ldd		[%i3 + 232], %f22
	.word	0x90ce0400 !aes_eround01	%f24,%f0,%f2,%f8
	.word	0x84ce8420 !aes_eround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c04 !aes_eround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c24 !aes_eround23	%f26,%f4,%f6,%f6
	.word	0x80cf0408 !aes_eround01	%f28,%f8,%f2,%f0
	.word	0x84cf8428 !aes_eround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c0a !aes_eround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c2a !aes_eround23	%f30,%f10,%f6,%f6
	.word	0x90c84400 !aes_eround01	%f32,%f0,%f2,%f8
	.word	0x84c8c420 !aes_eround23	%f34,%f0,%f2,%f2
	.word	0x94c84c04 !aes_eround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc24 !aes_eround23	%f34,%f4,%f6,%f6
	.word	0x80c94408 !aes_eround01	%f36,%f8,%f2,%f0
	.word	0x84c9c428 !aes_eround23	%f38,%f8,%f2,%f2
	.word	0x88c94c0a !aes_eround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc2a !aes_eround23	%f38,%f10,%f6,%f6
	.word	0x90ca4400 !aes_eround01	%f40,%f0,%f2,%f8
	.word	0x84cac420 !aes_eround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c04 !aes_eround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc24 !aes_eround23	%f42,%f4,%f6,%f6
	.word	0x80cb4408 !aes_eround01	%f44,%f8,%f2,%f0
	.word	0x84cbc428 !aes_eround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c0a !aes_eround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc2a !aes_eround23	%f46,%f10,%f6,%f6
	.word	0x90cc4400 !aes_eround01	%f48,%f0,%f2,%f8
	.word	0x84ccc420 !aes_eround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c04 !aes_eround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc24 !aes_eround23	%f50,%f4,%f6,%f6
	.word	0x80cd4408 !aes_eround01	%f52,%f8,%f2,%f0
	.word	0x84cdc428 !aes_eround23	%f54,%f8,%f2,%f2
	.word	0x88cd4c0a !aes_eround01	%f52,%f10,%f6,%f4
	.word	0x8ccdcc2a !aes_eround23	%f54,%f10,%f6,%f6
	.word	0x90ce4400 !aes_eround01	%f56,%f0,%f2,%f8
	.word	0x84cec420 !aes_eround23	%f58,%f0,%f2,%f2
	.word	0x94ce4c04 !aes_eround01	%f56,%f4,%f6,%f10
	.word	0x8ccecc24 !aes_eround23	%f58,%f4,%f6,%f6
	.word	0x80cf4408 !aes_eround01	%f60,%f8,%f2,%f0
	.word	0x84cfc428 !aes_eround23	%f62,%f8,%f2,%f2
	.word	0x88cf4c0a !aes_eround01	%f60,%f10,%f6,%f4
	.word	0x8ccfcc2a !aes_eround23	%f62,%f10,%f6,%f6
	.word	0x90cc0400 !aes_eround01	%f16,%f0,%f2,%f8
	.word	0x84cc8420 !aes_eround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c04 !aes_eround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c24 !aes_eround23	%f18,%f4,%f6,%f6
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	.word	0x80cd0488 !aes_eround01_l	%f20,%f8,%f2,%f0
	.word	0x84cd84a8 !aes_eround23_l	%f22,%f8,%f2,%f2
	.word	0x88cd0c8a !aes_eround01_l	%f20,%f10,%f6,%f4
	.word	0x8ccd8caa !aes_eround23_l	%f22,%f10,%f6,%f6
	ldd		[%i3 + 32], %f20
	retl
	ldd		[%i3 + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x

.align	32
_aes192_loadkey:
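! Load the expanded key at %i3 into registers: rk[0] into %g4:%g5 and the next
! twelve round keys ([%i3 + 16] through [%i3 + 200]) into %f16-%f62.  AES-192
! consumes exactly these; the AES-256 paths reuse the same loader (see the
! aliases below) and fetch their two extra round keys from memory on the fly.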
	ldx		[%i3 + 0], %g4
	ldx		[%i3 + 8], %g5
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	ldd		[%i3 + 32], %f20
	ldd		[%i3 + 40], %f22
	ldd		[%i3 + 48], %f24
	ldd		[%i3 + 56], %f26
	ldd		[%i3 + 64], %f28
	ldd		[%i3 + 72], %f30
	ldd		[%i3 + 80], %f32
	ldd		[%i3 + 88], %f34
	ldd		[%i3 + 96], %f36
	ldd		[%i3 + 104], %f38
	ldd		[%i3 + 112], %f40
	ldd		[%i3 + 120], %f42
	ldd		[%i3 + 128], %f44
	ldd		[%i3 + 136], %f46
	ldd		[%i3 + 144], %f48
	ldd		[%i3 + 152], %f50
	ldd		[%i3 + 160], %f52
	ldd		[%i3 + 168], %f54
	ldd		[%i3 + 176], %f56
	ldd		[%i3 + 184], %f58
	ldd		[%i3 + 192], %f60
	ldd		[%i3 + 200], %f62
	retl
	nop
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
.globl	aes256_t4_cbc_encrypt
.align	32
aes256_t4_cbc_encrypt:
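! AES-256 CBC encryption.  Judging by the register usage below: %i0=inp,
! %i1=out, %i2=len in bytes, %i3=expanded key, %i4=ivec (updated on return).
! If the output is 8-byte aligned, len>=128 and inp!=out, the bulk path at
! .L256cbc_enc_blk stores through ASI_BLK_INIT; otherwise the generic loop
! below copes with unaligned input/output and in-place operation.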
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L256_cbc_enc_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f0
	ld		[%i4 + 4], %f1
	ld		[%i4 + 8], %f2
	ld		[%i4 + 12], %f3
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes256_load_enckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 127
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<128 ||
	brnz,pn		%l5, .L256cbc_enc_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	srlx		%i2, 4, %i2
	prefetch	[%i1], 22

.L256_cbc_enc_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes256_encrypt_1x
	add		%i0, 16, %i0

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L256_cbc_enc_loop
	add		%i1, 16, %i1
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
.L256_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_cbc_enc_loop+4
	orn		%g0, %l3, %l3
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256cbc_enc_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5

.L256_cbc_enc_blk_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 5f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i0 + 16+63], 20
	call		_aes256_encrypt_1x
	add		%i0, 16, %i0
	sub		%i2, 1, %i2

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		%i2, .L256_cbc_enc_blk_loop
	add		%i1, 8, %i1

	membar		#StoreLoad|#StoreStore
	brnz,pt		%l5, .L256_cbc_enc_loop
	mov		%l5, %i2
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore
.type	aes256_t4_cbc_encrypt,#function
.size	aes256_t4_cbc_encrypt,.-aes256_t4_cbc_encrypt
.globl	aes192_t4_cbc_encrypt
.align	32
aes192_t4_cbc_encrypt:
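! Same structure as aes256_t4_cbc_encrypt above, using the 12-round AES-192
! helpers instead.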
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L192_cbc_enc_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f0
	ld		[%i4 + 4], %f1
	ld		[%i4 + 8], %f2
	ld		[%i4 + 12], %f3
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes192_load_enckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 127
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<128 ||
	brnz,pn		%l5, .L192cbc_enc_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	srlx		%i2, 4, %i2
	prefetch	[%i1], 22

.L192_cbc_enc_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes192_encrypt_1x
	add		%i0, 16, %i0

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L192_cbc_enc_loop
	add		%i1, 16, %i1
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
.L192_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L192_cbc_enc_loop+4
	orn		%g0, %l3, %l3
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L192cbc_enc_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5

.L192_cbc_enc_blk_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 5f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	prefetch	[%i0 + 16+63], 20
	call		_aes192_encrypt_1x
	add		%i0, 16, %i0
	sub		%i2, 1, %i2

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		%i2, .L192_cbc_enc_blk_loop
	add		%i1, 8, %i1

	membar		#StoreLoad|#StoreStore
	brnz,pt		%l5, .L192_cbc_enc_loop
	mov		%l5, %i2
	st		%f0, [%i4 + 0]
	st		%f1, [%i4 + 4]
	st		%f2, [%i4 + 8]
	st		%f3, [%i4 + 12]
	ret
	restore
.type	aes192_t4_cbc_encrypt,#function
.size	aes192_t4_cbc_encrypt,.-aes192_t4_cbc_encrypt
.globl	aes256_t4_ctr32_encrypt
.align	32
aes256_t4_ctr32_encrypt:
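! AES-256 CTR with a 32-bit counter.  The upper 96 bits of the counter block
! stay fixed, pre-XORed with rk[0] (held in %f14 and %g5); only the low word,
! kept in %l7, is incremented and truncated per block.  An odd leading block
! is peeled off in .L256_ctr32_loop, after which even counts run through the
! two-block loop; large aligned requests take .L256_ctr32_blk.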
	save		%sp, -STACK_FRAME, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes256_load_enckey
	sllx		%i2, 4, %i2

	ld		[%i4 + 0], %l4	! counter
	ld		[%i4 + 4], %l5
	ld		[%i4 + 8], %l6
	ld		[%i4 + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	.word	0x9db02304 !movxtod	%g4,%f14		! most significant 64 bits

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L256_ctr32_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L256_ctr32_loop2x
	srlx		%i2, 4, %i2
.L256_ctr32_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	.word	0x88cc040e !aes_eround01	%f16,%f14,%f2,%f4
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	call		_aes256_encrypt_1x+8
	add		%i0, 16, %i0

	.word	0x95b02308 !movxtod	%o0,%f10
	.word	0x99b02309 !movxtod	%o1,%f12
	.word	0x81b28d80 !fxor	%f10,%f0,%f0		! ^= inp
	.word	0x85b30d82 !fxor	%f12,%f2,%f2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L256_ctr32_loop2x
	add		%i1, 16, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_ctr32_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes256_encrypt_2x+16
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L256_ctr32_loop2x
	add		%i1, 32, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_ctr32_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L256_ctr32_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes256_encrypt_2x+16
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L256_ctr32_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L256_ctr32_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L256_ctr32_loop2x
	nop

	ret
	restore
.type	aes256_t4_ctr32_encrypt,#function
.size	aes256_t4_ctr32_encrypt,.-aes256_t4_ctr32_encrypt
.globl	aes256_t4_xts_encrypt
.align	32
aes256_t4_xts_encrypt:
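! AES-256 XTS encryption.  The aes_t4_encrypt call below seemingly encrypts the
! 16-byte value at %i5 under the key at %i4 to form the initial tweak, which is
! then kept in %g3:%g2 and multiplied by x in GF(2^128) (the srax/addcc/addxc/
! xor sequence) after every block; %i3 holds the key used on the data itself.
! A trailing partial block (%i2 & 15, saved in %i5) is finished by ciphertext
! stealing at .L256_xts_ensteal.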
	save		%sp, -STACK_FRAME-16, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	mov		%i5, %o0
	add		%fp, STACK_BIAS-16, %o1
	call		aes_t4_encrypt
	mov		%i4, %o2

	add		%fp, STACK_BIAS-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, STACK_BIAS-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	.word	0x81b5c320 !bmask	%l7,%g0,%g0		! byte swap mask

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes256_load_enckey
	and		%i2, 15,  %i5
	and		%i2, -16, %i2

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L256_xts_enblk !	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L256_xts_enloop2x
	srlx		%i2, 4, %i2
.L256_xts_enloop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes256_encrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L256_xts_enloop2x
	add		%i1, 16, %i1

	brnz,pn		%i5, .L256_xts_ensteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_xts_enloop2x+4
	orn		%g0, %l3, %l3

	brnz,pn		%i5, .L256_xts_ensteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_enloop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes256_encrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L256_xts_enloop2x
	add		%i1, 32, %i1

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_ensteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x95b00902 !faligndata	%f0,%f2,%f10
	.word	0x99b08904 !faligndata	%f2,%f4,%f12
	.word	0x9db10906 !faligndata	%f4,%f6,%f14
	.word	0x81b18906 !faligndata	%f6,%f6,%f0

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f10, [%i1 + 8]
	std		%f12, [%i1 + 16]
	std		%f14, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f0, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_xts_enloop2x+4
	orn		%g0, %l3, %l3

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_ensteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_enblk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L256_xts_enblk2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes256_encrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	subcc		%i2, 2, %i2
	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L256_xts_enblk2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L256_xts_enloop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L256_xts_enloop2x
	nop

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_ensteal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_ensteal:
	std		%f0, [%fp + STACK_BIAS-16]	! copy of output
	std		%f2, [%fp + STACK_BIAS-8]

	srl		%l0, 3, %l0
	add		%fp, STACK_BIAS-16, %l7
	add		%i0, %l0, %i0	! original %i0+%i2&-15
	add		%i1, %l2, %i1	! original %i1+%i2&-15
	mov		0, %l0
	nop					! align

.L256_xts_enstealing:
	ldub		[%i0 + %l0], %o0
	ldub		[%l7  + %l0], %o1
	dec		%i5
	stb		%o0, [%l7  + %l0]
	stb		%o1, [%i1 + %l0]
	brnz		%i5, .L256_xts_enstealing
	inc		%l0

	mov		%l7, %i0
	sub		%i1, 16, %i1
	mov		0, %l0
	sub		%i1, %l2, %i1
	ba		.L256_xts_enloop	! one more time
	mov		1, %i2				! %i5 is 0
	ret
	restore
.type	aes256_t4_xts_encrypt,#function
.size	aes256_t4_xts_encrypt,.-aes256_t4_xts_encrypt
.globl	aes256_t4_xts_decrypt
.align	32
aes256_t4_xts_decrypt:
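! AES-256 XTS decryption.  Tweak generation and GF(2^128) stepping match the
! encrypt path; the data key is loaded with _aes256_load_deckey, and when a
! partial final block exists (%i2 & 15) one whole block is held back
! (%i2 -= 16) so .L256_xts_desteal can decrypt it under the next tweak and
! splice in the stolen ciphertext bytes.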
	save		%sp, -STACK_FRAME-16, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	mov		%i5, %o0
	add		%fp, STACK_BIAS-16, %o1
	call		aes_t4_encrypt
	mov		%i4, %o2

	add		%fp, STACK_BIAS-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, STACK_BIAS-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	.word	0x81b5c320 !bmask	%l7,%g0,%g0		! byte swap mask

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes256_load_deckey
	and		%i2, 15,  %i5
	and		%i2, -16, %i2
	mov		0, %l7
	movrnz		%i5, 16,  %l7
	sub		%i2, %l7, %i2

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L256_xts_deblk !	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	brz,pn		%i2, .L256_xts_desteal
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L256_xts_deloop2x
	srlx		%i2, 4, %i2
.L256_xts_deloop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes256_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L256_xts_deloop2x
	add		%i1, 16, %i1

	brnz,pn		%i5, .L256_xts_desteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_xts_deloop2x+4
	orn		%g0, %l3, %l3

	brnz,pn		%i5, .L256_xts_desteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_deloop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes256_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L256_xts_deloop2x
	add		%i1, 32, %i1

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_desteal
	nop

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x95b00902 !faligndata	%f0,%f2,%f10
	.word	0x99b08904 !faligndata	%f2,%f4,%f12
	.word	0x9db10906 !faligndata	%f4,%f6,%f14
	.word	0x81b18906 !faligndata	%f6,%f6,%f0

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f10, [%i1 + 8]
	std		%f12, [%i1 + 16]
	std		%f14, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f0, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_xts_deloop2x+4
	orn		%g0, %l3, %l3

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_desteal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_deblk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L256_xts_deblk2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	.word	0x99b02302 !movxtod	%g2,%f12
	.word	0x9db02303 !movxtod	%g3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10
	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2
	.word	0x89b0230a !movxtod	%o2,%f4
	.word	0x8db0230b !movxtod	%o3,%f6

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4		! ^= tweak[0]
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes256_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02302 !movxtod	%g2,%f8
	.word	0x95b02303 !movxtod	%g3,%f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	.word	0x87b0c223 !addxc	%g3,%g3,%g3
	xor		%l7, %g2, %g2

	.word	0x91b20988 !bshuffle	%f8,%f8,%f8
	.word	0x95b2898a !bshuffle	%f10,%f10,%f10

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	subcc		%i2, 2, %i2
	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L256_xts_deblk2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L256_xts_deloop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L256_xts_deloop2x
	nop

	.word	0x81b00f04 !fsrc2	%f0,%f4,%f0
	.word	0x85b00f06 !fsrc2	%f0,%f6,%f2
	brnz,pn		%i5, .L256_xts_desteal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_xts_desteal:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 8f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	.word	0x97b0c223 !addxc	%g3,%g3,%o3
	xor		%l7, %o2, %o2

	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x99b3098c !bshuffle	%f12,%f12,%f12
	.word	0x9db3898e !bshuffle	%f14,%f14,%f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	.word	0x81b02308 !movxtod	%o0,%f0
	.word	0x85b02309 !movxtod	%o1,%f2

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	call		_aes256_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= tweak[0]
	.word	0x85b38d82 !fxor	%f14,%f2,%f2

	std		%f0, [%fp + STACK_BIAS-16]
	std		%f2, [%fp + STACK_BIAS-8]

	srl		%l0, 3, %l0
	add		%fp, STACK_BIAS-16, %l7
	add		%i0, %l0, %i0	! original %i0+%i2&-15
	add		%i1, %l2, %i1	! original %i1+%i2&-15
	mov		0, %l0
	add		%i1, 16, %i1
	nop					! align

.L256_xts_destealing:
	ldub		[%i0 + %l0], %o0
	ldub		[%l7  + %l0], %o1
	dec		%i5
	stb		%o0, [%l7  + %l0]
	stb		%o1, [%i1 + %l0]
	brnz		%i5, .L256_xts_destealing
	inc		%l0

	mov		%l7, %i0
	sub		%i1, 16, %i1
	mov		0, %l0
	sub		%i1, %l2, %i1
	ba		.L256_xts_deloop	! one more time
	mov		1, %i2				! %i5 is 0
	ret
	restore
.type	aes256_t4_xts_decrypt,#function
.size	aes256_t4_xts_decrypt,.-aes256_t4_xts_decrypt
.globl	aes192_t4_ctr32_encrypt
.align	32
aes192_t4_ctr32_encrypt:
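! AES-192 counterpart of aes256_t4_ctr32_encrypt; only the round helpers and
! key loader differ.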
	save		%sp, -STACK_FRAME, %sp
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9

	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes192_load_enckey
	sllx		%i2, 4, %i2

	ld		[%i4 + 0], %l4	! counter
	ld		[%i4 + 4], %l5
	ld		[%i4 + 8], %l6
	ld		[%i4 + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	.word	0x9db02304 !movxtod	%g4,%f14		! most significant 64 bits

	sub		%i0, %i1, %l5	! %i0!=%i1
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L192_ctr32_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L192_ctr32_loop2x
	srlx		%i2, 4, %i2
.L192_ctr32_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	.word	0x88cc040e !aes_eround01	%f16,%f14,%f2,%f4
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	call		_aes192_encrypt_1x+8
	add		%i0, 16, %i0

	.word	0x95b02308 !movxtod	%o0,%f10
	.word	0x99b02309 !movxtod	%o1,%f12
	.word	0x81b28d80 !fxor	%f10,%f0,%f0		! ^= inp
	.word	0x85b30d82 !fxor	%f12,%f2,%f2

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L192_ctr32_loop2x
	add		%i1, 16, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8
	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L192_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L192_ctr32_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes192_encrypt_2x+16
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L192_ctr32_loop2x
	add		%i1, 32, %i1

	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6

	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L192_ctr32_loop2x+4
	orn		%g0, %l3, %l3

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L192_ctr32_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L192_ctr32_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	.word	0x85b02301 !movxtod	%g1,%f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	.word	0x8db02301 !movxtod	%g1,%f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[%i0 + 32+63], 20
	.word	0x90cc040e !aes_eround01	%f16,%f14,%f2,%f8
	.word	0x84cc842e !aes_eround23	%f18,%f14,%f2,%f2
	.word	0x94cc0c0e !aes_eround01	%f16,%f14,%f6,%f10
	.word	0x8ccc8c2e !aes_eround23	%f18,%f14,%f6,%f6
	call		_aes192_encrypt_2x+16
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x81b20d80 !fxor	%f8,%f0,%f0		! ^= inp
	.word	0x91b0230b !movxtod	%o3,%f8
	.word	0x85b28d82 !fxor	%f10,%f2,%f2
	.word	0x89b30d84 !fxor	%f12,%f4,%f4
	.word	0x8db20d86 !fxor	%f8,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L192_ctr32_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L192_ctr32_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L192_ctr32_loop2x
	nop

	ret
	restore
.type	aes192_t4_ctr32_encrypt,#function
.size	aes192_t4_ctr32_encrypt,.-aes192_t4_ctr32_encrypt
.globl	aes192_t4_cbc_decrypt
.align	32
aes192_t4_cbc_decrypt:
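! AES-192 CBC decryption.  The running ivec lives in %f12/%f14; each ciphertext
! block is kept in %o0/%o1 across the decrypt call so that, after decryption,
! the previous ivec can be XORed in and the block itself can become the ivec
! for the following block.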
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L192_cbc_dec_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f12	! load ivec
	ld		[%i4 + 4], %f13
	ld		[%i4 + 8], %f14
	ld		[%i4 + 12], %f15
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes192_load_deckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L192cbc_dec_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	srlx		%i2, 4, %i2
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L192_cbc_dec_loop2x
	prefetch	[%i1], 22
.L192_cbc_dec_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	.word	0x81b0230a !movxtod	%o2,%f0
	.word	0x85b0230b !movxtod	%o3,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes192_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L192_cbc_dec_loop2x
	add		%i1, 16, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
.L192_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L192_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L192_cbc_dec_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes192_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L192_cbc_dec_loop2x
	add		%i1, 32, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6
	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L192_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L192cbc_dec_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L192_cbc_dec_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes192_decrypt_2x
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L192_cbc_dec_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L192_cbc_dec_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L192_cbc_dec_loop2x
	nop
	st		%f12, [%i4 + 0]	! write out ivec
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore
.type	aes192_t4_cbc_decrypt,#function
.size	aes192_t4_cbc_decrypt,.-aes192_t4_cbc_decrypt
.globl	aes256_t4_cbc_decrypt
.align	32
aes256_t4_cbc_decrypt:
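! AES-256 variant of the CBC decryption flow above.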
	save		%sp, -STACK_FRAME, %sp
	cmp		%i2, 0
	be,pn		SIZE_T_CC, .L256_cbc_dec_abort
	srln		%i2, 0, %i2		! needed on v8+, "nop" on v9
	sub		%i0, %i1, %l5	! %i0!=%i1
	ld		[%i4 + 0], %f12	! load ivec
	ld		[%i4 + 4], %f13
	ld		[%i4 + 8], %f14
	ld		[%i4 + 12], %f15
	prefetch	[%i0], 20
	prefetch	[%i0 + 63], 20
	call		_aes256_load_deckey
	and		%i0, 7, %l0
	andn		%i0, 7, %i0
	sll		%l0, 3, %l0
	mov		64, %l1
	mov		0xff, %l3
	sub		%l1, %l0, %l1
	and		%i1, 7, %l2
	cmp		%i2, 255
	movrnz		%l2, 0, %l5		! if (	%i1&7 ||
	movleu		SIZE_T_CC, 0, %l5	!	%i2<256 ||
	brnz,pn		%l5, .L256cbc_dec_blk	!	%i0==%i1)
	srl		%l3, %l2, %l3

	andcc		%i2, 16, %g0		! is number of blocks even?
	srlx		%i2, 4, %i2
	.word	0xb3b64340 !alignaddrl	%i1,%g0,%i1
	bz		%icc, .L256_cbc_dec_loop2x
	prefetch	[%i1], 22
.L256_cbc_dec_loop:
	ldx		[%i0 + 0], %o0
	brz,pt		%l0, 4f
	ldx		[%i0 + 8], %o1

	ldx		[%i0 + 16], %o2
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	sllx		%o1, %l0, %o1
	or		%g1, %o0, %o0
	srlx		%o2, %l1, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	.word	0x81b0230a !movxtod	%o2,%f0
	.word	0x85b0230b !movxtod	%o3,%f2

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 16+63], 20
	call		_aes256_decrypt_1x
	add		%i0, 16, %i0

	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b02308 !movxtod	%o0,%f12
	.word	0x9db02309 !movxtod	%o1,%f14

	brnz,pn		%l2, 2f
	sub		%i2, 1, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	brnz,pt		%i2, .L256_cbc_dec_loop2x
	add		%i1, 16, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
.L256_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x89b00900 !faligndata	%f0,%f0,%f4		! handle unaligned output
	.word	0x8db00902 !faligndata	%f0,%f2,%f6
	.word	0x91b08902 !faligndata	%f2,%f2,%f8

	stda		%f4, [%i1 + %l3]0xc0	! partial store
	std		%f6, [%i1 + 8]
	add		%i1, 16, %i1
	orn		%g0, %l3, %l3
	stda		%f8, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256_cbc_dec_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 4f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i1 + 63], 22
	prefetch	[%i0 + 32+63], 20
	call		_aes256_decrypt_2x
	add		%i0, 32, %i0

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	brnz,pn		%l2, 2f
	sub		%i2, 2, %i2

	std		%f0, [%i1 + 0]
	std		%f2, [%i1 + 8]
	std		%f4, [%i1 + 16]
	std		%f6, [%i1 + 24]
	brnz,pt		%i2, .L256_cbc_dec_loop2x
	add		%i1, 32, %i1
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

.align	16
2:	ldxa		[%i0]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	.word	0x91b00900 !faligndata	%f0,%f0,%f8		! handle unaligned output
	.word	0x81b00902 !faligndata	%f0,%f2,%f0
	.word	0x85b08904 !faligndata	%f2,%f4,%f2
	.word	0x89b10906 !faligndata	%f4,%f6,%f4
	.word	0x8db18906 !faligndata	%f6,%f6,%f6
	stda		%f8, [%i1 + %l3]0xc0	! partial store
	std		%f0, [%i1 + 8]
	std		%f2, [%i1 + 16]
	std		%f4, [%i1 + 24]
	add		%i1, 32, %i1
	orn		%g0, %l3, %l3
	stda		%f6, [%i1 + %l3]0xc0	! partial store

	brnz,pt		%i2, .L256_cbc_dec_loop2x+4
	orn		%g0, %l3, %l3
	st		%f12, [%i4 + 0]
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L256cbc_dec_blk:
	add	%i1, %i2, %l5
	and	%l5, 63, %l5	! tail
	sub	%i2, %l5, %i2
	add	%l5, 15, %l5	! round up to 16n
	srlx	%i2, 4, %i2
	srl	%l5, 4, %l5
	sub	%i2, 1, %i2
	add	%l5, 1, %l5

.L256_cbc_dec_blk_loop2x:
	ldx		[%i0 + 0], %o0
	ldx		[%i0 + 8], %o1
	ldx		[%i0 + 16], %o2
	brz,pt		%l0, 5f
	ldx		[%i0 + 24], %o3

	ldx		[%i0 + 32], %o4
	sllx		%o0, %l0, %o0
	srlx		%o1, %l1, %g1
	or		%g1, %o0, %o0
	sllx		%o1, %l0, %o1
	srlx		%o2, %l1, %g1
	or		%g1, %o1, %o1
	sllx		%o2, %l0, %o2
	srlx		%o3, %l1, %g1
	or		%g1, %o2, %o2
	sllx		%o3, %l0, %o3
	srlx		%o4, %l1, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	.word	0x81b0230c !movxtod	%o4,%f0
	.word	0x85b0230d !movxtod	%o5,%f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	.word	0x89b0230c !movxtod	%o4,%f4
	.word	0x8db0230d !movxtod	%o5,%f6

	prefetch	[%i0 + 32+63], 20
	call		_aes256_decrypt_2x
	add		%i0, 32, %i0
	subcc		%i2, 2, %i2

	.word	0x91b02308 !movxtod	%o0,%f8
	.word	0x95b02309 !movxtod	%o1,%f10
	.word	0x81b30d80 !fxor	%f12,%f0,%f0		! ^= ivec
	.word	0x85b38d82 !fxor	%f14,%f2,%f2
	.word	0x99b0230a !movxtod	%o2,%f12
	.word	0x9db0230b !movxtod	%o3,%f14
	.word	0x89b20d84 !fxor	%f8,%f4,%f4
	.word	0x8db28d86 !fxor	%f10,%f6,%f6

	stda		%f0, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f2, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f4, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	add		%i1, 8, %i1
	stda		%f6, [%i1]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		SIZE_T_CC, .L256_cbc_dec_blk_loop2x
	add		%i1, 8, %i1

	add		%l5, %i2, %i2
	andcc		%i2, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L256_cbc_dec_loop
	srl		%i2, 0, %i2
	brnz,pn		%i2, .L256_cbc_dec_loop2x
	nop
	st		%f12, [%i4 + 0]	! write out ivec
	st		%f13, [%i4 + 4]
	st		%f14, [%i4 + 8]
	st		%f15, [%i4 + 12]
	ret
	restore
.type	aes256_t4_cbc_decrypt,#function
.size	aes256_t4_cbc_decrypt,.-aes256_t4_cbc_decrypt
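! _aes256_decrypt_1x: decrypt one 16-byte block held in %f0:%f2 through the
! 14 AES-256 rounds; round keys are pre-loaded in %f16-%f62 by the caller,
! and %f16-%f22 are reloaded from the key schedule at [%i3] for the final
! rounds, then restored before returning.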
.align	32
_aes256_decrypt_1x:
	.word	0x88cc0440 !aes_dround01	%f16,%f0,%f2,%f4
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	ldd		[%i3 + 208], %f16
	ldd		[%i3 + 216], %f18
	.word	0x80cd0444 !aes_dround01	%f20,%f4,%f2,%f0
	.word	0x84cd8464 !aes_dround23	%f22,%f4,%f2,%f2
	ldd		[%i3 + 224], %f20
	ldd		[%i3 + 232], %f22
	.word	0x88ce0440 !aes_dround01	%f24,%f0,%f2,%f4
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x80cf0444 !aes_dround01	%f28,%f4,%f2,%f0
	.word	0x84cf8464 !aes_dround23	%f30,%f4,%f2,%f2
	.word	0x88c84440 !aes_dround01	%f32,%f0,%f2,%f4
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x80c94444 !aes_dround01	%f36,%f4,%f2,%f0
	.word	0x84c9c464 !aes_dround23	%f38,%f4,%f2,%f2
	.word	0x88ca4440 !aes_dround01	%f40,%f0,%f2,%f4
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x80cb4444 !aes_dround01	%f44,%f4,%f2,%f0
	.word	0x84cbc464 !aes_dround23	%f46,%f4,%f2,%f2
	.word	0x88cc4440 !aes_dround01	%f48,%f0,%f2,%f4
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x80cd4444 !aes_dround01	%f52,%f4,%f2,%f0
	.word	0x84cdc464 !aes_dround23	%f54,%f4,%f2,%f2
	.word	0x88ce4440 !aes_dround01	%f56,%f0,%f2,%f4
	.word	0x84cec460 !aes_dround23	%f58,%f0,%f2,%f2
	.word	0x80cf4444 !aes_dround01	%f60,%f4,%f2,%f0
	.word	0x84cfc464 !aes_dround23	%f62,%f4,%f2,%f2
	.word	0x88cc0440 !aes_dround01	%f16,%f0,%f2,%f4
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	.word	0x80cd04c4 !aes_dround01_l	%f20,%f4,%f2,%f0
	.word	0x84cd84e4 !aes_dround23_l	%f22,%f4,%f2,%f2
	ldd		[%i3 + 32], %f20
	retl
	ldd		[%i3 + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
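! _aes256_decrypt_2x: as above, but interleaves two independent blocks
! (%f0:%f2 and %f4:%f6) to hide instruction latency, using %f8/%f10 as
! round temporaries.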

.align	32
_aes256_decrypt_2x:
	.word	0x90cc0440 !aes_dround01	%f16,%f0,%f2,%f8
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c44 !aes_dround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c64 !aes_dround23	%f18,%f4,%f6,%f6
	ldd		[%i3 + 208], %f16
	ldd		[%i3 + 216], %f18
	.word	0x80cd0448 !aes_dround01	%f20,%f8,%f2,%f0
	.word	0x84cd8468 !aes_dround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c4a !aes_dround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c6a !aes_dround23	%f22,%f10,%f6,%f6
	ldd		[%i3 + 224], %f20
	ldd		[%i3 + 232], %f22
	.word	0x90ce0440 !aes_dround01	%f24,%f0,%f2,%f8
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c44 !aes_dround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c64 !aes_dround23	%f26,%f4,%f6,%f6
	.word	0x80cf0448 !aes_dround01	%f28,%f8,%f2,%f0
	.word	0x84cf8468 !aes_dround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c4a !aes_dround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c6a !aes_dround23	%f30,%f10,%f6,%f6
	.word	0x90c84440 !aes_dround01	%f32,%f0,%f2,%f8
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x94c84c44 !aes_dround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc64 !aes_dround23	%f34,%f4,%f6,%f6
	.word	0x80c94448 !aes_dround01	%f36,%f8,%f2,%f0
	.word	0x84c9c468 !aes_dround23	%f38,%f8,%f2,%f2
	.word	0x88c94c4a !aes_dround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc6a !aes_dround23	%f38,%f10,%f6,%f6
	.word	0x90ca4440 !aes_dround01	%f40,%f0,%f2,%f8
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c44 !aes_dround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc64 !aes_dround23	%f42,%f4,%f6,%f6
	.word	0x80cb4448 !aes_dround01	%f44,%f8,%f2,%f0
	.word	0x84cbc468 !aes_dround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c4a !aes_dround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc6a !aes_dround23	%f46,%f10,%f6,%f6
	.word	0x90cc4440 !aes_dround01	%f48,%f0,%f2,%f8
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c44 !aes_dround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc64 !aes_dround23	%f50,%f4,%f6,%f6
	.word	0x80cd4448 !aes_dround01	%f52,%f8,%f2,%f0
	.word	0x84cdc468 !aes_dround23	%f54,%f8,%f2,%f2
	.word	0x88cd4c4a !aes_dround01	%f52,%f10,%f6,%f4
	.word	0x8ccdcc6a !aes_dround23	%f54,%f10,%f6,%f6
	.word	0x90ce4440 !aes_dround01	%f56,%f0,%f2,%f8
	.word	0x84cec460 !aes_dround23	%f58,%f0,%f2,%f2
	.word	0x94ce4c44 !aes_dround01	%f56,%f4,%f6,%f10
	.word	0x8ccecc64 !aes_dround23	%f58,%f4,%f6,%f6
	.word	0x80cf4448 !aes_dround01	%f60,%f8,%f2,%f0
	.word	0x84cfc468 !aes_dround23	%f62,%f8,%f2,%f2
	.word	0x88cf4c4a !aes_dround01	%f60,%f10,%f6,%f4
	.word	0x8ccfcc6a !aes_dround23	%f62,%f10,%f6,%f6
	.word	0x90cc0440 !aes_dround01	%f16,%f0,%f2,%f8
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c44 !aes_dround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c64 !aes_dround23	%f18,%f4,%f6,%f6
	ldd		[%i3 + 16], %f16
	ldd		[%i3 + 24], %f18
	.word	0x80cd04c8 !aes_dround01_l	%f20,%f8,%f2,%f0
	.word	0x84cd84e8 !aes_dround23_l	%f22,%f8,%f2,%f2
	.word	0x88cd0cca !aes_dround01_l	%f20,%f10,%f6,%f4
	.word	0x8ccd8cea !aes_dround23_l	%f22,%f10,%f6,%f6
	ldd		[%i3 + 32], %f20
	retl
	ldd		[%i3 + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
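! _aes192_decrypt_1x: decrypt one 16-byte block in %f0:%f2 through the
! 12 AES-192 rounds; all round keys fit in %f16-%f62, so nothing is
! reloaded from the key schedule.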

.align	32
_aes192_decrypt_1x:
	.word	0x88cc0440 !aes_dround01	%f16,%f0,%f2,%f4
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x80cd0444 !aes_dround01	%f20,%f4,%f2,%f0
	.word	0x84cd8464 !aes_dround23	%f22,%f4,%f2,%f2
	.word	0x88ce0440 !aes_dround01	%f24,%f0,%f2,%f4
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x80cf0444 !aes_dround01	%f28,%f4,%f2,%f0
	.word	0x84cf8464 !aes_dround23	%f30,%f4,%f2,%f2
	.word	0x88c84440 !aes_dround01	%f32,%f0,%f2,%f4
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x80c94444 !aes_dround01	%f36,%f4,%f2,%f0
	.word	0x84c9c464 !aes_dround23	%f38,%f4,%f2,%f2
	.word	0x88ca4440 !aes_dround01	%f40,%f0,%f2,%f4
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x80cb4444 !aes_dround01	%f44,%f4,%f2,%f0
	.word	0x84cbc464 !aes_dround23	%f46,%f4,%f2,%f2
	.word	0x88cc4440 !aes_dround01	%f48,%f0,%f2,%f4
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x80cd4444 !aes_dround01	%f52,%f4,%f2,%f0
	.word	0x84cdc464 !aes_dround23	%f54,%f4,%f2,%f2
	.word	0x88ce4440 !aes_dround01	%f56,%f0,%f2,%f4
	.word	0x84cec460 !aes_dround23	%f58,%f0,%f2,%f2
	.word	0x80cf44c4 !aes_dround01_l	%f60,%f4,%f2,%f0
	retl
	.word	0x84cfc4e4 !aes_dround23_l	%f62,%f4,%f2,%f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
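! _aes192_decrypt_2x: two-block interleaved variant of _aes192_decrypt_1x.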

.align	32
_aes192_decrypt_2x:
	.word	0x90cc0440 !aes_dround01	%f16,%f0,%f2,%f8
	.word	0x84cc8460 !aes_dround23	%f18,%f0,%f2,%f2
	.word	0x94cc0c44 !aes_dround01	%f16,%f4,%f6,%f10
	.word	0x8ccc8c64 !aes_dround23	%f18,%f4,%f6,%f6
	.word	0x80cd0448 !aes_dround01	%f20,%f8,%f2,%f0
	.word	0x84cd8468 !aes_dround23	%f22,%f8,%f2,%f2
	.word	0x88cd0c4a !aes_dround01	%f20,%f10,%f6,%f4
	.word	0x8ccd8c6a !aes_dround23	%f22,%f10,%f6,%f6
	.word	0x90ce0440 !aes_dround01	%f24,%f0,%f2,%f8
	.word	0x84ce8460 !aes_dround23	%f26,%f0,%f2,%f2
	.word	0x94ce0c44 !aes_dround01	%f24,%f4,%f6,%f10
	.word	0x8cce8c64 !aes_dround23	%f26,%f4,%f6,%f6
	.word	0x80cf0448 !aes_dround01	%f28,%f8,%f2,%f0
	.word	0x84cf8468 !aes_dround23	%f30,%f8,%f2,%f2
	.word	0x88cf0c4a !aes_dround01	%f28,%f10,%f6,%f4
	.word	0x8ccf8c6a !aes_dround23	%f30,%f10,%f6,%f6
	.word	0x90c84440 !aes_dround01	%f32,%f0,%f2,%f8
	.word	0x84c8c460 !aes_dround23	%f34,%f0,%f2,%f2
	.word	0x94c84c44 !aes_dround01	%f32,%f4,%f6,%f10
	.word	0x8cc8cc64 !aes_dround23	%f34,%f4,%f6,%f6
	.word	0x80c94448 !aes_dround01	%f36,%f8,%f2,%f0
	.word	0x84c9c468 !aes_dround23	%f38,%f8,%f2,%f2
	.word	0x88c94c4a !aes_dround01	%f36,%f10,%f6,%f4
	.word	0x8cc9cc6a !aes_dround23	%f38,%f10,%f6,%f6
	.word	0x90ca4440 !aes_dround01	%f40,%f0,%f2,%f8
	.word	0x84cac460 !aes_dround23	%f42,%f0,%f2,%f2
	.word	0x94ca4c44 !aes_dround01	%f40,%f4,%f6,%f10
	.word	0x8ccacc64 !aes_dround23	%f42,%f4,%f6,%f6
	.word	0x80cb4448 !aes_dround01	%f44,%f8,%f2,%f0
	.word	0x84cbc468 !aes_dround23	%f46,%f8,%f2,%f2
	.word	0x88cb4c4a !aes_dround01	%f44,%f10,%f6,%f4
	.word	0x8ccbcc6a !aes_dround23	%f46,%f10,%f6,%f6
	.word	0x90cc4440 !aes_dround01	%f48,%f0,%f2,%f8
	.word	0x84ccc460 !aes_dround23	%f50,%f0,%f2,%f2
	.word	0x94cc4c44 !aes_dround01	%f48,%f4,%f6,%f10
	.word	0x8ccccc64 !aes_dround23	%f50,%f4,%f6,%f6
	.word	0x80cd4448 !aes_dround01	%f52,%f8,%f2,%f0
	.word	0x84cdc468 !aes_dround23	%f54,%f8,%f2,%f2
	.word	0x88cd4c4a !aes_dround01	%f52,%f10,%f6,%f4
	.word	0x8ccdcc6a !aes_dround23	%f54,%f10,%f6,%f6
	.word	0x90ce4440 !aes_dround01	%f56,%f0,%f2,%f8
	.word	0x84cec460 !aes_dround23	%f58,%f0,%f2,%f2
	.word	0x94ce4c44 !aes_dround01	%f56,%f4,%f6,%f10
	.word	0x8ccecc64 !aes_dround23	%f58,%f4,%f6,%f6
	.word	0x80cf44c8 !aes_dround01_l	%f60,%f8,%f2,%f0
	.word	0x84cfc4e8 !aes_dround23_l	%f62,%f8,%f2,%f2
	.word	0x88cf4cca !aes_dround01_l	%f60,%f10,%f6,%f4
	retl
	.word	0x8ccfccea !aes_dround23_l	%f62,%f10,%f6,%f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
.align	4