/* Optimized memset implementation for PowerPC32/POWER7.
   Copyright (C) 2010 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   Register roles set up here and relied on by the rest of the file:
     r3   destination as passed in; no instruction in this file writes
	  it, so it is still the return value at every blr.
     r4   fill byte replicated into all four bytes of the word.
     r5   byte count.
     r7   copy of the entry stack pointer (r1).
     r10  running destination pointer.
     cr6  n compared with 8; cr7 n compared with 31.  */

	.machine  power7
EALIGN (BP_SYM (memset), 5, 0)
	CALL_MCOUNT

	.align	4
L(_memset):
	cmplwi	cr7,5,31
	cmplwi	cr6,5,8
	mr	10,3		/* Save original argument for later.  */
	mr	7,1		/* Save original r1 for later.  */
	/* r7 keeps the entry value of r1 until the function returns, while
	   the long-length path lowers r1 to obtain scratch stack space.
	   Describe the CFA in terms of r7 so that the unwind information
	   stays valid on every path.  No callee-saved register is ever
	   stored by this function, so no cfi_offset entry is needed.  */
	cfi_def_cfa_register(7)

	/* Replicate byte to word: first copy the low byte into bits 16-23,
	   then copy the resulting low halfword into the high halfword.  */
	rlwimi	4,4,8,16,23
	rlwimi	4,4,16,0,15

	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */

	neg	0,3		/* r0 = -DST; its low bits give the distance to
				   the next alignment boundary.  */
	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */

	/* Long path (length >= 32).  Allocate a 32-byte scratch frame and
	   save our word twice, at 24(r1) and 28(r1), to create a doubleword
	   that we will later copy to a FPR/VSR.  r7 still holds the entry
	   value of r1, which is what the exit paths restore.  */
	stwu	1,-32(1)
	andi.	11,10,7		/* Check alignment of DST.  */
	mr	12,5		/* Keep the original length; the dcbz path reads
				   it back when it computes its tail.  */
	stw	4,24(1)
	stw	4,28(1)
	beq	L(big_aligned)	/* cr0 from the andi.: DST is 8-byte aligned.  */

	/* r0 was set to -DST before we got here, so its low three bits are
	   the number of bytes (1-7) up to the next 8-byte boundary.  Move
	   them into cr7 for the bit tests below and take them off the
	   remaining length in r5.  */
	clrlwi	0,0,29
	mtocrf	0x01,0
	subf	5,0,5

	/* Get DST aligned to 8 bytes.  cr7 bit 31 selects a byte store,
	   bit 30 a halfword store and bit 29 a word store.  */
1:	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	bf	30,4f

	sth	4,0(10)
	addi	10,10,2
4:	bf	29,L(big_aligned)

	stw	4,0(10)
	addi	10,10,4

	.align	4
	/* DST (r10) is 8-byte aligned and r5 holds the number of bytes
	   still to be written.  Choose between the dcbz path, taken only
	   for a zero fill of more than 255 bytes, and the store loops.
	   NOTE(review): the results of "li 0,32" and "cmplwi cr1,5,160"
	   do not appear to be consumed: r0 is rewritten before it is read
	   again on every path and cr1 is recomputed below before any
	   branch tests it.  Left in place; confirm before removing.  */
L(big_aligned):
	cmplwi	cr5,5,255	/* cr5.gt: more than 255 bytes remain.  */
	li	0,32
	cmplwi	cr1,5,160
	dcbtst	0,10		/* Touch-for-store hint for the block at DST.  */
	cmplwi	cr6,4,0		/* cr6.eq: the fill word is zero.  */
	srwi	9,5,3		/* Number of full doublewords remaining.  */
	crand	27,26,21	/* CR bit 27 = cr6.eq AND cr5.gt.  */
	mtocrf	0x01,9		/* cr7 = low four bits of the doubleword count.  */
	bt	27,L(huge)

	/* From this point on either the value isn't 0 or no more than 255
	   bytes remain, so dcbz is not used.  At least three doublewords
	   remain (length was >= 32 and alignment took at most 7 bytes).
	   r8/CTR = number of 32-byte blocks, r11 = tail bytes (0-7),
	   cr6.eq = no tail bytes, cr1 = doubleword count compared with 4.  */

	srwi	8,5,5
	clrlwi	11,5,29
	cmplwi	cr6,11,0
	cmplwi	cr1,9,4
	mtctr	8

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  cr7 bit 30 is set when two doublewords
	are owed, bit 31 when one more is owed.  */

	bf	30,1f

	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
	mr	12,10		/* L(tail_bytes) writes through r12.  */
	blt	cr1,L(tail_bytes)	/* Fewer than four doublewords in total,
					   so there is no 32-byte block.  */

	b	L(big_loop)

	.align	4
1:	/* Copy 1 doubleword.  */
	bf	31,L(big_loop)

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
	/* Fall through into L(big_loop).  */

	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
	   to the lfd we will do next.  Also, ping-pong through r10 and r12
	   to avoid AGEN delays.

	   CTR holds the number of 32-byte blocks left and is non-zero on
	   entry.  Nothing branches back to L(big_loop): it writes at most
	   two blocks with word stores and then hands over to the VSX
	   loop.  Every exit to L(tail_bytes) leaves r12 pointing just past
	   the last byte written.  */
	.align	4
L(big_loop):
	addi	12,10,32	/* r12 = end of this block / start of the next.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	stw	4,16(10)
	stw	4,20(10)
	stw	4,24(10)
	stw	4,28(10)
	bdz	L(tail_bytes)	/* That was the last 32-byte block.  */

	addi	10,10,64	/* r10 = end of the block written through r12.  */
	stw	4,0(12)
	stw	4,4(12)
	stw	4,8(12)
	stw	4,12(12)
	stw	4,16(12)
	stw	4,20(12)
	stw	4,24(12)
	stw	4,28(12)
	bdnz	L(big_loop_fast_setup)	/* More blocks remain.  */

	mr	12,10
	b	L(tail_bytes)

	/* Now that we're probably past the LHS window, use the VSX to
	   speed up the loop.  Load the doubleword saved at 24(r1) and
	   splat it across VSR 4; r6 = 16 is the index used to address the
	   second half of each 32-byte block.  r11 may be reused as the
	   load index because cr6 already records whether tail bytes
	   exist.  */
L(big_loop_fast_setup):
	li	11,24
	li	6,16
	lxvdsx	4,1,11

	/* Same ping-pong structure as above, with two 16-byte vector
	   stores per 32-byte block.  */
	.align	4
L(big_loop_fast):
	addi	12,10,32
	stxvd2x	4,0,10		/* Bytes 0-15 at r10.  */
	stxvd2x	4,10,6		/* Bytes 16-31 at r10.  */
	bdz	L(tail_bytes)

	addi	10,10,64
	stxvd2x	4,0,12		/* Bytes 0-15 at r12.  */
	stxvd2x	4,12,6		/* Bytes 16-31 at r12.  */
	bdnz	L(big_loop_fast)

	mr	12,10		/* Fall through into L(tail_bytes).  */

	.align	4
	/* Common exit of the store loops.  On entry r12 points at the first
	   byte not yet written, cr6.eq is set when no tail bytes remain,
	   and the low three bits of r5 hold the number of tail bytes.  r3
	   still holds the original destination, which is the return
	   value.  */
L(tail_bytes):

	/* Check for tail bytes.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6		/* Nothing left: return.  */

	clrlwi	0,5,29		/* r0 = remaining length modulo 8.  */
	mtocrf	0x01,0		/* cr7 bits 29/30/31 = 4/2/1-byte stores.  */

	/*  At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(12)
	addi	12,12,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	sth	4,0(12)
	addi	12,12,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(12)
	blr


	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
	   dcbz though, we need to get the destination 128-bytes aligned.

	   On entry r10 is 8-byte aligned, r5 holds the remaining length
	   (more than 255) and the scratch frame is still allocated.  f4 is
	   loaded with the doubleword saved at 24(r1), which is zero on
	   this path.  */
	.align	4
L(huge):
	lfd	4,24(1)
	andi.	11,10,127	/* cr0.eq: DST is already 128-byte aligned.  */
	neg	0,10
	beq	L(huge_aligned)

	clrlwi	0,0,25		/* r0 = bytes up to the next 128-byte boundary.  */
	subf	5,0,5		/* Take them off the remaining length.  */
	srwi	0,0,3		/* Convert to doublewords (1-15).  */
	mtocrf  0x01,0		/* cr7 bits 28/29/30/31 = 8/4/2/1 doublewords.  */

	/* Get DST aligned to 128 bytes.  */
8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(huge_aligned)

	stfd	4,0(10)
	addi	10,10,8
	/* Fall through into L(huge_aligned).  */

	/* DST (r10) is 128-byte aligned.  r8/CTR = number of 128-byte
	   blocks, r11 = bytes left over afterwards, cr6.eq = no leftover.
	   At least one block remains: this path starts with more than 255
	   bytes and the alignment above used at most 120 of them.  */
L(huge_aligned):
	srwi	8,5,7
	clrlwi	11,5,25
	cmplwi	cr6,11,0
	mtctr	8

	/* Copies 128-bytes at a time.  The pointer advances by 128 per
	   dcbz, so this relies on dcbz clearing a 128-byte block.  */
	.align	4
L(huge_loop):
	dcbz	0,10
	addi	10,10,128
	bdnz	L(huge_loop)

	/* We have a tail of 0~127 bytes to handle.  */
	mr	1,7		/* Restore r1.  */
	beqlr	cr6

	/* Recompute the remaining length from the original length kept in
	   r12 and the number of bytes already written (r10 - r3).  */
	subf	9,3,10
	subf	5,9,12
	srwi	8,5,3		/* r8 = full doublewords in the tail.  */
	cmplwi	cr6,8,0
	mtocrf	0x01,8		/* cr7 bits 28/29/30/31 = 8/4/2/1 doublewords.  */

	/* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
	speed.  We'll handle the resulting tail bytes later.  */
	beq	cr6,L(tail)

8:	bf	28,4f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	stfd	4,32(10)
	stfd	4,40(10)
	stfd	4,48(10)
	stfd	4,56(10)
	addi	10,10,64
	.align	4
4:	bf	29,2f

	stfd	4,0(10)
	stfd	4,8(10)
	stfd	4,16(10)
	stfd	4,24(10)
	addi	10,10,32
	.align	4
2:	bf	30,1f

	stfd	4,0(10)
	stfd	4,8(10)
	addi	10,10,16
	.align	4
1:	bf	31,L(tail)

	stfd	4,0(10)
	addi	10,10,8

	/* Handle the rest of the tail bytes here.  cr7 is reloaded from
	   the low bits of the byte count in r5; only the 4-, 2- and 1-byte
	   bits (29, 30, 31) are tested.  r4 is zero on this path.  */
L(tail):
	mtocrf	0x01,5

	.align	4
4:	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
	.align	4
2:	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
	.align	4
1:	bflr	31

	stb	4,0(10)
	blr


	/* Expanded tree to copy tail bytes without increments.

	   Entered from L(small) with r10 = DST and cr7 holding the low
	   four bits of the length, which is below 8 on that path.  Bits
	   29, 30 and 31 select a word, halfword and byte store; each leaf
	   uses a fixed displacement instead of advancing r10.  The label
	   names appear to encode the tests made so far for the 4-, 2- and
	   1-byte bits (T = set, F = clear, X = not yet tested).  */
	.align	4
L(copy_tail):
	bf	29,L(FXX)

	stw	4,0(10)		/* Bytes 0-3.  */
	bf	30,L(TFX)

	sth	4,4(10)		/* Bytes 4-5.  */
	bflr	31

	stb	4,6(10)		/* Byte 6.  */
	blr

	/* 4-byte bit clear.  */
	.align	4
L(FXX):	bf	30,L(FFX)

	sth	4,0(10)		/* Bytes 0-1.  */
	bflr	31

	stb	4,2(10)		/* Byte 2.  */
	blr

	/* 4-byte bit set, 2-byte bit clear.  */
	.align	4
L(TFX):	bflr	31

	stb	4,4(10)		/* Byte 4.  */
	blr

	/* 4-byte and 2-byte bits clear.  */
	.align	4
L(FFX):	bflr	31

	stb	4,0(10)		/* Byte 0.  */
	blr

	/* Handle copies of 9~31 bytes.  On entry r0 holds -DST (computed
	   just before the branch here), r5 the length and r10 DST.  No
	   stack frame has been allocated on this path.  */
	.align	4
L(medium):
	/* At least 9 bytes to go.  */
	andi.	11,10,3		/* cr0.eq: DST is already word-aligned.  */
	clrlwi	0,0,30		/* r0 = bytes (0-3) up to the next word boundary.  */
	beq	L(medium_aligned)

	/* Force 4-bytes alignment for DST.  cr7 bit 31 selects a byte
	   store and bit 30 a halfword store.  */
	mtocrf	0x01,0
	subf	5,0,5
1:	/* Copy 1 byte.  */
	bf	31,2f

	stb	4,0(10)
	addi	10,10,1
2:	/* Copy 2 bytes.  */
	bf	30,L(medium_aligned)

	sth	4,0(10)
	addi	10,10,2

	.align	4
L(medium_aligned):
	/* At least 6 bytes to go, and DST is word-aligned.  r5 is at most
	   31, so one optional 16-byte step (decided by cr1) plus the low
	   four bits of r5 (moved into cr7) cover the whole length.  */
	cmplwi	cr1,5,16
	mtocrf	0x01,5
	blt	cr1,8f

	/* Copy 16 bytes.  */
	stw	4,0(10)
	stw	4,4(10)
	stw	4,8(10)
	stw	4,12(10)
	addi	10,10,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	stw	4,0(10)
	stw	4,4(10)
	addi	10,10,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	stw	4,0(10)
	addi	10,10,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	sth	4,0(10)
	addi	10,10,2
1:	/* Copy 1 byte.  */
	bflr	31

	stb	4,0(10)
	blr

	/* Handles copies of 0~8 bytes.  cr6 still holds the comparison of
	   the length with 8 made at entry.  A length of exactly 8 is
	   written with two word stores; anything shorter goes through the
	   L(copy_tail) tree, driven by the low bits of the length placed
	   in cr7.  No stack frame has been allocated on this path.  */
	.align	4
L(small):
	mtocrf	0x01,5
	bne	cr6,L(copy_tail)	/* Length is 0-7.  */

	stw	4,0(10)
	stw	4,4(10)
	blr

END (BP_SYM (memset))
libc_hidden_builtin_def (memset)