Diffstat (limited to 'lib/crypto/mips/chacha-core.S')
-rw-r--r-- | lib/crypto/mips/chacha-core.S | 491
1 file changed, 491 insertions, 0 deletions
diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S
new file mode 100644
index 000000000000..706aeb850fb0
--- /dev/null
+++ b/lib/crypto/mips/chacha-core.S
@@ -0,0 +1,491 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32		0x3c
+#define CHACHA20_BLOCK_SIZE	64
+#define STACK_SIZE		32
+
+#define X0	$t0
+#define X1	$t1
+#define X2	$t2
+#define X3	$t3
+#define X4	$t4
+#define X5	$t5
+#define X6	$t6
+#define X7	$t7
+#define X8	$t8
+#define X9	$t9
+#define X10	$v1
+#define X11	$s6
+#define X12	$s5
+#define X13	$s4
+#define X14	$s3
+#define X15	$s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0	$s1
+#define T1	$s0
+#define T(n)	T ## n
+#define X(n)	X ## n
+
+/* Input arguments */
+#define STATE		$a0
+#define OUT		$a1
+#define IN		$a2
+#define BYTES		$a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0		$v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used for handling the last bytes which are not a multiple of 4.
+ */
+#define SAVED_X		X15
+#define SAVED_CA	$s7
+
+#define IS_UNALIGNED	$s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define	CPU_TO_LE32(n) \
+	wsbh	n, n; \
+	rotr	n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define CPU_TO_LE32(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+	x( 0); \
+	x( 1); \
+	x( 2); \
+	x( 3); \
+	x( 4); \
+	x( 5); \
+	x( 6); \
+	x( 7); \
+	x( 8); \
+	x( 9); \
+	x(10); \
+	x(11); \
+	x(12); \
+	x(13); \
+	x(14); \
+	x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+	x(15); \
+	x(14); \
+	x(13); \
+	x(12); \
+	x(11); \
+	x(10); \
+	x( 9); \
+	x( 8); \
+	x( 7); \
+	x( 6); \
+	x( 5); \
+	x( 4); \
+	x( 3); \
+	x( 2); \
+	x( 1); \
+	x( 0);
+
+#define PLUS_ONE_0	 1
+#define PLUS_ONE_1	 2
+#define PLUS_ONE_2	 3
+#define PLUS_ONE_3	 4
+#define PLUS_ONE_4	 5
+#define PLUS_ONE_5	 6
+#define PLUS_ONE_6	 7
+#define PLUS_ONE_7	 8
+#define PLUS_ONE_8	 9
+#define PLUS_ONE_9	10
+#define PLUS_ONE_10	11
+#define PLUS_ONE_11	12
+#define PLUS_ONE_12	13
+#define PLUS_ONE_13	14
+#define PLUS_ONE_14	15
+#define PLUS_ONE_15	16
+#define PLUS_ONE(x)	PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c)	a ## b ## c
+#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lwl	T1, (x*4)+MSB ## (IN); \
+	lwr	T1, (x*4)+LSB ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	swl	X ## x, (x*4)+MSB ## (OUT); \
+	swr	X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lw	T1, (x*4) ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	sw	X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
+	addu	X(A), X(K); \
+	addu	X(B), X(L); \
+	addu	X(C), X(M); \
+	addu	X(D), X(N); \
+	xor	X(V), X(A); \
+	xor	X(W), X(B); \
+	xor	X(Y), X(C); \
+	xor	X(Z), X(D); \
+	rotr	X(V), 32 - S; \
+	rotr	X(W), 32 - S; \
+	rotr	X(Y), 32 - S; \
+	rotr	X(Z), 32 - S;
+
+.text
+.set	reorder
+.set	noat
+.globl	chacha_crypt_arch
+.ent	chacha_crypt_arch
+chacha_crypt_arch:
+	.frame	$sp, STACK_SIZE, $ra
+
+	/* Load number of rounds */
+	lw	$at, 16($sp)
+
+	addiu	$sp, -STACK_SIZE
+
+	/* Return bytes = 0. */
+	beqz	BYTES, .Lchacha_mips_end
+
+	lw	NONCE_0, 48(STATE)
+
+	/* Save s0-s7 */
+	sw	$s0,  0($sp)
+	sw	$s1,  4($sp)
+	sw	$s2,  8($sp)
+	sw	$s3, 12($sp)
+	sw	$s4, 16($sp)
+	sw	$s5, 20($sp)
+	sw	$s6, 24($sp)
+	sw	$s7, 28($sp)
+
+	/* Test if IN or OUT is unaligned.
+	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+	 */
+	or	IS_UNALIGNED, IN, OUT
+	andi	IS_UNALIGNED, 0x3
+
+	b	.Lchacha_rounds_start
+
+.align 4
+.Loop_chacha_rounds:
+	addiu	IN,  CHACHA20_BLOCK_SIZE
+	addiu	OUT, CHACHA20_BLOCK_SIZE
+	addiu	NONCE_0, 1
+
+.Lchacha_rounds_start:
+	lw	X0,  0(STATE)
+	lw	X1,  4(STATE)
+	lw	X2,  8(STATE)
+	lw	X3,  12(STATE)
+
+	lw	X4,  16(STATE)
+	lw	X5,  20(STATE)
+	lw	X6,  24(STATE)
+	lw	X7,  28(STATE)
+	lw	X8,  32(STATE)
+	lw	X9,  36(STATE)
+	lw	X10, 40(STATE)
+	lw	X11, 44(STATE)
+
+	move	X12, NONCE_0
+	lw	X13, 52(STATE)
+	lw	X14, 56(STATE)
+	lw	X15, 60(STATE)
+
+.Loop_chacha_xor_rounds:
+	addiu	$at, -2
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	bnez	$at, .Loop_chacha_xor_rounds
+
+	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
+
+	/* Is data src/dst unaligned? Jump */
+	bnez	IS_UNALIGNED, .Loop_chacha_unaligned
+
+	/* Set number of rounds here to fill delay slot. */
+	lw	$at, (STACK_SIZE+16)($sp)
+
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha_mips_no_full_block_aligned
+
+	FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha_rounds
+
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+
+	/* BYTES < 0? Handle last bytes */
+	bltz	BYTES, .Lchacha_mips_xor_bytes
+
+.Lchacha_mips_xor_done:
+	/* Restore used registers */
+	lw	$s0,  0($sp)
+	lw	$s1,  4($sp)
+	lw	$s2,  8($sp)
+	lw	$s3, 12($sp)
+	lw	$s4, 16($sp)
+	lw	$s5, 20($sp)
+	lw	$s6, 24($sp)
+	lw	$s7, 28($sp)
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+.Lchacha_mips_end:
+	addiu	$sp, STACK_SIZE
+	jr	$ra
+
+.Lchacha_mips_no_full_block_aligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha_unaligned:
+	/* Set number of rounds here to fill delay slot. */
+	lw	$at, (STACK_SIZE+16)($sp)
+
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned
+
+	FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha_rounds
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+	.set noreorder
+	/* Fall through to byte handling */
+	bgez	BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+	.set reorder
+
+.Lchacha_mips_xor_bytes:
+	addu	IN, $at
+	addu	OUT, $at
+	/* First byte */
+	lbu	T1, 0(IN)
+	addiu	$at, BYTES, 1
+	xor	T1, SAVED_X
+	sb	T1, 0(OUT)
+	beqz	$at, .Lchacha_mips_xor_done
+	/* Second byte */
+	lbu	T1, 1(IN)
+	addiu	$at, BYTES, 2
+	rotr	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 1(OUT)
+	beqz	$at, .Lchacha_mips_xor_done
+	/* Third byte */
+	lbu	T1, 2(IN)
+	rotr	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 2(OUT)
+	b	.Lchacha_mips_xor_done
+
+.Lchacha_mips_no_full_block_unaligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE	$a0
+ * OUT		$a1
+ * NROUND	$a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12	$a3
+#define X13	$at
+#define X14	$v0
+#define X15	STATE
+
+.set noat
+.globl	hchacha_block_arch
+.ent	hchacha_block_arch
+hchacha_block_arch:
+	.frame	$sp, STACK_SIZE, $ra
+
+	addiu	$sp, -STACK_SIZE
+
+	/* Save X11(s6) */
+	sw	X11, 0($sp)
+
+	lw	X0,  0(STATE)
+	lw	X1,  4(STATE)
+	lw	X2,  8(STATE)
+	lw	X3,  12(STATE)
+	lw	X4,  16(STATE)
+	lw	X5,  20(STATE)
+	lw	X6,  24(STATE)
+	lw	X7,  28(STATE)
+	lw	X8,  32(STATE)
+	lw	X9,  36(STATE)
+	lw	X10, 40(STATE)
+	lw	X11, 44(STATE)
+	lw	X12, 48(STATE)
+	lw	X13, 52(STATE)
+	lw	X14, 56(STATE)
+	lw	X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+	addiu	$a2, -2
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	bnez	$a2, .Loop_hchacha_xor_rounds
+
+	/* Restore used register */
+	lw	X11, 0($sp)
+
+	sw	X0,  0(OUT)
+	sw	X1,  4(OUT)
+	sw	X2,  8(OUT)
+	sw	X3,  12(OUT)
+	sw	X12, 16(OUT)
+	sw	X13, 20(OUT)
+	sw	X14, 24(OUT)
+	sw	X15, 28(OUT)
+
+	addiu	$sp, STACK_SIZE
+	jr	$ra
+.end hchacha_block_arch
+.set at
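
For readers less familiar with ChaCha, the following is a minimal portable-C sketch (illustration only, not part of the patch) of what one AXR() invocation computes. AXR(A,B,C,D, K,L,M,N, V,W,Y,Z, S) performs one add/xor/rotate step of the quarter round across four columns (or diagonals) in parallel, and it uses "rotr x, 32 - S" to get a left rotate by S because MIPS32r2 only provides a rotate-right instruction; eight AXR() lines therefore form one ChaCha double round.

#include <stdint.h>

/* Left rotate; the assembly gets the same effect via rotr by (32 - s). */
static inline uint32_t rol32(uint32_t v, unsigned int s)
{
	return (v << s) | (v >> (32 - s));
}

/*
 * One ChaCha quarter round on a single column (a, b, c, d).  The first
 * AXR() of a round does the "a += b; d ^= a; d = rol32(d, 16);" step for
 * all four columns at once, the second does the
 * "c += d; b ^= c; b = rol32(b, 12);" step, and so on.
 */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
				 uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}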
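Likewise, a rough C model of the data path around the STORE_ALIGNED()/STORE_UNALIGNED() macros and the trailing-byte code may help; this is an assumption-laden sketch for illustration, not the kernel's chacha_crypt_arch() interface, and the helper name is hypothetical. After the rounds, each keystream word is the round output plus the corresponding word of the per-block input state (word 12 being the NONCE_0 block counter), converted to little endian and XORed into the input; bytes beyond the last full word are XORed one at a time.

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical helper: XOR one (possibly partial) ChaCha block.
 * x[] holds the 16 state words after the rounds, init[] the per-block
 * input state (including the incremented block counter in word 12), and
 * bytes is at most 64.  The byte extraction models the little-endian
 * keystream layout that CPU_TO_LE32() guarantees on big-endian CPUs.
 */
static void chacha_xor_block(uint32_t x[16], const uint32_t init[16],
			     uint8_t *out, const uint8_t *in, size_t bytes)
{
	size_t i;

	for (i = 0; i < 16; i++)
		x[i] += init[i];	/* feed-forward of the input state */

	for (i = 0; i < bytes; i++)	/* also covers the partial tail */
		out[i] = in[i] ^ (uint8_t)(x[i / 4] >> (8 * (i % 4)));
}

The jump tables in the assembly simply branch into the per-word store sequence at the right offset, so only the remaining full words are processed before the byte-at-a-time tail handling.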
