summaryrefslogtreecommitdiff
path: root/arch/ia64/lib/ip_fast_csum.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/lib/ip_fast_csum.S')
-rw-r--r--arch/ia64/lib/ip_fast_csum.S90
1 files changed, 90 insertions, 0 deletions
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
new file mode 100644
index 00000000000..19674ca2acf
--- /dev/null
+++ b/arch/ia64/lib/ip_fast_csum.S
@@ -0,0 +1,90 @@
+/*
+ * Optimized version of the ip_fast_csum() function
+ * Used for calculating IP header checksum
+ *
+ * Return: 16bit checksum, complemented
+ *
+ * Inputs:
+ * in0: address of buffer to checksum (char *)
+ * in1: length of the buffer in 32-bit words (int); 5 means 20 bytes
+ *
+ * Copyright (C) 2002 Intel Corp.
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ */
+
+#include <asm/asmmacro.h>
+
+/*
+ * Since we know that most likely this function is called with buf aligned
+ * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
+ * versus calling the generic version, do_csum, which has lots of overhead in
+ * handling various alignments and sizes. However, due to the lack of constraints
+ * put on the function's input arguments, cases with alignment not on 4 bytes or
+ * size not equal to 20 bytes will be handled by the generic do_csum function.
+ */
+
+#define in0 r32 /* first argument: address of the buffer to checksum */
+#define in1 r33 /* second argument: length, compared against 5 and scaled by 4 below */
+#define ret0 r8 /* register in which the checksum is returned */
+
+GLOBAL_ENTRY(ip_fast_csum) // inline fast path for in1 == 5 and 4-byte-aligned in0; anything else goes to do_csum
+ .prologue // empty prologue: the fast path saves nothing
+ .body
+ cmp.ne p6,p7=5,in1 // p6 = (in1 != 5), p7 = (in1 == 5); 5 words = 20 bytes
+ and r14=3,in0 // r14 = low two address bits; nonzero means not 4-byte aligned
+ add r15=4,in0 // r15 = in0 + 4: second pointer, so the two pointers alternate words
+ ;;
+ cmp.ne.or.andcm p6,p7=r14,r0 // if misaligned, force p6 = 1 and p7 = 0; otherwise leave both as set above
+ ;;
+(p7) ld4 r20=[in0],8 // word 0 (offset 0), only on the fast path; in0 advances by 8
+(p7) ld4 r21=[r15],8 // word 1 (offset 4), only on the fast path; r15 advances by 8
+(p6) br.spnt .generic // wrong size or alignment: take the do_csum path
+ ;;
+ ld4 r22=[in0],8 // word 2 (offset 8)
+ ld4 r23=[r15],8 // word 3 (offset 12)
+ ;;
+ ld4 r24=[in0] // word 4 (offset 16)
+ add r20=r20,r21 // r20 = word0 + word1
+ add r22=r22,r23 // r22 = word2 + word3
+ ;;
+ add r20=r20,r22 // r20 = word0 + word1 + word2 + word3
+ ;;
+ add r20=r20,r24 // r20 = sum of all five words
+ ;;
+ shr.u ret0=r20,16 // ret0 = everything above the low 16 bits (the carries to fold in)
+ zxt2 r20=r20 // r20 = low 16 bits of the sum
+ ;;
+ add r20=ret0,r20 // first fold: high part + low 16 bits
+ ;;
+ shr.u ret0=r20,16 // second fold: split off the new carry
+ zxt2 r20=r20
+ ;;
+ add r20=ret0,r20 // second fold: add that carry back in
+ ;;
+ shr.u ret0=r20,16 // third fold round, same split as above
+ zxt2 r20=r20
+ ;;
+ add r20=ret0,r20 // r20 = folded sum after the last round
+ ;;
+ andcm ret0=-1,r20 // ret0 = ~r20: complement of the folded sum
+ .restore sp // reset unwind frame state before the second .prologue below
+ br.ret.sptk.many b0 // fast-path return
+ ;;
+
+.generic: // slow path: any other length or alignment
+ .prologue
+ .save ar.pfs, r35 // unwind info: ar.pfs is kept in r35
+ alloc r35=ar.pfs,2,2,2,0 // frame of 2 inputs, 2 locals (r34, r35) and 2 outputs
+ .save rp, r34 // unwind info: the return pointer is kept in r34
+ mov r34=b0 // keep the return address; the call below overwrites b0
+ .body
+ dep.z out1=in1,2,30 // out1 = (low 30 bits of in1) << 2, i.e. word count -> byte count
+ mov out0=in0 // out0 = buffer address
+ ;;
+ br.call.sptk.many b0=do_csum // do_csum(out0, out1); its result is read from ret0 below
+ ;;
+ andcm ret0=-1,ret0 // complement do_csum's result, as the fast path does
+ mov ar.pfs=r35 // restore ar.pfs saved by the alloc above
+ mov b0=r34 // restore the return address
+ br.ret.sptk.many b0
+END(ip_fast_csum)