/* From the Intel IA-64 Optimization Guide, choose the minimum latency
   alternative.  */

/* sysdep.h is expected to supply ENTRY/END and the farg0/farg1/fret0/
   ret0/ret1 register names; shlib-compat.h is expected to supply
   SHLIB_COMPAT.  */
#include <sysdep.h>
#undef ret

#include <shlib-compat.h>

#if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)

/* Conventions shared by the three floating-point divide routines below:

   p7 is first forced to 1 (r0 == r0).  frcpa writes a reciprocal
   approximation to f10 and a predicate to p6.  When p6 is set, p7 is
   cleared again (r0 != r0 is false) and the p6-predicated refinement
   sequence computes the result.  When p6 is clear, p7 stays set and the
   value frcpa left in f10 is returned unchanged.  .pred.rel.mutex tells
   the assembler that p6 and p7 are never both set.  */

/* __divtf3
   Compute an 80-bit IEEE double-extended quotient.

   farg0 holds the dividend.  farg1 holds the divisor.
   The result is returned in fret0.  Scratch: f10-f14, p6, p7.  */

ENTRY(___divtf3)
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f11 = 1 - b*y (error of the reciprocal), f12 = a*y (first
	   quotient estimate), with y = f10.  */
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
	/* Refined reciprocal in f10; f11 = a - b*q (remainder of the
	   first quotient estimate).  */
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
	/* Corrected quotient in f11; f12 = 1 - b*y for the refined y.  */
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	/* Final correction step: q + r*y.  */
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
END(___divtf3)
	.symver	___divtf3, __divtf3@GLIBC_2.2

/* __divdf3
   Compute a 64-bit IEEE double quotient.

   farg0 holds the dividend.  farg1 holds the divisor.
   The result is returned in fret0.  Scratch: f8, f10-f13, p6, p7.  */

ENTRY(___divdf3)
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f11 = a*y (quotient estimate), f12 = 1 - b*y (error term).  */
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
	/* Each group below refines the quotient (f11) and the reciprocal
	   (f10) with the current error term, then squares the error.  */
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	/* f8 = a - b*q.  The destination is written as f8 while the
	   operand is written as farg0, exactly as in the original.  */
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
	/* Final correction step: q + r*y.  */
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
END(___divdf3)
	.symver	___divdf3, __divdf3@GLIBC_2.2

/* __divsf3
   Compute a 32-bit IEEE float quotient.

   farg0 holds the dividend.  farg1 holds the divisor.
   The result is returned in fret0.  Scratch: f8-f10, p6, p7.  */

ENTRY(___divsf3)
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	/* f8 = a*y (quotient estimate), f9 = 1 - b*y (error term).  */
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
	/* q += e*q, then e = e*e; applied twice, then once more into f10.  */
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	/* Deliver the result through a single-precision fnorm.  */
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
END(___divsf3)
	.symver	___divsf3, __divsf3@GLIBC_2.2

/* __divdi3
   Compute a 64-bit integer quotient.

   in0 holds the dividend.  in1 holds the divisor.
   The quotient is returned in ret0.  Scratch: f8-f13, p6.  */

ENTRY(___divdi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, so that they won't be treated as
	   unsigned.  */
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	/* Compute the reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Round quotient to an integer.  This is not predicated: when p6
	   is clear it converts the value frcpa left in f10.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___divdi3)
	.symver	___divdi3, __divdi3@GLIBC_2.2

/* __moddi3
   Compute a 64-bit integer modulus.

   in0 holds the dividend (a).  in1 holds the divisor (b).
   The remainder is returned in ret0.  in1 is overwritten with -b.
   Scratch: f8-f14, p6.  */

ENTRY(___moddi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  f14 keeps the integer image of
	   a for the final multiply-add.  */
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, so that they won't be treated as
	   unsigned.  */
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	;;
	/* Compute the reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* Negate b in the integer unit while the FP pipeline is busy.  */
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* f9 is last read as FP b in the previous group; reload it with the
	   integer image of -b.  */
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Round quotient to an integer.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___moddi3)
	.symver	___moddi3, __moddi3@GLIBC_2.2

/* __udivdi3
   Compute a 64-bit unsigned integer quotient.

   in0 holds the dividend.  in1 holds the divisor.
   The quotient is returned in ret0.  Scratch: f8-f13, p6.  */

ENTRY(___udivdi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, to avoid FP software-assist faults.  */
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	/* Compute the reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Round quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___udivdi3)
	.symver	___udivdi3, __udivdi3@GLIBC_2.2

/* __umoddi3
   Compute a 64-bit unsigned integer modulus.

   in0 holds the dividend (a).  in1 holds the divisor (b).
   The remainder is returned in ret0.  in1 is overwritten with -b.
   Scratch: f8-f14, p6.  */

ENTRY(___umoddi3)
	.regstk 2,0,0,0
	/* Transfer inputs to FP registers.  f14 keeps the integer image of
	   a for the final multiply-add.  */
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	/* Convert the inputs to FP, to avoid FP software assist faults.  */
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	;;
	/* Compute the reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9
	;;
	/* 3 Newton-Raphson iterations.  */
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	/* Negate b in the integer unit while the FP pipeline is busy.  */
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	/* f9 is last read as FP b in the previous group; reload it with the
	   integer image of -b.  */
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	/* Round quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Transfer result to GP registers.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___umoddi3)
	.symver	___umoddi3, __umoddi3@GLIBC_2.2

/* __multi3
   Compute a 128-bit multiply of 128-bit multiplicands.

   in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b).
   The low 64 bits of the product are returned in ret0, the high 64 bits
   in ret1.

   The low words in0 and in2 are split into 32-bit halves (AL/AH and
   BL/BH below) so that the full 128-bit product in0 * in2 can be built
   from four 32x32 partial products.  The cross terms in0 * in3 and
   in1 * in2 only contribute their low 64 bits to the high word.

   Scratch: r14-r22, f6-f11, p6, p7.  */

ENTRY(___multi3)
	.regstk 4,0,0,0
	setf.sig f6 = in1
	movl r19 = 0xffffffff		/* 32-bit mask.  */
	setf.sig f7 = in2
	;;
	and r14 = r19, in0		/* AL */
	;;
	setf.sig f10 = r14		/* f10 = AL */
	and r14 = r19, in2		/* BL */
	xmpy.l f9 = f6, f7		/* f9 = low64 (in1 * in2) */
	;;
	setf.sig f6 = r14		/* f6 = BL */
	shr.u r14 = in0, 32		/* AH */
	;;
	setf.sig f7 = r14		/* f7 = AH */
	shr.u r14 = in2, 32		/* BH */
	;;
	setf.sig f8 = r14		/* f8 = BH */
	xmpy.l f11 = f10, f6		/* f11 = AL * BL */
	xmpy.l f6 = f7, f6		/* f6 = AH * BL */
	;;
	getf.sig r16 = f11
	xmpy.l f7 = f7, f8		/* f7 = AH * BH */
	;;
	shr.u r14 = r16, 32		/* High half of AL * BL.  */
	and r16 = r19, r16		/* r16 = low 32 bits of the result.  */
	getf.sig r17 = f6		/* r17 = AH * BL */
	setf.sig f6 = in0
	;;
	setf.sig f11 = r14
	getf.sig r21 = f7		/* r21 = AH * BH */
	setf.sig f7 = in3
	;;
	xma.l f11 = f10, f8, f11	/* AL * BH + (AL * BL >> 32) */
	xma.l f6 = f6, f7, f9		/* in0 * in3 + in1 * in2 */
	;;
	getf.sig r18 = f11
	;;
	add r18 = r18, r17		/* Middle sum; may wrap past 2^64.  */
	;;
	and r15 = r19, r18
	cmp.ltu p7, p6 = r18, r17	/* p7 = the middle sum wrapped.  */
	;;
	getf.sig r22 = f6
(p7)	adds r14 = 1, r19		/* r14 = 2^32, the carry's weight.  */
	;;
(p7)	add r21 = r21, r14
	shr.u r14 = r18, 32
	shl r15 = r15, 32
	;;
	add r20 = r21, r14		/* High word of in0 * in2.  */
	;;
	add ret0 = r15, r16
	add ret1 = r22, r20
	br.ret.sptk rp
	;;
END(___multi3)
	.symver	___multi3, __multi3@GLIBC_2.2

#endif