summaryrefslogtreecommitdiff
path: root/sysdeps/ia64/fpu/libm_sincos.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/libm_sincos.S')
-rw-r--r--sysdeps/ia64/fpu/libm_sincos.S164
1 files changed, 82 insertions, 82 deletions
diff --git a/sysdeps/ia64/fpu/libm_sincos.S b/sysdeps/ia64/fpu/libm_sincos.S
index a3f4c72743..3475b6281b 100644
--- a/sysdeps/ia64/fpu/libm_sincos.S
+++ b/sysdeps/ia64/fpu/libm_sincos.S
@@ -46,12 +46,13 @@
// 03/19/02 Added stack unwind around call to __libm_cis_large
// 09/05/02 Work range is widened by reduction strengthen (3 parts of Pi/16)
// 02/10/03 Reordered header: .section, .global, .proc, .align
-//
+// 08/08/03 Improved performance
+// 02/11/04 cis is moved to the separate file.
+//
// API
//==============================================================
-// 1) double _Complex cis(double)
-// 2) void sincos(double, double*s, double*c)
-// 3) __libm_sincos - internal LIBM function, that accepts
+// 1) void sincos(double, double*s, double*c)
+// 2) __libm_sincos - internal LIBM function, that accepts
// argument in f8 and returns cosine through f8, sine through f9
//
// Overview of operation
@@ -65,12 +66,12 @@
// nfloat = Round result to integer (round-to-nearest)
//
// r = x - nfloat * pi/2^k
-// Do this as ((((x - nfloat * HIGH(pi/2^k))) -
-// nfloat * LOW(pi/2^k)) -
+// Do this as ((((x - nfloat * HIGH(pi/2^k))) -
+// nfloat * LOW(pi/2^k)) -
// nfloat * LOWEST(pi/2^k) for increased accuracy.
// pi/2^k is stored as two numbers that when added make pi/2^k.
// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
-// HIGH and LOW parts are rounded to zero values,
+// HIGH and LOW parts are rounded to zero values,
// and LOWEST is rounded to nearest one.
//
// x = (nfloat * pi/2^k) + r
@@ -166,15 +167,14 @@
// Registers used
//==============================================================
// general input registers:
-// r14 -> r19
-// r32 -> r49
+// r14 -> r39
// predicate registers used:
// p6 -> p14
-
+//
// floating-point registers used
// f9 -> f15
-// f32 -> f100
+// f32 -> f67
// Assembly macros
//==============================================================
@@ -246,38 +246,32 @@ cis_Q = f67
cis_pResSin = r33
cis_pResCos = r34
-cis_exp_limit = r35
-cis_r_signexp = r36
-cis_AD_beta_table = r37
-cis_r_sincos = r38
-
-cis_r_exp = r39
-cis_r_17_ones = r40
-
cis_GR_sig_inv_pi_by_16 = r14
cis_GR_rshf_2to61 = r15
cis_GR_rshf = r16
cis_GR_exp_2tom61 = r17
cis_GR_n = r18
-
cis_GR_n_sin = r19
-cis_GR_m_sin = r41
-cis_GR_32m_sin = r41
-
-cis_GR_n_cos = r42
-cis_GR_m_cos = r43
-cis_GR_32m_cos = r43
-
-cis_AD_2_sin = r44
-cis_AD_2_cos = r45
-
-cis_gr_tmp = r46
-GR_SAVE_B0 = r47
-GR_SAVE_GP = r48
-rB0_SAVED = r49
-GR_SAVE_PFS = r50
-GR_SAVE_PR = r51
-cis_AD_1 = r52
+cis_exp_limit = r20
+cis_r_signexp = r21
+cis_AD_1 = r22
+cis_r_sincos = r23
+cis_r_exp = r24
+cis_r_17_ones = r25
+cis_GR_m_sin = r26
+cis_GR_32m_sin = r26
+cis_GR_n_cos = r27
+cis_GR_m_cos = r28
+cis_GR_32m_cos = r28
+cis_AD_2_sin = r29
+cis_AD_2_cos = r30
+cis_gr_tmp = r31
+
+GR_SAVE_B0 = r35
+GR_SAVE_GP = r36
+rB0_SAVED = r37
+GR_SAVE_PFS = r38
+GR_SAVE_PR = r39
RODATA
@@ -408,14 +402,14 @@ LOCAL_OBJECT_END(double_sin_cos_beta_k4)
GLOBAL_IEEE754_ENTRY(sincos)
// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
{ .mlx
- alloc GR_SAVE_PFS = ar.pfs, 0, 21, 0, 0
+ getf.exp cis_r_signexp = cis_Arg
movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
-
+
}
// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
{ .mlx
addl cis_AD_1 = @ltoff(double_cis_pi), gp
- movl cis_GR_rshf_2to61 = 0x47b8000000000000
+ movl cis_GR_rshf_2to61 = 0x47b8000000000000
};;
{ .mfi
@@ -430,12 +424,11 @@ GLOBAL_IEEE754_ENTRY(sincos)
br.cond.sptk _CIS_COMMON
};;
GLOBAL_IEEE754_END(sincos)
-LOCAL_LIBM_ENTRY(cis)
-LOCAL_LIBM_END(cis)
+
GLOBAL_LIBM_ENTRY(__libm_sincos)
// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
{ .mlx
- alloc GR_SAVE_PFS = ar.pfs,0,21,0,0
+ getf.exp cis_r_signexp = cis_Arg
movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
}
// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
@@ -443,11 +436,12 @@ GLOBAL_LIBM_ENTRY(__libm_sincos)
addl cis_AD_1 = @ltoff(double_cis_pi), gp
movl cis_GR_rshf_2to61 = 0x47b8000000000000
};;
+
// p14 set for __libm_sincos and cis
{ .mfi
ld8 cis_AD_1 = [cis_AD_1]
fnorm.s1 cis_NORM_f8 = cis_Arg
- cmp.eq p14, p13 = r0, r0
+ cmp.eq p14, p13 = r0, r0
}
// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
{ .mib
@@ -476,10 +470,15 @@ _CIS_COMMON:
// 2^-61 for scaling Nfloat
// 0x1001a is register_bias + 27.
// So if f8 >= 2^27, go to large arguments routine
-{ .mmi
- getf.exp cis_r_signexp = cis_Arg
- setf.exp cis_2TOM61 = cis_GR_exp_2tom61
+{ .mfi
+ alloc GR_SAVE_PFS = ar.pfs, 3, 5, 0, 0
+ fclass.m p11,p0 = cis_Arg, 0x0b // Test for x=unorm
mov cis_exp_limit = 0x1001a
+}
+{ .mib
+ setf.exp cis_2TOM61 = cis_GR_exp_2tom61
+ nop.i 0
+(p6) br.cond.spnt _CIS_SPECIAL_ARGS
};;
// Load the two pieces of pi/16
@@ -488,9 +487,11 @@ _CIS_COMMON:
{ .mmb
ldfe cis_Pi_by_16_hi = [cis_AD_1],16
setf.d cis_RSHF = cis_GR_rshf
-(p6) br.cond.spnt _CIS_SPECIAL_ARGS
+(p11) br.cond.spnt _CIS_UNORM // Branch if x=unorm
};;
+_CIS_COMMON2:
+// Return here if x=unorm
// Create constant inexact set
{ .mmi
ldfe cis_Pi_by_16_lo = [cis_AD_1],16
@@ -498,23 +499,18 @@ _CIS_COMMON:
nop.i 0
};;
+// Select exponent (17 lsb)
{ .mfi
ldfe cis_Pi_by_16_lowest = [cis_AD_1],16
nop.f 0
- nop.i 0
-};;
-
-// Start loading P, Q coefficients
-{ .mib
- ldfpd cis_P4,cis_Q4 = [cis_AD_1],16
dep.z cis_r_exp = cis_r_signexp, 0, 17
- nop.b 0
};;
+// Start loading P, Q coefficients
// p10 is true if we must call routines to handle larger arguments
// p10 is true if f8 exp is > 0x1001a
{ .mmb
- ldfpd cis_P3,cis_Q3 = [cis_AD_1],16
+ ldfpd cis_P4,cis_Q4 = [cis_AD_1],16
cmp.ge p10, p0 = cis_r_exp, cis_exp_limit
(p10) br.cond.spnt _CIS_LARGE_ARGS // go to |x| >= 2^27 path
};;
@@ -523,39 +519,33 @@ _CIS_COMMON:
// Multiply x by scaled 16/pi and add large const to shift integer part of W to
// rightmost bits of significand
{ .mfi
- ldfpd cis_P2,cis_Q2 = [cis_AD_1],16
+ ldfpd cis_P3,cis_Q3 = [cis_AD_1],16
fma.s1 cis_W_2TO61_RSH = cis_NORM_f8,cis_SIG_INV_PI_BY_16_2TO61,cis_RSHF_2TO61
nop.i 0
};;
+// get N = (int)cis_int_Nfloat
// cis_NFLOAT = Round_Int_Nearest(cis_W)
-{ .mfi
- ldfpd cis_P1,cis_Q1 = [cis_AD_1], 16
+{ .mmf
+ getf.sig cis_GR_n = cis_W_2TO61_RSH
+ ldfpd cis_P2,cis_Q2 = [cis_AD_1],16
fms.s1 cis_NFLOAT = cis_W_2TO61_RSH,cis_2TOM61,cis_RSHF
- nop.i 0
-};;
-
-// get N = (int)cis_int_Nfloat
-{ .mfi
- getf.sig cis_GR_n = cis_W_2TO61_RSH
- nop.f 0
- nop.i 0
};;
-// Add 2^(k-1) (which is in cis_r_sincos) to N
// cis_r = -cis_Nfloat * cis_Pi_by_16_hi + x
-// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo
{ .mfi
- add cis_GR_n_cos = 0x8, cis_GR_n
+ ldfpd cis_P1,cis_Q1 = [cis_AD_1], 16
fnma.s1 cis_r = cis_NFLOAT,cis_Pi_by_16_hi,cis_NORM_f8
nop.i 0
};;
-//Get M (least k+1 bits of N)
+// Add 2^(k-1) (which is in cis_r_sincos) to N
{ .mmi
+ add cis_GR_n_cos = 0x8, cis_GR_n
+;;
+//Get M (least k+1 bits of N)
and cis_GR_m_sin = 0x1f,cis_GR_n
and cis_GR_m_cos = 0x1f,cis_GR_n_cos
- nop.i 0
};;
{ .mmi
@@ -565,9 +555,10 @@ _CIS_COMMON:
};;
// Add 32*M to address of sin_cos_beta table
-{ .mmi
+// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo
+{ .mfi
add cis_AD_2_sin = cis_GR_32m_sin, cis_AD_1
- nop.m 0
+ fnma.s1 cis_r = cis_NFLOAT, cis_Pi_by_16_lo, cis_r
shl cis_GR_32m_cos = cis_GR_m_cos,5
};;
@@ -580,7 +571,6 @@ _CIS_COMMON:
{ .mfi
ldfe cis_Sm_cos = [cis_AD_2_cos], 16
- fnma.s1 cis_r = cis_NFLOAT, cis_Pi_by_16_lo, cis_r
nop.i 0
};;
@@ -604,7 +594,7 @@ _CIS_COMMON:
{ .mfi
ldfe cis_Cm_cos = [cis_AD_2_cos]
- fma.s1 cis_P_temp1 = cis_rsq, cis_P4, cis_P3
+ fma.s1 cis_P_temp1 = cis_rsq, cis_P4, cis_P3
nop.i 0
}
@@ -638,18 +628,18 @@ _CIS_COMMON:
{ .mfi
nop.m 0
- fma.s1 cis_Q = cis_rsq, cis_Q_temp2, cis_Q1
+ fmpy.s1 cis_rcub = cis_r_exact, cis_rsq // get r^3
nop.i 0
-}
+};;
+
{ .mfi
nop.m 0
- fma.s1 cis_P = cis_rsq, cis_P_temp2, cis_P1
+ fma.s1 cis_Q = cis_rsq, cis_Q_temp2, cis_Q1
nop.i 0
-};;
-
+}
{ .mfi
nop.m 0
- fmpy.s1 cis_rcub = cis_r_exact, cis_rsq // get r^3
+ fma.s1 cis_P = cis_rsq, cis_P_temp2, cis_P1
nop.i 0
};;
@@ -717,7 +707,17 @@ _CIS_SPECIAL_ARGS:
stfd [cis_pResCos] = cis_Cos_res
br.ret.sptk b0 // common exit for sincos main path
};;
+
+_CIS_UNORM:
+// Here if x=unorm
+{ .mfb
+ getf.exp cis_r_signexp = cis_NORM_f8 // Get signexp of x
+ fcmp.eq.s0 p11,p0 = cis_Arg, f0 // Dummy op to set denorm
+ br.cond.sptk _CIS_COMMON2 // Return to main path
+};;
+
GLOBAL_LIBM_END(__libm_sincos)
+
//// |x| > 2^27 path ///////
.proc _CIS_LARGE_ARGS
_CIS_LARGE_ARGS: